#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsR600.h"

#define DEBUG_TYPE "si-lower"
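// Command-line options controlling this lowering: one disables loop
// alignment/prefetching, the other forces indirect register addressing for
// divergent indexes (per the option descriptions below).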
    cl::desc("Do not align and prefetch loops"),

    "amdgpu-use-divergent-register-indexing", cl::Hidden,
    cl::desc("Use indirect register addressing for divergent indexes"),
  unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
  for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
      return AMDGPU::SGPR0 + Reg;
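// Register class setup in the SITargetLowering constructor: scalar types get
// SGPR classes, while vector types are assigned the VGPR/AV superclass picked
// by bit width via getDefaultVectorSuperClassForBitWidth.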
  TRI->getDefaultVectorSuperClassForBitWidth(32);
  TRI->getDefaultVectorSuperClassForBitWidth(64);

  TRI->getDefaultVectorSuperClassForBitWidth(320));
  TRI->getDefaultVectorSuperClassForBitWidth(352));
  TRI->getDefaultVectorSuperClassForBitWidth(384));
  TRI->getDefaultVectorSuperClassForBitWidth(512));
  TRI->getDefaultVectorSuperClassForBitWidth(1024));

  if (Subtarget->has16BitInsts()) {
    if (Subtarget->useRealTrue16Insts()) {

      TRI->getDefaultVectorSuperClassForBitWidth(1024));

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},
                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
                      MVT::i1, MVT::v32i32},

                     {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);

                     {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
                      MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
                      MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
                     {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
                      MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
                      MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},

                     {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
                      MVT::v3i16, MVT::v4i16, MVT::Other},
                     {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);

       {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
        MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
        MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
        MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
        MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
        MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
        MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
  for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {

  for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {

  for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {

  for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {

  for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
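  // Each Vec64 loop above configures operations on vectors with 64-bit
  // elements; these are legalized in terms of their 32-bit halves, since the
  // register file is addressed in 32-bit units.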
                     {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
                      MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},

  if (Subtarget->hasPkMovB32()) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},

                     {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);

                     {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
                      MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
                      MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
                      MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},

  if (Subtarget->hasSMemRealTime() ||

  if (Subtarget->has16BitInsts()) {

  if (Subtarget->hasMadMacF32Insts())

  if (Subtarget->hasIntClamp())

  if (Subtarget->hasAddNoCarryInsts())

                     {MVT::f32, MVT::f64}, Custom);

                     {MVT::f32, MVT::f64}, Legal);

  if (Subtarget->haveRoundOpsF64())

  if (Subtarget->has16BitInsts()) {

  if (Subtarget->hasBF16TransInsts())

       {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
        MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
        MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {

                     {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
                      MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
                      MVT::v32f16, MVT::v32bf16},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

                     {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

       {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
        MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {

  if (Subtarget->hasVOP3PInsts()) {

                     {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);

                     {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
                      MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
                      MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},

    for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})

    for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})

                     {MVT::v2f16, MVT::v4f16}, Custom);

  if (Subtarget->hasBF16PackedInsts()) {

    for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})

  if (Subtarget->hasPackedFP32Ops()) {

                       {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},

  if (Subtarget->has16BitInsts()) {

                     {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
                      MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
                      MVT::v32f16, MVT::v32bf16},

  if (Subtarget->hasVMulU64Inst())
  else if (Subtarget->hasScalarSMulU64())

  if (Subtarget->hasMad64_32())

  if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())

  if (Subtarget->hasIEEEMinimumMaximumInsts()) {
                       {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);

  if (Subtarget->hasMinimum3Maximum3F32())

  if (Subtarget->hasMinimum3Maximum3PKF16()) {

    if (!Subtarget->hasMinimum3Maximum3F16())

  if (Subtarget->hasVOP3PInsts()) {
                       {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},

  if (Subtarget->hasIntMinMax64())

                     {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
                      MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,

                     {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
                      MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
                      MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
                      MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

                     {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
                      MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
                      MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
                      MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},

  if (Subtarget->hasBF16ConversionInsts()) {

  if (Subtarget->hasBF16TransInsts()) {

  if (Subtarget->hasCvtPkF16F32Inst()) {
                       {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},

  if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())

static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
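// The hooks below answer target queries: when a mixed-precision FMA/FMAD can
// fold an fpext of a 16-bit source (mad-mix/fma-mix), and how vector types
// are broken into registers for the AMDGPU calling conventions.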
                                       EVT DestVT, EVT SrcVT) const {
         ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
            (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&

          (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&

                                       LLT DestTy, LLT SrcTy) const {
  return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
          (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
         SrcTy.getScalarSizeInBits() == 16 &&
  return Subtarget->has16BitInsts()

  return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;

  if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)

      return (NumElts + 1) / 2;

    return NumElts * ((Size + 31) / 32);

    unsigned &NumIntermediates, MVT &RegisterVT) const {

      MVT SimpleIntermediateVT =
      IntermediateVT = SimpleIntermediateVT;
      RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
      NumIntermediates = (NumElts + 1) / 2;
      return (NumElts + 1) / 2;

      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i16;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = ScalarVT;
      NumIntermediates = NumElts;
      return NumIntermediates;

      RegisterVT = MVT::i32;
      IntermediateVT = RegisterVT;
      NumIntermediates = NumElts * ((Size + 31) / 32);
      return NumIntermediates;

      Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
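// For example, a v3f16 argument on a subtarget with 16-bit instructions is
// split into (3 + 1) / 2 == 2 packed 2x16 intermediates, while wide scalar
// elements each occupy (Size + 31) / 32 i32 registers.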
                                 unsigned MaxNumLanes) {
  assert(MaxNumLanes != 0);

    unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());

                                   unsigned MaxNumLanes) {

    assert(ST->getNumContainedTypes() == 2 &&
           ST->getContainedType(1)->isIntegerTy(32));

    return MVT::amdgpuBufferFatPointer;

      DL.getPointerSizeInBits(AS) == 192)
    return MVT::amdgpuBufferStridedPointer;

       DL.getPointerSizeInBits(AS) == 160) ||
       DL.getPointerSizeInBits(AS) == 192))
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:

  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b32:

  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b64:

  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b128:
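// getTgtMemIntrinsic classifies AMDGPU memory intrinsics for the SelectionDAG
// builder: for each recognized intrinsic it fills in the node opcode, memVT,
// pointer value, alignment, and read/write flags.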
                                          unsigned IntrID) const {
  if (CI.hasMetadata(LLVMContext::MD_invariant_load))

  bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;

    if (RsrcIntr->IsImage) {
      Info.ptrVal = RsrcArg;

      if (RsrcIntr->IsImage) {
        unsigned MaxNumLanes = 4;
            std::numeric_limits<unsigned>::max());

      if (RsrcIntr->IsImage) {

    if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
      Info.memVT = MVT::i32;

  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_buffer_load_async_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
        CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
        std::numeric_limits<unsigned>::max());

  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {

  case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
  case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
    Info.ptrVal = nullptr;

  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume: {

  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
    Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
    Info.memVT = MVT::i64;

  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
        MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
                        ->getElementType(0));

  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {

  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64: {

  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128:
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128: {

  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {

  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {

  case Intrinsic::amdgcn_ds_gws_init:
  case Intrinsic::amdgcn_ds_gws_barrier:
  case Intrinsic::amdgcn_ds_gws_sema_v:
  case Intrinsic::amdgcn_ds_gws_sema_br:
  case Intrinsic::amdgcn_ds_gws_sema_p:
  case Intrinsic::amdgcn_ds_gws_sema_release_all: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);
    if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)

  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {

  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128: {

  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_lds: {
        Width * 8 * Subtarget->getWavefrontSize());

  case Intrinsic::amdgcn_ds_bvh_stack_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
  case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
    Info.memVT = MVT::i32;
    Info.align = Align(4);

  case Intrinsic::amdgcn_s_prefetch_data:
  case Intrinsic::amdgcn_flat_prefetch:
  case Intrinsic::amdgcn_global_prefetch: {

  case Intrinsic::amdgcn_addrspacecast_nonnull: {
    unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
    unsigned DstAS = I.getType()->getPointerAddressSpace();
                                            Type *&AccessTy) const {
  Value *Ptr = nullptr;
  switch (II->getIntrinsicID()) {
  case Intrinsic::amdgcn_cluster_load_b128:
  case Intrinsic::amdgcn_cluster_load_b64:
  case Intrinsic::amdgcn_cluster_load_b32:
  case Intrinsic::amdgcn_ds_append:
  case Intrinsic::amdgcn_ds_consume:
  case Intrinsic::amdgcn_ds_load_tr8_b64:
  case Intrinsic::amdgcn_ds_load_tr16_b128:
  case Intrinsic::amdgcn_ds_load_tr4_b64:
  case Intrinsic::amdgcn_ds_load_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr4_b64:
  case Intrinsic::amdgcn_ds_read_tr6_b96:
  case Intrinsic::amdgcn_ds_read_tr8_b64:
  case Intrinsic::amdgcn_ds_read_tr16_b64:
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap:
  case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
  case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
  case Intrinsic::amdgcn_flat_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
  case Intrinsic::amdgcn_global_load_tr_b64:
  case Intrinsic::amdgcn_global_load_tr_b128:
  case Intrinsic::amdgcn_global_load_tr4_b64:
  case Intrinsic::amdgcn_global_load_tr6_b96:
  case Intrinsic::amdgcn_global_store_async_from_lds_b8:
  case Intrinsic::amdgcn_global_store_async_from_lds_b32:
  case Intrinsic::amdgcn_global_store_async_from_lds_b64:
  case Intrinsic::amdgcn_global_store_async_from_lds_b128:
    Ptr = II->getArgOperand(0);

  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_lds:
  case Intrinsic::amdgcn_global_load_async_to_lds_b8:
  case Intrinsic::amdgcn_global_load_async_to_lds_b32:
  case Intrinsic::amdgcn_global_load_async_to_lds_b64:
  case Intrinsic::amdgcn_global_load_async_to_lds_b128:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
  case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
    Ptr = II->getArgOperand(1);

  AccessTy = II->getType();
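// Example: @llvm.amdgcn.ds.ordered.add takes its pointer as argument 0, while
// the *load*_to_lds family takes it as argument 1; in either case AccessTy is
// set to the intrinsic's result type so LSR can reason about the access.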
                                              unsigned AddrSpace) const {
  if (!Subtarget->hasFlatInstOffsets()) {

  return AM.Scale == 0 &&
         (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
                                  AM.BaseOffs, AddrSpace, FlatVariant));

  if (Subtarget->hasFlatGlobalInsts())

  if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {

  return isLegalMUBUFAddressingMode(AM);

bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {

  if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))

  if (AM.HasBaseReg) {

    return isLegalMUBUFAddressingMode(AM);

  if (!Subtarget->hasScalarSubwordLoads()) {

    if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)

    return Subtarget->hasFlatScratchEnabled()
               : isLegalMUBUFAddressingMode(AM);
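// So a flat addressing mode is legal only with no scale and either a zero
// base offset or an immediate offset that the FLAT instruction encoding
// accepts for that address space.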
    unsigned Size, unsigned AddrSpace, Align Alignment,

    if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))

    Align RequiredAlignment(
    if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
        Alignment < RequiredAlignment)

      if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))

      RequiredAlignment = Align(4);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 64
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128())

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 96
                    : (Alignment < Align(4))         ? 32

      if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())

      RequiredAlignment = Align(8);

      if (Subtarget->hasUnalignedDSAccessEnabled()) {
          *IsFast = (Alignment >= RequiredAlignment) ? 128
                    : (Alignment < Align(4))         ? 32

      *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;

    return Alignment >= RequiredAlignment ||
           Subtarget->hasUnalignedDSAccessEnabled();

    bool AlignedBy4 = Alignment >= Align(4);
    if (Subtarget->hasUnalignedScratchAccessEnabled()) {
        *IsFast = AlignedBy4 ? Size : 1;

      *IsFast = AlignedBy4;

    return Alignment >= Align(4) ||
           Subtarget->hasUnalignedBufferAccessEnabled();

  if (!Subtarget->hasRelaxedBufferOOBMode() &&

  return Size >= 32 && Alignment >= Align(4);

                                                    unsigned *IsFast) const {
      Alignment, Flags, IsFast);
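// Note that *IsFast is not a boolean here: it reports the widest access, in
// bits, that is fast at the given alignment (64/96/128 for well-aligned DS
// accesses, 32 for sub-dword alignment, 0 for slow cases).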
    const AttributeList &FuncAttributes) const {
  if (Op.size() >= 16 &&

  if (Op.size() >= 8 && Op.isDstAligned(Align(4)))

                                           unsigned DestAS) const {
      Subtarget->hasGloballyAddressableScratch()) {

                                               unsigned Index) const {

  unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;

  if (Subtarget->has16BitInsts() && VT == MVT::i16) {
  auto [InputPtrReg, RC, ArgTy] =

                                         const SDLoc &SL) const {

                                 const SDLoc &SL) const {
  std::optional<uint32_t> KnownSize =
  if (KnownSize.has_value())

    Val = getFPExtOrFPRound(DAG, Val, SL, VT);

SDValue SITargetLowering::lowerKernargMemParameter(

  MachinePointerInfo PtrInfo =

    int64_t OffsetDiff = Offset - AlignDownOffset;

    SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);

    ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);

  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);

  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);

                                              const SDLoc &SL) const {
      ExtType, SL, VA.getLocVT(), Chain, FIN,

  SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
  if (ConvertedVal == ArgValue)
    return ConvertedVal;
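// Sub-dword kernel arguments are loaded from a 4-byte aligned-down offset;
// OffsetDiff records how far into that dword the argument actually sits, so
// the loaded value can be shifted before conversion to the final type.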
SDValue SITargetLowering::lowerWorkGroupId(

  if (!Subtarget->hasClusters())
    return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);

  SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
  SDLoc SL(ClusterIdXYZ);
  SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);

  SDValue ClusterWorkGroupIdXYZ =
      getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);

    return ClusterIdXYZ;

  using namespace AMDGPU::Hwreg;

      DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
SDValue SITargetLowering::getPreloadedValue(

  const ArgDescriptor *Reg = nullptr;
  const TargetRegisterClass *RC;

  const ArgDescriptor WorkGroupIDX =
  const ArgDescriptor WorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupIDX =
  const ArgDescriptor ClusterWorkGroupIDY =
  const ArgDescriptor ClusterWorkGroupIDZ =
  const ArgDescriptor ClusterWorkGroupMaxIDX =
  const ArgDescriptor ClusterWorkGroupMaxIDY =
  const ArgDescriptor ClusterWorkGroupMaxIDZ =
  const ArgDescriptor ClusterWorkGroupMaxFlatID =

  auto LoadConstant = [&](unsigned N) {

  if (Subtarget->hasArchitectedSGPRs() &&

      Reg = &WorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;

      Reg = &WorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;

      Reg = &WorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;

      if (HasFixedDims && ClusterDims.getDims()[0] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDX;
      RC = &AMDGPU::SReg_32RegClass;

      if (HasFixedDims && ClusterDims.getDims()[1] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDY;
      RC = &AMDGPU::SReg_32RegClass;

      if (HasFixedDims && ClusterDims.getDims()[2] == 1)
        return LoadConstant(0);
      Reg = &ClusterWorkGroupIDZ;
      RC = &AMDGPU::SReg_32RegClass;

        return LoadConstant(ClusterDims.getDims()[0] - 1);
      Reg = &ClusterWorkGroupMaxIDX;
      RC = &AMDGPU::SReg_32RegClass;

        return LoadConstant(ClusterDims.getDims()[1] - 1);
      Reg = &ClusterWorkGroupMaxIDY;
      RC = &AMDGPU::SReg_32RegClass;

        return LoadConstant(ClusterDims.getDims()[2] - 1);
      Reg = &ClusterWorkGroupMaxIDZ;
      RC = &AMDGPU::SReg_32RegClass;

      Reg = &ClusterWorkGroupMaxFlatID;
      RC = &AMDGPU::SReg_32RegClass;
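// When the cluster dimensions are fixed at compile time, IDs along a
// degenerate (size-1) axis fold to the constant 0 and the max IDs fold to
// dimension - 1, avoiding a read of the preloaded register entirely.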
  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {

           "vector type argument should have been split");

    bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);

             "unexpected vector split in ps argument type");

      Info->markPSInputAllocated(PSInputNum);
        Info->markPSInputEnabled(PSInputNum);
  if (Info.hasWorkItemIDX()) {
        (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;

  if (Info.hasWorkItemIDY()) {
    assert(Info.hasWorkItemIDX());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDY(
      unsigned Reg = AMDGPU::VGPR1;

  if (Info.hasWorkItemIDZ()) {
    assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
    if (Subtarget->hasPackedTID()) {
      Info.setWorkItemIDZ(
      unsigned Reg = AMDGPU::VGPR2;

  if (RegIdx == ArgVGPRs.size()) {
  unsigned Reg = ArgVGPRs[RegIdx];

                              unsigned NumArgRegs) {
  if (RegIdx == ArgSGPRs.size())
  unsigned Reg = ArgSGPRs[RegIdx];

  const unsigned Mask = 0x3ff;
  if (Info.hasWorkItemIDX()) {
    Info.setWorkItemIDX(Arg);
  if (Info.hasWorkItemIDY()) {
    Info.setWorkItemIDY(Arg);
  if (Info.hasWorkItemIDZ())

  const unsigned Mask = 0x3ff;
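// With packed TIDs all three workitem IDs share a single VGPR at 10 bits per
// component, hence the 0x3ff mask; otherwise X, Y and Z arrive separately in
// VGPR0-VGPR2.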
  auto &ArgInfo = Info.getArgInfo();

  if (Info.hasImplicitArgPtr())

  if (Info.hasWorkGroupIDX())
  if (Info.hasWorkGroupIDY())
  if (Info.hasWorkGroupIDZ())
  if (Info.hasLDSKernelId())

    Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
    MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);

    Register DispatchPtrReg = Info.addDispatchPtr(TRI);
    MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);

    MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);

    Register InputPtrReg = Info.addKernargSegmentPtr(TRI);

    MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);

    Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
    MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);

    Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
    MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
  unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();

  bool InPreloadSequence = true;
  bool AlignedForImplictArgs = false;
  unsigned ImplicitArgOffset = 0;
  for (auto &Arg : F.args()) {
    if (!InPreloadSequence || !Arg.hasInRegAttr())

    unsigned ArgIdx = Arg.getArgNo();

    if (InIdx < Ins.size() &&
        (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))

    for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
           Ins[InIdx].getOrigArgIndex() == ArgIdx;
      assert(ArgLocs[ArgIdx].isMemLoc());
      auto &ArgLoc = ArgLocs[InIdx];
      unsigned ArgOffset = ArgLoc.getLocMemOffset();
      unsigned NumAllocSGPRs =
          alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;

      if (Arg.hasAttribute("amdgpu-hidden-argument")) {
        if (!AlignedForImplictArgs) {
              alignTo(LastExplicitArgOffset,
                      Subtarget->getAlignmentForImplicitArgPtr()) -
              LastExplicitArgOffset;
          AlignedForImplictArgs = true;
        ArgOffset += ImplicitArgOffset;

      if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
        assert(InIdx >= 1 && "No previous SGPR");
        Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
            Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);

      unsigned Padding = ArgOffset - LastExplicitArgOffset;
      unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;

        InPreloadSequence = false;

          TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);

          Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);

      if (PreloadRegs->size() > 1)
        RC = &AMDGPU::SGPR_32RegClass;
      for (auto &Reg : *PreloadRegs) {

      LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;

  if (Info.hasLDSKernelId()) {
    Register Reg = Info.addLDSKernelId();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
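// Kernarg preloading has to be contiguous: the first inreg argument that does
// not line up with the next free user SGPR ends the preload sequence, and
// everything after it is read from the kernarg segment in memory instead.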
                                           bool IsShader) const {
  bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
  if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {

    assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");

    unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();

    unsigned NumRequiredSystemSGPRs =
        Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
        Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
    for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
      Register Reg = Info.addReservedUserSGPR();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (!HasArchitectedSGPRs) {
    if (Info.hasWorkGroupIDX()) {
      Register Reg = Info.addWorkGroupIDX();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDY()) {
      Register Reg = Info.addWorkGroupIDY();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

    if (Info.hasWorkGroupIDZ()) {
      Register Reg = Info.addWorkGroupIDZ();
      MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasWorkGroupInfo()) {
    Register Reg = Info.addWorkGroupInfo();
    MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);

  if (Info.hasPrivateSegmentWaveByteOffset()) {

    unsigned PrivateSegmentWaveByteOffsetReg;

      PrivateSegmentWaveByteOffsetReg =
          Info.getPrivateSegmentWaveByteOffsetSystemSGPR();

      if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
        Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);

      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();

    MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
    CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);

  assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
         Info.getNumPreloadedSGPRs() >= 16);
  if (HasStackObjects)
    Info.setHasNonSpillStackObjects(true);

    HasStackObjects = true;

  bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();

  if (!ST.hasFlatScratchEnabled()) {
    if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {

      Info.setScratchRSrcReg(PrivateSegmentBufferReg);

      unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);

      Info.setScratchRSrcReg(ReservedBufferReg);

  if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
    Info.setStackPtrOffsetReg(AMDGPU::SGPR32);

  for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
      Info.setStackPtrOffsetReg(Reg);

  if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)

  if (ST.getFrameLowering()->hasFP(MF)) {
    Info.setFrameOffsetReg(AMDGPU::SGPR33);
  const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());

    if (AMDGPU::SReg_64RegClass.contains(*I))
      RC = &AMDGPU::SGPR_64RegClass;
    else if (AMDGPU::SReg_32RegClass.contains(*I))
      RC = &AMDGPU::SGPR_32RegClass;

    Entry->addLiveIn(*I);

    for (auto *Exit : Exits)
              TII->get(TargetOpcode::COPY), *I)
  bool IsError = false;

        Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));

         !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
         !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());

  if (!Subtarget->hasFlatScratchEnabled())

      !Subtarget->hasArchitectedSGPRs())
    assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
           !Info->hasWorkGroupIDZ());

  bool IsWholeWaveFunc = Info->isWholeWaveFunction();

    if ((Info->getPSInputAddr() & 0x7F) == 0 ||
        ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
      Info->markPSInputAllocated(0);
      Info->markPSInputEnabled(0);

    if (Subtarget->isAmdPalOS()) {
      unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
      if ((PsInputBits & 0x7F) == 0 ||
          ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
  } else if (IsKernel) {
    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());

    if (IsKernel && Subtarget->hasKernargPreload())
  } else if (!IsGraphics) {
    if (!Subtarget->hasFlatScratchEnabled())

    Info->setNumWaveDispatchSGPRs(
    Info->setNumWaveDispatchVGPRs(
  } else if (Info->getNumKernargPreloadedSGPRs()) {
    Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());

  if (IsWholeWaveFunc) {
        {MVT::i1, MVT::Other}, Chain);

  for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;

    if (IsEntryFunc && VA.isMemLoc()) {

      if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
          int64_t OffsetDiff = Offset - AlignDownOffset;
              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
          Register VReg = MRI.getLiveInVirtReg(Reg);
          NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
          if (PreloadRegs.size() == 1) {
            Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
                                     TRI->getRegSizeInBits(*RC)));

            for (auto Reg : PreloadRegs) {
              Register VReg = MRI.getLiveInVirtReg(Reg);
                                           PreloadRegs.size()),

          NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
                                  Ins[i].Flags.isSExt(), &Ins[i]);

              "hidden argument in kernel signature was not preloaded",

            lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
                                     Alignment, Ins[i].Flags.isSExt(), &Ins[i]);

    if (!IsEntryFunc && VA.isMemLoc()) {
      SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);

      if (AMDGPU::VGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::VGPR_32RegClass;
      else if (AMDGPU::SGPR_32RegClass.contains(Reg))
        RC = &AMDGPU::SGPR_32RegClass;

      if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
                          ReadFirstLane, Val);

      Val = convertABITypeToValueType(DAG, Val, VA, DL);

  Info->setBytesInStackArgArea(StackArgSize);

  return Chains.empty() ? Chain
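// Note the inreg handling above: an "inreg" argument that lands in a VGPR is
// made uniform with a readfirstlane before use, since inreg promises a
// wave-uniform value.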
                                          const Type *RetTy) const {

  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);

  unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
  unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
  for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
    if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))

  Info->setIfReturnsVoid(Outs.empty());
  bool IsWaveEnd = Info->returnsVoid() && IsShader;

  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
       ++I, ++RealRVLocIdx) {

    SDValue Arg = OutVals[RealRVLocIdx];

                      ReadFirstLane, Arg);

  if (!Info->isEntryFunction()) {

      if (AMDGPU::SReg_64RegClass.contains(*I))
      else if (AMDGPU::SReg_32RegClass.contains(*I))

  unsigned Opc = AMDGPUISD::ENDPGM;

    Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
          : IsShader                  ? AMDGPUISD::RETURN_TO_EPILOG
                                      : AMDGPUISD::RET_GLUE;
  const auto [OutgoingArg, ArgRC, ArgTy] =

  const auto [IncomingArg, IncomingArgRC, Ty] =
  assert(IncomingArgRC == ArgRC);

  EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;

    InputReg = getImplicitArgPtr(DAG, DL);

    std::optional<uint32_t> Id =
    if (Id.has_value()) {

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
    if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))

    unsigned SpecialArgOffset =

  auto [OutgoingArg, ArgRC, Ty] =

  std::tie(OutgoingArg, ArgRC, Ty) =
  std::tie(OutgoingArg, ArgRC, Ty) =

  const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
  const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
  const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");

  if (Subtarget->getMaxWorkitemID(F, 0) != 0) {

      NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {

      NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {

  if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
    if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {

                   : IncomingArgY ? *IncomingArgY

  if (OutgoingArg->isRegister()) {
    RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
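// The callee's "amdgpu-no-workitem-id-*" attributes let the caller skip
// forwarding workitem IDs entirely; when some are needed, the X/Y/Z
// components are merged into a single input value before being added to
// RegsToPass.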
  if (Callee->isDivergent())

  const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);

  if (!CallerPreserved)

  bool CCMatch = CallerCC == CalleeCC;

    if (Arg.hasByValAttr())

  const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
  if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))

  CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);

  for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {

    if (!CCVA.isRegLoc())

    if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
      dbgs() << "Cannot tail call due to divergent outgoing argument in "
enum ChainCallArgIdx {

  bool UsesDynamicVGPRs = false;
  if (IsChainCallConv) {

    auto RequestedExecIt =
          return Arg.OrigArgIndex == 2;
    assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");

    size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();

    CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());

           "Haven't popped all the special args");

        CLI.Args[ChainCallArgIdx::Exec];
    if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))

            ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));

        ChainCallSpecialArgs.push_back(Arg.Node);

    PushNodeOrTargetConstant(RequestedExecArg);

    if (FlagsValue.isZero()) {
      if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
            "no additional args allowed if flags == 0");

      if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {

      if (!Subtarget->isWave32()) {
            CLI, InVals, "dynamic VGPR mode is only supported for wave32");

      UsesDynamicVGPRs = true;
      std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
                    CLI.Args.end(), PushNodeOrTargetConstant);
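// llvm.amdgcn.cs.chain calls carry special trailing arguments (the EXEC mask,
// flags, and, when the dynamic-VGPR flag is set, a VGPR count plus fallback
// values); these are split off from the ordinary outgoing arguments here.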
  bool IsSibCall = false;

                       "unsupported call to variadic function ");

                         "unsupported required tail call to function ");

        Outs, OutVals, Ins, DAG);

           "site marked musttail or on llvm.amdgcn.cs.chain");

  if (!TailCallOpt && IsTailCall)

  if (!Subtarget->hasFlatScratchEnabled())

  auto *TRI = Subtarget->getRegisterInfo();

  if (!IsSibCall || IsChainCallConv) {
    if (!Subtarget->hasFlatScratchEnabled()) {

      RegsToPass.emplace_back(IsChainCallConv
                                  ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
                                  : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,

  const unsigned NumSpecialInputs = RegsToPass.size();

  MVT PtrVT = MVT::i32;

  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {

      RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));

      int32_t Offset = LocMemOffset;

      unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
                            ? Flags.getNonZeroByValAlign()

      if (Outs[i].Flags.isByVal()) {
            DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
            Outs[i].Flags.getNonZeroByValAlign(),
            nullptr, std::nullopt, DstInfo,

            DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);

  if (!MemOpChains.empty())

  unsigned ArgIdx = 0;
  for (auto [Reg, Val] : RegsToPass) {
    if (ArgIdx++ >= NumSpecialInputs &&
        (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {

  if (IsTailCall && !IsSibCall) {

  std::vector<SDValue> Ops({Chain});

    Ops.push_back(Callee);

    Ops.push_back(Callee);

    if (IsChainCallConv)

  for (auto &[Reg, Val] : RegsToPass)

  const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
  assert(Mask && "Missing call preserved mask for calling convention");

                                 MVT::Glue, GlueOps),

    Ops.push_back(InGlue);

  unsigned OPC = AMDGPUISD::TC_RETURN;
    OPC = AMDGPUISD::TC_RETURN_GFX;
    OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
                           : AMDGPUISD::TC_RETURN_CHAIN;

  if (Info->isWholeWaveFunction())
    OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;

  Chain = Call.getValue(0);
  InGlue = Call.getValue(1);

  uint64_t CalleePopBytes = NumBytes;
  EVT VT = Op.getValueType();

         "Stack grows upwards for AMDGPU");

  Chain = BaseAddr.getValue(1);

  if (Alignment > StackAlign) {
        << Subtarget->getWavefrontSizeLog2();
    uint64_t StackAlignMask = ScaledAlignment - 1;

  assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));

      DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
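// Private (scratch) offsets are swizzled per lane, so dynamic stack sizes and
// alignments are scaled by the wavefront size (the shifts by
// getWavefrontSizeLog2 above) before the stack pointer is bumped.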
  if (Op.getValueType() != MVT::i32)

  assert(Op.getValueType() == MVT::i32);

                  Op.getOperand(0), IntrinID, GetRoundBothImm);

  SDValue RoundModeTimesNumBits =

                                  TableEntry, EnumOffset);

          static_cast<uint32_t>(ConstMode->getZExtValue()),

  if (UseReducedTable) {

    SDValue RoundModeTimesNumBits =

  SDValue RoundModeTimesNumBits =

    NewMode = TruncTable;

                   ReadFirstLaneID, NewMode);

                  IntrinID, RoundBothImm, NewMode);
  if (Op->isDivergent() &&
      (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))

  if (Subtarget->hasSafeSmemPrefetch())

  if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))

  SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
  EVT SrcVT = Src.getValueType();

  EVT DstVT = Op.getValueType();

  if (Op.getValueType() != MVT::i64)

                  Op.getOperand(0), IntrinID, ModeHwRegImm);
                  Op.getOperand(0), IntrinID, TrapHwRegImm);

  if (Op.getOperand(1).getValueType() != MVT::i64)

                 ReadFirstLaneID, NewModeReg);
                 ReadFirstLaneID, NewTrapReg);

  unsigned ModeHwReg =
  unsigned TrapHwReg =

                IntrinID, ModeHwRegImm, NewModeReg);
                IntrinID, TrapHwRegImm, NewTrapReg);
          .Case("m0", AMDGPU::M0)
          .Case("exec", AMDGPU::EXEC)
          .Case("exec_lo", AMDGPU::EXEC_LO)
          .Case("exec_hi", AMDGPU::EXEC_HI)
          .Case("flat_scratch", AMDGPU::FLAT_SCR)
          .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
          .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)

  if (!Subtarget->hasFlatScrRegister() &&
      Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
                             "\" for subtarget."));
  case AMDGPU::EXEC_LO:
  case AMDGPU::EXEC_HI:
  case AMDGPU::FLAT_SCR_LO:
  case AMDGPU::FLAT_SCR_HI:

  case AMDGPU::FLAT_SCR:

  MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
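// The helpers below implement the "waterfall" pattern used for indirect
// vector indexing with a divergent index: loop over the unique index values
// in the wave using readfirstlane plus a lane-mask compare per iteration.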
static std::pair<MachineBasicBlock *, MachineBasicBlock *>

  auto Next = std::next(I);

  MBB.addSuccessor(LoopBB);

  return std::pair(LoopBB, RemainderBB);

  auto I = MI.getIterator();
  auto E = std::next(I);

  Src->setIsKill(false);

  BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))

  BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)

                                 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
                                 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,

  BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)

  BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)

  if (UseGPRIdxMode) {
      SGPRIdxReg = CurrentIdxReg;

      BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)

    BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)

                                 unsigned InitResultReg, unsigned PhiReg, int Offset,
                                 bool UseGPRIdxMode, Register &SGPRIdxReg) {

  const auto *BoolXExecRC = TRI->getWaveMaskRegClass();

                                      InitResultReg, DstReg, PhiReg, TmpExec,
                                      Offset, UseGPRIdxMode, SGPRIdxReg);

  LoopBB->removeSuccessor(RemainderBB);

  LoopBB->addSuccessor(LandingPad);
static std::pair<unsigned, int>

  int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;

    return std::pair(AMDGPU::sub0, Offset);

  Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  std::tie(SubReg, Offset) =

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
          .addReg(SrcReg, {}, SubReg)

    MI.eraseFromParent();

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
        .addReg(SrcReg, {}, SubReg)

  MI.eraseFromParent();
  int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();

  std::tie(SubReg, Offset) =

  const bool UseGPRIdxMode = ST.useVGPRIndexMode();

  if (Idx->getReg() == AMDGPU::NoRegister) {

    MI.eraseFromParent();

  if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {

    if (UseGPRIdxMode) {
          TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);

      const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
          TRI.getRegSizeInBits(*VecRC), 32, false);

    MI.eraseFromParent();

                              UseGPRIdxMode, SGPRIdxReg);

  if (UseGPRIdxMode) {
        TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
    BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)

    const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
        TRI.getRegSizeInBits(*VecRC), 32, false);
    BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)

  MI.eraseFromParent();
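// Two encodings exist for the indirect access itself: GPR index mode
// (useVGPRIndexMode) drives the access from a dedicated index register, while
// the fallback writes the index to M0 and uses V_MOVRELS/MOVREL-style pseudos.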
  bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
  if (ST.hasScalarAddSub64()) {
    unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;

    Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
    Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);

        MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
        MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);

    unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
    unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;

  MI.eraseFromParent();

  Register SrcCond = MI.getOperand(3).getReg();

      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);

      TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src0Idx));
      TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src1Idx));

      TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
      TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);

      MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
      MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
      MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
      MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);

  MI.eraseFromParent();
  case AMDGPU::S_MIN_U32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::S_MIN_I32:
    return std::numeric_limits<int32_t>::max();
  case AMDGPU::S_MAX_U32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_MAX_I32:
    return std::numeric_limits<int32_t>::min();
  case AMDGPU::V_ADD_F32_e64:

  case AMDGPU::V_SUB_F32_e64:

  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_OR_B32:
  case AMDGPU::S_XOR_B32:
    return std::numeric_limits<uint32_t>::min();
  case AMDGPU::S_AND_B32:
    return std::numeric_limits<uint32_t>::max();
  case AMDGPU::V_MIN_F32_e64:
  case AMDGPU::V_MAX_F32_e64:

  case AMDGPU::V_CMP_LT_U64_e64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_CMP_LT_I64_e64:
    return std::numeric_limits<int64_t>::max();
  case AMDGPU::V_CMP_GT_U64_e64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::V_CMP_GT_I64_e64:
    return std::numeric_limits<int64_t>::min();
  case AMDGPU::V_MIN_F64_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MIN_NUM_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
    return 0x7FF8000000000000;
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
    return std::numeric_limits<uint64_t>::min();
  case AMDGPU::S_AND_B64:
    return std::numeric_limits<uint64_t>::max();
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
    return 0x8000000000000000;
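// Each case returns the identity element for its reduction: unsigned min
// starts from UINT_MAX, unsigned max from 0, AND from all-ones, and the f64
// min/max variants from a quiet NaN bit pattern (0x7FF8000000000000).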
  return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
         Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
         Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
         Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
         Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
         Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
         Opc == AMDGPU::V_SUB_F32_e64;

  return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
         Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
         Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
         Opc == AMDGPU::V_MIN_NUM_F64_e64 ||
         Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
         Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
static std::tuple<unsigned, unsigned>

  case AMDGPU::S_MIN_U32:
    DPPOpc = AMDGPU::V_MIN_U32_dpp;
  case AMDGPU::S_MIN_I32:
    DPPOpc = AMDGPU::V_MIN_I32_dpp;
  case AMDGPU::S_MAX_U32:
    DPPOpc = AMDGPU::V_MAX_U32_dpp;
  case AMDGPU::S_MAX_I32:
    DPPOpc = AMDGPU::V_MAX_I32_dpp;
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_SUB_I32:
    DPPOpc = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
                                     : AMDGPU::V_ADD_CO_U32_dpp;
  case AMDGPU::S_AND_B32:
    DPPOpc = AMDGPU::V_AND_B32_dpp;
  case AMDGPU::S_OR_B32:
    DPPOpc = AMDGPU::V_OR_B32_dpp;
  case AMDGPU::S_XOR_B32:
    DPPOpc = AMDGPU::V_XOR_B32_dpp;
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_SUB_F32_e64:
    DPPOpc = AMDGPU::V_ADD_F32_dpp;
  case AMDGPU::V_MIN_F32_e64:
    DPPOpc = AMDGPU::V_MIN_F32_dpp;
  case AMDGPU::V_MAX_F32_e64:
    DPPOpc = AMDGPU::V_MAX_F32_dpp;
  case AMDGPU::V_CMP_LT_U64_e64:
  case AMDGPU::V_CMP_LT_I64_e64:
  case AMDGPU::V_CMP_GT_U64_e64:
  case AMDGPU::V_CMP_GT_I64_e64:
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::V_MIN_NUM_F64_e64:
  case AMDGPU::V_MIN_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::V_ADD_F64_e64:
    DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;

  unsigned ClampOpc = Opc;
  if (!ST.getInstrInfo()->isVALU(Opc)) {
    if (Opc == AMDGPU::S_SUB_I32)
      ClampOpc = AMDGPU::S_ADD_I32;
    if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
      ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
    else if (Opc == AMDGPU::S_AND_B64)
      ClampOpc = AMDGPU::V_AND_B32_e64;
    else if (Opc == AMDGPU::S_OR_B64)
      ClampOpc = AMDGPU::V_OR_B32_e64;
    else if (Opc == AMDGPU::S_XOR_B64)
      ClampOpc = AMDGPU::V_XOR_B32_e64;

      ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);

  return {DPPOpc, ClampOpc};
static std::pair<Register, Register>

      TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);

      TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub0, SrcSubRC);
      TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub1, SrcSubRC);
  return {Op1L, Op1H};
  unsigned Strategy = static_cast<unsigned>(MI.getOperand(2).getImm());
  enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };

  unsigned MIOpc = MI.getOpcode();
  case AMDGPU::S_MIN_U32:
  case AMDGPU::S_MIN_I32:
  case AMDGPU::V_MIN_F32_e64:
  case AMDGPU::S_MAX_U32:
  case AMDGPU::S_MAX_I32:
  case AMDGPU::V_MAX_F32_e64:
  case AMDGPU::S_AND_B32:
  case AMDGPU::S_OR_B32: {

  case AMDGPU::V_CMP_LT_U64_e64:
  case AMDGPU::V_CMP_LT_I64_e64:
  case AMDGPU::V_CMP_GT_U64_e64:
  case AMDGPU::V_CMP_GT_I64_e64:
  case AMDGPU::V_MIN_F64_e64:
  case AMDGPU::V_MIN_NUM_F64_e64:
  case AMDGPU::V_MAX_F64_e64:
  case AMDGPU::V_MAX_NUM_F64_e64:
  case AMDGPU::S_AND_B64:
  case AMDGPU::S_OR_B64: {

  case AMDGPU::S_XOR_B32:
  case AMDGPU::S_XOR_B64:
  case AMDGPU::S_ADD_I32:
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::V_ADD_F32_e64:
  case AMDGPU::V_ADD_F64_e64:
  case AMDGPU::V_ADD_F64_pseudo_e64:
  case AMDGPU::S_SUB_I32:
  case AMDGPU::S_SUB_U64_PSEUDO:
  case AMDGPU::V_SUB_F32_e64: {

    bool IsWave32 = ST.isWave32();
    unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
    MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
    unsigned BitCountOpc =
        IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
    auto NewAccumulator =

    case AMDGPU::S_XOR_B32:
    case AMDGPU::S_XOR_B64: {

          .addReg(NewAccumulator->getOperand(0).getReg())

      if (Opc == AMDGPU::S_XOR_B32) {

        BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);

    case AMDGPU::S_SUB_I32: {

          .addReg(NewAccumulator->getOperand(0).getReg());

    case AMDGPU::S_ADD_I32: {

          .addReg(NewAccumulator->getOperand(0).getReg());

    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {

      if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
            .addReg(NewAccumulator->getOperand(0).getReg())

      Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
                               : NewAccumulator->getOperand(0).getReg();

      Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;

      if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {

      BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);

    case AMDGPU::V_ADD_F32_e64:
    case AMDGPU::V_ADD_F64_e64:
    case AMDGPU::V_ADD_F64_pseudo_e64:
    case AMDGPU::V_SUB_F32_e64: {

              TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
                                  : AMDGPU::V_CVT_F64_I32_e64),
          .addReg(NewAccumulator->getOperand(0).getReg())

      unsigned srcMod = (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
                         MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)

      unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
                            ? AMDGPU::V_MUL_F64_pseudo_e64
                            : AMDGPU::V_MUL_F64_e64;

        BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)

        BuildRegSequence(BB, MI, DstReg, LaneValueLoReg, LaneValueHiReg);
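    // For a uniform source these reductions need no loop at all: the
    // active-lane count from S_BCNT1 feeds the result directly, e.g. a wave
    // add-reduction becomes value * popcount(exec), with the convert and
    // multiply above handling the floating-point cases.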
  bool NeedsMovDPP = !is32BitOpc;

  bool IsWave32 = ST.isWave32();
  unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
  unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
  if (Strategy == WAVE_REDUCE_STRATEGY::ITERATIVE ||

        MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64

            TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
                                : AMDGPU::S_MOV_B64_IMM_PSEUDO),

    I = ComputeLoop->begin();

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)

    I = ComputeLoop->end();

        IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),

            TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),

    auto LaneValue = BuildRegSequence(*ComputeLoop, I, LaneValReg,
                                      LaneValueLoReg, LaneValueHiReg);
6188 LaneValueLoReg, LaneValueHiReg);
6190 case AMDGPU::S_OR_B64:
6191 case AMDGPU::S_AND_B64:
6192 case AMDGPU::S_XOR_B64: {
6195 .
addReg(LaneValue->getOperand(0).getReg())
6199 case AMDGPU::V_CMP_GT_I64_e64:
6200 case AMDGPU::V_CMP_GT_U64_e64:
6201 case AMDGPU::V_CMP_LT_I64_e64:
6202 case AMDGPU::V_CMP_LT_U64_e64: {
6207 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6209 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6213 BuildRegSequence(*ComputeLoop,
I, AccumulatorVReg, SrcReg0Sub0,
6216 .
addReg(LaneValue->getOperand(0).getReg())
6217 .
addReg(AccumulatorVReg);
6219 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6220 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AndOpc), ComparisonResultReg)
6224 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6225 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6226 .
addReg(LaneValue->getOperand(0).getReg())
    case AMDGPU::V_MIN_F64_e64:
    case AMDGPU::V_MIN_NUM_F64_e64:
    case AMDGPU::V_MAX_F64_e64:
    case AMDGPU::V_MAX_NUM_F64_e64:
    case AMDGPU::V_ADD_F64_e64:
    case AMDGPU::V_ADD_F64_pseudo_e64: {
          AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
          TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
      BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
          MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
          .addReg(LaneValue->getOperand(0).getReg())
          TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
          TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
      auto [Op1L, Op1H] = ExtractSubRegs(*Iters, DstVregInst->getOperand(0),
      ReadLaneLo.addReg(Op1L);
      ReadLaneHi.addReg(Op1H);
      BuildRegSequence(*ComputeLoop, I, DstReg, LaneValLo, LaneValHi);
    case AMDGPU::S_ADD_U64_PSEUDO:
    case AMDGPU::S_SUB_U64_PSEUDO: {
          .addReg(LaneValue->getOperand(0).getReg());
    unsigned BITSETOpc =
        IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
    BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
    ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
    unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
        .addReg(NewActiveBitsReg)
    BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
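    // A sketch of the iterative strategy the ComputeLoop block implements
    // (register names illustrative, not the exact virtual registers above):
    //
    //   loop:
    //     Accum      = PHI [Identity, entry], [NewAccum, loop]
    //     ActiveBits = PHI [ExecCopy, entry], [NewActiveBits, loop]
    //     Lane       = S_FF1_I32_B32/B64 ActiveBits   ; lowest live lane
    //     LaneVal    = V_READLANE_B32 Src, Lane
    //     NewAccum   = <reduction op> Accum, LaneVal
    //     NewActiveBits = S_BITSET0_B32/B64 Lane      ; retire that lane
    //     S_CMP_LG + S_CBRANCH_SCC1 loop              ; until no lanes remain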
  assert(ST.hasDPP() && "Subtarget does not support DPP operations");
  BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
      TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
                          : AMDGPU::S_MOV_B64_IMM_PSEUDO),
  auto IdentityCopyInstr =
  unsigned DPPOpc = std::get<0>(DPPClampOpcPair);
  unsigned ClampOpc = std::get<1>(DPPClampOpcPair);
  if (isFPOp && !NeedsMovDPP)
  if (isFPOp && !NeedsMovDPP)
  if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
      bool isAddSub = false,
      bool needsCarryIn = false,
    unsigned InstrOpc = ClampOpc;
      InstrOpc = AMDGPU::V_ADDC_U32_e64;
    auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(InstrOpc), Dst);
      ClampInstr.addReg(CarryOutReg,
    ClampInstr.addReg(Src0);
    ClampInstr.addReg(Src1);
    if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
      ClampInstr.addImm(0);
      ClampInstr.addImm(0);
    LastBcastInstr = ClampInstr;
      Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
  bool isBitWiseOpc = Opc == AMDGPU::S_AND_B64 || Opc == AMDGPU::S_OR_B64 ||
                      Opc == AMDGPU::S_XOR_B64;
  if (isAddSubOpc || isBitWiseOpc) {
    auto [Src0Lo, Src0Hi] =
    auto [Src1Lo, Src1Hi] =
    Register CarryReg =
        BuildClampInstr(ResLo, Src0Lo, Src1Lo, isAddSubOpc, false);
    BuildClampInstr(ResHi, Src0Hi, Src1Hi, isAddSubOpc, isAddSubOpc, CarryReg);
    BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
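    // 64-bit integer ops have no single-instruction DPP form, so they are
    // split into 32-bit halves: the low halves combine first and, for
    // add/sub, produce a carry that feeds the high-half combine before the
    // halves are reassembled with a REG_SEQUENCE. Bitwise ops take the same
    // path minus the carry.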
    SrcWithIdentityInstr =
        BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
        MI, IdentityCopyInstr->getOperand(0), SrcRegClass, ST, MRI);
    auto [SrcReg0Sub0, SrcReg0Sub1] =
    BuildSetInactiveInstr(SrcWithIdentitylo, SrcReg0Sub0, Reg0Sub0);
    BuildSetInactiveInstr(SrcWithIdentityhi, SrcReg0Sub1, Reg0Sub1);
    SrcWithIdentityInstr = BuildRegSequence(*CurrBB, MI, SrcWithIdentity,
  BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
  DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
  BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
  DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
  BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
  DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
  BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
  DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
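  // This is the classic log2(wavesize) DPP scan: each step combines a lane
  // with the lane row_shr:N positions below it, doubling the shift (1, 2, 4,
  // 8) so that after four steps every lane holds the reduction of its entire
  // 16-lane row. The cross-row steps (row_bcast:15/31, or a byte permute on
  // targets without DPP broadcasts) follow below.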
  if (ST.hasDPPBroadcasts()) {
    RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
    BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
    BuildRegSequence(*CurrBB, MI, SwizzledValue64, SwizzledValuelo,
    RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
    BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
  FinalDPPResult = RowBcast15;
  if (ST.hasDPPBroadcasts()) {
    RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
    BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
    BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
    BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
    BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
        .addReg(ShiftedThreadID);
        .addReg(PermuteByteOffset)
    auto [RowBcast15Lo, RowBcast15Hi] =
        .addReg(PermuteByteOffset)
        .addReg(PermuteByteOffset)
    BuildRegSequence(*CurrBB, MI, PermutedValue, PermutedValuelo,
    RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
    BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
  FinalDPPResult = RowBcast31;
  if (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
      MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) {
        .addReg(IsWave32 ? RowBcast15 : RowBcast31)
    FinalDPPResult = NegatedValVGPR;
      .addImm(ST.getWavefrontSize() - 1);
      .addImm(ST.getWavefrontSize() - 1);
      .addImm(ST.getWavefrontSize() - 1);
  BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
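  // After the scan, lane (wavefront_size - 1) holds the full-wave result,
  // hence the readlanes from that fixed lane to move the value into SGPRs.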
  if (Opc == AMDGPU::S_SUB_I32) {
    BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
  } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
    auto NegatedValInstr =
        .addReg(Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
  MI.eraseFromParent();
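  // Subtraction reductions reuse the addition scan: summing every lane and
  // then negating the total is equivalent to 0 - (v0 + v1 + ...), which is
  // what the trailing S_SUB / negate above produces.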
  switch (MI.getOpcode()) {
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
        ? AMDGPU::V_MIN_NUM_F64_e64
        : AMDGPU::V_MIN_F64_e64);
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
  case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
  case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
        ? AMDGPU::V_MAX_NUM_F64_e64
        : AMDGPU::V_MAX_F64_e64);
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
        ? AMDGPU::V_ADD_F64_pseudo_e64
        : AMDGPU::V_ADD_F64_e64);
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
  case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
  case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
        ? AMDGPU::V_ADD_F64_pseudo_e64
        : AMDGPU::V_ADD_F64_e64);
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
  case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
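  // These pseudos are selected from the llvm.amdgcn.wave.reduce.* intrinsics.
  // A minimal IR-level sketch (intrinsic shape assumed from the pseudo names
  // above; the second operand is the strategy hint, 0 = let the backend pick):
  //   %r = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %v, i32 0)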
  case AMDGPU::S_UADDO_PSEUDO:
  case AMDGPU::S_USUBO_PSEUDO: {
    unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
                       ? AMDGPU::S_ADD_U32
                       : AMDGPU::S_SUB_U32;
        Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::S_ADD_U64_PSEUDO:
  case AMDGPU::S_SUB_U64_PSEUDO: {
  case AMDGPU::V_ADD_U64_PSEUDO:
  case AMDGPU::V_SUB_U64_PSEUDO: {
    bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
    if (ST.hasAddSubU64Insts()) {
          TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64 : AMDGPU::V_SUB_U64_e64),
      TII->legalizeOperands(*I);
      MI.eraseFromParent();
    if (IsAdd && ST.hasLshlAddU64Inst()) {
      TII->legalizeOperands(*Add);
      MI.eraseFromParent();
    const auto *CarryRC = TRI->getWaveMaskRegClass();
        : &AMDGPU::VReg_64RegClass;
        : &AMDGPU::VReg_64RegClass;
        TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
        TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
        MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
        MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
        MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
        IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
    unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
    TII->legalizeOperands(*LoHalf);
    TII->legalizeOperands(*HiHalf);
    MI.eraseFromParent();
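    // Fallback expansion when no native 64-bit VALU add/sub exists; the shape
    // it emits is (register names illustrative):
    //   %lo, %carry = V_ADD_CO_U32_e64  %src0.sub0, %src1.sub0
    //   %hi         = V_ADDC_U32_e64    %src0.sub1, %src1.sub1, %carry
    //   %dst        = REG_SEQUENCE      %lo, sub0, %hi, sub1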
  case AMDGPU::S_ADD_CO_PSEUDO:
  case AMDGPU::S_SUB_CO_PSEUDO: {
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
    if (TRI->isVectorRegister(MRI, Src2.getReg())) {
      BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
    if (ST.isWave64()) {
      if (ST.hasScalarCompareEq64()) {
            TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
            MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
            MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
        BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
    unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
                       ? AMDGPU::S_ADDC_U32
                       : AMDGPU::S_SUBB_U32;
        ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
    MI.eraseFromParent();
  case AMDGPU::SI_INIT_M0: {
        TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
    MI.eraseFromParent();
  case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
        TII->get(AMDGPU::S_CMP_EQ_U32))
  case AMDGPU::GET_GROUPSTATICSIZE: {
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::GET_SHADERCYCLESHILO: {
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
        .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
        .add(MI.getOperand(0))
    MI.eraseFromParent();
  case AMDGPU::SI_INDIRECT_SRC_V1:
  case AMDGPU::SI_INDIRECT_SRC_V2:
  case AMDGPU::SI_INDIRECT_SRC_V3:
  case AMDGPU::SI_INDIRECT_SRC_V4:
  case AMDGPU::SI_INDIRECT_SRC_V5:
  case AMDGPU::SI_INDIRECT_SRC_V6:
  case AMDGPU::SI_INDIRECT_SRC_V7:
  case AMDGPU::SI_INDIRECT_SRC_V8:
  case AMDGPU::SI_INDIRECT_SRC_V9:
  case AMDGPU::SI_INDIRECT_SRC_V10:
  case AMDGPU::SI_INDIRECT_SRC_V11:
  case AMDGPU::SI_INDIRECT_SRC_V12:
  case AMDGPU::SI_INDIRECT_SRC_V16:
  case AMDGPU::SI_INDIRECT_SRC_V32:
  case AMDGPU::SI_INDIRECT_DST_V1:
  case AMDGPU::SI_INDIRECT_DST_V2:
  case AMDGPU::SI_INDIRECT_DST_V3:
  case AMDGPU::SI_INDIRECT_DST_V4:
  case AMDGPU::SI_INDIRECT_DST_V5:
  case AMDGPU::SI_INDIRECT_DST_V6:
  case AMDGPU::SI_INDIRECT_DST_V7:
  case AMDGPU::SI_INDIRECT_DST_V8:
  case AMDGPU::SI_INDIRECT_DST_V9:
  case AMDGPU::SI_INDIRECT_DST_V10:
  case AMDGPU::SI_INDIRECT_DST_V11:
  case AMDGPU::SI_INDIRECT_DST_V12:
  case AMDGPU::SI_INDIRECT_DST_V16:
  case AMDGPU::SI_INDIRECT_DST_V32:
  case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
  case AMDGPU::SI_KILL_I1_PSEUDO:
  case AMDGPU::V_CNDMASK_B64_PSEUDO: {
  case AMDGPU::SI_BR_UNDEF: {
        .add(MI.getOperand(0));
    MI.eraseFromParent();
  case AMDGPU::ADJCALLSTACKUP:
  case AMDGPU::ADJCALLSTACKDOWN: {
  case AMDGPU::SI_CALL_ISEL: {
    unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
    MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
    MI.eraseFromParent();
  case AMDGPU::V_ADD_CO_U32_e32:
  case AMDGPU::V_SUB_CO_U32_e32:
  case AMDGPU::V_SUBREV_CO_U32_e32: {
    unsigned Opc = MI.getOpcode();
    bool NeedClampOperand = false;
    if (TII->pseudoToMCOpcode(Opc) == -1) {
      NeedClampOperand = true;
    if (TII->isVOP3(*I)) {
    I.add(MI.getOperand(1)).add(MI.getOperand(2));
    if (NeedClampOperand)
    TII->legalizeOperands(*I);
    MI.eraseFromParent();
  case AMDGPU::V_ADDC_U32_e32:
  case AMDGPU::V_SUBB_U32_e32:
  case AMDGPU::V_SUBBREV_U32_e32:
    TII->legalizeOperands(MI);
  case AMDGPU::DS_GWS_INIT:
  case AMDGPU::DS_GWS_SEMA_BR:
  case AMDGPU::DS_GWS_BARRIER:
  case AMDGPU::DS_GWS_SEMA_V:
  case AMDGPU::DS_GWS_SEMA_P:
  case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
  case AMDGPU::S_SETREG_B32: {
    const unsigned SetMask = WidthMask << Offset;
    unsigned SetDenormOp = 0;
    unsigned SetRoundOp = 0;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
      SetRoundOp = AMDGPU::S_ROUND_MODE;
      SetDenormOp = AMDGPU::S_DENORM_MODE;
    if (SetRoundOp || SetDenormOp) {
      if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
        unsigned ImmVal = Def->getOperand(1).getImm();
          MI.eraseFromParent();
    MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
  case AMDGPU::S_INVERSE_BALLOT_U32:
  case AMDGPU::S_INVERSE_BALLOT_U64:
    MI.setDesc(TII->get(AMDGPU::COPY));
  case AMDGPU::ENDPGM_TRAP: {
    MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
    MI.eraseFromParent();
  case AMDGPU::SIMULATED_TRAP: {
    assert(Subtarget->hasPrivEnabledTrap2NopBug());
    TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
    MI.eraseFromParent();
  case AMDGPU::SI_TCRETURN_GFX_WholeWave:
  case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
    assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
    Register OriginalExec = Setup->getOperand(0).getReg();
    MI.getOperand(0).setReg(OriginalExec);
  return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
  return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
  if (!Subtarget->hasMadMacF32Insts())
    return Subtarget->hasFastFMAF32();
    return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
  return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
  switch (Ty.getScalarSizeInBits()) {
  if (Ty.getScalarSizeInBits() == 16)
  if (Ty.getScalarSizeInBits() == 32)
    return Subtarget->hasMadMacF32Insts() &&
  EVT VT = N->getValueType(0);
  return Subtarget->hasMadMacF32Insts() &&
  if (VT == MVT::f16) {
    return Subtarget->hasMadF16() &&
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
  [[maybe_unused]] EVT VT = Op.getValueType();
  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
          VT == MVT::v16i32) &&
         "Unexpected ValueType.");
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
         VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
         VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
         VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
         VT == MVT::v32bf16);
      DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
      DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
  unsigned Opc = Op.getOpcode();
  EVT VT = Op.getValueType();
  assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
         VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
         VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
         VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
         VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
         VT == MVT::v32bf16);
      : std::pair(Op0, Op0);
      DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
      DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
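// A minimal sketch of the two-way split the helpers above implement (using
// the standard SelectionDAG helpers; variable names illustrative):
//   auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
//   auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
//   SDValue Lo = DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1);
//   SDValue Hi = DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1);
//   return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Lo, Hi);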
  switch (Op.getOpcode()) {
    return LowerBRCOND(Op, DAG);
    return LowerRETURNADDR(Op, DAG);
    return LowerSPONENTRY(Op, DAG);
    assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
           "Load should return a value and a chain");
    EVT VT = Op.getValueType();
      return lowerFSQRTF32(Op, DAG);
      return lowerFSQRTF64(Op, DAG);
    return LowerTrig(Op, DAG);
    return LowerSELECT(Op, DAG);
    return LowerFDIV(Op, DAG);
    return LowerFFREXP(Op, DAG);
    return LowerATOMIC_CMP_SWAP(Op, DAG);
    return LowerSTORE(Op, DAG);
    return LowerGlobalAddress(MFI, Op, DAG);
    return LowerExternalSymbol(Op, DAG);
    return LowerINTRINSIC_WO_CHAIN(Op, DAG);
    return LowerINTRINSIC_W_CHAIN(Op, DAG);
    return LowerINTRINSIC_VOID(Op, DAG);
    return lowerADDRSPACECAST(Op, DAG);
    return lowerINSERT_SUBVECTOR(Op, DAG);
    return lowerINSERT_VECTOR_ELT(Op, DAG);
    return lowerEXTRACT_VECTOR_ELT(Op, DAG);
    return lowerVECTOR_SHUFFLE(Op, DAG);
    return lowerSCALAR_TO_VECTOR(Op, DAG);
    return lowerBUILD_VECTOR(Op, DAG);
    return lowerFP_ROUND(Op, DAG);
    return lowerTRAP(Op, DAG);
    return lowerDEBUGTRAP(Op, DAG);
    return lowerFMINNUM_FMAXNUM(Op, DAG);
    return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
    return lowerFMINIMUM_FMAXIMUM(Op, DAG);
    return lowerFLDEXP(Op, DAG);
    if (Op.getValueType() == MVT::i16 &&
        Op.getOperand(0).getValueType() == MVT::f32) {
    return lowerFCOPYSIGN(Op, DAG);
    return lowerMUL(Op, DAG);
    return lowerXMULO(Op, DAG);
    return lowerXMUL_LOHI(Op, DAG);
    return LowerINLINEASM(Op, DAG);
  EVT FittingLoadVT = LoadVT;
SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
                                              bool IsIntrinsic) const {
  bool Unpacked = Subtarget->hasUnpackedD16VMem();
  EVT LoadVT = M->getValueType(0);
  EVT EquivLoadVT = LoadVT;
  SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
      M->getMemoryVT(), M->getMemOperand());
  EVT LoadVT = M->getValueType(0);
  assert(M->getNumValues() == 2 || M->getNumValues() == 3);
  bool IsTFE = M->getNumValues() == 3;
  unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
                                   : AMDGPUISD::BUFFER_LOAD_FORMAT)
                 : IsTFE  ? AMDGPUISD::BUFFER_LOAD_TFE
                          : AMDGPUISD::BUFFER_LOAD;
    return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
    return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
  return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
                             M->getMemOperand(), DAG);
  SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
      M->getMemOperand(), DAG);
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  EVT CmpVT = LHS.getValueType();
  if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
    unsigned PromoteOp =
  EVT VT = N->getValueType(0);
  unsigned CondCode = N->getConstantOperandVal(3);
  if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
  SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
  EVT VT = N->getValueType(0);
    Exec = AMDGPU::EXEC_LO;
    Exec = AMDGPU::EXEC;
  EVT VT = N->getValueType(0);
  unsigned IID = N->getConstantOperandVal(0);
  bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
                      IID == Intrinsic::amdgcn_permlanex16;
  bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
                       IID == Intrinsic::amdgcn_set_inactive_chain_arg;
  bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
                           IID == Intrinsic::amdgcn_permlane_up ||
                           IID == Intrinsic::amdgcn_permlane_down ||
                           IID == Intrinsic::amdgcn_permlane_xor;
  unsigned SplitSize = 32;
  if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
      ST->hasDPALU_DPP() &&
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane_bcast:
  case Intrinsic::amdgcn_permlane_up:
  case Intrinsic::amdgcn_permlane_down:
  case Intrinsic::amdgcn_permlane_xor:
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_permlane64:
  std::reverse(Operands.begin(), Operands.end());
  if (SDNode *GL = N->getGluedNode()) {
    GL = GL->getOperand(0).getNode();
  if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
      IID == Intrinsic::amdgcn_mov_dpp8 ||
      IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
      IsPermlaneShuffle) {
    Src1 = N->getOperand(2);
    if (IID == Intrinsic::amdgcn_writelane ||
        IID == Intrinsic::amdgcn_update_dpp || IsPermLane16 ||
      Src2 = N->getOperand(3);
  if (ValSize == SplitSize) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
    if (IID == Intrinsic::amdgcn_writelane) {
    SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
    return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
  if (ValSize % SplitSize != 0)
  EVT VT = N->getValueType(0);
  unsigned NumOperands = N->getNumOperands();
  SDNode *GL = N->getGluedNode();
  for (unsigned i = 0; i != NE; ++i) {
    for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
      SDValue Operand = N->getOperand(j);
      Operands[j] = Operand;
    Operands[NumOperands - 1] =
  if (SplitSize == 32) {
    return unrollLaneOp(LaneOp.getNode());
  unsigned SubVecNumElt =
  SDValue Src0SubVec, Src1SubVec, Src2SubVec;
  for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
    if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
          createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT));
    } else if (IID == Intrinsic::amdgcn_writelane) {
          createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
      Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2, SubVecVT));
    EltIdx += SubVecNumElt;
  if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
  if (IID == Intrinsic::amdgcn_writelane)
  SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
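  // Lane intrinsics only exist as 32-bit (or, with DPALU_DPP, 64-bit)
  // hardware operations, so wider values are bitcast to a vector of
  // SplitSize-bit pieces, the lane op is applied piecewise, and the pieces
  // are concatenated and bitcast back to the original type.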
  EVT VT = N->getValueType(0);
  auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
    Operands.append(IntrinArgs);
  SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                   {ShiftedIndex, ValueI32});
  SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ValueI32, PoisonVal});
  SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
                                   {ShiftedIndex, PoisonVal});
      MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
  SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
                                        {WWMIndex, WWMValue});
  SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
                                         MVT::i32, {WWMIndex, Swapped});
      MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
      MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
  DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
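  // ds_bpermute can only move data within a 32-lane half of a wave64, so the
  // shuffle is built from two candidates: a bpermute over the local half and
  // a bpermute over a permlane64-swapped copy (the other half). A per-lane
  // select then picks whichever half the requested source lane lives in.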
  switch (N->getOpcode()) {
  unsigned IID = N->getConstantOperandVal(0);
  case Intrinsic::amdgcn_make_buffer_rsrc:
    Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
  case Intrinsic::amdgcn_cvt_pkrtz: {
        DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;
    EVT VT = N->getValueType(0);
  case Intrinsic::amdgcn_s_buffer_load: {
    if (!Subtarget->hasScalarSubwordLoads())
    EVT VT = Op.getValueType();
    assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
    if (!Offset->isDivergent()) {
      LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  case Intrinsic::amdgcn_dead: {
    for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
  for (unsigned I = 0; I < Res.getNumOperands(); I++) {
    Results.push_back(Res.getOperand(I));
  Results.push_back(Res.getValue(1));
  EVT VT = N->getValueType(0);
  EVT SelectVT = NewVT;
  if (NewVT.bitsLT(MVT::i32)) {
    SelectVT = MVT::i32;
  if (NewVT != SelectVT)
  if (N->getValueType(0) != MVT::v2f16)
  if (N->getValueType(0) != MVT::v2f16)
  if (N->getValueType(0) != MVT::f16)
    if (U.get() != Value)
    if (U.getUser()->getOpcode() == Opcode)
unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
  case Intrinsic::amdgcn_if:
    return AMDGPUISD::IF;
  case Intrinsic::amdgcn_else:
    return AMDGPUISD::ELSE;
  case Intrinsic::amdgcn_loop:
    return AMDGPUISD::LOOP;
  case Intrinsic::amdgcn_end_cf:
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
  assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
                                  "when object linking is enabled");
  SDNode *Intr = BRCOND.getOperand(1).getNode();
    Intr = LHS.getNode();
  assert(BR && "brcond missing unconditional branch user");
  unsigned CFNode = isCFIntrinsic(Intr);
  Ops.push_back(Target);
  for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
  MVT VT = Op.getSimpleValueType();
  if (Op.getConstantOperandVal(0) != 0)
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (Info->isEntryFunction())
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
  return Op.getValueType().bitsLE(VT)
  EVT DstVT = Op.getValueType();
  unsigned Opc = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
    return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
  if (DstVT == MVT::f16) {
    if (!Subtarget->has16BitInsts()) {
    if (Op->getFlags().hasApproximateFuncs()) {
         "custom lower FP_ROUND for f16 or bf16");
  assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
SDValue
SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
  EVT VT = Op.getValueType();
  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  bool IsIEEEMode = Info->getMode().IEEE;
  if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
  EVT VT = Op.getValueType();
  assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
         !Subtarget->hasMinimum3Maximum3F16() &&
         Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
         "should not need to widen f16 minimum/maximum to v2f16");
      DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
  EVT VT = Op.getValueType();
  EVT ExpVT = Exp.getValueType();
  if (ExpVT == MVT::i16)
      {Op.getOperand(0), Op.getOperand(1), TruncExp});
  switch (Op->getOpcode()) {
                                             DAGCombinerInfo &DCI) const {
  const unsigned Opc = Op.getOpcode();
      : Op->getOperand(0).getValueType();
  auto &DAG = DCI.DAG;
  if (DCI.isBeforeLegalizeOps() ||
    LHS = Op->getOperand(1);
    RHS = Op->getOperand(2);
    LHS = Op->getOperand(0);
    RHS = Op->getOperand(1);
  if (MagVT == SignVT)
  EVT VT = Op.getValueType();
  assert(VT == MVT::i64 && "The following code is special for s_mul_u64");
  if (Op->isDivergent())
  if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
  if (Op0SignBits >= 33 && Op1SignBits >= 33)
    return SDValue(
        DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
  EVT VT = Op.getValueType();
  const APInt &C = RHSC->getAPIntValue();
  if (C.isPowerOf2()) {
    bool UseArithShift = isSigned && !C.isMinSignedValue();
  if (Op->isDivergent()) {
  if (Subtarget->hasSMulHi()) {
  if (!Subtarget->hasTrapHandler() ||
    return lowerTrapEndpgm(Op, DAG);
  return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
                                            : lowerTrapHsaQueuePtr(Op, DAG);
  return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
SDValue
SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
                                             ImplicitParameter Param) const {
  MachinePointerInfo PtrInfo =
      loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (UserSGPR == AMDGPU::NoRegister) {
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
  if (Subtarget->hasPrivEnabledTrap2NopBug())
    return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
  if (!Subtarget->hasTrapHandler() ||
      "debugtrap handler not supported",
  return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
  const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
  SmallSet<Register, 8> SGPRInputRegs;
  unsigned NumVals = 0;
    const InlineAsm::Flag Flags(Op.getConstantOperandVal(I));
    NumVals = Flags.getNumOperandRegisters();
        NumVals > 0 && Flags.hasRegClassConstraint(RCID) &&
        TRI->isSGPRClass(TRI->getRegClass(RCID));
    for (unsigned J = 0; J < NumVals; ++J) {
      if (const RegisterSDNode *RegNode =
  if (SGPRInputRegs.empty())
  SDNode *N = Op.getOperand(NumOps - 1).getNode();
                                 ReadFirstLaneID, SrcVal);
  if (N->getNumOperands() > 3)
    Ops.push_back(N->getOperand(3));
  SDNode *Next = nullptr;
  for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) {
    if (N->getOperand(I).getValueType() == MVT::Glue) {
      Next = N->getOperand(I).getNode();
SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
  if (Subtarget->hasApertureRegs()) {
        ? AMDGPU::SRC_SHARED_BASE
        : AMDGPU::SRC_PRIVATE_BASE;
    assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
            !Subtarget->hasGloballyAddressableScratch()) &&
           "Cannot use src_private_base with globally addressable scratch!");
    return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  if (UserSGPR == AMDGPU::NoRegister) {
  const AMDGPUTargetMachine &TM =
  unsigned DestAS, SrcAS;
  bool IsNonNull = false;
    SrcAS = ASC->getSrcAddressSpace();
    Src = ASC->getOperand(0);
    DestAS = ASC->getDestAddressSpace();
           Op.getConstantOperandVal(0) ==
               Intrinsic::amdgcn_addrspacecast_nonnull);
    Src = Op->getOperand(1);
    SrcAS = Op->getConstantOperandVal(2);
    DestAS = Op->getConstantOperandVal(3);
      Subtarget->hasGloballyAddressableScratch()) {
        AMDGPU::S_MOV_B32, SL, MVT::i32,
        DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
      Subtarget->hasGloballyAddressableScratch()) {
    if (Subtarget->isWave64())
        57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
        AMDGPU::S_MOV_B64, SL, MVT::i64,
        DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
    CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
  SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
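  // A cast from a 32-bit LDS/private pointer to a 64-bit flat pointer is a
  // pair build: the segment offset supplies the low 32 bits and the aperture
  // base (from the aperture registers or the implicit queue pointer) supplies
  // the high 32 bits. The nonnull variants skip the null-pointer compare.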
      Op.getValueType() == MVT::i64) {
    const SIMachineFunctionInfo *Info =
    if (Info->get32BitAddressHighBits() == 0)
      Src.getValueType() == MVT::i64)
  assert(InsNumElts % 2 == 0 && "expect legal vector types");
  EVT NewInsVT = InsNumElts == 2 ? MVT::i32
                                 : EVT::getVectorVT(*DAG.getContext(),
                                                    MVT::i32, InsNumElts / 2);
  for (unsigned I = 0; I != InsNumElts / 2; ++I) {
    if (InsNumElts == 2) {
  for (unsigned I = 0; I != InsNumElts; ++I) {
  if (NumElts == 4 && EltSize == 16 && KIdx) {
    unsigned Idx = KIdx->getZExtValue();
    bool InsertLo = Idx < 2;
        DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
        : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
  assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
  EVT ResultVT = Op.getValueType();
  if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
  if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
    if (VecSize == 128) {
    } else if (VecSize == 256) {
      for (unsigned P = 0; P < 4; ++P) {
                                Parts[0], Parts[1]));
                                Parts[2], Parts[3]));
      for (unsigned P = 0; P < 8; ++P) {
                                Parts[0], Parts[1], Parts[2], Parts[3]));
                                Parts[4], Parts[5], Parts[6], Parts[7]));
    Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
  if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
    return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
    return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
           !(Mask[Elt + 1] & 1);
  EVT ResultVT = Op.getValueType();
  const int NewSrcNumElts = 2;
  int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
  const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
    if (ShouldUseConsecutiveExtract &&
      int VecIdx = Idx < SrcNumElts ? 0 : 1;
      int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
      if (Idx0 >= SrcNumElts) {
      if (Idx1 >= SrcNumElts) {
      int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
      int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
      int NewMaskIdx0 = Idx0 - AlignedIdx0;
      int NewMaskIdx1 = Idx1 - AlignedIdx1;
      if (SubVec0 != SubVec1) {
        NewMaskIdx1 += NewSrcNumElts;
                                       {NewMaskIdx0, NewMaskIdx1});
      int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
      int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
      int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
      int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
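  // Two-element (v2i16/v2f16-style) shuffles are cheap on these targets, so a
  // wide shuffle is decomposed into a series of two-element sub-shuffles:
  // each output pair is taken either from one aligned source pair or stitched
  // from two, using the aligned indices computed above.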
  EVT ResultVT = Op.getValueType();
  EVT VT = Op.getValueType();
  if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
    assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
  for (unsigned P = 0; P < NumParts; ++P) {
        PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
  if (!Subtarget->isAmdHsaOS())
    return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
  return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
  EVT PtrVT = Op.getValueType();
  const GlobalValue *GV = GSD->getGlobal();
    assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
    return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
  if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
    if (Subtarget->has64BitLiterals()) {
  MachinePointerInfo PtrInfo =
      Fn, "unsupported external symbol", Op.getDebugLoc()));
  SDValue Param = lowerKernargMemParameter(
      "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
      "intrinsic not supported on subtarget", DL.getDebugLoc()));
  unsigned NumElts = Elts.size();
  if (NumElts <= 12) {
  for (unsigned i = 0; i < Elts.size(); ++i) {
  for (unsigned i = Elts.size(); i < NumElts; ++i)
  EVT SrcVT = Src.getValueType();
                             bool Unpacked, bool IsD16, int DMaskPop,
                             int NumVDataDwords, bool IsAtomicPacked16Bit,
  EVT ReqRetVT = ResultTypes[0];
  int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
                          ? (ReqRetNumElts + 1) / 2
  int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
      NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
      MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
  if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
        SDValue(Result, 0), ZeroIdx);
        SDValue(Result, 0), ZeroIdx);
  if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
                          NumDataDwords - MaskPopDwords);
  EVT LegalReqRetVT = ReqRetVT;
  if (!Data.getValueType().isInteger())
                       Data.getValueType().changeTypeToInteger(), Data);
  if (Result->getNumValues() == 1)
                       SDValue *LWE, bool &IsTexFail) {
                                unsigned DimIdx, unsigned EndIdx,
                                unsigned NumGradients) {
  for (unsigned I = DimIdx; I < EndIdx; I++) {
    if (((I + 1) >= EndIdx) ||
        ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
                                         I == DimIdx + NumGradients - 1))) {
      !Op.getNode()->hasAnyUseOfValue(0))
  const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
    ResultTypes.erase(&ResultTypes[0]);
  bool IsD16 = false;
  bool IsG16 = false;
  bool IsA16 = false;
  int NumVDataDwords = 0;
  bool AdjustRetType = false;
  bool IsAtomicPacked16Bit = false;
  const unsigned ArgOffset = WithChain ? 2 : 1;
  unsigned DMaskLanes = 0;
  if (BaseOpcode->Atomic) {
    VData = Op.getOperand(2);
    IsAtomicPacked16Bit =
        (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
         IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
      ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
      DMask = Is64Bit ? 0xf : 0x3;
      NumVDataDwords = Is64Bit ? 4 : 2;
      DMask = Is64Bit ? 0x3 : 0x1;
      NumVDataDwords = Is64Bit ? 2 : 1;
    DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
    if (BaseOpcode->Store) {
      VData = Op.getOperand(2);
      if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
        VData = handleD16VData(VData, DAG, true);
      NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
    } else if (!BaseOpcode->NoReturn) {
      if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
          (!LoadVT.isVector() && DMaskLanes > 1))
      if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
          !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
        NumVDataDwords = (DMaskLanes + 1) / 2;
        NumVDataDwords = DMaskLanes;
      AdjustRetType = true;
  unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
  MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
  VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
  MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
  IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
    if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
                {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
             "Bias needs to be converted to 16 bit in A16 mode");
  if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
    dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
               "require 16 bit args for both gradients and addresses");
    if (!ST->hasA16()) {
      LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
                           "support 16 bit addresses\n");
  if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
    const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
    IntrOpcode = G16MappingInfo->G16;
  for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
  const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
  const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
  const bool UseNSA = ST->hasNSAEncoding() &&
                      VAddrs.size() >= ST->getNSAThreshold(MF) &&
                      (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
  const bool UsePartialNSA =
      UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
  if (UsePartialNSA) {
        ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
  } else if (!UseNSA) {
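  // NSA (non-sequential addressing) lets each address component live in its
  // own VGPR. Without it, the components are first merged into one contiguous
  // register tuple; with only partial NSA support, the trailing components
  // from index NSAMaxSize - 1 onward are merged and passed as the final
  // address operand.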
  uint64_t UnormConst =
      Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
  Unorm = UnormConst ? True : False;
  bool IsTexFail = false;
  if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
      NumVDataDwords = 1;
      NumVDataDwords += 1;
    AdjustRetType = true;
  if (AdjustRetType) {
    if (DMaskLanes == 0 && !BaseOpcode->Store) {
10391 ResultTypes[0] = NewVT;
10392 if (ResultTypes.size() == 3) {
10396 ResultTypes.erase(&ResultTypes[1]);
10410 Ops.push_back(VData);
10411 if (UsePartialNSA) {
10413 Ops.push_back(VAddr);
10417 Ops.push_back(VAddr);
10420 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10422 Ops.push_back(Rsrc);
10427 Ops.push_back(Samp);
10432 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
10433 Ops.push_back(Unorm);
10435 Ops.push_back(IsA16 &&
10436 ST->hasFeature(AMDGPU::FeatureR128A16)
10440 Ops.push_back(IsA16 ? True : False);
10442 if (!Subtarget->hasGFX90AInsts())
10443 Ops.push_back(TFE);
10447 "TFE is not supported on this GPU",
DL.getDebugLoc()));
10450 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
10451 Ops.push_back(LWE);
10453 Ops.push_back(DimInfo->
DA ? True : False);
10455 Ops.push_back(IsD16 ? True : False);
10457 Ops.push_back(
Op.getOperand(0));
10459 int NumVAddrDwords =
10465 NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX12Plus) {
                                   NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX11Plus) {
        UseNSA ? AMDGPU::MIMGEncGfx11NSA
               : AMDGPU::MIMGEncGfx11Default,
        NumVDataDwords, NumVAddrDwords);
  } else if (IsGFX10Plus) {
        UseNSA ? AMDGPU::MIMGEncGfx10NSA
               : AMDGPU::MIMGEncGfx10Default,
        NumVDataDwords, NumVAddrDwords);
    if (Subtarget->hasGFX90AInsts()) {
                                   NumVDataDwords, NumVAddrDwords);
      if (Opcode == -1) {
            "requested image instruction is not supported on this GPU",
            DL.getDebugLoc()));
  for (EVT VT : OrigResultTypes) {
    if (VT == MVT::Other)
      RetValues[Idx++] = Op.getOperand(0);
    if (Opcode == -1 &&
                                 NumVDataDwords, NumVAddrDwords);
                                 NumVDataDwords, NumVAddrDwords);
  MachineMemOperand *MemRef = MemOp->getMemOperand();
      Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
      NumVDataDwords, IsAtomicPacked16Bit, DL);
      MachinePointerInfo(),
  if (!Offset->isDivergent()) {
    if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
        !Subtarget->hasScalarDwordx3Loads()) {
          AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
  if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
    return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
  unsigned NumLoads = 1;
  if (NumElts == 8 || NumElts == 16) {
    NumLoads = NumElts / 4;
  SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
      NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
  uint64_t InstOffset = Ops[5]->getAsZExtVal();
  for (unsigned i = 0; i < NumLoads; ++i) {
    Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList,
                                        Ops, LoadVT, LoadMMO, DAG));
  if (NumElts == 8 || NumElts == 16)
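  // Wide s_buffer_load results (v8/v16 x i32) are assembled from NumElts / 4
  // dwordx4 buffer loads issued at consecutive 16-byte offsets and then
  // concatenated back into the requested vector type.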
  if (!Subtarget->hasArchitectedSGPRs())
  return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
                                unsigned Width) const {
  using namespace AMDGPU::Hwreg;
      AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
  auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
  EVT VT = Op.getValueType();
  unsigned IntrinsicID = Op.getConstantOperandVal(0);
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_implicit_buffer_ptr: {
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_dispatch_ptr:
  case Intrinsic::amdgcn_queue_ptr: {
    if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
          MF.getFunction(), "unsupported hsa intrinsic without hsa target",
          DL.getDebugLoc()));
    auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
    return getPreloadedValue(DAG, *MFI, VT, RegID);
  case Intrinsic::amdgcn_implicitarg_ptr: {
      return getImplicitArgPtr(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_kernarg_segment_ptr: {
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_dispatch_id: {
  case Intrinsic::amdgcn_rcp:
    return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq:
    return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rcp_legacy:
    return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_rsq_clamp: {
    return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
  case Intrinsic::r600_read_ngroups_x:
    if (Subtarget->isAmdHsaOS())
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_ngroups_y:
    if (Subtarget->isAmdHsaOS())
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_ngroups_z:
    if (Subtarget->isAmdHsaOS())
    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
  case Intrinsic::r600_read_local_size_x:
    if (Subtarget->isAmdHsaOS())
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::r600_read_local_size_y:
    if (Subtarget->isAmdHsaOS())
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::r600_read_local_size_z:
    if (Subtarget->isAmdHsaOS())
    return lowerImplicitZextParam(DAG, Op, MVT::i16,
  case Intrinsic::amdgcn_workgroup_id_x:
    return lowerWorkGroupId(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workgroup_id_y:
    return lowerWorkGroupId(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workgroup_id_z:
    return lowerWorkGroupId(DAG, *MFI, VT,
  case Intrinsic::amdgcn_cluster_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, *MFI, VT,
               : DAG.getPOISON(VT);
  case Intrinsic::amdgcn_cluster_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_cluster_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_cluster_workgroup_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
  case Intrinsic::amdgcn_cluster_workgroup_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
  case Intrinsic::amdgcn_cluster_workgroup_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
  case Intrinsic::amdgcn_cluster_workgroup_flat_id:
    return Subtarget->hasClusters()
  case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
  case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
  case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
  case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
    return Subtarget->hasClusters()
               ? getPreloadedValue(
  case Intrinsic::amdgcn_wave_id:
    return lowerWaveID(DAG, Op);
  case Intrinsic::amdgcn_lds_kernel_id: {
      return getLDSKernelId(DAG, DL);
    return getPreloadedValue(DAG, *MFI, VT,
  case Intrinsic::amdgcn_workitem_id_x:
    return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
  case Intrinsic::amdgcn_workitem_id_y:
    return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
  case Intrinsic::amdgcn_workitem_id_z:
    return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
  case Intrinsic::amdgcn_wavefrontsize:
                           SDLoc(Op), MVT::i32);
  case Intrinsic::amdgcn_s_buffer_load: {
    unsigned CPol = Op.getConstantOperandVal(3);
    return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
                        Op.getOperand(3), DAG);
  case Intrinsic::amdgcn_fdiv_fast:
    return lowerFDIV_FAST(Op, DAG);
  case Intrinsic::amdgcn_sin:
    return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_cos:
    return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_mul_u24:
    return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
  case Intrinsic::amdgcn_mul_i24:
    return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
  case Intrinsic::amdgcn_log_clamp: {
  case Intrinsic::amdgcn_fract:
    return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
  case Intrinsic::amdgcn_class:
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
  case Intrinsic::amdgcn_div_fmas:
    return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_div_fixup:
    return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_div_scale: {
    SDValue Denominator = Op.getOperand(2);
    SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
    return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
                       Denominator, Numerator);
  case Intrinsic::amdgcn_icmp: {
    if (Op.getOperand(1).getValueType() == MVT::i1 &&
        Op.getConstantOperandVal(2) == 0 &&
  case Intrinsic::amdgcn_fcmp: {
  case Intrinsic::amdgcn_ballot:
  case Intrinsic::amdgcn_fmed3:
    return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_fdot2:
    return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
  case Intrinsic::amdgcn_fmul_legacy:
    return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
  case Intrinsic::amdgcn_sbfe:
    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_ubfe:
    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_cvt_pkrtz:
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    EVT VT = Op.getValueType();
    if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
      Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
      Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
      Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
    else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
      Opcode = AMDGPUISD::CVT_PK_I16_I32;
    else
      Opcode = AMDGPUISD::CVT_PK_U16_U32;
      return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
        DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
  case Intrinsic::amdgcn_fmad_ftz:
    return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_if_break:
                                     Op->getOperand(1), Op->getOperand(2)),
  case Intrinsic::amdgcn_groupstaticsize: {
    const GlobalValue *GV =
  case Intrinsic::amdgcn_is_shared:
  case Intrinsic::amdgcn_is_private: {
    unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
        Subtarget->hasGloballyAddressableScratch()) {
          AMDGPU::S_MOV_B32, DL, MVT::i32,
          DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
    SDValue Aperture = getSegmentAperture(AS, SL, DAG);
  case Intrinsic::amdgcn_perm:
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
                       Op.getOperand(2), Op.getOperand(3));
  case Intrinsic::amdgcn_reloc_constant: {
  case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i32)
                       Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
                       Op.getOperand(3), IndexKeyi32);
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
  case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
    if (Op.getOperand(4).getValueType() == MVT::i64)
        Op.getOperand(4).getValueType() == MVT::v2i32
        {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
         Op.getOperand(3), IndexKeyi64, Op.getOperand(5), Op.getOperand(6)});
  case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
  case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
  case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
    EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
    if (Op.getOperand(6).getValueType() == IndexKeyTy)
        Op.getOperand(6).getValueType().isVector()
        Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
        Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
        IndexKey, Op.getOperand(7), Op.getOperand(8)};
    if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
      Args.push_back(Op.getOperand(9));
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
  case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
  case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
    if (Op.getOperand(6).getValueType() == MVT::i32)
        {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
         Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
         IndexKeyi32, Op.getOperand(7)});
  case Intrinsic::amdgcn_addrspacecast_nonnull:
    return lowerADDRSPACECAST(Op, DAG);
  case Intrinsic::amdgcn_readlane:
  case Intrinsic::amdgcn_readfirstlane:
  case Intrinsic::amdgcn_writelane:
  case Intrinsic::amdgcn_permlane16:
  case Intrinsic::amdgcn_permlanex16:
  case Intrinsic::amdgcn_permlane64:
  case Intrinsic::amdgcn_set_inactive:
  case Intrinsic::amdgcn_set_inactive_chain_arg:
  case Intrinsic::amdgcn_mov_dpp8:
  case Intrinsic::amdgcn_update_dpp:
  case Intrinsic::amdgcn_permlane_bcast:
  case Intrinsic::amdgcn_permlane_up:
  case Intrinsic::amdgcn_permlane_down:
  case Intrinsic::amdgcn_permlane_xor:
  case Intrinsic::amdgcn_dead: {
    for (const EVT ValTy : Op.getNode()->values())
  case Intrinsic::amdgcn_wave_shuffle:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
      return lowerImage(Op, ImageDimIntr, DAG, false);
  if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
    return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
                                                 unsigned NewOpcode) const {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
                                 M->getMemOperand());
                                                    unsigned NewOpcode) const {
  SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
  auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
                                 M->getMemOperand());
  unsigned IntrID = Op.getConstantOperandVal(1);
  case Intrinsic::amdgcn_ds_ordered_add:
  case Intrinsic::amdgcn_ds_ordered_swap: {
    unsigned IndexOperand = M->getConstantOperandVal(7);
    unsigned WaveRelease = M->getConstantOperandVal(8);
    unsigned WaveDone = M->getConstantOperandVal(9);
    unsigned OrderedCountIndex = IndexOperand & 0x3f;
    IndexOperand &= ~0x3f;
    unsigned CountDw = 0;
      CountDw = (IndexOperand >> 24) & 0xf;
      IndexOperand &= ~(0xf << 24);
      if (CountDw < 1 || CountDw > 4) {
            Fn, "ds_ordered_count: dword count must be between 1 and 4",
            DL.getDebugLoc()));
    if (IndexOperand) {
          Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
    if (WaveDone && !WaveRelease) {
          Fn, "ds_ordered_count: wave_done requires wave_release",
          DL.getDebugLoc()));
    unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
    unsigned ShaderType =
    unsigned Offset0 = OrderedCountIndex << 2;
    unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
      Offset1 |= (CountDw - 1) << 6;
      Offset1 |= ShaderType << 2;
    unsigned Offset = Offset0 | (Offset1 << 8);
11318 M->getVTList(),
Ops,
M->getMemoryVT(),
11319 M->getMemOperand());
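  // Layout of the ds_ordered_count immediate assembled above, as encoded by
  // this code: Offset0 carries the ordered-count index (<< 2); Offset1 packs
  // wave_release (bit 0), wave_done (bit 1), shader type (bits 2-3), the
  // add/swap select (bit 4), and dword count - 1 (bits 6-7); the final value
  // is Offset0 | (Offset1 << 8).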
  case Intrinsic::amdgcn_raw_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_buffer_load:
  case Intrinsic::amdgcn_raw_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
  case Intrinsic::amdgcn_raw_buffer_load_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);

    return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
  }
  case Intrinsic::amdgcn_struct_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_buffer_load:
  case Intrinsic::amdgcn_struct_buffer_load_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
  case Intrinsic::amdgcn_struct_atomic_buffer_load:
  case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
    const bool IsFormat =
        IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
        IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
  case Intrinsic::amdgcn_raw_tbuffer_load:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
                                 Ops);
    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
                               DAG);
  }
  case Intrinsic::amdgcn_struct_tbuffer_load:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
    EVT LoadVT = Op.getValueType();
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);

    if (LoadVT.getScalarType() == MVT::f16)
      return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
                                 Ops);
    return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
                               Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
                               DAG);
  }
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_FMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SWAP);
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMIN);
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_SMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_UMAX);
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_CSUB);
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
    return lowerRawBufferAtomicIntrin(Op, DAG,
                                      AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return lowerStructBufferAtomicIntrin(Op, DAG,
                                         AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
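  // All of the buffer atomics above share two helpers: the raw_* forms have
  // no vindex, so lowerRawBufferAtomicIntrin reads its offsets starting at
  // operand 4, while the struct_* forms carry an extra vindex operand and
  // lowerStructBufferAtomicIntrin reads them one slot later. Both emit the
  // same AMDGPUISD::BUFFER_ATOMIC_* nodes.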
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    EVT VT = Op.getValueType();
    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
    SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
    EVT VT = Op.getValueType();
    return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
                                   Op->getVTList(), Ops, VT,
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
  case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue InstanceMask = M->getOperand(4);
    SDValue RayOrigin = M->getOperand(5);
    SDValue RayDir = M->getOperand(6);
    SDValue TDescr = M->getOperand(8);

    if (!Subtarget->hasBVHDualAndBVH8Insts()) {

    bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
    const unsigned NumVDataDwords = 10;
    const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
    int Opcode = AMDGPU::getMIMGOpcode(
        IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
               : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
        AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);

    Ops.push_back(NodePtr);
    Ops.push_back(DAG.getBuildVector(
        MVT::v2i32, DL,
        {DAG.getBitcast(MVT::i32, RayExtent),
         DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
    Ops.push_back(RayOrigin);
    Ops.push_back(RayDir);
    Ops.push_back(Offsets);
    Ops.push_back(TDescr);
    Ops.push_back(M->getChain());

    MachineMemOperand *MemRef = M->getMemOperand();
  case Intrinsic::amdgcn_image_bvh_intersect_ray: {
    SDValue NodePtr = M->getOperand(2);
    SDValue RayExtent = M->getOperand(3);
    SDValue RayOrigin = M->getOperand(4);
    SDValue RayDir = M->getOperand(5);
    SDValue RayInvDir = M->getOperand(6);
    SDValue TDescr = M->getOperand(7);

    if (!Subtarget->hasGFX10_AEncoding()) {

    const unsigned NumVDataDwords = 4;
    const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
    const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
    const bool UseNSA = (Subtarget->hasNSAEncoding() &&

    const unsigned BaseOpcodes[2][2] = {
        {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
        {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
         AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
    int Opcode;
    if (UseNSA) {
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                     IsGFX12Plus ? AMDGPU::MIMGEncGfx12
                                     : IsGFX11   ? AMDGPU::MIMGEncGfx11NSA
                                                 : AMDGPU::MIMGEncGfx10NSA,
                                     NumVDataDwords, NumVAddrDwords);
    } else {
      Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
                                     IsGFX11 ? AMDGPU::MIMGEncGfx11Default
                                             : AMDGPU::MIMGEncGfx10Default,
                                     NumVDataDwords, NumVAddrDwords);
    }
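    // Note on the opcode selection above: NSA (non-sequential address)
    // encodings let each VADDR operand live in an arbitrary VGPR, so the
    // GFX10/11 NSA and GFX12 variants are used when available; otherwise the
    // address operands must be packed into consecutive registers and the
    // Default encodings are selected. The a16 forms halve the ray
    // origin/direction storage, which is why their NumVAddrDwords is lower.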
    auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
      SmallVector<SDValue, 3> Lanes;
      DAG.ExtractVectorElements(Op, Lanes, 0, 3);
      if (Lanes[0].getValueSizeInBits() == 32) {
        for (unsigned I = 0; I < 3; ++I)
          Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
      } else {
        Ops.push_back(Lanes[2]);
      }
    };
    if (UseNSA && IsGFX11Plus) {
      Ops.push_back(NodePtr);
      Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
      Ops.push_back(RayOrigin);
      if (IsA16) {
        SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
        DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
        DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
        for (unsigned I = 0; I < 3; ++I) {
          MergedLanes.push_back(DAG.getBitcast(
              MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
                                           {DirLanes[I], InvDirLanes[I]})));
        }
        Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
      } else {
        Ops.push_back(RayDir);
        Ops.push_back(RayInvDir);
      }
    } else {
      Ops.push_back(NodePtr);
      packLanes(RayOrigin, true);
      packLanes(RayDir, true);
      packLanes(RayInvDir, false);

      if (NumVAddrDwords > 12) {
        Ops.push_back(MergedOps);
      }
    }
    Ops.push_back(TDescr);
    Ops.push_back(M->getChain());

    MachineMemOperand *MemRef = M->getMemOperand();
  case Intrinsic::amdgcn_global_atomic_fmin_num:
  case Intrinsic::amdgcn_global_atomic_fmax_num:
  case Intrinsic::amdgcn_flat_atomic_fmin_num:
  case Intrinsic::amdgcn_flat_atomic_fmax_num: {
    unsigned Opcode = 0;
    switch (IntrID) {
    case Intrinsic::amdgcn_global_atomic_fmin_num:
    case Intrinsic::amdgcn_flat_atomic_fmin_num: {
      Opcode = ISD::ATOMIC_LOAD_FMIN;
      break;
    }
    case Intrinsic::amdgcn_global_atomic_fmax_num:
    case Intrinsic::amdgcn_flat_atomic_fmax_num: {
      Opcode = ISD::ATOMIC_LOAD_FMAX;
      break;
    }
    }
    return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
                         Ops, M->getMemOperand());
  }
  case Intrinsic::amdgcn_s_alloc_vgpr: {
                            ReadFirstLaneID, NumVGPRs);
        Op.getOperand(0), Op.getOperand(1), NumVGPRs);
  case Intrinsic::amdgcn_s_get_barrier_state:
  case Intrinsic::amdgcn_s_get_named_barrier_state: {
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
        BarID = (BarID >> 4) & 0x3F;
      Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
      Ops.push_back(Chain);
      Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
      if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
  case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
    EVT VT = Op->getValueType(0);
  }
  case Intrinsic::amdgcn_flat_load_monitor_b32:
  case Intrinsic::amdgcn_flat_load_monitor_b64:
  case Intrinsic::amdgcn_flat_load_monitor_b128: {
                                   Op->getVTList(), {Chain, Ptr},
  }
  case Intrinsic::amdgcn_global_load_monitor_b32:
  case Intrinsic::amdgcn_global_load_monitor_b64:
  case Intrinsic::amdgcn_global_load_monitor_b128: {
                                   Op->getVTList(), {Chain, Ptr},
  }
  default:
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrID))
      return lowerImage(Op, ImageDimIntr, DAG, true);
SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
  EVT VT = VTList.VTs[0];
  bool IsTFE = VTList.NumVTs == 3;
    unsigned NumOpDWords = NumValueDWords + 1;
    SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
    MachineMemOperand *OpDWordsMMO =
    SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
                                     OpDWordsVT, OpDWordsMMO, DAG);
        NumValueDWords == 1
  if (!Subtarget->hasDwordx3LoadStores() &&
      (VT == MVT::v3i32 || VT == MVT::v3f32)) {
    SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
                                 WidenedMemVT, WidenedMMO);
                                         bool ImageStore) const {
  if (Subtarget->hasUnpackedD16VMem()) {

  if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
      for (unsigned I = 0; I < Elts.size() / 2; I += 1) {

      if ((NumElements % 2) == 1) {
        unsigned I = Elts.size() / 2;

    if (NumElements == 3) {
  case Intrinsic::amdgcn_raw_buffer_load_async_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_async_lds:

  unsigned IntrinsicID = Op.getConstantOperandVal(1);
  switch (IntrinsicID) {
  case Intrinsic::amdgcn_exp_compr: {
    if (!Subtarget->hasCompressedExport()) {
      DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
          Fn, "intrinsic not supported on subtarget", DL.getDebugLoc()));
    }
    unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
  case Intrinsic::amdgcn_struct_tbuffer_store:
  case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_tbuffer_store:
  case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
    bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
    if (IsD16)
      VData = handleD16VData(VData, DAG);
    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
    unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
                         : AMDGPUISD::TBUFFER_STORE_FORMAT;
    return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_store:
  case Intrinsic::amdgcn_raw_ptr_buffer_store:
  case Intrinsic::amdgcn_raw_buffer_store_format:
  case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);

    unsigned Opc =
        IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
      return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
    return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_struct_buffer_store:
  case Intrinsic::amdgcn_struct_ptr_buffer_store:
  case Intrinsic::amdgcn_struct_buffer_store_format:
  case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
    const bool IsFormat =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
      VData = handleD16VData(VData, DAG);

    auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
    auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);

    unsigned Opc =
        !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
    Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;

    EVT VDataType = VData.getValueType().getScalarType();
      return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
    return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops,
                                   M->getMemoryVT(), M->getMemOperand());
  }
  case Intrinsic::amdgcn_raw_buffer_load_lds:
  case Intrinsic::amdgcn_raw_buffer_load_async_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_buffer_load_lds:
  case Intrinsic::amdgcn_struct_buffer_load_async_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
  case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      return SDValue();

    bool HasVIndex =
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
        IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
    unsigned OpOffset = HasVIndex ? 1 : 0;
    SDValue VOffset = Op.getOperand(5 + OpOffset);
    unsigned Size = Op->getConstantOperandVal(4);
    switch (Size) {
    case 1:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
      break;
    case 2:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
      break;
    case 4:
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
                      : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
                                   : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
      break;
    }
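    // The opcode suffixes above name MUBUF addressing modes: BOTHEN uses
    // both vindex and voffset VGPRs, IDXEN only vindex, OFFEN only voffset,
    // and OFFSET neither (immediate offset only); the Size operand picks the
    // per-lane transfer width of 1, 2, 4, 12 or 16 bytes.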
    if (HasVIndex && HasVOffset)
      Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
                                       {Op.getOperand(5), VOffset}));
    else if (HasVIndex)
      Ops.push_back(Op.getOperand(5));
    else if (HasVOffset)
      Ops.push_back(VOffset);

    SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
    Ops.push_back(Rsrc);
    Ops.push_back(Op.getOperand(6 + OpOffset));
    Ops.push_back(Op.getOperand(7 + OpOffset));
    unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
  case Intrinsic::amdgcn_load_to_lds:
  case Intrinsic::amdgcn_load_async_to_lds:
  case Intrinsic::amdgcn_global_load_lds:
  case Intrinsic::amdgcn_global_load_async_lds: {
    if (!Subtarget->hasVMemToLDSLoad())
      return SDValue();

    unsigned Size = Op->getConstantOperandVal(4);
    switch (Size) {
    case 1:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
      break;
    case 2:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
      break;
    case 4:
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
      break;
    case 12:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
      break;
    case 16:
      if (!Subtarget->hasLDSLoadB96_B128())
        return SDValue();
      Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
      break;
    }
    if (LHS->isDivergent())
        RHS.getOperand(0).getValueType() == MVT::i32) {
      VOffset = RHS.getOperand(0);
    Ops.push_back(Addr);
    Ops.push_back(VOffset);
    Ops.push_back(Op.getOperand(5));
    unsigned Aux = Op.getConstantOperandVal(6);
  case Intrinsic::amdgcn_end_cf:
    return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
                                      Op->getOperand(2), Chain),
                   0);
  case Intrinsic::amdgcn_s_barrier_signal_var: {
    if (CntC && CntC->isZero()) {
    std::optional<uint64_t> BarVal;
      BarVal = C->getZExtValue();
      BarVal = *Addr + GA->getOffset();
    unsigned BarID = (*BarVal >> 4) & 0x3F;
    Ops.push_back(Chain);
                               Op->getVTList(), Ops);
  case Intrinsic::amdgcn_s_barrier_init: {
    unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
                       ? AMDGPU::S_BARRIER_INIT_M0
                       : AMDGPU::S_BARRIER_SIGNAL_M0;
    constexpr unsigned ShAmt = 16;
  case Intrinsic::amdgcn_s_wakeup_barrier: {
    if (!Subtarget->hasSWakeupBarrier())
  case Intrinsic::amdgcn_s_barrier_join: {
      switch (IntrinsicID) {
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_IMM;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
        break;
      }
      unsigned BarID = (BarVal >> 4) & 0x3F;
      Ops.push_back(Chain);
      switch (IntrinsicID) {
      case Intrinsic::amdgcn_s_barrier_join:
        Opc = AMDGPU::S_BARRIER_JOIN_M0;
        break;
      case Intrinsic::amdgcn_s_wakeup_barrier:
        Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
        break;
      }
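    // Named-barrier note: the barrier ID lives in bits [9:4] of the operand,
    // hence the (BarVal >> 4) & 0x3F extraction; the _IMM instruction forms
    // are used when the ID is a compile-time constant, and the _M0 forms
    // take it from m0 otherwise.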
  case Intrinsic::amdgcn_s_prefetch_data: {
      return Op.getOperand(0);
  }
  case Intrinsic::amdgcn_s_buffer_prefetch_data: {
        Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
    return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
                                   Op->getVTList(), Ops, M->getMemoryVT(),
                                   M->getMemOperand());
  }
  case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
  case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
  case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
    if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
            AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
      return lowerImage(Op, ImageDimIntr, DAG, true);

  return PtrVT == MVT::i64;
std::pair<SDValue, SDValue>
SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
  bool CheckNUW = Subtarget->hasGFX1250Insts();
    unsigned Overflow = ImmOffset & ~MaxImm;
    ImmOffset -= Overflow;
    if ((int32_t)Overflow < 0) {
      Overflow += ImmOffset;
      ImmOffset = 0;
    }
    auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
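  // Illustrative sketch of the split performed above (not upstream code;
  // MaxImm stands for the subtarget's largest encodable MUBUF immediate):
  //   unsigned Overflow = ImmOffset & ~MaxImm; // bits that cannot be encoded
  //   ImmOffset -= Overflow;                   // keep the encodable remainder
  //   // Overflow is materialized as a 32-bit constant and folded into the
  //   // VGPR offset, so VOffset + ImmOffset still equals the original offset.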
void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
                                        Align Alignment) const {
  SDLoc DL(CombinedOffset);
    uint32_t Imm = C->getZExtValue();
    uint32_t SOffset, ImmOffset;
    if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
  bool CheckNUW = Subtarget->hasGFX1250Insts();
    uint32_t SOffset, ImmOffset;
        TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
  SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
    return MaybePointer;

  SDValue NumRecords = Op->getOperand(3);

  if (Subtarget->has45BitNumRecordsBufferResource()) {
    SDValue ExtShiftedStrideVec =
        DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
        DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
    auto [LowHalf, HighHalf] =
        DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
                             NumRecords, Flags);
                                              bool IsTFE) const {
                     ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
                     : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
    SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
                     ? AMDGPUISD::BUFFER_LOAD_UBYTE
                     : AMDGPUISD::BUFFER_LOAD_USHORT;
  SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
  if (VDataType == MVT::f16 || VDataType == MVT::bf16)
  Ops[1] = BufferStoreExt;
  unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
                                        : AMDGPUISD::BUFFER_STORE_SHORT;
                                 M->getMemOperand());
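// The helpers above widen sub-dword buffer accesses: i8/i16 data (and
// f16/bf16 after an integer bitcast) travels in a full 32-bit VGPR via the
// *_UBYTE/*_USHORT loads and *_BYTE/*_SHORT stores, with TFE variants that
// return an extra status dword alongside the data.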
                                        DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
         "unexpected vector extload");
         "unexpected fp extload");
  DCI.AddToWorklist(Cvt.getNode());
  DCI.AddToWorklist(Cvt.getNode());
  if (Info.isEntryFunction())
    return Info.getUserSGPRInfo().hasFlatScratchInit();
  EVT MemVT = Load->getMemoryVT();
  MachineMemOperand *MMO = Load->getMemOperand();
    EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
  assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
         "Custom lowering for non-i32 vectors hasn't been implemented.");
  unsigned AS = Load->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
      Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
      Alignment >= Align(4) && NumElements < 32) {
        (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4)
      if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
  auto Flags = Load->getMemOperand()->getFlags();
      Load->getAlign(), Flags, &Fast) &&
      MemVT, *Load->getMemOperand())) {
  EVT VT = Op.getValueType();

  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateRcp = Flags.hasApproximateFuncs();

  if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
    if (CLHS->isExactlyValue(1.0)) {
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
    }
    if (CLHS->isExactlyValue(-1.0)) {
      return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
    }
  if (!AllowInaccurateRcp &&
      ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
  EVT VT = Op.getValueType();
  const SDNodeFlags Flags = Op->getFlags();
  bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
  if (!AllowInaccurateDiv)
    return DAG.getNode(Opcode, SL, VT, A, B, Flags);
    Opcode = AMDGPUISD::FMUL_W_CHAIN;
  return DAG.getNode(Opcode, SL, VTList,
    return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
    Opcode = AMDGPUISD::FMA_W_CHAIN;
  return DAG.getNode(Opcode, SL, VTList,
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  EVT VT = Op.getValueType();

  if (VT == MVT::bf16) {

  unsigned FMADOpCode =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
  SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
  Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
  Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
  SDNodeFlags Flags = Op->getFlags();

  const APFloat K0Val(0x1p+96f);
  const APFloat K1Val(0x1p-32f);

  assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
  uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
  uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
  if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
    return FastLowered;

  SDNodeFlags Flags = Op->getFlags();
  Flags.setNoFPExcept(true);

  SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);

  SDValue ApproxRcp =
      DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);

  using namespace AMDGPU::Hwreg;
  const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);

  const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
  const DenormalMode DenormMode = Info->getMode().FP32Denormals;

  const bool HasDynamicDenormals =

  if (!PreservesDenormals) {
    SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);

    if (HasDynamicDenormals) {
      SavedDenormMode = SDValue(GetReg, 0);
    }

    SDNode *EnableDenorm;
    if (Subtarget->hasDenormModeInst()) {
      const SDValue EnableDenormValue =
      EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
    } else {
      const SDValue EnableDenormValue =
      EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
                                        {EnableDenormValue, BitField, Glue});
    }
  }

                           ApproxRcp, One, NegDivScale0, Flags);
                           ApproxRcp, Fma0, Flags);
                      NumeratorScaled, Mul, Flags);
                       NumeratorScaled, Fma3, Flags);

  if (!PreservesDenormals) {
    SDNode *DisableDenorm;
    if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
      SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
          DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
    } else {
      assert(HasDynamicDenormals == (bool)SavedDenormMode);
      const SDValue DisableDenormValue =
          HasDynamicDenormals
          AMDGPU::S_SETREG_B32, SL, MVT::Other,
    }
  }

                          {Fma4, Fma1, Fma3, Scale}, Flags);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
  if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
    return FastLowered;

  SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);

  SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);

  if (!Subtarget->hasUsableDivScaleConditionOutput()) {

  SDValue Fmas =
      DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);

  return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
  EVT VT = Op.getValueType();

  if (VT == MVT::f32)
    return LowerFDIV32(Op, DAG);

  if (VT == MVT::f64)
    return LowerFDIV64(Op, DAG);

  if (VT == MVT::f16 || VT == MVT::bf16)
    return LowerFDIV16(Op, DAG);

  llvm_unreachable("Unexpected type for fdiv");
  EVT ResultExpVT = Op->getValueType(1);
  EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;

  if (Subtarget->hasFractBug()) {
  EVT VT = Store->getMemoryVT();

  if (VT == MVT::i1) {
        Store->getBasePtr(), MVT::i1, Store->getMemOperand());
  }

  assert(Store->getValue().getValueType().getScalarType() == MVT::i32);

  unsigned AS = Store->getAddressSpace();
  if (Subtarget->hasLDSMisalignedBugInWGPMode() &&

  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
      !Subtarget->hasMultiDwordFlatScratchAddressing())
    if (NumElements > 4)
    if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
        VT, *Store->getMemOperand()))
    switch (Subtarget->getMaxPrivateElementSize()) {
      if (NumElements > 2)
      if (NumElements > 4 ||
          (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
  auto Flags = Store->getMemOperand()->getFlags();
  assert(!Subtarget->has16BitInsts());
  SDNodeFlags Flags = Op->getFlags();

  SDNodeFlags Flags = Op->getFlags();
  MVT VT = Op.getValueType().getSimpleVT();
  SDNodeFlags Flags = Op->getFlags();

  if (!Flags.hasApproximateFuncs()) {

  if (!Flags.hasApproximateFuncs()) {
                        ScaleDownFactor, ZeroInt);

  if (Flags.hasNoInfs()) {
  EVT VT = Op.getValueType();
    if (!V.getValueType().isVector())

  if (Subtarget->hasTrigReducedRange()) {
    TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
  }

  switch (Op.getOpcode()) {
  case ISD::FCOS:
    TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
    break;
  case ISD::FSIN:
    TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
    break;
  }

  return UnrollIfVec(TrigVal);
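  // Trig lowering note: the operand is pre-multiplied by 1/(2*pi) because
  // the SIN_HW/COS_HW nodes take a normalized period, and subtargets with a
  // reduced valid input range get an extra FRACT to wrap the value into
  // [0, 1) first.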
  EVT VT = Op.getValueType();
                                 Op->getVTList(), Ops, VT,
SDValue
SITargetLowering::performUCharToFloatCombine(SDNode *N,
                                             DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)

  SelectionDAG &DAG = DCI.DAG;

  EVT SrcVT = Src.getValueType();
  if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
    DCI.AddToWorklist(Cvt.getNode());
    if (ScalarVT != MVT::f32) {
                                            DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  for (unsigned I = 0; I != NumElts; ++I) {
  if (NewElts.size() == 1)
  for (unsigned I = 0; I != NumElts; ++I) {
SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
                                               EVT MemVT,
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  AM.BaseOffs = Offset.getSExtValue();
  EVT VT = N->getValueType(0);
  Flags.setNoUnsignedWrap(
      N->getFlags().hasNoUnsignedWrap() &&

  switch (N->getOpcode()) {

                                          DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
                                        N->getMemoryVT(), DCI);
  NewOps[PtrIdx] = NewPtr;
  return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
         (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||

SDValue SITargetLowering::splitBinaryBitConstantOp(
  uint32_t ValLo = Lo_32(Val);
  uint32_t ValHi = Hi_32(Val);
  if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
  if (V.getValueType() != MVT::i1)
  switch (V.getOpcode()) {
  case AMDGPUISD::FP_CLASS:
    return V.getResNo() == 1;
    unsigned IntrinsicID = V.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_is_shared:
    case Intrinsic::amdgcn_is_private:
  if (!(C & 0x000000ff))
    ZeroByteMask |= 0x000000ff;
  if (!(C & 0x0000ff00))
    ZeroByteMask |= 0x0000ff00;
  if (!(C & 0x00ff0000))
    ZeroByteMask |= 0x00ff0000;
  if (!(C & 0xff000000))
    ZeroByteMask |= 0xff000000;
  uint32_t NonZeroByteMask = ~ZeroByteMask;
  if ((NonZeroByteMask & C) != NonZeroByteMask)
  assert(V.getValueSizeInBits() == 32);

  if (V.getNumOperands() != 2)

  switch (V.getOpcode()) {
    return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
    return (0x03020100 & ~ConstMask) | ConstMask;
    return uint32_t((0x030201000c0c0c0cull << C) >> 32);
    return uint32_t(0x0c0c0c0c03020100ull >> C);
                                           DAGCombinerInfo &DCI) const {
  if (DCI.isBeforeLegalize())

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);

  if (VT == MVT::i64 && CRHS) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))

  if (CRHS && VT == MVT::i32) {
      unsigned Shift = CShift->getZExtValue();
      unsigned Offset = NB + Shift;
      if ((Offset & (Bits - 1)) == 0) {
        SDValue BFE =
            DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),

      Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
      return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),

    if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||

    if (X != LHS.getOperand(1))

    const ConstantFPSDNode *C1 =

    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,

  if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)

  if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
      (RHS.getOperand(0) == LHS.getOperand(0) &&
       LHS.getOperand(0) == LHS.getOperand(1))) {
    unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
                                        : Mask->getZExtValue() & OrdMask;
    return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),

  if (N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    if (LHSMask != ~0u && RHSMask != ~0u) {
      if (LHSMask > RHSMask) {
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        uint32_t Mask = LHSMask & RHSMask;
        for (unsigned I = 0; I < 32; I += 8) {
          uint32_t ByteSel = 0xff << I;
          if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
            Mask &= (0x0c << I) & 0xffffffff;
        }
        uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
                 unsigned Depth = 0) {
    return std::nullopt;

  if (Op.getValueSizeInBits() < 8)
    return std::nullopt;

  if (Op.getValueType().isVector())

  switch (Op->getOpcode()) {
      NarrowVT = VTSign->getVT();
      return std::nullopt;
    if (SrcIndex >= NarrowByteWidth)
      return std::nullopt;
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    SrcIndex += BitShift / 8;
static const std::optional<ByteProvider<SDValue>>
calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
                      unsigned StartingIndex = 0) {
    return std::nullopt;

  unsigned BitWidth = Op.getScalarValueSizeInBits();
    return std::nullopt;
    return std::nullopt;

  bool IsVec = Op.getValueType().isVector();
  switch (Op.getOpcode()) {
    return std::nullopt;
    return std::nullopt;
    return std::nullopt;
    if (!LHS->isConstantZero() && !RHS->isConstantZero())
      return std::nullopt;
    if (!LHS || LHS->isConstantZero())
    if (!RHS || RHS->isConstantZero())
    return std::nullopt;

    return std::nullopt;
    return std::nullopt;
    uint32_t BitMask = BitMaskOp->getZExtValue();
    uint32_t IndexMask = 0xFF << (Index * 8);
    if ((IndexMask & BitMask) != IndexMask) {
      if (IndexMask & BitMask)
        return std::nullopt;
    return std::nullopt;

    if (!ShiftOp || Op.getValueType().isVector())
      return std::nullopt;
    uint64_t BitsProvided = Op.getValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;
    uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
      return std::nullopt;
    uint64_t ConcatSizeInBytes = BitsProvided / 4;
    uint64_t ByteShift = BitShift / 8;
    uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
    uint64_t BytesProvided = BitsProvided / 8;
    SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
    NewIndex %= BytesProvided;
    return std::nullopt;

    return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    return std::nullopt;
    auto BitsProvided = Op.getScalarValueSizeInBits();
    if (BitsProvided % 8 != 0)
      return std::nullopt;
    uint64_t BytesProvided = BitsProvided / 8;
    uint64_t ByteShift = BitShift / 8;
    return BytesProvided - ByteShift > Index
    return std::nullopt;

    return std::nullopt;
    uint64_t BitShift = ShiftOp->getZExtValue();
    if (BitShift % 8 != 0)
      return std::nullopt;
    uint64_t ByteShift = BitShift / 8;
    return Index < ByteShift
                                 Depth + 1, StartingIndex);
    return std::nullopt;

      NarrowBitWidth = VTSign->getVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
    if (Index >= NarrowByteWidth)
                 ? std::optional<ByteProvider<SDValue>>(
      return std::nullopt;
    if (NarrowByteWidth >= Index) {
    return std::nullopt;
    return std::nullopt;

    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
    if (NarrowBitWidth % 8 != 0)
      return std::nullopt;
    uint64_t NarrowByteWidth = NarrowBitWidth / 8;

    if (Index >= NarrowByteWidth) {
                 ? std::optional<ByteProvider<SDValue>>(
    if (NarrowByteWidth > Index) {
    return std::nullopt;

    return std::nullopt;
                                 Depth + 1, StartingIndex);
      return std::nullopt;
    auto VecIdx = IdxOp->getZExtValue();
    auto ScalarSize = Op.getScalarValueSizeInBits();
    if (ScalarSize < 32)
      Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
                            StartingIndex, Index);

  case AMDGPUISD::PERM: {
      return std::nullopt;
      return std::nullopt;
    auto IdxMask =
        (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
    if (IdxMask > 0x07 && IdxMask != 0x0c)
      return std::nullopt;

    auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
    auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;

    return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)

  return std::nullopt;
  return !OpVT.isVector() && OpVT.getSizeInBits() == 16;

  auto MemVT = L->getMemoryVT();
  return L->getMemoryVT().getSizeInBits() == 16;

  int Low8 = Mask & 0xff;
  int Hi8 = (Mask & 0xff00) >> 8;

  assert(Low8 < 8 && Hi8 < 8);

  bool IsConsecutive = (Hi8 - Low8 == 1);
  bool Is16Aligned = !(Low8 % 2);

  return IsConsecutive && Is16Aligned;
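// A PERM half-mask addresses a single 16-bit value when its two selector
// bytes pick consecutive source bytes (Hi8 - Low8 == 1) starting at an even
// byte (Low8 % 2 == 0); that is exactly the IsConsecutive / Is16Aligned
// test above.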
  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;
  auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
  if (!OtherOpIs16Bit)
                               unsigned DWordOffset) {

  assert(Src.getValueSizeInBits().isKnownMultipleOf(8));

  if (Src.getValueType().isVector()) {
    auto ScalarTySize = Src.getScalarValueSizeInBits();
    auto ScalarTy = Src.getValueType().getScalarType();
    if (ScalarTySize == 32) {
    }
    if (ScalarTySize > 32) {
          DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
      auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
    }

    assert(ScalarTySize < 32);
    auto NumElements = TypeSize / ScalarTySize;
    auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
    auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
    auto NumElementsIn32 = 32 / ScalarTySize;
    auto NumAvailElements = DWordOffset < Trunc32Elements
                                ? NumElementsIn32
                                : NumElements - NormalizedTrunc;
  }

  auto ShiftVal = 32 * DWordOffset;
  [[maybe_unused]] EVT VT = N->getValueType(0);

  for (int i = 0; i < 4; i++) {
    std::optional<ByteProvider<SDValue>> P =
    if (!P || P->isConstantZero())

  if (PermNodes.size() != 4)

  std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
  std::optional<std::pair<unsigned, unsigned>> SecondSrc;
  for (size_t i = 0; i < PermNodes.size(); i++) {
    auto PermOp = PermNodes[i];
    int SrcByteAdjust = 4;

    if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
        ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
      if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
          ((PermOp.SrcOffset / 4) != SecondSrc->second))
      SecondSrc = {i, PermNodes[i].SrcOffset / 4};
      assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
    }
    assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
    PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
  }
  SDValue Op = *PermNodes[FirstSrc.first].Src;
  assert(Op.getValueSizeInBits() == 32);

  int Low16 = PermMask & 0xffff;
  int Hi16 = (PermMask & 0xffff0000) >> 16;

  bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
  bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);

  if (WellFormedLow && WellFormedHi)

  SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;

  if ((N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
      (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))

  assert(Op.getValueType().isByteSized() &&

  return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
                                          DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;

  EVT VT = N->getValueType(0);
  if (VT == MVT::i1) {
    if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
        RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
      if (Src != RHS.getOperand(0))
      if (!CLHS || !CRHS)

      static const uint32_t MaxMask = 0x3ff;
      return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,

      LHS.getOpcode() == AMDGPUISD::PERM &&
    Sel |= LHS.getConstantOperandVal(2);
    return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),

  if (N->isDivergent() &&
      TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
    auto usesCombinedOperand = [](SDNode *OrUse) {
          !OrUse->getValueType(0).isVector())
      for (auto *VUser : OrUse->users()) {
        if (!VUser->getValueType(0).isVector())
        if (VUser->getOpcode() == VectorwiseOp)

    if (!any_of(N->users(), usesCombinedOperand))

    if (LHSMask != ~0u && RHSMask != ~0u) {
      if (LHSMask > RHSMask) {
      uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
      if (!(LHSUsedLanes & RHSUsedLanes) &&
          !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
        LHSMask &= ~RHSUsedLanes;
        RHSMask &= ~LHSUsedLanes;
        LHSMask |= LHSUsedLanes & 0x04040404;
        uint32_t Sel = LHSMask | RHSMask;
        return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),

    if (LHSMask == ~0u || RHSMask == ~0u) {
    return IdentitySrc;

  if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())

  if (SrcVT == MVT::i32) {
    DCI.AddToWorklist(LowOr.getNode());
    DCI.AddToWorklist(HiBits.getNode());
      N->getOperand(0), CRHS))
                                           DAGCombinerInfo &DCI) const {
  if (SDValue RV = reassociateScalarOps(N, DCI.DAG))

  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  if (CRHS && VT == MVT::i64) {
    if (SDValue Split =
            splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))

  unsigned Opc = LHS.getOpcode();
                       LHS->getOperand(0), FNegLHS, FNegRHS);
SDValue
SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  if (!Subtarget->has16BitInsts() ||

  EVT VT = N->getValueType(0);
  if (VT != MVT::i32)

  if (Src.getValueType() != MVT::i16)

  if (!Src->hasOneUse())

  std::optional<ByteProvider<SDValue>> BP0 =
  if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)

  std::optional<ByteProvider<SDValue>> BP1 =
  if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)

  SelectionDAG &DAG = DCI.DAG;

  uint32_t PermMask = 0x0c0c0c0c;
    PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
    PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);

  return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
SDValue
SITargetLowering::performSignExtendInRegCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
  if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16))) {
    assert(Subtarget->hasScalarSubwordLoads() &&
           "s_buffer_load_{u8, i8} are supported "
           "in GFX12 (or newer) architectures.");
    EVT VT = Src.getValueType();
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::SBUFFER_LOAD_BYTE
                       : AMDGPUISD::SBUFFER_LOAD_SHORT;
    SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
    SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
        Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
  if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
        VTSign->getVT() == MVT::i8) ||
       (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
        VTSign->getVT() == MVT::i16)) &&
                    Src.getOperand(6), Src.getOperand(7)};
    SDVTList ResList =
        DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
    unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
                       ? AMDGPUISD::BUFFER_LOAD_BYTE
                       : AMDGPUISD::BUFFER_LOAD_SHORT;
    SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
        Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
    return DCI.DAG.getMergeValues(
        {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
                                          DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  if (N->getOperand(0).isUndef())

                                          DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
    return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
                                       unsigned MaxDepth) const {
  unsigned Opcode = Op.getOpcode();
    const auto &F = CFP->getValueAPF();
    if (F.isNaN() && F.isSignaling())
    if (!F.isDenormal())
  case AMDGPUISD::FMUL_LEGACY:
  case AMDGPUISD::FMAD_FTZ:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RSQ:
  case AMDGPUISD::RSQ_CLAMP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::LOG:
  case AMDGPUISD::EXP:
  case AMDGPUISD::DIV_SCALE:
  case AMDGPUISD::DIV_FMAS:
  case AMDGPUISD::DIV_FIXUP:
  case AMDGPUISD::FRACT:
  case AMDGPUISD::CVT_PKRTZ_F16_F32:
  case AMDGPUISD::CVT_F32_UBYTE0:
  case AMDGPUISD::CVT_F32_UBYTE1:
  case AMDGPUISD::CVT_F32_UBYTE2:
  case AMDGPUISD::CVT_F32_UBYTE3:
  case AMDGPUISD::FP_TO_FP16:
  case AMDGPUISD::SIN_HW:
  case AMDGPUISD::COS_HW:
    if (Op.getValueType() == MVT::i32) {
      if (RHS->getZExtValue() == 0xffff0000) {
    return Op.getValueType().getScalarType() != MVT::f16;
  case AMDGPUISD::CLAMP:
  case AMDGPUISD::FMED3:
  case AMDGPUISD::FMAX3:
  case AMDGPUISD::FMIN3:
  case AMDGPUISD::FMAXIMUM3:
  case AMDGPUISD::FMINIMUM3: {
    if (Subtarget->supportsMinMaxDenormModes() ||
    for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
    if (Op.getValueType() == MVT::i16) {
    unsigned IntrinsicID = Op.getConstantOperandVal(0);
    switch (IntrinsicID) {
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_sqrt:
                                       unsigned MaxDepth) const {
  unsigned Opcode = MI->getOpcode();

  if (Opcode == AMDGPU::G_FCANONICALIZE)

  std::optional<FPValueAndVReg> FCR;
    if (FCR->Value.isSignaling())
    if (!FCR->Value.isDenormal())

  case AMDGPU::G_FADD:
  case AMDGPU::G_FSUB:
  case AMDGPU::G_FMUL:
  case AMDGPU::G_FCEIL:
  case AMDGPU::G_FFLOOR:
  case AMDGPU::G_FRINT:
  case AMDGPU::G_FNEARBYINT:
  case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
  case AMDGPU::G_INTRINSIC_TRUNC:
  case AMDGPU::G_INTRINSIC_ROUNDEVEN:
  case AMDGPU::G_FMA:
  case AMDGPU::G_FMAD:
  case AMDGPU::G_FSQRT:
  case AMDGPU::G_FDIV:
  case AMDGPU::G_FREM:
  case AMDGPU::G_FPOW:
  case AMDGPU::G_FPEXT:
  case AMDGPU::G_FLOG:
  case AMDGPU::G_FLOG2:
  case AMDGPU::G_FLOG10:
  case AMDGPU::G_FPTRUNC:
  case AMDGPU::G_AMDGPU_RCP_IFLAG:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
  case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
  case AMDGPU::G_FNEG:
  case AMDGPU::G_FABS:
  case AMDGPU::G_FCOPYSIGN:
  case AMDGPU::G_FMINNUM:
  case AMDGPU::G_FMAXNUM:
  case AMDGPU::G_FMINNUM_IEEE:
  case AMDGPU::G_FMAXNUM_IEEE:
  case AMDGPU::G_FMINIMUM:
  case AMDGPU::G_FMAXIMUM:
  case AMDGPU::G_FMINIMUMNUM:
  case AMDGPU::G_FMAXIMUMNUM: {
    if (Subtarget->supportsMinMaxDenormModes() ||
  case AMDGPU::G_BUILD_VECTOR:
  case AMDGPU::G_INTRINSIC:
  case AMDGPU::G_INTRINSIC_CONVERGENT:
    case Intrinsic::amdgcn_fmul_legacy:
    case Intrinsic::amdgcn_fmad_ftz:
    case Intrinsic::amdgcn_sqrt:
    case Intrinsic::amdgcn_fmed3:
    case Intrinsic::amdgcn_sin:
    case Intrinsic::amdgcn_cos:
    case Intrinsic::amdgcn_log:
    case Intrinsic::amdgcn_exp2:
    case Intrinsic::amdgcn_log_clamp:
    case Intrinsic::amdgcn_rcp:
    case Intrinsic::amdgcn_rcp_legacy:
    case Intrinsic::amdgcn_rsq:
    case Intrinsic::amdgcn_rsq_clamp:
    case Intrinsic::amdgcn_rsq_legacy:
    case Intrinsic::amdgcn_div_scale:
    case Intrinsic::amdgcn_div_fmas:
    case Intrinsic::amdgcn_div_fixup:
    case Intrinsic::amdgcn_fract:
    case Intrinsic::amdgcn_cvt_pkrtz:
    case Intrinsic::amdgcn_cubeid:
    case Intrinsic::amdgcn_cubema:
    case Intrinsic::amdgcn_cubesc:
    case Intrinsic::amdgcn_cubetc:
    case Intrinsic::amdgcn_frexp_mant:
    case Intrinsic::amdgcn_fdot2:
    case Intrinsic::amdgcn_trig_preop:
    case Intrinsic::amdgcn_tanh:
  if (C.isDenormal()) {
  if (C.isSignaling()) {

SDValue
SITargetLowering::performFCanonicalizeCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
    EVT VT = N->getValueType(0);
    return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());

  EVT EltVT = Lo.getValueType();
  for (unsigned I = 0; I != 2; ++I) {
      getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
    } else if (Op.isUndef()) {
    return AMDGPUISD::FMAX3;
    return AMDGPUISD::FMAXIMUM3;
    return AMDGPUISD::SMAX3;
    return AMDGPUISD::UMAX3;
    return AMDGPUISD::FMIN3;
    return AMDGPUISD::FMINIMUM3;
    return AMDGPUISD::SMIN3;
    return AMDGPUISD::UMIN3;

  if (!MinK || !MaxK)

  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
    return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
16010 bool IsKnownNoNaNs)
const {
16046 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
16052 if (
Info->getMode().DX10Clamp) {
16061 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
16093 case AMDGPUISD::FMIN_LEGACY:
16094 case AMDGPUISD::FMAX_LEGACY:
16095 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
16096 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
16099 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
16100 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
16101 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
16106 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
16115 DAGCombinerInfo &DCI) const {
16116 SelectionDAG &DAG = DCI.DAG;
16127 auto IsTreeWithCombinableChildren = [Opc](SDValue Op) {
16128 return (Op.getOperand(0).getOpcode() == Opc &&
16129 Op.getOperand(0).hasOneUse()) ||
16130 (Op.getOperand(1).getOpcode() == Opc &&
16131 Op.getOperand(1).hasOneUse());
16136 bool HasCombinableTreeChild =
16137 CanTreeCombineApply && (IsTreeWithCombinableChildren(Op0) ||
16138 IsTreeWithCombinableChildren(Op1));
16147 if (CanTreeCombineApply && !HasCombinableTreeChild) {
16177 uint64_t Clamp = 0;
16193 if (SDValue Med3 = performIntMed3ImmCombine(
16198 if (SDValue Med3 = performIntMed3ImmCombine(
16204 if (SDValue Med3 = performIntMed3ImmCombine(
16209 if (SDValue Med3 = performIntMed3ImmCombine(
16222 (Opc == AMDGPUISD::FMIN_LEGACY &&
16223 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
16224 (VT == MVT::f32 || VT == MVT::f64 ||
16225 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
16226 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
16227 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
16228 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
16230 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1,
16231 N->getFlags().hasNoNaNs()))
16238 const SDNodeFlags Flags = N->getFlags();
16240 !Subtarget->hasIEEEMinimumMaximumInsts() &&
16244 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
16254 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
16255 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
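// --- Editor's illustrative sketch (not part of the source): the integer
// min/max-to-med3 folds above rely on the identity
//   min(max(x, K0), K1) == med3(x, K0, K1)  whenever K0 <= K1,
// and the FP variant additionally guards against NaNs (hasNoNaNs() above).
// A scalar model, assuming ordered clamp bounds:
#include <algorithm>
#include <cassert>
#include <cstdint>
static int32_t med3(int32_t A, int32_t B, int32_t C) {
  // Standard branch-light median-of-three formulation.
  return std::max(std::min(A, B), std::min(std::max(A, B), C));
}
static int32_t clampViaMed3(int32_t X, int32_t K0, int32_t K1) {
  assert(K0 <= K1 && "med3 only implements clamp for ordered bounds");
  return med3(X, K0, K1); // == std::min(std::max(X, K0), K1)
}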
16264 DAGCombinerInfo &DCI) const {
16265 EVT VT = N->getValueType(0);
16269 SelectionDAG &DAG = DCI.DAG;
16280 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
16284 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16288 if (Info->getMode().DX10Clamp) {
16301 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
16308 DAGCombinerInfo &DCI) const {
16312 return DCI.DAG.getUNDEF(N->getValueType(0));
16320 bool IsDivergentIdx,
16325 unsigned VecSize = EltSize * NumElem;
16328 if (VecSize <= 64 && EltSize < 32)
16337 if (IsDivergentIdx)
16341 unsigned NumInsts = NumElem +
16342 ((EltSize + 31) / 32) * NumElem;
16346 if (Subtarget->useVGPRIndexMode())
16347 return NumInsts <= 16;
16351 if (Subtarget->hasMovrel())
16352 return NumInsts <= 15;
16358 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
16373 SITargetLowering::performExtractVectorEltCombine(SDNode *N,
16374 DAGCombinerInfo &DCI) const {
16380 EVT ResVT = N->getValueType(0);
16404 if (!C || C->getZExtValue() != 0x1f)
16420 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
16448 DCI.AddToWorklist(Elt0.getNode());
16449 DCI.AddToWorklist(Elt1.getNode());
16480 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
16481 uint64_t KImmValue = KImm->getZExtValue();
16483 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
16486 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
16487 uint64_t KFPImmValue =
16488 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16489 return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
16495 if (!DCI.isBeforeLegalize())
16502 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16505 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16506 unsigned EltIdx = BitIndex / 32;
16507 unsigned LeftoverBitIdx = BitIndex % 32;
16511 DCI.AddToWorklist(Cast.getNode());
16515 DCI.AddToWorklist(Elt.getNode());
16518 DCI.AddToWorklist(Srl.getNode());
16522 DCI.AddToWorklist(Trunc.getNode());
16524 if (VecEltVT == ResVT) {
16536 SITargetLowering::performInsertVectorEltCombine(SDNode *N,
16537 DAGCombinerInfo &DCI) const {
16548 SelectionDAG &DAG = DCI.DAG;
16568 Src.getOperand(0).getValueType() == MVT::f16) {
16569 return Src.getOperand(0);
16573 APFloat Val = CFP->getValueAPF();
16574 bool LosesInfo = true;
16584 DAGCombinerInfo &DCI) const {
16585 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16586 "combine only useful on gfx8");
16588 SDValue TruncSrc = N->getOperand(0);
16589 EVT VT = N->getValueType(0);
16590 if (VT != MVT::f16)
16593 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
16597 SelectionDAG &DAG = DCI.DAG;
16628 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
16630 const SDNode *N1) const {
16635 if (((VT == MVT::f32 &&
16637 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16657 EVT VT = N->getValueType(0);
16658 if (VT != MVT::i32 && VT != MVT::i64)
16664 unsigned Opc = N->getOpcode();
16719 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
16738 DAGCombinerInfo &DCI) const {
16741 SelectionDAG &DAG = DCI.DAG;
16742 EVT VT = N->getValueType(0);
16752 if (!N->isDivergent() && Subtarget->hasSMulHi())
16756 if (NumBits <= 32 || NumBits > 64)
16767 if (!Subtarget->hasFullRate64Ops()) {
16768 unsigned NumUsers = 0;
16769 for (SDNode *User : LHS->users()) {
16772 if (!User->isAnyAdd())
16796 bool MulSignedLo = false;
16797 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16806 if (VT != MVT::i64) {
16829 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16831 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16832 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16834 if (!MulLHSUnsigned32) {
16841 if (!MulRHSUnsigned32) {
16852 if (VT != MVT::i64)
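// --- Editor's illustrative sketch (not part of the source): scalar model
// of the decomposition tryFoldToMad64_32 performs above. A 64-bit
// multiply-add is rebuilt from one 32x32->64 MAD on the low halves, with
// the cross products folded into the high 32 bits afterwards.
#include <cstdint>
static uint64_t mad64(uint64_t L, uint64_t R, uint64_t Acc) {
  uint32_t LLo = uint32_t(L), LHi = uint32_t(L >> 32);
  uint32_t RLo = uint32_t(R), RHi = uint32_t(R >> 32);
  uint64_t Accum = uint64_t(LLo) * RLo + Acc;    // the V_MAD_U64_U32 part
  uint32_t Hi = uint32_t(Accum >> 32);
  Hi += LLo * RHi + LHi * RLo;                   // cross terms, low 32 bits only
  return (uint64_t(Hi) << 32) | uint32_t(Accum); // == L * R + Acc (mod 2^64)
}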
16858 SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16859 DAGCombinerInfo &DCI) const {
16869 SelectionDAG &DAG = DCI.DAG;
16884 unsigned Opcode = N->getOpcode();
16888 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16899 static std::optional<ByteProvider<SDValue>>
16902 if (!Byte0 || Byte0->isConstantZero()) {
16903 return std::nullopt;
16906 if (Byte1 && !Byte1->isConstantZero()) {
16907 return std::nullopt;
16913 unsigned FirstCs = First & 0x0c0c0c0c;
16914 unsigned SecondCs = Second & 0x0c0c0c0c;
16915 unsigned FirstNoCs = First & ~0x0c0c0c0c;
16916 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16918 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16919 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16920 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16921 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16923 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
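// --- Editor's illustrative sketch (not part of the source): in a
// V_PERM_B32 byte selector, the value 0x0c makes a lane produce the
// constant 0x00. addPermMasks above merges two selectors whose non-0x0c
// lanes are disjoint (the asserts check every lane is constant in at
// least one input); 0x0c survives only where both inputs had it.
#include <cstdint>
static uint32_t mergePermSelectors(uint32_t First, uint32_t Second) {
  const uint32_t Cs = 0x0c0c0c0c;
  return (First & ~Cs) | (Second & ~Cs) | ((First & Cs) & (Second & Cs));
}
// e.g. mergePermSelectors(0x0c0c0100, 0x07060c0c) == 0x07060100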
16947 for (int BPI = 0; BPI < 2; BPI++) {
16950 BPP = {Src1, Src0};
16952 unsigned ZeroMask = 0x0c0c0c0c;
16953 unsigned FMask = 0xFF << (8 * (3 - Step));
16955 unsigned FirstMask =
16956 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16957 unsigned SecondMask =
16958 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16962 int FirstGroup = -1;
16963 for (int I = 0; I < 2; I++) {
16965 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16966 return IterElt.SrcOp == *BPP.first.Src &&
16967 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16971 if (Match != Srcs.end()) {
16972 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
16977 if (FirstGroup != -1) {
16979 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16980 return IterElt.SrcOp == *BPP.second.Src &&
16981 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16984 if (Match != Srcs.end()) {
16985 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
16987 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16995 unsigned ZeroMask = 0x0c0c0c0c;
16996 unsigned FMask = 0xFF << (8 * (3 - Step));
17000 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17004 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17013 if (Srcs.size() == 1) {
17014 auto *Elt = Srcs.begin();
17018 if (Elt->PermMask == 0x3020100)
17021 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17025 auto *FirstElt = Srcs.begin();
17026 auto *SecondElt = std::next(FirstElt);
17033 auto FirstMask = FirstElt->PermMask;
17034 auto SecondMask = SecondElt->PermMask;
17036 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
17037 unsigned FirstPlusFour = FirstMask | 0x04040404;
17040 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
17052 FirstElt = std::next(SecondElt);
17053 if (FirstElt == Srcs.end())
17056 SecondElt = std::next(FirstElt);
17059 if (SecondElt == Srcs.end()) {
17064 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17065 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
17071 return Perms.size() == 2
17077 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
17078 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
17079 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
17080 EntryMask += ZeroMask;
17085 auto Opcode = Op.getOpcode();
17087 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
17088 Opcode == AMDGPUISD::MUL_I24);
17091 static std::optional<bool>
17102 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
17105 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
17107 assert(!(S0IsUnsigned && S0IsSigned));
17108 assert(!(S1IsUnsigned && S1IsSigned));
17116 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
17122 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
17123 return std::nullopt;
17135 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
17136 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
17141 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
17147 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
17148 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
17149 return std::nullopt;
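// --- Editor's illustrative sketch (not part of the source): a compact
// model of the signedness lattice checkDot4MulSignedness walks above.
// The return values on the elided lines are inferred, not verbatim.
#include <optional>
static std::optional<bool> dot4Signedness(bool S0Signed, bool S0Unsigned,
                                          bool S1Signed, bool S1Unsigned) {
  if ((S0Unsigned && S1Unsigned) || (S0Signed && S1Signed))
    return S0Signed;      // agreement: true selects sdot4, false udot4
  if ((S0Unsigned && S1Signed) || (S0Signed && S1Unsigned))
    return std::nullopt;  // contradictory operands: no dot combine
  if ((S0Signed && !(S1Signed || S1Unsigned)) ||
      (S1Signed && !(S0Signed || S0Unsigned)))
    return true;          // one known-signed, one unknown: sign-extend both
  if (!(S0Signed || S0Unsigned) && !(S1Signed || S1Unsigned))
    return false;         // nothing known: default to the unsigned form
  return std::nullopt;    // known-unsigned paired with unknown: give up
}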
17155 DAGCombinerInfo &DCI) const {
17156 SelectionDAG &DAG = DCI.DAG;
17157 EVT VT = N->getValueType(0);
17163 if (Subtarget->hasMad64_32()) {
17164 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17169 if (SDValue V = reassociateScalarOps(N, DAG)) {
17173 if (VT == MVT::i64) {
17174 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17179 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
17181 std::optional<bool> IsSigned;
17187 int ChainLength = 0;
17188 for (int I = 0; I < 4; I++) {
17192 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
17195 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
17200 TempNode->getOperand(MulIdx), *Src0, *Src1,
17201 TempNode->getOperand(MulIdx)->getOperand(0),
17202 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
17206 IsSigned = *IterIsSigned;
17207 if (*IterIsSigned != *IsSigned)
17210 auto AddIdx = 1 - MulIdx;
17213 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
17214 Src2s.push_back(TempNode->getOperand(AddIdx));
17224 TempNode->getOperand(AddIdx), *Src0, *Src1,
17225 TempNode->getOperand(AddIdx)->getOperand(0),
17226 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
17230 if (*IterIsSigned != *IsSigned)
17234 ChainLength = I + 2;
17238 TempNode = TempNode->getOperand(AddIdx);
17240 ChainLength = I + 1;
17241 if (TempNode->getNumOperands() < 2)
17243 LHS = TempNode->getOperand(0);
17244 RHS = TempNode->getOperand(1);
17247 if (ChainLength < 2)
17253 if (ChainLength < 4) {
17263 bool UseOriginalSrc = false;
17264 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
17265 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
17266 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
17267 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
17268 SmallVector<unsigned, 4> SrcBytes;
17269 auto Src0Mask = Src0s.begin()->PermMask;
17270 SrcBytes.push_back(Src0Mask & 0xFF000000);
17271 bool UniqueEntries = true;
17272 for (auto I = 1; I < 4; I++) {
17273 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
17276 UniqueEntries = false;
17282 if (UniqueEntries) {
17283 UseOriginalSrc = true;
17285 auto *FirstElt = Src0s.begin();
17289 auto *SecondElt = Src1s.begin();
17291 SecondElt->DWordOffset);
17300 if (!UseOriginalSrc) {
17307 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
17310 : Intrinsic::amdgcn_udot4,
17320 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
17325 unsigned Opc = LHS.getOpcode();
17337 auto Cond = RHS.getOperand(0);
17342 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
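// --- Editor's illustrative sketch (not part of the source): what the
// amdgcn_sdot4/udot4 node built above computes, as scalar arithmetic:
//   dot4(a, b, c) = c + sum over i in 0..3 of ext(byte_i(a)) * ext(byte_i(b))
// with sign extension for sdot4 and zero extension for udot4.
#include <cstdint>
static int32_t sdot4(uint32_t A, uint32_t B, int32_t C) {
  int32_t Acc = C;
  for (int I = 0; I < 4; ++I)
    Acc += int32_t(int8_t(A >> (8 * I))) * int32_t(int8_t(B >> (8 * I)));
  return Acc;
}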
17359 DAGCombinerInfo &DCI) const {
17360 SelectionDAG &DAG = DCI.DAG;
17362 EVT VT = N->getValueType(0);
17375 SDNodeFlags ShlFlags = N1->getFlags();
17379 SDNodeFlags NewShlFlags =
17384 DCI.AddToWorklist(Inner.getNode());
17391 if (Subtarget->hasMad64_32()) {
17392 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17401 if (VT == MVT::i64) {
17402 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17415 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
17416 Y->isDivergent() != Z->isDivergent()) {
17425 if (Y->isDivergent())
17428 SDNodeFlags ReassocFlags =
17431 DCI.AddToWorklist(UniformInner.getNode());
17443 DAGCombinerInfo &DCI) const {
17444 SelectionDAG &DAG = DCI.DAG;
17445 EVT VT = N->getValueType(0);
17447 if (VT == MVT::i64) {
17448 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17452 if (VT != MVT::i32)
17461 unsigned Opc = RHS.getOpcode();
17468 auto Cond = RHS.getOperand(0);
17473 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17499 ConstantSDNode *ShiftAmt =
17501 unsigned BitWidth = X.getValueType().getScalarSizeInBits();
17512 SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
17513 DAGCombinerInfo &DCI) const {
17515 if (N->getValueType(0) != MVT::i32)
17521 SelectionDAG &DAG = DCI.DAG;
17526 unsigned LHSOpc = LHS.getOpcode();
17527 unsigned Opc = N->getOpcode();
17531 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
17537 DAGCombinerInfo &DCI) const {
17541 SelectionDAG &DAG = DCI.DAG;
17542 EVT VT = N->getValueType(0);
17554 if (A == LHS.getOperand(1)) {
17555 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17556 if (FusedOp != 0) {
17558 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
17566 if (A == RHS.getOperand(1)) {
17567 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17568 if (FusedOp != 0) {
17570 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
17579 DAGCombinerInfo &DCI) const {
17583 SelectionDAG &DAG = DCI.DAG;
17585 EVT VT = N->getValueType(0);
17598 if (A == LHS.getOperand(1)) {
17599 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17600 if (FusedOp != 0) {
17604 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
17613 if (A == RHS.getOperand(1)) {
17614 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17615 if (FusedOp != 0) {
17617 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
17626 DAGCombinerInfo &DCI) const {
17627 SelectionDAG &DAG = DCI.DAG;
17629 EVT VT = N->getValueType(0);
17638 SDNodeFlags Flags = N->getFlags();
17639 SDNodeFlags RHSFlags = RHS->getFlags();
17645 bool IsNegative = false;
17646 if (CLHS->isExactlyValue(1.0) ||
17647 (IsNegative = CLHS->isExactlyValue(-1.0))) {
17653 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
17663 DAGCombinerInfo &DCI) const {
17664 SelectionDAG &DAG = DCI.DAG;
17665 EVT VT = N->getValueType(0);
17669 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
17670 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17685 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17690 const ConstantFPSDNode *FalseNode =
17700 if (ScalarVT == MVT::f32 &&
17706 if (TrueNodeExpVal == INT_MIN)
17709 if (FalseNodeExpVal == INT_MIN)
17729 DAGCombinerInfo &DCI) const {
17730 SelectionDAG &DAG = DCI.DAG;
17731 EVT VT = N->getValueType(0);
17734 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17752 (N->getFlags().hasAllowContract() &&
17753 FMA->getFlags().hasAllowContract())) {
17787 if (Vec1 == Vec2 || Vec3 == Vec4)
17793 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17794 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
17837 EVT VT = LHS.getValueType();
17838 assert(VT == MVT::f64 && "Incorrect operand type!");
17870 if (CC == ISD::SETOEQ && LHSMaybeNaN && RHSMaybeNaN)
17874 if (CC == ISD::SETUEQ && (LHSMaybeNaN || RHSMaybeNaN))
17878 if (CC == ISD::SETONE && (LHSMaybeNaN || RHSMaybeNaN))
17882 if (CC == ISD::SETUNE && LHSMaybeNaN && RHSMaybeNaN)
17885 const std::optional<bool> KnownEq =
17914 if (CC == ISD::SETULT && (LHSMaybeNaN || RHSMaybeNaN))
17918 if (CC == ISD::SETOGE && (LHSMaybeNaN || RHSMaybeNaN))
17926 const std::optional<bool> KnownUge =
17951 if (CC == ISD::SETOLE && (LHSMaybeNaN || RHSMaybeNaN))
17965 if (CC == ISD::SETUGT && (LHSMaybeNaN || RHSMaybeNaN))
17968 const std::optional<bool> KnownUle =
17991 DAGCombinerInfo &DCI) const {
17992 SelectionDAG &DAG = DCI.DAG;
17997 EVT VT = LHS.getValueType();
18026 return LHS.getOperand(0);
18040 const APInt &CT = LHS.getConstantOperandAPInt(1);
18041 const APInt &CF = LHS.getConstantOperandAPInt(2);
18046 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
18049 return LHS.getOperand(0);
18070 if (VT == MVT::i64) {
18082 const std::optional<bool> KnownEq =
18090 const std::optional<bool> KnownEq =
18101 const std::optional<bool> KnownUge =
18121 const std::optional<bool> KnownUle =
18172 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
18177 {Op0Hi, Op1Hi, CarryInHi});
18187 DCI.CombineTo(LHS.getNode(), Result);
18191 if (VT != MVT::f32 && VT != MVT::f64 &&
18192 (!Subtarget->has16BitInsts() || VT != MVT::f16))
18207 const unsigned IsInfMask =
18209 const unsigned IsFiniteMask =
18214 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
18219 if (VT == MVT::f64) {
18230 SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
18231 DAGCombinerInfo &DCI) const {
18232 SelectionDAG &DAG = DCI.DAG;
18234 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
18253 unsigned ShiftOffset = 8 * Offset;
18255 ShiftOffset -= C->getZExtValue();
18257 ShiftOffset += C->getZExtValue();
18259 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
18260 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
18261 MVT::f32, Shifted);
18272 DCI.AddToWorklist(N);
18279 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
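// --- Editor's illustrative sketch (not part of the source): the identity
// behind the shift folding above. CVT_F32_UBYTEn converts byte n of the
// source, so a byte-aligned shift moves the byte index instead:
//   ubyte_n(x << c) == ubyte_(n - c/8)(x),  ubyte_n(x >> c) == ubyte_(n + c/8)(x),
// valid while the adjusted index stays in 0..3 (ShiftOffset < 32 above).
#include <cstdint>
static float cvtF32UByte(uint32_t X, unsigned N) {
  return float((X >> (8 * N)) & 0xff);
}
// e.g. cvtF32UByte(X >> 8, 0) == cvtF32UByte(X, 1) for any X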
18285 DAGCombinerInfo &DCI) const {
18290 const MachineFunction &MF = DCI.DAG.getMachineFunction();
18294 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
18295 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
18298 APFloat One(F.getSemantics(), "1.0");
18300 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
18306 DAGCombinerInfo &DCI) const {
18327 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
18328 bool isInteger = LHS.getValueType().isInteger();
18331 if (!isFloatingPoint && !isInteger)
18336 if (!isEquality && !isNonEquality)
18353 if (isFloatingPoint) {
18355 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
18358 const std::optional<int64_t> Val =
18367 if (!(isEquality && TrueVal == ConstVal) &&
18368 !(isNonEquality && FalseVal == ConstVal))
18375 SelectLHS, SelectRHS);
18380 switch (N->getOpcode()) {
18396 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
18406 switch (N->getOpcode()) {
18408 return performAddCombine(N, DCI);
18410 return performPtrAddCombine(N, DCI);
18412 return performSubCombine(N, DCI);
18415 return performAddCarrySubCarryCombine(N, DCI);
18417 return performFAddCombine(N, DCI);
18419 return performFSubCombine(N, DCI);
18421 return performFDivCombine(N, DCI);
18423 return performFMulCombine(N, DCI);
18425 return performSetCCCombine(N, DCI);
18427 if (auto Res = performSelectCombine(N, DCI))
18442 case AMDGPUISD::FMIN_LEGACY:
18443 case AMDGPUISD::FMAX_LEGACY:
18444 return performMinMaxCombine(N, DCI);
18446 return performFMACombine(N, DCI);
18448 return performAndCombine(N, DCI);
18450 return performOrCombine(N, DCI);
18453 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
18454 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
18460 return performXorCombine(N, DCI);
18463 return performZeroOrAnyExtendCombine(N, DCI);
18465 return performSignExtendInRegCombine(N, DCI);
18466 case AMDGPUISD::FP_CLASS:
18467 return performClassCombine(N, DCI);
18469 return performFCanonicalizeCombine(N, DCI);
18470 case AMDGPUISD::RCP:
18471 return performRcpCombine(N, DCI);
18473 case AMDGPUISD::FRACT:
18474 case AMDGPUISD::RSQ:
18475 case AMDGPUISD::RCP_LEGACY:
18476 case AMDGPUISD::RCP_IFLAG:
18477 case AMDGPUISD::RSQ_CLAMP: {
18486 return performUCharToFloatCombine(N, DCI);
18488 return performFCopySignCombine(N, DCI);
18489 case AMDGPUISD::CVT_F32_UBYTE0:
18490 case AMDGPUISD::CVT_F32_UBYTE1:
18491 case AMDGPUISD::CVT_F32_UBYTE2:
18492 case AMDGPUISD::CVT_F32_UBYTE3:
18493 return performCvtF32UByteNCombine(N, DCI);
18494 case AMDGPUISD::FMED3:
18495 return performFMed3Combine(N, DCI);
18496 case AMDGPUISD::CVT_PKRTZ_F16_F32:
18497 return performCvtPkRTZCombine(N, DCI);
18498 case AMDGPUISD::CLAMP:
18499 return performClampCombine(N, DCI);
18502 EVT VT = N->getValueType(0);
18505 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
18508 EVT EltVT = Src.getValueType();
18509 if (EltVT != MVT::i16)
18519 return performExtractVectorEltCombine(N, DCI);
18521 return performInsertVectorEltCombine(N, DCI);
18523 return performFPRoundCombine(N, DCI);
18532 return performMemSDNodeCombine(MemNode, DCI);
18563 unsigned Opcode = Node->getMachineOpcode();
18566 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
18567 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
18570 SDNode *Users[5] = {nullptr};
18572 unsigned DmaskIdx =
18573 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
18574 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
18575 unsigned NewDmask = 0;
18576 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
18577 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
18578 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
18579 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
18580 unsigned TFCLane = 0;
18581 bool HasChain = Node->getNumValues() > 1;
18583 if (OldDmask == 0) {
18591 TFCLane = OldBitsSet;
18595 for (SDUse &Use : Node->uses()) {
18598 if (Use.getResNo() != 0)
18601 SDNode *User = Use.getUser();
18604 if (!User->isMachineOpcode() ||
18605 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
18617 if (UsesTFC && Lane == TFCLane) {
18622 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
18624 Dmask &= ~(1 << Comp);
18632 NewDmask |= 1 << Comp;
18637 bool NoChannels = !NewDmask;
18644 if (OldBitsSet == 1)
18650 if (NewDmask == OldDmask)
18659 unsigned NewChannels = BitsSet + UsesTFC;
18663 assert(NewOpcode != -1 &&
18664 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
18665 "failed to find equivalent MIMG op");
18673 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
18675 MVT ResultVT = NewChannels == 1
18678 : NewChannels == 5 ? 8
18680 SDVTList NewVTList =
18683 MachineSDNode *NewNode =
18692 if (NewChannels == 1) {
18702 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18707 if (i || !NoChannels)
18712 if (NewUser != User) {
18722 Idx = AMDGPU::sub1;
18725 Idx = AMDGPU::sub2;
18728 Idx = AMDGPU::sub3;
18731 Idx = AMDGPU::sub4;
18742 Op = Op.getOperand(0);
18767 Node->getOperand(0), SL, VReg, SrcVal,
18773 return ToResultReg.getNode();
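// --- Editor's illustrative sketch (not part of the source): the dmask
// shrinking adjustWritemask performs above, modeled on plain integers.
// Result lane k corresponds to the k-th set bit of the dmask; only the
// bits whose lane is actually extracted by a user are kept.
#include <cstdint>
static unsigned shrinkDmask(unsigned OldDmask, unsigned UsedLanes) {
  unsigned NewDmask = 0;
  unsigned Lane = 0;
  for (unsigned Dmask = OldDmask; Dmask != 0; ++Lane) {
    unsigned Comp = Dmask & ~(Dmask - 1); // lowest remaining component
    if (UsedLanes & (1u << Lane))
      NewDmask |= Comp;
    Dmask &= ~Comp;
  }
  return NewDmask; // e.g. shrinkDmask(0b1111, 0b0101) == 0b0101
}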
18778 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
18780 Ops.push_back(Node->getOperand(i));
18786 Node->getOperand(i).getValueType(),
18787 Node->getOperand(i)),
18799 unsigned Opcode = Node->getMachineOpcode();
18801 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
18802 !TII->isGather4(Opcode) &&
18804 return adjustWritemask(Node, DAG);
18807 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
18813 case AMDGPU::V_DIV_SCALE_F32_e64:
18814 case AMDGPU::V_DIV_SCALE_F64_e64: {
18824 (Src0 == Src1 || Src0 == Src2))
18880 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
18881 unsigned InitIdx = 0;
18883 if (TII->isImage(MI)) {
18891 unsigned TFEVal = TFE ? TFE->getImm() : 0;
18892 unsigned LWEVal = LWE ? LWE->getImm() : 0;
18893 unsigned D16Val = D16 ? D16->getImm() : 0;
18895 if (!TFEVal && !LWEVal)
18906 assert(MO_Dmask && "Expected dmask operand in instruction");
18908 unsigned dmask = MO_Dmask->getImm();
18913 bool Packed = !Subtarget->hasUnpackedD16VMem();
18915 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
18922 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
18923 if (DstSize < InitIdx)
18927 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
18936 unsigned NewDst = 0;
18941 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18942 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18945 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18966 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
18978 if (TII->isVOP3(MI.getOpcode())) {
18980 TII->legalizeOperandsVOP3(MRI, MI);
18982 if (TII->isMAI(MI)) {
18987 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18988 AMDGPU::OpName::scale_src0);
18989 if (Src0Idx != -1) {
18990 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18991 AMDGPU::OpName::scale_src1);
18992 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
18993 TII->usesConstantBus(MRI, MI, Src1Idx))
18994 TII->legalizeOpWithMove(MI, Src1Idx);
19001 if (TII->isImage(MI))
19002 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
19076 std::pair<unsigned, const TargetRegisterClass *>
19083 if (Constraint.size() == 1) {
19087 if (VT == MVT::Other)
19090 switch (Constraint[0]) {
19097 RC = &AMDGPU::SReg_32RegClass;
19100 RC = &AMDGPU::SGPR_64RegClass;
19105 return std::pair(0U, nullptr);
19112 return std::pair(0U, nullptr);
19114 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
19115 : &AMDGPU::VGPR_32_Lo256RegClass;
19118 RC = Subtarget->has1024AddressableVGPRs()
19119 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
19122 return std::pair(0U, nullptr);
19127 if (!Subtarget->hasMAIInsts())
19131 return std::pair(0U, nullptr);
19133 RC = &AMDGPU::AGPR_32RegClass;
19138 return std::pair(0U, nullptr);
19143 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
19147 RC = &AMDGPU::AV_32RegClass;
19150 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
19152 return std::pair(0U, nullptr);
19161 return std::pair(0U, RC);
19164 if (Kind != '\0') {
19166 RC = &AMDGPU::VGPR_32_Lo256RegClass;
19167 } else if (Kind == 's') {
19168 RC = &AMDGPU::SGPR_32RegClass;
19169 } else if (Kind == 'a') {
19170 RC = &AMDGPU::AGPR_32RegClass;
19176 return std::pair(0U, nullptr);
19182 return std::pair(0U, nullptr);
19186 RC = TRI->getVGPRClassForBitWidth(Width);
19188 RC = TRI->getSGPRClassForBitWidth(Width);
19190 RC = TRI->getAGPRClassForBitWidth(Width);
19192 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
19197 return std::pair(0U, nullptr);
19199 return std::pair(Reg, RC);
19205 return std::pair(0U, nullptr);
19206 if (RC && Idx < RC->getNumRegs())
19208 return std::pair(0U, nullptr);
19214 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
19220 if (Constraint.size() == 1) {
19221 switch (Constraint[0]) {
19231 } else if (Constraint == "DA" || Constraint == "DB") {
19239 if (Constraint.size() == 1) {
19240 switch (Constraint[0]) {
19248 } else if (Constraint.size() == 2) {
19249 if (Constraint == "VA")
19267 std::vector<SDValue> &Ops,
19282 unsigned Size = Op.getScalarValueSizeInBits();
19286 if (Size == 16 && !Subtarget->has16BitInsts())
19290 Val = C->getSExtValue();
19294 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19298 if (Size != 16 || Op.getNumOperands() != 2)
19300 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
19303 Val = C->getSExtValue();
19307 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19317 if (Constraint.size() == 1) {
19318 switch (Constraint[0]) {
19333 } else if (Constraint.size() == 2) {
19334 if (Constraint == "DA") {
19335 int64_t HiBits = static_cast<int32_t>(Val >> 32);
19336 int64_t LoBits = static_cast<int32_t>(Val);
19340 if (Constraint == "DB") {
19348 unsigned MaxSize) const {
19349 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
19350 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
19352 MVT VT = Op.getSimpleValueType();
19377 switch (UnalignedClassID) {
19378 case AMDGPU::VReg_64RegClassID:
19379 return AMDGPU::VReg_64_Align2RegClassID;
19380 case AMDGPU::VReg_96RegClassID:
19381 return AMDGPU::VReg_96_Align2RegClassID;
19382 case AMDGPU::VReg_128RegClassID:
19383 return AMDGPU::VReg_128_Align2RegClassID;
19384 case AMDGPU::VReg_160RegClassID:
19385 return AMDGPU::VReg_160_Align2RegClassID;
19386 case AMDGPU::VReg_192RegClassID:
19387 return AMDGPU::VReg_192_Align2RegClassID;
19388 case AMDGPU::VReg_224RegClassID:
19389 return AMDGPU::VReg_224_Align2RegClassID;
19390 case AMDGPU::VReg_256RegClassID:
19391 return AMDGPU::VReg_256_Align2RegClassID;
19392 case AMDGPU::VReg_288RegClassID:
19393 return AMDGPU::VReg_288_Align2RegClassID;
19394 case AMDGPU::VReg_320RegClassID:
19395 return AMDGPU::VReg_320_Align2RegClassID;
19396 case AMDGPU::VReg_352RegClassID:
19397 return AMDGPU::VReg_352_Align2RegClassID;
19398 case AMDGPU::VReg_384RegClassID:
19399 return AMDGPU::VReg_384_Align2RegClassID;
19400 case AMDGPU::VReg_512RegClassID:
19401 return AMDGPU::VReg_512_Align2RegClassID;
19402 case AMDGPU::VReg_1024RegClassID:
19403 return AMDGPU::VReg_1024_Align2RegClassID;
19404 case AMDGPU::AReg_64RegClassID:
19405 return AMDGPU::AReg_64_Align2RegClassID;
19406 case AMDGPU::AReg_96RegClassID:
19407 return AMDGPU::AReg_96_Align2RegClassID;
19408 case AMDGPU::AReg_128RegClassID:
19409 return AMDGPU::AReg_128_Align2RegClassID;
19410 case AMDGPU::AReg_160RegClassID:
19411 return AMDGPU::AReg_160_Align2RegClassID;
19412 case AMDGPU::AReg_192RegClassID:
19413 return AMDGPU::AReg_192_Align2RegClassID;
19414 case AMDGPU::AReg_256RegClassID:
19415 return AMDGPU::AReg_256_Align2RegClassID;
19416 case AMDGPU::AReg_512RegClassID:
19417 return AMDGPU::AReg_512_Align2RegClassID;
19418 case AMDGPU::AReg_1024RegClassID:
19419 return AMDGPU::AReg_1024_Align2RegClassID;
19435 if (Info->isEntryFunction()) {
19442 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
19444 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
19445 : TRI->getAlignedHighSGPRForRC(MF, 2,
19446 &AMDGPU::SGPR_64RegClass);
19447 Info->setSGPRForEXECCopy(SReg);
19449 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
19450 Info->getStackPtrOffsetReg()));
19451 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
19452 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
19456 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
19457 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
19459 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
19462 Info->limitOccupancy(MF);
19464 if (ST.isWave32() && !MF.empty()) {
19465 for (auto &MBB : MF) {
19466 for (auto &MI : MBB) {
19467 TII->fixImplicitOperands(MI);
19477 if (ST.needsAlignedVGPRs()) {
19484 if (NewClassID != -1)
19494 const APInt &DemandedElts,
19496 unsigned Depth) const {
19498 unsigned Opc = Op.getOpcode();
19501 unsigned IID = Op.getConstantOperandVal(0);
19503 case Intrinsic::amdgcn_mbcnt_lo:
19504 case Intrinsic::amdgcn_mbcnt_hi: {
19510 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
19520 Op, Known, DemandedElts, DAG, Depth);
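// --- Editor's illustrative sketch (not part of the source): why the
// mbcnt known-bits fact above holds. mbcnt_lo(~0, 0) counts the active
// lanes below the current lane, so its value never exceeds
// WavefrontSize - 1; every bit at or above WavefrontSizeLog2 is known
// zero (and mbcnt_hi can add at most 32 more lanes, hence the "5").
#include <cstdint>
static uint64_t mbcntMaxValue(unsigned WavefrontSizeLog2) {
  return (1ull << WavefrontSizeLog2) - 1; // largest possible lane count
}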
19536 unsigned MaxValue =
19543 unsigned BFEWidth, bool SExt, unsigned Depth) {
19547 unsigned Src1Cst = 0;
19548 if (Src1.isImm()) {
19549 Src1Cst = Src1.getImm();
19550 } else if (Src1.isReg()) {
19554 Src1Cst = Cst->Value.getZExtValue();
19565 if (Width >= BFEWidth)
19574 Known = Known.sext(BFEWidth);
19576 Known = Known.zext(BFEWidth);
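// --- Editor's illustrative sketch (not part of the source): scalar
// semantics of the S_BFE family analyzed above. Src1 packs the field
// offset in bits [5:0] and the field width in bits [22:16]; the field is
// extracted and zero- (U) or sign- (I) extended to the register width.
#include <cstdint>
static uint64_t bfeU64(uint64_t Src0, uint32_t Src1) {
  unsigned Offset = Src1 & 0x3f;
  unsigned Width = (Src1 >> 16) & 0x7f;
  if (Width == 0)
    return 0;
  if (Width >= 64)
    return Src0 >> Offset; // field covers the rest of the register
  return (Src0 >> Offset) & ((1ull << Width) - 1);
}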
19582 unsigned Depth) const {
19585 switch (MI->getOpcode()) {
19586 case AMDGPU::S_BFE_I32:
19589 case AMDGPU::S_BFE_U32:
19592 case AMDGPU::S_BFE_I64:
19595 case AMDGPU::S_BFE_U64:
19598 case AMDGPU::G_INTRINSIC:
19599 case AMDGPU::G_INTRINSIC_CONVERGENT: {
19602 case Intrinsic::amdgcn_workitem_id_x:
19605 case Intrinsic::amdgcn_workitem_id_y:
19608 case Intrinsic::amdgcn_workitem_id_z:
19611 case Intrinsic::amdgcn_mbcnt_lo:
19612 case Intrinsic::amdgcn_mbcnt_hi: {
19624 case Intrinsic::amdgcn_groupstaticsize: {
19635 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
19638 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
19641 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
19646 case AMDGPU::G_AMDGPU_SMED3:
19647 case AMDGPU::G_AMDGPU_UMED3: {
19648 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
19675 unsigned Depth) const {
19682 AttributeList Attrs =
19684 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
19702 if (Header->getAlignment() != PrefAlign)
19703 return Header->getAlignment();
19704 if (needsFetchWindowAlignment(*Header))
19725 if (Header->getAlignment() != PrefAlign)
19726 return Header->getAlignment();
19728 unsigned LoopSize = 0;
19733 LoopSize += MBB->getAlignment().value() / 2;
19736 LoopSize += TII->getInstSizeInBytes(MI);
19737 if (LoopSize > 192)
19742 if (LoopSize <= 64)
19745 if (LoopSize <= 128)
19746 return CacheLineAlign;
19752 auto I = Exit->getFirstNonDebugInstr();
19753 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
19754 return CacheLineAlign;
19763 if (PreTerm == Pre->begin() ||
19764 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
19768 auto ExitHead = Exit->getFirstNonDebugInstr();
19769 if (ExitHead == Exit->end() ||
19770 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
19775 return CacheLineAlign;
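// --- Editor's illustrative sketch (not part of the source): the loop
// alignment heuristic above, with thresholds taken from the visible code
// and the return values inferred. Small loops keep the preferred
// alignment; mid-sized loops are cache-line aligned so they span few
// fetch lines; loops beyond 192 bytes gain nothing from padding.
static unsigned pickLoopAlignLog2(unsigned LoopSizeBytes,
                                  unsigned PrefAlignLog2,
                                  unsigned CacheLineAlignLog2) {
  if (LoopSizeBytes <= 64 || LoopSizeBytes > 192)
    return PrefAlignLog2;
  return CacheLineAlignLog2;
}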
19783 if (needsFetchWindowAlignment(*MBB))
19788 bool SITargetLowering::needsFetchWindowAlignment(
19790 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
19794 if (MI.isMetaInstruction())
19797 return TII->getInstSizeInBytes(MI) > 4;
19807 N = N->getOperand(0).getNode();
19817 switch (N->getOpcode()) {
19825 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
19826 return !TRI->isSGPRReg(MRI, Reg);
19832 return !TRI->isSGPRReg(MRI, Reg);
19836 unsigned AS = L->getAddressSpace();
19846 case AMDGPUISD::ATOMIC_CMP_SWAP:
19847 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
19848 case AMDGPUISD::BUFFER_ATOMIC_ADD:
19849 case AMDGPUISD::BUFFER_ATOMIC_SUB:
19850 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
19851 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
19852 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
19853 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
19854 case AMDGPUISD::BUFFER_ATOMIC_AND:
19855 case AMDGPUISD::BUFFER_ATOMIC_OR:
19856 case AMDGPUISD::BUFFER_ATOMIC_XOR:
19857 case AMDGPUISD::BUFFER_ATOMIC_INC:
19858 case AMDGPUISD::BUFFER_ATOMIC_DEC:
19859 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
19860 case AMDGPUISD::BUFFER_ATOMIC_FADD:
19861 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
19862 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
19868 return A->readMem() && A->writeMem();
19889 switch (Ty.getScalarSizeInBits()) {
19901 const APInt &DemandedElts,
19904 unsigned Depth) const {
19905 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
19909 if (Info->getMode().DX10Clamp)
19921 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
19941 << "Hardware instruction generated for atomic "
19943 << " operation at memory scope " << MemScope;
19948 Type *EltTy = VT->getElementType();
19949 return VT->getNumElements() == 2 &&
19969 unsigned BW = IT->getBitWidth();
19970 return BW == 32 || BW == 64;
19984 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
19985 return BW == 32 || BW == 64;
19988 if (Ty->isFloatTy() || Ty->isDoubleTy())
19992 return VT->getNumElements() == 2 &&
19993 VT->getElementType()->getPrimitiveSizeInBits() == 16;
20003 bool HasSystemScope) {
20010 if (HasSystemScope) {
20011 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
20014 if (Subtarget.hasEmulatedSystemScopeAtomics())
20016 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
20019 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
20032 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
20040 return STI.hasGloballyAddressableScratch()
20058 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
20071 bool HasSystemScope =
20103 if (!IT || IT->getBitWidth() != 32)
20109 if (Subtarget->hasEmulatedSystemScopeAtomics())
20125 if (!HasSystemScope &&
20126 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
20138 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
20146 ConstVal && ConstVal->isNullValue())
20184 if (Ty->isFloatTy()) {
20189 if (Ty->isDoubleTy()) {
20210 if (Ty->isFloatTy() &&
20211 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
20224 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20228 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
20232 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20237 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
20242 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
20246 if (Ty->isFloatTy()) {
20249 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20252 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20257 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
20265 if (Subtarget->hasFlatAtomicFaddF32Inst())
20274 if (Subtarget->hasLDSFPAtomicAddF32()) {
20275 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20277 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20305 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
20307 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
20311 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
20313 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
20367 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
20368 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
20369 : &AMDGPU::SReg_32RegClass;
20370 if (!TRI->isSGPRClass(RC) && !isDivergent)
20371 return TRI->getEquivalentSGPRClass(RC);
20372 if (TRI->isSGPRClass(RC) && isDivergent) {
20373 if (Subtarget->hasGFX90AInsts())
20374 return TRI->getEquivalentAVClass(RC);
20375 return TRI->getEquivalentVGPRClass(RC);
20388 unsigned WaveSize) {
20393 if (!IT || IT->getBitWidth() != WaveSize)
20398 if (!Visited.insert(V).second)
20400 bool Result = false;
20401 for (const auto *U : V->users()) {
20403 if (V == U->getOperand(1)) {
20408 case Intrinsic::amdgcn_if_break:
20409 case Intrinsic::amdgcn_if:
20410 case Intrinsic::amdgcn_else:
20415 if (V == U->getOperand(0)) {
20420 case Intrinsic::amdgcn_end_cf:
20421 case Intrinsic::amdgcn_loop:
20427 Result = hasCFUser(U, Visited, WaveSize);
20436 const Value *V) const {
20438 if (CI->isInlineAsm()) {
20447 for (auto &TC : TargetConstraints) {
20461 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
20496 if (I.getMetadata("amdgpu.noclobber"))
20498 if (I.getMetadata("amdgpu.last.use"))
20562 Alignment = RMW->getAlign();
20575 bool FullFlatEmulation =
20577 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
20578 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
20579 RMW->getType()->isDoubleTy()));
20582 bool ReturnValueIsUsed = !AI->use_empty();
20591 if (FullFlatEmulation) {
20602 std::prev(BB->end())->eraseFromParent();
20603 Builder.SetInsertPoint(BB);
20605 Value *LoadedShared = nullptr;
20606 if (FullFlatEmulation) {
20607 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
20608 {Addr}, nullptr, "is.shared");
20609 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
20610 Builder.SetInsertPoint(SharedBB);
20611 Value *CastToLocal = Builder.CreateAddrSpaceCast(
20617 LoadedShared = Clone;
20619 Builder.CreateBr(PhiBB);
20620 Builder.SetInsertPoint(CheckPrivateBB);
20623 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
20624 {Addr}, nullptr, "is.private");
20625 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
20627 Builder.SetInsertPoint(PrivateBB);
20629 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
20632 Value *LoadedPrivate;
20634 LoadedPrivate = Builder.CreateAlignedLoad(
20635 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
20638 LoadedPrivate, RMW->getValOperand());
20640 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
20642 auto [ResultLoad, Equal] =
20648 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
20651 Builder.CreateBr(PhiBB);
20653 Builder.SetInsertPoint(GlobalBB);
20657 if (FullFlatEmulation) {
20658 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
20667 if (!FullFlatEmulation) {
20672 MDNode *RangeNotPrivate =
20675 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
20679 Builder.CreateBr(PhiBB);
20681 Builder.SetInsertPoint(PhiBB);
20683 if (ReturnValueIsUsed) {
20686 if (FullFlatEmulation)
20687 Loaded->addIncoming(LoadedShared, SharedBB);
20688 Loaded->addIncoming(LoadedPrivate, PrivateBB);
20689 Loaded->addIncoming(LoadedGlobal, GlobalBB);
20690 Loaded->takeName(AI);
20693 Builder.CreateBr(ExitBB);
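// --- Editor's illustrative sketch (not part of the source): the
// control-flow shape the flat-atomic expansion above produces. A flat
// atomicrmw that might target LDS or private memory is split on the
// address space at run time and merged back with a PHI:
//
//   entry:         is.shared = llvm.amdgcn.is.shared(addr)
//                  br is.shared, %shared, %check.private
//   shared:        atomic op on the addrspace(3) cast      -> br %phi
//   check.private: is.private = llvm.amdgcn.is.private(addr)
//                  br is.private, %private, %global
//   private:       plain load; apply op; plain store       -> br %phi
//   global:        atomic op on the addrspace(1) cast      -> br %phi
//   phi:           result = phi [shared], [private], [global]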
20697 unsigned PtrOpIdx) {
20698 Value *PtrOp = I->getOperand(PtrOpIdx);
20705 I->setOperand(PtrOpIdx, ASCast);
20717 ConstVal && ConstVal->isNullValue()) {
20747 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
20755 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
20770 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
static unsigned getIntrinsicID(const SDNode *N)
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
static bool isCtlzOpc(unsigned Opc)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic operations.
Contains matchers for matching SelectionDAG nodes and values.
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static MachineBasicBlock * expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static std::tuple< unsigned, unsigned > getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static std::pair< Register, Register > ExtractSubRegs(MachineInstr &MI, MachineOperand &Op, const TargetRegisterClass *SrcRC, const GCNSubtarget &ST, MachineRegisterInfo &MRI)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsic ID.
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static ISD::CondCode tryReduceF64CompareToHiHalf(const ISD::CondCode CC, const SDValue LHS, const SDValue RHS, const SelectionDAG &DAG)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
static uint64_t getIdentityValueForWaveReduction(unsigned Opc)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static constexpr int Concat[]
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
void setUsesDynamicLDS(bool DynLDS)
bool isBottomOfStack() const
uint32_t getLDSSize() const
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known never to be any NaN; if SNaN is true, whether Op is known never to be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
static bool EnableObjectLinking
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
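Together, these APFloat factories and conversions cover most of the FP constant manipulation done in this file. A minimal sketch, assuming nothing beyond the APFloat API itself (values are illustrative):

#include "llvm/ADT/APFloat.h"
using namespace llvm;

// Build +inf in half precision, widen it to single precision, and read
// back the raw IEEE-754 encoding (0x7f800000 for +inf).
static uint64_t infBitsAsFloat() {
  APFloat Val = APFloat::getInf(APFloat::IEEEhalf(), /*Negative=*/false);
  bool LosesInfo = false;
  Val.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &LosesInfo);
  return Val.bitcastToAPInt().getZExtValue();
}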
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
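A short sketch of how these APInt helpers compose; all values are illustrative:

#include "llvm/ADT/APInt.h"
using namespace llvm;

static void apintBitTricks() {
  // Bits [8, 16) of a 32-bit value: 0x0000ff00.
  APInt Mask = APInt::getBitsSet(32, 8, 16);
  unsigned TrailingZeros = Mask.countr_zero(); // 8

  // Set the top 16 bits of a 32-bit value: 0xffff0000.
  APInt Hi(32, 0);
  Hi.setHighBits(16);
  bool Negative = Hi.isSignBitSet(); // true: bit 31 is among them
  (void)TrailingZeros;
  (void)Negative;
}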
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
bool empty() const
Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
An instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
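The atomic lowering in this file dispatches on getOperation() throughout; as a hedged illustration (the classifier itself is hypothetical):

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Hypothetical classifier: true for the floating-point min/max forms.
static bool isAtomicFPMinMax(const AtomicRMWInst *RMW) {
  switch (RMW->getOperation()) {
  case AtomicRMWInst::FMin:
  case AtomicRMWInst::FMax:
    return true;
  default:
    return false;
  }
}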
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
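These methods implement the register-then-stack pattern used by every CCAssignFn; a minimal sketch under that assumption (the helper and its parameters are illustrative):

#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Prefer a register; fall back to a stack slot and return its byte offset.
static int64_t allocateRegOrStack(CCState &State, MCPhysReg Candidate,
                                  unsigned Size, Align Alignment) {
  MCRegister Reg = State.AllocateReg(Candidate);
  if (Reg)
    return -1; // a real CCAssignFn would record a CCValAssign for Reg
  return State.AllocateStack(Size, Alignment);
}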
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
uint64_t getNumOperands() const
A parsed version of the target data layout string in and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Diagnostic information for an unsupported feature in the backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
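These Function accessors are queried constantly during argument lowering; a minimal illustrative use (the helper is hypothetical):

#include "llvm/IR/Function.h"
using namespace llvm;

// Does any formal argument carry the 'inreg' attribute?
static bool anyInRegArgument(const Function &F) {
  for (const Argument &A : F.args())
    if (A.hasAttribute(Attribute::InReg))
      return true;
  return false;
}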
const SIInstrInfo * getInstrInfo() const override
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
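A hedged sketch of the clone/insert pattern these Instruction methods enable (the helper name is hypothetical):

#include <iterator>
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Duplicate an instruction immediately after the original, keeping metadata.
static Instruction *duplicateAfter(Instruction &I) {
  Instruction *Clone = I.clone();
  Clone->insertInto(I.getParent(), std::next(I.getIterator()));
  Clone->copyMetadata(I);
  return Clone;
}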
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
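LLT is the GlobalISel analogue of MVT; a small sketch (the header path below is for recent LLVM trees; older ones use llvm/CodeGen/LowLevelType.h):

#include "llvm/CodeGenTypes/LowLevelType.h"
using namespace llvm;

static void lltExamples() {
  LLT S32 = LLT::scalar(32);                     // plain 32-bit scalar
  LLT P1 = LLT::pointer(/*AddressSpace=*/1, 64); // 64-bit pointer in AS 1
  LLT S16 = S32.changeElementSize(16);           // same shape, 16-bit scalar
  (void)P1;
  (void)S16;
}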
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
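A small illustration of these MVT queries (header path for recent LLVM; older trees use llvm/Support/MachineValueType.h):

#include "llvm/CodeGenTypes/MachineValueType.h"
using namespace llvm;

static void mvtExamples() {
  MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);
  bool IsVec = V4I32.isVector();                   // true
  unsigned NumElts = V4I32.getVectorNumElements(); // 4
  MVT Elt = V4I32.getScalarType();                 // MVT::i32
  (void)IsVec; (void)NumElts; (void)Elt;
}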
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into two pieces at SplitInst.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
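These methods chain off BuildMI; a hedged sketch (the opcode descriptor and operand mix are illustrative):

#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Emit 'Dst = Desc Src, Imm' before iterator I.
static MachineInstr *emitRegImm(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator I,
                                const DebugLoc &DL, const MCInstrDesc &Desc,
                                Register Dst, Register Src, int64_t Imm) {
  return BuildMI(MBB, I, DL, Desc, Dst)
      .addReg(Src)
      .addImm(Imm);
}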
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
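The flags form a bitmask and combine with operator|; for example, the flag set typical of a constant-pool style access:

#include "llvm/CodeGen/MachineMemOperand.h"
using namespace llvm;

// A load that is dereferenceable and always yields the same value.
static const MachineMemOperand::Flags ConstantLoadFlags =
    MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
    MachineMemOperand::MOInvariant;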
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
unsigned getNumVirtRegs() const
getNumVirtRegs - Return the number of virtual registers created.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
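A hedged sketch combining several of these MRI operations (the helper is hypothetical):

#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

// Clone a virtual register (same class and type) and move all uses to it.
static Register retargetVReg(MachineRegisterInfo &MRI, Register Old) {
  Register New = MRI.cloneVirtualRegister(Old);
  MRI.replaceRegWith(Old, New);
  MRI.clearKillFlags(New); // old kill flags may no longer be accurate
  return New;
}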
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
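A small sketch of the SDValue inspection style these accessors support (the peephole shown is illustrative):

#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;

// Peek through a single-use ZERO_EXTEND; otherwise return the value as-is.
static SDValue stripSingleUseZExt(SDValue V) {
  if (V.getOpcode() == ISD::ZERO_EXTEND && V.hasOneUse())
    return V.getOperand(0);
  return V;
}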
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined into an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns whether Op is known never to be any NaN; if SNaN is true, whether Op is known never to be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
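Most of the hooks above share the same override shape: inspect the IR or DAG node, then return a policy enum or a lowered value. As a hedged illustration (the policy shown is hypothetical, not SI's actual behavior):

#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// The kind of decision shouldExpandAtomicRMWInIR makes: expand nand via a
// cmpxchg loop, leave everything else alone.
static TargetLowering::AtomicExpansionKind
classifyAtomicRMW(const AtomicRMWInst *RMW) {
  if (RMW->getOperation() == AtomicRMWInst::Nand)
    return TargetLowering::AtomicExpansionKind::CmpXChg;
  return TargetLowering::AtomicExpansionKind::None;
}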
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
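For instance, a signed max as a single SELECT_CC (sketch; X and Y assumed i32 SDValues):
  SDValue Max = DAG.getSelectCC(DL, X, Y, X, Y, ISD::SETGT); // X > Y ? X : Y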
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
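Sketch of splatting 1.0f across a v4f32:
  SDValue One  = DAG.getConstantFP(1.0f, DL, MVT::f32);
  SDValue Ones = DAG.getSplatBuildVector(MVT::v4f32, DL, One);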
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
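Typical query pattern (sketch):
  KnownBits Known = DAG.computeKnownBits(Op);
  if (Known.countMinLeadingZeros() >= 16) {
    // the top half of Op is provably zero
  }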
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
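Sketch; the direction is picked from the source width:
  // i16 -> i32 zero-extends, i64 -> i32 truncates, i32 -> i32 is a no-op.
  SDValue V32 = DAG.getZExtOrTrunc(V, DL, MVT::i32);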
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
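For example, splitting an i64 into its two i32 halves, low part first (sketch; Val64 assumed in scope):
  auto [Lo, Hi] = DAG.SplitScalar(Val64, DL, MVT::i32, MVT::i32);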
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
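A common visited-set idiom built on the pair returned by insert (sketch):
  SmallPtrSet<SDNode *, 8> Visited;
  if (!Visited.insert(N).second)
    return; // N was already in the set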
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
constexpr bool empty() const
Check if the string is empty.
constexpr size_t size() const
Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
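Sketch with a hypothetical Kind enum, purely for illustration:
  Kind K = StringSwitch<Kind>(Name)
               .Case("add", Kind::Add)
               .Case("sub", Kind::Sub)
               .Default(Kind::Unknown);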
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
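Typical use inside a TargetLowering constructor (sketch; the opcode/type choices are illustrative, not this target's actual table):
  setOperationAction(ISD::FDIV, MVT::f32, Custom); // routed to LowerOperation
  setOperationAction(ISD::SDIV, MVT::i64, Expand); // legalizer open-codes it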
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to ...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
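Sketch:
  // No native truncating i64 -> i16 store; let the legalizer expand it.
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);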
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
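Sketch of the usual pairing with a Promote action (illustrative types):
  setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
  AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32); // perform the load as i32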
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
bool isGFX13(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid char code or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ CTLS
Count leading redundant sign bits.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > OverloadTys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_IntrinsicWOChain(const OpndPreds &...Opnds)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a BasicBlock.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
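Since the check is constexpr, it can be exercised at compile time (sketch):
  static_assert(isInt<16>(32767) && !isInt<16>(32768));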
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
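Sketch of the rounding, including the Skew parameter:
  assert(alignDown(29u, 8u) == 24u);     // largest multiple of 8 <= 29
  assert(alignDown(29u, 8u, 1u) == 25u); // largest value <= 29 that is 1 mod 8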
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
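Sketch:
  assert(PowerOf2Ceil(17) == 32 && PowerOf2Ceil(32) == 32);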
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
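Sketch, together with its Lo_32 counterpart listed below:
  uint64_t V = 0x123456789ABCDEF0ULL;
  assert(Hi_32(V) == 0x12345678u && Lo_32(V) == 0x9ABCDEF0u);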
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
constexpr RegState getUndefRegState(bool B)
@ Custom
The result value requires a custom uniformity check.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
unsigned getBitWidth() const
Get the bit width of this value.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
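Sketch, assuming Known tracks an i16 value:
  KnownBits K32 = Known.zext(32); // the 16 new high bits become known zero
  KnownBits K8  = Known.trunc(8); // keeps knowledge of the low 8 bits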
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
bool isNonZero() const
Returns true if this value is known to be non-zero.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
static LLVM_ABI std::optional< bool > ule(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_ULE result.
static LLVM_ABI std::optional< bool > uge(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_UGE result.
bool isAllOnes() const
Returns true if value is all one bits.
bool isKnownNeverNaN() const
Return true if it's known this can never be a nan.
static LLVM_ABI KnownFPClass bitcast(const fltSemantics &FltSemantics, const KnownBits &Bits)
Report known values for a bitcast into a float with provided semantics.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
std::optional< unsigned > fallbackAddressSpace
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const