44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
56#define DEBUG_TYPE "si-lower"
62 cl::desc(
"Do not align and prefetch loops"),
66 "amdgpu-use-divergent-register-indexing",
cl::Hidden,
67 cl::desc(
"Use indirect register addressing for divergent indexes"),
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (
unsigned Reg = 0;
Reg < NumSGPRs; ++
Reg) {
84 return AMDGPU::SGPR0 +
Reg;
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
212 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
213 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
214 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
215 MVT::i1, MVT::v32i32},
219 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
220 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
221 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
222 MVT::i1, MVT::v32i32},
291 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1},
Expand);
298 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
299 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
300 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
303 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
304 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
305 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
309 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
310 MVT::v3i16, MVT::v4i16, MVT::Other},
315 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64},
Expand);
331 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
332 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
333 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
334 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
335 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
336 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
337 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
338 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
370 for (
MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
384 for (
MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
398 for (
MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
412 for (
MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
426 for (
MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
441 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
442 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
445 if (Subtarget->hasPkMovB32()) {
466 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
467 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
472 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32},
Custom);
476 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
477 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
478 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
479 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
503 if (Subtarget->hasSMemRealTime() ||
508 if (Subtarget->has16BitInsts()) {
518 if (Subtarget->hasMadMacF32Insts())
536 if (Subtarget->hasIntClamp())
539 if (Subtarget->hasAddNoCarryInsts())
545 {MVT::f32, MVT::f64},
Custom);
551 {MVT::f32, MVT::f64},
Legal);
553 if (Subtarget->haveRoundOpsF64())
583 if (Subtarget->has16BitInsts()) {
637 if (Subtarget->hasBF16TransInsts())
653 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
654 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
655 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
687 if (Subtarget->hasVCvtPkIU16F32())
690 {MVT::v2i16, MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16},
817 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
818 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
819 MVT::v32f16, MVT::v32bf16},
829 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
833 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
837 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
838 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
846 if (Subtarget->hasVOP3PInsts()) {
858 {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
Custom);
861 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
862 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
863 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
866 for (
MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
874 for (
MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
882 {MVT::v2f16, MVT::v4f16},
Custom);
888 if (Subtarget->hasBF16PackedInsts()) {
893 for (
MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
900 if (Subtarget->hasPackedFP32Ops()) {
904 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
907 if (Subtarget->hasPackedFP64Ops()) {
919 {MVT::v4f64, MVT::v8f64, MVT::v16f64, MVT::v32f64},
Custom);
922 if (Subtarget->hasPackedU64Ops()) {
926 {MVT::v4i64, MVT::v8i64, MVT::v16i64, MVT::v32i64},
933 if (Subtarget->has16BitInsts()) {
946 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
947 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
948 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
949 MVT::v32f16, MVT::v32bf16},
954 if (Subtarget->hasVMulU64Inst())
956 else if (Subtarget->hasScalarSMulU64())
959 if (Subtarget->hasMad64_32())
962 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
965 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
967 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16},
Legal);
970 if (Subtarget->hasMinimum3Maximum3F32())
973 if (Subtarget->hasMinimum3Maximum3PKF16()) {
977 if (!Subtarget->hasMinimum3Maximum3F16())
982 if (Subtarget->hasVOP3PInsts()) {
985 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
989 if (Subtarget->hasMinMaxI64Insts())
994 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
995 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
1000 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
1001 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
1002 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
1003 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
1007 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
1008 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
1009 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
1010 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
1025 if (Subtarget->hasBF16ConversionInsts()) {
1030 if (Subtarget->hasBF16TransInsts()) {
1034 if (Subtarget->hasCvtPkF16F32Inst()) {
1036 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1087 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1128 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1141 EVT DestVT,
EVT SrcVT)
const {
1143 ((((Opcode ==
ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1144 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1146 (Opcode ==
ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1153 LLT DestTy,
LLT SrcTy)
const {
1154 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1155 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1157 SrcTy.getScalarSizeInBits() == 16 &&
1178 return Subtarget->has16BitInsts()
1184 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1188 if (!Subtarget->has16BitInsts() && VT.
getSizeInBits() == 16)
1210 return (NumElts + 1) / 2;
1216 return NumElts * ((
Size + 31) / 32);
1225 unsigned &NumIntermediates,
MVT &RegisterVT)
const {
1234 MVT SimpleIntermediateVT =
1236 IntermediateVT = SimpleIntermediateVT;
1237 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1238 NumIntermediates = (NumElts + 1) / 2;
1239 return (NumElts + 1) / 2;
1244 IntermediateVT = RegisterVT;
1245 NumIntermediates = NumElts;
1246 return NumIntermediates;
1251 RegisterVT = MVT::i16;
1252 IntermediateVT = ScalarVT;
1253 NumIntermediates = NumElts;
1254 return NumIntermediates;
1258 RegisterVT = MVT::i32;
1259 IntermediateVT = ScalarVT;
1260 NumIntermediates = NumElts;
1261 return NumIntermediates;
1265 RegisterVT = MVT::i32;
1266 IntermediateVT = RegisterVT;
1267 NumIntermediates = NumElts * ((
Size + 31) / 32);
1268 return NumIntermediates;
1273 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1278 unsigned MaxNumLanes) {
1279 assert(MaxNumLanes != 0);
1283 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1294 unsigned MaxNumLanes) {
1300 assert(ST->getNumContainedTypes() == 2 &&
1301 ST->getContainedType(1)->isIntegerTy(32));
1315 return MVT::amdgpuBufferFatPointer;
1317 DL.getPointerSizeInBits(AS) == 192)
1318 return MVT::amdgpuBufferStridedPointer;
1327 DL.getPointerSizeInBits(AS) == 160) ||
1329 DL.getPointerSizeInBits(AS) == 192))
1336 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1337 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1338 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1340 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1341 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1342 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1343 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1344 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1345 case Intrinsic::amdgcn_flat_load_monitor_b32:
1346 case Intrinsic::amdgcn_global_load_monitor_b32:
1348 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1349 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1350 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1351 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1352 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1353 case Intrinsic::amdgcn_flat_load_monitor_b64:
1354 case Intrinsic::amdgcn_global_load_monitor_b64:
1356 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1357 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1358 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1359 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1360 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1361 case Intrinsic::amdgcn_flat_load_monitor_b128:
1362 case Intrinsic::amdgcn_global_load_monitor_b128:
1398 unsigned IntrID)
const {
1400 if (CI.
hasMetadata(LLVMContext::MD_invariant_load))
1414 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1427 if (RsrcIntr->IsImage) {
1442 Info.ptrVal = RsrcArg;
1446 if (RsrcIntr->IsImage) {
1447 unsigned MaxNumLanes = 4;
1462 std::numeric_limits<unsigned>::max());
1472 if (RsrcIntr->IsImage) {
1492 if ((RsrcIntr->IsImage && BaseOpcode->
NoReturn) || IsSPrefetch) {
1494 Info.memVT = MVT::i32;
1501 case Intrinsic::amdgcn_raw_buffer_load_lds:
1502 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1503 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1504 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1505 case Intrinsic::amdgcn_struct_buffer_load_lds:
1506 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1507 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1508 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1522 CI.
getContext(), Width * 8 * Subtarget->getWavefrontSize());
1531 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1532 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1533 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1534 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1537 std::numeric_limits<unsigned>::max());
1550 case Intrinsic::amdgcn_ds_ordered_add:
1551 case Intrinsic::amdgcn_ds_ordered_swap: {
1565 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1566 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1569 Info.ptrVal =
nullptr;
1575 case Intrinsic::amdgcn_ds_append:
1576 case Intrinsic::amdgcn_ds_consume: {
1590 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1591 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1592 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1597 Info.memVT = MVT::i64;
1605 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1606 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1607 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1610 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1613 ->getElementType(0));
1622 case Intrinsic::amdgcn_global_atomic_fmin_num:
1623 case Intrinsic::amdgcn_global_atomic_fmax_num:
1624 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1625 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1626 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1637 case Intrinsic::amdgcn_cluster_load_b32:
1638 case Intrinsic::amdgcn_cluster_load_b64:
1639 case Intrinsic::amdgcn_cluster_load_b128:
1640 case Intrinsic::amdgcn_ds_load_tr6_b96:
1641 case Intrinsic::amdgcn_ds_load_tr4_b64:
1642 case Intrinsic::amdgcn_ds_load_tr8_b64:
1643 case Intrinsic::amdgcn_ds_load_tr16_b128:
1644 case Intrinsic::amdgcn_global_load_tr6_b96:
1645 case Intrinsic::amdgcn_global_load_tr4_b64:
1646 case Intrinsic::amdgcn_global_load_tr_b64:
1647 case Intrinsic::amdgcn_global_load_tr_b128:
1648 case Intrinsic::amdgcn_ds_read_tr4_b64:
1649 case Intrinsic::amdgcn_ds_read_tr6_b96:
1650 case Intrinsic::amdgcn_ds_read_tr8_b64:
1651 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1660 case Intrinsic::amdgcn_flat_load_monitor_b32:
1661 case Intrinsic::amdgcn_flat_load_monitor_b64:
1662 case Intrinsic::amdgcn_flat_load_monitor_b128:
1663 case Intrinsic::amdgcn_global_load_monitor_b32:
1664 case Intrinsic::amdgcn_global_load_monitor_b64:
1665 case Intrinsic::amdgcn_global_load_monitor_b128: {
1676 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1677 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1678 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1689 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1690 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1691 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1702 case Intrinsic::amdgcn_ds_gws_init:
1703 case Intrinsic::amdgcn_ds_gws_barrier:
1704 case Intrinsic::amdgcn_ds_gws_sema_v:
1705 case Intrinsic::amdgcn_ds_gws_sema_br:
1706 case Intrinsic::amdgcn_ds_gws_sema_p:
1707 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1717 Info.memVT = MVT::i32;
1719 Info.align =
Align(4);
1721 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1728 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1729 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1730 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1731 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1732 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1733 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1734 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1735 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1751 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1752 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1753 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1768 case Intrinsic::amdgcn_av_load_b128:
1769 case Intrinsic::amdgcn_av_store_b128: {
1770 bool IsStore = IntrID == Intrinsic::amdgcn_av_store_b128;
1772 Info.memVT = MVT::v4i32;
1774 Info.align =
Align(16);
1782 unsigned ScopeIdx = CI.
arg_size() - 1;
1786 Info.ssid = Ctx.getOrInsertSyncScopeID(Scope);
1790 case Intrinsic::amdgcn_load_to_lds:
1791 case Intrinsic::amdgcn_load_async_to_lds:
1792 case Intrinsic::amdgcn_global_load_lds:
1793 case Intrinsic::amdgcn_global_load_async_lds: {
1812 Width * 8 * Subtarget->getWavefrontSize());
1818 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1819 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1820 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1821 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1831 Info.memVT = MVT::i32;
1833 Info.align =
Align(4);
1839 case Intrinsic::amdgcn_s_prefetch_data:
1840 case Intrinsic::amdgcn_s_prefetch_inst:
1841 case Intrinsic::amdgcn_flat_prefetch:
1842 case Intrinsic::amdgcn_global_prefetch: {
1858 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1861 unsigned SrcAS =
I.getOperand(0)->getType()->getPointerAddressSpace();
1862 unsigned DstAS =
I.getType()->getPointerAddressSpace();
1874 Type *&AccessTy)
const {
1875 Value *Ptr =
nullptr;
1876 switch (
II->getIntrinsicID()) {
1877 case Intrinsic::amdgcn_cluster_load_b128:
1878 case Intrinsic::amdgcn_cluster_load_b64:
1879 case Intrinsic::amdgcn_cluster_load_b32:
1880 case Intrinsic::amdgcn_ds_append:
1881 case Intrinsic::amdgcn_ds_consume:
1882 case Intrinsic::amdgcn_ds_load_tr8_b64:
1883 case Intrinsic::amdgcn_ds_load_tr16_b128:
1884 case Intrinsic::amdgcn_ds_load_tr4_b64:
1885 case Intrinsic::amdgcn_ds_load_tr6_b96:
1886 case Intrinsic::amdgcn_ds_read_tr4_b64:
1887 case Intrinsic::amdgcn_ds_read_tr6_b96:
1888 case Intrinsic::amdgcn_ds_read_tr8_b64:
1889 case Intrinsic::amdgcn_ds_read_tr16_b64:
1890 case Intrinsic::amdgcn_ds_ordered_add:
1891 case Intrinsic::amdgcn_ds_ordered_swap:
1892 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1893 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1894 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1895 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1896 case Intrinsic::amdgcn_global_atomic_fmax_num:
1897 case Intrinsic::amdgcn_global_atomic_fmin_num:
1898 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1899 case Intrinsic::amdgcn_global_load_tr_b64:
1900 case Intrinsic::amdgcn_global_load_tr_b128:
1901 case Intrinsic::amdgcn_global_load_tr4_b64:
1902 case Intrinsic::amdgcn_global_load_tr6_b96:
1903 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1904 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1905 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1906 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1907 case Intrinsic::amdgcn_av_load_b128:
1908 case Intrinsic::amdgcn_av_store_b128:
1909 Ptr =
II->getArgOperand(0);
1911 case Intrinsic::amdgcn_load_to_lds:
1912 case Intrinsic::amdgcn_load_async_to_lds:
1913 case Intrinsic::amdgcn_global_load_lds:
1914 case Intrinsic::amdgcn_global_load_async_lds:
1915 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1916 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1917 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1918 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1919 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1920 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1921 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1922 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1923 Ptr =
II->getArgOperand(1);
1928 AccessTy =
II->getType();
1934 unsigned AddrSpace)
const {
1935 if (!Subtarget->hasFlatInstOffsets()) {
1942 FlatAddrSpace FlatVariant =
1945 : FlatAddrSpace::FLAT;
1947 return AM.
Scale == 0 &&
1948 (AM.
BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1949 AM.
BaseOffs, AddrSpace, FlatVariant));
1953 if (Subtarget->hasFlatGlobalInsts())
1956 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1969 return isLegalMUBUFAddressingMode(AM);
1972bool SITargetLowering::isLegalMUBUFAddressingMode(
const AddrMode &AM)
const {
1983 if (!
TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1995 if (AM.HasBaseReg) {
2027 return isLegalMUBUFAddressingMode(AM);
2029 if (!Subtarget->hasScalarSubwordLoads()) {
2034 if (Ty->isSized() &&
DL.getTypeStoreSize(Ty) < 4)
2082 return Subtarget->hasFlatScratchEnabled()
2084 : isLegalMUBUFAddressingMode(AM);
2131 unsigned Size,
unsigned AddrSpace,
Align Alignment,
2140 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment <
Align(4))
2143 Align RequiredAlignment(
2145 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
Size > 32 &&
2146 Alignment < RequiredAlignment)
2161 if (!Subtarget->hasUsableDSOffset() && Alignment <
Align(8))
2167 RequiredAlignment =
Align(4);
2169 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2185 *IsFast = (Alignment >= RequiredAlignment) ? 64
2186 : (Alignment <
Align(4)) ? 32
2193 if (!Subtarget->hasDS96AndDS128())
2199 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2208 *IsFast = (Alignment >= RequiredAlignment) ? 96
2209 : (Alignment <
Align(4)) ? 32
2216 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2222 RequiredAlignment =
Align(8);
2224 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2233 *IsFast = (Alignment >= RequiredAlignment) ? 128
2234 : (Alignment <
Align(4)) ? 32
2251 *IsFast = (Alignment >= RequiredAlignment) ?
Size : 0;
2253 return Alignment >= RequiredAlignment ||
2254 Subtarget->hasUnalignedDSAccessEnabled();
2262 bool AlignedBy4 = Alignment >=
Align(4);
2263 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2265 *IsFast = AlignedBy4 ?
Size : 1;
2270 *IsFast = AlignedBy4;
2281 return Alignment >=
Align(4) ||
2282 Subtarget->hasUnalignedBufferAccessEnabled();
2295 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2310 return Size >= 32 && Alignment >=
Align(4);
2315 unsigned *IsFast)
const {
2317 Alignment, Flags, IsFast);
2322 const AttributeList &FuncAttributes)
const {
2328 if (
Op.size() >= 16 &&
2332 if (
Op.size() >= 8 &&
Op.isDstAligned(
Align(4)))
2350 unsigned DestAS)
const {
2353 Subtarget->hasGloballyAddressableScratch()) {
2383 unsigned Index)
const {
2395 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2400 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2435 auto [InputPtrReg, RC, ArgTy] =
2451 const SDLoc &SL)
const {
2458 const SDLoc &SL)
const {
2461 std::optional<uint32_t> KnownSize =
2463 if (KnownSize.has_value())
2490 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2505SDValue SITargetLowering::lowerKernargMemParameter(
2510 MachinePointerInfo PtrInfo =
2519 int64_t OffsetDiff =
Offset - AlignDownOffset;
2525 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2536 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal,
Signed, Arg);
2541 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain,
Offset);
2546 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load,
Signed, Arg);
2555 const SDLoc &SL)
const {
2624 ExtType, SL, VA.
getLocVT(), Chain, FIN,
2627 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2628 if (ConvertedVal == ArgValue)
2629 return ConvertedVal;
2634SDValue SITargetLowering::lowerWorkGroupId(
2639 if (!Subtarget->hasClusters())
2640 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2648 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2649 SDLoc SL(ClusterIdXYZ);
2650 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2653 SDValue ClusterWorkGroupIdXYZ =
2654 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2664 return ClusterIdXYZ;
2666 using namespace AMDGPU::Hwreg;
2670 DAG.
getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2681SDValue SITargetLowering::getPreloadedValue(
2684 const ArgDescriptor *
Reg =
nullptr;
2685 const TargetRegisterClass *RC;
2689 const ArgDescriptor WorkGroupIDX =
2697 const ArgDescriptor WorkGroupIDZ =
2699 const ArgDescriptor ClusterWorkGroupIDX =
2701 const ArgDescriptor ClusterWorkGroupIDY =
2703 const ArgDescriptor ClusterWorkGroupIDZ =
2705 const ArgDescriptor ClusterWorkGroupMaxIDX =
2707 const ArgDescriptor ClusterWorkGroupMaxIDY =
2709 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2711 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2714 auto LoadConstant = [&](
unsigned N) {
2718 if (Subtarget->hasArchitectedSGPRs() &&
2725 Reg = &WorkGroupIDX;
2726 RC = &AMDGPU::SReg_32RegClass;
2730 Reg = &WorkGroupIDY;
2731 RC = &AMDGPU::SReg_32RegClass;
2735 Reg = &WorkGroupIDZ;
2736 RC = &AMDGPU::SReg_32RegClass;
2740 if (HasFixedDims && ClusterDims.
getDims()[0] == 1)
2741 return LoadConstant(0);
2742 Reg = &ClusterWorkGroupIDX;
2743 RC = &AMDGPU::SReg_32RegClass;
2747 if (HasFixedDims && ClusterDims.
getDims()[1] == 1)
2748 return LoadConstant(0);
2749 Reg = &ClusterWorkGroupIDY;
2750 RC = &AMDGPU::SReg_32RegClass;
2754 if (HasFixedDims && ClusterDims.
getDims()[2] == 1)
2755 return LoadConstant(0);
2756 Reg = &ClusterWorkGroupIDZ;
2757 RC = &AMDGPU::SReg_32RegClass;
2762 return LoadConstant(ClusterDims.
getDims()[0] - 1);
2763 Reg = &ClusterWorkGroupMaxIDX;
2764 RC = &AMDGPU::SReg_32RegClass;
2769 return LoadConstant(ClusterDims.
getDims()[1] - 1);
2770 Reg = &ClusterWorkGroupMaxIDY;
2771 RC = &AMDGPU::SReg_32RegClass;
2776 return LoadConstant(ClusterDims.
getDims()[2] - 1);
2777 Reg = &ClusterWorkGroupMaxIDZ;
2778 RC = &AMDGPU::SReg_32RegClass;
2782 Reg = &ClusterWorkGroupMaxFlatID;
2783 RC = &AMDGPU::SReg_32RegClass;
2814 for (
unsigned I = 0,
E = Ins.
size(), PSInputNum = 0;
I !=
E; ++
I) {
2818 "vector type argument should have been split");
2823 bool SkipArg = !Arg->
Used && !Info->isPSInputAllocated(PSInputNum);
2831 "unexpected vector split in ps argument type");
2845 Info->markPSInputAllocated(PSInputNum);
2847 Info->markPSInputEnabled(PSInputNum);
2863 if (Info.hasWorkItemIDX()) {
2869 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2873 if (Info.hasWorkItemIDY()) {
2874 assert(Info.hasWorkItemIDX());
2875 if (Subtarget->hasPackedTID()) {
2876 Info.setWorkItemIDY(
2879 unsigned Reg = AMDGPU::VGPR1;
2887 if (Info.hasWorkItemIDZ()) {
2888 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2889 if (Subtarget->hasPackedTID()) {
2890 Info.setWorkItemIDZ(
2893 unsigned Reg = AMDGPU::VGPR2;
2913 if (RegIdx == ArgVGPRs.
size()) {
2920 unsigned Reg = ArgVGPRs[RegIdx];
2932 unsigned NumArgRegs) {
2935 if (RegIdx == ArgSGPRs.
size())
2938 unsigned Reg = ArgSGPRs[RegIdx];
2980 const unsigned Mask = 0x3ff;
2983 if (Info.hasWorkItemIDX()) {
2985 Info.setWorkItemIDX(Arg);
2988 if (Info.hasWorkItemIDY()) {
2990 Info.setWorkItemIDY(Arg);
2993 if (Info.hasWorkItemIDZ())
3005 const unsigned Mask = 0x3ff;
3014 auto &
ArgInfo = Info.getArgInfo();
3026 if (Info.hasImplicitArgPtr())
3034 if (Info.hasWorkGroupIDX())
3037 if (Info.hasWorkGroupIDY())
3040 if (Info.hasWorkGroupIDZ())
3043 if (Info.hasLDSKernelId())
3054 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(
TRI);
3055 MF.
addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
3061 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(
TRI);
3062 MF.
addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
3067 Register DispatchPtrReg = Info.addDispatchPtr(
TRI);
3068 MF.
addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
3074 MF.
addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3080 Register InputPtrReg = Info.addKernargSegmentPtr(
TRI);
3089 MF.
addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3094 Register FlatScratchInitReg = Info.addFlatScratchInit(
TRI);
3095 MF.
addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3100 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(
TRI);
3101 MF.
addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3116 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3118 bool InPreloadSequence =
true;
3120 bool AlignedForImplictArgs =
false;
3121 unsigned ImplicitArgOffset = 0;
3122 for (
auto &Arg :
F.args()) {
3123 if (!InPreloadSequence || !Arg.hasInRegAttr())
3126 unsigned ArgIdx = Arg.getArgNo();
3129 if (InIdx < Ins.
size() &&
3130 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3133 for (; InIdx < Ins.
size() && Ins[InIdx].isOrigArg() &&
3134 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3136 assert(ArgLocs[ArgIdx].isMemLoc());
3137 auto &ArgLoc = ArgLocs[InIdx];
3139 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3141 unsigned NumAllocSGPRs =
3142 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3145 if (Arg.hasAttribute(
"amdgpu-hidden-argument")) {
3146 if (!AlignedForImplictArgs) {
3148 alignTo(LastExplicitArgOffset,
3149 Subtarget->getAlignmentForImplicitArgPtr()) -
3150 LastExplicitArgOffset;
3151 AlignedForImplictArgs =
true;
3153 ArgOffset += ImplicitArgOffset;
3157 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3158 assert(InIdx >= 1 &&
"No previous SGPR");
3159 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3160 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3164 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3165 unsigned PaddingSGPRs =
alignTo(Padding, 4) / 4;
3168 InPreloadSequence =
false;
3174 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3176 Info.addPreloadedKernArg(
TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3178 if (PreloadRegs->
size() > 1)
3179 RC = &AMDGPU::SGPR_32RegClass;
3180 for (
auto &Reg : *PreloadRegs) {
3186 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3195 if (Info.hasLDSKernelId()) {
3196 Register Reg = Info.addLDSKernelId();
3197 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3206 bool IsShader)
const {
3207 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3208 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3214 assert(!HasArchitectedSGPRs &&
"Unhandled feature for the subtarget");
3216 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3220 unsigned NumRequiredSystemSGPRs =
3221 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3222 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3223 for (
unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3224 Register Reg = Info.addReservedUserSGPR();
3225 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3230 if (!HasArchitectedSGPRs) {
3231 if (Info.hasWorkGroupIDX()) {
3232 Register Reg = Info.addWorkGroupIDX();
3233 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3237 if (Info.hasWorkGroupIDY()) {
3238 Register Reg = Info.addWorkGroupIDY();
3239 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3243 if (Info.hasWorkGroupIDZ()) {
3244 Register Reg = Info.addWorkGroupIDZ();
3245 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3250 if (Info.hasWorkGroupInfo()) {
3251 Register Reg = Info.addWorkGroupInfo();
3252 MF.
addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3256 if (Info.hasPrivateSegmentWaveByteOffset()) {
3258 unsigned PrivateSegmentWaveByteOffsetReg;
3261 PrivateSegmentWaveByteOffsetReg =
3262 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3266 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3268 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3271 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3273 MF.
addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3274 CCInfo.
AllocateReg(PrivateSegmentWaveByteOffsetReg);
3277 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3278 Info.getNumPreloadedSGPRs() >= 16);
3293 if (HasStackObjects)
3294 Info.setHasNonSpillStackObjects(
true);
3299 HasStackObjects =
true;
3303 bool RequiresStackAccess = HasStackObjects || MFI.
hasCalls();
3305 if (!ST.hasFlatScratchEnabled()) {
3306 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.
getFunction())) {
3313 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3315 unsigned ReservedBufferReg =
TRI.reservedPrivateSegmentBufferReg(MF);
3325 Info.setScratchRSrcReg(ReservedBufferReg);
3344 if (!MRI.
isLiveIn(AMDGPU::SGPR32)) {
3345 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3352 for (
unsigned Reg : AMDGPU::SGPR_32RegClass) {
3354 Info.setStackPtrOffsetReg(
Reg);
3359 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3366 if (ST.getFrameLowering()->hasFP(MF)) {
3367 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3383 const MCPhysReg *IStart =
TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3392 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3393 RC = &AMDGPU::SGPR_64RegClass;
3394 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3395 RC = &AMDGPU::SGPR_32RegClass;
3401 Entry->addLiveIn(*
I);
3406 for (
auto *Exit : Exits)
3408 TII->get(TargetOpcode::COPY), *
I)
3423 bool IsError =
false;
3427 Fn,
"unsupported non-compute shaders with HSA",
DL.getDebugLoc()));
3445 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3446 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3448 if (!Subtarget->hasFlatScratchEnabled())
3453 !Subtarget->hasArchitectedSGPRs())
3454 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3455 !Info->hasWorkGroupIDZ());
3458 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3476 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3477 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3480 Info->markPSInputAllocated(0);
3481 Info->markPSInputEnabled(0);
3483 if (Subtarget->isAmdPalOS()) {
3492 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3493 if ((PsInputBits & 0x7F) == 0 ||
3494 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3497 }
else if (IsKernel) {
3498 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3510 if (IsKernel && Subtarget->hasKernargPreload())
3514 }
else if (!IsGraphics) {
3519 if (!Subtarget->hasFlatScratchEnabled())
3531 Info->setNumWaveDispatchSGPRs(
3533 Info->setNumWaveDispatchVGPRs(
3535 }
else if (Info->getNumKernargPreloadedSGPRs()) {
3536 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3541 if (IsWholeWaveFunc) {
3543 {MVT::i1, MVT::Other}, Chain);
3555 for (
unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.
size(), ArgIdx = 0; i != e;
3566 if (IsEntryFunc && VA.
isMemLoc()) {
3589 if (Arg.
isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3593 int64_t OffsetDiff =
Offset - AlignDownOffset;
3600 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3603 Register VReg = MRI.getLiveInVirtReg(Reg);
3611 NewArg = convertArgType(DAG, VT, MemVT,
DL, ArgVal,
3612 Ins[i].Flags.isSExt(), &Ins[i]);
3620 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3623 if (PreloadRegs.
size() == 1) {
3624 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3629 TRI->getRegSizeInBits(*RC)));
3637 for (
auto Reg : PreloadRegs) {
3638 Register VReg = MRI.getLiveInVirtReg(Reg);
3644 PreloadRegs.size()),
3661 NewArg = convertArgType(DAG, VT, MemVT,
DL, NewArg,
3662 Ins[i].Flags.isSExt(), &Ins[i]);
3674 "hidden argument in kernel signature was not preloaded",
3680 lowerKernargMemParameter(DAG, VT, MemVT,
DL, Chain,
Offset,
3681 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3701 if (!IsEntryFunc && VA.
isMemLoc()) {
3702 SDValue Val = lowerStackParameter(DAG, VA,
DL, Chain, Arg);
3713 if (AMDGPU::VGPR_32RegClass.
contains(Reg))
3714 RC = &AMDGPU::VGPR_32RegClass;
3715 else if (AMDGPU::SGPR_32RegClass.
contains(Reg))
3716 RC = &AMDGPU::SGPR_32RegClass;
3722 if (Arg.
Flags.
isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3728 ReadFirstLane, Val);
3744 Val = convertABITypeToValueType(DAG, Val, VA,
DL);
3753 Info->setBytesInStackArgArea(StackArgSize);
3755 return Chains.
empty() ? Chain
3764 const Type *RetTy)
const {
3772 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3777 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3778 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3779 for (
unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3780 if (CCInfo.
isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3803 Info->setIfReturnsVoid(Outs.
empty());
3804 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3823 for (
unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.
size();
I != E;
3824 ++
I, ++RealRVLocIdx) {
3828 SDValue Arg = OutVals[RealRVLocIdx];
3851 ReadFirstLane, Arg);
3858 if (!Info->isEntryFunction()) {
3864 if (AMDGPU::SReg_64RegClass.
contains(*
I))
3866 else if (AMDGPU::SReg_32RegClass.
contains(*
I))
3879 unsigned Opc = AMDGPUISD::ENDPGM;
3881 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3882 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3883 : AMDGPUISD::RET_GLUE;
3988 const auto [OutgoingArg, ArgRC, ArgTy] =
3993 const auto [IncomingArg, IncomingArgRC, Ty] =
3995 assert(IncomingArgRC == ArgRC);
3998 EVT ArgVT =
TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
4006 InputReg = getImplicitArgPtr(DAG,
DL);
4008 std::optional<uint32_t> Id =
4010 if (Id.has_value()) {
4021 if (OutgoingArg->isRegister()) {
4022 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4023 if (!CCInfo.
AllocateReg(OutgoingArg->getRegister()))
4026 unsigned SpecialArgOffset =
4037 auto [OutgoingArg, ArgRC, Ty] =
4040 std::tie(OutgoingArg, ArgRC, Ty) =
4043 std::tie(OutgoingArg, ArgRC, Ty) =
4058 const bool NeedWorkItemIDX = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-x");
4059 const bool NeedWorkItemIDY = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-y");
4060 const bool NeedWorkItemIDZ = !CLI.
CB->
hasFnAttr(
"amdgpu-no-workitem-id-z");
4065 if (Subtarget->getMaxWorkitemID(
F, 0) != 0) {
4073 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(
F, 1) != 0) {
4083 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(
F, 2) != 0) {
4092 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4093 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4104 : IncomingArgY ? *IncomingArgY
4111 if (OutgoingArg->isRegister()) {
4113 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4139 if (Callee->isDivergent())
4146 const uint32_t *CallerPreserved =
TRI->getCallPreservedMask(MF, CallerCC);
4150 if (!CallerPreserved)
4153 bool CCMatch = CallerCC == CalleeCC;
4166 if (Arg.hasByValAttr())
4180 const uint32_t *CalleePreserved =
TRI->getCallPreservedMask(MF, CalleeCC);
4181 if (!
TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4190 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4203 for (
const auto &[CCVA, ArgVal] :
zip_equal(ArgLocs, OutVals)) {
4205 if (!CCVA.isRegLoc())
4210 if (ArgVal->
isDivergent() &&
TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4212 dbgs() <<
"Cannot tail call due to divergent outgoing argument in "
4236enum ChainCallArgIdx {
4258 bool UsesDynamicVGPRs =
false;
4259 if (IsChainCallConv) {
4264 auto RequestedExecIt =
4266 return Arg.OrigArgIndex == 2;
4268 assert(RequestedExecIt != CLI.
Outs.end() &&
"No node for EXEC");
4270 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.
Outs.begin();
4273 CLI.
Outs.erase(RequestedExecIt, CLI.
Outs.end());
4276 "Haven't popped all the special args");
4279 CLI.
Args[ChainCallArgIdx::Exec];
4280 if (!RequestedExecArg.
Ty->
isIntegerTy(Subtarget->getWavefrontSize()))
4288 ArgNode->getAPIntValue(),
DL, ArgNode->getValueType(0)));
4290 ChainCallSpecialArgs.
push_back(Arg.Node);
4293 PushNodeOrTargetConstant(RequestedExecArg);
4299 if (FlagsValue.
isZero()) {
4300 if (CLI.
Args.size() > ChainCallArgIdx::Flags + 1)
4302 "no additional args allowed if flags == 0");
4304 if (CLI.
Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4308 if (!Subtarget->isWave32()) {
4310 CLI, InVals,
"dynamic VGPR mode is only supported for wave32");
4313 UsesDynamicVGPRs =
true;
4314 std::for_each(CLI.
Args.begin() + ChainCallArgIdx::NumVGPRs,
4315 CLI.
Args.end(), PushNodeOrTargetConstant);
4324 bool IsSibCall =
false;
4338 "unsupported call to variadic function ");
4346 "unsupported required tail call to function ");
4351 Outs, OutVals, Ins, DAG);
4355 "site marked musttail or on llvm.amdgcn.cs.chain");
4362 if (!TailCallOpt && IsTailCall)
4386 if (!Subtarget->hasFlatScratchEnabled())
4407 auto *
TRI = Subtarget->getRegisterInfo();
4414 if (!IsSibCall || IsChainCallConv) {
4415 if (!Subtarget->hasFlatScratchEnabled()) {
4421 RegsToPass.emplace_back(IsChainCallConv
4422 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4423 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4430 const unsigned NumSpecialInputs = RegsToPass.size();
4432 MVT PtrVT = MVT::i32;
4435 for (
unsigned i = 0, e = ArgLocs.
size(); i != e; ++i) {
4463 RegsToPass.push_back(std::pair(VA.
getLocReg(), Arg));
4471 int32_t
Offset = LocMemOffset;
4478 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4484 ? Flags.getNonZeroByValAlign()
4511 if (Outs[i].Flags.isByVal()) {
4513 DAG.
getConstant(Outs[i].Flags.getByValSize(),
DL, MVT::i32);
4516 Outs[i].Flags.getNonZeroByValAlign(),
4517 Outs[i].Flags.getNonZeroByValAlign(),
4519 nullptr, std::nullopt, DstInfo,
4525 DAG.
getStore(Chain,
DL, Arg, DstAddr, DstInfo, Alignment);
4531 if (!MemOpChains.
empty())
4547 unsigned ArgIdx = 0;
4548 for (
auto [Reg, Val] : RegsToPass) {
4549 if (ArgIdx++ >= NumSpecialInputs &&
4550 (IsChainCallConv || !Val->
isDivergent()) &&
TRI->isSGPRPhysReg(Reg)) {
4576 if (IsTailCall && !IsSibCall) {
4581 std::vector<SDValue>
Ops({Chain});
4587 Ops.push_back(Callee);
4604 Ops.push_back(Callee);
4615 if (IsChainCallConv)
4620 for (
auto &[Reg, Val] : RegsToPass)
4624 const uint32_t *Mask =
TRI->getCallPreservedMask(MF, CallConv);
4625 assert(Mask &&
"Missing call preserved mask for calling convention");
4635 MVT::Glue, GlueOps),
4640 Ops.push_back(InGlue);
4646 unsigned OPC = AMDGPUISD::TC_RETURN;
4649 OPC = AMDGPUISD::TC_RETURN_GFX;
4653 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4654 : AMDGPUISD::TC_RETURN_CHAIN;
4660 if (Info->isWholeWaveFunction())
4661 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4668 Chain =
Call.getValue(0);
4669 InGlue =
Call.getValue(1);
4671 uint64_t CalleePopBytes = NumBytes;
4692 EVT VT =
Op.getValueType();
4706 "Stack grows upwards for AMDGPU");
4708 Chain = BaseAddr.getValue(1);
4710 const bool HasFlatScratch = Subtarget->hasFlatScratchEnabled();
4711 const unsigned WavefrontSizeLog2 = Subtarget->getWavefrontSizeLog2();
4714 if (Alignment > StackAlign) {
4716 << (HasFlatScratch ? 0 : WavefrontSizeLog2);
4717 uint64_t StackAlignMask = ScaledAlignment - 1;
4724 assert(
Size.getValueType() == MVT::i32 &&
"Size must be 32-bit");
4733 DAG.
getConstant(WavefrontSizeLog2, dl, MVT::i32));
4744 if (!HasFlatScratch) {
4747 DAG.
getConstant(WavefrontSizeLog2, dl, MVT::i32));
4764 if (
Op.getValueType() != MVT::i32)
4783 assert(
Op.getValueType() == MVT::i32);
4792 Op.getOperand(0), IntrinID, GetRoundBothImm);
4826 SDValue RoundModeTimesNumBits =
4846 TableEntry, EnumOffset);
4862 static_cast<uint32_t>(ConstMode->getZExtValue()),
4874 if (UseReducedTable) {
4880 SDValue RoundModeTimesNumBits =
4900 SDValue RoundModeTimesNumBits =
4909 NewMode = TruncTable;
4918 ReadFirstLaneID, NewMode);
4931 IntrinID, RoundBothImm, NewMode);
4937 if (
Op->isDivergent() &&
4938 (!Subtarget->hasVmemPrefInsts() || !
Op.getConstantOperandVal(4)))
4948 if (Subtarget->hasSafeSmemPrefetch())
4956 if (!Subtarget->hasSafeSmemPrefetch() && !
Op.getConstantOperandVal(4))
4965 SDValue Src =
Op.getOperand(IsStrict ? 1 : 0);
4966 EVT SrcVT = Src.getValueType();
4975 EVT DstVT =
Op.getValueType();
4984 if (
Op.getValueType() != MVT::i64)
4998 Op.getOperand(0), IntrinID, ModeHwRegImm);
5000 Op.getOperand(0), IntrinID, TrapHwRegImm);
5014 if (
Op.getOperand(1).getValueType() != MVT::i64)
5026 ReadFirstLaneID, NewModeReg);
5028 ReadFirstLaneID, NewTrapReg);
5030 unsigned ModeHwReg =
5033 unsigned TrapHwReg =
5041 IntrinID, ModeHwRegImm, NewModeReg);
5044 IntrinID, TrapHwRegImm, NewTrapReg);
5053 .
Case(
"m0", AMDGPU::M0)
5054 .
Case(
"exec", AMDGPU::EXEC)
5055 .
Case(
"exec_lo", AMDGPU::EXEC_LO)
5056 .
Case(
"exec_hi", AMDGPU::EXEC_HI)
5057 .
Case(
"flat_scratch", AMDGPU::FLAT_SCR)
5058 .
Case(
"flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
5059 .
Case(
"flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
5064 if (!Subtarget->hasFlatScrRegister() &&
5065 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
5067 "\" for subtarget."));
5072 case AMDGPU::EXEC_LO:
5073 case AMDGPU::EXEC_HI:
5074 case AMDGPU::FLAT_SCR_LO:
5075 case AMDGPU::FLAT_SCR_HI:
5080 case AMDGPU::FLAT_SCR:
5099 MI.setDesc(
TII->getKillTerminatorFromPseudo(
MI.getOpcode()));
5108static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5130 auto Next = std::next(
I);
5141 MBB.addSuccessor(LoopBB);
5143 return std::pair(LoopBB, RemainderBB);
5150 auto I =
MI.getIterator();
5151 auto E = std::next(
I);
5173 Src->setIsKill(
false);
5183 BuildMI(*LoopBB, LoopBB->begin(),
DL,
TII->get(AMDGPU::S_SETREG_IMM32_B32))
5192 BuildMI(*LoopBB,
I,
DL,
TII->get(AMDGPU::S_GETREG_B32), Reg)
5216 unsigned InitReg,
unsigned ResultReg,
unsigned PhiReg,
5217 unsigned InitSaveExecReg,
int Offset,
bool UseGPRIdxMode,
5239 BuildMI(LoopBB,
I,
DL,
TII->get(TargetOpcode::PHI), PhiExec)
5246 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5250 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5260 if (UseGPRIdxMode) {
5262 SGPRIdxReg = CurrentIdxReg;
5265 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5275 BuildMI(LoopBB,
I,
DL,
TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5306 unsigned InitResultReg,
unsigned PhiReg,
int Offset,
5307 bool UseGPRIdxMode,
Register &SGPRIdxReg) {
5315 const auto *BoolXExecRC =
TRI->getWaveMaskRegClass();
5334 InitResultReg, DstReg, PhiReg, TmpExec,
5335 Offset, UseGPRIdxMode, SGPRIdxReg);
5341 LoopBB->removeSuccessor(RemainderBB);
5343 LoopBB->addSuccessor(LandingPad);
5354static std::pair<unsigned, int>
5358 int NumElts =
TRI.getRegSizeInBits(*SuperRC) / 32;
5363 return std::pair(AMDGPU::sub0,
Offset);
5420 Register SrcReg =
TII->getNamedOperand(
MI, AMDGPU::OpName::src)->getReg();
5421 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5427 std::tie(SubReg,
Offset) =
5430 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5433 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5437 if (UseGPRIdxMode) {
5444 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5453 .
addReg(SrcReg, {}, SubReg)
5457 MI.eraseFromParent();
5473 UseGPRIdxMode, SGPRIdxReg);
5477 if (UseGPRIdxMode) {
5479 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
true);
5481 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5486 BuildMI(*LoopBB, InsPt,
DL,
TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5487 .
addReg(SrcReg, {}, SubReg)
5491 MI.eraseFromParent();
5508 int Offset =
TII->getNamedOperand(
MI, AMDGPU::OpName::offset)->getImm();
5516 std::tie(SubReg,
Offset) =
5518 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5520 if (Idx->
getReg() == AMDGPU::NoRegister) {
5531 MI.eraseFromParent();
5536 if (
TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5540 if (UseGPRIdxMode) {
5544 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5553 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5554 TRI.getRegSizeInBits(*VecRC), 32,
false);
5560 MI.eraseFromParent();
5574 UseGPRIdxMode, SGPRIdxReg);
5577 if (UseGPRIdxMode) {
5579 TII->getIndirectGPRIDXPseudo(
TRI.getRegSizeInBits(*VecRC),
false);
5581 BuildMI(*LoopBB, InsPt,
DL, GPRIDXDesc, Dst)
5587 const MCInstrDesc &MovRelDesc =
TII->getIndirectRegWriteMovRelPseudo(
5588 TRI.getRegSizeInBits(*VecRC), 32,
false);
5589 BuildMI(*LoopBB, InsPt,
DL, MovRelDesc, Dst)
5595 MI.eraseFromParent();
5611 bool IsAdd = (
MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5612 if (ST.hasScalarAddSub64()) {
5613 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5623 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5624 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5627 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5629 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5632 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5634 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5636 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5637 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5646 MI.eraseFromParent();
5660 Register SrcCond =
MI.getOperand(3).getReg();
5668 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src0);
5670 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src1);
5672 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), Src0Idx));
5674 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), Src1Idx));
5677 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5679 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5682 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5684 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5687 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5689 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5711 MI.eraseFromParent();
5716 case AMDGPU::S_MIN_U32:
5717 return std::numeric_limits<uint32_t>::max();
5718 case AMDGPU::S_MIN_I32:
5719 return std::numeric_limits<int32_t>::max();
5720 case AMDGPU::S_MAX_U32:
5721 return std::numeric_limits<uint32_t>::min();
5722 case AMDGPU::S_MAX_I32:
5723 return std::numeric_limits<int32_t>::min();
5724 case AMDGPU::V_ADD_F32_e64:
5726 case AMDGPU::V_SUB_F32_e64:
5728 case AMDGPU::S_ADD_I32:
5729 case AMDGPU::S_SUB_I32:
5730 case AMDGPU::S_OR_B32:
5731 case AMDGPU::S_XOR_B32:
5732 return std::numeric_limits<uint32_t>::min();
5733 case AMDGPU::S_AND_B32:
5734 return std::numeric_limits<uint32_t>::max();
5735 case AMDGPU::V_MIN_F32_e64:
5736 case AMDGPU::V_MAX_F32_e64:
5738 case AMDGPU::V_CMP_LT_U64_e64:
5739 return std::numeric_limits<uint64_t>::max();
5740 case AMDGPU::V_CMP_LT_I64_e64:
5741 return std::numeric_limits<int64_t>::max();
5742 case AMDGPU::V_CMP_GT_U64_e64:
5743 return std::numeric_limits<uint64_t>::min();
5744 case AMDGPU::V_CMP_GT_I64_e64:
5745 return std::numeric_limits<int64_t>::min();
5746 case AMDGPU::V_MIN_F64_e64:
5747 case AMDGPU::V_MAX_F64_e64:
5748 case AMDGPU::V_MIN_NUM_F64_e64:
5749 case AMDGPU::V_MAX_NUM_F64_e64:
5750 return 0x7FF8000000000000;
5751 case AMDGPU::S_ADD_U64_PSEUDO:
5752 case AMDGPU::S_SUB_U64_PSEUDO:
5753 case AMDGPU::S_OR_B64:
5754 case AMDGPU::S_XOR_B64:
5755 return std::numeric_limits<uint64_t>::min();
5756 case AMDGPU::S_AND_B64:
5757 return std::numeric_limits<uint64_t>::max();
5758 case AMDGPU::V_ADD_F64_e64:
5759 case AMDGPU::V_ADD_F64_pseudo_e64:
5760 return 0x8000000000000000;
5767 return Opc == AMDGPU::S_MIN_U32 ||
Opc == AMDGPU::S_MIN_I32 ||
5768 Opc == AMDGPU::S_MAX_U32 ||
Opc == AMDGPU::S_MAX_I32 ||
5769 Opc == AMDGPU::S_ADD_I32 ||
Opc == AMDGPU::S_SUB_I32 ||
5770 Opc == AMDGPU::S_AND_B32 ||
Opc == AMDGPU::S_OR_B32 ||
5771 Opc == AMDGPU::S_XOR_B32 ||
Opc == AMDGPU::V_MIN_F32_e64 ||
5772 Opc == AMDGPU::V_MAX_F32_e64 ||
Opc == AMDGPU::V_ADD_F32_e64 ||
5773 Opc == AMDGPU::V_SUB_F32_e64;
5777 return Opc == AMDGPU::V_MIN_F32_e64 ||
Opc == AMDGPU::V_MAX_F32_e64 ||
5778 Opc == AMDGPU::V_ADD_F32_e64 ||
Opc == AMDGPU::V_SUB_F32_e64 ||
5779 Opc == AMDGPU::V_MIN_F64_e64 ||
Opc == AMDGPU::V_MAX_F64_e64 ||
5780 Opc == AMDGPU::V_MIN_NUM_F64_e64 ||
Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5781 Opc == AMDGPU::V_ADD_F64_e64 ||
Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5784static std::tuple<unsigned, unsigned>
5788 case AMDGPU::S_MIN_U32:
5789 DPPOpc = AMDGPU::V_MIN_U32_dpp;
5791 case AMDGPU::S_MIN_I32:
5792 DPPOpc = AMDGPU::V_MIN_I32_dpp;
5794 case AMDGPU::S_MAX_U32:
5795 DPPOpc = AMDGPU::V_MAX_U32_dpp;
5797 case AMDGPU::S_MAX_I32:
5798 DPPOpc = AMDGPU::V_MAX_I32_dpp;
5800 case AMDGPU::S_ADD_I32:
5801 case AMDGPU::S_SUB_I32:
5802 DPPOpc = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
5803 : AMDGPU::V_ADD_CO_U32_dpp;
5805 case AMDGPU::S_AND_B32:
5806 DPPOpc = AMDGPU::V_AND_B32_dpp;
5808 case AMDGPU::S_OR_B32:
5809 DPPOpc = AMDGPU::V_OR_B32_dpp;
5811 case AMDGPU::S_XOR_B32:
5812 DPPOpc = AMDGPU::V_XOR_B32_dpp;
5814 case AMDGPU::V_ADD_F32_e64:
5815 case AMDGPU::V_SUB_F32_e64:
5816 DPPOpc = AMDGPU::V_ADD_F32_dpp;
5818 case AMDGPU::V_MIN_F32_e64:
5819 DPPOpc = AMDGPU::V_MIN_F32_dpp;
5821 case AMDGPU::V_MAX_F32_e64:
5822 DPPOpc = AMDGPU::V_MAX_F32_dpp;
5824 case AMDGPU::V_CMP_LT_U64_e64:
5825 case AMDGPU::V_CMP_LT_I64_e64:
5826 case AMDGPU::V_CMP_GT_U64_e64:
5827 case AMDGPU::V_CMP_GT_I64_e64:
5828 case AMDGPU::S_ADD_U64_PSEUDO:
5829 case AMDGPU::S_SUB_U64_PSEUDO:
5830 case AMDGPU::S_AND_B64:
5831 case AMDGPU::S_OR_B64:
5832 case AMDGPU::S_XOR_B64:
5833 case AMDGPU::V_MIN_NUM_F64_e64:
5834 case AMDGPU::V_MIN_F64_e64:
5835 case AMDGPU::V_MAX_NUM_F64_e64:
5836 case AMDGPU::V_MAX_F64_e64:
5837 case AMDGPU::V_ADD_F64_pseudo_e64:
5838 case AMDGPU::V_ADD_F64_e64:
5839 DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
5844 unsigned ClampOpc =
Opc;
5845 if (!ST.getInstrInfo()->isVALU(
Opc,
true)) {
5846 if (
Opc == AMDGPU::S_SUB_I32)
5847 ClampOpc = AMDGPU::S_ADD_I32;
5848 if (
Opc == AMDGPU::S_ADD_U64_PSEUDO ||
Opc == AMDGPU::S_SUB_U64_PSEUDO)
5849 ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
5850 else if (
Opc == AMDGPU::S_AND_B64)
5851 ClampOpc = AMDGPU::V_AND_B32_e64;
5852 else if (
Opc == AMDGPU::S_OR_B64)
5853 ClampOpc = AMDGPU::V_OR_B32_e64;
5854 else if (
Opc == AMDGPU::S_XOR_B64)
5855 ClampOpc = AMDGPU::V_XOR_B32_e64;
5857 ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
5859 return {DPPOpc, ClampOpc};
5862static std::pair<Register, Register>
5869 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5871 TII->buildExtractSubReg(
MI, MRI,
Op, SrcRC, AMDGPU::sub0, SrcSubRC);
5873 TII->buildExtractSubReg(
MI, MRI,
Op, SrcRC, AMDGPU::sub1, SrcSubRC);
5874 return {Op1L, Op1H};
5890 unsigned Stratergy =
static_cast<unsigned>(
MI.getOperand(2).
getImm());
5891 enum WAVE_REDUCE_STRATEGY :
unsigned {
DEFAULT = 0, ITERATIVE = 1,
DPP = 2 };
5893 unsigned MIOpc =
MI.getOpcode();
5907 case AMDGPU::S_MIN_U32:
5908 case AMDGPU::S_MIN_I32:
5909 case AMDGPU::V_MIN_F32_e64:
5910 case AMDGPU::S_MAX_U32:
5911 case AMDGPU::S_MAX_I32:
5912 case AMDGPU::V_MAX_F32_e64:
5913 case AMDGPU::S_AND_B32:
5914 case AMDGPU::S_OR_B32: {
5920 case AMDGPU::V_CMP_LT_U64_e64:
5921 case AMDGPU::V_CMP_LT_I64_e64:
5922 case AMDGPU::V_CMP_GT_U64_e64:
5923 case AMDGPU::V_CMP_GT_I64_e64:
5924 case AMDGPU::V_MIN_F64_e64:
5925 case AMDGPU::V_MIN_NUM_F64_e64:
5926 case AMDGPU::V_MAX_F64_e64:
5927 case AMDGPU::V_MAX_NUM_F64_e64:
5928 case AMDGPU::S_AND_B64:
5929 case AMDGPU::S_OR_B64: {
5935 case AMDGPU::S_XOR_B32:
5936 case AMDGPU::S_XOR_B64:
5937 case AMDGPU::S_ADD_I32:
5938 case AMDGPU::S_ADD_U64_PSEUDO:
5939 case AMDGPU::V_ADD_F32_e64:
5940 case AMDGPU::V_ADD_F64_e64:
5941 case AMDGPU::V_ADD_F64_pseudo_e64:
5942 case AMDGPU::S_SUB_I32:
5943 case AMDGPU::S_SUB_U64_PSEUDO:
5944 case AMDGPU::V_SUB_F32_e64: {
5951 bool IsWave32 = ST.isWave32();
5952 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5953 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5954 unsigned BitCountOpc =
5955 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5959 auto NewAccumulator =
5964 case AMDGPU::S_XOR_B32:
5965 case AMDGPU::S_XOR_B64: {
5974 .
addReg(NewAccumulator->getOperand(0).getReg())
5977 if (
Opc == AMDGPU::S_XOR_B32) {
5994 BuildRegSequence(BB,
MI, DstReg, DestSub0, DestSub1);
5998 case AMDGPU::S_SUB_I32: {
6007 .
addReg(NewAccumulator->getOperand(0).getReg());
6010 case AMDGPU::S_ADD_I32: {
6013 .
addReg(NewAccumulator->getOperand(0).getReg());
6016 case AMDGPU::S_ADD_U64_PSEUDO:
6017 case AMDGPU::S_SUB_U64_PSEUDO: {
6032 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6035 .
addReg(NewAccumulator->getOperand(0).getReg())
6045 Register LowOpcode =
Opc == AMDGPU::S_SUB_U64_PSEUDO
6047 : NewAccumulator->getOperand(0).getReg();
6058 Register HiVal =
Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
6064 if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6070 BuildRegSequence(BB,
MI, DstReg, DestSub0, DestSub1);
6073 case AMDGPU::V_ADD_F32_e64:
6074 case AMDGPU::V_ADD_F64_e64:
6075 case AMDGPU::V_ADD_F64_pseudo_e64:
6076 case AMDGPU::V_SUB_F32_e64: {
6083 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
6084 : AMDGPU::V_CVT_F64_I32_e64),
6086 .
addReg(NewAccumulator->getOperand(0).getReg())
6091 unsigned srcMod = (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6092 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
6095 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
6097 ? AMDGPU::V_MUL_F64_pseudo_e64
6098 : AMDGPU::V_MUL_F64_e64;
6108 BuildMI(BB,
MI,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6125 BuildRegSequence(BB,
MI, DstReg, LaneValueLoReg, LaneValueHiReg);
6137 bool NeedsMovDPP = !is32BitOpc;
6142 bool IsWave32 = ST.isWave32();
6143 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6144 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6145 if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
6171 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6175 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6176 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6185 I = ComputeLoop->begin();
6187 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), AccumulatorReg)
6191 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::PHI), ActiveBitsReg)
6195 I = ComputeLoop->end();
6199 IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
6204 bool hasSrc0Modifier = AMDGPU::getNamedOperandIdx(
6205 Opc, AMDGPU::OpName::src0_modifiers) != -1;
6206 bool hasSrc1Modifier = AMDGPU::getNamedOperandIdx(
6207 Opc, AMDGPU::OpName::src1_modifiers) != -1;
6209 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::clamp) != -1;
6211 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::op_sel) != -1;
6213 AMDGPU::getNamedOperandIdx(
Opc, AMDGPU::OpName::omod) != -1;
6214 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6218 if (ST.getInstrInfo()->isVALU(
Opc,
true)) {
6222 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::COPY), LaneValVgpr)
6224 OpDstReg = VgprResultReg;
6225 LaneValueReg = LaneValVgpr;
6228 if (hasSrc0Modifier)
6230 OpInstr.addReg(AccumulatorReg);
6231 if (hasSrc1Modifier)
6233 OpInstr.addReg(LaneValueReg);
6240 if (ST.getInstrInfo()->isVALU(
Opc,
true)) {
6241 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32),
6255 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6259 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::V_READLANE_B32),
6263 auto LaneValue = BuildRegSequence(*ComputeLoop,
I, LaneValReg,
6264 LaneValueLoReg, LaneValueHiReg);
6266 case AMDGPU::S_OR_B64:
6267 case AMDGPU::S_AND_B64:
6268 case AMDGPU::S_XOR_B64: {
6271 .
addReg(LaneValue->getOperand(0).getReg())
6275 case AMDGPU::V_CMP_GT_I64_e64:
6276 case AMDGPU::V_CMP_GT_U64_e64:
6277 case AMDGPU::V_CMP_LT_I64_e64:
6278 case AMDGPU::V_CMP_LT_U64_e64: {
6283 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6285 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6289 BuildRegSequence(*ComputeLoop,
I, AccumulatorVReg, SrcReg0Sub0,
6292 .
addReg(LaneValue->getOperand(0).getReg())
6293 .
addReg(AccumulatorVReg);
6295 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6296 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AndOpc), ComparisonResultReg)
6300 NewAccumulator =
BuildMI(*ComputeLoop,
I,
DL,
6301 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6302 .
addReg(LaneValue->getOperand(0).getReg())
6306 case AMDGPU::V_MIN_F64_e64:
6307 case AMDGPU::V_MIN_NUM_F64_e64:
6308 case AMDGPU::V_MAX_F64_e64:
6309 case AMDGPU::V_MAX_NUM_F64_e64:
6310 case AMDGPU::V_ADD_F64_e64:
6311 case AMDGPU::V_ADD_F64_pseudo_e64: {
6313 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::src);
6315 TRI->getAllocatableClass(
TII->getRegClass(
MI.getDesc(), SrcIdx));
6322 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::COPY), AccumulatorVReg)
6325 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6331 .
addReg(LaneValue->getOperand(0).getReg())
6338 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
6341 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
6343 auto [Op1L, Op1H] =
ExtractSubRegs(*Iters, DstVregInst->getOperand(0),
6345 ReadLaneLo.addReg(Op1L);
6346 ReadLaneHi.addReg(Op1H);
6348 BuildRegSequence(*ComputeLoop,
I, DstReg, LaneValLo, LaneValHi);
6351 case AMDGPU::S_ADD_U64_PSEUDO:
6352 case AMDGPU::S_SUB_U64_PSEUDO: {
6355 .
addReg(LaneValue->getOperand(0).getReg());
6363 unsigned BITSETOpc =
6364 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6365 BuildMI(*ComputeLoop,
I,
DL,
TII->get(BITSETOpc), NewActiveBitsReg)
6371 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6374 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6376 .
addReg(NewActiveBitsReg)
6378 BuildMI(*ComputeLoop,
I,
DL,
TII->get(AMDGPU::S_CBRANCH_SCC1))
6383 assert(ST.hasDPP() &&
"Sub Target does not support DPP Operations");
6400 BuildMI(*CurrBB,
MI,
DL,
TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
6404 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6405 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6408 auto IdentityCopyInstr =
6412 unsigned DPPOpc = std::get<0>(DPPClampOpcPair);
6413 unsigned ClampOpc = std::get<1>(DPPClampOpcPair);
6428 if (isFPOp && !NeedsMovDPP)
6431 if (isFPOp && !NeedsMovDPP)
6435 if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
6444 bool isAddSub =
false,
6445 bool needsCarryIn =
false,
6447 unsigned InstrOpc = ClampOpc;
6450 InstrOpc = AMDGPU::V_ADDC_U32_e64;
6451 auto ClampInstr =
BuildMI(*CurrBB,
MI,
DL,
TII->get(InstrOpc), Dst);
6456 ClampInstr.addReg(CarryOutReg,
6462 ClampInstr.addReg(Src0);
6465 ClampInstr.addReg(Src1);
6468 if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
6469 ClampInstr.addImm(0);
6471 ClampInstr.addImm(0);
6472 LastBcastInstr = ClampInstr;
6477 Opc == AMDGPU::S_ADD_U64_PSEUDO ||
Opc == AMDGPU::S_SUB_U64_PSEUDO;
6478 bool isBitWiseOpc =
Opc == AMDGPU::S_AND_B64 ||
6479 Opc == AMDGPU::S_OR_B64 ||
Opc == AMDGPU::S_XOR_B64;
6481 if (isAddSubOpc || isBitWiseOpc) {
6488 auto [Src0Lo, Src0Hi] =
6490 auto [Src1Lo, Src1Hi] =
6492 Register CarryReg = BuildClampInstr(
6493 ResLo, Src0Lo, Src1Lo, isAddSubOpc,
false);
6494 BuildClampInstr(ResHi, Src0Hi, Src1Hi, isAddSubOpc,
6495 isAddSubOpc, CarryReg);
6496 BuildRegSequence(*CurrBB,
MI, ReturnReg, ResLo, ResHi);
6525 SrcWithIdentityInstr =
6526 BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
6533 MI, IdentityCopyInstr->getOperand(0), SrcRegClass, ST, MRI);
6534 auto [SrcReg0Sub0, SrcReg0Sub1] =
6537 BuildSetInactiveInstr(SrcWithIdentitylo, SrcReg0Sub0, Reg0Sub0);
6539 BuildSetInactiveInstr(SrcWithIdentityhi, SrcReg0Sub1, Reg0Sub1);
6540 SrcWithIdentityInstr =
6541 BuildRegSequence(*CurrBB,
MI, SrcWithIdentity,
6548 BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
6551 DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
6553 BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
6556 DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
6558 BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
6561 DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
6563 BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
6566 DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
6568 if (ST.hasDPPBroadcasts()) {
6571 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
6586 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
6607 BuildRegSequence(*CurrBB,
MI, SwizzledValue64, SwizzledValuelo,
6610 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
6612 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
6615 FinalDPPResult = RowBcast15;
6617 if (ST.hasDPPBroadcasts()) {
6620 RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
6636 BuildMI(*CurrBB,
MI,
DL,
TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
6640 BuildMI(*CurrBB,
MI,
DL,
TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
6646 BuildMI(*CurrBB,
MI,
DL,
TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
6654 BuildMI(*CurrBB,
MI,
DL,
TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
6659 .
addReg(ShiftedThreadID);
6664 .
addReg(PermuteByteOffset)
6674 auto [RowBcast15Lo, RowBcast15Hi] =
6678 .
addReg(PermuteByteOffset)
6683 .
addReg(PermuteByteOffset)
6686 BuildRegSequence(*CurrBB,
MI, PermutedValue, PermutedValuelo,
6690 RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
6692 BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
6694 FinalDPPResult = RowBcast31;
6696 if (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6697 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) {
6707 .
addReg(IsWave32 ? RowBcast15 : RowBcast31)
6710 FinalDPPResult = NegatedValVGPR;
6717 .
addImm(ST.getWavefrontSize() - 1);
6732 .
addImm(ST.getWavefrontSize() - 1);
6736 .
addImm(ST.getWavefrontSize() - 1);
6737 BuildRegSequence(*CurrBB,
MI, ReducedValSGPR, LaneValueLoReg,
6740 if (
Opc == AMDGPU::S_SUB_I32) {
6741 BuildMI(*CurrBB,
MI,
DL,
TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
6744 }
else if (
Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6745 auto NegatedValInstr =
6753 .
addReg(
Opc == AMDGPU::S_SUB_I32 ||
Opc == AMDGPU::S_SUB_U64_PSEUDO
6759 MI.eraseFromParent();
6774 switch (
MI.getOpcode()) {
6775 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6777 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6779 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6781 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6783 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6785 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6788 ? AMDGPU::V_MIN_NUM_F64_e64
6789 : AMDGPU::V_MIN_F64_e64);
6790 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6792 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6794 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6796 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6798 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6800 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6803 ? AMDGPU::V_MAX_NUM_F64_e64
6804 : AMDGPU::V_MAX_F64_e64);
6805 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6807 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6809 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6811 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6814 ? AMDGPU::V_ADD_F64_pseudo_e64
6815 : AMDGPU::V_ADD_F64_e64);
6816 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6818 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6820 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6822 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6827 ? AMDGPU::V_ADD_F64_pseudo_e64
6828 : AMDGPU::V_ADD_F64_e64);
6829 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6831 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6833 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6835 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6837 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6839 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6841 case AMDGPU::S_UADDO_PSEUDO:
6842 case AMDGPU::S_USUBO_PSEUDO: {
6848 unsigned Opc = (
MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6850 : AMDGPU::S_SUB_U32;
6858 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6861 MI.eraseFromParent();
6864 case AMDGPU::S_ADD_U64_PSEUDO:
6865 case AMDGPU::S_SUB_U64_PSEUDO: {
6868 case AMDGPU::V_ADD_U64_PSEUDO:
6869 case AMDGPU::V_SUB_U64_PSEUDO: {
6870 bool IsAdd = (
MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6876 if (ST.hasAddSubU64Insts()) {
6878 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6879 : AMDGPU::V_SUB_U64_e64),
6884 TII->legalizeOperands(*
I);
6885 MI.eraseFromParent();
6889 if (IsAdd && ST.hasLshlAddU64Inst()) {
6895 TII->legalizeOperands(*
Add);
6896 MI.eraseFromParent();
6900 const auto *CarryRC =
TRI->getWaveMaskRegClass();
6910 : &AMDGPU::VReg_64RegClass;
6913 : &AMDGPU::VReg_64RegClass;
6916 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6918 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6921 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6923 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6926 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6928 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6931 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6938 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6952 TII->legalizeOperands(*LoHalf);
6953 TII->legalizeOperands(*HiHalf);
6954 MI.eraseFromParent();
6957 case AMDGPU::S_ADD_CO_PSEUDO:
6958 case AMDGPU::S_SUB_CO_PSEUDO: {
6970 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6976 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6981 if (
TRI->isVectorRegister(MRI, Src2.
getReg())) {
6982 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6987 if (ST.isWave64()) {
6988 if (ST.hasScalarCompareEq64()) {
6995 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6997 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6999 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
7002 BuildMI(*BB, MII,
DL,
TII->get(AMDGPU::S_OR_B32), Src2_32)
7016 unsigned Opc =
MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
7017 ? AMDGPU::S_ADDC_U32
7018 : AMDGPU::S_SUBB_U32;
7023 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
7029 MI.eraseFromParent();
7032 case AMDGPU::SI_INIT_M0: {
7035 TII->get(M0Init.
isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
7038 MI.eraseFromParent();
7041 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
7044 TII->get(AMDGPU::S_CMP_EQ_U32))
7049 case AMDGPU::GET_GROUPSTATICSIZE: {
7053 .
add(
MI.getOperand(0))
7055 MI.eraseFromParent();
7058 case AMDGPU::GET_SHADERCYCLESHILO: {
7073 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
7076 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
7079 .
addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
7088 .
add(
MI.getOperand(0))
7093 MI.eraseFromParent();
7096 case AMDGPU::SI_INDIRECT_SRC_V1:
7097 case AMDGPU::SI_INDIRECT_SRC_V2:
7098 case AMDGPU::SI_INDIRECT_SRC_V3:
7099 case AMDGPU::SI_INDIRECT_SRC_V4:
7100 case AMDGPU::SI_INDIRECT_SRC_V5:
7101 case AMDGPU::SI_INDIRECT_SRC_V6:
7102 case AMDGPU::SI_INDIRECT_SRC_V7:
7103 case AMDGPU::SI_INDIRECT_SRC_V8:
7104 case AMDGPU::SI_INDIRECT_SRC_V9:
7105 case AMDGPU::SI_INDIRECT_SRC_V10:
7106 case AMDGPU::SI_INDIRECT_SRC_V11:
7107 case AMDGPU::SI_INDIRECT_SRC_V12:
7108 case AMDGPU::SI_INDIRECT_SRC_V16:
7109 case AMDGPU::SI_INDIRECT_SRC_V32:
7111 case AMDGPU::SI_INDIRECT_DST_V1:
7112 case AMDGPU::SI_INDIRECT_DST_V2:
7113 case AMDGPU::SI_INDIRECT_DST_V3:
7114 case AMDGPU::SI_INDIRECT_DST_V4:
7115 case AMDGPU::SI_INDIRECT_DST_V5:
7116 case AMDGPU::SI_INDIRECT_DST_V6:
7117 case AMDGPU::SI_INDIRECT_DST_V7:
7118 case AMDGPU::SI_INDIRECT_DST_V8:
7119 case AMDGPU::SI_INDIRECT_DST_V9:
7120 case AMDGPU::SI_INDIRECT_DST_V10:
7121 case AMDGPU::SI_INDIRECT_DST_V11:
7122 case AMDGPU::SI_INDIRECT_DST_V12:
7123 case AMDGPU::SI_INDIRECT_DST_V16:
7124 case AMDGPU::SI_INDIRECT_DST_V32:
7126 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7127 case AMDGPU::SI_KILL_I1_PSEUDO:
7129 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
7133 case AMDGPU::SI_BR_UNDEF: {
7135 .
add(
MI.getOperand(0));
7137 MI.eraseFromParent();
7140 case AMDGPU::ADJCALLSTACKUP:
7141 case AMDGPU::ADJCALLSTACKDOWN: {
7148 case AMDGPU::SI_CALL_ISEL: {
7149 unsigned ReturnAddrReg =
TII->getRegisterInfo().getReturnAddressReg(*MF);
7152 MIB =
BuildMI(*BB,
MI,
DL,
TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
7158 MI.eraseFromParent();
7161 case AMDGPU::V_ADD_CO_U32_e32:
7162 case AMDGPU::V_SUB_CO_U32_e32:
7163 case AMDGPU::V_SUBREV_CO_U32_e32: {
7165 unsigned Opc =
MI.getOpcode();
7167 bool NeedClampOperand =
false;
7168 if (
TII->pseudoToMCOpcode(
Opc) == -1) {
7170 NeedClampOperand =
true;
7174 if (
TII->isVOP3(*
I)) {
7177 I.add(
MI.getOperand(1)).add(
MI.getOperand(2));
7178 if (NeedClampOperand)
7181 TII->legalizeOperands(*
I);
7183 MI.eraseFromParent();
7186 case AMDGPU::V_ADDC_U32_e32:
7187 case AMDGPU::V_SUBB_U32_e32:
7188 case AMDGPU::V_SUBBREV_U32_e32:
7191 TII->legalizeOperands(
MI);
7193 case AMDGPU::DS_GWS_INIT:
7194 case AMDGPU::DS_GWS_SEMA_BR:
7195 case AMDGPU::DS_GWS_BARRIER:
7196 case AMDGPU::DS_GWS_SEMA_V:
7197 case AMDGPU::DS_GWS_SEMA_P:
7198 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
7206 case AMDGPU::S_SETREG_B32: {
7222 const unsigned SetMask = WidthMask <<
Offset;
7225 unsigned SetDenormOp = 0;
7226 unsigned SetRoundOp = 0;
7234 SetRoundOp = AMDGPU::S_ROUND_MODE;
7235 SetDenormOp = AMDGPU::S_DENORM_MODE;
7237 SetRoundOp = AMDGPU::S_ROUND_MODE;
7239 SetDenormOp = AMDGPU::S_DENORM_MODE;
7242 if (SetRoundOp || SetDenormOp) {
7244 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
7245 unsigned ImmVal = Def->getOperand(1).getImm();
7259 MI.eraseFromParent();
7268 MI.setDesc(
TII->get(AMDGPU::S_SETREG_B32_mode));
7272 case AMDGPU::S_INVERSE_BALLOT_U32:
7273 case AMDGPU::S_INVERSE_BALLOT_U64:
7276 MI.setDesc(
TII->get(AMDGPU::COPY));
7278 case AMDGPU::ENDPGM_TRAP: {
7280 MI.setDesc(
TII->get(AMDGPU::S_ENDPGM));
7300 MI.eraseFromParent();
7303 case AMDGPU::SIMULATED_TRAP: {
7304 assert(Subtarget->hasPrivEnabledTrap2NopBug());
7306 TII->insertSimulatedTrap(MRI, *BB,
MI,
MI.getDebugLoc());
7307 MI.eraseFromParent();
7310 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
7311 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
7317 assert(Setup &&
"Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
7318 Register OriginalExec = Setup->getOperand(0).getReg();
7320 MI.getOperand(0).setReg(OriginalExec);
7357 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
7361 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
7388 if (!Subtarget->hasMadMacF32Insts())
7389 return Subtarget->hasFastFMAF32();
7395 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
7398 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
7414 switch (Ty.getScalarSizeInBits()) {
7432 if (Ty.getScalarSizeInBits() == 16)
7434 if (Ty.getScalarSizeInBits() == 32)
7435 return Subtarget->hasMadMacF32Insts() &&
7445 EVT VT =
N->getValueType(0);
7447 return Subtarget->hasMadMacF32Insts() &&
7449 if (VT == MVT::f16) {
7450 return Subtarget->hasMadF16() &&
7465 unsigned Opc =
Op.getOpcode();
7466 EVT VT =
Op.getValueType();
7478 LoOps.
append(TrailingOps.begin(), TrailingOps.end());
7479 HiOps.
append(TrailingOps.begin(), TrailingOps.end());
7492 [[maybe_unused]]
EVT VT =
Op.getValueType();
7494 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
7495 VT == MVT::v16i32) &&
7496 "Unexpected ValueType.");
7505 unsigned Opc =
Op.getOpcode();
7506 EVT VT =
Op.getValueType();
7515 DAG.
getNode(
Opc, SL, Lo0.getValueType(), Lo0, Lo1,
Op->getFlags());
7517 DAG.
getNode(
Opc, SL, Hi0.getValueType(), Hi0, Hi1,
Op->getFlags());
7524 unsigned Opc =
Op.getOpcode();
7525 EVT VT =
Op.getValueType();
7531 : std::pair(Op0, Op0);
7540 DAG.
getNode(
Opc, SL, ResVT.first, Lo0, Lo1, Lo2,
Op->getFlags());
7542 DAG.
getNode(
Opc, SL, ResVT.second, Hi0, Hi1, Hi2,
Op->getFlags());
7548 switch (
Op.getOpcode()) {
7552 return LowerBRCOND(
Op, DAG);
7554 return LowerRETURNADDR(
Op, DAG);
7556 return LowerSPONENTRY(
Op, DAG);
7559 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7560 "Load should return a value and a chain");
7564 EVT VT =
Op.getValueType();
7566 return lowerFSQRTF32(
Op, DAG);
7568 return lowerFSQRTF64(
Op, DAG);
7573 return LowerTrig(
Op, DAG);
7575 return LowerSELECT(
Op, DAG);
7577 return LowerFDIV(
Op, DAG);
7579 return LowerFFREXP(
Op, DAG);
7581 return LowerATOMIC_CMP_SWAP(
Op, DAG);
7583 return LowerSTORE(
Op, DAG);
7587 return LowerGlobalAddress(MFI,
Op, DAG);
7592 return LowerExternalSymbol(
Op, DAG);
7594 return LowerINTRINSIC_WO_CHAIN(
Op, DAG);
7596 return LowerINTRINSIC_W_CHAIN(
Op, DAG);
7598 return LowerINTRINSIC_VOID(
Op, DAG);
7600 return lowerADDRSPACECAST(
Op, DAG);
7602 return lowerINSERT_SUBVECTOR(
Op, DAG);
7604 return lowerINSERT_VECTOR_ELT(
Op, DAG);
7606 return lowerEXTRACT_VECTOR_ELT(
Op, DAG);
7608 return lowerVECTOR_SHUFFLE(
Op, DAG);
7610 return lowerSCALAR_TO_VECTOR(
Op, DAG);
7612 return lowerBUILD_VECTOR(
Op, DAG);
7615 return lowerFP_ROUND(
Op, DAG);
7617 return lowerTRAP(
Op, DAG);
7619 return lowerDEBUGTRAP(
Op, DAG);
7628 if (
Op.getValueType().isVector() &&
Op.getValueType() != MVT::v2i16 &&
7629 Op.getOperand(0).getValueType().getScalarType() == MVT::f32)
7634 return lowerFMINNUM_FMAXNUM(
Op, DAG);
7637 return lowerFMINIMUMNUM_FMAXIMUMNUM(
Op, DAG);
7640 return lowerFMINIMUM_FMAXIMUM(
Op, DAG);
7643 return lowerFLDEXP(
Op, DAG);
7648 if (Subtarget->hasVCvtPkIU16F32() &&
Op.getValueType() == MVT::i16 &&
7649 Op.getOperand(0).getValueType() == MVT::f32) {
7673 return lowerFCOPYSIGN(
Op, DAG);
7675 return lowerMUL(
Op, DAG);
7678 return lowerXMULO(
Op, DAG);
7681 return lowerXMUL_LOHI(
Op, DAG);
7702 return LowerINLINEASM(
Op, DAG);
7718 EVT FittingLoadVT = LoadVT;
7750SDValue SITargetLowering::adjustLoadValueType(
unsigned Opcode,
MemSDNode *M,
7753 bool IsIntrinsic)
const {
7756 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7757 EVT LoadVT =
M->getValueType(0);
7759 EVT EquivLoadVT = LoadVT;
7773 SDVTList VTList = DAG.
getVTList(EquivLoadVT, MVT::Other);
7777 M->getMemoryVT(),
M->getMemOperand());
7788 EVT LoadVT =
M->getValueType(0);
7794 assert(
M->getNumValues() == 2 ||
M->getNumValues() == 3);
7795 bool IsTFE =
M->getNumValues() == 3;
7797 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7798 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7799 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7800 : AMDGPUISD::BUFFER_LOAD;
7803 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG,
Ops);
7808 return handleByteShortBufferLoads(DAG, LoadVT,
DL,
Ops,
M->getMemOperand(),
7812 return getMemIntrinsicNode(
Opc,
DL,
M->getVTList(),
Ops, IntVT,
7813 M->getMemOperand(), DAG);
7817 SDVTList VTList = DAG.
getVTList(CastVT, MVT::Other);
7819 M->getMemOperand(), DAG);
7827 EVT VT =
N->getValueType(0);
7828 unsigned CondCode =
N->getConstantOperandVal(3);
7839 EVT CmpVT =
LHS.getValueType();
7840 if (CmpVT == MVT::i16 && !TLI.
isTypeLegal(MVT::i16)) {
7841 unsigned PromoteOp =
7861 EVT VT =
N->getValueType(0);
7863 unsigned CondCode =
N->getConstantOperandVal(3);
7872 if (CmpVT == MVT::f16 && !TLI.
isTypeLegal(CmpVT)) {
7881 SDValue SetCC = DAG.
getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7890 EVT VT =
N->getValueType(0);
7914 Exec = AMDGPU::EXEC_LO;
7916 Exec = AMDGPU::EXEC;
7936 EVT VT =
N->getValueType(0);
7938 unsigned IID =
N->getConstantOperandVal(0);
7939 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7940 IID == Intrinsic::amdgcn_permlanex16;
7941 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7942 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7943 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
7944 IID == Intrinsic::amdgcn_permlane_up ||
7945 IID == Intrinsic::amdgcn_permlane_down ||
7946 IID == Intrinsic::amdgcn_permlane_xor;
7951 if ((IsPermLane16 && !ST->hasPermlane16Insts()) ||
7952 (IID == Intrinsic::amdgcn_mov_dpp8 && !ST->hasDPP8()))
7955 unsigned SplitSize = 32;
7956 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7957 ST->hasDPALU_DPP() &&
7965 case Intrinsic::amdgcn_permlane16:
7966 case Intrinsic::amdgcn_permlanex16:
7967 case Intrinsic::amdgcn_update_dpp:
7972 case Intrinsic::amdgcn_writelane:
7973 case Intrinsic::amdgcn_permlane_bcast:
7974 case Intrinsic::amdgcn_permlane_up:
7975 case Intrinsic::amdgcn_permlane_down:
7976 case Intrinsic::amdgcn_permlane_xor:
7979 case Intrinsic::amdgcn_readlane:
7980 case Intrinsic::amdgcn_set_inactive:
7981 case Intrinsic::amdgcn_set_inactive_chain_arg:
7982 case Intrinsic::amdgcn_mov_dpp8:
7985 case Intrinsic::amdgcn_readfirstlane:
7986 case Intrinsic::amdgcn_permlane64:
7994 std::reverse(Operands.
begin(), Operands.
end());
7996 if (
SDNode *GL =
N->getGluedNode()) {
7998 GL = GL->getOperand(0).getNode();
8008 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
8009 IID == Intrinsic::amdgcn_mov_dpp8 ||
8010 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
8011 IsPermlaneShuffle) {
8012 Src1 =
N->getOperand(2);
8013 if (IID == Intrinsic::amdgcn_writelane ||
8014 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16 ||
8016 Src2 =
N->getOperand(3);
8019 if (ValSize == SplitSize) {
8029 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
8034 if (IID == Intrinsic::amdgcn_writelane) {
8039 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
8041 return IsFloat ? DAG.
getBitcast(VT, Trunc) : Trunc;
8044 if (ValSize % SplitSize != 0)
8048 EVT VT =
N->getValueType(0);
8052 unsigned NumOperands =
N->getNumOperands();
8054 SDNode *GL =
N->getGluedNode();
8059 for (
unsigned i = 0; i != NE; ++i) {
8060 for (
unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
8062 SDValue Operand =
N->getOperand(j);
8071 Operands[j] = Operand;
8076 Operands[NumOperands - 1] =
8092 if (SplitSize == 32) {
8094 return unrollLaneOp(LaneOp.
getNode());
8100 unsigned SubVecNumElt =
8104 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
8105 for (
unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
8109 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
8115 createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT));
8116 }
else if (IID == Intrinsic::amdgcn_writelane) {
8120 createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
8122 Pieces.
push_back(createLaneOp(Src0SubVec, Src1, Src2, SubVecVT));
8125 EltIdx += SubVecNumElt;
8139 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
8142 if (IID == Intrinsic::amdgcn_writelane)
8145 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
8152 EVT VT =
N->getValueType(0);
8170 auto MakeIntrinsic = [&DAG, &SL](
unsigned IID,
MVT RetVT,
8174 Operands.
append(IntrinArgs);
8180 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8181 {ShiftedIndex, ValueI32});
8191 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8192 {ValueI32, PoisonVal});
8193 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8194 {ShiftedIndex, PoisonVal});
8197 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
8200 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8201 {WWMIndex, WWMValue});
8202 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
8203 MVT::i32, {WWMIndex, Swapped});
8205 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
8213 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
8221 DAG.
getSetCC(SL, MVT::i1, SameOrOtherHalf,
8231 switch (
N->getOpcode()) {
8243 unsigned IID =
N->getConstantOperandVal(0);
8245 case Intrinsic::amdgcn_make_buffer_rsrc:
8246 Results.push_back(lowerPointerAsRsrcIntrin(
N, DAG));
8248 case Intrinsic::amdgcn_cvt_pkrtz: {
8253 DAG.
getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
8257 case Intrinsic::amdgcn_cvt_pknorm_i16:
8258 case Intrinsic::amdgcn_cvt_pknorm_u16:
8259 case Intrinsic::amdgcn_cvt_pk_i16:
8260 case Intrinsic::amdgcn_cvt_pk_u16: {
8266 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
8267 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8268 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
8269 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8270 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
8271 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8273 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8275 EVT VT =
N->getValueType(0);
8284 case Intrinsic::amdgcn_s_buffer_load: {
8290 if (!Subtarget->hasScalarSubwordLoads())
8296 EVT VT =
Op.getValueType();
8297 assert(VT == MVT::i8 &&
"Expected 8-bit s_buffer_load intrinsics.\n");
8309 if (!
Offset->isDivergent()) {
8328 LoadVal = handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
8333 case Intrinsic::amdgcn_dead: {
8334 for (
unsigned I = 0, E =
N->getNumValues();
I < E; ++
I)
8345 for (
unsigned I = 0;
I < Res.getNumOperands();
I++) {
8346 Results.push_back(Res.getOperand(
I));
8350 Results.push_back(Res.getValue(1));
8359 EVT VT =
N->getValueType(0);
8364 EVT SelectVT = NewVT;
8365 if (NewVT.
bitsLT(MVT::i32)) {
8368 SelectVT = MVT::i32;
8374 if (NewVT != SelectVT)
8380 if (
N->getValueType(0) != MVT::v2f16)
8392 if (
N->getValueType(0) != MVT::v2f16)
8404 if (
N->getValueType(0) != MVT::f16)
8419 if (U.get() !=
Value)
8422 if (U.getUser()->getOpcode() == Opcode)
8428unsigned SITargetLowering::isCFIntrinsic(
const SDNode *Intr)
const {
8431 case Intrinsic::amdgcn_if:
8432 return AMDGPUISD::IF;
8433 case Intrinsic::amdgcn_else:
8434 return AMDGPUISD::ELSE;
8435 case Intrinsic::amdgcn_loop:
8436 return AMDGPUISD::LOOP;
8437 case Intrinsic::amdgcn_end_cf:
8457 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
8480 assert(GVar->isDeclaration() &&
"AS3 GVs should be declaration here "
8481 "when object linking is enabled");
8496 SDNode *Intr = BRCOND.getOperand(1).getNode();
8513 Intr =
LHS.getNode();
8521 assert(BR &&
"brcond missing unconditional branch user");
8526 unsigned CFNode = isCFIntrinsic(Intr);
8546 Ops.push_back(Target);
8569 for (
unsigned i = 1, e = Intr->
getNumValues() - 1; i != e; ++i) {
8588 MVT VT =
Op.getSimpleValueType();
8591 if (
Op.getConstantOperandVal(0) != 0)
8595 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8597 if (
Info->isEntryFunction())
8614 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
8628 return Op.getValueType().bitsLE(VT)
8636 EVT DstVT =
Op.getValueType();
8643 unsigned Opc =
Op.getOpcode();
8655 EVT SrcVT = Src.getValueType();
8656 EVT DstVT =
Op.getValueType();
8659 assert(Subtarget->hasCvtPkF16F32Inst() &&
"support v_cvt_pk_f16_f32");
8662 return SrcVT == MVT::v2f32 ?
Op : splitFP_ROUNDVectorOp(
Op, DAG);
8669 if (DstVT == MVT::f16) {
8674 if (!Subtarget->has16BitInsts()) {
8679 if (
Op->getFlags().hasApproximateFuncs()) {
8690 "custom lower FP_ROUND for f16 or bf16");
8691 assert(Subtarget->hasBF16ConversionInsts() &&
"f32 -> bf16 is legal");
8703 EVT VT =
Op.getValueType();
8705 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8706 bool IsIEEEMode =
Info->getMode().IEEE;
8715 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8722SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(
SDValue Op,
8724 EVT VT =
Op.getValueType();
8726 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
8727 bool IsIEEEMode =
Info->getMode().IEEE;
8732 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8740 EVT VT =
Op.getValueType();
8744 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8745 !Subtarget->hasMinimum3Maximum3F16() &&
8746 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8747 "should not need to widen f16 minimum/maximum to v2f16");
8761 DAG.
getNode(
Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8769 EVT VT =
Op.getValueType();
8773 EVT ExpVT =
Exp.getValueType();
8774 if (ExpVT == MVT::i16)
8795 {
Op.getOperand(0),
Op.getOperand(1), TruncExp});
8802 switch (
Op->getOpcode()) {
8834SITargetLowering::promoteUniformUnaryOpToI32(
SDValue Op,
8835 DAGCombinerInfo &DCI)
const {
8836 EVT OpTy =
Op.getValueType();
8837 SelectionDAG &DAG = DCI.DAG;
8846 Input = DAG.
getNode(ExtOp,
DL, ExtTy, Input);
8854 DAGCombinerInfo &DCI)
const {
8855 const unsigned Opc =
Op.getOpcode();
8864 :
Op->getOperand(0).getValueType();
8865 auto &DAG = DCI.DAG;
8868 if (DCI.isBeforeLegalizeOps() ||
8876 LHS =
Op->getOperand(1);
8877 RHS =
Op->getOperand(2);
8879 LHS =
Op->getOperand(0);
8880 RHS =
Op->getOperand(1);
8919 if (MagVT == SignVT)
8940 EVT VT =
Op.getValueType();
8946 assert(VT == MVT::i64 &&
"The following code is a special for s_mul_u64");
8973 if (
Op->isDivergent())
8986 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8988 DAG.
getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8991 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8993 DAG.
getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8999 EVT VT =
Op.getValueType();
9006 const APInt &
C = RHSC->getAPIntValue();
9008 if (
C.isPowerOf2()) {
9010 bool UseArithShift =
isSigned && !
C.isMinSignedValue();
9037 if (
Op->isDivergent()) {
9041 if (Subtarget->hasSMulHi()) {
9052 if (!Subtarget->hasTrapHandler() ||
9054 return lowerTrapEndpgm(
Op, DAG);
9056 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(
Op, DAG)
9057 : lowerTrapHsaQueuePtr(
Op, DAG);
9063 return DAG.
getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
9067SITargetLowering::loadImplicitKernelArgument(
SelectionDAG &DAG,
MVT VT,
9069 ImplicitParameter Param)
const {
9073 MachinePointerInfo PtrInfo =
9090 loadImplicitKernelArgument(DAG, MVT::i64, SL,
Align(8),
QUEUE_PTR);
9093 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
9096 if (UserSGPR == AMDGPU::NoRegister) {
9113 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
9122 if (Subtarget->hasPrivEnabledTrap2NopBug())
9123 return DAG.
getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
9127 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
9135 if (!Subtarget->hasTrapHandler() ||
9139 "debugtrap handler not supported",
9147 return DAG.
getNode(AMDGPUISD::TRAP, SL, MVT::Other,
Ops);
9157 const SIRegisterInfo *
TRI = Subtarget->getRegisterInfo();
9158 SmallSet<Register, 8> SGPRInputRegs;
9160 unsigned NumVals = 0;
9163 const InlineAsm::Flag
Flags(
Op.getConstantOperandVal(
I));
9164 NumVals =
Flags.getNumOperandRegisters();
9168 NumVals > 0 &&
Flags.hasRegClassConstraint(RCID) &&
9169 TRI->isSGPRClass(
TRI->getRegClass(RCID));
9171 for (
unsigned J = 0; J < NumVals; ++J) {
9173 if (
const RegisterSDNode *RegNode =
9182 if (SGPRInputRegs.
empty())
9187 SDNode *
N =
Op.getOperand(
NumOps - 1).getNode();
9199 ReadFirstLaneID, SrcVal);
9203 if (
N->getNumOperands() > 3)
9204 Ops.push_back(
N->getOperand(3));
9210 SDNode *
Next =
nullptr;
9211 for (
unsigned I = 0,
E =
N->getNumOperands();
I !=
E; ++
I) {
9212 if (
N->getOperand(
I).getValueType() == MVT::Glue) {
9213 Next =
N->getOperand(
I).getNode();
9223SDValue SITargetLowering::getSegmentAperture(
unsigned AS,
const SDLoc &
DL,
9225 if (Subtarget->hasApertureRegs()) {
9227 ? AMDGPU::SRC_SHARED_BASE
9228 : AMDGPU::SRC_PRIVATE_BASE;
9229 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
9230 !Subtarget->hasGloballyAddressableScratch()) &&
9231 "Cannot use src_private_base with globally addressable scratch!");
9252 return loadImplicitKernelArgument(DAG, MVT::i32,
DL,
Align(4), Param);
9256 SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
9258 if (UserSGPR == AMDGPU::NoRegister) {
9303 const AMDGPUTargetMachine &TM =
9306 unsigned DestAS, SrcAS;
9308 bool IsNonNull =
false;
9310 SrcAS = ASC->getSrcAddressSpace();
9311 Src = ASC->getOperand(0);
9312 DestAS = ASC->getDestAddressSpace();
9315 Op.getConstantOperandVal(0) ==
9316 Intrinsic::amdgcn_addrspacecast_nonnull);
9317 Src =
Op->getOperand(1);
9318 SrcAS =
Op->getConstantOperandVal(2);
9319 DestAS =
Op->getConstantOperandVal(3);
9332 Subtarget->hasGloballyAddressableScratch()) {
9337 AMDGPU::S_MOV_B32, SL, MVT::i32,
9338 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
9361 Subtarget->hasGloballyAddressableScratch()) {
9370 if (Subtarget->isWave64())
9376 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
9384 AMDGPU::S_MOV_B64, SL, MVT::i64,
9385 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
9387 CvtPtr = DAG.
getNode(
ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
9389 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
9409 Op.getValueType() == MVT::i64) {
9410 const SIMachineFunctionInfo *
Info =
9412 if (
Info->get32BitAddressHighBits() == 0)
9421 Src.getValueType() == MVT::i64)
9449 assert(InsNumElts % 2 == 0 &&
"expect legal vector types");
9454 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
9456 MVT::i32, InsNumElts / 2);
9461 for (
unsigned I = 0;
I != InsNumElts / 2; ++
I) {
9463 if (InsNumElts == 2) {
9476 for (
unsigned I = 0;
I != InsNumElts; ++
I) {
9499 if (NumElts == 4 && EltSize == 16 && KIdx) {
9510 unsigned Idx = KIdx->getZExtValue();
9511 bool InsertLo = Idx < 2;
9515 DAG.
getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
9521 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
9534 assert(VecSize <= 64 &&
"Expected target vector size to be <= 64 bits");
9569 EVT ResultVT =
Op.getValueType();
9582 if (
SDValue Combined = performExtractVectorEltCombine(
Op.getNode(), DCI))
9585 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
9589 if (VecSize == 128) {
9597 }
else if (VecSize == 256) {
9600 for (
unsigned P = 0;
P < 4; ++
P) {
9606 Parts[0], Parts[1]));
9608 Parts[2], Parts[3]));
9614 for (
unsigned P = 0;
P < 8; ++
P) {
9621 Parts[0], Parts[1], Parts[2], Parts[3]));
9624 Parts[4], Parts[5], Parts[6], Parts[7]));
9644 Src = DAG.
getBitcast(Src.getValueType().changeTypeToInteger(), Src);
9659 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
9669 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9674 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9675 !(Mask[Elt + 1] & 1);
9681 EVT ResultVT =
Op.getValueType();
9684 const int NewSrcNumElts = 2;
9686 int SrcNumElts =
Op.getOperand(0).getValueType().getVectorNumElements();
9702 const bool ShouldUseConsecutiveExtract = EltVT.
getSizeInBits() == 16;
9724 if (ShouldUseConsecutiveExtract &&
9727 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9728 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9740 if (Idx0 >= SrcNumElts) {
9745 if (Idx1 >= SrcNumElts) {
9750 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9751 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9759 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9760 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9765 if (SubVec0 != SubVec1) {
9766 NewMaskIdx1 += NewSrcNumElts;
9773 {NewMaskIdx0, NewMaskIdx1});
9778 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9779 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9780 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9781 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9800 EVT ResultVT =
Op.getValueType();
9816 EVT VT =
Op.getValueType();
9818 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9819 assert(!Subtarget->hasVOP3PInsts() &&
"this should be legal");
9853 for (
unsigned P = 0;
P < NumParts; ++
P) {
9855 PartVT, SL, {
Op.getOperand(
P * 2),
Op.getOperand(
P * 2 + 1)});
9881 if (!Subtarget->isAmdHsaOS())
9924 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET64,
DL, PtrVT, Ptr);
9933 return DAG.
getNode(AMDGPUISD::PC_ADD_REL_OFFSET,
DL, PtrVT, PtrLo, PtrHi);
9941 EVT PtrVT =
Op.getValueType();
9943 const GlobalValue *GV = GSD->
getGlobal();
9957 assert(PtrVT == MVT::i32 &&
"32-bit pointer is expected.");
9972 return DAG.
getNode(AMDGPUISD::LDS,
DL, MVT::i32, GA);
9975 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9976 if (Subtarget->has64BitLiterals()) {
10007 MachinePointerInfo PtrInfo =
10020 Fn,
"unsupported external symbol",
Op.getDebugLoc()));
10042 unsigned Offset)
const {
10044 SDValue Param = lowerKernargMemParameter(
10055 "non-hsa intrinsic with hsa target",
DL.getDebugLoc()));
10063 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
10071 unsigned NumElts = Elts.
size();
10073 if (NumElts <= 12) {
10077 Type = MVT::v16f32;
10082 for (
unsigned i = 0; i < Elts.
size(); ++i) {
10088 for (
unsigned i = Elts.
size(); i < NumElts; ++i)
10097 SDValue Src,
int ExtraElts) {
10098 EVT SrcVT = Src.getValueType();
10108 while (ExtraElts--)
10119 bool Unpacked,
bool IsD16,
int DMaskPop,
10120 int NumVDataDwords,
bool IsAtomicPacked16Bit,
10124 EVT ReqRetVT = ResultTypes[0];
10126 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
10127 ? (ReqRetNumElts + 1) / 2
10130 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
10133 NumDataDwords == 1 ? MVT::i32 :
MVT::getVectorVT(MVT::i32, NumDataDwords);
10136 MaskPopDwords == 1 ? MVT::i32 :
MVT::getVectorVT(MVT::i32, MaskPopDwords);
10141 if (DMaskPop > 0 &&
Data.getValueType() != MaskPopVT) {
10145 SDValue(Result, 0), ZeroIdx);
10148 SDValue(Result, 0), ZeroIdx);
10152 if (DataDwordVT.
isVector() && !IsAtomicPacked16Bit)
10154 NumDataDwords - MaskPopDwords);
10159 EVT LegalReqRetVT = ReqRetVT;
10161 if (!
Data.getValueType().isInteger())
10163 Data.getValueType().changeTypeToInteger(),
Data);
10184 if (Result->getNumValues() == 1)
10191 SDValue *LWE,
bool &IsTexFail) {
10211 unsigned DimIdx,
unsigned EndIdx,
10212 unsigned NumGradients) {
10214 for (
unsigned I = DimIdx;
I < EndIdx;
I++) {
10222 if (((
I + 1) >= EndIdx) ||
10223 ((NumGradients / 2) % 2 == 1 && (
I == DimIdx + (NumGradients / 2) - 1 ||
10224 I == DimIdx + NumGradients - 1))) {
10246 !
Op.getNode()->hasAnyUseOfValue(0))
10248 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
10259 ResultTypes.erase(&ResultTypes[0]);
10261 bool IsD16 =
false;
10262 bool IsG16 =
false;
10263 bool IsA16 =
false;
10265 int NumVDataDwords = 0;
10266 bool AdjustRetType =
false;
10267 bool IsAtomicPacked16Bit =
false;
10270 const unsigned ArgOffset = WithChain ? 2 : 1;
10273 unsigned DMaskLanes = 0;
10275 if (BaseOpcode->
Atomic) {
10276 VData =
Op.getOperand(2);
10278 IsAtomicPacked16Bit =
10279 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
10280 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
10281 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
10282 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
10293 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
10295 DMask = Is64Bit ? 0xf : 0x3;
10296 NumVDataDwords = Is64Bit ? 4 : 2;
10298 DMask = Is64Bit ? 0x3 : 0x1;
10299 NumVDataDwords = Is64Bit ? 2 : 1;
10302 DMask =
Op->getConstantOperandVal(ArgOffset + Intr->
DMaskIndex);
10305 if (BaseOpcode->
Store) {
10306 VData =
Op.getOperand(2);
10310 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
10314 VData = handleD16VData(VData, DAG,
true);
10317 NumVDataDwords = (VData.
getValueType().getSizeInBits() + 31) / 32;
10318 }
else if (!BaseOpcode->
NoReturn) {
10323 if (!Subtarget->hasD16Images() || !BaseOpcode->
HasD16)
10331 (!LoadVT.
isVector() && DMaskLanes > 1))
10337 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
10338 !(BaseOpcode->
Gather4 && Subtarget->hasImageGather4D16Bug()))
10339 NumVDataDwords = (DMaskLanes + 1) / 2;
10341 NumVDataDwords = DMaskLanes;
10343 AdjustRetType =
true;
10347 unsigned VAddrEnd = ArgOffset + Intr->
VAddrEnd;
10354 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10355 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10357 VAddrVT =
Op.getOperand(ArgOffset + Intr->
CoordStart).getSimpleValueType();
10359 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10360 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10364 if (IsA16 && (
Op.getOperand(ArgOffset +
I).getValueType() == MVT::f16)) {
10370 {
Op.getOperand(ArgOffset +
I), DAG.
getPOISON(MVT::f16)});
10374 "Bias needs to be converted to 16 bit in A16 mode");
10379 if (BaseOpcode->
Gradients && !
ST->hasG16() && (IsA16 != IsG16)) {
10383 dbgs() <<
"Failed to lower image intrinsic: 16 bit addresses "
10384 "require 16 bit args for both gradients and addresses");
10389 if (!
ST->hasA16()) {
10390 LLVM_DEBUG(
dbgs() <<
"Failed to lower image intrinsic: Target does not "
10391 "support 16 bit addresses\n");
10401 if (BaseOpcode->
Gradients && IsG16 &&
ST->hasG16()) {
10403 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
10405 IntrOpcode = G16MappingInfo->
G16;
10428 for (
unsigned I = ArgOffset + Intr->
CoordStart;
I < VAddrEnd;
I++)
10446 const unsigned NSAMaxSize =
ST->getNSAMaxSize(BaseOpcode->
Sampler);
10447 const bool HasPartialNSAEncoding =
ST->hasPartialNSAEncoding();
10448 const bool UseNSA =
ST->hasNSAEncoding() &&
10449 VAddrs.
size() >=
ST->getNSAThreshold(MF) &&
10450 (VAddrs.
size() <= NSAMaxSize || HasPartialNSAEncoding);
10451 const bool UsePartialNSA =
10452 UseNSA && HasPartialNSAEncoding && VAddrs.
size() > NSAMaxSize;
10455 if (UsePartialNSA) {
10457 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
10458 }
else if (!UseNSA) {
10468 uint64_t UnormConst =
10469 Op.getConstantOperandVal(ArgOffset + Intr->
UnormIndex);
10471 Unorm = UnormConst ? True : False;
10477 bool IsTexFail =
false;
10478 if (!
parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
10487 NumVDataDwords = 1;
10489 NumVDataDwords += 1;
10490 AdjustRetType =
true;
10495 if (AdjustRetType) {
10498 if (DMaskLanes == 0 && !BaseOpcode->
Store) {
10507 MVT::i32, NumVDataDwords)
10510 ResultTypes[0] = NewVT;
10511 if (ResultTypes.size() == 3) {
10515 ResultTypes.erase(&ResultTypes[1]);
10529 Ops.push_back(VData);
10530 if (UsePartialNSA) {
10532 Ops.push_back(VAddr);
10536 Ops.push_back(VAddr);
10539 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10541 Ops.push_back(Rsrc);
10546 Ops.push_back(Samp);
10551 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
10552 Ops.push_back(Unorm);
10554 Ops.push_back(IsA16 &&
10555 ST->hasFeature(AMDGPU::FeatureR128A16)
10559 Ops.push_back(IsA16 ? True : False);
10561 if (!Subtarget->hasGFX90AInsts())
10562 Ops.push_back(TFE);
10566 "TFE is not supported on this GPU",
DL.getDebugLoc()));
10569 if (!IsGFX12Plus || BaseOpcode->
Sampler || BaseOpcode->
MSAA)
10570 Ops.push_back(LWE);
10572 Ops.push_back(DimInfo->
DA ? True : False);
10574 Ops.push_back(IsD16 ? True : False);
10576 Ops.push_back(
Op.getOperand(0));
10578 int NumVAddrDwords =
10584 NumVDataDwords, NumVAddrDwords);
10585 }
else if (IsGFX12Plus) {
10587 NumVDataDwords, NumVAddrDwords);
10588 }
else if (IsGFX11Plus) {
10590 UseNSA ? AMDGPU::MIMGEncGfx11NSA
10591 : AMDGPU::MIMGEncGfx11Default,
10592 NumVDataDwords, NumVAddrDwords);
10593 }
else if (IsGFX10Plus) {
10595 UseNSA ? AMDGPU::MIMGEncGfx10NSA
10596 : AMDGPU::MIMGEncGfx10Default,
10597 NumVDataDwords, NumVAddrDwords);
10599 if (Subtarget->hasGFX90AInsts()) {
10601 NumVDataDwords, NumVAddrDwords);
10602 if (Opcode == -1) {
10605 "requested image instruction is not supported on this GPU",
10606 DL.getDebugLoc()));
10610 for (EVT VT : OrigResultTypes) {
10611 if (VT == MVT::Other)
10612 RetValues[Idx++] =
Op.getOperand(0);
10620 if (Opcode == -1 &&
10623 NumVDataDwords, NumVAddrDwords);
10626 NumVDataDwords, NumVAddrDwords);
10633 MachineMemOperand *MemRef = MemOp->getMemOperand();
10652 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
10653 NumVDataDwords, IsAtomicPacked16Bit,
DL);
10666 MachinePointerInfo(),
10671 if (!
Offset->isDivergent()) {
10678 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10687 !Subtarget->hasScalarDwordx3Loads()) {
10691 AMDGPUISD::SBUFFER_LOAD,
DL, DAG.
getVTList(WidenedVT),
Ops, WidenedVT,
10714 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10716 return handleByteShortBufferLoads(DAG, VT,
DL,
Ops, MMO);
10720 unsigned NumLoads = 1;
10726 if (NumElts == 8 || NumElts == 16) {
10727 NumLoads = NumElts / 4;
10731 SDVTList VTList = DAG.
getVTList({LoadVT, MVT::Other});
10736 NumLoads > 1 ?
Align(16 * NumLoads) :
Align(4));
10738 uint64_t InstOffset =
Ops[5]->getAsZExtVal();
10740 for (
unsigned i = 0; i < NumLoads; ++i) {
10743 Loads.
push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD,
DL, VTList,
Ops,
10744 LoadVT, LoadMMO, DAG));
10747 if (NumElts == 8 || NumElts == 16)
10755 if (!Subtarget->hasArchitectedSGPRs())
10760 return DAG.
getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10767 unsigned Width)
const {
10769 using namespace AMDGPU::Hwreg;
10771 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10810 auto *MFI = MF.
getInfo<SIMachineFunctionInfo>();
10812 EVT VT =
Op.getValueType();
10814 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
10818 switch (IntrinsicID) {
10819 case Intrinsic::amdgcn_wave_reduce_min:
10820 case Intrinsic::amdgcn_wave_reduce_umin:
10821 case Intrinsic::amdgcn_wave_reduce_fmin:
10822 case Intrinsic::amdgcn_wave_reduce_max:
10823 case Intrinsic::amdgcn_wave_reduce_umax:
10824 case Intrinsic::amdgcn_wave_reduce_fmax:
10825 case Intrinsic::amdgcn_wave_reduce_add:
10826 case Intrinsic::amdgcn_wave_reduce_fadd:
10827 case Intrinsic::amdgcn_wave_reduce_sub:
10828 case Intrinsic::amdgcn_wave_reduce_fsub:
10829 case Intrinsic::amdgcn_wave_reduce_and:
10830 case Intrinsic::amdgcn_wave_reduce_or:
10831 case Intrinsic::amdgcn_wave_reduce_xor: {
10832 EVT SrcVT =
Op.getOperand(1).getValueType();
10835 bool NeedsSignExt = IntrinsicID == Intrinsic::amdgcn_wave_reduce_min ||
10836 IntrinsicID == Intrinsic::amdgcn_wave_reduce_max ||
10837 IntrinsicID == Intrinsic::amdgcn_wave_reduce_add ||
10838 IntrinsicID == Intrinsic::amdgcn_wave_reduce_sub;
10842 auto SrcType = IsFPOp ? MVT::f16 : MVT::i16;
10843 auto ExtType = IsFPOp ? MVT::f32 : MVT::i32;
10847 Op.getOperand(0), ExtendedSrc, Strategy);
10856 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10859 return getPreloadedValue(DAG, *MFI, VT,
10862 case Intrinsic::amdgcn_dispatch_ptr:
10863 case Intrinsic::amdgcn_queue_ptr: {
10864 if (!Subtarget->isAmdHsaOrMesa(MF.
getFunction())) {
10866 MF.
getFunction(),
"unsupported hsa intrinsic without hsa target",
10867 DL.getDebugLoc()));
10871 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10874 return getPreloadedValue(DAG, *MFI, VT, RegID);
10876 case Intrinsic::amdgcn_implicitarg_ptr: {
10878 return getImplicitArgPtr(DAG,
DL);
10879 return getPreloadedValue(DAG, *MFI, VT,
10882 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10888 return getPreloadedValue(DAG, *MFI, VT,
10891 case Intrinsic::amdgcn_dispatch_id: {
10894 case Intrinsic::amdgcn_rcp:
10895 return DAG.
getNode(AMDGPUISD::RCP,
DL, VT,
Op.getOperand(1));
10896 case Intrinsic::amdgcn_rsq:
10897 return DAG.
getNode(AMDGPUISD::RSQ,
DL, VT,
Op.getOperand(1));
10898 case Intrinsic::amdgcn_rsq_legacy:
10902 case Intrinsic::amdgcn_rcp_legacy:
10905 return DAG.
getNode(AMDGPUISD::RCP_LEGACY,
DL, VT,
Op.getOperand(1));
10906 case Intrinsic::amdgcn_fma_legacy:
10907 if (!Subtarget->hasFmaLegacy32Insts())
10910 case Intrinsic::amdgcn_sudot4:
10911 case Intrinsic::amdgcn_sudot8:
10912 if (!Subtarget->hasDot8Insts())
10915 case Intrinsic::amdgcn_tanh:
10916 if (!Subtarget->hasTanhInsts())
10919 case Intrinsic::amdgcn_rsq_clamp: {
10921 return DAG.
getNode(AMDGPUISD::RSQ_CLAMP,
DL, VT,
Op.getOperand(1));
10933 case Intrinsic::r600_read_ngroups_x:
10934 if (Subtarget->isAmdHsaOS())
10937 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10940 case Intrinsic::r600_read_ngroups_y:
10941 if (Subtarget->isAmdHsaOS())
10944 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10947 case Intrinsic::r600_read_ngroups_z:
10948 if (Subtarget->isAmdHsaOS())
10951 return lowerKernargMemParameter(DAG, VT, VT,
DL, DAG.
getEntryNode(),
10954 case Intrinsic::r600_read_local_size_x:
10955 if (Subtarget->isAmdHsaOS())
10958 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10960 case Intrinsic::r600_read_local_size_y:
10961 if (Subtarget->isAmdHsaOS())
10964 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10966 case Intrinsic::r600_read_local_size_z:
10967 if (Subtarget->isAmdHsaOS())
10970 return lowerImplicitZextParam(DAG,
Op, MVT::i16,
10972 case Intrinsic::amdgcn_workgroup_id_x:
10973 return lowerWorkGroupId(DAG, *MFI, VT,
10977 case Intrinsic::amdgcn_workgroup_id_y:
10978 return lowerWorkGroupId(DAG, *MFI, VT,
10982 case Intrinsic::amdgcn_workgroup_id_z:
10983 return lowerWorkGroupId(DAG, *MFI, VT,
10987 case Intrinsic::amdgcn_cluster_id_x:
10988 return Subtarget->hasClusters()
10989 ? getPreloadedValue(DAG, *MFI, VT,
10991 : DAG.getPOISON(VT);
10992 case Intrinsic::amdgcn_cluster_id_y:
10993 return Subtarget->hasClusters()
10994 ? getPreloadedValue(DAG, *MFI, VT,
10997 case Intrinsic::amdgcn_cluster_id_z:
10998 return Subtarget->hasClusters()
10999 ? getPreloadedValue(DAG, *MFI, VT,
11002 case Intrinsic::amdgcn_cluster_workgroup_id_x:
11003 return Subtarget->hasClusters()
11004 ? getPreloadedValue(
11008 case Intrinsic::amdgcn_cluster_workgroup_id_y:
11009 return Subtarget->hasClusters()
11010 ? getPreloadedValue(
11014 case Intrinsic::amdgcn_cluster_workgroup_id_z:
11015 return Subtarget->hasClusters()
11016 ? getPreloadedValue(
11020 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
11021 return Subtarget->hasClusters()
11024 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
11025 return Subtarget->hasClusters()
11026 ? getPreloadedValue(
11030 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
11031 return Subtarget->hasClusters()
11032 ? getPreloadedValue(
11036 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
11037 return Subtarget->hasClusters()
11038 ? getPreloadedValue(
11042 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
11043 return Subtarget->hasClusters()
11044 ? getPreloadedValue(
11048 case Intrinsic::amdgcn_wave_id:
11049 return lowerWaveID(DAG,
Op);
11050 case Intrinsic::amdgcn_lds_kernel_id: {
11052 return getLDSKernelId(DAG,
DL);
11053 return getPreloadedValue(DAG, *MFI, VT,
11056 case Intrinsic::amdgcn_workitem_id_x:
11057 return lowerWorkitemID(DAG,
Op, 0, MFI->getArgInfo().WorkItemIDX);
11058 case Intrinsic::amdgcn_workitem_id_y:
11059 return lowerWorkitemID(DAG,
Op, 1, MFI->getArgInfo().WorkItemIDY);
11060 case Intrinsic::amdgcn_workitem_id_z:
11061 return lowerWorkitemID(DAG,
Op, 2, MFI->getArgInfo().WorkItemIDZ);
11062 case Intrinsic::amdgcn_wavefrontsize:
11064 SDLoc(
Op), MVT::i32);
11065 case Intrinsic::amdgcn_s_buffer_load: {
11066 unsigned CPol =
Op.getConstantOperandVal(3);
11073 return lowerSBuffer(VT,
DL,
Op.getOperand(1),
Op.getOperand(2),
11074 Op.getOperand(3), DAG);
11076 case Intrinsic::amdgcn_fdiv_fast:
11077 return lowerFDIV_FAST(
Op, DAG);
11078 case Intrinsic::amdgcn_sin:
11079 return DAG.
getNode(AMDGPUISD::SIN_HW,
DL, VT,
Op.getOperand(1));
11081 case Intrinsic::amdgcn_cos:
11082 return DAG.
getNode(AMDGPUISD::COS_HW,
DL, VT,
Op.getOperand(1));
11084 case Intrinsic::amdgcn_mul_u24:
11085 return DAG.
getNode(AMDGPUISD::MUL_U24,
DL, VT,
Op.getOperand(1),
11087 case Intrinsic::amdgcn_mul_i24:
11088 return DAG.
getNode(AMDGPUISD::MUL_I24,
DL, VT,
Op.getOperand(1),
11091 case Intrinsic::amdgcn_log_clamp: {
11097 case Intrinsic::amdgcn_fract:
11098 return DAG.
getNode(AMDGPUISD::FRACT,
DL, VT,
Op.getOperand(1));
11100 case Intrinsic::amdgcn_class:
11101 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, VT,
Op.getOperand(1),
11103 case Intrinsic::amdgcn_div_fmas:
11104 return DAG.
getNode(AMDGPUISD::DIV_FMAS,
DL, VT,
Op.getOperand(1),
11105 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
11107 case Intrinsic::amdgcn_div_fixup:
11108 return DAG.
getNode(AMDGPUISD::DIV_FIXUP,
DL, VT,
Op.getOperand(1),
11109 Op.getOperand(2),
Op.getOperand(3));
11111 case Intrinsic::amdgcn_div_scale: {
11117 SDValue Denominator =
Op.getOperand(2);
11124 SDValue Src0 =
Param->isAllOnes() ? Numerator : Denominator;
11126 return DAG.
getNode(AMDGPUISD::DIV_SCALE,
DL,
Op->getVTList(), Src0,
11127 Denominator, Numerator);
11129 case Intrinsic::amdgcn_icmp: {
11131 if (
Op.getOperand(1).getValueType() == MVT::i1 &&
11132 Op.getConstantOperandVal(2) == 0 &&
11137 case Intrinsic::amdgcn_fcmp: {
11140 case Intrinsic::amdgcn_ballot:
11142 case Intrinsic::amdgcn_fmed3:
11143 return DAG.
getNode(AMDGPUISD::FMED3,
DL, VT,
Op.getOperand(1),
11144 Op.getOperand(2),
Op.getOperand(3),
Op->getFlags());
11145 case Intrinsic::amdgcn_fdot2:
11146 return DAG.
getNode(AMDGPUISD::FDOT2,
DL, VT,
Op.getOperand(1),
11147 Op.getOperand(2),
Op.getOperand(3),
Op.getOperand(4));
11148 case Intrinsic::amdgcn_fmul_legacy:
11149 return DAG.
getNode(AMDGPUISD::FMUL_LEGACY,
DL, VT,
Op.getOperand(1),
11151 case Intrinsic::amdgcn_sbfe:
11152 return DAG.
getNode(AMDGPUISD::BFE_I32,
DL, VT,
Op.getOperand(1),
11153 Op.getOperand(2),
Op.getOperand(3));
11154 case Intrinsic::amdgcn_ubfe:
11155 return DAG.
getNode(AMDGPUISD::BFE_U32,
DL, VT,
Op.getOperand(1),
11156 Op.getOperand(2),
Op.getOperand(3));
11157 case Intrinsic::amdgcn_cvt_pkrtz:
11158 case Intrinsic::amdgcn_cvt_pknorm_i16:
11159 case Intrinsic::amdgcn_cvt_pknorm_u16:
11160 case Intrinsic::amdgcn_cvt_pk_i16:
11161 case Intrinsic::amdgcn_cvt_pk_u16: {
11163 EVT VT =
Op.getValueType();
11166 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
11167 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
11168 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
11169 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
11170 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
11171 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
11172 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
11173 Opcode = AMDGPUISD::CVT_PK_I16_I32;
11175 Opcode = AMDGPUISD::CVT_PK_U16_U32;
11178 return DAG.
getNode(Opcode,
DL, VT,
Op.getOperand(1),
Op.getOperand(2));
11181 DAG.
getNode(Opcode,
DL, MVT::i32,
Op.getOperand(1),
Op.getOperand(2));
11184 case Intrinsic::amdgcn_fmad_ftz:
11185 return DAG.
getNode(AMDGPUISD::FMAD_FTZ,
DL, VT,
Op.getOperand(1),
11186 Op.getOperand(2),
Op.getOperand(3));
11188 case Intrinsic::amdgcn_if_break:
11190 Op->getOperand(1),
Op->getOperand(2)),
11193 case Intrinsic::amdgcn_groupstaticsize: {
11199 const GlobalValue *GV =
11205 case Intrinsic::amdgcn_is_shared:
11206 case Intrinsic::amdgcn_is_private: {
11213 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
11217 Subtarget->hasGloballyAddressableScratch()) {
11220 AMDGPU::S_MOV_B32,
DL, MVT::i32,
11221 DAG.
getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
11230 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
11233 case Intrinsic::amdgcn_perm:
11234 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op.getOperand(1),
11235 Op.getOperand(2),
Op.getOperand(3));
11236 case Intrinsic::amdgcn_reloc_constant: {
11246 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
11247 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
11248 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
11249 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
11250 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
11251 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
11252 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
11253 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
11254 if (
Op.getOperand(4).getValueType() == MVT::i32)
11260 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
11261 Op.getOperand(3), IndexKeyi32);
11263 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
11264 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
11265 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
11266 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
11267 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
11268 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
11269 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
11270 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
11271 if (
Op.getOperand(4).getValueType() == MVT::i64)
11276 Op.getOperand(4).getValueType() == MVT::v2i32
11280 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11281 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
11282 Op.getOperand(6)});
11284 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
11285 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
11286 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
11287 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
11288 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
11289 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
11290 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
11293 if (
Op.getOperand(6).getValueType() == IndexKeyTy)
11298 Op.getOperand(6).getValueType().isVector()
11302 Op.getOperand(0),
Op.getOperand(1),
Op.getOperand(2),
11303 Op.getOperand(3),
Op.getOperand(4),
Op.getOperand(5),
11304 IndexKey,
Op.getOperand(7),
Op.getOperand(8)};
11305 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
11306 Args.push_back(
Op.getOperand(9));
11309 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
11310 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
11311 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
11312 if (
Op.getOperand(6).getValueType() == MVT::i32)
11318 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11319 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11320 IndexKeyi32, Op.getOperand(7)});
11322 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
11323 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
11324 unsigned AFmt = (unsigned)
Op.getConstantOperandVal(1);
11325 unsigned BFmt = (unsigned)
Op.getConstantOperandVal(3);
11326 unsigned AScaleFmt = (unsigned)
Op.getConstantOperandVal(8);
11327 unsigned BScaleFmt = (unsigned)
Op.getConstantOperandVal(11);
11331 "invalid matrix and scale format combination in wmma call");
11337 case Intrinsic::amdgcn_addrspacecast_nonnull:
11338 return lowerADDRSPACECAST(
Op, DAG);
11339 case Intrinsic::amdgcn_readlane:
11340 case Intrinsic::amdgcn_readfirstlane:
11341 case Intrinsic::amdgcn_writelane:
11342 case Intrinsic::amdgcn_permlane16:
11343 case Intrinsic::amdgcn_permlanex16:
11344 case Intrinsic::amdgcn_permlane64:
11345 case Intrinsic::amdgcn_set_inactive:
11346 case Intrinsic::amdgcn_set_inactive_chain_arg:
11347 case Intrinsic::amdgcn_mov_dpp8:
11348 case Intrinsic::amdgcn_update_dpp:
11349 case Intrinsic::amdgcn_permlane_bcast:
11350 case Intrinsic::amdgcn_permlane_up:
11351 case Intrinsic::amdgcn_permlane_down:
11352 case Intrinsic::amdgcn_permlane_xor:
11354 case Intrinsic::amdgcn_dead: {
11356 for (
const EVT ValTy :
Op.getNode()->values())
11360 case Intrinsic::amdgcn_wave_shuffle:
11363 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11365 return lowerImage(
Op, ImageDimIntr, DAG,
false);
11375 if (Subtarget->hasRestrictedSOffset() &&
isNullConstant(SOffset))
11376 return DAG.
getRegister(AMDGPU::SGPR_NULL, MVT::i32);
11382 unsigned NewOpcode)
const {
11386 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11387 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11405 M->getMemOperand());
11410 unsigned NewOpcode)
const {
11414 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
11415 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11433 M->getMemOperand());
11438 unsigned IntrID =
Op.getConstantOperandVal(1);
11442 case Intrinsic::amdgcn_ds_ordered_add:
11443 case Intrinsic::amdgcn_ds_ordered_swap: {
11448 unsigned IndexOperand =
M->getConstantOperandVal(7);
11449 unsigned WaveRelease =
M->getConstantOperandVal(8);
11450 unsigned WaveDone =
M->getConstantOperandVal(9);
11452 unsigned OrderedCountIndex = IndexOperand & 0x3f;
11453 IndexOperand &= ~0x3f;
11454 unsigned CountDw = 0;
11457 CountDw = (IndexOperand >> 24) & 0xf;
11458 IndexOperand &= ~(0xf << 24);
11460 if (CountDw < 1 || CountDw > 4) {
11463 Fn,
"ds_ordered_count: dword count must be between 1 and 4",
11464 DL.getDebugLoc()));
11469 if (IndexOperand) {
11472 Fn,
"ds_ordered_count: bad index operand",
DL.getDebugLoc()));
11475 if (WaveDone && !WaveRelease) {
11479 Fn,
"ds_ordered_count: wave_done requires wave_release",
11480 DL.getDebugLoc()));
11483 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
11484 unsigned ShaderType =
11486 unsigned Offset0 = OrderedCountIndex << 2;
11487 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
11490 Offset1 |= (CountDw - 1) << 6;
11493 Offset1 |= ShaderType << 2;
11495 unsigned Offset = Offset0 | (Offset1 << 8);
11502 M->getVTList(),
Ops,
M->getMemoryVT(),
11503 M->getMemOperand());
11505 case Intrinsic::amdgcn_raw_buffer_load:
11506 case Intrinsic::amdgcn_raw_ptr_buffer_load:
11507 case Intrinsic::amdgcn_raw_atomic_buffer_load:
11508 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
11509 case Intrinsic::amdgcn_raw_buffer_load_format:
11510 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
11511 const bool IsFormat =
11512 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
11513 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
11515 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11516 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
11530 return lowerIntrinsicLoad(M, IsFormat, DAG,
Ops);
11532 case Intrinsic::amdgcn_struct_buffer_load:
11533 case Intrinsic::amdgcn_struct_ptr_buffer_load:
11534 case Intrinsic::amdgcn_struct_buffer_load_format:
11535 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
11536 case Intrinsic::amdgcn_struct_atomic_buffer_load:
11537 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
11538 const bool IsFormat =
11539 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
11540 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
11542 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11543 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11558 case Intrinsic::amdgcn_raw_tbuffer_load:
11559 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
11561 EVT LoadVT =
Op.getValueType();
11562 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11563 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(3), DAG);
11579 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11581 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
11582 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
11585 case Intrinsic::amdgcn_struct_tbuffer_load:
11586 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
11588 EVT LoadVT =
Op.getValueType();
11589 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
11590 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
11606 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11608 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT,
DL,
11609 Op->getVTList(),
Ops, LoadVT,
M->getMemOperand(),
11612 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
11613 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
11614 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
11615 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
11616 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
11617 return lowerStructBufferAtomicIntrin(
Op, DAG,
11618 AMDGPUISD::BUFFER_ATOMIC_FADD);
11619 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
11620 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
11621 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
11622 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
11623 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
11624 return lowerStructBufferAtomicIntrin(
Op, DAG,
11625 AMDGPUISD::BUFFER_ATOMIC_FMIN);
11626 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
11627 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
11628 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
11629 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
11630 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
11631 return lowerStructBufferAtomicIntrin(
Op, DAG,
11632 AMDGPUISD::BUFFER_ATOMIC_FMAX);
11633 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
11634 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
11635 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
11636 case Intrinsic::amdgcn_raw_buffer_atomic_add:
11637 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
11638 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11639 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
11640 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
11641 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11642 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
11643 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
11644 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
11645 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
11646 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
11647 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
11648 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
11649 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
11650 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
11651 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
11652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
11653 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
11654 case Intrinsic::amdgcn_raw_buffer_atomic_and:
11655 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
11656 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11657 case Intrinsic::amdgcn_raw_buffer_atomic_or:
11658 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
11659 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11660 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
11661 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
11662 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11663 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
11664 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
11665 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11666 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
11667 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
11668 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11669 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
11670 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
11671 return lowerStructBufferAtomicIntrin(
Op, DAG,
11672 AMDGPUISD::BUFFER_ATOMIC_SWAP);
11673 case Intrinsic::amdgcn_struct_buffer_atomic_add:
11674 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
11675 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11676 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
11677 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
11678 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11679 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
11680 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
11681 return lowerStructBufferAtomicIntrin(
Op, DAG,
11682 AMDGPUISD::BUFFER_ATOMIC_SMIN);
11683 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
11684 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
11685 return lowerStructBufferAtomicIntrin(
Op, DAG,
11686 AMDGPUISD::BUFFER_ATOMIC_UMIN);
11687 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
11688 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
11689 return lowerStructBufferAtomicIntrin(
Op, DAG,
11690 AMDGPUISD::BUFFER_ATOMIC_SMAX);
11691 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
11692 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
11693 return lowerStructBufferAtomicIntrin(
Op, DAG,
11694 AMDGPUISD::BUFFER_ATOMIC_UMAX);
11695 case Intrinsic::amdgcn_struct_buffer_atomic_and:
11696 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
11697 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11698 case Intrinsic::amdgcn_struct_buffer_atomic_or:
11699 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
11700 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11701 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
11702 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
11703 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11704 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
11705 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
11706 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11707 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
11708 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
11709 return lowerStructBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11710 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
11711 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
11712 return lowerRawBufferAtomicIntrin(
Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
11713 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
11714 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
11715 return lowerStructBufferAtomicIntrin(
Op, DAG,
11716 AMDGPUISD::BUFFER_ATOMIC_CSUB);
11717 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
11718 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
11719 return lowerRawBufferAtomicIntrin(
Op, DAG,
11720 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11721 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
11722 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
11723 return lowerStructBufferAtomicIntrin(
Op, DAG,
11724 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11725 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
11726 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
11727 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(4), DAG);
11728 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
11742 EVT VT =
Op.getValueType();
11746 Op->getVTList(),
Ops, VT,
11747 M->getMemOperand());
11749 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11750 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11751 SDValue Rsrc = bufferRsrcPtrToVector(
Op->getOperand(4), DAG);
11752 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(6), DAG);
11766 EVT VT =
Op.getValueType();
11770 Op->getVTList(),
Ops, VT,
11771 M->getMemOperand());
11773 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11774 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11776 SDValue NodePtr =
M->getOperand(2);
11777 SDValue RayExtent =
M->getOperand(3);
11778 SDValue InstanceMask =
M->getOperand(4);
11779 SDValue RayOrigin =
M->getOperand(5);
11780 SDValue RayDir =
M->getOperand(6);
11782 SDValue TDescr =
M->getOperand(8);
11787 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11792 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11793 const unsigned NumVDataDwords = 10;
11794 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11796 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11797 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11798 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11802 Ops.push_back(NodePtr);
11805 {DAG.getBitcast(MVT::i32, RayExtent),
11806 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11807 Ops.push_back(RayOrigin);
11808 Ops.push_back(RayDir);
11809 Ops.push_back(Offsets);
11810 Ops.push_back(TDescr);
11811 Ops.push_back(
M->getChain());
11814 MachineMemOperand *MemRef =
M->getMemOperand();
11818 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11820 SDValue NodePtr =
M->getOperand(2);
11821 SDValue RayExtent =
M->getOperand(3);
11822 SDValue RayOrigin =
M->getOperand(4);
11823 SDValue RayDir =
M->getOperand(5);
11824 SDValue RayInvDir =
M->getOperand(6);
11825 SDValue TDescr =
M->getOperand(7);
11832 if (!Subtarget->hasGFX10_AEncoding()) {
11842 const unsigned NumVDataDwords = 4;
11843 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11844 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11845 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11848 const unsigned BaseOpcodes[2][2] = {
11849 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11850 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11851 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11855 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11856 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11857 : AMDGPU::MIMGEncGfx10NSA,
11858 NumVDataDwords, NumVAddrDwords);
11862 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11863 : AMDGPU::MIMGEncGfx10Default,
11864 NumVDataDwords, NumVAddrDwords);
11870 auto packLanes = [&DAG, &
Ops, &
DL](
SDValue Op,
bool IsAligned) {
11873 if (Lanes[0].getValueSizeInBits() == 32) {
11874 for (
unsigned I = 0;
I < 3; ++
I)
11881 Ops.push_back(Lanes[2]);
11893 if (UseNSA && IsGFX11Plus) {
11894 Ops.push_back(NodePtr);
11896 Ops.push_back(RayOrigin);
11901 for (
unsigned I = 0;
I < 3; ++
I) {
11904 {DirLanes[I], InvDirLanes[I]})));
11908 Ops.push_back(RayDir);
11909 Ops.push_back(RayInvDir);
11916 Ops.push_back(NodePtr);
11919 packLanes(RayOrigin,
true);
11920 packLanes(RayDir,
true);
11921 packLanes(RayInvDir,
false);
11926 if (NumVAddrDwords > 12) {
11934 Ops.push_back(MergedOps);
11937 Ops.push_back(TDescr);
11939 Ops.push_back(
M->getChain());
11942 MachineMemOperand *MemRef =
M->getMemOperand();
11946 case Intrinsic::amdgcn_global_atomic_fmin_num:
11947 case Intrinsic::amdgcn_global_atomic_fmax_num:
11948 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11949 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11956 unsigned Opcode = 0;
11958 case Intrinsic::amdgcn_global_atomic_fmin_num:
11959 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11963 case Intrinsic::amdgcn_global_atomic_fmax_num:
11964 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11971 return DAG.
getAtomic(Opcode, SDLoc(
Op),
M->getMemoryVT(),
M->getVTList(),
11972 Ops,
M->getMemOperand());
11974 case Intrinsic::amdgcn_s_alloc_vgpr: {
11982 ReadFirstLaneID, NumVGPRs);
11985 Op.getOperand(0),
Op.getOperand(1), NumVGPRs);
11987 case Intrinsic::amdgcn_s_get_barrier_state:
11988 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11995 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11996 BarID = (BarID >> 4) & 0x3F;
11997 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
12000 Ops.push_back(Chain);
12002 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
12003 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
12019 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
12020 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
12021 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
12025 EVT VT =
Op->getValueType(0);
12029 case Intrinsic::amdgcn_av_load_b128: {
12030 if (!Subtarget->hasFlatGlobalInsts()) {
12033 "llvm.amdgcn.av.load.b128 not supported on subtarget",
12034 DL.getDebugLoc()));
12041 EVT VT =
Op->getValueType(0);
12048 case Intrinsic::amdgcn_flat_load_monitor_b32:
12049 case Intrinsic::amdgcn_flat_load_monitor_b64:
12050 case Intrinsic::amdgcn_flat_load_monitor_b128: {
12055 Op->getVTList(), {Chain, Ptr},
12058 case Intrinsic::amdgcn_global_load_monitor_b32:
12059 case Intrinsic::amdgcn_global_load_monitor_b64:
12060 case Intrinsic::amdgcn_global_load_monitor_b128: {
12065 Op->getVTList(), {Chain, Ptr},
12070 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12072 return lowerImage(
Op, ImageDimIntr, DAG,
true);
12080SDValue SITargetLowering::getMemIntrinsicNode(
unsigned Opcode,
const SDLoc &
DL,
12087 EVT VT = VTList.
VTs[0];
12090 bool IsTFE = VTList.
NumVTs == 3;
12093 unsigned NumOpDWords = NumValueDWords + 1;
12095 SDVTList OpDWordsVTList = DAG.
getVTList(OpDWordsVT, VTList.
VTs[2]);
12096 MachineMemOperand *OpDWordsMMO =
12098 SDValue Op = getMemIntrinsicNode(Opcode,
DL, OpDWordsVTList,
Ops,
12099 OpDWordsVT, OpDWordsMMO, DAG);
12104 NumValueDWords == 1
12113 if (!Subtarget->hasDwordx3LoadStores() &&
12114 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
12118 SDVTList WidenedVTList = DAG.
getVTList(WidenedVT, VTList.
VTs[1]);
12120 WidenedMemVT, WidenedMMO);
12130 bool ImageStore)
const {
12140 if (Subtarget->hasUnpackedD16VMem()) {
12154 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
12165 for (
unsigned I = 0;
I < Elts.
size() / 2;
I += 1) {
12171 if ((NumElements % 2) == 1) {
12173 unsigned I = Elts.
size() / 2;
12189 if (NumElements == 3) {
12208 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12209 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12210 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12211 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
12212 case Intrinsic::amdgcn_load_async_to_lds:
12213 case Intrinsic::amdgcn_global_load_async_lds:
12223 unsigned IntrinsicID =
Op.getConstantOperandVal(1);
12225 switch (IntrinsicID) {
12226 case Intrinsic::amdgcn_exp_compr: {
12227 if (!Subtarget->hasCompressedExport()) {
12230 "intrinsic not supported on subtarget",
DL.getDebugLoc()));
12252 unsigned Opc =
Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
12256 case Intrinsic::amdgcn_struct_tbuffer_store:
12257 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
12259 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
12261 VData = handleD16VData(VData, DAG);
12262 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
12263 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
12277 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12278 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12281 M->getMemoryVT(),
M->getMemOperand());
12284 case Intrinsic::amdgcn_raw_tbuffer_store:
12285 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
12287 bool IsD16 = (VData.
getValueType().getScalarType() == MVT::f16);
12289 VData = handleD16VData(VData, DAG);
12290 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
12291 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
12305 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12306 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12309 M->getMemoryVT(),
M->getMemOperand());
12312 case Intrinsic::amdgcn_raw_buffer_store:
12313 case Intrinsic::amdgcn_raw_ptr_buffer_store:
12314 case Intrinsic::amdgcn_raw_buffer_store_format:
12315 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
12316 const bool IsFormat =
12317 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
12318 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
12325 VData = handleD16VData(VData, DAG);
12335 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
12336 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(4), DAG);
12350 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
12351 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
12356 return handleByteShortBufferStores(DAG, VDataVT,
DL,
Ops, M);
12359 M->getMemoryVT(),
M->getMemOperand());
12362 case Intrinsic::amdgcn_struct_buffer_store:
12363 case Intrinsic::amdgcn_struct_ptr_buffer_store:
12364 case Intrinsic::amdgcn_struct_buffer_store_format:
12365 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
12366 const bool IsFormat =
12367 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
12368 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
12376 VData = handleD16VData(VData, DAG);
12386 auto Rsrc = bufferRsrcPtrToVector(
Op.getOperand(3), DAG);
12387 auto [VOffset,
Offset] = splitBufferOffsets(
Op.getOperand(5), DAG);
12401 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
12402 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 :
Opc;
12406 EVT VDataType = VData.getValueType().getScalarType();
12408 return handleByteShortBufferStores(DAG, VDataType,
DL,
Ops, M);
12411 M->getMemoryVT(),
M->getMemOperand());
12413 case Intrinsic::amdgcn_raw_buffer_load_lds:
12414 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12415 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
12416 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12417 case Intrinsic::amdgcn_struct_buffer_load_lds:
12418 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12419 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
12420 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
12421 if (!Subtarget->hasVMemToLDSLoad())
12425 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
12426 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
12427 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
12428 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
12429 unsigned OpOffset = HasVIndex ? 1 : 0;
12430 SDValue VOffset =
Op.getOperand(5 + OpOffset);
12432 unsigned Size =
Op->getConstantOperandVal(4);
12438 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
12439 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
12440 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
12441 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
12444 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
12445 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
12446 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
12447 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
12450 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
12451 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
12452 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
12453 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
12456 if (!Subtarget->hasLDSLoadB96_B128())
12458 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
12459 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
12460 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
12461 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
12464 if (!Subtarget->hasLDSLoadB96_B128())
12466 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
12467 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
12468 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
12469 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
12477 if (HasVIndex && HasVOffset)
12481 else if (HasVIndex)
12482 Ops.push_back(
Op.getOperand(5));
12483 else if (HasVOffset)
12484 Ops.push_back(VOffset);
12486 SDValue Rsrc = bufferRsrcPtrToVector(
Op.getOperand(2), DAG);
12487 Ops.push_back(Rsrc);
12488 Ops.push_back(
Op.getOperand(6 + OpOffset));
12489 Ops.push_back(
Op.getOperand(7 + OpOffset));
12491 unsigned Aux =
Op.getConstantOperandVal(8 + OpOffset);
12514 case Intrinsic::amdgcn_load_to_lds:
12515 case Intrinsic::amdgcn_load_async_to_lds:
12516 case Intrinsic::amdgcn_global_load_lds:
12517 case Intrinsic::amdgcn_global_load_async_lds: {
12518 if (!Subtarget->hasVMemToLDSLoad())
12522 unsigned Size =
Op->getConstantOperandVal(4);
12527 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
12530 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
12533 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
12536 if (!Subtarget->hasLDSLoadB96_B128())
12538 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
12541 if (!Subtarget->hasLDSLoadB96_B128())
12543 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
12559 if (
LHS->isDivergent())
12563 RHS.getOperand(0).getValueType() == MVT::i32) {
12566 VOffset =
RHS.getOperand(0);
12570 Ops.push_back(Addr);
12578 Ops.push_back(VOffset);
12581 Ops.push_back(
Op.getOperand(5));
12583 unsigned Aux =
Op.getConstantOperandVal(6);
12598 case Intrinsic::amdgcn_end_cf:
12600 Op->getOperand(2), Chain),
12602 case Intrinsic::amdgcn_s_barrier_signal_var: {
12609 if (CntC && CntC->isZero()) {
12614 std::optional<uint64_t> BarVal;
12616 BarVal =
C->getZExtValue();
12620 BarVal = *Addr + GA->getOffset();
12623 unsigned BarID = (*BarVal >> 4) & 0x3F;
12625 Ops.push_back(Chain);
12627 Op->getVTList(),
Ops);
12633 case Intrinsic::amdgcn_s_barrier_init: {
12640 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
12641 ? AMDGPU::S_BARRIER_INIT_M0
12642 : AMDGPU::S_BARRIER_SIGNAL_M0;
12657 constexpr unsigned ShAmt = 16;
12669 case Intrinsic::amdgcn_s_wakeup_barrier: {
12670 if (!Subtarget->hasSWakeupBarrier())
12674 case Intrinsic::amdgcn_s_barrier_join: {
12683 switch (IntrinsicID) {
12686 case Intrinsic::amdgcn_s_barrier_join:
12687 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
12689 case Intrinsic::amdgcn_s_wakeup_barrier:
12690 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
12694 unsigned BarID = (BarVal >> 4) & 0x3F;
12697 Ops.push_back(Chain);
12699 switch (IntrinsicID) {
12702 case Intrinsic::amdgcn_s_barrier_join:
12703 Opc = AMDGPU::S_BARRIER_JOIN_M0;
12705 case Intrinsic::amdgcn_s_wakeup_barrier:
12706 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
12723 case Intrinsic::amdgcn_s_prefetch_data:
12724 case Intrinsic::amdgcn_s_prefetch_inst: {
12727 return Op.getOperand(0);
12730 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
12732 Chain, bufferRsrcPtrToVector(
Op.getOperand(2), DAG),
12739 Op->getVTList(),
Ops,
M->getMemoryVT(),
12740 M->getMemOperand());
12742 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
12743 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
12744 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
12752 case Intrinsic::amdgcn_av_store_b128: {
12753 if (!Subtarget->hasFlatGlobalInsts()) {
12756 "llvm.amdgcn.av.store.b128 not supported on subtarget",
12757 DL.getDebugLoc()));
12758 return Op->getOperand(0);
12767 if (
const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12769 return lowerImage(
Op, ImageDimIntr, DAG,
true);
12785 return PtrVT == MVT::i64;
12799std::pair<SDValue, SDValue>
12812 bool CheckNUW = Subtarget->hasGFX1250Insts();
12829 unsigned Overflow = ImmOffset & ~MaxImm;
12830 ImmOffset -= Overflow;
12831 if ((int32_t)Overflow < 0) {
12832 Overflow += ImmOffset;
12837 auto OverflowVal = DAG.
getConstant(Overflow,
DL, MVT::i32);
12856void SITargetLowering::setBufferOffsets(
SDValue CombinedOffset,
12858 Align Alignment)
const {
12860 SDLoc
DL(CombinedOffset);
12862 uint32_t
Imm =
C->getZExtValue();
12863 uint32_t SOffset, ImmOffset;
12864 if (
TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12875 bool CheckNUW = Subtarget->hasGFX1250Insts();
12878 uint32_t SOffset, ImmOffset;
12881 TII->splitMUBUFOffset(
Offset, SOffset, ImmOffset, Alignment)) {
12889 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12898SDValue SITargetLowering::bufferRsrcPtrToVector(
SDValue MaybePointer,
12901 return MaybePointer;
12915 SDValue NumRecords =
Op->getOperand(3);
12921 if (Subtarget->has45BitNumRecordsBufferResource()) {
12940 SDValue ExtShiftedStrideVec =
12952 DAG.
getNode(
ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12954 DAG.
getNode(
ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12959 auto [LowHalf, HighHalf] =
12960 DAG.
SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12970 NumRecords, Flags);
12982 bool IsTFE)
const {
12987 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12988 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12991 SDVTList VTs = DAG.
getVTList(MVT::v2i32, MVT::Other);
13003 ? AMDGPUISD::BUFFER_LOAD_UBYTE
13004 : AMDGPUISD::BUFFER_LOAD_USHORT;
13006 SDVTList ResList = DAG.
getVTList(MVT::i32, MVT::Other);
13020 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
13024 Ops[1] = BufferStoreExt;
13025 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
13026 : AMDGPUISD::BUFFER_STORE_SHORT;
13029 M->getMemOperand());
13054 DAGCombinerInfo &DCI)
const {
13055 SelectionDAG &DAG = DCI.DAG;
13070 if ((MemVT.
isSimple() && !DCI.isAfterLegalizeDAG()) ||
13077 "unexpected vector extload");
13090 "unexpected fp extload");
13108 DCI.AddToWorklist(Cvt.
getNode());
13113 DCI.AddToWorklist(Cvt.
getNode());
13124 if (Info.isEntryFunction())
13125 return Info.getUserSGPRInfo().hasFlatScratchInit();
13133 EVT MemVT =
Load->getMemoryVT();
13134 MachineMemOperand *MMO =
Load->getMemOperand();
13146 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
13174 assert(
Op.getValueType().getVectorElementType() == MVT::i32 &&
13175 "Custom lowering for non-i32 vectors hasn't been implemented.");
13178 unsigned AS =
Load->getAddressSpace();
13179 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13186 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
13190 !Subtarget->hasMultiDwordFlatScratchAddressing())
13200 Subtarget->getScalarizeGlobalBehavior() &&
Load->isSimple() &&
13203 Alignment >=
Align(4) && NumElements < 32) {
13205 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
13217 if (NumElements > 4)
13220 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13230 switch (Subtarget->getMaxPrivateElementSize()) {
13236 if (NumElements > 2)
13241 if (NumElements > 4)
13244 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13253 auto Flags =
Load->getMemOperand()->getFlags();
13255 Load->getAlign(), Flags, &
Fast) &&
13264 MemVT, *
Load->getMemOperand())) {
13273 EVT VT =
Op.getValueType();
13310 EVT VT =
Op.getValueType();
13311 const SDNodeFlags
Flags =
Op->getFlags();
13313 bool AllowInaccurateRcp =
Flags.hasApproximateFuncs();
13319 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
13322 if (CLHS->isOne()) {
13335 return DAG.
getNode(AMDGPUISD::RCP, SL, VT,
RHS);
13339 if (CLHS->isMinusOne()) {
13342 return DAG.
getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
13348 if (!AllowInaccurateRcp &&
13349 ((VT != MVT::f16 && VT != MVT::bf16) || !
Flags.hasAllowReciprocal()))
13363 EVT VT =
Op.getValueType();
13364 const SDNodeFlags
Flags =
Op->getFlags();
13366 bool AllowInaccurateDiv =
Flags.hasApproximateFuncs();
13367 if (!AllowInaccurateDiv)
13391 if (IsNegRcp || (CLHS && CLHS->
isOne()))
13403 return DAG.
getNode(Opcode, SL, VT,
A,
B, Flags);
13413 Opcode = AMDGPUISD::FMUL_W_CHAIN;
13417 return DAG.
getNode(Opcode, SL, VTList,
13426 return DAG.
getNode(Opcode, SL, VT, {
A,
B,
C}, Flags);
13436 Opcode = AMDGPUISD::FMA_W_CHAIN;
13440 return DAG.
getNode(Opcode, SL, VTList,
13446 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
13447 return FastLowered;
13450 EVT VT =
Op.getValueType();
13457 if (VT == MVT::bf16) {
13480 unsigned FMADOpCode =
13484 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt,
Op->getFlags());
13487 SDValue Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13489 Quot = DAG.
getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot,
Op->getFlags());
13490 Err = DAG.
getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13500 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst,
RHS,
LHS,
13506 SDNodeFlags
Flags =
Op->getFlags();
13516 const APFloat K0Val(0x1p+96f);
13519 const APFloat K1Val(0x1p-32f);
13546 assert(ST->hasDenormModeInst() &&
"Requires S_DENORM_MODE");
13547 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
13548 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
13553 if (
SDValue FastLowered = lowerFastUnsafeFDIV(
Op, DAG))
13554 return FastLowered;
13560 SDNodeFlags
Flags =
Op->getFlags();
13561 Flags.setNoFPExcept(
true);
13569 SDVTList ScaleVT = DAG.
getVTList(MVT::f32, MVT::i1);
13578 DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
13582 using namespace AMDGPU::Hwreg;
13583 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
13587 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
13588 const DenormalMode DenormMode =
Info->getMode().FP32Denormals;
13591 const bool HasDynamicDenormals =
13597 if (!PreservesDenormals) {
13602 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
13605 if (HasDynamicDenormals) {
13609 SavedDenormMode =
SDValue(GetReg, 0);
13615 SDNode *EnableDenorm;
13616 if (Subtarget->hasDenormModeInst()) {
13617 const SDValue EnableDenormValue =
13620 EnableDenorm = DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
13624 const SDValue EnableDenormValue =
13626 EnableDenorm = DAG.
getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
13627 {EnableDenormValue,
BitField, Glue});
13637 ApproxRcp, One, NegDivScale0, Flags);
13640 ApproxRcp, Fma0, Flags);
13646 NumeratorScaled,
Mul, Flags);
13652 NumeratorScaled, Fma3, Flags);
13654 if (!PreservesDenormals) {
13655 SDNode *DisableDenorm;
13656 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
13660 SDVTList BindParamVTs = DAG.
getVTList(MVT::Other, MVT::Glue);
13662 DAG.
getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
13666 assert(HasDynamicDenormals == (
bool)SavedDenormMode);
13667 const SDValue DisableDenormValue =
13668 HasDynamicDenormals
13673 AMDGPU::S_SETREG_B32, SL, MVT::Other,
13684 {Fma4, Fma1, Fma3, Scale},
Flags);
13686 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas,
RHS,
LHS, Flags);
13690 if (
SDValue FastLowered = lowerFastUnsafeFDIV64(
Op, DAG))
13691 return FastLowered;
13699 SDVTList ScaleVT = DAG.
getVTList(MVT::f64, MVT::i1);
13705 SDValue Rcp = DAG.
getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
13723 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
13753 DAG.
getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3,
Mul, Scale);
13755 return DAG.
getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas,
Y,
X);
13759 EVT VT =
Op.getValueType();
13761 if (VT == MVT::f32)
13762 return LowerFDIV32(
Op, DAG);
13764 if (VT == MVT::f64)
13765 return LowerFDIV64(
Op, DAG);
13767 if (VT == MVT::f16 || VT == MVT::bf16)
13768 return LowerFDIV16(
Op, DAG);
13777 EVT ResultExpVT =
Op->getValueType(1);
13778 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
13788 if (Subtarget->hasFractBug()) {
13806 EVT VT =
Store->getMemoryVT();
13808 if (VT == MVT::i1) {
13812 Store->getBasePtr(), MVT::i1,
Store->getMemOperand());
13816 Store->getValue().getValueType().getScalarType() == MVT::i32);
13818 unsigned AS =
Store->getAddressSpace();
13819 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13827 SIMachineFunctionInfo *MFI = MF.
getInfo<SIMachineFunctionInfo>();
13831 !Subtarget->hasMultiDwordFlatScratchAddressing())
13838 if (NumElements > 4)
13841 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13845 VT, *
Store->getMemOperand()))
13851 switch (Subtarget->getMaxPrivateElementSize()) {
13855 if (NumElements > 2)
13859 if (NumElements > 4 ||
13860 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13868 auto Flags =
Store->getMemOperand()->getFlags();
13887 assert(!Subtarget->has16BitInsts());
13888 SDNodeFlags
Flags =
Op->getFlags();
13902 SDNodeFlags
Flags =
Op->getFlags();
13903 MVT VT =
Op.getValueType().getSimpleVT();
14011 SDNodeFlags
Flags =
Op->getFlags();
14020 if (!
Flags.hasApproximateFuncs()) {
14052 if (!
Flags.hasApproximateFuncs()) {
14061 ScaleDownFactor, ZeroInt);
14068 if (
Flags.hasNoInfs()) {
14084 EVT VT =
Op.getValueType();
14095 if (!
V.getValueType().isVector())
14103 if (Subtarget->hasTrigReducedRange()) {
14105 TrigVal = UnrollIfVec(DAG.
getNode(AMDGPUISD::FRACT,
DL, VT, MulVal, Flags));
14110 switch (
Op.getOpcode()) {
14112 TrigVal = DAG.
getNode(AMDGPUISD::COS_HW, SDLoc(
Op), VT, TrigVal, Flags);
14115 TrigVal = DAG.
getNode(AMDGPUISD::SIN_HW, SDLoc(
Op), VT, TrigVal, Flags);
14121 return UnrollIfVec(TrigVal);
14141 EVT VT =
Op.getValueType();
14149 Op->getVTList(),
Ops, VT,
14158SITargetLowering::performUCharToFloatCombine(
SDNode *
N,
14159 DAGCombinerInfo &DCI)
const {
14160 EVT VT =
N->getValueType(0);
14162 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
14165 SelectionDAG &DAG = DCI.DAG;
14169 EVT SrcVT = Src.getValueType();
14175 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
14178 DCI.AddToWorklist(Cvt.
getNode());
14181 if (ScalarVT != MVT::f32) {
14193 DAGCombinerInfo &DCI)
const {
14204 SelectionDAG &DAG = DCI.DAG;
14223 for (
unsigned I = 0;
I != NumElts; ++
I) {
14247 if (NewElts.
size() == 1)
14269 for (
unsigned I = 0;
I != NumElts; ++
I) {
14304SDValue SITargetLowering::performSHLPtrCombine(
SDNode *
N,
unsigned AddrSpace,
14306 DAGCombinerInfo &DCI)
const {
14323 SelectionDAG &DAG = DCI.DAG;
14336 AM.BaseOffs =
Offset.getSExtValue();
14341 EVT VT =
N->getValueType(0);
14347 Flags.setNoUnsignedWrap(
14348 N->getFlags().hasNoUnsignedWrap() &&
14360 switch (
N->getOpcode()) {
14371 DAGCombinerInfo &DCI)
const {
14372 SelectionDAG &DAG = DCI.DAG;
14379 SDValue NewPtr = performSHLPtrCombine(Ptr.
getNode(),
N->getAddressSpace(),
14380 N->getMemoryVT(), DCI);
14384 NewOps[PtrIdx] = NewPtr;
14393 return (
Opc ==
ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
14394 (
Opc ==
ISD::OR && (Val == 0xffffffff || Val == 0)) ||
14403SDValue SITargetLowering::splitBinaryBitConstantOp(
14407 uint32_t ValLo =
Lo_32(Val);
14408 uint32_t ValHi =
Hi_32(Val);
14415 if (Subtarget->has64BitLiterals() && CRHS->
hasOneUse() &&
14429 if (V.getValueType() != MVT::i1)
14431 switch (V.getOpcode()) {
14436 case AMDGPUISD::FP_CLASS:
14448 return V.getResNo() == 1;
14450 unsigned IntrinsicID = V.getConstantOperandVal(0);
14451 switch (IntrinsicID) {
14452 case Intrinsic::amdgcn_is_shared:
14453 case Intrinsic::amdgcn_is_private:
14470 if (!(
C & 0x000000ff))
14471 ZeroByteMask |= 0x000000ff;
14472 if (!(
C & 0x0000ff00))
14473 ZeroByteMask |= 0x0000ff00;
14474 if (!(
C & 0x00ff0000))
14475 ZeroByteMask |= 0x00ff0000;
14476 if (!(
C & 0xff000000))
14477 ZeroByteMask |= 0xff000000;
14478 uint32_t NonZeroByteMask = ~ZeroByteMask;
14479 if ((NonZeroByteMask &
C) != NonZeroByteMask)
14492 assert(V.getValueSizeInBits() == 32);
14494 if (V.getNumOperands() != 2)
14503 switch (V.getOpcode()) {
14508 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
14513 return (0x03020100 & ~ConstMask) | ConstMask;
14520 return uint32_t((0x030201000c0c0c0cull <<
C) >> 32);
14526 return uint32_t(0x0c0c0c0c03020100ull >>
C);
14533 DAGCombinerInfo &DCI)
const {
14534 if (DCI.isBeforeLegalize())
14537 SelectionDAG &DAG = DCI.DAG;
14538 EVT VT =
N->getValueType(0);
14543 if (VT == MVT::i64 && CRHS) {
14545 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::AND,
LHS, CRHS))
14549 if (CRHS && VT == MVT::i32) {
14559 unsigned Shift = CShift->getZExtValue();
14561 unsigned Offset = NB + Shift;
14562 if ((
Offset & (Bits - 1)) == 0) {
14565 DAG.
getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
LHS->getOperand(0),
14586 Sel = (
LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
14588 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14601 if (
Y.getOpcode() !=
ISD::FABS ||
Y.getOperand(0) !=
X ||
14606 if (
X !=
LHS.getOperand(1))
14610 const ConstantFPSDNode *C1 =
14627 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
X,
14633 if (
RHS.getOpcode() ==
ISD::SETCC &&
LHS.getOpcode() == AMDGPUISD::FP_CLASS)
14636 if (
LHS.getOpcode() ==
ISD::SETCC &&
RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14644 (
RHS.getOperand(0) ==
LHS.getOperand(0) &&
14645 LHS.getOperand(0) ==
LHS.getOperand(1))) {
14647 unsigned NewMask = LCC ==
ISD::SETO ?
Mask->getZExtValue() & ~OrdMask
14648 :
Mask->getZExtValue() & OrdMask;
14651 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1,
RHS.getOperand(0),
14669 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14672 if (LHSMask != ~0u && RHSMask != ~0u) {
14675 if (LHSMask > RHSMask) {
14682 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14683 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14686 if (!(LHSUsedLanes & RHSUsedLanes) &&
14689 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14695 uint32_t
Mask = LHSMask & RHSMask;
14696 for (
unsigned I = 0;
I < 32;
I += 8) {
14697 uint32_t ByteSel = 0xff <<
I;
14698 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
14699 Mask &= (0x0c <<
I) & 0xffffffff;
14704 uint32_t Sel =
Mask | (LHSUsedLanes & 0x04040404);
14707 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
14757static const std::optional<ByteProvider<SDValue>>
14759 unsigned Depth = 0) {
14762 return std::nullopt;
14764 if (
Op.getValueSizeInBits() < 8)
14765 return std::nullopt;
14767 if (
Op.getValueType().isVector())
14770 switch (
Op->getOpcode()) {
14783 NarrowVT = VTSign->getVT();
14786 return std::nullopt;
14789 if (SrcIndex >= NarrowByteWidth)
14790 return std::nullopt;
14798 return std::nullopt;
14800 uint64_t BitShift = ShiftOp->getZExtValue();
14802 if (BitShift % 8 != 0)
14803 return std::nullopt;
14805 uint64_t NewSrcIndex = SrcIndex + BitShift / 8;
14806 if (NewSrcIndex >=
Op.getScalarValueSizeInBits() / 8)
14807 return std::nullopt;
14826static const std::optional<ByteProvider<SDValue>>
14828 unsigned StartingIndex = 0) {
14832 return std::nullopt;
14834 unsigned BitWidth =
Op.getScalarValueSizeInBits();
14836 return std::nullopt;
14838 return std::nullopt;
14840 bool IsVec =
Op.getValueType().isVector();
14841 switch (
Op.getOpcode()) {
14844 return std::nullopt;
14849 return std::nullopt;
14853 return std::nullopt;
14856 if (!
LHS->isConstantZero() && !
RHS->isConstantZero())
14857 return std::nullopt;
14858 if (!
LHS ||
LHS->isConstantZero())
14860 if (!
RHS ||
RHS->isConstantZero())
14862 return std::nullopt;
14867 return std::nullopt;
14871 return std::nullopt;
14873 uint32_t BitMask = BitMaskOp->getZExtValue();
14875 uint32_t IndexMask = 0xFF << (Index * 8);
14877 if ((IndexMask & BitMask) != IndexMask) {
14880 if (IndexMask & BitMask)
14881 return std::nullopt;
14890 return std::nullopt;
14894 if (!ShiftOp ||
Op.getValueType().isVector())
14895 return std::nullopt;
14897 uint64_t BitsProvided =
Op.getValueSizeInBits();
14898 if (BitsProvided % 8 != 0)
14899 return std::nullopt;
14901 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14903 return std::nullopt;
14905 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14906 uint64_t ByteShift = BitShift / 8;
14908 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14909 uint64_t BytesProvided = BitsProvided / 8;
14910 SDValue NextOp =
Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14911 NewIndex %= BytesProvided;
14918 return std::nullopt;
14922 return std::nullopt;
14924 uint64_t BitShift = ShiftOp->getZExtValue();
14926 return std::nullopt;
14928 auto BitsProvided =
Op.getScalarValueSizeInBits();
14929 if (BitsProvided % 8 != 0)
14930 return std::nullopt;
14932 uint64_t BytesProvided = BitsProvided / 8;
14933 uint64_t ByteShift = BitShift / 8;
14934 if (Index + ByteShift < BytesProvided)
14936 Index + ByteShift);
14939 return std::nullopt;
14945 return std::nullopt;
14949 return std::nullopt;
14951 uint64_t BitShift = ShiftOp->getZExtValue();
14952 if (BitShift % 8 != 0)
14953 return std::nullopt;
14954 uint64_t ByteShift = BitShift / 8;
14960 return Index < ByteShift
14963 Depth + 1, StartingIndex);
14972 return std::nullopt;
14980 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14982 if (NarrowBitWidth % 8 != 0)
14983 return std::nullopt;
14984 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14986 if (Index >= NarrowByteWidth)
14988 ? std::optional<ByteProvider<SDValue>>(
14996 return std::nullopt;
15000 if (NarrowByteWidth >= Index) {
15005 return std::nullopt;
15012 return std::nullopt;
15018 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
15019 if (NarrowBitWidth % 8 != 0)
15020 return std::nullopt;
15021 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
15026 if (Index >= NarrowByteWidth) {
15028 ? std::optional<ByteProvider<SDValue>>(
15033 if (NarrowByteWidth > Index) {
15037 return std::nullopt;
15042 return std::nullopt;
15045 Depth + 1, StartingIndex);
15051 return std::nullopt;
15052 auto VecIdx = IdxOp->getZExtValue();
15053 auto ScalarSize =
Op.getScalarValueSizeInBits();
15054 if (ScalarSize < 32)
15055 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
15057 StartingIndex, Index);
15060 case AMDGPUISD::PERM: {
15062 return std::nullopt;
15066 return std::nullopt;
15069 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
15070 if (IdxMask > 0x07 && IdxMask != 0x0c)
15071 return std::nullopt;
15073 auto NextOp =
Op.getOperand(IdxMask > 0x03 ? 0 : 1);
15074 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
15076 return IdxMask != 0x0c ?
calculateSrcByte(NextOp, StartingIndex, NextIndex)
15082 return std::nullopt;
15097 return !OpVT.
isVector() && OpVT.getSizeInBits() == 16;
15104 auto MemVT = L->getMemoryVT();
15107 return L->getMemoryVT().getSizeInBits() == 16;
15117 int Low8 = Mask & 0xff;
15118 int Hi8 = (Mask & 0xff00) >> 8;
15120 assert(Low8 < 8 && Hi8 < 8);
15122 bool IsConsecutive = (Hi8 - Low8 == 1);
15127 bool Is16Aligned = !(Low8 % 2);
15129 return IsConsecutive && Is16Aligned;
15137 int Low16 = PermMask & 0xffff;
15138 int Hi16 = (PermMask & 0xffff0000) >> 16;
15148 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
15150 if (!OtherOpIs16Bit)
15158 unsigned DWordOffset) {
15163 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
15168 if (Src.getValueType().isVector()) {
15169 auto ScalarTySize = Src.getScalarValueSizeInBits();
15170 auto ScalarTy = Src.getValueType().getScalarType();
15171 if (ScalarTySize == 32) {
15175 if (ScalarTySize > 32) {
15178 DAG.
getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
15179 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
15186 assert(ScalarTySize < 32);
15187 auto NumElements =
TypeSize / ScalarTySize;
15188 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
15189 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
15190 auto NumElementsIn32 = 32 / ScalarTySize;
15191 auto NumAvailElements = DWordOffset < Trunc32Elements
15193 : NumElements - NormalizedTrunc;
15206 auto ShiftVal = 32 * DWordOffset;
15214 [[maybe_unused]]
EVT VT =
N->getValueType(0);
15219 for (
int i = 0; i < 4; i++) {
15221 std::optional<ByteProvider<SDValue>>
P =
15224 if (!
P ||
P->isConstantZero())
15229 if (PermNodes.
size() != 4)
15232 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
15233 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
15235 for (
size_t i = 0; i < PermNodes.
size(); i++) {
15236 auto PermOp = PermNodes[i];
15239 int SrcByteAdjust = 4;
15243 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
15244 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
15246 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
15247 ((PermOp.SrcOffset / 4) != SecondSrc->second))
15251 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
15252 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
15255 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
15257 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
15260 SDValue Op = *PermNodes[FirstSrc.first].Src;
15262 assert(
Op.getValueSizeInBits() == 32);
15266 int Low16 = PermMask & 0xffff;
15267 int Hi16 = (PermMask & 0xffff0000) >> 16;
15269 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
15270 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
15273 if (WellFormedLow && WellFormedHi)
15277 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src :
Op;
15286 (
N->getOperand(0) ==
Op ||
N->getOperand(0) == OtherOp) &&
15287 (
N->getOperand(1) ==
Op ||
N->getOperand(1) == OtherOp))
15292 assert(
Op.getValueType().isByteSized() &&
15303 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
Op, OtherOp,
15310 DAGCombinerInfo &DCI)
const {
15311 SelectionDAG &DAG = DCI.DAG;
15315 EVT VT =
N->getValueType(0);
15316 if (VT == MVT::i1) {
15318 if (
LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
15319 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
15321 if (Src !=
RHS.getOperand(0))
15326 if (!CLHS || !CRHS)
15330 static const uint32_t MaxMask = 0x3ff;
15335 return DAG.
getNode(AMDGPUISD::FP_CLASS,
DL, MVT::i1, Src,
15344 LHS.getOpcode() == AMDGPUISD::PERM &&
15350 Sel |=
LHS.getConstantOperandVal(2);
15352 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
15359 N->isDivergent() &&
TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15363 auto usesCombinedOperand = [](SDNode *OrUse) {
15366 !OrUse->getValueType(0).isVector())
15370 for (
auto *VUser : OrUse->users()) {
15371 if (!VUser->getValueType(0).isVector())
15378 if (VUser->getOpcode() == VectorwiseOp)
15384 if (!
any_of(
N->users(), usesCombinedOperand))
15390 if (LHSMask != ~0u && RHSMask != ~0u) {
15393 if (LHSMask > RHSMask) {
15400 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15401 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15404 if (!(LHSUsedLanes & RHSUsedLanes) &&
15407 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
15409 LHSMask &= ~RHSUsedLanes;
15410 RHSMask &= ~LHSUsedLanes;
15412 LHSMask |= LHSUsedLanes & 0x04040404;
15414 uint32_t Sel = LHSMask | RHSMask;
15417 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32,
LHS.getOperand(0),
15422 if (LHSMask == ~0u || RHSMask == ~0u) {
15463 return IdentitySrc;
15469 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
15484 if (SrcVT == MVT::i32) {
15489 DCI.AddToWorklist(LowOr.
getNode());
15490 DCI.AddToWorklist(HiBits.getNode());
15501 N->getOperand(0), CRHS))
15509 DAGCombinerInfo &DCI)
const {
15510 if (
SDValue RV = reassociateScalarOps(
N, DCI.DAG))
15517 SelectionDAG &DAG = DCI.DAG;
15519 EVT VT =
N->getValueType(0);
15520 if (CRHS && VT == MVT::i64) {
15522 splitBinaryBitConstantOp(DCI, SDLoc(
N),
ISD::XOR,
LHS, CRHS))
15529 unsigned Opc =
LHS.getOpcode();
15559 LHS->getOperand(0), FNegLHS, FNegRHS);
15568SITargetLowering::performZeroOrAnyExtendCombine(
SDNode *
N,
15569 DAGCombinerInfo &DCI)
const {
15570 if (!Subtarget->has16BitInsts() ||
15574 EVT VT =
N->getValueType(0);
15575 if (VT != MVT::i32)
15579 if (Src.getValueType() != MVT::i16)
15582 if (!Src->hasOneUse())
15589 std::optional<ByteProvider<SDValue>> BP0 =
15591 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
15595 std::optional<ByteProvider<SDValue>> BP1 =
15597 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
15605 SelectionDAG &DAG = DCI.DAG;
15607 uint32_t PermMask = 0x0c0c0c0c;
15610 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
15615 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
15618 return DAG.
getNode(AMDGPUISD::PERM,
DL, MVT::i32, V0,
V1,
15623SITargetLowering::performSignExtendInRegCombine(
SDNode *
N,
15624 DAGCombinerInfo &DCI)
const {
15630 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
15631 VTSign->getVT() == MVT::i8) ||
15632 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
15633 VTSign->getVT() == MVT::i16))) {
15634 assert(Subtarget->hasScalarSubwordLoads() &&
15635 "s_buffer_load_{u8, i8} are supported "
15636 "in GFX12 (or newer) architectures.");
15637 EVT VT = Src.getValueType();
15638 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
15639 ? AMDGPUISD::SBUFFER_LOAD_BYTE
15640 : AMDGPUISD::SBUFFER_LOAD_SHORT;
15642 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
15649 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
15650 Opc,
DL, ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
15654 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
15655 VTSign->getVT() == MVT::i8) ||
15656 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
15657 VTSign->getVT() == MVT::i16)) &&
15666 Src.getOperand(6), Src.getOperand(7)};
15669 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
15670 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
15671 ? AMDGPUISD::BUFFER_LOAD_BYTE
15672 : AMDGPUISD::BUFFER_LOAD_SHORT;
15673 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
15674 Opc, SDLoc(
N), ResList,
Ops,
M->getMemoryVT(),
M->getMemOperand());
15675 return DCI.DAG.getMergeValues(
15676 {BufferLoadSignExt, BufferLoadSignExt.
getValue(1)}, SDLoc(
N));
15682 DAGCombinerInfo &DCI)
const {
15683 SelectionDAG &DAG = DCI.DAG;
15690 if (
N->getOperand(0).isUndef())
15697 DAGCombinerInfo &DCI)
const {
15698 EVT VT =
N->getValueType(0);
15709 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(
N), VT, N0.
getOperand(0),
15718 unsigned MaxDepth)
const {
15719 unsigned Opcode =
Op.getOpcode();
15724 const auto &
F = CFP->getValueAPF();
15725 if (
F.isNaN() &&
F.isSignaling())
15727 if (!
F.isDenormal())
15759 case AMDGPUISD::FMUL_LEGACY:
15760 case AMDGPUISD::FMAD_FTZ:
15761 case AMDGPUISD::RCP:
15762 case AMDGPUISD::RSQ:
15763 case AMDGPUISD::RSQ_CLAMP:
15764 case AMDGPUISD::RCP_LEGACY:
15765 case AMDGPUISD::RCP_IFLAG:
15766 case AMDGPUISD::LOG:
15767 case AMDGPUISD::EXP:
15768 case AMDGPUISD::DIV_SCALE:
15769 case AMDGPUISD::DIV_FMAS:
15770 case AMDGPUISD::DIV_FIXUP:
15771 case AMDGPUISD::FRACT:
15772 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15773 case AMDGPUISD::CVT_F32_UBYTE0:
15774 case AMDGPUISD::CVT_F32_UBYTE1:
15775 case AMDGPUISD::CVT_F32_UBYTE2:
15776 case AMDGPUISD::CVT_F32_UBYTE3:
15777 case AMDGPUISD::FP_TO_FP16:
15778 case AMDGPUISD::SIN_HW:
15779 case AMDGPUISD::COS_HW:
15790 if (
Op.getValueType() == MVT::i32) {
15796 if (RHS->getZExtValue() == 0xffff0000) {
15806 return Op.getValueType().getScalarType() != MVT::f16;
15816 case AMDGPUISD::CLAMP:
15817 case AMDGPUISD::FMED3:
15818 case AMDGPUISD::FMAX3:
15819 case AMDGPUISD::FMIN3:
15820 case AMDGPUISD::FMAXIMUM3:
15821 case AMDGPUISD::FMINIMUM3: {
15827 if (Subtarget->supportsMinMaxDenormModes() ||
15837 for (
unsigned I = 0, E =
Op.getNumOperands();
I != E; ++
I) {
15849 for (
unsigned i = 0, e =
Op.getNumOperands(); i != e; ++i) {
15876 if (
Op.getValueType() == MVT::i16) {
15887 unsigned IntrinsicID =
Op.getConstantOperandVal(0);
15889 switch (IntrinsicID) {
15890 case Intrinsic::amdgcn_cvt_pkrtz:
15891 case Intrinsic::amdgcn_cubeid:
15892 case Intrinsic::amdgcn_frexp_mant:
15893 case Intrinsic::amdgcn_fdot2:
15894 case Intrinsic::amdgcn_rcp:
15895 case Intrinsic::amdgcn_rsq:
15896 case Intrinsic::amdgcn_rsq_clamp:
15897 case Intrinsic::amdgcn_rcp_legacy:
15898 case Intrinsic::amdgcn_rsq_legacy:
15899 case Intrinsic::amdgcn_trig_preop:
15900 case Intrinsic::amdgcn_tanh:
15901 case Intrinsic::amdgcn_log:
15902 case Intrinsic::amdgcn_exp2:
15903 case Intrinsic::amdgcn_sqrt:
15921 unsigned MaxDepth)
const {
15924 unsigned Opcode =
MI->getOpcode();
15926 if (Opcode == AMDGPU::G_FCANONICALIZE)
15929 std::optional<FPValueAndVReg> FCR;
15932 if (FCR->Value.isSignaling())
15934 if (!FCR->Value.isDenormal())
15945 case AMDGPU::G_FADD:
15946 case AMDGPU::G_FSUB:
15947 case AMDGPU::G_FMUL:
15948 case AMDGPU::G_FCEIL:
15949 case AMDGPU::G_FFLOOR:
15950 case AMDGPU::G_FRINT:
15951 case AMDGPU::G_FNEARBYINT:
15952 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15953 case AMDGPU::G_INTRINSIC_TRUNC:
15954 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15955 case AMDGPU::G_FMA:
15956 case AMDGPU::G_FMAD:
15957 case AMDGPU::G_FSQRT:
15958 case AMDGPU::G_FDIV:
15959 case AMDGPU::G_FREM:
15960 case AMDGPU::G_FPOW:
15961 case AMDGPU::G_FPEXT:
15962 case AMDGPU::G_FLOG:
15963 case AMDGPU::G_FLOG2:
15964 case AMDGPU::G_FLOG10:
15965 case AMDGPU::G_FPTRUNC:
15966 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15967 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15968 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15969 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15970 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15972 case AMDGPU::G_FNEG:
15973 case AMDGPU::G_FABS:
15974 case AMDGPU::G_FCOPYSIGN:
15976 case AMDGPU::G_FMINNUM:
15977 case AMDGPU::G_FMAXNUM:
15978 case AMDGPU::G_FMINNUM_IEEE:
15979 case AMDGPU::G_FMAXNUM_IEEE:
15980 case AMDGPU::G_FMINIMUM:
15981 case AMDGPU::G_FMAXIMUM:
15982 case AMDGPU::G_FMINIMUMNUM:
15983 case AMDGPU::G_FMAXIMUMNUM: {
15984 if (Subtarget->supportsMinMaxDenormModes() ||
15991 case AMDGPU::G_BUILD_VECTOR:
15996 case AMDGPU::G_INTRINSIC:
15997 case AMDGPU::G_INTRINSIC_CONVERGENT:
15999 case Intrinsic::amdgcn_fmul_legacy:
16000 case Intrinsic::amdgcn_fmad_ftz:
16001 case Intrinsic::amdgcn_sqrt:
16002 case Intrinsic::amdgcn_fmed3:
16003 case Intrinsic::amdgcn_sin:
16004 case Intrinsic::amdgcn_cos:
16005 case Intrinsic::amdgcn_log:
16006 case Intrinsic::amdgcn_exp2:
16007 case Intrinsic::amdgcn_log_clamp:
16008 case Intrinsic::amdgcn_rcp:
16009 case Intrinsic::amdgcn_rcp_legacy:
16010 case Intrinsic::amdgcn_rsq:
16011 case Intrinsic::amdgcn_rsq_clamp:
16012 case Intrinsic::amdgcn_rsq_legacy:
16013 case Intrinsic::amdgcn_div_scale:
16014 case Intrinsic::amdgcn_div_fmas:
16015 case Intrinsic::amdgcn_div_fixup:
16016 case Intrinsic::amdgcn_fract:
16017 case Intrinsic::amdgcn_cvt_pkrtz:
16018 case Intrinsic::amdgcn_cubeid:
16019 case Intrinsic::amdgcn_cubema:
16020 case Intrinsic::amdgcn_cubesc:
16021 case Intrinsic::amdgcn_cubetc:
16022 case Intrinsic::amdgcn_frexp_mant:
16023 case Intrinsic::amdgcn_fdot2:
16024 case Intrinsic::amdgcn_trig_preop:
16025 case Intrinsic::amdgcn_tanh:
16044 if (
C.isDenormal()) {
16058 if (
C.isSignaling()) {
16081SITargetLowering::performFCanonicalizeCombine(
SDNode *
N,
16082 DAGCombinerInfo &DCI)
const {
16083 SelectionDAG &DAG = DCI.DAG;
16085 EVT VT =
N->getValueType(0);
16094 EVT VT =
N->getValueType(0);
16095 return getCanonicalConstantFP(DAG, SDLoc(
N), VT, CFP->getValueAPF());
16111 EVT EltVT =
Lo.getValueType();
16114 for (
unsigned I = 0;
I != 2; ++
I) {
16118 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
16119 }
else if (
Op.isUndef()) {
16155 return AMDGPUISD::FMAX3;
16157 return AMDGPUISD::FMAXIMUM3;
16159 return AMDGPUISD::SMAX3;
16161 return AMDGPUISD::UMAX3;
16165 return AMDGPUISD::FMIN3;
16167 return AMDGPUISD::FMINIMUM3;
16169 return AMDGPUISD::SMIN3;
16171 return AMDGPUISD::UMIN3;
16192 if (!MinK || !MaxK)
16204 unsigned Med3Opc =
Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
16205 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
16206 return DAG.
getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
16230 bool IsKnownNoNaNs)
const {
16266 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
16272 if (
Info->getMode().DX10Clamp) {
16281 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
16313 case AMDGPUISD::FMIN_LEGACY:
16314 case AMDGPUISD::FMAX_LEGACY:
16315 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.
hasMin3Max3_16()) ||
16316 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
16319 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
16320 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
16321 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
16326 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.
hasMin3Max3_16());
16335 DAGCombinerInfo &DCI)
const {
16336 SelectionDAG &DAG = DCI.DAG;
16347 auto IsTreeWithCombinableChildren = [
Opc](
SDValue Op) {
16348 return (
Op.getOperand(0).getOpcode() ==
Opc &&
16349 Op.getOperand(0).hasOneUse()) ||
16350 (
Op.getOperand(1).getOpcode() ==
Opc &&
16351 Op.getOperand(1).hasOneUse());
16356 bool HasCombinableTreeChild =
16357 CanTreeCombineApply && (IsTreeWithCombinableChildren(Op0) ||
16358 IsTreeWithCombinableChildren(Op1));
16367 if (CanTreeCombineApply && !HasCombinableTreeChild) {
16397 uint64_t Clamp = 0;
16413 if (
SDValue Med3 = performIntMed3ImmCombine(
16418 if (
SDValue Med3 = performIntMed3ImmCombine(
16424 if (
SDValue Med3 = performIntMed3ImmCombine(
16429 if (
SDValue Med3 = performIntMed3ImmCombine(
16442 (
Opc == AMDGPUISD::FMIN_LEGACY &&
16443 Op0.
getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
16444 (VT == MVT::f32 || VT == MVT::f64 ||
16445 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
16446 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
16447 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
16448 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
16450 if (
SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(
N), Op0, Op1,
16451 N->getFlags().hasNoNaNs()))
16458 const SDNodeFlags
Flags =
N->getFlags();
16460 !Subtarget->hasIEEEMinimumMaximumInsts() &&
16464 return DAG.
getNode(NewOpc, SDLoc(
N), VT, Op0, Op1, Flags);
16474 return (CA->isPosZero() && CB->isOne()) ||
16475 (CA->isOne() && CB->isPosZero());
16484 DAGCombinerInfo &DCI)
const {
16485 EVT VT =
N->getValueType(0);
16489 SelectionDAG &DAG = DCI.DAG;
16500 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
16504 const SIMachineFunctionInfo *
Info = MF.
getInfo<SIMachineFunctionInfo>();
16508 if (
Info->getMode().DX10Clamp) {
16521 return DAG.
getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
16528 DAGCombinerInfo &DCI)
const {
16532 return DCI.DAG.getUNDEF(
N->getValueType(0));
16540 bool IsDivergentIdx,
16545 unsigned VecSize = EltSize * NumElem;
16548 if (VecSize <= 64 && EltSize < 32)
16557 if (IsDivergentIdx)
16561 unsigned NumInsts = NumElem +
16562 ((EltSize + 31) / 32) * NumElem ;
16566 if (Subtarget->useVGPRIndexMode())
16567 return NumInsts <= 16;
16571 if (Subtarget->hasMovrel())
16572 return NumInsts <= 15;
16578 SDValue Idx =
N->getOperand(
N->getNumOperands() - 1);
16593SITargetLowering::performExtractVectorEltCombine(
SDNode *
N,
16594 DAGCombinerInfo &DCI)
const {
16600 EVT ResVT =
N->getValueType(0);
16624 if (!
C ||
C->getZExtValue() != 0x1f)
16640 if (Vec.
hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
16668 DCI.AddToWorklist(Elt0.
getNode());
16669 DCI.AddToWorklist(Elt1.
getNode());
16700 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
16701 uint64_t KImmValue = KImm->getZExtValue();
16703 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
16706 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
16707 uint64_t KFPImmValue =
16708 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16709 return DAG.
getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
16715 if (!DCI.isBeforeLegalize())
16722 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16725 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16726 unsigned EltIdx = BitIndex / 32;
16727 unsigned LeftoverBitIdx = BitIndex % 32;
16731 DCI.AddToWorklist(Cast.
getNode());
16735 DCI.AddToWorklist(Elt.
getNode());
16738 DCI.AddToWorklist(Srl.
getNode());
16742 DCI.AddToWorklist(Trunc.
getNode());
16744 if (VecEltVT == ResVT) {
16756SITargetLowering::performInsertVectorEltCombine(
SDNode *
N,
16757 DAGCombinerInfo &DCI)
const {
16768 SelectionDAG &DAG = DCI.DAG;
16788 Src.getOperand(0).getValueType() == MVT::f16) {
16789 return Src.getOperand(0);
16793 APFloat Val = CFP->getValueAPF();
16794 bool LosesInfo =
true;
16804 DAGCombinerInfo &DCI)
const {
16805 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16806 "combine only useful on gfx8");
16808 SDValue TruncSrc =
N->getOperand(0);
16809 EVT VT =
N->getValueType(0);
16810 if (VT != MVT::f16)
16813 if (TruncSrc.
getOpcode() != AMDGPUISD::FMED3 ||
16817 SelectionDAG &DAG = DCI.DAG;
16848unsigned SITargetLowering::getFusedOpcode(
const SelectionDAG &DAG,
16850 const SDNode *N1)
const {
16855 if (((VT == MVT::f32 &&
16857 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16877 EVT VT =
N->getValueType(0);
16878 if (VT != MVT::i32 && VT != MVT::i64)
16884 unsigned Opc =
N->getOpcode();
16939 if (!Const ||
Hi_32(Const->getZExtValue()) !=
uint32_t(-1))
16958 DAGCombinerInfo &DCI)
const {
16961 SelectionDAG &DAG = DCI.DAG;
16962 EVT VT =
N->getValueType(0);
16972 if (!
N->isDivergent() && Subtarget->hasSMulHi())
16976 if (NumBits <= 32 || NumBits > 64)
16987 if (!Subtarget->hasFullRate64Ops()) {
16988 unsigned NumUsers = 0;
16989 for (SDNode *User :
LHS->
users()) {
16992 if (!
User->isAnyAdd())
17016 bool MulSignedLo =
false;
17017 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
17026 if (VT != MVT::i64) {
17049 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
17051 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
17052 auto [AccumLo, AccumHi] = DAG.
SplitScalar(Accum, SL, MVT::i32, MVT::i32);
17054 if (!MulLHSUnsigned32) {
17061 if (!MulRHSUnsigned32) {
17072 if (VT != MVT::i64)
17078SITargetLowering::foldAddSub64WithZeroLowBitsTo32(
SDNode *
N,
17079 DAGCombinerInfo &DCI)
const {
17089 SelectionDAG &DAG = DCI.DAG;
17104 unsigned Opcode =
N->getOpcode();
17108 DAG.
getNode(Opcode, SL, MVT::i32,
Hi, ConstHi32,
N->getFlags());
17119static std::optional<ByteProvider<SDValue>>
17122 if (!Byte0 || Byte0->isConstantZero()) {
17123 return std::nullopt;
17126 if (Byte1 && !Byte1->isConstantZero()) {
17127 return std::nullopt;
17133 unsigned FirstCs =
First & 0x0c0c0c0c;
17134 unsigned SecondCs = Second & 0x0c0c0c0c;
17135 unsigned FirstNoCs =
First & ~0x0c0c0c0c;
17136 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
17138 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
17139 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
17140 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
17141 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
17143 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
17167 for (
int BPI = 0; BPI < 2; BPI++) {
17170 BPP = {Src1, Src0};
17172 unsigned ZeroMask = 0x0c0c0c0c;
17173 unsigned FMask = 0xFF << (8 * (3 - Step));
17175 unsigned FirstMask =
17176 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
17177 unsigned SecondMask =
17178 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
17182 int FirstGroup = -1;
17183 for (
int I = 0;
I < 2;
I++) {
17185 auto MatchesFirst = [&BPP](
DotSrc &IterElt) {
17186 return IterElt.SrcOp == *BPP.first.Src &&
17187 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
17191 if (Match != Srcs.
end()) {
17192 Match->PermMask =
addPermMasks(FirstMask, Match->PermMask);
17197 if (FirstGroup != -1) {
17199 auto MatchesSecond = [&BPP](
DotSrc &IterElt) {
17200 return IterElt.SrcOp == *BPP.second.Src &&
17201 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
17204 if (Match != Srcs.
end()) {
17205 Match->PermMask =
addPermMasks(SecondMask, Match->PermMask);
17207 Srcs.
push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
17215 unsigned ZeroMask = 0x0c0c0c0c;
17216 unsigned FMask = 0xFF << (8 * (3 - Step));
17220 ((Src0.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17224 ((Src1.
SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17233 if (Srcs.
size() == 1) {
17234 auto *Elt = Srcs.
begin();
17238 if (Elt->PermMask == 0x3020100)
17241 return DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17245 auto *FirstElt = Srcs.
begin();
17246 auto *SecondElt = std::next(FirstElt);
17253 auto FirstMask = FirstElt->PermMask;
17254 auto SecondMask = SecondElt->PermMask;
17256 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
17257 unsigned FirstPlusFour = FirstMask | 0x04040404;
17260 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
17272 FirstElt = std::next(SecondElt);
17273 if (FirstElt == Srcs.
end())
17276 SecondElt = std::next(FirstElt);
17279 if (SecondElt == Srcs.
end()) {
17284 DAG.
getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17285 DAG.
getConstant(FirstElt->PermMask, SL, MVT::i32)));
17291 return Perms.
size() == 2
17297 for (
auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
17298 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
17299 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
17300 EntryMask += ZeroMask;
17305 auto Opcode =
Op.getOpcode();
17307 return (Opcode ==
ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
17308 Opcode == AMDGPUISD::MUL_I24);
17311static std::optional<bool>
17322 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
17325 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
17327 assert(!(S0IsUnsigned && S0IsSigned));
17328 assert(!(S1IsUnsigned && S1IsSigned));
17336 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
17342 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
17343 return std::nullopt;
17355 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
17356 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
17361 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
17367 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
17368 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
17369 return std::nullopt;
17375 DAGCombinerInfo &DCI)
const {
17376 SelectionDAG &DAG = DCI.DAG;
17377 EVT VT =
N->getValueType(0);
17383 if (Subtarget->hasMad64_32()) {
17384 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
17389 if (
SDValue V = reassociateScalarOps(
N, DAG)) {
17393 if (VT == MVT::i64) {
17394 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
17399 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
17401 std::optional<bool> IsSigned;
17407 int ChainLength = 0;
17408 for (
int I = 0;
I < 4;
I++) {
17412 auto Src0 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
17415 auto Src1 =
handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
17420 TempNode->getOperand(MulIdx), *Src0, *Src1,
17421 TempNode->getOperand(MulIdx)->getOperand(0),
17422 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
17426 IsSigned = *IterIsSigned;
17427 if (*IterIsSigned != *IsSigned)
17430 auto AddIdx = 1 - MulIdx;
17433 if (
I == 2 &&
isMul(TempNode->getOperand(AddIdx))) {
17434 Src2s.
push_back(TempNode->getOperand(AddIdx));
17444 TempNode->getOperand(AddIdx), *Src0, *Src1,
17445 TempNode->getOperand(AddIdx)->getOperand(0),
17446 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
17450 if (*IterIsSigned != *IsSigned)
17454 ChainLength =
I + 2;
17458 TempNode = TempNode->getOperand(AddIdx);
17460 ChainLength =
I + 1;
17461 if (TempNode->getNumOperands() < 2)
17463 LHS = TempNode->getOperand(0);
17464 RHS = TempNode->getOperand(1);
17467 if (ChainLength < 2)
17473 if (ChainLength < 4) {
17483 bool UseOriginalSrc =
false;
17484 if (ChainLength == 4 && Src0s.
size() == 1 && Src1s.
size() == 1 &&
17485 Src0s.
begin()->PermMask == Src1s.
begin()->PermMask &&
17486 Src0s.
begin()->SrcOp.getValueSizeInBits() >= 32 &&
17487 Src1s.
begin()->SrcOp.getValueSizeInBits() >= 32) {
17488 SmallVector<unsigned, 4> SrcBytes;
17489 auto Src0Mask = Src0s.
begin()->PermMask;
17490 SrcBytes.
push_back(Src0Mask & 0xFF000000);
17491 bool UniqueEntries =
true;
17492 for (
auto I = 1;
I < 4;
I++) {
17493 auto NextByte = Src0Mask & (0xFF << ((3 -
I) * 8));
17496 UniqueEntries =
false;
17502 if (UniqueEntries) {
17503 UseOriginalSrc =
true;
17505 auto *FirstElt = Src0s.
begin();
17509 auto *SecondElt = Src1s.
begin();
17511 SecondElt->DWordOffset);
17520 if (!UseOriginalSrc) {
17527 DAG.
getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
17530 : Intrinsic::amdgcn_udot4,
17540 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
17545 unsigned Opc =
LHS.getOpcode();
17557 auto Cond =
RHS.getOperand(0);
17562 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
17579 DAGCombinerInfo &DCI)
const {
17580 SelectionDAG &DAG = DCI.DAG;
17582 EVT VT =
N->getValueType(0);
17595 SDNodeFlags ShlFlags = N1->
getFlags();
17599 SDNodeFlags NewShlFlags =
17604 DCI.AddToWorklist(Inner.
getNode());
17611 if (Subtarget->hasMad64_32()) {
17612 if (
SDValue Folded = tryFoldToMad64_32(
N, DCI))
17621 if (VT == MVT::i64) {
17622 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
17635 if (!YIsConstant && !ZIsConstant && !
X->isDivergent() &&
17636 Y->isDivergent() !=
Z->isDivergent()) {
17645 if (
Y->isDivergent())
17648 SDNodeFlags ReassocFlags =
17651 DCI.AddToWorklist(UniformInner.
getNode());
17663 DAGCombinerInfo &DCI)
const {
17664 SelectionDAG &DAG = DCI.DAG;
17665 EVT VT =
N->getValueType(0);
17667 if (VT == MVT::i64) {
17668 if (
SDValue Folded = foldAddSub64WithZeroLowBitsTo32(
N, DCI))
17672 if (VT != MVT::i32)
17681 unsigned Opc =
RHS.getOpcode();
17688 auto Cond =
RHS.getOperand(0);
17693 SDVTList VTList = DAG.
getVTList(MVT::i32, MVT::i1);
17719 ConstantSDNode *ShiftAmt =
17721 unsigned BitWidth =
X.getValueType().getScalarSizeInBits();
17732 DAGCombinerInfo &DCI)
const {
17736 SelectionDAG &DAG = DCI.DAG;
17737 EVT VT =
N->getValueType(0);
17749 if (
A ==
LHS.getOperand(1)) {
17750 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
17751 if (FusedOp != 0) {
17753 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
RHS);
17761 if (
A ==
RHS.getOperand(1)) {
17762 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
17763 if (FusedOp != 0) {
17765 return DAG.
getNode(FusedOp, SL, VT,
A, Two,
LHS);
17774 DAGCombinerInfo &DCI)
const {
17778 SelectionDAG &DAG = DCI.DAG;
17780 EVT VT =
N->getValueType(0);
17793 if (
A ==
LHS.getOperand(1)) {
17794 unsigned FusedOp = getFusedOpcode(DAG,
N,
LHS.getNode());
17795 if (FusedOp != 0) {
17799 return DAG.
getNode(FusedOp, SL, VT,
A, Two, NegRHS);
17808 if (
A ==
RHS.getOperand(1)) {
17809 unsigned FusedOp = getFusedOpcode(DAG,
N,
RHS.getNode());
17810 if (FusedOp != 0) {
17812 return DAG.
getNode(FusedOp, SL, VT,
A, NegTwo,
LHS);
17821 DAGCombinerInfo &DCI)
const {
17822 SelectionDAG &DAG = DCI.DAG;
17824 EVT VT =
N->getValueType(0);
17826 if (VT != MVT::f16 && VT != MVT::bf16)
17832 SDNodeFlags
Flags =
N->getFlags();
17833 SDNodeFlags RHSFlags =
RHS->getFlags();
17839 bool IsNegative =
false;
17840 if (CLHS->
isOne() || (IsNegative = CLHS->isMinusOne())) {
17849 Rsq = DAG.
getNode(AMDGPUISD::RSQ, SL, VT, SqrtOp, Flags);
17850 }
else if (VT == MVT::f16) {
17859 DAG.
getNode(AMDGPUISD::RSQ, SL, MVT::f32, Ext, Flags);
17876 DAGCombinerInfo &DCI)
const {
17877 SelectionDAG &DAG = DCI.DAG;
17878 EVT VT =
N->getValueType(0);
17882 if (!
N->isDivergent() &&
getSubtarget()->hasSALUFloatInsts() &&
17883 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17898 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17903 const ConstantFPSDNode *FalseNode =
17913 if (ScalarVT == MVT::f32 &&
17919 if (TrueNodeExpVal == INT_MIN)
17922 if (FalseNodeExpVal == INT_MIN)
17942 DAGCombinerInfo &DCI)
const {
17943 SelectionDAG &DAG = DCI.DAG;
17944 EVT VT =
N->getValueType(0);
17947 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17965 (
N->getFlags().hasAllowContract() &&
17966 FMA->getFlags().hasAllowContract())) {
18000 if (Vec1 == Vec2 || Vec3 == Vec4)
18006 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
18007 return DAG.
getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
18050 EVT VT =
LHS.getValueType();
18051 assert(VT == MVT::f64 &&
"Incorrect operand type!");
18083 if (CC ==
ISD::SETOEQ && LHSMaybeNaN && RHSMaybeNaN)
18087 if (CC ==
ISD::SETUEQ && (LHSMaybeNaN || RHSMaybeNaN))
18091 if (CC ==
ISD::SETONE && (LHSMaybeNaN || RHSMaybeNaN))
18095 if (CC ==
ISD::SETUNE && LHSMaybeNaN && RHSMaybeNaN)
18098 const std::optional<bool> KnownEq =
18127 if (CC ==
ISD::SETULT && (LHSMaybeNaN || RHSMaybeNaN))
18131 if (CC ==
ISD::SETOGE && (LHSMaybeNaN || RHSMaybeNaN))
18139 const std::optional<bool> KnownUge =
18164 if (CC ==
ISD::SETOLE && (LHSMaybeNaN || RHSMaybeNaN))
18178 if (CC ==
ISD::SETUGT && (LHSMaybeNaN || RHSMaybeNaN))
18181 const std::optional<bool> KnownUle =
18204 DAGCombinerInfo &DCI)
const {
18205 SelectionDAG &DAG = DCI.DAG;
18210 EVT VT =
LHS.getValueType();
18239 return LHS.getOperand(0);
18253 const APInt &CT =
LHS.getConstantOperandAPInt(1);
18254 const APInt &CF =
LHS.getConstantOperandAPInt(2);
18259 return DAG.
getNOT(SL,
LHS.getOperand(0), MVT::i1);
18262 return LHS.getOperand(0);
18283 if (VT == MVT::i64) {
18295 const std::optional<bool> KnownEq =
18303 const std::optional<bool> KnownEq =
18314 const std::optional<bool> KnownUge =
18334 const std::optional<bool> KnownUle =
18385 DAG.
getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
18390 {Op0Hi, Op1Hi, CarryInHi});
18400 DCI.CombineTo(
LHS.getNode(), Result);
18404 if (VT != MVT::f32 && VT != MVT::f64 &&
18405 (!Subtarget->has16BitInsts() || VT != MVT::f16))
18420 const unsigned IsInfMask =
18422 const unsigned IsFiniteMask =
18427 return DAG.
getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1,
LHS.getOperand(0),
18432 if (VT == MVT::f64) {
18443SITargetLowering::performCvtF32UByteNCombine(
SDNode *
N,
18444 DAGCombinerInfo &DCI)
const {
18445 SelectionDAG &DAG = DCI.DAG;
18447 unsigned Offset =
N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
18466 unsigned ShiftOffset = 8 *
Offset;
18468 ShiftOffset -=
C->getZExtValue();
18470 ShiftOffset +=
C->getZExtValue();
18472 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
18473 return DAG.
getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
18474 MVT::f32, Shifted);
18485 DCI.AddToWorklist(
N);
18492 return DAG.
getNode(
N->getOpcode(), SL, MVT::f32, DemandedSrc);
18498 DAGCombinerInfo &DCI)
const {
18503 const MachineFunction &MF = DCI.DAG.getMachineFunction();
18507 (
F.isNaN() && MF.
getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
18508 return DCI.DAG.getConstantFP(Zero, SDLoc(
N),
N->getValueType(0));
18511 APFloat One(
F.getSemantics(),
"1.0");
18513 return DCI.DAG.getConstantFP(One, SDLoc(
N),
N->getValueType(0));
18519 DAGCombinerInfo &DCI)
const {
18540 bool isFloatingPoint =
LHS.getValueType().isFloatingPoint();
18541 bool isInteger =
LHS.getValueType().isInteger();
18544 if (!isFloatingPoint && !isInteger)
18549 if (!isEquality && !isNonEquality)
18566 if (isFloatingPoint) {
18568 if (!Val.
isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
18571 const std::optional<int64_t> Val =
18580 if (!(isEquality && TrueVal == ConstVal) &&
18581 !(isNonEquality && FalseVal == ConstVal))
18588 SelectLHS, SelectRHS);
18593 switch (
N->getOpcode()) {
18614 if (
auto Res = promoteUniformOpToI32(
SDValue(
N, 0), DCI))
18624 switch (
N->getOpcode()) {
18626 return performAddCombine(
N, DCI);
18628 return performPtrAddCombine(
N, DCI);
18630 return performSubCombine(
N, DCI);
18632 return performFAddCombine(
N, DCI);
18634 return performFSubCombine(
N, DCI);
18636 return performFDivCombine(
N, DCI);
18638 return performFMulCombine(
N, DCI);
18640 return performSetCCCombine(
N, DCI);
18642 if (
auto Res = performSelectCombine(
N, DCI))
18657 case AMDGPUISD::FMIN_LEGACY:
18658 case AMDGPUISD::FMAX_LEGACY:
18659 return performMinMaxCombine(
N, DCI);
18661 return performFMACombine(
N, DCI);
18663 return performAndCombine(
N, DCI);
18665 return performOrCombine(
N, DCI);
18668 if (
N->getValueType(0) == MVT::i32 &&
N->isDivergent() &&
18669 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
18675 return performXorCombine(
N, DCI);
18678 return performZeroOrAnyExtendCombine(
N, DCI);
18680 return performSignExtendInRegCombine(
N, DCI);
18681 case AMDGPUISD::FP_CLASS:
18682 return performClassCombine(
N, DCI);
18684 return performFCanonicalizeCombine(
N, DCI);
18685 case AMDGPUISD::RCP:
18686 return performRcpCombine(
N, DCI);
18688 case AMDGPUISD::FRACT:
18689 case AMDGPUISD::RSQ:
18690 case AMDGPUISD::RCP_LEGACY:
18691 case AMDGPUISD::RCP_IFLAG:
18692 case AMDGPUISD::RSQ_CLAMP: {
18701 return performUCharToFloatCombine(
N, DCI);
18703 return performFCopySignCombine(
N, DCI);
18704 case AMDGPUISD::CVT_F32_UBYTE0:
18705 case AMDGPUISD::CVT_F32_UBYTE1:
18706 case AMDGPUISD::CVT_F32_UBYTE2:
18707 case AMDGPUISD::CVT_F32_UBYTE3:
18708 return performCvtF32UByteNCombine(
N, DCI);
18709 case AMDGPUISD::FMED3:
18710 return performFMed3Combine(
N, DCI);
18711 case AMDGPUISD::CVT_PKRTZ_F16_F32:
18712 return performCvtPkRTZCombine(
N, DCI);
18713 case AMDGPUISD::CLAMP:
18714 return performClampCombine(
N, DCI);
18717 EVT VT =
N->getValueType(0);
18720 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
18723 EVT EltVT = Src.getValueType();
18724 if (EltVT != MVT::i16)
18734 return performExtractVectorEltCombine(
N, DCI);
18736 return performInsertVectorEltCombine(
N, DCI);
18738 return performFPRoundCombine(
N, DCI);
18747 return performMemSDNodeCombine(MemNode, DCI);
18778 unsigned Opcode =
Node->getMachineOpcode();
18781 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
18782 if (D16Idx >= 0 &&
Node->getConstantOperandVal(D16Idx))
18785 SDNode *
Users[5] = {
nullptr};
18787 unsigned DmaskIdx =
18788 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
18789 unsigned OldDmask =
Node->getConstantOperandVal(DmaskIdx);
18790 unsigned NewDmask = 0;
18791 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
18792 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
18793 bool UsesTFC = (int(TFEIdx) >= 0 &&
Node->getConstantOperandVal(TFEIdx)) ||
18794 (
int(LWEIdx) >= 0 &&
Node->getConstantOperandVal(LWEIdx));
18795 unsigned TFCLane = 0;
18796 bool HasChain =
Node->getNumValues() > 1;
18798 if (OldDmask == 0) {
18806 TFCLane = OldBitsSet;
18810 for (SDUse &Use :
Node->uses()) {
18813 if (
Use.getResNo() != 0)
18816 SDNode *
User =
Use.getUser();
18819 if (!
User->isMachineOpcode() ||
18820 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
18832 if (UsesTFC && Lane == TFCLane) {
18837 for (
unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
18839 Dmask &= ~(1 << Comp);
18847 NewDmask |= 1 << Comp;
18852 bool NoChannels = !NewDmask;
18859 if (OldBitsSet == 1)
18865 if (NewDmask == OldDmask)
18874 unsigned NewChannels = BitsSet + UsesTFC;
18878 assert(NewOpcode != -1 &&
18879 NewOpcode !=
static_cast<int>(
Node->getMachineOpcode()) &&
18880 "failed to find equivalent MIMG op");
18888 MVT SVT =
Node->getValueType(0).getVectorElementType().getSimpleVT();
18890 MVT ResultVT = NewChannels == 1
18893 : NewChannels == 5 ? 8
18895 SDVTList NewVTList =
18898 MachineSDNode *NewNode =
18907 if (NewChannels == 1) {
18917 for (
unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18922 if (i || !NoChannels)
18927 if (NewUser != User) {
18937 Idx = AMDGPU::sub1;
18940 Idx = AMDGPU::sub2;
18943 Idx = AMDGPU::sub3;
18946 Idx = AMDGPU::sub4;
18957 Op =
Op.getOperand(0);
18982 Node->getOperand(0), SL, VReg, SrcVal,
18988 return ToResultReg.
getNode();
18993 for (
unsigned i = 0; i <
Node->getNumOperands(); ++i) {
18995 Ops.push_back(
Node->getOperand(i));
19001 Node->getOperand(i).getValueType(),
19002 Node->getOperand(i)),
19014 unsigned Opcode =
Node->getMachineOpcode();
19016 if (
TII->isImage(Opcode) && !
TII->get(Opcode).mayStore() &&
19017 !
TII->isGather4(Opcode) &&
19019 return adjustWritemask(
Node, DAG);
19022 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
19028 case AMDGPU::V_DIV_SCALE_F32_e64:
19029 case AMDGPU::V_DIV_SCALE_F64_e64: {
19039 (Src0 == Src1 || Src0 == Src2))
19095 AMDGPU::getNamedOperandIdx(
MI.getOpcode(), AMDGPU::OpName::vdata);
19096 unsigned InitIdx = 0;
19098 if (
TII->isImage(
MI)) {
19106 unsigned TFEVal = TFE ? TFE->
getImm() : 0;
19107 unsigned LWEVal = LWE ? LWE->
getImm() : 0;
19108 unsigned D16Val = D16 ? D16->getImm() : 0;
19110 if (!TFEVal && !LWEVal)
19121 assert(MO_Dmask &&
"Expected dmask operand in instruction");
19123 unsigned dmask = MO_Dmask->
getImm();
19128 bool Packed = !Subtarget->hasUnpackedD16VMem();
19130 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
19137 uint32_t DstSize =
TRI.getRegSizeInBits(*DstRC) / 32;
19138 if (DstSize < InitIdx)
19142 InitIdx =
TRI.getRegSizeInBits(*DstRC) / 32;
19151 unsigned NewDst = 0;
19156 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
19157 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
19160 for (; SizeLeft; SizeLeft--, CurrIdx++) {
19181 MI.tieOperands(DstIdx,
MI.getNumOperands() - 1);
19193 if (
TII->isVOP3(
MI.getOpcode())) {
19195 TII->legalizeOperandsVOP3(MRI,
MI);
19197 if (
TII->isMAI(
MI)) {
19202 int Src0Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
19203 AMDGPU::OpName::scale_src0);
19204 if (Src0Idx != -1) {
19205 int Src1Idx = AMDGPU::getNamedOperandIdx(
MI.getOpcode(),
19206 AMDGPU::OpName::scale_src1);
19207 if (
TII->usesConstantBus(MRI,
MI, Src0Idx) &&
19208 TII->usesConstantBus(MRI,
MI, Src1Idx))
19209 TII->legalizeOpWithMove(
MI, Src1Idx);
19216 if (
TII->isImage(
MI))
19217 TII->enforceOperandRCAlignment(
MI, AMDGPU::OpName::vaddr);
19291std::pair<unsigned, const TargetRegisterClass *>
19298 if (Constraint.
size() == 1) {
19302 if (VT == MVT::Other)
19305 switch (Constraint[0]) {
19312 RC = &AMDGPU::SReg_32RegClass;
19315 RC = &AMDGPU::SGPR_64RegClass;
19320 return std::pair(0U,
nullptr);
19327 return std::pair(0U,
nullptr);
19329 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
19330 : &AMDGPU::VGPR_32_Lo256RegClass;
19333 RC = Subtarget->has1024AddressableVGPRs()
19334 ?
TRI->getAlignedLo256VGPRClassForBitWidth(
BitWidth)
19337 return std::pair(0U,
nullptr);
19342 if (!Subtarget->hasMAIInsts())
19346 return std::pair(0U,
nullptr);
19348 RC = &AMDGPU::AGPR_32RegClass;
19353 return std::pair(0U,
nullptr);
19358 }
else if (Constraint ==
"VA" && Subtarget->hasGFX90AInsts()) {
19362 RC = &AMDGPU::AV_32RegClass;
19365 RC =
TRI->getVectorSuperClassForBitWidth(
BitWidth);
19367 return std::pair(0U,
nullptr);
19376 return std::pair(0U, RC);
19379 if (Kind !=
'\0') {
19381 RC = &AMDGPU::VGPR_32_Lo256RegClass;
19382 }
else if (Kind ==
's') {
19383 RC = &AMDGPU::SGPR_32RegClass;
19384 }
else if (Kind ==
'a') {
19385 RC = &AMDGPU::AGPR_32RegClass;
19391 return std::pair(0U,
nullptr);
19397 return std::pair(0U,
nullptr);
19401 RC =
TRI->getVGPRClassForBitWidth(Width);
19403 RC =
TRI->getSGPRClassForBitWidth(Width);
19405 RC =
TRI->getAGPRClassForBitWidth(Width);
19407 Reg =
TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
19412 return std::pair(0U,
nullptr);
19414 return std::pair(Reg, RC);
19423 return std::pair(0U,
nullptr);
19424 if (RC && Idx < RC->getNumRegs())
19426 return std::pair(0U,
nullptr);
19432 Ret.second =
TRI->getPhysRegBaseClass(Ret.first);
19438 if (Constraint.
size() == 1) {
19439 switch (Constraint[0]) {
19449 }
else if (Constraint ==
"DA" || Constraint ==
"DB") {
19457 if (Constraint.
size() == 1) {
19458 switch (Constraint[0]) {
19466 }
else if (Constraint.
size() == 2) {
19467 if (Constraint ==
"VA")
19485 std::vector<SDValue> &
Ops,
19500 unsigned Size =
Op.getScalarValueSizeInBits();
19504 if (
Size == 16 && !Subtarget->has16BitInsts())
19508 Val =
C->getSExtValue();
19512 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
19516 if (
Size != 16 ||
Op.getNumOperands() != 2)
19518 if (
Op.getOperand(0).isUndef() ||
Op.getOperand(1).isUndef())
19521 Val =
C->getSExtValue();
19525 Val =
C->getValueAPF().bitcastToAPInt().getSExtValue();
19535 if (Constraint.
size() == 1) {
19536 switch (Constraint[0]) {
19551 }
else if (Constraint.
size() == 2) {
19552 if (Constraint ==
"DA") {
19553 int64_t HiBits =
static_cast<int32_t
>(Val >> 32);
19554 int64_t LoBits =
static_cast<int32_t
>(Val);
19558 if (Constraint ==
"DB") {
19566 unsigned MaxSize)
const {
19567 unsigned Size = std::min<unsigned>(
Op.getScalarValueSizeInBits(), MaxSize);
19568 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
19570 MVT VT =
Op.getSimpleValueType();
19595 switch (UnalignedClassID) {
19596 case AMDGPU::VReg_64RegClassID:
19597 return AMDGPU::VReg_64_Align2RegClassID;
19598 case AMDGPU::VReg_96RegClassID:
19599 return AMDGPU::VReg_96_Align2RegClassID;
19600 case AMDGPU::VReg_128RegClassID:
19601 return AMDGPU::VReg_128_Align2RegClassID;
19602 case AMDGPU::VReg_160RegClassID:
19603 return AMDGPU::VReg_160_Align2RegClassID;
19604 case AMDGPU::VReg_192RegClassID:
19605 return AMDGPU::VReg_192_Align2RegClassID;
19606 case AMDGPU::VReg_224RegClassID:
19607 return AMDGPU::VReg_224_Align2RegClassID;
19608 case AMDGPU::VReg_256RegClassID:
19609 return AMDGPU::VReg_256_Align2RegClassID;
19610 case AMDGPU::VReg_288RegClassID:
19611 return AMDGPU::VReg_288_Align2RegClassID;
19612 case AMDGPU::VReg_320RegClassID:
19613 return AMDGPU::VReg_320_Align2RegClassID;
19614 case AMDGPU::VReg_352RegClassID:
19615 return AMDGPU::VReg_352_Align2RegClassID;
19616 case AMDGPU::VReg_384RegClassID:
19617 return AMDGPU::VReg_384_Align2RegClassID;
19618 case AMDGPU::VReg_512RegClassID:
19619 return AMDGPU::VReg_512_Align2RegClassID;
19620 case AMDGPU::VReg_1024RegClassID:
19621 return AMDGPU::VReg_1024_Align2RegClassID;
19622 case AMDGPU::AReg_64RegClassID:
19623 return AMDGPU::AReg_64_Align2RegClassID;
19624 case AMDGPU::AReg_96RegClassID:
19625 return AMDGPU::AReg_96_Align2RegClassID;
19626 case AMDGPU::AReg_128RegClassID:
19627 return AMDGPU::AReg_128_Align2RegClassID;
19628 case AMDGPU::AReg_160RegClassID:
19629 return AMDGPU::AReg_160_Align2RegClassID;
19630 case AMDGPU::AReg_192RegClassID:
19631 return AMDGPU::AReg_192_Align2RegClassID;
19632 case AMDGPU::AReg_256RegClassID:
19633 return AMDGPU::AReg_256_Align2RegClassID;
19634 case AMDGPU::AReg_512RegClassID:
19635 return AMDGPU::AReg_512_Align2RegClassID;
19636 case AMDGPU::AReg_1024RegClassID:
19637 return AMDGPU::AReg_1024_Align2RegClassID;
19653 if (Info->isEntryFunction()) {
19660 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
19662 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
19663 :
TRI->getAlignedHighSGPRForRC(MF, 2,
19664 &AMDGPU::SGPR_64RegClass);
19665 Info->setSGPRForEXECCopy(SReg);
19667 assert(!
TRI->isSubRegister(Info->getScratchRSrcReg(),
19668 Info->getStackPtrOffsetReg()));
19669 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
19670 MRI.
replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
19674 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
19675 MRI.
replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
19677 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
19680 Info->limitOccupancy(MF);
19682 if (ST.isWave32() && !MF.
empty()) {
19683 for (
auto &
MBB : MF) {
19684 for (
auto &
MI :
MBB) {
19685 TII->fixImplicitOperands(
MI);
19695 if (ST.needsAlignedVGPRs()) {
19702 if (NewClassID != -1)
19712 const APInt &DemandedElts,
19714 unsigned Depth)
const {
19716 unsigned Opc =
Op.getOpcode();
19719 unsigned IID =
Op.getConstantOperandVal(0);
19721 case Intrinsic::amdgcn_mbcnt_lo:
19722 case Intrinsic::amdgcn_mbcnt_hi: {
19728 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
19738 Op, Known, DemandedElts, DAG,
Depth);
19754 unsigned MaxValue =
19761 unsigned BFEWidth,
bool SExt,
unsigned Depth) {
19765 unsigned Src1Cst = 0;
19766 if (Src1.
isImm()) {
19767 Src1Cst = Src1.
getImm();
19768 }
else if (Src1.
isReg()) {
19772 Src1Cst = Cst->Value.getZExtValue();
19783 if (Width >= BFEWidth)
19792 Known = Known.
sext(BFEWidth);
19794 Known = Known.
zext(BFEWidth);
19800 unsigned Depth)
const {
19803 switch (
MI->getOpcode()) {
19804 case AMDGPU::S_BFE_I32:
19807 case AMDGPU::S_BFE_U32:
19810 case AMDGPU::S_BFE_I64:
19813 case AMDGPU::S_BFE_U64:
19816 case AMDGPU::G_INTRINSIC:
19817 case AMDGPU::G_INTRINSIC_CONVERGENT: {
19820 case Intrinsic::amdgcn_workitem_id_x:
19823 case Intrinsic::amdgcn_workitem_id_y:
19826 case Intrinsic::amdgcn_workitem_id_z:
19829 case Intrinsic::amdgcn_mbcnt_lo:
19830 case Intrinsic::amdgcn_mbcnt_hi: {
19842 case Intrinsic::amdgcn_groupstaticsize: {
19853 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
19856 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
19859 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
19864 case AMDGPU::G_AMDGPU_SMED3:
19865 case AMDGPU::G_AMDGPU_UMED3: {
19866 auto [Dst, Src0, Src1, Src2] =
MI->getFirst4Regs();
19893 unsigned Depth)
const {
19900 AttributeList Attrs =
19902 if (
MaybeAlign RetAlign = Attrs.getRetAlignment())
19920 if (Header->getAlignment() != PrefAlign)
19921 return Header->getAlignment();
19922 if (needsFetchWindowAlignment(*Header))
19943 if (Header->getAlignment() != PrefAlign)
19944 return Header->getAlignment();
19946 unsigned LoopSize = 0;
19951 LoopSize +=
MBB->getAlignment().value() / 2;
19954 LoopSize +=
TII->getInstSizeInBytes(
MI);
19955 if (LoopSize > 192)
19960 if (LoopSize <= 64)
19963 if (LoopSize <= 128)
19964 return CacheLineAlign;
19970 auto I = Exit->getFirstNonDebugInstr();
19971 if (
I != Exit->end() &&
I->getOpcode() == AMDGPU::S_INST_PREFETCH)
19972 return CacheLineAlign;
19981 if (PreTerm == Pre->
begin() ||
19982 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
19986 auto ExitHead = Exit->getFirstNonDebugInstr();
19987 if (ExitHead == Exit->end() ||
19988 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
19993 return CacheLineAlign;
20001 if (needsFetchWindowAlignment(*
MBB))
20006bool SITargetLowering::needsFetchWindowAlignment(
20008 if (!
getSubtarget()->hasLoopHeadInstSplitSensitivity())
20012 if (
MI.isMetaInstruction())
20015 return TII->getInstSizeInBytes(
MI) > 4;
20025 N =
N->getOperand(0).getNode();
20035 switch (
N->getOpcode()) {
20043 if (Reg.isPhysical() || MRI.
isLiveIn(Reg))
20044 return !
TRI->isSGPRReg(MRI, Reg);
20050 return !
TRI->isSGPRReg(MRI, Reg);
20054 unsigned AS = L->getAddressSpace();
20064 case AMDGPUISD::ATOMIC_CMP_SWAP:
20065 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
20066 case AMDGPUISD::BUFFER_ATOMIC_ADD:
20067 case AMDGPUISD::BUFFER_ATOMIC_SUB:
20068 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
20069 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
20070 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
20071 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
20072 case AMDGPUISD::BUFFER_ATOMIC_AND:
20073 case AMDGPUISD::BUFFER_ATOMIC_OR:
20074 case AMDGPUISD::BUFFER_ATOMIC_XOR:
20075 case AMDGPUISD::BUFFER_ATOMIC_INC:
20076 case AMDGPUISD::BUFFER_ATOMIC_DEC:
20077 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
20078 case AMDGPUISD::BUFFER_ATOMIC_FADD:
20079 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
20080 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
20086 return A->readMem() &&
A->writeMem();
20107 switch (Ty.getScalarSizeInBits()) {
20119 const APInt &DemandedElts,
20122 unsigned Depth)
const {
20123 if (
Op.getOpcode() == AMDGPUISD::CLAMP) {
20127 if (Info->getMode().DX10Clamp)
20139 if (RMW->
hasMetadata(
"amdgpu.ignore.denormal.mode"))
20159 <<
"Hardware instruction generated for atomic "
20161 <<
" operation at memory scope " << MemScope;
20166 Type *EltTy = VT->getElementType();
20167 return VT->getNumElements() == 2 &&
20187 unsigned BW =
IT->getBitWidth();
20188 return BW == 32 || BW == 64;
20202 unsigned BW =
DL.getPointerSizeInBits(PT->getAddressSpace());
20203 return BW == 32 || BW == 64;
20206 if (Ty->isFloatTy() || Ty->isDoubleTy())
20210 return VT->getNumElements() == 2 &&
20211 VT->getElementType()->getPrimitiveSizeInBits() == 16;
20221 bool HasSystemScope) {
20228 if (HasSystemScope) {
20229 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
20232 if (Subtarget.hasEmulatedSystemScopeAtomics())
20234 }
else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
20237 return RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory");
20250 const MDNode *MD =
I->getMetadata(LLVMContext::MD_noalias_addrspace);
20258 return STI.hasGloballyAddressableScratch()
20276 DL.getTypeSizeInBits(RMW->
getType()) == 64 &&
20289 bool HasSystemScope =
20321 if (!
IT ||
IT->getBitWidth() != 32)
20327 if (Subtarget->hasEmulatedSystemScopeAtomics())
20343 if (!HasSystemScope &&
20344 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
20356 if (RMW->
hasMetadata(
"amdgpu.no.fine.grained.memory"))
20364 ConstVal && ConstVal->isNullValue())
20402 if (Ty->isFloatTy()) {
20407 if (Ty->isDoubleTy()) {
20428 if (Ty->isFloatTy() &&
20429 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
20442 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
20446 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() &&
isV2BF16(Ty))
20450 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() &&
isV2F16(Ty))
20455 if (Subtarget->hasAtomicBufferPkAddBF16Inst() &&
isV2BF16(Ty))
20460 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
20464 if (Ty->isFloatTy()) {
20467 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20470 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20475 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
20483 if (Subtarget->hasFlatAtomicFaddF32Inst())
20492 if (Subtarget->hasLDSFPAtomicAddF32()) {
20493 if (RMW->
use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20495 if (!RMW->
use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20523 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
20525 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
20529 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
20531 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
20585 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
20586 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
20587 : &AMDGPU::SReg_32RegClass;
20588 if (!
TRI->isSGPRClass(RC) && !isDivergent)
20589 return TRI->getEquivalentSGPRClass(RC);
20590 if (
TRI->isSGPRClass(RC) && isDivergent) {
20591 if (Subtarget->hasGFX90AInsts())
20592 return TRI->getEquivalentAVClass(RC);
20593 return TRI->getEquivalentVGPRClass(RC);
20606 unsigned WaveSize) {
20611 if (!
IT ||
IT->getBitWidth() != WaveSize)
20616 if (!Visited.
insert(V).second)
20618 bool Result =
false;
20619 for (
const auto *U : V->users()) {
20621 if (V == U->getOperand(1)) {
20626 case Intrinsic::amdgcn_if_break:
20627 case Intrinsic::amdgcn_if:
20628 case Intrinsic::amdgcn_else:
20633 if (V == U->getOperand(0)) {
20638 case Intrinsic::amdgcn_end_cf:
20639 case Intrinsic::amdgcn_loop:
20645 Result =
hasCFUser(U, Visited, WaveSize);
20654 const Value *V)
const {
20656 if (CI->isInlineAsm()) {
20665 for (
auto &TC : TargetConstraints) {
20679 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
20714 if (
I.getMetadata(
"amdgpu.noclobber"))
20716 if (
I.getMetadata(
"amdgpu.last.use"))
20780 Alignment = RMW->getAlign();
20793 bool FullFlatEmulation =
20795 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
20796 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
20797 RMW->getType()->isDoubleTy()));
20800 bool ReturnValueIsUsed = !AI->
use_empty();
20809 if (FullFlatEmulation) {
20820 std::prev(BB->
end())->eraseFromParent();
20821 Builder.SetInsertPoint(BB);
20823 Value *LoadedShared =
nullptr;
20824 if (FullFlatEmulation) {
20825 Value *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
20826 {Addr},
nullptr,
"is.shared");
20827 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
20828 Builder.SetInsertPoint(SharedBB);
20829 Value *CastToLocal = Builder.CreateAddrSpaceCast(
20835 LoadedShared = Clone;
20837 Builder.CreateBr(PhiBB);
20838 Builder.SetInsertPoint(CheckPrivateBB);
20841 Value *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
20842 {Addr},
nullptr,
"is.private");
20843 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
20845 Builder.SetInsertPoint(PrivateBB);
20847 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
20850 Value *LoadedPrivate;
20852 LoadedPrivate = Builder.CreateAlignedLoad(
20853 RMW->getType(), CastToPrivate, RMW->getAlign(),
"loaded.private");
20856 LoadedPrivate, RMW->getValOperand());
20858 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
20860 auto [ResultLoad, Equal] =
20866 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
20869 Builder.CreateBr(PhiBB);
20871 Builder.SetInsertPoint(GlobalBB);
20875 if (FullFlatEmulation) {
20876 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
20885 if (!FullFlatEmulation) {
20890 MDNode *RangeNotPrivate =
20893 LoadedGlobal->
setMetadata(LLVMContext::MD_noalias_addrspace,
20897 Builder.CreateBr(PhiBB);
20899 Builder.SetInsertPoint(PhiBB);
20901 if (ReturnValueIsUsed) {
20904 if (FullFlatEmulation)
20905 Loaded->addIncoming(LoadedShared, SharedBB);
20906 Loaded->addIncoming(LoadedPrivate, PrivateBB);
20907 Loaded->addIncoming(LoadedGlobal, GlobalBB);
20908 Loaded->takeName(AI);
20911 Builder.CreateBr(ExitBB);
20915 unsigned PtrOpIdx) {
20916 Value *PtrOp =
I->getOperand(PtrOpIdx);
20923 I->setOperand(PtrOpIdx, ASCast);
20935 ConstVal && ConstVal->isNullValue()) {
20965 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
20973 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
20988 LoadInst *LI = Builder.CreateAlignedLoad(
static bool isMul(MachineInstr *MI)
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
static bool isCtlzOpc(unsigned Opc)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
@ DEFAULT
Default weight is used in cases when there is no dedicated execution weight set.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
static bool isSigned(unsigned Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
iv Induction Variable Users
static constexpr Value * getValue(Ty &ValueOrUse)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register const TargetRegisterInfo * TRI
Promote Memory to Register
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
#define FP_DENORM_FLUSH_NONE
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static MachineBasicBlock * expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static std::tuple< unsigned, unsigned > getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static std::pair< Register, Register > ExtractSubRegs(MachineInstr &MI, MachineOperand &Op, const TargetRegisterClass *SrcRC, const GCNSubtarget &ST, MachineRegisterInfo &MRI)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which need to offset by the chain and intrinsic ID.
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static ISD::CondCode tryReduceF64CompareToHiHalf(const ISD::CondCode CC, const SDValue LHS, const SDValue RHS, const SelectionDAG &DAG)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
static uint64_t getIdentityValueForWaveReduction(unsigned Opc)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
void setUsesDynamicLDS(bool DynLDS)
bool isBottomOfStack() const
uint32_t getLDSSize() const
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
bool isEntryFunction() const
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
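If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.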
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
static bool EnableObjectLinking
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
const unsigned XorTermOpc
const unsigned AndSaveExecOpc
static constexpr roundingMode rmNearestTiesToEven
static const fltSemantics & IEEEhalf()
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
LLVM_READONLY int getExactLog2Abs() const
APInt bitcastToAPInt() const
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Class for arbitrary precision integers.
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
unsigned countr_zero() const
Count the number of trailing zero bits.
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
bool getBoolValue() const
Convert APInt to a boolean value.
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
This class represents an incoming formal argument to a Function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
const Function * getParent() const
Represent a constant reference to an array (0 or more elements consecutively in memory),...
size_t size() const
Get the array size.
bool empty() const
Check if the array is empty.
An instruction that atomically checks whether a specified value is in a memory location,...
Value * getNewValOperand()
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getCompareOperand()
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ USubSat
*p = usub.sat(old, v); usub.sat matches the behavior of llvm.usub.sat.
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v); minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v); maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing function, or null if none.
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to calling a function.
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
static bool isFPPredicate(Predicate P)
static bool isIntPredicate(Predicate P)
const APFloat & getValueAPF() const
bool isPosZero() const
Return true if the value is positive zero.
bool isOne() const
Returns true if this value is exactly +1.0.
bool isMinusOne() const
Returns true if this value is exactly -1.0.
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
uint64_t getNumOperands() const
A parsed version of the target data layout string, and methods for querying it.
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
LLVM_ABI const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType of this function.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
iterator_range< arg_iterator > args()
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this function.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Argument * getArg(unsigned i) const
const SIInstrInfo * getInstrInfo() const override
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool hasKernargSegmentPtr() const
bool hasDispatchID() const
bool hasPrivateSegmentBuffer() const
unsigned getNumFreeUserSGPRs()
bool hasImplicitBufferPtr() const
bool hasPrivateSegmentSize() const
bool hasDispatchPtr() const
bool hasFlatScratchInit() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
int64_t getOffset() const
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
const MDOperand & getOperand(unsigned I) const
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value (see Flags).
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
unsigned getNumVirtRegs() const
getNumVirtRegs - Return the number of virtual registers created.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
bool doesNotAccessMemory() const
Whether this function accesses no memory.
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
bool isWholeWaveFunction() const
bool hasWorkGroupIDZ() const
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
unsigned getBytesInStackArgArea() const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g. INSERT_SUBREG) with frame index operands.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
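If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.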
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g. we are happy to sink it into basic blocks.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align DstAlign, Align SrcAlign, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
constexpr bool empty() const
Check if the string is empty.
constexpr size_t size() const
Get the string size.
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
@ ZeroOrOneBooleanContent
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
OSType getOS() const
Get the parsed operating system type of this triple.
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
static constexpr TypeSize getFixed(ScalarTy ExactSize)
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
bool isFunctionTy() const
True if this is an instance of FunctionType.
bool isIntegerTy() const
True if this is an instance of IntegerType.
LLVM_ABI const fltSemantics & getFltSemantics() const
bool isVoidTy() const
Return true if this is 'void'.
A Use represents the edge between a Value definition and its users.
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
LLVM_ABI void set(Value *Val)
User * getUser() const
Returns the User that contains this Use.
const Use & getOperandUse(unsigned i) const
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVMContext & getContext() const
All values hold a context through their type.
iterator_range< user_iterator > users()
iterator_range< use_iterator > uses()
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
constexpr bool isKnownEven() const
A return value of true indicates we know at compile time that the number of elements (vscale * Min) i...
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
bool isGFX13(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
bool isValidWMMAScaleFmtCombination(unsigned AFmt, unsigned AScale, unsigned BFmt, unsigned BScale)
@ TowardZeroF32_TowardNegativeF64
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of a AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
@ C
The default llvm calling convention, compatible with C.
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
@ BSWAP
Byte Swap and Counting operators.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
@ ADD
Simple integer binary arithmetic operators.
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
@ FADD
Simple binary floating point operators.
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ SET_ROUNDING
Set rounding mode.
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
@ CTLS
Count leading redundant sign bits.
@ GET_ROUNDING
Returns current rounding mode: -1 = Undefined, 0 = Round to 0, 1 = Round to nearest (ties to even), 2 = Round to ...
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
@ AND
Bitwise operators - logical and, logical or, logical xor.
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > OverloadTys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_IntrinsicWOChain(const OpndPreds &...Opnds)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
@ System
Synchronized with respect to all concurrently executing threads.
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
NodeAddr< NodeBase * > Node
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
LLVM_ABI ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is congruent to Skew mod Align.
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
bool isBoolSGPR(SDValue V)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
LLVM_ABI ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
LLVM_ABI Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Next
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
constexpr RegState getUndefRegState(bool B)
@ Custom
The result value requires a custom uniformity check.
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
@ CLUSTER_WORKGROUP_MAX_ID_X
@ CLUSTER_WORKGROUP_MAX_ID_Z
@ CLUSTER_WORKGROUP_MAX_FLAT_ID
@ CLUSTER_WORKGROUP_MAX_ID_Y
ArgDescriptor WorkItemIDZ
ArgDescriptor WorkItemIDY
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
ArgDescriptor WorkItemIDX
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
unsigned AtomicNoRetBaseOpcode
This struct is a compact representation of a valid (non-zero power of two) alignment.
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
ElementCount getVectorElementCount() const
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
bool isByteSized() const
Return true if the bit size is a multiple of 8.
uint64_t getScalarSizeInBits() const
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isVector() const
Return true if this is a vector value type.
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
bool isVectorOf(EVT EltVT) const
Return true if this is a vector with matching element type.
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
bool isUnknown() const
Returns true if we don't know any bits.
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
unsigned getBitWidth() const
Get the bit width of this value.
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
void resetAll()
Resets the known state of all bits.
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
bool isNonZero() const
Returns true if this value is known to be non-zero.
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
static LLVM_ABI std::optional< bool > ule(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_ULE result.
static LLVM_ABI std::optional< bool > uge(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_UGE result.
bool isKnownNeverNaN() const
Return true if it's known this can never be a nan.
static LLVM_ABI KnownFPClass bitcast(const fltSemantics &FltSemantics, const KnownBits &Bits)
Report known values for a bitcast into a float with provided semantics.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
std::optional< unsigned > fallbackAddressSpace
This structure contains all information that is necessary for lowering calls.
SDValue ConvergenceControlToken
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
SmallVector< SDValue, 32 > OutVals
bool isBeforeLegalize() const