29#include "llvm/IR/IntrinsicsAMDGPU.h"
40#define DEBUG_TYPE "amdgpu-codegenprepare"
48 "amdgpu-codegenprepare-widen-constant-loads",
49 cl::desc(
"Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
54 BreakLargePHIs(
"amdgpu-codegenprepare-break-large-phis",
55 cl::desc(
"Break large PHI nodes for DAGISel"),
59 ForceBreakLargePHIs(
"amdgpu-codegenprepare-force-break-large-phis",
60 cl::desc(
"For testing purposes, always break large "
61 "PHIs even if it isn't profitable."),
65 "amdgpu-codegenprepare-break-large-phis-threshold",
66 cl::desc(
"Minimum type size in bits for breaking large PHI nodes"),
70 "amdgpu-codegenprepare-mul24",
71 cl::desc(
"Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
77 "amdgpu-codegenprepare-expand-div64",
78 cl::desc(
"Expand 64-bit division in AMDGPUCodeGenPrepare"),
85 "amdgpu-codegenprepare-disable-idiv-expansion",
86 cl::desc(
"Prevent expanding integer division in AMDGPUCodeGenPrepare"),
92 "amdgpu-codegenprepare-disable-fdiv-expansion",
93 cl::desc(
"Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
97class AMDGPUCodeGenPrepareImpl
98 :
public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged =
false;
109 mutable Function *SqrtF32 =
nullptr;
110 mutable Function *LdexpF32 =
nullptr;
119 DL(
F.getDataLayout()), SQ(
DL, TLI, DT, AC),
129 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
139 F.getParent(), Intrinsic::ldexp,
140 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
144 bool canBreakPHINode(
const PHINode &
I);
147 bool isLegalFloatingTy(
const Type *
T)
const;
156 bool canIgnoreDenormalInput(
const Value *V,
const Instruction *CtxI)
const {
157 return HasFP32DenormalFlush ||
182 unsigned MaxDivBits,
bool Signed)
const;
187 bool IsDiv,
bool IsSigned)
const;
191 bool IsDiv,
bool IsSigned)
const;
209 bool canWidenScalarExtLoad(
LoadInst &
I)
const;
223 float ReqdAccuracy)
const;
228 float ReqdAccuracy)
const;
230 std::pair<Value *, Value *> getFrexpResults(
IRBuilder<> &Builder,
234 bool IsNegative)
const;
241 bool IsNegative)
const;
245 void replaceWithMaskedWorkitemIdX(
Instruction &
I,
unsigned WaveSize)
const;
246 bool tryReplaceWithWorkitemId(
Instruction &
I,
unsigned Wave)
const;
279 if (!ExpandDiv64InIR)
283 StringRef getPassName()
const override {
return "AMDGPU IR optimizations"; }
288bool AMDGPUCodeGenPrepareImpl::run() {
289 BreakPhiNodesCache.clear();
290 bool MadeChange =
false;
302 while (!DeadVals.empty()) {
310bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(
const Type *Ty)
const {
312 (Ty->
isHalfTy() && ST.has16BitInsts());
315bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &
I)
const {
316 Type *Ty =
I.getType();
317 int TySize =
DL.getTypeSizeInBits(Ty);
318 Align Alignment =
DL.getValueOrABITypeAlignment(
I.getAlign(), Ty);
320 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.
isUniform(&
I);
324AMDGPUCodeGenPrepareImpl::numBitsUnsigned(
Value *
Op,
325 const Instruction *CtxI)
const {
330AMDGPUCodeGenPrepareImpl::numBitsSigned(
Value *
Op,
331 const Instruction *CtxI)
const {
343 for (
int I = 0,
E = VT->getNumElements();
I !=
E; ++
I)
344 Values.
push_back(Builder.CreateExtractElement(V,
I));
350 if (!Ty->isVectorTy()) {
356 for (
int I = 0,
E = Values.
size();
I !=
E; ++
I)
357 NewVal = Builder.CreateInsertElement(NewVal, Values[
I],
I);
362bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &
I)
const {
363 if (
I.getOpcode() != Instruction::Mul)
366 Type *Ty =
I.getType();
368 if (
Size <= 16 && ST.has16BitInsts())
378 Builder.SetCurrentDebugLocation(
I.getDebugLoc());
380 unsigned LHSBits = 0, RHSBits = 0;
381 bool IsSigned =
false;
383 if (ST.
hasMulU24() && (LHSBits = numBitsUnsigned(
LHS, &
I)) <= 24 &&
384 (RHSBits = numBitsUnsigned(
RHS, &
I)) <= 24) {
387 }
else if (ST.
hasMulI24() && (LHSBits = numBitsSigned(
LHS, &
I)) <= 24 &&
388 (RHSBits = numBitsSigned(
RHS, &
I)) <= 24) {
394 SmallVector<Value *, 4> LHSVals;
395 SmallVector<Value *, 4> RHSVals;
396 SmallVector<Value *, 4> ResultVals;
400 IntegerType *I32Ty = Builder.getInt32Ty();
401 IntegerType *IntrinTy =
Size > 32 ? Builder.getInt64Ty() : I32Ty;
402 Type *DstTy = LHSVals[0]->getType();
404 for (
int I = 0,
E = LHSVals.
size();
I !=
E; ++
I) {
405 Value *
LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[
I], I32Ty)
406 : Builder.CreateZExtOrTrunc(LHSVals[
I], I32Ty);
407 Value *
RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[
I], I32Ty)
408 : Builder.CreateZExtOrTrunc(RHSVals[
I], I32Ty);
410 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
412 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
413 : Builder.CreateZExtOrTrunc(Result, DstTy);
419 I.replaceAllUsesWith(NewVal);
420 DeadVals.push_back(&
I);
440bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO)
const {
461 if (!CBO || !CT || !CF)
488 Builder.setFastMathFlags(FPOp->getFastMathFlags());
494 DeadVals.push_back(&BO);
496 DeadVals.push_back(CastOp);
497 DeadVals.push_back(Sel);
501std::pair<Value *, Value *>
502AMDGPUCodeGenPrepareImpl::getFrexpResults(
IRBuilder<> &Builder,
504 Type *Ty = Src->getType();
517 : Builder.CreateExtractValue(Frexp, {1});
518 return {FrexpMant, FrexpExp};
524 bool IsNegative)
const {
539 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
542 return Builder.
CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
548 FastMathFlags FMF)
const {
552 if (HasFP32DenormalFlush && ST.
hasFractBug() && !ST.hasFastFMAF32() &&
558 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder,
RHS);
563 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder,
LHS);
575 FastMathFlags FMF)
const {
576 Type *Ty = Src->getType();
580 Builder.
CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
583 Value *InputScaleFactor =
590 Value *OutputScaleFactor =
592 return Builder.
CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
603 Type *Ty = Src->getType();
607 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
608 Constant *One = ConstantFP::get(Ty, 1.0);
609 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
611 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
613 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
615 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
616 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
617 Value *OutputScaleFactor = Builder.CreateSelect(
618 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
620 return Builder.CreateFMul(Rsq, OutputScaleFactor);
626 FastMathFlags SqrtFMF,
627 FastMathFlags DivFMF,
628 const Instruction *CtxI,
629 bool IsNegative)
const {
651 bool MaybePosInf = !SqrtFMF.
noInfs() && !DivFMF.
noInfs();
652 bool MaybeZero = !DivFMF.
noInfs();
654 DenormalMode DenormMode;
661 if (Interested !=
fcNone) {
666 DenormMode =
F.getDenormalMode(
X->getType()->getFltSemantics());
672 if (MaybeZero || MaybePosInf) {
674 if (MaybePosInf && MaybeZero) {
675 if (DenormMode.
Input != DenormalMode::DenormalModeKind::Dynamic) {
690 }
else if (MaybeZero) {
703 Value *
E = Builder.
CreateFMA(NegXY0, Y0, ConstantFP::get(
X->getType(), 1.0));
708 ConstantFP::get(
X->getType(), 0.5));
710 return Builder.
CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
713bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
714 FastMathFlags SqrtFMF)
const {
720Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
722 const FastMathFlags SqrtFMF,
const Instruction *CtxI)
const {
733 bool IsNegative =
false;
738 IRBuilder<>::FastMathFlagGuard Guard(Builder);
743 canIgnoreDenormalInput(Den, CtxI)) {
754 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
768 Value *Den, FastMathFlags FMF,
769 const Instruction *CtxI)
const {
776 bool IsNegative =
false;
781 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
802 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
811 if (HasFP32DenormalFlush || FMF.
approxFunc()) {
816 Value *Recip = emitRcpIEEE1ULP(Builder, Den,
false);
830Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
833 if (ReqdAccuracy < 2.5f)
839 bool NumIsOne =
false;
841 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
849 if (!HasFP32DenormalFlush && !NumIsOne)
852 return Builder.
CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
855Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
857 FastMathFlags SqrtFMF,
Value *RsqOp,
const Instruction *FDivInst,
858 float ReqdDivAccuracy)
const {
861 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
869 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
877 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
881 return emitFrexpDiv(Builder, Num, Den, DivFMF);
899bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
900 if (DisableFDivExpand)
915 FastMathFlags SqrtFMF;
920 Value *RsqOp =
nullptr;
922 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
923 DenII->hasOneUse()) {
925 SqrtFMF = SqrtOp->getFastMathFlags();
926 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
927 RsqOp = SqrtOp->getOperand(0);
931 if (!IsFloat && !RsqOp)
943 const bool AllowInaccurateRcp = DivFMF.
approxFunc();
944 if (!RsqOp && AllowInaccurateRcp)
948 if (IsFloat && ReqdAccuracy < 1.0f)
955 SmallVector<Value *, 4> NumVals;
956 SmallVector<Value *, 4> DenVals;
957 SmallVector<Value *, 4> RsqDenVals;
964 SmallVector<Value *, 4> ResultVals(NumVals.
size());
965 for (
int I = 0,
E = NumVals.
size();
I !=
E; ++
I) {
966 Value *NumElt = NumVals[
I];
967 Value *DenElt = DenVals[
I];
968 Value *RsqDenElt = RsqOp ? RsqDenVals[
I] :
nullptr;
971 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
980 NewEltInst->copyMetadata(FDiv);
983 ResultVals[
I] = NewElt;
991 DeadVals.push_back(&FDiv);
1002 Value *LHS_EXT64 = Builder.CreateZExt(
LHS, I64Ty);
1003 Value *RHS_EXT64 = Builder.CreateZExt(
RHS, I64Ty);
1004 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1005 Value *
Lo = Builder.CreateTrunc(MUL64, I32Ty);
1006 Value *
Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1007 Hi = Builder.CreateTrunc(
Hi, I32Ty);
1008 return std::pair(
Lo,
Hi);
1019unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &
I,
Value *Num,
1021 unsigned MaxDivBits,
1022 bool IsSigned)
const {
1029 unsigned DivBits = SSBits - RHSSignBits + 1;
1030 if (DivBits > MaxDivBits)
1035 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1036 DivBits = SSBits - SignBits + 1;
1046 unsigned DivBits = SSBits - RHSSignBits;
1047 if (DivBits > MaxDivBits)
1055 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1056 DivBits = SSBits - SignBits;
1063 BinaryOperator &
I,
Value *Num,
1064 Value *Den,
bool IsDiv,
1065 bool IsSigned)
const {
1066 unsigned DivBits = getDivNumBits(
I, Num, Den, 24, IsSigned);
1069 return expandDivRem24Impl(Builder,
I, Num, Den, DivBits, IsDiv, IsSigned);
1072Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1074 unsigned DivBits,
bool IsDiv,
bool IsSigned)
const {
1080 ConstantInt *One = Builder.
getInt32(1);
1120 auto FMAD = !ST.hasMadMacF32Insts()
1124 {FQNeg->
getType()}, {FQNeg, FB, FA}, FQ);
1152 if (DivBits != 0 && DivBits < 32) {
1155 int InRegBits = 32 - DivBits;
1157 Res = Builder.
CreateShl(Res, InRegBits);
1160 ConstantInt *TruncMask
1161 = Builder.
getInt32((UINT64_C(1) << DivBits) - 1);
1162 Res = Builder.
CreateAnd(Res, TruncMask);
1173bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &
I,
1179 if (
C->getType()->getScalarSizeInBits() <= 32)
1195 if (BinOpDen->getOpcode() == Instruction::Shl &&
1213 return Builder.CreateAShr(V, Builder.getInt32(31));
1220 assert(
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1221 Opc == Instruction::SRem ||
Opc == Instruction::SDiv);
1227 if (divHasSpecialOptimization(
I,
X,
Y))
1230 bool IsDiv =
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv;
1231 bool IsSigned =
Opc == Instruction::SRem ||
Opc == Instruction::SDiv;
1233 Type *Ty =
X->getType();
1247 if (
Value *Res = expandDivRem24(Builder,
I,
X,
Y, IsDiv, IsSigned)) {
1253 ConstantInt *One = Builder.
getInt32(1);
1255 Value *Sign =
nullptr;
1260 Sign = IsDiv ? Builder.
CreateXor(SignX, SignY) : SignX;
1341 BinaryOperator &
I,
Value *Num,
1343 if (!ExpandDiv64InIR && divHasSpecialOptimization(
I, Num, Den))
1348 bool IsDiv =
Opc == Instruction::SDiv ||
Opc == Instruction::UDiv;
1349 bool IsSigned =
Opc == Instruction::SDiv ||
Opc == Instruction::SRem;
1351 unsigned NumDivBits = getDivNumBits(
I, Num, Den, 32, IsSigned);
1352 if (NumDivBits > 32)
1355 Value *Narrowed =
nullptr;
1356 if (NumDivBits <= 24) {
1357 Narrowed = expandDivRem24Impl(Builder,
I, Num, Den, NumDivBits,
1359 }
else if (NumDivBits <= 32) {
1360 Narrowed = expandDivRem32(Builder,
I, Num, Den);
1371void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &
I)
const {
1374 if (
Opc == Instruction::UDiv ||
Opc == Instruction::SDiv) {
1379 if (
Opc == Instruction::URem ||
Opc == Instruction::SRem) {
1399bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *
I) {
1400 unsigned Opc =
I->getOpcode();
1401 Type *OldType =
I->getType();
1403 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1408 if (
Opc != Instruction::Add &&
Opc != Instruction::Mul)
1410 "Instruction::Mul.");
1414 MaxBitsNeeded = std::max<unsigned>(
bit_ceil(MaxBitsNeeded), 8);
1415 Type *NewType =
DL.getSmallestLegalIntType(
I->getContext(), MaxBitsNeeded);
1419 if (NewBit >= OrigBit)
1431 int NumOfNonConstOps = 2;
1434 NumOfNonConstOps = 1;
1444 if (NewCost >= OldCost)
1455 DeadVals.push_back(
I);
1459bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &
I) {
1460 if (foldBinOpIntoSelect(
I))
1463 if (UseMul24Intrin && replaceMulWithMul24(
I))
1465 if (tryNarrowMathIfNoOverflow(&
I))
1470 Type *Ty =
I.getType();
1471 Value *NewDiv =
nullptr;
1476 if ((
Opc == Instruction::URem ||
Opc == Instruction::UDiv ||
1477 Opc == Instruction::SRem ||
Opc == Instruction::SDiv) &&
1479 !DisableIDivExpand) {
1480 Value *Num =
I.getOperand(0);
1481 Value *Den =
I.getOperand(1);
1488 for (
unsigned N = 0,
E = VT->getNumElements();
N !=
E; ++
N) {
1493 if (ScalarSize <= 32) {
1494 NewElt = expandDivRem32(Builder,
I, NumEltN, DenEltN);
1500 NewElt = shrinkDivRem64(Builder,
I, NumEltN, DenEltN);
1514 NewEltI->copyIRFlags(&
I);
1519 if (ScalarSize <= 32)
1520 NewDiv = expandDivRem32(Builder,
I, Num, Den);
1522 NewDiv = shrinkDivRem64(Builder,
I, Num, Den);
1529 I.replaceAllUsesWith(NewDiv);
1530 DeadVals.push_back(&
I);
1535 if (ExpandDiv64InIR) {
1537 for (BinaryOperator *Div : Div64ToExpand) {
1538 expandDivRem64(*Div);
1547bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &
I) {
1553 canWidenScalarExtLoad(
I)) {
1563 if (
auto *
Range =
WidenLoad->getMetadata(LLVMContext::MD_range)) {
1564 ConstantInt *
Lower =
1567 if (
Lower->isNullValue()) {
1568 WidenLoad->setMetadata(LLVMContext::MD_range,
nullptr);
1576 WidenLoad->setMetadata(LLVMContext::MD_range,
1581 int TySize =
DL.getTypeSizeInBits(
I.getType());
1586 DeadVals.push_back(&
I);
1593bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &
I) {
1598 CmpPredicate IsNanPred;
1611 Value *Fract =
nullptr;
1612 if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
1613 CmpVal == matchFractPat(*FalseVal)) {
1615 Fract = applyFractPat(Builder, CmpVal);
1616 }
else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
1617 if (CmpVal == matchFractPat(*TrueVal)) {
1619 Fract = applyFractPat(Builder, CmpVal);
1623 CmpPredicate PredInf;
1629 PredInf != FCmpInst::FCMP_UNE || CmpVal != matchFractPat(*IfNotInf))
1639 Value *NewFract = applyFractPat(Builder, CmpVal);
1643 DeadVals.push_back(ClampInfSelect->
getOperand(1));
1647 Fract = ClampInfSelect;
1653 I.replaceAllUsesWith(Fract);
1654 DeadVals.push_back(&
I);
1661 return IA && IB && IA->getParent() == IB->getParent();
1671 const Value *CurVal = V;
1674 BitVector EltsCovered(FVT->getNumElements());
1681 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1684 const auto *VecSrc = IE->getOperand(0);
1693 EltsCovered.
set(Idx->getZExtValue());
1696 if (EltsCovered.
all())
1723 const auto [It, Inserted] = SeenPHIs.
insert(&
I);
1727 for (
const Value *Inc :
I.incoming_values()) {
1732 for (
const User *U :
I.users()) {
1738bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(
const PHINode &
I) {
1740 if (
const auto It = BreakPhiNodesCache.find(&
I);
1741 It != BreakPhiNodesCache.end())
1750 SmallPtrSet<const PHINode *, 8> WorkList;
1756 for (
const PHINode *WLP : WorkList) {
1757 assert(BreakPhiNodesCache.count(WLP) == 0);
1772 const auto Threshold = (
alignTo(WorkList.size() * 2, 3) / 3);
1773 unsigned NumBreakablePHIs = 0;
1774 bool CanBreak =
false;
1775 for (
const PHINode *Cur : WorkList) {
1783 if (++NumBreakablePHIs >= Threshold) {
1790 for (
const PHINode *Cur : WorkList)
1791 BreakPhiNodesCache[Cur] = CanBreak;
1840 Value *&Res = SlicedVals[{BB, Inc}];
1846 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1852 Res =
B.CreateShuffleVector(Inc, Mask, NewValName);
1854 Res =
B.CreateExtractElement(Inc,
Idx, NewValName);
1863bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &
I) {
1879 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1882 if (!ForceBreakLargePHIs && !canBreakPHINode(
I))
1885 std::vector<VectorSlice> Slices;
1892 const unsigned EltSize =
DL.getTypeSizeInBits(EltTy);
1894 if (EltSize == 8 || EltSize == 16) {
1895 const unsigned SubVecSize = (32 / EltSize);
1897 for (
unsigned End =
alignDown(NumElts, SubVecSize); Idx < End;
1899 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1903 for (; Idx < NumElts; ++Idx)
1904 Slices.emplace_back(EltTy, Idx, 1);
1907 assert(Slices.size() > 1);
1913 B.SetCurrentDebugLocation(
I.getDebugLoc());
1915 unsigned IncNameSuffix = 0;
1916 for (VectorSlice &S : Slices) {
1919 B.SetInsertPoint(
I.getParent()->getFirstNonPHIIt());
1920 S.NewPHI =
B.CreatePHI(S.Ty,
I.getNumIncomingValues());
1922 for (
const auto &[Idx, BB] :
enumerate(
I.blocks())) {
1923 S.NewPHI->addIncoming(S.getSlicedVal(BB,
I.getIncomingValue(Idx),
1924 "largephi.extractslice" +
1925 std::to_string(IncNameSuffix++)),
1932 unsigned NameSuffix = 0;
1933 for (VectorSlice &S : Slices) {
1934 const auto ValName =
"largephi.insertslice" + std::to_string(NameSuffix++);
1936 Vec =
B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1938 Vec =
B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1941 I.replaceAllUsesWith(Vec);
1942 DeadVals.push_back(&
I);
1965 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1984 assert(SrcPtrKB.getBitWidth() ==
DL.getPointerSizeInBits(AS));
1985 assert((NullVal == 0 || NullVal == -1) &&
1986 "don't know how to check for this null value!");
1987 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1990bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &
I) {
1994 if (
I.getType()->isVectorTy())
1999 const unsigned SrcAS =
I.getSrcAddressSpace();
2000 const unsigned DstAS =
I.getDestAddressSpace();
2002 bool CanLower =
false;
2020 auto *Intrin =
B.CreateIntrinsic(
2021 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2022 I.replaceAllUsesWith(Intrin);
2023 DeadVals.push_back(&
I);
2027bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &
I) {
2030 case Intrinsic::minnum:
2031 case Intrinsic::minimumnum:
2032 case Intrinsic::minimum:
2033 return visitFMinLike(
I);
2034 case Intrinsic::sqrt:
2035 return visitSqrt(
I);
2036 case Intrinsic::log:
2037 case Intrinsic::log10:
2039 case Intrinsic::log2:
2042 case Intrinsic::amdgcn_mbcnt_lo:
2043 return visitMbcntLo(
I);
2044 case Intrinsic::amdgcn_mbcnt_hi:
2045 return visitMbcntHi(
I);
2058Value *AMDGPUCodeGenPrepareImpl::matchFractPat(
Value &V) {
2070 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
2071 IID != Intrinsic::minimumnum)
2074 Type *Ty =
V.getType();
2078 Value *Arg0 =
II->getArgOperand(0);
2079 Value *Arg1 =
II->getArgOperand(1);
2086 OneNextDown.
next(
true);
2089 if (OneNextDown != *
C)
2101 SmallVector<Value *, 4> FractVals;
2104 SmallVector<Value *, 4> ResultVals(FractVals.
size());
2107 for (
unsigned I = 0,
E = FractVals.
size();
I !=
E; ++
I) {
2115bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &
I) {
2116 Value *FractArg = matchFractPat(
I);
2126 FastMathFlags FMF =
I.getFastMathFlags();
2130 Value *Fract = applyFractPat(Builder, FractArg);
2132 I.replaceAllUsesWith(Fract);
2133 DeadVals.push_back(&
I);
2138bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2154 if (ReqdAccuracy < 1.0f)
2158 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2162 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2166 SmallVector<Value *, 4> SrcVals;
2169 SmallVector<Value *, 4> ResultVals(SrcVals.
size());
2170 for (
int I = 0,
E = SrcVals.
size();
I !=
E; ++
I) {
2172 ResultVals[
I] = Builder.
CreateCall(getSqrtF32(), SrcVals[
I]);
2174 ResultVals[
I] = emitSqrtIEEE2ULP(Builder, SrcVals[
I], SqrtFMF);
2180 DeadVals.push_back(&Sqrt);
2185bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2191 FastMathFlags FMF =
Log.getFastMathFlags();
2198 if (
Log.getFPAccuracy() < 1.80f)
2209 double Log2BaseInverted =
2216 Log.replaceAllUsesWith(
Mul);
2217 DeadVals.push_back(&Log);
2221bool AMDGPUCodeGenPrepare::runOnFunction(Function &
F) {
2222 if (skipFunction(
F))
2225 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2229 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2230 const TargetLibraryInfo *TLI =
2231 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
F);
2232 AssumptionCache *AC =
2233 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
F);
2234 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2235 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() :
nullptr;
2237 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2238 return AMDGPUCodeGenPrepareImpl(
F, TM, TLI, AC, DT, UA).run();
2248 AMDGPUCodeGenPrepareImpl Impl(
F, ATM, TLI, AC, DT, UA);
2252 if (!Impl.FlowChanged)
2258 "AMDGPU IR optimizations",
false,
false)
2267 CallInst *Tid =
B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2268 ST.makeLIDRangeMetadata(Tid);
2273void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &
I)
const {
2275 CallInst *Tid = createWorkitemIdX(
B);
2281void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2282 Instruction &
I,
unsigned WaveSize)
const {
2284 CallInst *Tid = createWorkitemIdX(
B);
2286 Value *AndInst =
B.CreateAnd(Tid, Mask);
2294bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &
I,
2295 unsigned Wave)
const {
2302 if (*MaybeX == Wave) {
2303 replaceWithWorkitemIdX(
I);
2310 replaceWithMaskedWorkitemIdX(
I, Wave);
2318bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &
I)
const {
2334bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &
I)
const {
2347 if (*MaybeX == Wave) {
2358 using namespace PatternMatch;
2366 return tryReplaceWithWorkitemId(
I, Wave);
2369char AMDGPUCodeGenPrepare::ID = 0;
2372 return new AMDGPUCodeGenPrepare();
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runOnFunction(Function &F, bool PostInlining)
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
opStatus next(bool nextDown)
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
InstListType::iterator iterator
Instruction iterators...
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
BinaryOps getOpcode() const
bool all() const
all - Returns true if all bits are set.
Represents analyses that only rely on functions' control flow.
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static LLVM_ABI Constant * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is an important base class in LLVM.
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Analysis pass which computes a DominatorTree.
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Utility class for floating point operations which can have information about relaxed accuracy require...
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
void setFast(bool B=true)
bool allowReciprocal() const
void setNoNaNs(bool B=true)
bool allowContract() const
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
FunctionPass class - This class is used to implement most global optimizations.
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
This provides a uniform API for creating instructions and inserting them into a basic block: either at the end of a BasicBlock, or at a specific iterator location in a block.
Base class for instruction visitors.
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication based on the SmallVector 'N' template parameter.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
The instances of the Type class are immutable: once they are created, they are never changed.
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old number of lanes.
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
LLVM_ABI const fltSemantics & getFltSemantics() const
void setOperand(unsigned i, Value *Val)
Value * getOperand(unsigned i) const
LLVM Value Representation.
Type * getType() const
All values are typed, get the type of this value.
bool hasOneUse() const
Return true if there is exactly one use of this value.
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Type * getElementType() const
const ParentTy * getParent() const
self_iterator getIterator()
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
@ C
The default llvm calling convention, compatible with C.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B, ..., Z), so that A is the 0-based index of the item in the sequence, and B, ..., Z are the values from the original input ranges.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting iterators.
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the original instruction.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
auto dyn_cast_or_null(const Y &Val)
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
auto reverse(ContainerTy &&C)
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOne bit sets.
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference sizeof(SmallVector&lt;T, 0&gt;).
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has no NaN elements.
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instructions.
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environment.
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
bool isNegative() const
Returns true if this value is known to be negative.
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
SimplifyQuery getWithInstruction(const Instruction *I) const