1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
18#include "llvm/ADT/SetVector.h"
26#include "llvm/IR/Dominators.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/InstVisitor.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include "llvm/IR/ValueHandle.h"
33#include "llvm/Pass.h"
38
39#define DEBUG_TYPE "amdgpu-codegenprepare"
40
41using namespace llvm;
42using namespace llvm::PatternMatch;
43
44namespace {
45
47 "amdgpu-codegenprepare-widen-constant-loads",
48 cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
50 cl::init(false));
51
52static cl::opt<bool>
53 BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
54 cl::desc("Break large PHI nodes for DAGISel"),
55 cl::ReallyHidden, cl::init(true));
56
57static cl::opt<bool>
58 ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
59 cl::desc("For testing purposes, always break large "
60 "PHIs even if it isn't profitable."),
61 cl::ReallyHidden, cl::init(false));
62
63static cl::opt<unsigned> BreakLargePHIsThreshold(
64 "amdgpu-codegenprepare-break-large-phis-threshold",
65 cl::desc("Minimum type size in bits for breaking large PHI nodes"),
66 cl::ReallyHidden, cl::init(32));
67
68static cl::opt<bool> UseMul24Intrin(
69 "amdgpu-codegenprepare-mul24",
70 cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
72 cl::init(true));
73
74// Legalize 64-bit division by using the generic IR expansion.
75static cl::opt<bool> ExpandDiv64InIR(
76 "amdgpu-codegenprepare-expand-div64",
77 cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
79 cl::init(false));
80
81// Leave all division operations as they are. This supersedes ExpandDiv64InIR
82// and is used for testing the legalizer.
83static cl::opt<bool> DisableIDivExpand(
84 "amdgpu-codegenprepare-disable-idiv-expansion",
85 cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
87 cl::init(false));
88
89// Disable processing of fdiv so we can better test the backend implementations.
90static cl::opt<bool> DisableFDivExpand(
91 "amdgpu-codegenprepare-disable-fdiv-expansion",
92 cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
94 cl::init(false));
95
96class AMDGPUCodeGenPrepareImpl
97 : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
98public:
99 Function &F;
100 const GCNSubtarget &ST;
101 const AMDGPUTargetMachine &TM;
102 const TargetLibraryInfo *TLI;
103 AssumptionCache *AC;
104 const DominatorTree *DT;
105 const UniformityInfo &UA;
106 const DataLayout &DL;
107 const bool HasFP32DenormalFlush;
108 bool FlowChanged = false;
109 mutable Function *SqrtF32 = nullptr;
110 mutable Function *LdexpF32 = nullptr;
111 mutable SmallVector<WeakVH> DeadVals;
112
113 DenseMap<const PHINode *, bool> BreakPhiNodesCache;
114
115 AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
116 const TargetLibraryInfo *TLI, AssumptionCache *AC,
117 const DominatorTree *DT, const UniformityInfo &UA)
118 : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), AC(AC),
119 DT(DT), UA(UA), DL(F.getDataLayout()),
120 HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
121 DenormalMode::getPreserveSign()) {}
122
123 Function *getSqrtF32() const {
124 if (SqrtF32)
125 return SqrtF32;
126
127 LLVMContext &Ctx = F.getContext();
128 SqrtF32 = Intrinsic::getOrInsertDeclaration(
129 F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
130 return SqrtF32;
131 }
132
133 Function *getLdexpF32() const {
134 if (LdexpF32)
135 return LdexpF32;
136
137 LLVMContext &Ctx = F.getContext();
138 LdexpF32 = Intrinsic::getOrInsertDeclaration(
139 F.getParent(), Intrinsic::ldexp,
140 {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
141 return LdexpF32;
142 }
143
144 bool canBreakPHINode(const PHINode &I);
145
146 /// Return true if \p T is a legal scalar floating point type.
147 bool isLegalFloatingTy(const Type *T) const;
148
149 /// Wrapper to pass all the arguments to computeKnownFPClass
150 KnownFPClass computeKnownFPClass(const Value *V, FPClassTest Interested,
151 const Instruction *CtxI) const {
152 return llvm::computeKnownFPClass(V, DL, Interested, TLI, AC, CtxI, DT);
153 }
154
155 bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
156 return HasFP32DenormalFlush ||
157 computeKnownFPClass(V, fcSubnormal, CtxI).isKnownNeverSubnormal();
158 }
159
160 /// \returns The minimum number of bits needed to store the value of \p Op as an
161 /// unsigned integer. Truncating to this size and then zero-extending to
162 /// the original size will not change the value.
163 unsigned numBitsUnsigned(Value *Op) const;
164
165 /// \returns The minimum number of bits needed to store the value of \p Op as a
166 /// signed integer. Truncating to this size and then sign-extending to
167 /// the original size will not change the value.
168 unsigned numBitsSigned(Value *Op) const;
169
170 /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
171 /// The operands must be proven by known-bits analysis to fit in 24 bits; doing
172 /// this in IR avoids relying on SelectionDAG to recover that fact later.
172 bool replaceMulWithMul24(BinaryOperator &I) const;
173
174 /// Perform the same fold as the equivalently named function in DAGCombiner.
175 /// Since we expand some divisions here, do this first so the expansion does not obscure the pattern.
176 bool foldBinOpIntoSelect(BinaryOperator &I) const;
177
178 bool divHasSpecialOptimization(BinaryOperator &I,
179 Value *Num, Value *Den) const;
180 unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
181 unsigned MaxDivBits, bool Signed) const;
182
183 /// Expands 24 bit div or rem.
184 Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
185 Value *Num, Value *Den,
186 bool IsDiv, bool IsSigned) const;
187
188 Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
189 Value *Num, Value *Den, unsigned NumBits,
190 bool IsDiv, bool IsSigned) const;
191
192 /// Expands 32 bit div or rem.
193 Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
194 Value *Num, Value *Den) const;
195
196 Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
197 Value *Num, Value *Den) const;
198 void expandDivRem64(BinaryOperator &I) const;
199
200 /// Widen a scalar load.
201 ///
202 /// \details Widen a uniform, small-type load from constant memory to a full
203 /// 32 bits, then truncate the loaded value so that a scalar load can be
204 /// selected instead of a vector load.
205 ///
206 /// \returns True.
207
208 bool canWidenScalarExtLoad(LoadInst &I) const;
209
210 Value *matchFractPat(IntrinsicInst &I);
211 Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);
212
213 bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;
214
215 Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
216 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
217 const Instruction *CtxI) const;
218
219 Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
220 FastMathFlags FMF, const Instruction *CtxI) const;
221 Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
222 float ReqdAccuracy) const;
223
224 Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
225 FastMathFlags DivFMF, FastMathFlags SqrtFMF,
226 Value *RsqOp, const Instruction *FDiv,
227 float ReqdAccuracy) const;
228
229 std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
230 Value *Src) const;
231
232 Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
233 bool IsNegative) const;
234 Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
235 FastMathFlags FMF) const;
236 Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
237 FastMathFlags FMF) const;
238 Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
239 FastMathFlags DivFMF, const Instruction *CtxI,
240 bool IsNegative) const;
241
242 bool tryNarrowMathIfNoOverflow(Instruction *I);
243
244public:
245 bool visitFDiv(BinaryOperator &I);
246
247 bool visitInstruction(Instruction &I) { return false; }
248 bool visitBinaryOperator(BinaryOperator &I);
249 bool visitLoadInst(LoadInst &I);
250 bool visitSelectInst(SelectInst &I);
251 bool visitPHINode(PHINode &I);
252 bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);
253
254 bool visitIntrinsicInst(IntrinsicInst &I);
255 bool visitFMinLike(IntrinsicInst &I);
256 bool visitSqrt(IntrinsicInst &I);
257 bool run();
258};
259
260class AMDGPUCodeGenPrepare : public FunctionPass {
261public:
262 static char ID;
263 AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
264 void getAnalysisUsage(AnalysisUsage &AU) const override {
265 AU.addRequired<AssumptionCacheTracker>();
266 AU.addRequired<UniformityInfoWrapperPass>();
267 AU.addRequired<TargetLibraryInfoWrapperPass>();
268
269 // FIXME: Division expansion needs to preserve the dominator tree.
270 if (!ExpandDiv64InIR)
271 AU.setPreservesAll();
272 }
273 bool runOnFunction(Function &F) override;
274 StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
275};
276
277} // end anonymous namespace
278
279bool AMDGPUCodeGenPrepareImpl::run() {
280 BreakPhiNodesCache.clear();
281 bool MadeChange = false;
282
283 // Need to use make_early_inc_range because integer division expansion is
284 // handled by Transform/Utils, and it can delete instructions such as the
285 // terminator of the BB.
286 for (BasicBlock &BB : reverse(F)) {
287 for (Instruction &I : make_early_inc_range(reverse(BB))) {
288 if (!isInstructionTriviallyDead(&I, TLI))
289 MadeChange |= visit(I);
290 }
291 }
292
293 while (!DeadVals.empty()) {
294 if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
295 RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
296 }
297
298 return MadeChange;
299}
300
301bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
302 return Ty->isFloatTy() || Ty->isDoubleTy() ||
303 (Ty->isHalfTy() && ST.has16BitInsts());
304}
305
306bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
307 Type *Ty = I.getType();
308 int TySize = DL.getTypeSizeInBits(Ty);
309 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
310
311 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
312}
313
314unsigned AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op) const {
315 return computeKnownBits(Op, DL, AC).countMaxActiveBits();
316}
317
318unsigned AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op) const {
319 return ComputeMaxSignificantBits(Op, DL, AC);
320}
321
322static void extractValues(IRBuilder<> &Builder,
323 SmallVectorImpl<Value *> &Values, Value *V) {
324 auto *VT = dyn_cast<FixedVectorType>(V->getType());
325 if (!VT) {
326 Values.push_back(V);
327 return;
328 }
329
330 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
331 Values.push_back(Builder.CreateExtractElement(V, I));
332}
333
334static Value *insertValues(IRBuilder<> &Builder,
335 Type *Ty,
336 SmallVectorImpl<Value *> &Values) {
337 if (!Ty->isVectorTy()) {
338 assert(Values.size() == 1);
339 return Values[0];
340 }
341
342 Value *NewVal = PoisonValue::get(Ty);
343 for (int I = 0, E = Values.size(); I != E; ++I)
344 NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);
345
346 return NewVal;
347}
348
349bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
350 if (I.getOpcode() != Instruction::Mul)
351 return false;
352
353 Type *Ty = I.getType();
354 unsigned Size = Ty->getScalarSizeInBits();
355 if (Size <= 16 && ST.has16BitInsts())
356 return false;
357
358 // Prefer scalar if this could be s_mul_i32
359 if (UA.isUniform(&I))
360 return false;
361
362 Value *LHS = I.getOperand(0);
363 Value *RHS = I.getOperand(1);
364 IRBuilder<> Builder(&I);
365 Builder.SetCurrentDebugLocation(I.getDebugLoc());
366
367 unsigned LHSBits = 0, RHSBits = 0;
368 bool IsSigned = false;
369
370 if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS)) <= 24 &&
371 (RHSBits = numBitsUnsigned(RHS)) <= 24) {
372 IsSigned = false;
373
374 } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS)) <= 24 &&
375 (RHSBits = numBitsSigned(RHS)) <= 24) {
376 IsSigned = true;
377
378 } else
379 return false;
380
381 SmallVector<Value *, 4> LHSVals;
382 SmallVector<Value *, 4> RHSVals;
383 SmallVector<Value *, 4> ResultVals;
384 extractValues(Builder, LHSVals, LHS);
385 extractValues(Builder, RHSVals, RHS);
386
387 IntegerType *I32Ty = Builder.getInt32Ty();
388 IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
389 Type *DstTy = LHSVals[0]->getType();
390
391 for (int I = 0, E = LHSVals.size(); I != E; ++I) {
392 Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
393 : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
394 Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
395 : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
396 Intrinsic::ID ID =
397 IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
398 Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
399 Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
400 : Builder.CreateZExtOrTrunc(Result, DstTy);
401 ResultVals.push_back(Result);
402 }
403
404 Value *NewVal = insertValues(Builder, Ty, ResultVals);
405 NewVal->takeName(&I);
406 I.replaceAllUsesWith(NewVal);
407 DeadVals.push_back(&I);
408
409 return true;
410}
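// Illustrative input/output for the rewrite above (the operands are assumed to
// be proven to fit in 24 bits and the multiply assumed divergent):
//
//   %r = mul i64 %a, %b
//     -->
//   %a.lo = trunc i64 %a to i32
//   %b.lo = trunc i64 %b to i32
//   %r    = call i64 @llvm.amdgcn.mul.u24.i64(i32 %a.lo, i32 %b.lo)
//
// For the signed form the operands are sign-extended/truncated instead and
// llvm.amdgcn.mul.i24 is used.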
411
412// Find a select instruction, which may have been cast. This is mostly to deal
413// with cases where i16 selects were promoted to i32 here.
414static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
415 Cast = nullptr;
416 if (SelectInst *Sel = dyn_cast<SelectInst>(V))
417 return Sel;
418
419 if ((Cast = dyn_cast<CastInst>(V))) {
420 if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
421 return Sel;
422 }
423
424 return nullptr;
425}
426
427bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
428 // Don't do this unless the old select is going away. We want to eliminate the
429 // binary operator, not replace a binop with a select.
430 int SelOpNo = 0;
431
432 CastInst *CastOp;
433
434 // TODO: Should probably try to handle some cases with multiple
435 // users. Duplicating the select may be profitable for division.
436 SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
437 if (!Sel || !Sel->hasOneUse()) {
438 SelOpNo = 1;
439 Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
440 }
441
442 if (!Sel || !Sel->hasOneUse())
443 return false;
444
445 Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
446 Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
447 Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
448 if (!CBO || !CT || !CF)
449 return false;
450
451 if (CastOp) {
452 if (!CastOp->hasOneUse())
453 return false;
454 CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
455 CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
456 }
457
458 // TODO: Handle special 0/-1 cases DAG combine does, although we only really
459 // need to handle divisions here.
460 Constant *FoldedT =
461 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
462 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
463 if (!FoldedT || isa<ConstantExpr>(FoldedT))
464 return false;
465
466 Constant *FoldedF =
467 SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
468 : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
469 if (!FoldedF || isa<ConstantExpr>(FoldedF))
470 return false;
471
472 IRBuilder<> Builder(&BO);
473 Builder.SetCurrentDebugLocation(BO.getDebugLoc());
474 if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
475 Builder.setFastMathFlags(FPOp->getFastMathFlags());
476
477 Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
478 FoldedT, FoldedF);
479 NewSelect->takeName(&BO);
480 BO.replaceAllUsesWith(NewSelect);
481 DeadVals.push_back(&BO);
482 if (CastOp)
483 DeadVals.push_back(CastOp);
484 DeadVals.push_back(Sel);
485 return true;
486}
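// Illustrative example of the fold above (constants chosen for exposition):
//
//   %sel = select i1 %c, i32 16, i32 8       ; single use
//   %div = udiv i32 1024, %sel
//     -->
//   %div = select i1 %c, i32 64, i32 128
//
// so the division disappears before the expensive expansion below runs.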
487
488std::pair<Value *, Value *>
489AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
490 Value *Src) const {
491 Type *Ty = Src->getType();
492 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
493 {Ty, Builder.getInt32Ty()}, Src);
494 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
495
496 // Bypass the bug workaround for the exponent result since it doesn't matter.
497 // TODO: Does the bug workaround even really need to consider the exponent
498 // result? It's unspecified by the spec.
499
500 Value *FrexpExp =
501 ST.hasFractBug()
502 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
503 {Builder.getInt32Ty(), Ty}, Src)
504 : Builder.CreateExtractValue(Frexp, {1});
505 return {FrexpMant, FrexpExp};
506}
507
508/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
509Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
510 Value *Src,
511 bool IsNegative) const {
512 // Same as for 1.0, but expand the sign out of the constant.
513 // -1.0 / x -> rcp (fneg x)
514 if (IsNegative)
515 Src = Builder.CreateFNeg(Src);
516
517 // The rcp instruction doesn't support denormals, so scale the input
518 // out of the denormal range and convert at the end.
519 //
520 // Expand as 2^-n * (1.0 / (x * 2^n))
521
522 // TODO: Skip scaling if input is known never denormal and the input
523 // range won't underflow to denormal. The hard part is knowing the
524 // result. We need a range check, the result could be denormal for
525 // 0x1p+126 < den <= 0x1p+127.
526 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
527 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
528 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
529 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
530}
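// The scaling above follows from
//
//   1/x = 1/(m * 2^e)          with (m, e) = frexp(x), 0.5 <= |m| < 1
//       = (1/m) * 2^-e
//       = ldexp(rcp(m), -e)
//
// so rcp never sees a denormal input and its result, with magnitude in (1, 2],
// cannot underflow no matter how small or large x is.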
531
532/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
533Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
534 Value *RHS,
535 FastMathFlags FMF) const {
536 // If we have to work around the fract/frexp bug, we're worse off than
537 // using the fdiv.fast expansion. The full safe expansion is faster if we have
538 // fast FMA.
539 if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
540 (!FMF.noNaNs() || !FMF.noInfs()))
541 return nullptr;
542
543 // We're scaling the LHS to avoid a denormal input, and scale the denominator
544 // to avoid large values underflowing the result.
545 auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);
546
547 Value *Rcp =
548 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);
549
550 auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
551 Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);
552
553 // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
554 // result.
555 Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
556 return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
557}
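// The sequence above follows from
//
//   a/b = (ma * 2^ea) / (mb * 2^eb)   with (ma, ea) = frexp(a), (mb, eb) = frexp(b)
//       = (ma * rcp(mb)) * 2^(ea - eb)
//       = ldexp(ma * rcp(mb), ea - eb)
//
// Both mantissas have magnitude in [0.5, 1), so the rcp input is never denormal
// and the intermediate product stays well inside the normal range.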
558
559/// Emit a sqrt that handles denormals and is accurate to 2ulp.
560Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
561 Value *Src,
562 FastMathFlags FMF) const {
563 Type *Ty = Src->getType();
564 APFloat SmallestNormal =
565 APFloat::getSmallestNormalized(Ty->getFltSemantics());
566 Value *NeedScale =
567 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
568
569 ConstantInt *Zero = Builder.getInt32(0);
570 Value *InputScaleFactor =
571 Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);
572
573 Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});
574
575 Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);
576
577 Value *OutputScaleFactor =
578 Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
579 return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
580}
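// Scaling rationale: inputs below the smallest normal are multiplied by 2^32
// so the hardware sqrt sees a normal value; sqrt halves the exponent, so the
// result comes out 2^16 too large and ldexp(..., -16) compensates. For example
// (value chosen for exposition):
//
//   x = 0x1.0p-140:  sqrt(x * 2^32) = sqrt(0x1.0p-108) = 0x1.0p-54
//                    ldexp(0x1.0p-54, -16) = 0x1.0p-70 = sqrt(x)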
581
582/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
583static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
584 bool IsNegative) {
585 // bool need_scale = x < 0x1p-126f;
586 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
587 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
588 // rsq(x * input_scale) * output_scale;
589
590 Type *Ty = Src->getType();
591 APFloat SmallestNormal =
592 APFloat::getSmallestNormalized(Ty->getFltSemantics());
593 Value *NeedScale =
594 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
595 Constant *One = ConstantFP::get(Ty, 1.0);
596 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
597 Constant *OutputScale =
598 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
599
600 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
601
602 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
603 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
604 Value *OutputScaleFactor = Builder.CreateSelect(
605 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
606
607 return Builder.CreateFMul(Rsq, OutputScaleFactor);
608}
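// Scaling rationale: 1/sqrt(x * 2^24) = 2^-12 / sqrt(x), so multiplying the
// rsq result by 2^12 recovers rsq(x). For example (value chosen for
// exposition):
//
//   x = 0x1.0p-130:  rsq(x * 0x1.0p+24) = rsq(0x1.0p-106) = 0x1.0p+53
//                    0x1.0p+53 * 0x1.0p+12 = 0x1.0p+65 = rsq(x)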
609
610/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
611/// v_rsq_f64. This should give a 1ulp result.
612Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
613 FastMathFlags SqrtFMF,
614 FastMathFlags DivFMF,
615 const Instruction *CtxI,
616 bool IsNegative) const {
617 // rsq(x):
618 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
619 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
620 // return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
621 //
622 // -rsq(x):
623 // double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
624 // double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
625 // return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
626 //
627 // The rsq instruction handles the special cases correctly. We need to check
628 // for the edge case conditions to ensure the special case propagates through
629 // the later instructions.
630
631 Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);
632
633 // Try to elide the edge case check.
634 //
635 // Fast math flags imply:
636 // sqrt ninf => !isinf(x)
637 // fdiv ninf => x != 0, !isinf(x)
638 bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
639 bool MaybeZero = !DivFMF.noInfs();
640
641 DenormalMode DenormMode;
642 FPClassTest Interested = fcNone;
643 if (MaybePosInf)
644 Interested = fcPosInf;
645 if (MaybeZero)
646 Interested |= fcZero;
647
648 if (Interested != fcNone) {
649 KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
650 if (KnownSrc.isKnownNeverPosInfinity())
651 MaybePosInf = false;
652
653 DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
654 if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
655 MaybeZero = false;
656 }
657
658 Value *SpecialOrRsq = X;
659 if (MaybeZero || MaybePosInf) {
660 Value *Cond;
661 if (MaybePosInf && MaybeZero) {
662 if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
663 FPClassTest TestMask = fcPosInf | fcZero;
664 if (DenormMode.inputsAreZero())
665 TestMask |= fcSubnormal;
666
667 Cond = Builder.createIsFPClass(X, TestMask);
668 } else {
669 // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
670 // doesn't respect the floating-point environment.
671 Value *IsZero =
672 Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
673 Value *IsInf =
674 Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
675 Cond = Builder.CreateOr(IsZero, IsInf);
676 }
677 } else if (MaybeZero) {
678 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
679 } else {
680 Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
681 }
682
683 SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
684 }
685
686 Value *NegY0 = Builder.CreateFNeg(Y0);
687 Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);
688
689 // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
690 Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));
691
692 Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);
693
694 Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
695 ConstantFP::get(X->getType(), 0.5));
696
697 return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
698}
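// Derivation of the correction above: with e = 1 - x*y0*y0, the exact result
// can be written as
//
//   1/sqrt(x) = y0 / sqrt(x*y0*y0) = y0 * (1 - e)^(-1/2)
//            ~= y0 * (1 + e/2 + (3/8)*e^2)
//             = y0 + (y0*e) * (0.5 + 0.375*e)
//
// which is exactly the fma chain emitted above; the IsNegative form simply
// negates y0 throughout.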
699
700bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
701 FastMathFlags SqrtFMF) const {
702 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
703 // f64.
704 return DivFMF.allowContract() && SqrtFMF.allowContract();
705}
706
707Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
708 IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
709 const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
710 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
711 assert(DivFMF.allowContract() && SqrtFMF.allowContract());
712
713 // rsq_f16 is accurate to 0.51 ulp.
714 // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
715 // rsq_f64 is never accurate.
716 const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
717 if (!CLHS)
718 return nullptr;
719
720 bool IsNegative = false;
721
722 // TODO: Handle other numerator values with arcp.
723 if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
724 // Add in the sqrt flags.
725 IRBuilder<>::FastMathFlagGuard Guard(Builder);
726 Builder.setFastMathFlags(DivFMF | SqrtFMF);
727
728 if (Den->getType()->isFloatTy()) {
729 if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
730 canIgnoreDenormalInput(Den, CtxI)) {
731 Value *Result =
732 Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
733 // -1.0 / sqrt(x) -> fneg(rsq(x))
734 return IsNegative ? Builder.CreateFNeg(Result) : Result;
735 }
736
737 return emitRsqIEEE1ULP(Builder, Den, IsNegative);
738 }
739
740 if (Den->getType()->isDoubleTy())
741 return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
742 }
743
744 return nullptr;
745}
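// Illustrative input/output for the f32 path above (fast-math flags assumed to
// permit the contraction and the raw rsq):
//
//   %s = call contract float @llvm.sqrt.f32(float %x)
//   %d = fdiv contract float 1.0, %s
//     -->
//   %d = call contract float @llvm.amdgcn.rsq.f32(float %x)
//
// A -1.0 numerator additionally negates the result.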
746
747// Optimize fdiv with rcp:
748//
749// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
750// allowed with afn.
751//
752// a/b -> a*rcp(b) when arcp is allowed and we only need to provide 1.0 ULP of accuracy.
753Value *
754AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
755 Value *Den, FastMathFlags FMF,
756 const Instruction *CtxI) const {
757 // rcp_f16 is accurate to 0.51 ulp.
758 // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
759 // rcp_f64 is never accurate.
760 assert(Den->getType()->isFloatTy());
761
762 if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
763 bool IsNegative = false;
764 if (CLHS->isExactlyValue(1.0) ||
765 (IsNegative = CLHS->isExactlyValue(-1.0))) {
766 Value *Src = Den;
767
768 if (HasFP32DenormalFlush || FMF.approxFunc()) {
769 // -1.0 / x -> 1.0 / fneg(x)
770 if (IsNegative)
771 Src = Builder.CreateFNeg(Src);
772
773 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
774 // the CI documentation have a worst case error of 1 ulp.
775 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
776 // to use it as long as we aren't trying to use denormals.
777 //
778 // v_rcp_f16 and v_rsq_f16 DO support denormals.
779
780 // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
781 // insert rsq intrinsic here.
782
783 // 1.0 / x -> rcp(x)
784 return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
785 }
786
787 // TODO: If the input isn't denormal, and we know the input exponent isn't
788 // big enough to introduce a denormal we can avoid the scaling.
789 return emitRcpIEEE1ULP(Builder, Src, IsNegative);
790 }
791 }
792
793 if (FMF.allowReciprocal()) {
794 // x / y -> x * (1.0 / y)
795
796 // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
797 // will never underflow.
798 if (HasFP32DenormalFlush || FMF.approxFunc()) {
799 Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
800 return Builder.CreateFMul(Num, Recip);
801 }
802
803 Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
804 return Builder.CreateFMul(Num, Recip);
805 }
806
807 return nullptr;
808}
809
810// optimize with fdiv.fast:
811//
812// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
813//
814// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
815//
816// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
817Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
818 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
819 // fdiv.fast can achieve 2.5 ULP accuracy.
820 if (ReqdAccuracy < 2.5f)
821 return nullptr;
822
823 // Only have fdiv.fast for f32.
824 assert(Den->getType()->isFloatTy());
825
826 bool NumIsOne = false;
827 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
828 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
829 NumIsOne = true;
830 }
831
832 // fdiv.fast does not support denormals, but it is always fine to use for 1.0/x.
833 //
834 // TODO: This works for any value with a specific known exponent range, don't
835 // just limit to constant 1.
836 if (!HasFP32DenormalFlush && !NumIsOne)
837 return nullptr;
838
839 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
840}
841
842Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
843 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
844 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
845 float ReqdDivAccuracy) const {
846 if (RsqOp) {
847 Value *Rsq =
848 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
849 if (Rsq)
850 return Rsq;
851 }
852
853 if (!Num->getType()->isFloatTy())
854 return nullptr;
855
856 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
857 if (Rcp)
858 return Rcp;
859
860 // In the basic case fdiv_fast has the same instruction count as the frexp div
861 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
862 // potentially be fused into a user. Also, materialization of the constants
863 // can be reused for multiple instances.
864 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
865 if (FDivFast)
866 return FDivFast;
867
868 return emitFrexpDiv(Builder, Num, Den, DivFMF);
869}
870
871// Optimization is performed based on fpmath, fast math flags, and the denormal
872// mode to lower fdiv with either rcp or fdiv.fast.
873//
874// With rcp:
875// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
876// allowed with afn.
877//
878// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
879//
880// With fdiv.fast:
881// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
882//
883// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
884//
885// NOTE: rcp is the preference in cases that both are legal.
886bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
887 if (DisableFDivExpand)
888 return false;
889
890 Type *Ty = FDiv.getType()->getScalarType();
891 const bool IsFloat = Ty->isFloatTy();
892 if (!IsFloat && !Ty->isDoubleTy())
893 return false;
894
895 // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
896 // expansion around them in codegen. f16 is good enough to always use.
897
898 const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
899 const FastMathFlags DivFMF = FPOp->getFastMathFlags();
900 const float ReqdAccuracy = FPOp->getFPAccuracy();
901
902 FastMathFlags SqrtFMF;
903
904 Value *Num = FDiv.getOperand(0);
905 Value *Den = FDiv.getOperand(1);
906
907 Value *RsqOp = nullptr;
908 auto *DenII = dyn_cast<IntrinsicInst>(Den);
909 if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
910 DenII->hasOneUse()) {
911 const auto *SqrtOp = cast<FPMathOperator>(DenII);
912 SqrtFMF = SqrtOp->getFastMathFlags();
913 if (canOptimizeWithRsq(DivFMF, SqrtFMF))
914 RsqOp = SqrtOp->getOperand(0);
915 }
916
917 // rcp path not yet implemented for f64.
918 if (!IsFloat && !RsqOp)
919 return false;
920
921 // Inaccurate rcp is allowed with afn.
922 //
923 // Defer to codegen to handle this.
924 //
925 // TODO: Decide on an interpretation for interactions between afn + arcp +
926 // !fpmath, and make it consistent between here and codegen. For now, defer
927 // expansion of afn to codegen. The current interpretation is so aggressive we
928 // don't need any pre-consideration here when we have better information. A
929 // more conservative interpretation could use handling here.
930 const bool AllowInaccurateRcp = DivFMF.approxFunc();
931 if (!RsqOp && AllowInaccurateRcp)
932 return false;
933
934 // Defer the correct implementations to codegen.
935 if (IsFloat && ReqdAccuracy < 1.0f)
936 return false;
937
938 IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
939 Builder.setFastMathFlags(DivFMF);
940 Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
941
942 SmallVector<Value *, 4> NumVals;
943 SmallVector<Value *, 4> DenVals;
944 SmallVector<Value *, 4> RsqDenVals;
945 extractValues(Builder, NumVals, Num);
946 extractValues(Builder, DenVals, Den);
947
948 if (RsqOp)
949 extractValues(Builder, RsqDenVals, RsqOp);
950
951 SmallVector<Value *, 4> ResultVals(NumVals.size());
952 for (int I = 0, E = NumVals.size(); I != E; ++I) {
953 Value *NumElt = NumVals[I];
954 Value *DenElt = DenVals[I];
955 Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;
956
957 Value *NewElt =
958 visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
959 cast<Instruction>(FPOp), ReqdAccuracy);
960 if (!NewElt) {
961 // Keep the original, but scalarized.
962
963 // This has the unfortunate side effect of sometimes scalarizing when
964 // we're not going to do anything.
965 NewElt = Builder.CreateFDiv(NumElt, DenElt);
966 if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
967 NewEltInst->copyMetadata(FDiv);
968 }
969
970 ResultVals[I] = NewElt;
971 }
972
973 Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);
974
975 if (NewVal) {
976 FDiv.replaceAllUsesWith(NewVal);
977 NewVal->takeName(&FDiv);
978 DeadVals.push_back(&FDiv);
979 }
980
981 return true;
982}
983
984static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
985 Value *LHS, Value *RHS) {
986 Type *I32Ty = Builder.getInt32Ty();
987 Type *I64Ty = Builder.getInt64Ty();
988
989 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
990 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
991 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
992 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
993 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
994 Hi = Builder.CreateTrunc(Hi, I32Ty);
995 return std::pair(Lo, Hi);
996}
997
998static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
999 return getMul64(Builder, LHS, RHS).second;
1000}
1001
1002/// Figure out how many bits are really needed for this division.
1003/// \p MaxDivBits is an optimization hint to bypass the second
1004/// ComputeNumSignBits/computeKnownBits call if the first one is
1005/// insufficient.
1006unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
1007 Value *Den,
1008 unsigned MaxDivBits,
1009 bool IsSigned) const {
1010 assert(Num->getType()->getScalarSizeInBits() ==
1011 Den->getType()->getScalarSizeInBits());
1012 unsigned SSBits = Num->getType()->getScalarSizeInBits();
1013 if (IsSigned) {
1014 unsigned RHSSignBits = ComputeNumSignBits(Den, DL, AC, &I);
1015 // A sign bit needs to be reserved for shrinking.
1016 unsigned DivBits = SSBits - RHSSignBits + 1;
1017 if (DivBits > MaxDivBits)
1018 return SSBits;
1019
1020 unsigned LHSSignBits = ComputeNumSignBits(Num, DL, AC, &I);
1021
1022 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1023 DivBits = SSBits - SignBits + 1;
1024 return DivBits;
1025 }
1026
1027 // All bits are used for unsigned division for Num or Den in range
1028 // (SignedMax, UnsignedMax].
1029 KnownBits Known = computeKnownBits(Den, DL, AC, &I);
1030 if (Known.isNegative() || !Known.isNonNegative())
1031 return SSBits;
1032 unsigned RHSSignBits = Known.countMinLeadingZeros();
1033 unsigned DivBits = SSBits - RHSSignBits;
1034 if (DivBits > MaxDivBits)
1035 return SSBits;
1036
1037 Known = computeKnownBits(Num, DL, AC, &I);
1038 if (Known.isNegative() || !Known.isNonNegative())
1039 return SSBits;
1040 unsigned LHSSignBits = Known.countMinLeadingZeros();
1041
1042 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
1043 DivBits = SSBits - SignBits;
1044 return DivBits;
1045}
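// Worked example for the signed case: in an i64 sdiv where both operands are
// sign-extended from i32, ComputeNumSignBits reports at least 33 sign bits on
// each side, so DivBits = 64 - 33 + 1 = 32 and the division can be narrowed to
// 32 bits.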
1046
1047// The fractional part of a float is enough to accurately represent up to
1048// a 24-bit signed integer.
1049Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1050 BinaryOperator &I, Value *Num,
1051 Value *Den, bool IsDiv,
1052 bool IsSigned) const {
1053 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
1054 if (DivBits > 24)
1055 return nullptr;
1056 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1057}
1058
1059Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
1060 IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
1061 unsigned DivBits, bool IsDiv, bool IsSigned) const {
1062 Type *I32Ty = Builder.getInt32Ty();
1063 Num = Builder.CreateTrunc(Num, I32Ty);
1064 Den = Builder.CreateTrunc(Den, I32Ty);
1065
1066 Type *F32Ty = Builder.getFloatTy();
1067 ConstantInt *One = Builder.getInt32(1);
1068 Value *JQ = One;
1069
1070 if (IsSigned) {
1071 // char|short jq = ia ^ ib;
1072 JQ = Builder.CreateXor(Num, Den);
1073
1074 // jq = jq >> (bitsize - 2)
1075 JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));
1076
1077 // jq = jq | 0x1
1078 JQ = Builder.CreateOr(JQ, One);
1079 }
1080
1081 // int ia = (int)LHS;
1082 Value *IA = Num;
1083
1084 // int ib = (int)RHS;
1085 Value *IB = Den;
1086
1087 // float fa = (float)ia;
1088 Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
1089 : Builder.CreateUIToFP(IA, F32Ty);
1090
1091 // float fb = (float)ib;
1092 Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
1093 : Builder.CreateUIToFP(IB,F32Ty);
1094
1095 Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
1096 Builder.getFloatTy(), {FB});
1097 Value *FQM = Builder.CreateFMul(FA, RCP);
1098
1099 // fq = trunc(fqm);
1100 CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
1101 FQ->copyFastMathFlags(Builder.getFastMathFlags());
1102
1103 // float fqneg = -fq;
1104 Value *FQNeg = Builder.CreateFNeg(FQ);
1105
1106 // float fr = mad(fqneg, fb, fa);
1107 auto FMAD = !ST.hasMadMacF32Insts()
1108 ? Intrinsic::fma
1109 : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
1110 Value *FR = Builder.CreateIntrinsic(FMAD,
1111 {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
1112
1113 // int iq = (int)fq;
1114 Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
1115 : Builder.CreateFPToUI(FQ, I32Ty);
1116
1117 // fr = fabs(fr);
1118 FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);
1119
1120 // fb = fabs(fb);
1121 FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);
1122
1123 // int cv = fr >= fb;
1124 Value *CV = Builder.CreateFCmpOGE(FR, FB);
1125
1126 // jq = (cv ? jq : 0);
1127 JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));
1128
1129 // dst = iq + jq;
1130 Value *Div = Builder.CreateAdd(IQ, JQ);
1131
1132 Value *Res = Div;
1133 if (!IsDiv) {
1134 // Rem needs compensation, it's easier to recompute it
1135 Value *Rem = Builder.CreateMul(Div, Den);
1136 Res = Builder.CreateSub(Num, Rem);
1137 }
1138
1139 if (DivBits != 0 && DivBits < 32) {
1140 // Extend in register from the number of bits this divide really is.
1141 if (IsSigned) {
1142 int InRegBits = 32 - DivBits;
1143
1144 Res = Builder.CreateShl(Res, InRegBits);
1145 Res = Builder.CreateAShr(Res, InRegBits);
1146 } else {
1147 ConstantInt *TruncMask
1148 = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
1149 Res = Builder.CreateAnd(Res, TruncMask);
1150 }
1151 }
1152
1153 return Res;
1154}
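// Note on the jq correction above: values of up to 24 bits are exactly
// representable in f32, but fq = trunc(fa * rcp(fb)) can still come out one
// below the exact quotient because of the rcp rounding. fr = |fa - fq*fb| is
// the remainder of that estimate; once it reaches |fb| the quotient is bumped
// by jq (+1, or -1 when the exact signed quotient is negative), otherwise jq
// is replaced by 0.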
1155
1156// Try to recognize special cases for which the DAG will emit better expansions
1157// than the general expansion we do here.
1158
1159// TODO: It would be better to just directly handle those optimizations here.
1160bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
1161 Value *Num,
1162 Value *Den) const {
1163 if (Constant *C = dyn_cast<Constant>(Den)) {
1164 // Arbitrary constants get a better expansion as long as a wider mulhi is
1165 // legal.
1166 if (C->getType()->getScalarSizeInBits() <= 32)
1167 return true;
1168
1169 // TODO: Sdiv check for not exact for some reason.
1170
1171 // If there's no wider mulhi, there's only a better expansion for powers of
1172 // two.
1173 // TODO: Should really know for each vector element.
1174 if (isKnownToBeAPowerOfTwo(C, DL, true, AC, &I, DT))
1175 return true;
1176
1177 return false;
1178 }
1179
1180 if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
1181 // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
1182 if (BinOpDen->getOpcode() == Instruction::Shl &&
1183 isa<Constant>(BinOpDen->getOperand(0)) &&
1184 isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), DL, true, AC, &I, DT)) {
1185 return true;
1186 }
1187 }
1188
1189 return false;
1190}
1191
1192static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1193 // Check whether the sign can be determined statically.
1194 KnownBits Known = computeKnownBits(V, DL);
1195 if (Known.isNegative())
1196 return Constant::getAllOnesValue(V->getType());
1197 if (Known.isNonNegative())
1198 return Constant::getNullValue(V->getType());
1199 return Builder.CreateAShr(V, Builder.getInt32(31));
1200}
1201
1202Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
1203 BinaryOperator &I, Value *X,
1204 Value *Y) const {
1205 Instruction::BinaryOps Opc = I.getOpcode();
1206 assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
1207 Opc == Instruction::SRem || Opc == Instruction::SDiv);
1208
1209 FastMathFlags FMF;
1210 FMF.setFast();
1211 Builder.setFastMathFlags(FMF);
1212
1213 if (divHasSpecialOptimization(I, X, Y))
1214 return nullptr; // Keep it for later optimization.
1215
1216 bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
1217 bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
1218
1219 Type *Ty = X->getType();
1220 Type *I32Ty = Builder.getInt32Ty();
1221 Type *F32Ty = Builder.getFloatTy();
1222
1223 if (Ty->getScalarSizeInBits() != 32) {
1224 if (IsSigned) {
1225 X = Builder.CreateSExtOrTrunc(X, I32Ty);
1226 Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
1227 } else {
1228 X = Builder.CreateZExtOrTrunc(X, I32Ty);
1229 Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
1230 }
1231 }
1232
1233 if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
1234 return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
1235 Builder.CreateZExtOrTrunc(Res, Ty);
1236 }
1237
1238 ConstantInt *Zero = Builder.getInt32(0);
1239 ConstantInt *One = Builder.getInt32(1);
1240
1241 Value *Sign = nullptr;
1242 if (IsSigned) {
1243 Value *SignX = getSign32(X, Builder, DL);
1244 Value *SignY = getSign32(Y, Builder, DL);
1245 // Remainder sign is the same as LHS
1246 Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
1247
1248 X = Builder.CreateAdd(X, SignX);
1249 Y = Builder.CreateAdd(Y, SignY);
1250
1251 X = Builder.CreateXor(X, SignX);
1252 Y = Builder.CreateXor(Y, SignY);
1253 }
1254
1255 // The algorithm here is based on ideas from "Software Integer Division", Tom
1256 // Rodeheffer, August 2008.
1257 //
1258 // unsigned udiv(unsigned x, unsigned y) {
1259 // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
1260 // // that this is a lower bound on inv(y), even if some of the calculations
1261 // // round up.
1262 // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
1263 //
1264 // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
1265 // // Empirically this is guaranteed to give a "two-y" lower bound on
1266 // // inv(y).
1267 // z += umulh(z, -y * z);
1268 //
1269 // // Quotient/remainder estimate.
1270 // unsigned q = umulh(x, z);
1271 // unsigned r = x - q * y;
1272 //
1273 // // Two rounds of quotient/remainder refinement.
1274 // if (r >= y) {
1275 // ++q;
1276 // r -= y;
1277 // }
1278 // if (r >= y) {
1279 // ++q;
1280 // r -= y;
1281 // }
1282 //
1283 // return q;
1284 // }
1285
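  // Note on the scale constant used below: 0x4F7FFFFE is the f32 bit pattern of
  // 4294966784.0 = 2^32 - 512, i.e. exactly the (4294967296.0 - 512.0) factor
  // from the sketch above.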
1286 // Initial estimate of inv(y).
1287 Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
1288 Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
1289 Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
1290 Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
1291 Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
1292
1293 // One round of UNR.
1294 Value *NegY = Builder.CreateSub(Zero, Y);
1295 Value *NegYZ = Builder.CreateMul(NegY, Z);
1296 Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
1297
1298 // Quotient/remainder estimate.
1299 Value *Q = getMulHu(Builder, X, Z);
1300 Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
1301
1302 // First quotient/remainder refinement.
1303 Value *Cond = Builder.CreateICmpUGE(R, Y);
1304 if (IsDiv)
1305 Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1306 R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1307
1308 // Second quotient/remainder refinement.
1309 Cond = Builder.CreateICmpUGE(R, Y);
1310 Value *Res;
1311 if (IsDiv)
1312 Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
1313 else
1314 Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
1315
1316 if (IsSigned) {
1317 Res = Builder.CreateXor(Res, Sign);
1318 Res = Builder.CreateSub(Res, Sign);
1319 Res = Builder.CreateSExtOrTrunc(Res, Ty);
1320 } else {
1321 Res = Builder.CreateZExtOrTrunc(Res, Ty);
1322 }
1323 return Res;
1324}
1325
1326Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1327 BinaryOperator &I, Value *Num,
1328 Value *Den) const {
1329 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1330 return nullptr; // Keep it for later optimization.
1331
1332 Instruction::BinaryOps Opc = I.getOpcode();
1333
1334 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1335 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1336
1337 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1338 if (NumDivBits > 32)
1339 return nullptr;
1340
1341 Value *Narrowed = nullptr;
1342 if (NumDivBits <= 24) {
1343 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1344 IsDiv, IsSigned);
1345 } else if (NumDivBits <= 32) {
1346 Narrowed = expandDivRem32(Builder, I, Num, Den);
1347 }
1348
1349 if (Narrowed) {
1350 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1351 Builder.CreateZExt(Narrowed, Num->getType());
1352 }
1353
1354 return nullptr;
1355}
1356
1357void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
1358 Instruction::BinaryOps Opc = I.getOpcode();
1359 // Do the general expansion.
1360 if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
1361 expandDivisionUpTo64Bits(&I);
1362 return;
1363 }
1364
1365 if (Opc == Instruction::URem || Opc == Instruction::SRem) {
1366 expandRemainderUpTo64Bits(&I);
1367 return;
1368 }
1369
1370 llvm_unreachable("not a division");
1371}
1372
1373/*
1374This can cause an inconsistency with non-byte-sized loads, for example:
1375```
1376 %load = load i1, ptr addrspace(4) %arg, align 4
1377 %zext = zext i1 %load to i64
1378 %add = add i64 %zext
1379```
1380Instead of creating `s_and_b32 s0, s0, 1`,
1381it will create `s_and_b32 s0, s0, 0xff`.
1382We accept this change since the non-byte load assumes the upper bits
1383within the byte are all 0.
1384*/
1385bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
1386 unsigned Opc = I->getOpcode();
1387 Type *OldType = I->getType();
1388
1389 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1390 return false;
1391
1392 unsigned OrigBit = OldType->getScalarSizeInBits();
1393
1394 if (Opc != Instruction::Add && Opc != Instruction::Mul)
1395 llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
1396 "Instruction::Mul.");
1397
1398 unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();
1399
1400 MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
1401 Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
1402 if (!NewType)
1403 return false;
1404 unsigned NewBit = NewType->getIntegerBitWidth();
1405 if (NewBit >= OrigBit)
1406 return false;
1407 NewType = I->getType()->getWithNewBitWidth(NewBit);
1408
1409 // Old cost
1410 const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
1410 InstructionCost OldCost =
1411 TTI.getArithmeticInstrCost(Opc, OldType,
1412 TargetTransformInfo::TCK_RecipThroughput);
1413 // New cost of new op
1414 InstructionCost NewCost =
1415 TTI.getArithmeticInstrCost(Opc, NewType, TargetTransformInfo::TCK_RecipThroughput);
1416 // New cost of narrowing 2 operands (use trunc)
1417 int NumOfNonConstOps = 2;
1418 if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
1419 // Cannot be both constant, should be propagated
1420 NumOfNonConstOps = 1;
1421 }
1422 NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
1423 NewType, OldType,
1424 TargetTransformInfo::CastContextHint::None,
1425 TargetTransformInfo::TCK_RecipThroughput);
1426 // New cost of zext narrowed result to original type
1427 NewCost +=
1428 TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
1429 TargetTransformInfo::CastContextHint::None, TargetTransformInfo::TCK_RecipThroughput);
1430 if (NewCost >= OldCost)
1431 return false;
1432
1433 IRBuilder<> Builder(I);
1434 Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
1435 Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
1436 Value *Arith =
1437 Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);
1438
1439 Value *Zext = Builder.CreateZExt(Arith, OldType);
1440 I->replaceAllUsesWith(Zext);
1441 DeadVals.push_back(I);
1442 return true;
1443}
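// Illustrative example (assuming i32 is the smallest legal integer type that
// fits the needed bits):
//
//   %r = mul i64 %a, %b        ; known bits prove the product fits in 16 bits
//     -->
//   %a.t = trunc i64 %a to i32
//   %b.t = trunc i64 %b to i32
//   %m   = mul i32 %a.t, %b.t
//   %r   = zext i32 %m to i64
//
// and the rewrite only happens when the cost model above says the narrow op
// plus the casts is cheaper than the wide op.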
1444
1445bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
1446 if (foldBinOpIntoSelect(I))
1447 return true;
1448
1449 if (UseMul24Intrin && replaceMulWithMul24(I))
1450 return true;
1451 if (tryNarrowMathIfNoOverflow(&I))
1452 return true;
1453
1454 bool Changed = false;
1455 Instruction::BinaryOps Opc = I.getOpcode();
1456 Type *Ty = I.getType();
1457 Value *NewDiv = nullptr;
1458 unsigned ScalarSize = Ty->getScalarSizeInBits();
1459
1460 SmallVector<BinaryOperator *, 8> Div64ToExpand;
1461
1462 if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
1463 Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
1464 ScalarSize <= 64 &&
1465 !DisableIDivExpand) {
1466 Value *Num = I.getOperand(0);
1467 Value *Den = I.getOperand(1);
1468 IRBuilder<> Builder(&I);
1469 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1470
1471 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1472 NewDiv = PoisonValue::get(VT);
1473
1474 for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
1475 Value *NumEltN = Builder.CreateExtractElement(Num, N);
1476 Value *DenEltN = Builder.CreateExtractElement(Den, N);
1477
1478 Value *NewElt;
1479 if (ScalarSize <= 32) {
1480 NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
1481 if (!NewElt)
1482 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1483 } else {
1484 // See if this 64-bit division can be shrunk to 32/24-bits before
1485 // producing the general expansion.
1486 NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
1487 if (!NewElt) {
1488 // The general 64-bit expansion introduces control flow and doesn't
1489 // return the new value. Just insert a scalar copy and defer
1490 // expanding it.
1491 NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
1492 // CreateBinOp does constant folding. If the operands are constant,
1493 // it will return a Constant instead of a BinaryOperator.
1494 if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
1495 Div64ToExpand.push_back(NewEltBO);
1496 }
1497 }
1498
1499 if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
1500 NewEltI->copyIRFlags(&I);
1501
1502 NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
1503 }
1504 } else {
1505 if (ScalarSize <= 32)
1506 NewDiv = expandDivRem32(Builder, I, Num, Den);
1507 else {
1508 NewDiv = shrinkDivRem64(Builder, I, Num, Den);
1509 if (!NewDiv)
1510 Div64ToExpand.push_back(&I);
1511 }
1512 }
1513
1514 if (NewDiv) {
1515 I.replaceAllUsesWith(NewDiv);
1516 DeadVals.push_back(&I);
1517 Changed = true;
1518 }
1519 }
1520
1521 if (ExpandDiv64InIR) {
1522 // TODO: We get much worse code in specially handled constant cases.
1523 for (BinaryOperator *Div : Div64ToExpand) {
1524 expandDivRem64(*Div);
1525 FlowChanged = true;
1526 Changed = true;
1527 }
1528 }
1529
1530 return Changed;
1531}
1532
1533bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1534 if (!WidenLoads)
1535 return false;
1536
1537 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1538 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1539 canWidenScalarExtLoad(I)) {
1540 IRBuilder<> Builder(&I);
1541 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1542
1543 Type *I32Ty = Builder.getInt32Ty();
1544 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1545 WidenLoad->copyMetadata(I);
1546
1547 // If we have range metadata, we need to convert the type, and not make
1548 // assumptions about the high bits.
1549 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1550 ConstantInt *Lower =
1551 mdconst::extract<ConstantInt>(Range->getOperand(0));
1552
1553 if (Lower->isNullValue()) {
1554 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1555 } else {
1556 Metadata *LowAndHigh[] = {
1557 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1558 // Don't make assumptions about the high bits.
1559 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1560 };
1561
1562 WidenLoad->setMetadata(LLVMContext::MD_range,
1563 MDNode::get(F.getContext(), LowAndHigh));
1564 }
1565 }
1566
1567 int TySize = DL.getTypeSizeInBits(I.getType());
1568 Type *IntNTy = Builder.getIntNTy(TySize);
1569 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1570 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1571 I.replaceAllUsesWith(ValOrig);
1572 DeadVals.push_back(&I);
1573 return true;
1574 }
1575
1576 return false;
1577}
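// Illustrative example of the widening above (uniform i8 load from constant
// address space with 4-byte alignment):
//
//   %v = load i8, ptr addrspace(4) %p, align 4
//     -->
//   %w = load i32, ptr addrspace(4) %p, align 4
//   %v = trunc i32 %w to i8
//
// which lets instruction selection use a scalar (SMEM) load instead of a
// vector load.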
1578
1579bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
1580 Value *Cond = I.getCondition();
1581 Value *TrueVal = I.getTrueValue();
1582 Value *FalseVal = I.getFalseValue();
1583 Value *CmpVal;
1584 CmpPredicate Pred;
1585
1586 // Match fract pattern with nan check.
1587 if (!match(Cond, m_FCmp(Pred, m_Value(CmpVal), m_NonNaN())))
1588 return false;
1589
1590 FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
1591 if (!FPOp)
1592 return false;
1593
1594 IRBuilder<> Builder(&I);
1595 Builder.setFastMathFlags(FPOp->getFastMathFlags());
1596
1597 auto *IITrue = dyn_cast<IntrinsicInst>(TrueVal);
1598 auto *IIFalse = dyn_cast<IntrinsicInst>(FalseVal);
1599
1600 Value *Fract = nullptr;
1601 if (Pred == FCmpInst::FCMP_UNO && TrueVal == CmpVal && IIFalse &&
1602 CmpVal == matchFractPat(*IIFalse)) {
1603 // isnan(x) ? x : fract(x)
1604 Fract = applyFractPat(Builder, CmpVal);
1605 } else if (Pred == FCmpInst::FCMP_ORD && FalseVal == CmpVal && IITrue &&
1606 CmpVal == matchFractPat(*IITrue)) {
1607 // !isnan(x) ? fract(x) : x
1608 Fract = applyFractPat(Builder, CmpVal);
1609 } else
1610 return false;
1611
1612 Fract->takeName(&I);
1613 I.replaceAllUsesWith(Fract);
1614 DeadVals.push_back(&I);
1615 return true;
1616}
1617
1618static bool areInSameBB(const Value *A, const Value *B) {
1619 const auto *IA = dyn_cast<Instruction>(A);
1620 const auto *IB = dyn_cast<Instruction>(B);
1621 return IA && IB && IA->getParent() == IB->getParent();
1622}
1623
1624// Helper for breaking large PHIs that returns true when an extractelement on V
1625// is likely to be folded away by the DAG combiner.
1626static bool isInterestingPHIIncomingValue(const Value *V) {
1627 const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
1628 if (!FVT)
1629 return false;
1630
1631 const Value *CurVal = V;
1632
1633 // Check for insertelements, keeping track of the elements covered.
1634 BitVector EltsCovered(FVT->getNumElements());
1635 while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
1636 const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));
1637
1638 // Non constant index/out of bounds index -> folding is unlikely.
1639 // The latter is more of a sanity check because canonical IR should just
1640 // have replaced those with poison.
1641 if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
1642 return false;
1643
1644 const auto *VecSrc = IE->getOperand(0);
1645
1646 // If the vector source is another instruction, it must be in the same basic
1647 // block. Otherwise, the DAGCombiner won't see the whole thing and is
1648 // unlikely to be able to do anything interesting here.
1649 if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
1650 return false;
1651
1652 CurVal = VecSrc;
1653 EltsCovered.set(Idx->getZExtValue());
1654
1655 // All elements covered.
1656 if (EltsCovered.all())
1657 return true;
1658 }
1659
1660 // We either didn't find a single insertelement, or the insertelement chain
1661 // ended before all elements were covered. Check for other interesting values.
1662
1663 // Constants are always interesting because we can just constant fold the
1664 // extractelements.
1665 if (isa<Constant>(CurVal))
1666 return true;
1667
1668 // shufflevector is likely to be profitable if either operand is a constant,
1669 // or if either source is in the same block.
1670 // This is because shufflevector is most often lowered as a series of
1671 // insert/extract elements anyway.
1672 if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
1673 return isa<Constant>(SV->getOperand(1)) ||
1674 areInSameBB(SV, SV->getOperand(0)) ||
1675 areInSameBB(SV, SV->getOperand(1));
1676 }
1677
1678 return false;
1679}
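As a rough picture of the first check, the insertelement walk above amounts to the following bookkeeping (a standalone sketch with made-up lane indices; std::bitset stands in for BitVector):

#include <bitset>
#include <cstdio>

// Walk an insertelement chain over a 4-element vector and mark which lanes
// have been written; full coverage means later extractelements can fold away.
int main() {
  std::bitset<4> EltsCovered;
  const unsigned ChainIndices[] = {0, 2, 1, 3}; // lanes written by the chain
  for (unsigned Idx : ChainIndices) {
    EltsCovered.set(Idx);
    if (EltsCovered.all()) {
      std::puts("all elements covered -> interesting incoming value");
      return 0;
    }
  }
  std::puts("chain ended early -> fall back to the constant/shuffle checks");
  return 0;
}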
1680
1681static void collectPHINodes(const PHINode &I,
1682 SmallPtrSet<const PHINode *, 8> &SeenPHIs) {
1683 const auto [It, Inserted] = SeenPHIs.insert(&I);
1684 if (!Inserted)
1685 return;
1686
1687 for (const Value *Inc : I.incoming_values()) {
1688 if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
1689 collectPHINodes(*PhiInc, SeenPHIs);
1690 }
1691
1692 for (const User *U : I.users()) {
1693 if (const auto *PhiU = dyn_cast<PHINode>(U))
1694 collectPHINodes(*PhiU, SeenPHIs);
1695 }
1696}
1697
1698bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
1699 // Check in the cache first.
1700 if (const auto It = BreakPhiNodesCache.find(&I);
1701 It != BreakPhiNodesCache.end())
1702 return It->second;
1703
1704 // We consider PHI nodes as part of "chains", so given a PHI node I, we
1705 // recursively consider all its users and incoming values that are also PHI
1706 // nodes. We then make a decision about all of those PHIs at once. Either they
1707 // all get broken up, or none of them do. That way, we avoid cases where a
1708 // single PHI is/is not broken and we end up reforming/exploding a vector
1709 // multiple times, or even worse, doing it in a loop.
1710 SmallPtrSet<const PHINode *, 8> WorkList;
1711 collectPHINodes(I, WorkList);
1712
1713#ifndef NDEBUG
1714 // Check that none of the PHI nodes in the worklist are in the map. If some of
1715 // them are, it means we're not good enough at collecting related PHIs.
1716 for (const PHINode *WLP : WorkList) {
1717 assert(BreakPhiNodesCache.count(WLP) == 0);
1718 }
1719#endif
1720
1721 // To consider a PHI profitable to break, we need to see some interesting
1722 // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
1723 // must have one to consider all PHIs breakable.
1724 //
1725 // This threshold has been determined through performance testing.
1726 //
1727 // Note that the computation below is equivalent to
1728 //
1729 // (unsigned)ceil((K / 3.0) * 2)
1730 //
1731 // It's simply written this way to avoid mixing integral/FP arithmetic.
1732 const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
1733 unsigned NumBreakablePHIs = 0;
1734 bool CanBreak = false;
1735 for (const PHINode *Cur : WorkList) {
1736 // Don't break PHIs that have no interesting incoming values. That is, where
1737 // there is no clear opportunity to fold the "extractelement" instructions
1738 // we would add.
1739 //
1740 // Note: InstCombine (IC) does not run after this pass, so we're only interested in the
1741 // foldings that the DAG combiner can do.
1742 if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
1743 if (++NumBreakablePHIs >= Threshold) {
1744 CanBreak = true;
1745 break;
1746 }
1747 }
1748 }
1749
1750 for (const PHINode *Cur : WorkList)
1751 BreakPhiNodesCache[Cur] = CanBreak;
1752
1753 return CanBreak;
1754}
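The 2/3 threshold computation above can be checked against the floating-point form it mimics with a small standalone program (alignToMultiple here is a local stand-in for llvm::alignTo, not part of the pass):

#include <cassert>
#include <cmath>

// Round Value up to the nearest multiple of Align (same result as
// llvm::alignTo for these small positive inputs).
static unsigned alignToMultiple(unsigned Value, unsigned Align) {
  return ((Value + Align - 1) / Align) * Align;
}

int main() {
  // Check that the integer-only threshold matches (unsigned)ceil((K / 3.0) * 2)
  // quoted in the comment above.
  for (unsigned K = 1; K <= 256; ++K) {
    unsigned Integral = alignToMultiple(K * 2, 3) / 3;
    unsigned Floating = (unsigned)std::ceil((K / 3.0) * 2);
    assert(Integral == Floating && "threshold formulas disagree");
  }
  return 0;
}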
1755
1756/// Helper class for "break large PHIs" (visitPHINode).
1757///
1758/// This represents a slice of a PHI's incoming value, which is made up of:
1759/// - The type of the slice (Ty)
1760/// - The index in the incoming value's vector where the slice starts (Idx)
1761/// - The number of elements in the slice (NumElts).
1762/// It also keeps track of the NewPHI node inserted for this particular slice.
1763///
1764/// Slice examples:
1765/// <4 x i64> -> Split into four i64 slices.
1766/// -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
1767/// <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
1768/// -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
1769class VectorSlice {
1770public:
1771 VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
1772 : Ty(Ty), Idx(Idx), NumElts(NumElts) {}
1773
1774 Type *Ty = nullptr;
1775 unsigned Idx = 0;
1776 unsigned NumElts = 0;
1777 PHINode *NewPHI = nullptr;
1778
1779 /// Slice \p Inc according to the information contained within this slice.
1780 /// This is cached, so if called multiple times for the same \p BB & \p Inc
1781 /// pair, it returns the same Sliced value as well.
1782 ///
1783 /// Note this *intentionally* does not return the same value for, say,
1784 /// [%bb.0, %0] & [%bb.1, %0] as:
1785 /// - It could cause issues with dominance (e.g. if bb.1 is seen first, the
1786 /// value created in bb.1 would not dominate uses in bb.0 when bb.0 is a
1787 /// predecessor of bb.1.)
1788 /// - We also want to make our extract instructions as local as possible so
1789 /// the DAG has better chances of folding them out. Duplicating them like
1790 /// that is beneficial in that regard.
1791 ///
1792 /// This is not only a minor optimization to avoid creating duplicate
1793 /// instructions, but also a requirement for correctness. It is not forbidden
1794 /// for a PHI node to have the same [BB, Val] pair multiple times. If we
1795 /// returned a new value each time, those previously identical pairs would all
1796 /// have different incoming values (from the same block) and it'd cause a "PHI
1797 /// node has multiple entries for the same basic block with different incoming
1798 /// values!" verifier error.
1799 Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
1800 Value *&Res = SlicedVals[{BB, Inc}];
1801 if (Res)
1802 return Res;
1803
1804 IRBuilder<> B(BB->getTerminator());
1805 if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
1806 B.SetCurrentDebugLocation(IncInst->getDebugLoc());
1807
1808 if (NumElts > 1) {
1809 SmallVector<int, 4> Mask;
1810 for (unsigned K = Idx; K < (Idx + NumElts); ++K)
1811 Mask.push_back(K);
1812 Res = B.CreateShuffleVector(Inc, Mask, NewValName);
1813 } else
1814 Res = B.CreateExtractElement(Inc, Idx, NewValName);
1815
1816 return Res;
1817 }
1818
1819private:
1820 SmallDenseMap<std::pair<BasicBlock *, Value *>, Value *> SlicedVals;
1821};
1822
1823bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
1824 // Break up fixed-vector PHIs into smaller pieces.
1825 // Default threshold is 32, so it breaks up any vector that's >32 bits into
1826 // its elements, or into 32-bit pieces (for 8/16-bit elts).
1827 //
1828 // This is only helpful for DAGISel because it doesn't handle large PHIs as
1829 // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
1830 // With large, odd-sized PHIs we may end up needing many `build_vector`
1831 // operations with most elements being "undef". This inhibits a lot of
1832 // optimization opportunities and can result in unreasonably high register
1833 // pressure and the inevitable stack spilling.
1834 if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
1835 return false;
1836
1837 FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
1838 if (!FVT || FVT->getNumElements() == 1 ||
1839 DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
1840 return false;
1841
1842 if (!ForceBreakLargePHIs && !canBreakPHINode(I))
1843 return false;
1844
1845 std::vector<VectorSlice> Slices;
1846
1847 Type *EltTy = FVT->getElementType();
1848 {
1849 unsigned Idx = 0;
1850 // For 8/16-bit element types, don't scalarize fully but break the vector into as many
1851 // 32-bit slices as we can, and scalarize the tail.
1852 const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
1853 const unsigned NumElts = FVT->getNumElements();
1854 if (EltSize == 8 || EltSize == 16) {
1855 const unsigned SubVecSize = (32 / EltSize);
1856 Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
1857 for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
1858 Idx += SubVecSize)
1859 Slices.emplace_back(SubVecTy, Idx, SubVecSize);
1860 }
1861
1862 // Scalarize all remaining elements.
1863 for (; Idx < NumElts; ++Idx)
1864 Slices.emplace_back(EltTy, Idx, 1);
1865 }
1866
1867 assert(Slices.size() > 1);
1868
1869 // Create one PHI per vector piece. The "VectorSlice" class takes care of
1870 // creating the necessary instruction to extract the relevant slices of each
1871 // incoming value.
1872 IRBuilder<> B(I.getParent());
1873 B.SetCurrentDebugLocation(I.getDebugLoc());
1874
1875 unsigned IncNameSuffix = 0;
1876 for (VectorSlice &S : Slices) {
1877 // We need to reset the builder's insert point on each iteration, because getSlicedVal may
1878 // have inserted something into I's BB.
1879 B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
1880 S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());
1881
1882 for (const auto &[Idx, BB] : enumerate(I.blocks())) {
1883 S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
1884 "largephi.extractslice" +
1885 std::to_string(IncNameSuffix++)),
1886 BB);
1887 }
1888 }
1889
1890 // And replace this PHI with a vector of all the previous PHI values.
1891 Value *Vec = PoisonValue::get(FVT);
1892 unsigned NameSuffix = 0;
1893 for (VectorSlice &S : Slices) {
1894 const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
1895 if (S.NumElts > 1)
1896 Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
1897 else
1898 Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
1899 }
1900
1901 I.replaceAllUsesWith(Vec);
1902 DeadVals.push_back(&I);
1903 return true;
1904}
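The slicing scheme is easier to see in isolation; the following standalone sketch (sliceVector is an illustrative helper, not part of the pass) reproduces the same Idx/NumElts decomposition for a <5 x i16> PHI:

#include <cstdio>
#include <vector>

struct Slice {
  unsigned Idx;
  unsigned NumElts;
};

// Mirror of the slicing loop above: pack 8/16-bit elements into 32-bit
// sub-vectors and scalarize the tail; wider elements are fully scalarized.
static std::vector<Slice> sliceVector(unsigned EltSize, unsigned NumElts) {
  std::vector<Slice> Slices;
  unsigned Idx = 0;
  if (EltSize == 8 || EltSize == 16) {
    const unsigned SubVecSize = 32 / EltSize;
    for (unsigned End = (NumElts / SubVecSize) * SubVecSize; Idx < End;
         Idx += SubVecSize)
      Slices.push_back({Idx, SubVecSize});
  }
  for (; Idx < NumElts; ++Idx)
    Slices.push_back({Idx, 1});
  return Slices;
}

int main() {
  // <5 x i16> -> [<2 x i16> @0], [<2 x i16> @2], [i16 @4], matching the
  // VectorSlice documentation above.
  for (const Slice &S : sliceVector(16, 5))
    std::printf("slice at index %u, %u element(s)\n", S.Idx, S.NumElts);
  return 0;
}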
1905
1906/// \param V Value to check
1907/// \param DL DataLayout
1908/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1909/// \param AS Target Address Space
1910/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1911static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1912 const AMDGPUTargetMachine &TM, unsigned AS) {
1913 // Pointer cannot be null if it's a block address, GV or alloca.
1914 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1915 // it as the symbol could be null in such cases.
1916 if (isa<BlockAddress, GlobalValue, AllocaInst>(V))
1917 return true;
1918
1919 // Check nonnull arguments.
1920 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
1921 return true;
1922
1923 // Check nonnull loads.
1924 if (const auto *Load = dyn_cast<LoadInst>(V);
1925 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1926 return true;
1927
1928 // getUnderlyingObject may have looked through another addrspacecast, although
1929 // such optimizable situations have most likely been folded out by now.
1930 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
1931 return false;
1932
1933 // TODO: Calls that return nonnull?
1934
1935 // For all other things, use KnownBits.
1936 // We either use 0 or all bits set to indicate null, so check whether the
1937 // value can be zero or all ones.
1938 //
1939 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
1940 // address spaces have non-zero null values.
1941 auto SrcPtrKB = computeKnownBits(V, DL);
1942 const auto NullVal = TM.getNullPointerValue(AS);
1943
1944 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1945 assert((NullVal == 0 || NullVal == -1) &&
1946 "don't know how to check for this null value!");
1947 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1948}
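Reduced to plain integers, the final KnownBits check above behaves like the following sketch (knownNeverNullFromBits is an illustrative helper; KnownZero/KnownOne stand for the two masks inside llvm::KnownBits):

#include <cstdint>
#include <cstdio>

// Assuming the null value of the address space is either 0 or all-ones, as
// the assert above requires: a bit known to be one rules out the all-zero
// null, and a bit known to be zero rules out the all-ones null.
static bool knownNeverNullFromBits(uint64_t KnownZero, uint64_t KnownOne,
                                   bool NullIsAllOnes) {
  return NullIsAllOnes ? KnownZero != 0 : KnownOne != 0;
}

int main() {
  // E.g. a pointer whose low bit is known set can never be an all-zero null.
  std::printf("%d\n", knownNeverNullFromBits(/*KnownZero=*/0, /*KnownOne=*/1,
                                             /*NullIsAllOnes=*/false));
  return 0;
}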
1949
1950bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1951 // The intrinsic doesn't support vectors. It also often seems difficult to
1952 // prove that a vector cannot contain any null elements, so it's unclear
1953 // whether supporting vectors would be worth it.
1954 if (I.getType()->isVectorTy())
1955 return false;
1956
1957 // Check if this can be lowered to an amdgcn.addrspacecast.nonnull.
1958 // This is only worthwhile for casts between flat and private/local.
1959 const unsigned SrcAS = I.getSrcAddressSpace();
1960 const unsigned DstAS = I.getDestAddressSpace();
1961
1962 bool CanLower = false;
1963 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
1964 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
1965 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
1966 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
1967 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
1968 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
1969 if (!CanLower)
1970 return false;
1971
1972 SmallVector<const Value *, 4> WorkList;
1973 getUnderlyingObjects(I.getOperand(0), WorkList);
1974 if (!all_of(WorkList, [&](const Value *V) {
1975 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
1976 }))
1977 return false;
1978
1979 IRBuilder<> B(&I);
1980 auto *Intrin = B.CreateIntrinsic(
1981 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
1982 I.replaceAllUsesWith(Intrin);
1983 DeadVals.push_back(&I);
1984 return true;
1985}
1986
1987bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
1988 switch (I.getIntrinsicID()) {
1989 case Intrinsic::minnum:
1990 case Intrinsic::minimumnum:
1991 case Intrinsic::minimum:
1992 return visitFMinLike(I);
1993 case Intrinsic::sqrt:
1994 return visitSqrt(I);
1995 default:
1996 return false;
1997 }
1998}
1999
2000/// Match non-nan fract pattern.
2001/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2002/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2003/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2004///
2005/// Only matches if fract is a useful instruction for the subtarget. Does not
2006/// account for nan handling; the instruction has a nan check on the input value.
2007Value *AMDGPUCodeGenPrepareImpl::matchFractPat(IntrinsicInst &I) {
2008 if (ST.hasFractBug())
2009 return nullptr;
2010
2011 Intrinsic::ID IID = I.getIntrinsicID();
2012
2013 // The value is only used in contexts where we know the input isn't a nan, so
2014 // any of the fmin variants are fine.
2015 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
2016 IID != Intrinsic::minimumnum)
2017 return nullptr;
2018
2019 Type *Ty = I.getType();
2020 if (!isLegalFloatingTy(Ty->getScalarType()))
2021 return nullptr;
2022
2023 Value *Arg0 = I.getArgOperand(0);
2024 Value *Arg1 = I.getArgOperand(1);
2025
2026 const APFloat *C;
2027 if (!match(Arg1, m_APFloat(C)))
2028 return nullptr;
2029
2030 APFloat One(1.0);
2031 bool LosesInfo;
2032 One.convert(C->getSemantics(), APFloat::rmNearestTiesToEven, &LosesInfo);
2033
2034 // Match nextafter(1.0, -1)
2035 One.next(true);
2036 if (One != *C)
2037 return nullptr;
2038
2039 Value *FloorSrc;
2040 if (match(Arg0, m_FSub(m_Value(FloorSrc),
2041 m_Intrinsic<Intrinsic::floor>(m_Deferred(FloorSrc)))))
2042 return FloorSrc;
2043 return nullptr;
2044}
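The nextafter(1.0, -1.0) constant that One is advanced to can be inspected with a couple of lines of standard C++ (a standalone check, independent of APFloat):

#include <cmath>
#include <cstdio>

int main() {
  // The constant the matcher above expects is the largest value strictly
  // below 1.0 in the operand's own semantics, i.e. nextafter(1.0, -1.0).
  std::printf("float : %a\n", std::nextafterf(1.0f, -1.0f)); // largest float below 1.0
  std::printf("double: %a\n", std::nextafter(1.0, -1.0));    // largest double below 1.0
  return 0;
}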
2045
2046Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2047 Value *FractArg) {
2048 SmallVector<Value *, 4> FractVals;
2049 extractValues(Builder, FractVals, FractArg);
2050
2051 SmallVector<Value *, 4> ResultVals(FractVals.size());
2052
2053 Type *Ty = FractArg->getType()->getScalarType();
2054 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2055 ResultVals[I] =
2056 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2057 }
2058
2059 return insertValues(Builder, FractArg->getType(), ResultVals);
2060}
2061
2062bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2063 Value *FractArg = matchFractPat(I);
2064 if (!FractArg)
2065 return false;
2066
2067 // Match pattern for fract intrinsic in contexts where the nan check has been
2068 // optimized out (hoping the knowledge that the source can't be nan wasn't lost).
2069 if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SimplifyQuery(DL, TLI)))
2070 return false;
2071
2072 IRBuilder<> Builder(&I);
2073 FastMathFlags FMF = I.getFastMathFlags();
2074 FMF.setNoNaNs();
2075 Builder.setFastMathFlags(FMF);
2076
2077 Value *Fract = applyFractPat(Builder, FractArg);
2078 Fract->takeName(&I);
2079 I.replaceAllUsesWith(Fract);
2080 DeadVals.push_back(&I);
2081 return true;
2082}
2083
2084// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2085bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2086 Type *Ty = Sqrt.getType()->getScalarType();
2087 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2088 return false;
2089
2090 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2091 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2092
2093 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2094 // of fast llvm.sqrt will give the raw instruction anyway.
2095 if (SqrtFMF.approxFunc())
2096 return false;
2097
2098 const float ReqdAccuracy = FPOp->getFPAccuracy();
2099
2100 // Defer correctly rounded expansion to codegen.
2101 if (ReqdAccuracy < 1.0f)
2102 return false;
2103
2104 Value *SrcVal = Sqrt.getOperand(0);
2105 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2106
2107 // The raw instruction is 1 ulp, but the correction for denormal handling
2108 // brings it to 2.
2109 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2110 return false;
2111
2112 IRBuilder<> Builder(&Sqrt);
2113 SmallVector<Value *, 4> SrcVals;
2114 extractValues(Builder, SrcVals, SrcVal);
2115
2116 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2117 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2118 if (CanTreatAsDAZ)
2119 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2120 else
2121 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2122 }
2123
2124 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2125 NewSqrt->takeName(&Sqrt);
2126 Sqrt.replaceAllUsesWith(NewSqrt);
2127 DeadVals.push_back(&Sqrt);
2128 return true;
2129}
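The control flow of visitSqrt can be summarized by the following decision sketch (the enum and function names are illustrative; "KeepForCodegen" corresponds to the early "return false" paths above):

#include <cstdio>

// Inputs: the approx-func fast-math flag, the !fpmath accuracy in ULPs, and
// whether the input can be treated as denormal-flushed.
enum class SqrtLowering { KeepForCodegen, FastSqrtCall, IEEE2ULPExpansion };

static SqrtLowering chooseSqrtLowering(bool ApproxFunc, float ReqdAccuracy,
                                       bool CanTreatAsDAZ) {
  if (ApproxFunc)
    return SqrtLowering::KeepForCodegen; // fast lowering emits the raw op anyway
  if (ReqdAccuracy < 1.0f)
    return SqrtLowering::KeepForCodegen; // correctly rounded path in codegen
  if (CanTreatAsDAZ)
    return SqrtLowering::FastSqrtCall;   // raw instruction is 1 ulp
  if (ReqdAccuracy < 2.0f)
    return SqrtLowering::KeepForCodegen; // denormal fixup costs an extra ulp
  return SqrtLowering::IEEE2ULPExpansion;
}

int main() {
  SqrtLowering L = chooseSqrtLowering(false, 2.5f, false);
  std::printf("%d\n", static_cast<int>(L)); // 2 -> IEEE2ULPExpansion
  return 0;
}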
2130
2131bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2132 if (skipFunction(F))
2133 return false;
2134
2135 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2136 if (!TPC)
2137 return false;
2138
2139 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2140 const TargetLibraryInfo *TLI =
2141 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2142 AssumptionCache *AC =
2143 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2144 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2145 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2146 const UniformityInfo &UA =
2147 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2148 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2149}
2150
2151PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
2152 FunctionAnalysisManager &FAM) {
2153 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2154 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2155 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2156 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2157 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2158 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2159 if (!Impl.run())
2160 return PreservedAnalyses::all();
2161 PreservedAnalyses PA = PreservedAnalyses::none();
2162 if (!Impl.FlowChanged)
2163 PA.preserveSet<CFGAnalyses>();
2164 return PA;
2165}
2166
2167INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2168 "AMDGPU IR optimizations", false, false)
2169INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
2170INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
2171INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass)
2172INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2173 false, false)
2174
2175char AMDGPUCodeGenPrepare::ID = 0;
2176
2177FunctionPass *llvm::createAMDGPUCodeGenPreparePass() {
2178 return new AMDGPUCodeGenPrepare();
2179}