LLVM 23.0.0git
AMDGPUCodeGenPrepare.cpp
Go to the documentation of this file.
1//===-- AMDGPUCodeGenPrepare.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This pass does misc. AMDGPU optimizations on IR before instruction
11/// selection.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPU.h"
16#include "AMDGPUTargetMachine.h"
18#include "llvm/ADT/SetVector.h"
26#include "llvm/IR/Dominators.h"
27#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/InstVisitor.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
31#include "llvm/IR/ValueHandle.h"
33#include "llvm/Pass.h"
39
40#define DEBUG_TYPE "amdgpu-codegenprepare"
41
42using namespace llvm;
43using namespace llvm::PatternMatch;
44
namespace {

// Widen sub-dword uniform constant loads to 32 bits (off by default).
// NOTE(review): the opening of this declaration
// (static cl::opt<bool> WidenLoads() was lost in extraction.
    "amdgpu-codegenprepare-widen-constant-loads",
    cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
    cl::init(false));

// Gate for the large-PHI-breaking transform.
// NOTE(review): the cl::init(...) line was lost in extraction.
static cl::opt<bool>
    BreakLargePHIs("amdgpu-codegenprepare-break-large-phis",
                   cl::desc("Break large PHI nodes for DAGISel"),

// Testing knob: break large PHIs unconditionally, ignoring profitability.
// NOTE(review): the cl::init(...) line was lost in extraction.
static cl::opt<bool>
    ForceBreakLargePHIs("amdgpu-codegenprepare-force-break-large-phis",
                        cl::desc("For testing purposes, always break large "
                                 "PHIs even if it isn't profitable."),

// Size threshold (in bits) below which PHIs are left alone.
// NOTE(review): the cl::init(...) line was lost in extraction.
static cl::opt<unsigned> BreakLargePHIsThreshold(
    "amdgpu-codegenprepare-break-large-phis-threshold",
    cl::desc("Minimum type size in bits for breaking large PHI nodes"),

// Enable the mul -> mul24 intrinsic rewrite (on by default).
static cl::opt<bool> UseMul24Intrin(
    "amdgpu-codegenprepare-mul24",
    cl::desc("Introduce mul24 intrinsics in AMDGPUCodeGenPrepare"),
    cl::init(true));

// Legalize 64-bit division by using the generic IR expansion.
static cl::opt<bool> ExpandDiv64InIR(
    "amdgpu-codegenprepare-expand-div64",
    cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
    cl::init(false));

// Leave all division operations as they are. This supersedes ExpandDiv64InIR
// and is used for testing the legalizer.
static cl::opt<bool> DisableIDivExpand(
    "amdgpu-codegenprepare-disable-idiv-expansion",
    cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
    cl::init(false));

// Disable processing of fdiv so we can better test the backend implementations.
static cl::opt<bool> DisableFDivExpand(
    "amdgpu-codegenprepare-disable-fdiv-expansion",
    cl::desc("Prevent expanding floating point division in AMDGPUCodeGenPrepare"),
    cl::init(false));

// Core implementation shared by the legacy and new pass-manager wrappers.
// Walks every instruction via InstVisitor; each visit* method returns true
// when it changed the IR.
class AMDGPUCodeGenPrepareImpl
    : public InstVisitor<AMDGPUCodeGenPrepareImpl, bool> {
public:
  Function &F;
  const GCNSubtarget &ST;
  const AMDGPUTargetMachine &TM;
  const TargetLibraryInfo *TLI;
  const UniformityInfo &UA;
  const DataLayout &DL;
  SimplifyQuery SQ;
  const bool HasFP32DenormalFlush;
  bool FlowChanged = false;
  // Lazily created and cached intrinsic declarations (see getters below).
  mutable Function *SqrtF32 = nullptr;
  mutable Function *LdexpF32 = nullptr;
  // Values replaced during a run; deleted in bulk at the end of run().
  mutable SmallVector<WeakVH> DeadVals;

  DenseMap<const PHINode *, bool> BreakPhiNodesCache;

  AMDGPUCodeGenPrepareImpl(Function &F, const AMDGPUTargetMachine &TM,
                           const TargetLibraryInfo *TLI, AssumptionCache *AC,
                           const DominatorTree *DT, const UniformityInfo &UA)
      : F(F), ST(TM.getSubtarget<GCNSubtarget>(F)), TM(TM), TLI(TLI), UA(UA),
        DL(F.getDataLayout()), SQ(DL, TLI, DT, AC),
        HasFP32DenormalFlush(SIModeRegisterDefaults(F, ST).FP32Denormals ==
        // NOTE(review): the closing line of this initializer list (and the
        // empty constructor body) was lost in extraction.

  // Lazily create/cache the f32 amdgcn.sqrt declaration.
  Function *getSqrtF32() const {
    if (SqrtF32)
      return SqrtF32;

    LLVMContext &Ctx = F.getContext();
    // NOTE(review): the first line of the call assigning SqrtF32 was lost in
    // extraction; only its argument list survives below.
        F.getParent(), Intrinsic::amdgcn_sqrt, {Type::getFloatTy(Ctx)});
    return SqrtF32;
  }

  // Lazily create/cache the f32/i32 ldexp declaration.
  Function *getLdexpF32() const {
    if (LdexpF32)
      return LdexpF32;

    LLVMContext &Ctx = F.getContext();
    // NOTE(review): the first line of the call assigning LdexpF32 was lost in
    // extraction; only its argument list survives below.
        F.getParent(), Intrinsic::ldexp,
        {Type::getFloatTy(Ctx), Type::getInt32Ty(Ctx)});
    return LdexpF32;
  }

  bool canBreakPHINode(const PHINode &I);

  /// Return true if \p T is a legal scalar floating point type.
  bool isLegalFloatingTy(const Type *T) const;

  /// Wrapper to pass all the arguments to computeKnownFPClass
  // NOTE(review): the first line of this wrapper's signature was lost in
  // extraction; the trailing parameter survives below.
                    const Instruction *CtxI) const {
    return llvm::computeKnownFPClass(V, Interested,
                                     SQ.getWithInstruction(CtxI));
  }

  // True when a denormal input can be ignored: either denormals are flushed
  // by the mode register, or (per the lost condition) proven absent.
  bool canIgnoreDenormalInput(const Value *V, const Instruction *CtxI) const {
    return HasFP32DenormalFlush ||
    // NOTE(review): the second operand of this || was lost in extraction.
  }

  /// \returns The minimum number of bits needed to store the value of \Op as an
  /// unsigned integer. Truncating to this size and then zero-extending to
  /// the original will not change the value.
  unsigned numBitsUnsigned(Value *Op, const Instruction *CtxI) const;

  /// \returns The minimum number of bits needed to store the value of \Op as a
  /// signed integer. Truncating to this size and then sign-extending to
  /// the original size will not change the value.
  unsigned numBitsSigned(Value *Op, const Instruction *CtxI) const;

  /// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.s24.
  /// SelectionDAG has an issue where an and asserting the bits are known
  bool replaceMulWithMul24(BinaryOperator &I) const;

  /// Perform same function as equivalently named function in DAGCombiner. Since
  /// we expand some divisions here, we need to perform this before obscuring.
  bool foldBinOpIntoSelect(BinaryOperator &I) const;

  bool divHasSpecialOptimization(BinaryOperator &I,
                                 Value *Num, Value *Den) const;
  unsigned getDivNumBits(BinaryOperator &I, Value *Num, Value *Den,
                         unsigned MaxDivBits, bool Signed) const;

  /// Expands 24 bit div or rem.
  Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den,
                        bool IsDiv, bool IsSigned) const;

  Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
                            Value *Num, Value *Den, unsigned NumBits,
                            bool IsDiv, bool IsSigned) const;

  /// Expands 32 bit div or rem.
  Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;

  Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
                        Value *Num, Value *Den) const;
  void expandDivRem64(BinaryOperator &I) const;

  /// Widen a scalar load.
  ///
  /// \details \p Widen scalar load for uniform, small type loads from constant
  //  memory / to a full 32-bits and then truncate the input to allow a scalar
  //  load instead of a vector load.
  //
  /// \returns True.

  bool canWidenScalarExtLoad(LoadInst &I) const;

  Value *matchFractPat(Value &V);
  Value *applyFractPat(IRBuilder<> &Builder, Value *FractArg);

  bool canOptimizeWithRsq(FastMathFlags DivFMF, FastMathFlags SqrtFMF) const;

  Value *optimizeWithRsq(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                         const Instruction *CtxI) const;

  Value *optimizeWithRcp(IRBuilder<> &Builder, Value *Num, Value *Den,
                         FastMathFlags FMF, const Instruction *CtxI) const;
  Value *optimizeWithFDivFast(IRBuilder<> &Builder, Value *Num, Value *Den,
                              float ReqdAccuracy) const;

  Value *visitFDivElement(IRBuilder<> &Builder, Value *Num, Value *Den,
                          FastMathFlags DivFMF, FastMathFlags SqrtFMF,
                          Value *RsqOp, const Instruction *FDiv,
                          float ReqdAccuracy) const;

  std::pair<Value *, Value *> getFrexpResults(IRBuilder<> &Builder,
                                              Value *Src) const;

  Value *emitRcpIEEE1ULP(IRBuilder<> &Builder, Value *Src,
                         bool IsNegative) const;
  Value *emitFrexpDiv(IRBuilder<> &Builder, Value *LHS, Value *RHS,
                      FastMathFlags FMF) const;
  Value *emitSqrtIEEE2ULP(IRBuilder<> &Builder, Value *Src,
                          FastMathFlags FMF) const;
  Value *emitRsqF64(IRBuilder<> &Builder, Value *X, FastMathFlags SqrtFMF,
                    FastMathFlags DivFMF, const Instruction *CtxI,
                    bool IsNegative) const;

  CallInst *createWorkitemIdX(IRBuilder<> &B) const;
  void replaceWithWorkitemIdX(Instruction &I) const;
  void replaceWithMaskedWorkitemIdX(Instruction &I, unsigned WaveSize) const;
  bool tryReplaceWithWorkitemId(Instruction &I, unsigned Wave) const;

  bool tryNarrowMathIfNoOverflow(Instruction *I);

public:
  bool visitFDiv(BinaryOperator &I);

  // Default: leave unrecognized instructions untouched.
  bool visitInstruction(Instruction &I) { return false; }
  bool visitBinaryOperator(BinaryOperator &I);
  bool visitLoadInst(LoadInst &I);
  bool visitSelectInst(SelectInst &I);
  bool visitPHINode(PHINode &I);
  bool visitAddrSpaceCastInst(AddrSpaceCastInst &I);

  bool visitIntrinsicInst(IntrinsicInst &I);
  bool visitFMinLike(IntrinsicInst &I);
  bool visitSqrt(IntrinsicInst &I);
  bool visitLog(FPMathOperator &Log, Intrinsic::ID IID);
  bool visitMbcntLo(IntrinsicInst &I) const;
  bool visitMbcntHi(IntrinsicInst &I) const;
  bool run();
};
268
// Legacy pass-manager wrapper; the actual work lives in
// AMDGPUCodeGenPrepareImpl.
class AMDGPUCodeGenPrepare : public FunctionPass {
public:
  static char ID;
  AMDGPUCodeGenPrepare() : FunctionPass(ID) {}
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    // NOTE(review): the AU.addRequired<...>() analysis declarations were lost
    // in extraction (doxygen lines 274-276).

    // FIXME: Division expansion needs to preserve the dominator tree.
    if (!ExpandDiv64InIR)
      AU.setPreservesAll();
  }
  bool runOnFunction(Function &F) override;
  StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};
285
286} // end anonymous namespace
287
// Drive one pass over the function: visit every instruction (bottom-up, so
// users are seen before operands), then delete everything the visitors
// queued in DeadVals.
bool AMDGPUCodeGenPrepareImpl::run() {
  BreakPhiNodesCache.clear();
  bool MadeChange = false;

  // Need to use make_early_inc_range because integer division expansion is
  // handled by Transform/Utils, and it can delete instructions such as the
  // terminator of the BB.
  for (BasicBlock &BB : reverse(F)) {
    for (Instruction &I : make_early_inc_range(reverse(BB))) {
      if (!isInstructionTriviallyDead(&I, TLI))
        MadeChange |= visit(I);
    }
  }

  // WeakVH entries go null if something else already erased the value.
  while (!DeadVals.empty()) {
    if (auto *I = dyn_cast_or_null<Instruction>(DeadVals.pop_back_val()))
      // NOTE(review): the deletion call on this dead instruction was lost in
      // extraction (doxygen line 304).
  }

  return MadeChange;
}
309
310bool AMDGPUCodeGenPrepareImpl::isLegalFloatingTy(const Type *Ty) const {
311 return Ty->isFloatTy() || Ty->isDoubleTy() ||
312 (Ty->isHalfTy() && ST.has16BitInsts());
313}
314
315bool AMDGPUCodeGenPrepareImpl::canWidenScalarExtLoad(LoadInst &I) const {
316 Type *Ty = I.getType();
317 int TySize = DL.getTypeSizeInBits(Ty);
318 Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
319
320 return I.isSimple() && TySize < 32 && Alignment >= 4 && UA.isUniform(&I);
321}
322
323unsigned
324AMDGPUCodeGenPrepareImpl::numBitsUnsigned(Value *Op,
325 const Instruction *CtxI) const {
326 return computeKnownBits(Op, SQ.getWithInstruction(CtxI)).countMaxActiveBits();
327}
328
329unsigned
330AMDGPUCodeGenPrepareImpl::numBitsSigned(Value *Op,
331 const Instruction *CtxI) const {
332 return ComputeMaxSignificantBits(Op, SQ.DL, SQ.AC, CtxI, SQ.DT);
333}
334
335static void extractValues(IRBuilder<> &Builder,
336 SmallVectorImpl<Value *> &Values, Value *V) {
337 auto *VT = dyn_cast<FixedVectorType>(V->getType());
338 if (!VT) {
339 Values.push_back(V);
340 return;
341 }
342
343 for (int I = 0, E = VT->getNumElements(); I != E; ++I)
344 Values.push_back(Builder.CreateExtractElement(V, I));
345}
346
// Inverse of extractValues: rebuild a value of type Ty from scalar pieces.
// NOTE(review): the first line of the signature
// (static Value *insertValues(IRBuilder<> &Builder,) was lost in extraction.
                           Type *Ty,
                           SmallVectorImpl<Value *> &Values) {
  // Scalar case: the single piece is the result.
  if (!Ty->isVectorTy()) {
    assert(Values.size() == 1);
    return Values[0];
  }

  // Rebuild the vector element by element, starting from poison.
  Value *NewVal = PoisonValue::get(Ty);
  for (int I = 0, E = Values.size(); I != E; ++I)
    NewVal = Builder.CreateInsertElement(NewVal, Values[I], I);

  return NewVal;
}
361
// Rewrite a divergent integer mul whose operands fit in 24 bits as
// llvm.amdgcn.mul.u24 / llvm.amdgcn.mul.i24, scalarizing vectors.
bool AMDGPUCodeGenPrepareImpl::replaceMulWithMul24(BinaryOperator &I) const {
  if (I.getOpcode() != Instruction::Mul)
    return false;

  Type *Ty = I.getType();
  unsigned Size = Ty->getScalarSizeInBits();
  // Small multiplies are already cheap with native 16-bit instructions.
  if (Size <= 16 && ST.has16BitInsts())
    return false;

  // Prefer scalar if this could be s_mul_i32
  if (UA.isUniform(&I))
    return false;

  Value *LHS = I.getOperand(0);
  Value *RHS = I.getOperand(1);
  IRBuilder<> Builder(&I);
  Builder.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned LHSBits = 0, RHSBits = 0;
  bool IsSigned = false;

  // Choose unsigned vs. signed mul24 depending on which 24-bit bound both
  // operands provably satisfy (note: the comparisons also capture the bit
  // counts via assignment).
  if (ST.hasMulU24() && (LHSBits = numBitsUnsigned(LHS, &I)) <= 24 &&
      (RHSBits = numBitsUnsigned(RHS, &I)) <= 24) {
    IsSigned = false;

  } else if (ST.hasMulI24() && (LHSBits = numBitsSigned(LHS, &I)) <= 24 &&
             (RHSBits = numBitsSigned(RHS, &I)) <= 24) {
    IsSigned = true;

  } else
    return false;

  SmallVector<Value *, 4> LHSVals;
  SmallVector<Value *, 4> RHSVals;
  SmallVector<Value *, 4> ResultVals;
  extractValues(Builder, LHSVals, LHS);
  extractValues(Builder, RHSVals, RHS);

  IntegerType *I32Ty = Builder.getInt32Ty();
  IntegerType *IntrinTy = Size > 32 ? Builder.getInt64Ty() : I32Ty;
  Type *DstTy = LHSVals[0]->getType();

  // Per-element: extend/truncate operands to i32, multiply with the chosen
  // intrinsic, then convert back to the original element type.
  for (int I = 0, E = LHSVals.size(); I != E; ++I) {
    Value *LHS = IsSigned ? Builder.CreateSExtOrTrunc(LHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
    Value *RHS = IsSigned ? Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty)
                          : Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
    // NOTE(review): the first line of the intrinsic-ID declaration
    // (Intrinsic::ID ID =) was lost in extraction.
        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
    Value *Result = Builder.CreateIntrinsic(ID, {IntrinTy}, {LHS, RHS});
    Result = IsSigned ? Builder.CreateSExtOrTrunc(Result, DstTy)
                      : Builder.CreateZExtOrTrunc(Result, DstTy);
    ResultVals.push_back(Result);
  }

  Value *NewVal = insertValues(Builder, Ty, ResultVals);
  NewVal->takeName(&I);
  I.replaceAllUsesWith(NewVal);
  DeadVals.push_back(&I);

  return true;
}
424
// Find a select instruction, which may have been casted. This is mostly to deal
// with cases where i16 selects were promoted here to i32.
// NOTE(review): the function header (static SelectInst *
// findSelectThroughCast(Value *V, CastInst *&Cast) {) was lost in extraction.
// On return, Cast is the intervening cast if one was looked through, else null.
  Cast = nullptr;
  if (SelectInst *Sel = dyn_cast<SelectInst>(V))
    return Sel;

  // Look through a single cast whose operand is a select.
  if ((Cast = dyn_cast<CastInst>(V))) {
    if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
      return Sel;
  }

  return nullptr;
}
439
// Fold `binop (select c, t, f), C` (or the commuted form) into
// `select c, (binop t, C), (binop f, C)` when both arms constant-fold.
bool AMDGPUCodeGenPrepareImpl::foldBinOpIntoSelect(BinaryOperator &BO) const {
  // Don't do this unless the old select is going away. We want to eliminate the
  // binary operator, not replace a binop with a select.
  int SelOpNo = 0;

  CastInst *CastOp;

  // TODO: Should probably try to handle some cases with multiple
  // users. Duplicating the select may be profitable for division.
  SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
  if (!Sel || !Sel->hasOneUse()) {
    SelOpNo = 1;
    Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
  }

  if (!Sel || !Sel->hasOneUse())
    return false;

  // NOTE(review): the lines extracting the select's constant arms into CT/CF
  // were lost in extraction (doxygen lines 458-459).
  Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
  if (!CBO || !CT || !CF)
    return false;

  // If the select was reached through a cast, fold the cast into each arm.
  if (CastOp) {
    if (!CastOp->hasOneUse())
      return false;
    CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), DL);
    CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), DL);
  }

  // TODO: Handle special 0/-1 cases DAG combine does, although we only really
  // need to handle divisions here.
  Constant *FoldedT =
      SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, DL)
              : ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, DL);
  if (!FoldedT || isa<ConstantExpr>(FoldedT))
    return false;

  Constant *FoldedF =
      SelOpNo ? ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, DL)
              : ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, DL);
  if (!FoldedF || isa<ConstantExpr>(FoldedF))
    return false;

  IRBuilder<> Builder(&BO);
  Builder.SetCurrentDebugLocation(BO.getDebugLoc());
  // Preserve the binop's fast-math flags on the new select.
  if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
    Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
                                          FoldedT, FoldedF);
  NewSelect->takeName(&BO);
  BO.replaceAllUsesWith(NewSelect);
  DeadVals.push_back(&BO);
  if (CastOp)
    DeadVals.push_back(CastOp);
  DeadVals.push_back(Sel);
  return true;
}
500
501std::pair<Value *, Value *>
502AMDGPUCodeGenPrepareImpl::getFrexpResults(IRBuilder<> &Builder,
503 Value *Src) const {
504 Type *Ty = Src->getType();
505 Value *Frexp = Builder.CreateIntrinsic(Intrinsic::frexp,
506 {Ty, Builder.getInt32Ty()}, Src);
507 Value *FrexpMant = Builder.CreateExtractValue(Frexp, {0});
508
509 // Bypass the bug workaround for the exponent result since it doesn't matter.
510 // TODO: Does the bug workaround even really need to consider the exponent
511 // result? It's unspecified by the spec.
512
513 Value *FrexpExp =
514 ST.hasFractBug()
515 ? Builder.CreateIntrinsic(Intrinsic::amdgcn_frexp_exp,
516 {Builder.getInt32Ty(), Ty}, Src)
517 : Builder.CreateExtractValue(Frexp, {1});
518 return {FrexpMant, FrexpExp};
519}
520
521/// Emit an expansion of 1.0 / Src good for 1ulp that supports denormals.
522Value *AMDGPUCodeGenPrepareImpl::emitRcpIEEE1ULP(IRBuilder<> &Builder,
523 Value *Src,
524 bool IsNegative) const {
525 // Same as for 1.0, but expand the sign out of the constant.
526 // -1.0 / x -> rcp (fneg x)
527 if (IsNegative)
528 Src = Builder.CreateFNeg(Src);
529
530 // The rcp instruction doesn't support denormals, so scale the input
531 // out of the denormal range and convert at the end.
532 //
533 // Expand as 2^-n * (1.0 / (x * 2^n))
534
535 // TODO: Skip scaling if input is known never denormal and the input
536 // range won't underflow to denormal. The hard part is knowing the
537 // result. We need a range check, the result could be denormal for
538 // 0x1p+126 < den <= 0x1p+127.
539 auto [FrexpMant, FrexpExp] = getFrexpResults(Builder, Src);
540 Value *ScaleFactor = Builder.CreateNeg(FrexpExp);
541 Value *Rcp = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMant);
542 return Builder.CreateCall(getLdexpF32(), {Rcp, ScaleFactor});
543}
544
/// Emit a 2ulp expansion for fdiv by using frexp for input scaling.
Value *AMDGPUCodeGenPrepareImpl::emitFrexpDiv(IRBuilder<> &Builder, Value *LHS,
                                              Value *RHS,
                                              FastMathFlags FMF) const {
  // If we have to work around the fract/frexp bug, we're worse off than
  // using the fdiv.fast expansion. The full safe expansion is faster if we have
  // fast FMA.
  if (HasFP32DenormalFlush && ST.hasFractBug() && !ST.hasFastFMAF32() &&
      (!FMF.noNaNs() || !FMF.noInfs()))
    return nullptr;

  // We're scaling the LHS to avoid a denormal input, and scale the denominator
  // to avoid large values underflowing the result.
  auto [FrexpMantRHS, FrexpExpRHS] = getFrexpResults(Builder, RHS);

  Value *Rcp =
      Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, FrexpMantRHS);

  auto [FrexpMantLHS, FrexpExpLHS] = getFrexpResults(Builder, LHS);
  Value *Mul = Builder.CreateFMul(FrexpMantLHS, Rcp);

  // We multiplied by 2^N/2^M, so we need to multiply by 2^(N-M) to scale the
  // result.
  Value *ExpDiff = Builder.CreateSub(FrexpExpLHS, FrexpExpRHS);
  return Builder.CreateCall(getLdexpF32(), {Mul, ExpDiff});
}
571
/// Emit a sqrt that handles denormals and is accurate to 2ulp.
Value *AMDGPUCodeGenPrepareImpl::emitSqrtIEEE2ULP(IRBuilder<> &Builder,
                                                  Value *Src,
                                                  FastMathFlags FMF) const {
  Type *Ty = Src->getType();
  APFloat SmallestNormal =
  // NOTE(review): the initializer of SmallestNormal (presumably
  // APFloat::getSmallestNormalized(...)) was lost in extraction.
  Value *NeedScale =
      Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));

  // Scale inputs below the smallest normal up by 2^32 before the sqrt...
  ConstantInt *Zero = Builder.getInt32(0);
  Value *InputScaleFactor =
      Builder.CreateSelect(NeedScale, Builder.getInt32(32), Zero);

  Value *Scaled = Builder.CreateCall(getLdexpF32(), {Src, InputScaleFactor});

  Value *Sqrt = Builder.CreateCall(getSqrtF32(), Scaled);

  // ...then scale the result back down by 2^-16 (sqrt halves the exponent).
  Value *OutputScaleFactor =
      Builder.CreateSelect(NeedScale, Builder.getInt32(-16), Zero);
  return Builder.CreateCall(getLdexpF32(), {Sqrt, OutputScaleFactor});
}
594
595/// Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
596static Value *emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src,
597 bool IsNegative) {
598 // bool need_scale = x < 0x1p-126f;
599 // float input_scale = need_scale ? 0x1.0p+24f : 1.0f;
600 // float output_scale = need_scale ? 0x1.0p+12f : 1.0f;
601 // rsq(x * input_scale) * output_scale;
602
603 Type *Ty = Src->getType();
604 APFloat SmallestNormal =
605 APFloat::getSmallestNormalized(Ty->getFltSemantics());
606 Value *NeedScale =
607 Builder.CreateFCmpOLT(Src, ConstantFP::get(Ty, SmallestNormal));
608 Constant *One = ConstantFP::get(Ty, 1.0);
609 Constant *InputScale = ConstantFP::get(Ty, 0x1.0p+24);
610 Constant *OutputScale =
611 ConstantFP::get(Ty, IsNegative ? -0x1.0p+12 : 0x1.0p+12);
612
613 Value *InputScaleFactor = Builder.CreateSelect(NeedScale, InputScale, One);
614
615 Value *ScaledInput = Builder.CreateFMul(Src, InputScaleFactor);
616 Value *Rsq = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, ScaledInput);
617 Value *OutputScaleFactor = Builder.CreateSelect(
618 NeedScale, OutputScale, IsNegative ? ConstantFP::get(Ty, -1.0) : One);
619
620 return Builder.CreateFMul(Rsq, OutputScaleFactor);
621}
622
/// Emit inverse sqrt expansion for f64 with a correction sequence on top of
/// v_rsq_f64. This should give a 1ulp result.
Value *AMDGPUCodeGenPrepareImpl::emitRsqF64(IRBuilder<> &Builder, Value *X,
                                            FastMathFlags SqrtFMF,
                                            FastMathFlags DivFMF,
                                            const Instruction *CtxI,
                                            bool IsNegative) const {
  // rsq(x):
  //   double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  //   double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  //   return MATH_MAD(y0*e, MATH_MAD(e, 0.375, 0.5), y0);
  //
  // -rsq(x):
  //   double y0 = BUILTIN_AMDGPU_RSQRT_F64(x);
  //   double e = MATH_MAD(-y0 * (x == PINF_F64 || x == 0.0 ? y0 : x), y0, 1.0);
  //   return MATH_MAD(-y0*e, MATH_MAD(e, 0.375, 0.5), -y0);
  //
  // The rsq instruction handles the special cases correctly. We need to check
  // for the edge case conditions to ensure the special case propagates through
  // the later instructions.

  Value *Y0 = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, X);

  // Try to elide the edge case check.
  //
  // Fast math flags imply:
  // sqrt ninf => !isinf(x)
  // fdiv ninf => x != 0, !isinf(x)
  bool MaybePosInf = !SqrtFMF.noInfs() && !DivFMF.noInfs();
  bool MaybeZero = !DivFMF.noInfs();

  DenormalMode DenormMode;
  FPClassTest Interested = fcNone;
  if (MaybePosInf)
    Interested = fcPosInf;
  if (MaybeZero)
    Interested |= fcZero;

  // Use known-fpclass analysis to prove the edge cases impossible.
  if (Interested != fcNone) {
    KnownFPClass KnownSrc = computeKnownFPClass(X, Interested, CtxI);
    if (KnownSrc.isKnownNeverPosInfinity())
      MaybePosInf = false;

    DenormMode = F.getDenormalMode(X->getType()->getFltSemantics());
    if (KnownSrc.isKnownNeverLogicalZero(DenormMode))
      MaybeZero = false;
  }

  // First operand of the correction multiply: the rsq result for the special
  // inputs (0, +inf), otherwise x itself.
  Value *SpecialOrRsq = X;
  if (MaybeZero || MaybePosInf) {
    Value *Cond;
    if (MaybePosInf && MaybeZero) {
      if (DenormMode.Input != DenormalMode::DenormalModeKind::Dynamic) {
        FPClassTest TestMask = fcPosInf | fcZero;
        // If denormal inputs flush to zero, they are "logical" zeros too.
        if (DenormMode.inputsAreZero())
          TestMask |= fcSubnormal;

        Cond = Builder.createIsFPClass(X, TestMask);
      } else {
        // Avoid using llvm.is.fpclass for dynamic denormal mode, since it
        // doesn't respect the floating-point environment.
        Value *IsZero =
            Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
        Value *IsInf =
            Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
        Cond = Builder.CreateOr(IsZero, IsInf);
      }
    } else if (MaybeZero) {
      Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getZero(X->getType()));
    } else {
      Cond = Builder.CreateFCmpOEQ(X, ConstantFP::getInfinity(X->getType()));
    }

    SpecialOrRsq = Builder.CreateSelect(Cond, Y0, X);
  }

  Value *NegY0 = Builder.CreateFNeg(Y0);
  Value *NegXY0 = Builder.CreateFMul(SpecialOrRsq, NegY0);

  // Could be fmuladd, but isFMAFasterThanFMulAndFAdd is always true for f64.
  Value *E = Builder.CreateFMA(NegXY0, Y0, ConstantFP::get(X->getType(), 1.0));

  Value *Y0E = Builder.CreateFMul(E, IsNegative ? NegY0 : Y0);

  Value *EFMA = Builder.CreateFMA(E, ConstantFP::get(X->getType(), 0.375),
                                  ConstantFP::get(X->getType(), 0.5));

  return Builder.CreateFMA(Y0E, EFMA, IsNegative ? NegY0 : Y0);
}
712
713bool AMDGPUCodeGenPrepareImpl::canOptimizeWithRsq(FastMathFlags DivFMF,
714 FastMathFlags SqrtFMF) const {
715 // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp for f32 and
716 // f64.
717 return DivFMF.allowContract() && SqrtFMF.allowContract();
718}
719
// Contract `(+/-1.0) / sqrt(x)` into an rsq expansion; returns null when no
// profitable form applies.
Value *AMDGPUCodeGenPrepareImpl::optimizeWithRsq(
    IRBuilder<> &Builder, Value *Num, Value *Den, const FastMathFlags DivFMF,
    const FastMathFlags SqrtFMF, const Instruction *CtxI) const {
  // The rsqrt contraction increases accuracy from ~2ulp to ~1ulp.
  assert(DivFMF.allowContract() && SqrtFMF.allowContract());

  // rsq_f16 is accurate to 0.51 ulp.
  // rsq_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rsq_f64 is never accurate.
  const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num);
  if (!CLHS)
    return nullptr;

  bool IsNegative = false;

  // TODO: Handle other numerator values with arcp.
  // Note the deliberate assignment inside the condition: IsNegative records
  // which of +/-1.0 matched.
  if (CLHS->isExactlyValue(1.0) || (IsNegative = CLHS->isExactlyValue(-1.0))) {
    // Add in the sqrt flags.
    IRBuilder<>::FastMathFlagGuard Guard(Builder);
    Builder.setFastMathFlags(DivFMF | SqrtFMF);

    if (Den->getType()->isFloatTy()) {
      // Raw v_rsq_f32 is usable when approximation is allowed or a denormal
      // input is impossible/flushed; otherwise use the scaled 1ulp expansion.
      if ((DivFMF.approxFunc() && SqrtFMF.approxFunc()) ||
          canIgnoreDenormalInput(Den, CtxI)) {
        Value *Result =
            Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rsq, Den);
        // -1.0 / sqrt(x) -> fneg(rsq(x))
        return IsNegative ? Builder.CreateFNeg(Result) : Result;
      }

      return emitRsqIEEE1ULP(Builder, Den, IsNegative);
    }

    if (Den->getType()->isDoubleTy())
      return emitRsqF64(Builder, Den, SqrtFMF, DivFMF, CtxI, IsNegative);
  }

  return nullptr;
}
759
// Optimize fdiv with rcp:
//
// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
//               allowed with afn.
//
// a/b -> a*rcp(b) when arcp is allowed, and we only need provide ULP 1.0
Value *
AMDGPUCodeGenPrepareImpl::optimizeWithRcp(IRBuilder<> &Builder, Value *Num,
                                          Value *Den, FastMathFlags FMF,
                                          const Instruction *CtxI) const {
  // rcp_f16 is accurate to 0.51 ulp.
  // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
  // rcp_f64 is never accurate.
  assert(Den->getType()->isFloatTy());

  if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
    bool IsNegative = false;
    // Deliberate assignment inside the condition: IsNegative records which of
    // +/-1.0 matched.
    if (CLHS->isExactlyValue(1.0) ||
        (IsNegative = CLHS->isExactlyValue(-1.0))) {
      Value *Src = Den;

      if (HasFP32DenormalFlush || FMF.approxFunc()) {
        // -1.0 / x -> 1.0 / fneg(x)
        if (IsNegative)
          Src = Builder.CreateFNeg(Src);

        // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
        // the CI documentation has a worst case error of 1 ulp.
        // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK
        // to use it as long as we aren't trying to use denormals.
        //
        // v_rcp_f16 and v_rsq_f16 DO support denormals.

        // NOTE: v_sqrt and v_rcp will be combined to v_rsq later. So we don't
        // insert rsq intrinsic here.

        // 1.0 / x -> rcp(x)
        return Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Src);
      }

      // TODO: If the input isn't denormal, and we know the input exponent isn't
      // big enough to introduce a denormal we can avoid the scaling.
      return emitRcpIEEE1ULP(Builder, Src, IsNegative);
    }
  }

  if (FMF.allowReciprocal()) {
    // x / y -> x * (1.0 / y)

    // TODO: Could avoid denormal scaling and use raw rcp if we knew the output
    // will never underflow.
    if (HasFP32DenormalFlush || FMF.approxFunc()) {
      Value *Recip = Builder.CreateUnaryIntrinsic(Intrinsic::amdgcn_rcp, Den);
      return Builder.CreateFMul(Num, Recip);
    }

    // Denormal-safe path: scaled 1ulp rcp expansion.
    Value *Recip = emitRcpIEEE1ULP(Builder, Den, false);
    return Builder.CreateFMul(Num, Recip);
  }

  return nullptr;
}
822
823// optimize with fdiv.fast:
824//
825// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
826//
827// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
828//
829// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
830Value *AMDGPUCodeGenPrepareImpl::optimizeWithFDivFast(
831 IRBuilder<> &Builder, Value *Num, Value *Den, float ReqdAccuracy) const {
832 // fdiv.fast can achieve 2.5 ULP accuracy.
833 if (ReqdAccuracy < 2.5f)
834 return nullptr;
835
836 // Only have fdiv.fast for f32.
837 assert(Den->getType()->isFloatTy());
838
839 bool NumIsOne = false;
840 if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
841 if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
842 NumIsOne = true;
843 }
844
845 // fdiv does not support denormals. But 1.0/x is always fine to use it.
846 //
847 // TODO: This works for any value with a specific known exponent range, don't
848 // just limit to constant 1.
849 if (!HasFP32DenormalFlush && !NumIsOne)
850 return nullptr;
851
852 return Builder.CreateIntrinsic(Intrinsic::amdgcn_fdiv_fast, {Num, Den});
853}
854
855Value *AMDGPUCodeGenPrepareImpl::visitFDivElement(
856 IRBuilder<> &Builder, Value *Num, Value *Den, FastMathFlags DivFMF,
857 FastMathFlags SqrtFMF, Value *RsqOp, const Instruction *FDivInst,
858 float ReqdDivAccuracy) const {
859 if (RsqOp) {
860 Value *Rsq =
861 optimizeWithRsq(Builder, Num, RsqOp, DivFMF, SqrtFMF, FDivInst);
862 if (Rsq)
863 return Rsq;
864 }
865
866 if (!Num->getType()->isFloatTy())
867 return nullptr;
868
869 Value *Rcp = optimizeWithRcp(Builder, Num, Den, DivFMF, FDivInst);
870 if (Rcp)
871 return Rcp;
872
873 // In the basic case fdiv_fast has the same instruction count as the frexp div
874 // expansion. Slightly prefer fdiv_fast since it ends in an fmul that can
875 // potentially be fused into a user. Also, materialization of the constants
876 // can be reused for multiple instances.
877 Value *FDivFast = optimizeWithFDivFast(Builder, Num, Den, ReqdDivAccuracy);
878 if (FDivFast)
879 return FDivFast;
880
881 return emitFrexpDiv(Builder, Num, Den, DivFMF);
882}
883
884// Optimizations is performed based on fpmath, fast math flags as well as
885// denormals to optimize fdiv with either rcp or fdiv.fast.
886//
887// With rcp:
888// 1/x -> rcp(x) when rcp is sufficiently accurate or inaccurate rcp is
889// allowed with afn.
890//
891// a/b -> a*rcp(b) when inaccurate rcp is allowed with afn.
892//
893// With fdiv.fast:
894// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
895//
896// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
897//
898// NOTE: rcp is the preference in cases that both are legal.
// Entry point for fdiv expansion. Only f32 scalars/elements (plus the f64
// rsq special case) are expanded here; everything else is left for codegen.
// Vector fdivs are scalarized and each element dispatched to
// visitFDivElement.
bool AMDGPUCodeGenPrepareImpl::visitFDiv(BinaryOperator &FDiv) {
  if (DisableFDivExpand)
    return false;

  Type *Ty = FDiv.getType()->getScalarType();
  const bool IsFloat = Ty->isFloatTy();
  if (!IsFloat && !Ty->isDoubleTy())
    return false;

  // The f64 rcp/rsq approximations are pretty inaccurate. We can do an
  // expansion around them in codegen. f16 is good enough to always use.

  const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
  const FastMathFlags DivFMF = FPOp->getFastMathFlags();
  const float ReqdAccuracy = FPOp->getFPAccuracy();

  FastMathFlags SqrtFMF;

  Value *Num = FDiv.getOperand(0);
  Value *Den = FDiv.getOperand(1);

  // Recognize a one-use sqrt in the denominator so the whole a / sqrt(b)
  // can become a single rsq when the flags allow it.
  Value *RsqOp = nullptr;
  auto *DenII = dyn_cast<IntrinsicInst>(Den);
  if (DenII && DenII->getIntrinsicID() == Intrinsic::sqrt &&
      DenII->hasOneUse()) {
    const auto *SqrtOp = cast<FPMathOperator>(DenII);
    SqrtFMF = SqrtOp->getFastMathFlags();
    if (canOptimizeWithRsq(DivFMF, SqrtFMF))
      RsqOp = SqrtOp->getOperand(0);
  }

  // rcp path not yet implemented for f64.
  if (!IsFloat && !RsqOp)
    return false;

  // Inaccurate rcp is allowed with afn.
  //
  // Defer to codegen to handle this.
  //
  // TODO: Decide on an interpretation for interactions between afn + arcp +
  // !fpmath, and make it consistent between here and codegen. For now, defer
  // expansion of afn to codegen. The current interpretation is so aggressive we
  // don't need any pre-consideration here when we have better information. A
  // more conservative interpretation could use handling here.
  const bool AllowInaccurateRcp = DivFMF.approxFunc();
  if (!RsqOp && AllowInaccurateRcp)
    return false;

  // Defer the correct implementations to codegen.
  if (IsFloat && ReqdAccuracy < 1.0f)
    return false;

  // Insert replacements immediately after the original fdiv.
  IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
  Builder.setFastMathFlags(DivFMF);
  Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());

  // Scalarize the (possibly vector) operands before expanding per element.
  SmallVector<Value *, 4> NumVals;
  SmallVector<Value *, 4> DenVals;
  SmallVector<Value *, 4> RsqDenVals;
  extractValues(Builder, NumVals, Num);
  extractValues(Builder, DenVals, Den);

  if (RsqOp)
    extractValues(Builder, RsqDenVals, RsqOp);

  SmallVector<Value *, 4> ResultVals(NumVals.size());
  for (int I = 0, E = NumVals.size(); I != E; ++I) {
    Value *NumElt = NumVals[I];
    Value *DenElt = DenVals[I];
    Value *RsqDenElt = RsqOp ? RsqDenVals[I] : nullptr;

    Value *NewElt =
        visitFDivElement(Builder, NumElt, DenElt, DivFMF, SqrtFMF, RsqDenElt,
                         cast<Instruction>(FPOp), ReqdAccuracy);
    if (!NewElt) {
      // Keep the original, but scalarized.

      // This has the unfortunate side effect of sometimes scalarizing when
      // we're not going to do anything.
      NewElt = Builder.CreateFDiv(NumElt, DenElt);
      if (auto *NewEltInst = dyn_cast<Instruction>(NewElt))
        NewEltInst->copyMetadata(FDiv);
    }

    ResultVals[I] = NewElt;
  }

  // Rebuild the vector (or pass through the scalar) and retire the original.
  Value *NewVal = insertValues(Builder, FDiv.getType(), ResultVals);

  if (NewVal) {
    FDiv.replaceAllUsesWith(NewVal);
    NewVal->takeName(&FDiv);
    DeadVals.push_back(&FDiv);
  }

  return true;
}
996
997static std::pair<Value*, Value*> getMul64(IRBuilder<> &Builder,
998 Value *LHS, Value *RHS) {
999 Type *I32Ty = Builder.getInt32Ty();
1000 Type *I64Ty = Builder.getInt64Ty();
1001
1002 Value *LHS_EXT64 = Builder.CreateZExt(LHS, I64Ty);
1003 Value *RHS_EXT64 = Builder.CreateZExt(RHS, I64Ty);
1004 Value *MUL64 = Builder.CreateMul(LHS_EXT64, RHS_EXT64);
1005 Value *Lo = Builder.CreateTrunc(MUL64, I32Ty);
1006 Value *Hi = Builder.CreateLShr(MUL64, Builder.getInt64(32));
1007 Hi = Builder.CreateTrunc(Hi, I32Ty);
1008 return std::pair(Lo, Hi);
1009}
1010
1011static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
1012 return getMul64(Builder, LHS, RHS).second;
1013}
1014
/// Figure out how many bits are really needed for this division.
/// \p MaxDivBits is an optimization hint to bypass the second
/// ComputeNumSignBits/computeKnownBits call if the first one is
/// insufficient.
unsigned AMDGPUCodeGenPrepareImpl::getDivNumBits(BinaryOperator &I, Value *Num,
                                                 Value *Den,
                                                 unsigned MaxDivBits,
                                                 bool IsSigned) const {
         Den->getType()->getScalarSizeInBits());
  unsigned SSBits = Num->getType()->getScalarSizeInBits();
  if (IsSigned) {
    // Query the divisor first; if it alone cannot get us under MaxDivBits,
    // the numerator cannot help, so skip the second query.
    unsigned RHSSignBits = ComputeNumSignBits(Den, SQ.DL, SQ.AC, &I, SQ.DT);
    // A sign bit needs to be reserved for shrinking.
    unsigned DivBits = SSBits - RHSSignBits + 1;
    if (DivBits > MaxDivBits)
      return SSBits;

    unsigned LHSSignBits = ComputeNumSignBits(Num, SQ.DL, SQ.AC, &I);

    unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
    DivBits = SSBits - SignBits + 1;
    return DivBits;
  }

  // All bits are used for unsigned division for Num or Den in range
  // (SignedMax, UnsignedMax].
  KnownBits Known = computeKnownBits(Den, SQ.getWithInstruction(&I));
  if (Known.isNegative() || !Known.isNonNegative())
    return SSBits;
  unsigned RHSSignBits = Known.countMinLeadingZeros();
  unsigned DivBits = SSBits - RHSSignBits;
  // Same early-out as the signed case: the numerator can only shrink the
  // count further, never below what the divisor allows.
  if (DivBits > MaxDivBits)
    return SSBits;

  Known = computeKnownBits(Num, SQ.getWithInstruction(&I));
  if (Known.isNegative() || !Known.isNonNegative())
    return SSBits;
  unsigned LHSSignBits = Known.countMinLeadingZeros();

  unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
  DivBits = SSBits - SignBits;
  return DivBits;
}
1059
1060// The fractional part of a float is enough to accurately represent up to
1061// a 24-bit signed integer.
1062Value *AMDGPUCodeGenPrepareImpl::expandDivRem24(IRBuilder<> &Builder,
1063 BinaryOperator &I, Value *Num,
1064 Value *Den, bool IsDiv,
1065 bool IsSigned) const {
1066 unsigned DivBits = getDivNumBits(I, Num, Den, 24, IsSigned);
1067 if (DivBits > 24)
1068 return nullptr;
1069 return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
1070}
1071
// Emit the float-based expansion of a <=24-bit integer division/remainder:
// truncate the operands to i32, convert to f32, form the quotient with the
// hardware rcp approximation, truncate it, and conditionally add a +/-1
// correction term (jq) selected by comparing |fr| against |fb|. The result is
// then sign/zero-extended in-register back from DivBits.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
    IRBuilder<> &Builder, BinaryOperator &I, Value *Num, Value *Den,
    unsigned DivBits, bool IsDiv, bool IsSigned) const {
  Type *I32Ty = Builder.getInt32Ty();
  Num = Builder.CreateTrunc(Num, I32Ty);
  Den = Builder.CreateTrunc(Den, I32Ty);

  Type *F32Ty = Builder.getFloatTy();
  ConstantInt *One = Builder.getInt32(1);
  Value *JQ = One;

  if (IsSigned) {
    // char|short jq = ia ^ ib;
    JQ = Builder.CreateXor(Num, Den);

    // jq = jq >> (bitsize - 2)
    JQ = Builder.CreateAShr(JQ, Builder.getInt32(30));

    // jq = jq | 0x1
    // JQ is now +1 or -1 depending on whether the quotient is positive or
    // negative.
    JQ = Builder.CreateOr(JQ, One);
  }

  // int ia = (int)LHS;
  Value *IA = Num;

  // int ib, (int)RHS;
  Value *IB = Den;

  // float fa = (float)ia;
  Value *FA = IsSigned ? Builder.CreateSIToFP(IA, F32Ty)
                       : Builder.CreateUIToFP(IA, F32Ty);

  // float fb = (float)ib;
  Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
                       : Builder.CreateUIToFP(IB,F32Ty);

  // Approximate quotient: fa * rcp(fb).
  Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
                                       Builder.getFloatTy(), {FB});
  Value *FQM = Builder.CreateFMul(FA, RCP);

  // fq = trunc(fqm);
  CallInst *FQ = Builder.CreateUnaryIntrinsic(Intrinsic::trunc, FQM);
  FQ->copyFastMathFlags(Builder.getFastMathFlags());

  // float fqneg = -fq;
  Value *FQNeg = Builder.CreateFNeg(FQ);

  // float fr = mad(fqneg, fb, fa);
  // Use fmad_ftz when MAC/MAD f32 instructions exist, otherwise plain fma.
  auto FMAD = !ST.hasMadMacF32Insts()
                  ? Intrinsic::fma
                  : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
  Value *FR = Builder.CreateIntrinsic(FMAD,
                                      {FQNeg->getType()}, {FQNeg, FB, FA}, FQ);

  // int iq = (int)fq;
  Value *IQ = IsSigned ? Builder.CreateFPToSI(FQ, I32Ty)
                       : Builder.CreateFPToUI(FQ, I32Ty);

  // fr = fabs(fr);
  FR = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FR, FQ);

  // fb = fabs(fb);
  FB = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, FB, FQ);

  // int cv = fr >= fb;
  Value *CV = Builder.CreateFCmpOGE(FR, FB);

  // jq = (cv ? jq : 0);
  JQ = Builder.CreateSelect(CV, JQ, Builder.getInt32(0));

  // dst = iq + jq;
  Value *Div = Builder.CreateAdd(IQ, JQ);

  Value *Res = Div;
  if (!IsDiv) {
    // Rem needs compensation, it's easier to recompute it
    Value *Rem = Builder.CreateMul(Div, Den);
    Res = Builder.CreateSub(Num, Rem);
  }

  if (DivBits != 0 && DivBits < 32) {
    // Extend in register from the number of bits this divide really is.
    if (IsSigned) {
      int InRegBits = 32 - DivBits;

      Res = Builder.CreateShl(Res, InRegBits);
      Res = Builder.CreateAShr(Res, InRegBits);
    } else {
      ConstantInt *TruncMask
        = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
      Res = Builder.CreateAnd(Res, TruncMask);
    }
  }

  return Res;
}
1168
// Try to recognize special cases the DAG will emit special, better expansions
// than the general expansion we do here.

// TODO: It would be better to just directly handle those optimizations here.
//
// Returns true when the division should be left alone so later DAG combines
// can pick a better lowering (constant divisors, shl-of-power-of-2 divisors).
bool AMDGPUCodeGenPrepareImpl::divHasSpecialOptimization(BinaryOperator &I,
                                                         Value *Num,
                                                         Value *Den) const {
  if (Constant *C = dyn_cast<Constant>(Den)) {
    // Arbitrary constants get a better expansion as long as a wider mulhi is
    // legal.
    if (C->getType()->getScalarSizeInBits() <= 32)
      return true;

    // TODO: Sdiv check for not exact for some reason.

    // If there's no wider mulhi, there's only a better expansion for powers of
    // two.
    // TODO: Should really know for each vector element.
      return true;

    return false;
  }

  if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
    // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
    if (BinOpDen->getOpcode() == Instruction::Shl &&
        isa<Constant>(BinOpDen->getOperand(0)) &&
        isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), true,
                               SQ.getWithInstruction(&I))) {
      return true;
    }
  }

  return false;
}
1205
1206static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL) {
1207 // Check whether the sign can be determined statically.
1208 KnownBits Known = computeKnownBits(V, DL);
1209 if (Known.isNegative())
1210 return Constant::getAllOnesValue(V->getType());
1211 if (Known.isNonNegative())
1212 return Constant::getNullValue(V->getType());
1213 return Builder.CreateAShr(V, Builder.getInt32(31));
1214}
1215
// Expand a 32-bit (or narrower, after extension to i32) udiv/sdiv/urem/srem
// using the hardware f32 reciprocal plus one Newton-Raphson round and two
// quotient/remainder refinement steps. Returns the replacement value, or
// nullptr when the DAG has a better special-case expansion.
Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
                                                BinaryOperator &I, Value *X,
                                                Value *Y) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
         Opc == Instruction::SRem || Opc == Instruction::SDiv);

  // All floating-point helpers emitted below carry fast flags.
  FastMathFlags FMF;
  FMF.setFast();
  Builder.setFastMathFlags(FMF);

  if (divHasSpecialOptimization(I, X, Y))
    return nullptr; // Keep it for later optimization.

  bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
  bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;

  Type *Ty = X->getType();
  Type *I32Ty = Builder.getInt32Ty();
  Type *F32Ty = Builder.getFloatTy();

  // Normalize narrower operands to i32 first.
  if (Ty->getScalarSizeInBits() != 32) {
    if (IsSigned) {
      X = Builder.CreateSExtOrTrunc(X, I32Ty);
      Y = Builder.CreateSExtOrTrunc(Y, I32Ty);
    } else {
      X = Builder.CreateZExtOrTrunc(X, I32Ty);
      Y = Builder.CreateZExtOrTrunc(Y, I32Ty);
    }
  }

  // Prefer the cheaper f32-based 24-bit expansion when the operands fit.
  if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
    return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
                      Builder.CreateZExtOrTrunc(Res, Ty);
  }

  ConstantInt *Zero = Builder.getInt32(0);
  ConstantInt *One = Builder.getInt32(1);

  Value *Sign = nullptr;
  if (IsSigned) {
    Value *SignX = getSign32(X, Builder, DL);
    Value *SignY = getSign32(Y, Builder, DL);
    // Remainder sign is the same as LHS
    Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;

    // Take absolute values via (v + sign) ^ sign, where sign is 0 or -1,
    // then do the core computation unsigned.
    X = Builder.CreateAdd(X, SignX);
    Y = Builder.CreateAdd(Y, SignY);

    X = Builder.CreateXor(X, SignX);
    Y = Builder.CreateXor(Y, SignY);
  }

  // The algorithm here is based on ideas from "Software Integer Division", Tom
  // Rodeheffer, August 2008.
  //
  // unsigned udiv(unsigned x, unsigned y) {
  //   // Initial estimate of inv(y). The constant is less than 2^32 to ensure
  //   // that this is a lower bound on inv(y), even if some of the calculations
  //   // round up.
  //   unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
  //
  //   // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
  //   // Empirically this is guaranteed to give a "two-y" lower bound on
  //   // inv(y).
  //   z += umulh(z, -y * z);
  //
  //   // Quotient/remainder estimate.
  //   unsigned q = umulh(x, z);
  //   unsigned r = x - q * y;
  //
  //   // Two rounds of quotient/remainder refinement.
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //   if (r >= y) {
  //     ++q;
  //     r -= y;
  //   }
  //
  //   return q;
  // }

  // Initial estimate of inv(y).
  Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
  Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
  // 0x4F7FFFFE is (4294967296.0 - 512.0) == 2^32 - 2^9 as an f32 bit pattern.
  Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
  Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
  Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);

  // One round of UNR.
  Value *NegY = Builder.CreateSub(Zero, Y);
  Value *NegYZ = Builder.CreateMul(NegY, Z);
  Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));

  // Quotient/remainder estimate.
  Value *Q = getMulHu(Builder, X, Z);
  Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));

  // First quotient/remainder refinement.
  Value *Cond = Builder.CreateICmpUGE(R, Y);
  if (IsDiv)
    Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  // Second quotient/remainder refinement.
  Cond = Builder.CreateICmpUGE(R, Y);
  Value *Res;
  if (IsDiv)
    Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
  else
    Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);

  // Restore the sign of the result and convert back to the original type.
  if (IsSigned) {
    Res = Builder.CreateXor(Res, Sign);
    Res = Builder.CreateSub(Res, Sign);
    Res = Builder.CreateSExtOrTrunc(Res, Ty);
  } else {
    Res = Builder.CreateZExtOrTrunc(Res, Ty);
  }
  return Res;
}
1339
1340Value *AMDGPUCodeGenPrepareImpl::shrinkDivRem64(IRBuilder<> &Builder,
1341 BinaryOperator &I, Value *Num,
1342 Value *Den) const {
1343 if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
1344 return nullptr; // Keep it for later optimization.
1345
1346 Instruction::BinaryOps Opc = I.getOpcode();
1347
1348 bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
1349 bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
1350
1351 unsigned NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
1352 if (NumDivBits > 32)
1353 return nullptr;
1354
1355 Value *Narrowed = nullptr;
1356 if (NumDivBits <= 24) {
1357 Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
1358 IsDiv, IsSigned);
1359 } else if (NumDivBits <= 32) {
1360 Narrowed = expandDivRem32(Builder, I, Num, Den);
1361 }
1362
1363 if (Narrowed) {
1364 return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
1365 Builder.CreateZExt(Narrowed, Num->getType());
1366 }
1367
1368 return nullptr;
1369}
1370
// Perform the generic IR expansion of a 64-bit division or remainder that
// could not be shrunk to a narrower width. The instruction is rewritten
// in place; nothing is returned.
void AMDGPUCodeGenPrepareImpl::expandDivRem64(BinaryOperator &I) const {
  Instruction::BinaryOps Opc = I.getOpcode();
  // Do the general expansion.
  if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
    return;
  }

  if (Opc == Instruction::URem || Opc == Instruction::SRem) {
    return;
  }

  llvm_unreachable("not a division");
}
1386
/*
This can introduce an inconsistency for non-byte loads, for example:
```
  %load = load i1, ptr addrspace(4) %arg, align 4
  %zext = zext i1 %load to i64
  %add = add i64 %zext, ...
```
Instead of creating `s_and_b32 s0, s0, 1`,
it will create `s_and_b32 s0, s0, 0xff`.
We accept this change since the non-byte load assumes the upper bits
within the byte are all 0.
*/
// Narrow an add/mul to the smallest legal integer type that can hold all of
// its active bits (per computeKnownBits), when the TTI cost model says the
// narrow op plus the trunc/zext fixups is cheaper than the original.
bool AMDGPUCodeGenPrepareImpl::tryNarrowMathIfNoOverflow(Instruction *I) {
  unsigned Opc = I->getOpcode();
  Type *OldType = I->getType();

  if (Opc != Instruction::Add && Opc != Instruction::Mul)
    return false;

  unsigned OrigBit = OldType->getScalarSizeInBits();

  // NOTE(review): this re-check duplicates the early return above and can
  // never fire; it looks like leftover defensive code.
  if (Opc != Instruction::Add && Opc != Instruction::Mul)
    llvm_unreachable("Unexpected opcode, only valid for Instruction::Add and "
                     "Instruction::Mul.");

  unsigned MaxBitsNeeded = computeKnownBits(I, DL).countMaxActiveBits();

  // Round up to a byte multiple (minimum 8) so a legal integer type exists.
  MaxBitsNeeded = std::max<unsigned>(bit_ceil(MaxBitsNeeded), 8);
  Type *NewType = DL.getSmallestLegalIntType(I->getContext(), MaxBitsNeeded);
  if (!NewType)
    return false;
  unsigned NewBit = NewType->getIntegerBitWidth();
  // Only profitable when we actually shrink.
  if (NewBit >= OrigBit)
    return false;
  // Preserve vector-ness of the original type at the new element width.
  NewType = I->getType()->getWithNewBitWidth(NewBit);

  // Old cost
  const TargetTransformInfo &TTI = TM.getTargetTransformInfo(F);
  InstructionCost OldCost =
  // New cost of new op
  InstructionCost NewCost =
  // New cost of narrowing 2 operands (use trunc)
  int NumOfNonConstOps = 2;
  if (isa<Constant>(I->getOperand(0)) || isa<Constant>(I->getOperand(1))) {
    // Cannot be both constant, should be propagated
    NumOfNonConstOps = 1;
  }
  NewCost += NumOfNonConstOps * TTI.getCastInstrCost(Instruction::Trunc,
                                                     NewType, OldType,
  // New cost of zext narrowed result to original type
  NewCost +=
      TTI.getCastInstrCost(Instruction::ZExt, OldType, NewType,
  if (NewCost >= OldCost)
    return false;

  // Rewrite as trunc -> narrow op -> zext and retire the original.
  IRBuilder<> Builder(I);
  Value *Trunc0 = Builder.CreateTrunc(I->getOperand(0), NewType);
  Value *Trunc1 = Builder.CreateTrunc(I->getOperand(1), NewType);
  Value *Arith =
      Builder.CreateBinOp((Instruction::BinaryOps)Opc, Trunc0, Trunc1);

  Value *Zext = Builder.CreateZExt(Arith, OldType);
  I->replaceAllUsesWith(Zext);
  DeadVals.push_back(I);
  return true;
}
1458
// Main binary-operator visitor: tries select folding, mul24 replacement and
// math narrowing first, then expands integer division/remainder of scalar
// size <= 64, scalarizing vectors element by element. 64-bit divisions that
// need the general (control-flow introducing) expansion are queued in
// Div64ToExpand and expanded afterwards when ExpandDiv64InIR is set.
bool AMDGPUCodeGenPrepareImpl::visitBinaryOperator(BinaryOperator &I) {
  if (foldBinOpIntoSelect(I))
    return true;

  if (UseMul24Intrin && replaceMulWithMul24(I))
    return true;
  if (tryNarrowMathIfNoOverflow(&I))
    return true;

  bool Changed = false;
  Instruction::BinaryOps Opc = I.getOpcode();
  Type *Ty = I.getType();
  Value *NewDiv = nullptr;
  unsigned ScalarSize = Ty->getScalarSizeInBits();

  if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
       Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
      ScalarSize <= 64 &&
      !DisableIDivExpand) {
    Value *Num = I.getOperand(0);
    Value *Den = I.getOperand(1);
    IRBuilder<> Builder(&I);
    Builder.SetCurrentDebugLocation(I.getDebugLoc());

    if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
      // Expand each lane separately and rebuild the vector.
      NewDiv = PoisonValue::get(VT);

      for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
        Value *NumEltN = Builder.CreateExtractElement(Num, N);
        Value *DenEltN = Builder.CreateExtractElement(Den, N);

        Value *NewElt;
        if (ScalarSize <= 32) {
          NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
          if (!NewElt)
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
        } else {
          // See if this 64-bit division can be shrunk to 32/24-bits before
          // producing the general expansion.
          NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
          if (!NewElt) {
            // The general 64-bit expansion introduces control flow and doesn't
            // return the new value. Just insert a scalar copy and defer
            // expanding it.
            NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
            // CreateBinOp does constant folding. If the operands are constant,
            // it will return a Constant instead of a BinaryOperator.
            if (auto *NewEltBO = dyn_cast<BinaryOperator>(NewElt))
              Div64ToExpand.push_back(NewEltBO);
          }
        }

        if (auto *NewEltI = dyn_cast<Instruction>(NewElt))
          NewEltI->copyIRFlags(&I);

        NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
      }
    } else {
      if (ScalarSize <= 32)
        NewDiv = expandDivRem32(Builder, I, Num, Den);
      else {
        NewDiv = shrinkDivRem64(Builder, I, Num, Den);
        if (!NewDiv)
          Div64ToExpand.push_back(&I);
      }
    }

    if (NewDiv) {
      I.replaceAllUsesWith(NewDiv);
      DeadVals.push_back(&I);
      Changed = true;
    }
  }

  if (ExpandDiv64InIR) {
    // TODO: We get much worse code in specially handled constant cases.
    for (BinaryOperator *Div : Div64ToExpand) {
      expandDivRem64(*Div);
      FlowChanged = true;
      Changed = true;
    }
  }

  return Changed;
}
1546
1547bool AMDGPUCodeGenPrepareImpl::visitLoadInst(LoadInst &I) {
1548 if (!WidenLoads)
1549 return false;
1550
1551 if ((I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
1552 I.getPointerAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
1553 canWidenScalarExtLoad(I)) {
1554 IRBuilder<> Builder(&I);
1555 Builder.SetCurrentDebugLocation(I.getDebugLoc());
1556
1557 Type *I32Ty = Builder.getInt32Ty();
1558 LoadInst *WidenLoad = Builder.CreateLoad(I32Ty, I.getPointerOperand());
1559 WidenLoad->copyMetadata(I);
1560
1561 // If we have range metadata, we need to convert the type, and not make
1562 // assumptions about the high bits.
1563 if (auto *Range = WidenLoad->getMetadata(LLVMContext::MD_range)) {
1564 ConstantInt *Lower =
1565 mdconst::extract<ConstantInt>(Range->getOperand(0));
1566
1567 if (Lower->isNullValue()) {
1568 WidenLoad->setMetadata(LLVMContext::MD_range, nullptr);
1569 } else {
1570 Metadata *LowAndHigh[] = {
1571 ConstantAsMetadata::get(ConstantInt::get(I32Ty, Lower->getValue().zext(32))),
1572 // Don't make assumptions about the high bits.
1573 ConstantAsMetadata::get(ConstantInt::get(I32Ty, 0))
1574 };
1575
1576 WidenLoad->setMetadata(LLVMContext::MD_range,
1577 MDNode::get(F.getContext(), LowAndHigh));
1578 }
1579 }
1580
1581 int TySize = DL.getTypeSizeInBits(I.getType());
1582 Type *IntNTy = Builder.getIntNTy(TySize);
1583 Value *ValTrunc = Builder.CreateTrunc(WidenLoad, IntNTy);
1584 Value *ValOrig = Builder.CreateBitCast(ValTrunc, I.getType());
1585 I.replaceAllUsesWith(ValOrig);
1586 DeadVals.push_back(&I);
1587 return true;
1588 }
1589
1590 return false;
1591}
1592
// Recognize the fract idiom guarded by a NaN check and rewrite it through
// applyFractPat. Handles three shapes:
//   isnan(x) ? x : fract(x)
//   !isnan(x) ? fract(x) : x
//   !isnan(x) ? (!isinf(x) ? fract(x) : 0.0) : x
bool AMDGPUCodeGenPrepareImpl::visitSelectInst(SelectInst &I) {
  Value *Cond = I.getCondition();
  Value *TrueVal = I.getTrueValue();
  Value *FalseVal = I.getFalseValue();
  Value *CmpVal;
  CmpPredicate IsNanPred;

  // Match fract pattern with nan check.
  if (!match(Cond, m_FCmp(IsNanPred, m_Value(CmpVal), m_NonNaN())))
    return false;

  FPMathOperator *FPOp = dyn_cast<FPMathOperator>(&I);
  if (!FPOp)
    return false;

  IRBuilder<> Builder(&I);
  Builder.setFastMathFlags(FPOp->getFastMathFlags());

  Value *Fract = nullptr;
  if (IsNanPred == FCmpInst::FCMP_UNO && TrueVal == CmpVal &&
      CmpVal == matchFractPat(*FalseVal)) {
    // isnan(x) ? x : fract(x)
    Fract = applyFractPat(Builder, CmpVal);
  } else if (IsNanPred == FCmpInst::FCMP_ORD && FalseVal == CmpVal) {
    if (CmpVal == matchFractPat(*TrueVal)) {
      // !isnan(x) ? fract(x) : x
      Fract = applyFractPat(Builder, CmpVal);
    } else {
      // Match an intermediate clamp infinity to 0 pattern. i.e.
      // !isnan(x) ? (!isinf(x) ? fract(x) : 0.0) : x
      CmpPredicate PredInf;
      Value *IfNotInf;

      if (!match(TrueVal, m_Select(m_FCmp(PredInf, m_FAbs(m_Specific(CmpVal)),
                                          m_PosInf()),
                                   m_Value(IfNotInf), m_PosZeroFP())) ||
          PredInf != FCmpInst::FCMP_UNE || CmpVal != matchFractPat(*IfNotInf))
        return false;

      SelectInst *ClampInfSelect = cast<SelectInst>(TrueVal);

      // Insert before the fabs
      Value *InsertPt =
          cast<Instruction>(ClampInfSelect->getCondition())->getOperand(0);

      Builder.SetInsertPoint(cast<Instruction>(InsertPt));
      Value *NewFract = applyFractPat(Builder, CmpVal);
      NewFract->takeName(TrueVal);

      // Thread the new fract into the inf clamping sequence.
      DeadVals.push_back(ClampInfSelect->getOperand(1));
      ClampInfSelect->setOperand(1, NewFract);

      // The outer select nan handling is also absorbed into the fract.
      Fract = ClampInfSelect;
    }
  } else
    return false;

  // Replace the whole select with the fract expansion.
  Fract->takeName(&I);
  I.replaceAllUsesWith(Fract);
  DeadVals.push_back(&I);
  return true;
}
1657
1658static bool areInSameBB(const Value *A, const Value *B) {
1659 const auto *IA = dyn_cast<Instruction>(A);
1660 const auto *IB = dyn_cast<Instruction>(B);
1661 return IA && IB && IA->getParent() == IB->getParent();
1662}
1663
// Helper for breaking large PHIs that returns true when an extractelement on V
// is likely to be folded away by the DAG combiner.
//
// "Interesting" values are: an insertelement chain (within one block) that
// covers every element, a constant, or a shufflevector with a constant or
// same-block source.
  const auto *FVT = dyn_cast<FixedVectorType>(V->getType());
  if (!FVT)
    return false;

  const Value *CurVal = V;

  // Check for insertelements, keeping track of the elements covered.
  BitVector EltsCovered(FVT->getNumElements());
  while (const auto *IE = dyn_cast<InsertElementInst>(CurVal)) {
    const auto *Idx = dyn_cast<ConstantInt>(IE->getOperand(2));

    // Non constant index/out of bounds index -> folding is unlikely.
    // The latter is more of a sanity check because canonical IR should just
    // have replaced those with poison.
    if (!Idx || Idx->getZExtValue() >= FVT->getNumElements())
      return false;

    const auto *VecSrc = IE->getOperand(0);

    // If the vector source is another instruction, it must be in the same basic
    // block. Otherwise, the DAGCombiner won't see the whole thing and is
    // unlikely to be able to do anything interesting here.
    if (isa<Instruction>(VecSrc) && !areInSameBB(VecSrc, IE))
      return false;

    CurVal = VecSrc;
    EltsCovered.set(Idx->getZExtValue());

    // All elements covered.
    if (EltsCovered.all())
      return true;
  }

  // We either didn't find a single insertelement, or the insertelement chain
  // ended before all elements were covered. Check for other interesting values.

  // Constants are always interesting because we can just constant fold the
  // extractelements.
  if (isa<Constant>(CurVal))
    return true;

  // shufflevector is likely to be profitable if either operand is a constant,
  // or if either source is in the same block.
  // This is because shufflevector is most often lowered as a series of
  // insert/extract elements anyway.
  if (const auto *SV = dyn_cast<ShuffleVectorInst>(CurVal)) {
    return isa<Constant>(SV->getOperand(1)) ||
           areInSameBB(SV, SV->getOperand(0)) ||
           areInSameBB(SV, SV->getOperand(1));
  }

  return false;
}
1720
// Recursively collect \p I and every PHI reachable from it through incoming
// values or users into SeenPHIs, so the whole chain of related PHIs can be
// decided on at once (see canBreakPHINode).
static void collectPHINodes(const PHINode &I,
  // The insertion result doubles as the visited check, terminating recursion.
  const auto [It, Inserted] = SeenPHIs.insert(&I);
  if (!Inserted)
    return;

  // Walk PHIs feeding into this one...
  for (const Value *Inc : I.incoming_values()) {
    if (const auto *PhiInc = dyn_cast<PHINode>(Inc))
      collectPHINodes(*PhiInc, SeenPHIs);
  }

  // ...and PHIs consuming this one.
  for (const User *U : I.users()) {
    if (const auto *PhiU = dyn_cast<PHINode>(U))
      collectPHINodes(*PhiU, SeenPHIs);
  }
}
1737
// Decide, for the whole chain of PHIs related to \p I, whether breaking them
// up is profitable. The decision is made for the entire chain at once and
// memoized in BreakPhiNodesCache.
bool AMDGPUCodeGenPrepareImpl::canBreakPHINode(const PHINode &I) {
  // Check in the cache first.
  if (const auto It = BreakPhiNodesCache.find(&I);
      It != BreakPhiNodesCache.end())
    return It->second;

  // We consider PHI nodes as part of "chains", so given a PHI node I, we
  // recursively consider all its users and incoming values that are also PHI
  // nodes. We then make a decision about all of those PHIs at once. Either they
  // all get broken up, or none of them do. That way, we avoid cases where a
  // single PHI is/is not broken and we end up reforming/exploding a vector
  // multiple times, or even worse, doing it in a loop.
  SmallPtrSet<const PHINode *, 8> WorkList;
  collectPHINodes(I, WorkList);

#ifndef NDEBUG
  // Check that none of the PHI nodes in the worklist are in the map. If some of
  // them are, it means we're not good enough at collecting related PHIs.
  for (const PHINode *WLP : WorkList) {
    assert(BreakPhiNodesCache.count(WLP) == 0);
  }
#endif

  // To consider a PHI profitable to break, we need to see some interesting
  // incoming values. At least 2/3rd (rounded up) of all PHIs in the worklist
  // must have one to consider all PHIs breakable.
  //
  // This threshold has been determined through performance testing.
  //
  // Note that the computation below is equivalent to
  //
  //    (unsigned)ceil((K / 3.0) * 2)
  //
  // It's simply written this way to avoid mixing integral/FP arithmetic.
  const auto Threshold = (alignTo(WorkList.size() * 2, 3) / 3);
  unsigned NumBreakablePHIs = 0;
  bool CanBreak = false;
  for (const PHINode *Cur : WorkList) {
    // Don't break PHIs that have no interesting incoming values. That is, where
    // there is no clear opportunity to fold the "extractelement" instructions
    // we would add.
    //
    // Note: IC does not run after this pass, so we're only interested in the
    // foldings that the DAG combiner can do.
    if (any_of(Cur->incoming_values(), isInterestingPHIIncomingValue)) {
      if (++NumBreakablePHIs >= Threshold) {
        CanBreak = true;
        break;
      }
    }
  }

  // Record the verdict for every PHI in the chain, not just I.
  for (const PHINode *Cur : WorkList)
    BreakPhiNodesCache[Cur] = CanBreak;

  return CanBreak;
}
1795
/// Helper class for "break large PHIs" (visitPHINode).
///
/// This represents a slice of a PHI's incoming value, which is made up of:
/// - The type of the slice (Ty)
/// - The index in the incoming value's vector where the slice starts (Idx)
/// - The number of elements in the slice (NumElts).
/// It also keeps track of the NewPHI node inserted for this particular slice.
///
/// Slice examples:
///   <4 x i64> -> Split into four i64 slices.
///     -> [i64, 0, 1], [i64, 1, 1], [i64, 2, 1], [i64, 3, 1]
///   <5 x i16> -> Split into 2 <2 x i16> slices + a i16 tail.
///     -> [<2 x i16>, 0, 2], [<2 x i16>, 2, 2], [i16, 4, 1]
public:
  VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
      : Ty(Ty), Idx(Idx), NumElts(NumElts) {}

  Type *Ty = nullptr;        // Type of this slice (scalar or sub-vector).
  unsigned Idx = 0;          // Starting element index in the incoming vector.
  unsigned NumElts = 0;      // Number of elements this slice covers.
  PHINode *NewPHI = nullptr; // PHI created for this slice, once built.

  /// Slice \p Inc according to the information contained within this slice.
  /// This is cached, so if called multiple times for the same \p BB & \p Inc
  /// pair, it returns the same Sliced value as well.
  ///
  /// Note this *intentionally* does not return the same value for, say,
  /// [%bb.0, %0] & [%bb.1, %0] as:
  /// - It could cause issues with dominance (e.g. if bb.1 is seen first, then
  ///   the value in bb.1 may not be reachable from bb.0 if it's its
  ///   predecessor.)
  /// - We also want to make our extract instructions as local as possible so
  ///   the DAG has better chances of folding them out. Duplicating them like
  ///   that is beneficial in that regard.
  ///
  /// This is both a minor optimization to avoid creating duplicate
  /// instructions, but also a requirement for correctness. It is not forbidden
  /// for a PHI node to have the same [BB, Val] pair multiple times. If we
  /// returned a new value each time, those previously identical pairs would all
  /// have different incoming values (from the same block) and it'd cause a "PHI
  /// node has multiple entries for the same basic block with different incoming
  /// values!" verifier error.
  Value *getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName) {
    // Cache lookup doubles as the insertion slot on a miss.
    Value *&Res = SlicedVals[{BB, Inc}];
    if (Res)
      return Res;

    if (Instruction *IncInst = dyn_cast<Instruction>(Inc))
      B.SetCurrentDebugLocation(IncInst->getDebugLoc());

    if (NumElts > 1) {
      // Multi-element slice: extract the [Idx, Idx+NumElts) window with a
      // shuffle.
      for (unsigned K = Idx; K < (Idx + NumElts); ++K)
        Mask.push_back(K);
      Res = B.CreateShuffleVector(Inc, Mask, NewValName);
    } else
      // Single-element slice: a plain extractelement suffices.
      Res = B.CreateExtractElement(Inc, Idx, NewValName);

    return Res;
  }

private:
};
1862
bool AMDGPUCodeGenPrepareImpl::visitPHINode(PHINode &I) {
  // Break-up fixed-vector PHIs into smaller pieces.
  // Default threshold is 32, so it breaks up any vector that's >32 bits into
  // its elements, or into 32-bit pieces (for 8/16 bit elts).
  //
  // This is only helpful for DAGISel because it doesn't handle large PHIs as
  // well as GlobalISel. DAGISel lowers PHIs by using CopyToReg/CopyFromReg.
  // With large, odd-sized PHIs we may end up needing many `build_vector`
  // operations with most elements being "undef". This inhibits a lot of
  // optimization opportunities and can result in unreasonably high register
  // pressure and the inevitable stack spilling.
  if (!BreakLargePHIs || getCGPassBuilderOption().EnableGlobalISelOption)
    return false;

  // Only fixed vectors with more than one element and above the size
  // threshold are worth breaking up.
  FixedVectorType *FVT = dyn_cast<FixedVectorType>(I.getType());
  if (!FVT || FVT->getNumElements() == 1 ||
      DL.getTypeSizeInBits(FVT) <= BreakLargePHIsThreshold)
    return false;

  // Unless forced (for testing), only proceed when the profitability
  // heuristic says the break-up is worthwhile.
  if (!ForceBreakLargePHIs && !canBreakPHINode(I))
    return false;

  std::vector<VectorSlice> Slices;

  Type *EltTy = FVT->getElementType();
  {
    unsigned Idx = 0;
    // For 8/16 bits type, don't scalarize fully but break it up into as many
    // 32-bit slices as we can, and scalarize the tail.
    const unsigned EltSize = DL.getTypeSizeInBits(EltTy);
    const unsigned NumElts = FVT->getNumElements();
    if (EltSize == 8 || EltSize == 16) {
      const unsigned SubVecSize = (32 / EltSize);
      Type *SubVecTy = FixedVectorType::get(EltTy, SubVecSize);
      for (unsigned End = alignDown(NumElts, SubVecSize); Idx < End;
           Idx += SubVecSize)
        Slices.emplace_back(SubVecTy, Idx, SubVecSize);
    }

    // Scalarize all remaining elements.
    for (; Idx < NumElts; ++Idx)
      Slices.emplace_back(EltTy, Idx, 1);
  }

  // The threshold check above guarantees we never "split" into one piece.
  assert(Slices.size() > 1);

  // Create one PHI per vector piece. The "VectorSlice" class takes care of
  // creating the necessary instruction to extract the relevant slices of each
  // incoming value.
  IRBuilder<> B(I.getParent());
  B.SetCurrentDebugLocation(I.getDebugLoc());

  unsigned IncNameSuffix = 0;
  for (VectorSlice &S : Slices) {
    // We need to reset the build on each iteration, because getSlicedVal may
    // have inserted something into I's BB.
    B.SetInsertPoint(I.getParent()->getFirstNonPHIIt());
    S.NewPHI = B.CreatePHI(S.Ty, I.getNumIncomingValues());

    // getSlicedVal caches per [block, value] pair, so duplicate incoming
    // pairs get the same sliced value (a verifier requirement).
    for (const auto &[Idx, BB] : enumerate(I.blocks())) {
      S.NewPHI->addIncoming(S.getSlicedVal(BB, I.getIncomingValue(Idx),
                                           "largephi.extractslice" +
                                               std::to_string(IncNameSuffix++)),
                            BB);
    }
  }

  // And replace this PHI with a vector of all the previous PHI values.
  Value *Vec = PoisonValue::get(FVT);
  unsigned NameSuffix = 0;
  for (VectorSlice &S : Slices) {
    const auto ValName = "largephi.insertslice" + std::to_string(NameSuffix++);
    if (S.NumElts > 1)
      Vec = B.CreateInsertVector(FVT, Vec, S.NewPHI, S.Idx, ValName);
    else
      Vec = B.CreateInsertElement(Vec, S.NewPHI, S.Idx, ValName);
  }

  // Defer deletion; the caller drains DeadVals after visiting.
  I.replaceAllUsesWith(Vec);
  DeadVals.push_back(&I);
  return true;
}
1945
1946/// \param V Value to check
1947/// \param DL DataLayout
1948/// \param TM TargetMachine (TODO: remove once DL contains nullptr values)
1949/// \param AS Target Address Space
1950/// \return true if \p V cannot be the null value of \p AS, false otherwise.
1951static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL,
1952 const AMDGPUTargetMachine &TM, unsigned AS) {
1953 // Pointer cannot be null if it's a block address, GV or alloca.
1954 // NOTE: We don't support extern_weak, but if we did, we'd need to check for
1955 // it as the symbol could be null in such cases.
1957 return true;
1958
1959 // Check nonnull arguments.
1960 if (const auto *Arg = dyn_cast<Argument>(V); Arg && Arg->hasNonNullAttr())
1961 return true;
1962
1963 // Check nonnull loads.
1964 if (const auto *Load = dyn_cast<LoadInst>(V);
1965 Load && Load->hasMetadata(LLVMContext::MD_nonnull))
1966 return true;
1967
1968 // getUnderlyingObject may have looked through another addrspacecast, although
1969 // the optimizable situations most likely folded out by now.
1970 if (AS != cast<PointerType>(V->getType())->getAddressSpace())
1971 return false;
1972
1973 // TODO: Calls that return nonnull?
1974
1975 // For all other things, use KnownBits.
1976 // We either use 0 or all bits set to indicate null, so check whether the
1977 // value can be zero or all ones.
1978 //
1979 // TODO: Use ValueTracking's isKnownNeverNull if it becomes aware that some
1980 // address spaces have non-zero null values.
1981 auto SrcPtrKB = computeKnownBits(V, DL);
1982 const auto NullVal = AMDGPU::getNullPointerValue(AS);
1983
1984 assert(SrcPtrKB.getBitWidth() == DL.getPointerSizeInBits(AS));
1985 assert((NullVal == 0 || NullVal == -1) &&
1986 "don't know how to check for this null value!");
1987 return NullVal ? !SrcPtrKB.getMaxValue().isAllOnes() : SrcPtrKB.isNonZero();
1988}
1989
1990bool AMDGPUCodeGenPrepareImpl::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
1991 // Intrinsic doesn't support vectors, also it seems that it's often difficult
1992 // to prove that a vector cannot have any nulls in it so it's unclear if it's
1993 // worth supporting.
1994 if (I.getType()->isVectorTy())
1995 return false;
1996
1997 // Check if this can be lowered to a amdgcn.addrspacecast.nonnull.
1998 // This is only worthwhile for casts from/to priv/local to flat.
1999 const unsigned SrcAS = I.getSrcAddressSpace();
2000 const unsigned DstAS = I.getDestAddressSpace();
2001
2002 bool CanLower = false;
2003 if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
2004 CanLower = (DstAS == AMDGPUAS::LOCAL_ADDRESS ||
2005 DstAS == AMDGPUAS::PRIVATE_ADDRESS);
2006 else if (DstAS == AMDGPUAS::FLAT_ADDRESS)
2007 CanLower = (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2008 SrcAS == AMDGPUAS::PRIVATE_ADDRESS);
2009 if (!CanLower)
2010 return false;
2011
2013 getUnderlyingObjects(I.getOperand(0), WorkList);
2014 if (!all_of(WorkList, [&](const Value *V) {
2015 return isPtrKnownNeverNull(V, DL, TM, SrcAS);
2016 }))
2017 return false;
2018
2019 IRBuilder<> B(&I);
2020 auto *Intrin = B.CreateIntrinsic(
2021 I.getType(), Intrinsic::amdgcn_addrspacecast_nonnull, {I.getOperand(0)});
2022 I.replaceAllUsesWith(Intrin);
2023 DeadVals.push_back(&I);
2024 return true;
2025}
2026
2027bool AMDGPUCodeGenPrepareImpl::visitIntrinsicInst(IntrinsicInst &I) {
2028 Intrinsic::ID IID = I.getIntrinsicID();
2029 switch (IID) {
2030 case Intrinsic::minnum:
2031 case Intrinsic::minimumnum:
2032 case Intrinsic::minimum:
2033 return visitFMinLike(I);
2034 case Intrinsic::sqrt:
2035 return visitSqrt(I);
2036 case Intrinsic::log:
2037 case Intrinsic::log10:
2038 return visitLog(cast<FPMathOperator>(I), IID);
2039 case Intrinsic::log2:
2040 // No reason to handle log2.
2041 return false;
2042 case Intrinsic::amdgcn_mbcnt_lo:
2043 return visitMbcntLo(I);
2044 case Intrinsic::amdgcn_mbcnt_hi:
2045 return visitMbcntHi(I);
2046 default:
2047 return false;
2048 }
2049}
2050
2051/// Match non-nan fract pattern.
2052/// minnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2053/// minimumnum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2054/// minimum(fsub(x, floor(x)), nextafter(1.0, -1.0))
2055///
2056/// If fract is a useful instruction for the subtarget. Does not account for the
2057/// nan handling; the instruction has a nan check on the input value.
2058Value *AMDGPUCodeGenPrepareImpl::matchFractPat(Value &V) {
2059 if (ST.hasFractBug())
2060 return nullptr;
2061
2062 IntrinsicInst *II = dyn_cast<IntrinsicInst>(&V);
2063 if (!II)
2064 return nullptr;
2065
2066 Intrinsic::ID IID = II->getIntrinsicID();
2067
2068 // The value is only used in contexts where we know the input isn't a nan, so
2069 // any of the fmin variants are fine.
2070 if (IID != Intrinsic::minnum && IID != Intrinsic::minimum &&
2071 IID != Intrinsic::minimumnum)
2072 return nullptr;
2073
2074 Type *Ty = V.getType();
2075 if (!isLegalFloatingTy(Ty->getScalarType()))
2076 return nullptr;
2077
2078 Value *Arg0 = II->getArgOperand(0);
2079 Value *Arg1 = II->getArgOperand(1);
2080
2081 const APFloat *C;
2082 if (!match(Arg1, m_APFloatAllowPoison(C)))
2083 return nullptr;
2084
2085 APFloat OneNextDown = APFloat::getOne(C->getSemantics());
2086 OneNextDown.next(true);
2087
2088 // Match nextafter(1.0, -1)
2089 if (OneNextDown != *C)
2090 return nullptr;
2091
2092 Value *FloorSrc;
2093 if (match(Arg0, m_FSub(m_Value(FloorSrc),
2095 return FloorSrc;
2096 return nullptr;
2097}
2098
2099Value *AMDGPUCodeGenPrepareImpl::applyFractPat(IRBuilder<> &Builder,
2100 Value *FractArg) {
2101 SmallVector<Value *, 4> FractVals;
2102 extractValues(Builder, FractVals, FractArg);
2103
2104 SmallVector<Value *, 4> ResultVals(FractVals.size());
2105
2106 Type *Ty = FractArg->getType()->getScalarType();
2107 for (unsigned I = 0, E = FractVals.size(); I != E; ++I) {
2108 ResultVals[I] =
2109 Builder.CreateIntrinsic(Intrinsic::amdgcn_fract, {Ty}, {FractVals[I]});
2110 }
2111
2112 return insertValues(Builder, FractArg->getType(), ResultVals);
2113}
2114
2115bool AMDGPUCodeGenPrepareImpl::visitFMinLike(IntrinsicInst &I) {
2116 Value *FractArg = matchFractPat(I);
2117 if (!FractArg)
2118 return false;
2119
2120 // Match pattern for fract intrinsic in contexts where the nan check has been
2121 // optimized out (and hope the knowledge the source can't be nan wasn't lost).
2122 if (!I.hasNoNaNs() && !isKnownNeverNaN(FractArg, SQ.getWithInstruction(&I)))
2123 return false;
2124
2125 IRBuilder<> Builder(&I);
2126 FastMathFlags FMF = I.getFastMathFlags();
2127 FMF.setNoNaNs();
2128 Builder.setFastMathFlags(FMF);
2129
2130 Value *Fract = applyFractPat(Builder, FractArg);
2131 Fract->takeName(&I);
2132 I.replaceAllUsesWith(Fract);
2133 DeadVals.push_back(&I);
2134 return true;
2135}
2136
2137// Expand llvm.sqrt.f32 calls with !fpmath metadata in a semi-fast way.
2138bool AMDGPUCodeGenPrepareImpl::visitSqrt(IntrinsicInst &Sqrt) {
2139 Type *Ty = Sqrt.getType()->getScalarType();
2140 if (!Ty->isFloatTy() && (!Ty->isHalfTy() || ST.has16BitInsts()))
2141 return false;
2142
2143 const FPMathOperator *FPOp = cast<const FPMathOperator>(&Sqrt);
2144 FastMathFlags SqrtFMF = FPOp->getFastMathFlags();
2145
2146 // We're trying to handle the fast-but-not-that-fast case only. The lowering
2147 // of fast llvm.sqrt will give the raw instruction anyway.
2148 if (SqrtFMF.approxFunc())
2149 return false;
2150
2151 const float ReqdAccuracy = FPOp->getFPAccuracy();
2152
2153 // Defer correctly rounded expansion to codegen.
2154 if (ReqdAccuracy < 1.0f)
2155 return false;
2156
2157 Value *SrcVal = Sqrt.getOperand(0);
2158 bool CanTreatAsDAZ = canIgnoreDenormalInput(SrcVal, &Sqrt);
2159
2160 // The raw instruction is 1 ulp, but the correction for denormal handling
2161 // brings it to 2.
2162 if (!CanTreatAsDAZ && ReqdAccuracy < 2.0f)
2163 return false;
2164
2165 IRBuilder<> Builder(&Sqrt);
2166 SmallVector<Value *, 4> SrcVals;
2167 extractValues(Builder, SrcVals, SrcVal);
2168
2169 SmallVector<Value *, 4> ResultVals(SrcVals.size());
2170 for (int I = 0, E = SrcVals.size(); I != E; ++I) {
2171 if (CanTreatAsDAZ)
2172 ResultVals[I] = Builder.CreateCall(getSqrtF32(), SrcVals[I]);
2173 else
2174 ResultVals[I] = emitSqrtIEEE2ULP(Builder, SrcVals[I], SqrtFMF);
2175 }
2176
2177 Value *NewSqrt = insertValues(Builder, Sqrt.getType(), ResultVals);
2178 NewSqrt->takeName(&Sqrt);
2179 Sqrt.replaceAllUsesWith(NewSqrt);
2180 DeadVals.push_back(&Sqrt);
2181 return true;
2182}
2183
2184/// Replace log and log10 intrinsic calls based on fpmath metadata.
2185bool AMDGPUCodeGenPrepareImpl::visitLog(FPMathOperator &Log,
2186 Intrinsic::ID IID) {
2187 Type *Ty = Log.getType();
2188 if (!Ty->getScalarType()->isHalfTy() || !ST.has16BitInsts())
2189 return false;
2190
2191 FastMathFlags FMF = Log.getFastMathFlags();
2192
2193 // Defer fast math cases to codegen.
2194 if (FMF.approxFunc())
2195 return false;
2196
2197 // Limit experimentally determined from OpenCL conformance test (1.79)
2198 if (Log.getFPAccuracy() < 1.80f)
2199 return false;
2200
2201 IRBuilder<> Builder(&cast<CallInst>(Log));
2202
2203 // Use the generic intrinsic for convenience in the vector case. Codegen will
2204 // recognize the denormal handling is not necessary from the fpext.
2205 // TODO: Move to generic code
2206 Value *Log2 =
2207 Builder.CreateUnaryIntrinsic(Intrinsic::log2, Log.getOperand(0), FMF);
2208
2209 double Log2BaseInverted =
2210 IID == Intrinsic::log10 ? numbers::ln2 / numbers::ln10 : numbers::ln2;
2211 Value *Mul =
2212 Builder.CreateFMulFMF(Log2, ConstantFP::get(Ty, Log2BaseInverted), FMF);
2213
2214 Mul->takeName(&Log);
2215
2216 Log.replaceAllUsesWith(Mul);
2217 DeadVals.push_back(&Log);
2218 return true;
2219}
2220
2221bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
2222 if (skipFunction(F))
2223 return false;
2224
2225 auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
2226 if (!TPC)
2227 return false;
2228
2229 const AMDGPUTargetMachine &TM = TPC->getTM<AMDGPUTargetMachine>();
2230 const TargetLibraryInfo *TLI =
2231 &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
2232 AssumptionCache *AC =
2233 &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
2234 auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
2235 const DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
2236 const UniformityInfo &UA =
2237 getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
2238 return AMDGPUCodeGenPrepareImpl(F, TM, TLI, AC, DT, UA).run();
2239}
2240
2243 const AMDGPUTargetMachine &ATM = static_cast<const AMDGPUTargetMachine &>(TM);
2244 const TargetLibraryInfo *TLI = &FAM.getResult<TargetLibraryAnalysis>(F);
2245 AssumptionCache *AC = &FAM.getResult<AssumptionAnalysis>(F);
2246 const DominatorTree *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
2247 const UniformityInfo &UA = FAM.getResult<UniformityInfoAnalysis>(F);
2248 AMDGPUCodeGenPrepareImpl Impl(F, ATM, TLI, AC, DT, UA);
2249 if (!Impl.run())
2250 return PreservedAnalyses::all();
2252 if (!Impl.FlowChanged)
2254 return PA;
2255}
2256
2257INITIALIZE_PASS_BEGIN(AMDGPUCodeGenPrepare, DEBUG_TYPE,
2258 "AMDGPU IR optimizations", false, false)
2262INITIALIZE_PASS_END(AMDGPUCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR optimizations",
2264
2265/// Create a workitem.id.x intrinsic call with range metadata.
2266CallInst *AMDGPUCodeGenPrepareImpl::createWorkitemIdX(IRBuilder<> &B) const {
2267 CallInst *Tid = B.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
2268 ST.makeLIDRangeMetadata(Tid);
2269 return Tid;
2270}
2271
2272/// Replace the instruction with a direct workitem.id.x call.
2273void AMDGPUCodeGenPrepareImpl::replaceWithWorkitemIdX(Instruction &I) const {
2274 IRBuilder<> B(&I);
2275 CallInst *Tid = createWorkitemIdX(B);
2277 ReplaceInstWithValue(BI, Tid);
2278}
2279
2280/// Replace the instruction with (workitem.id.x & mask).
2281void AMDGPUCodeGenPrepareImpl::replaceWithMaskedWorkitemIdX(
2282 Instruction &I, unsigned WaveSize) const {
2283 IRBuilder<> B(&I);
2284 CallInst *Tid = createWorkitemIdX(B);
2285 Constant *Mask = ConstantInt::get(Tid->getType(), WaveSize - 1);
2286 Value *AndInst = B.CreateAnd(Tid, Mask);
2288 ReplaceInstWithValue(BI, AndInst);
2289}
2290
2291/// Try to optimize mbcnt instruction by replacing with workitem.id.x when
2292/// work group size allows direct computation of lane ID.
2293/// Returns true if optimization was applied, false otherwise.
2294bool AMDGPUCodeGenPrepareImpl::tryReplaceWithWorkitemId(Instruction &I,
2295 unsigned Wave) const {
2296 std::optional<unsigned> MaybeX = ST.getReqdWorkGroupSize(F, 0);
2297 if (!MaybeX)
2298 return false;
2299
2300 // When work group size == wave_size, each work group contains exactly one
2301 // wave, so the instruction can be replaced with workitem.id.x directly.
2302 if (*MaybeX == Wave) {
2303 replaceWithWorkitemIdX(I);
2304 return true;
2305 }
2306
2307 // When work group evenly splits into waves, compute lane ID within wave
2308 // using bit masking: lane_id = workitem.id.x & (wave_size - 1).
2309 if (ST.hasWavefrontsEvenlySplittingXDim(F, /*RequiresUniformYZ=*/true)) {
2310 replaceWithMaskedWorkitemIdX(I, Wave);
2311 return true;
2312 }
2313
2314 return false;
2315}
2316
2317/// Optimize mbcnt.lo calls on wave32 architectures for lane ID computation.
2318bool AMDGPUCodeGenPrepareImpl::visitMbcntLo(IntrinsicInst &I) const {
2319 // This optimization only applies to wave32 targets where mbcnt.lo operates on
2320 // the full execution mask.
2321 if (!ST.isWave32())
2322 return false;
2323
2324 // Only optimize the pattern mbcnt.lo(~0, 0) which counts active lanes with
2325 // lower IDs.
2326 if (!match(&I,
2328 return false;
2329
2330 return tryReplaceWithWorkitemId(I, ST.getWavefrontSize());
2331}
2332
2333/// Optimize mbcnt.hi calls for lane ID computation.
2334bool AMDGPUCodeGenPrepareImpl::visitMbcntHi(IntrinsicInst &I) const {
2335 // Abort if wave size is not known at compile time.
2336 if (!ST.isWaveSizeKnown())
2337 return false;
2338
2339 unsigned Wave = ST.getWavefrontSize();
2340
2341 // On wave32, the upper 32 bits of execution mask are always 0, so
2342 // mbcnt.hi(mask, val) always returns val unchanged.
2343 if (ST.isWave32()) {
2344 if (auto MaybeX = ST.getReqdWorkGroupSize(F, 0)) {
2345 // Replace mbcnt.hi(mask, val) with val only when work group size matches
2346 // wave size (single wave per work group).
2347 if (*MaybeX == Wave) {
2349 ReplaceInstWithValue(BI, I.getArgOperand(1));
2350 return true;
2351 }
2352 }
2353 }
2354
2355 // Optimize the complete lane ID computation pattern:
2356 // mbcnt.hi(~0, mbcnt.lo(~0, 0)) which counts all active lanes with lower IDs
2357 // across the full execution mask.
2358 using namespace PatternMatch;
2359
2360 // Check for pattern: mbcnt.hi(~0, mbcnt.lo(~0, 0))
2363 m_AllOnes(), m_Zero()))))
2364 return false;
2365
2366 return tryReplaceWithWorkitemId(I, Wave);
2367}
2368
2369char AMDGPUCodeGenPrepare::ID = 0;
2370
2372 return new AMDGPUCodeGenPrepare();
2373}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static Value * insertValues(IRBuilder<> &Builder, Type *Ty, SmallVectorImpl< Value * > &Values)
static void extractValues(IRBuilder<> &Builder, SmallVectorImpl< Value * > &Values, Value *V)
static Value * getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static bool isInterestingPHIIncomingValue(const Value *V)
static SelectInst * findSelectThroughCast(Value *V, CastInst *&Cast)
static std::pair< Value *, Value * > getMul64(IRBuilder<> &Builder, Value *LHS, Value *RHS)
static Value * emitRsqIEEE1ULP(IRBuilder<> &Builder, Value *Src, bool IsNegative)
Emit an expansion of 1.0 / sqrt(Src) good for 1ulp that supports denormals.
static Value * getSign32(Value *V, IRBuilder<> &Builder, const DataLayout DL)
static void collectPHINodes(const PHINode &I, SmallPtrSet< const PHINode *, 8 > &SeenPHIs)
static bool isPtrKnownNeverNull(const Value *V, const DataLayout &DL, const AMDGPUTargetMachine &TM, unsigned AS)
static bool areInSameBB(const Value *A, const Value *B)
static cl::opt< bool > WidenLoads("amdgpu-late-codegenprepare-widen-constant-loads", cl::desc("Widen sub-dword constant address space loads in " "AMDGPULateCodeGenPrepare"), cl::ReallyHidden, cl::init(true))
The AMDGPU TargetMachine interface definition for hw codegen targets.
@ Scaled
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
dxil translate DXIL Translate Metadata
static bool runOnFunction(Function &F, bool PostInlining)
#define DEBUG_TYPE
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
const SmallVectorImpl< MachineOperand > & Cond
static void visit(BasicBlock &Start, std::function< bool(BasicBlock *)> op)
This file implements a set that has insertion order iteration characteristics.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static cl::opt< cl::boolOrDefault > EnableGlobalISelOption("global-isel", cl::Hidden, cl::desc("Enable the \"global\" instruction selector"))
Target-Independent Code Generator Pass Configuration Options pass.
This pass exposes codegen information to IR-level passes.
LLVM IR instance of the generic uniformity analysis.
static std::optional< unsigned > getOpcode(ArrayRef< VPValue * > Values)
Returns the opcode of Values or ~0 if they do not all agree.
Definition VPlanSLP.cpp:247
Value * RHS
Value * LHS
BinaryOperator * Mul
VectorSlice(Type *Ty, unsigned Idx, unsigned NumElts)
Value * getSlicedVal(BasicBlock *BB, Value *Inc, StringRef NewValName)
Slice Inc according to the information contained within this slice.
PreservedAnalyses run(Function &, FunctionAnalysisManager &)
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
unsigned getWavefrontSize() const
static APFloat getOne(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative One.
Definition APFloat.h:1143
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1213
opStatus next(bool nextDown)
Definition APFloat.h:1309
This class represents a conversion between pointers from one address space to another.
Represent the analysis usage information of a pass.
AnalysisUsage & addRequired()
void setPreservesAll()
Set by analyses that do not transform their input at all.
A function analysis which provides an AssumptionCache.
An immutable pass that tracks lazily created AssumptionCache objects.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
BinaryOps getOpcode() const
Definition InstrTypes.h:374
BitVector & set()
Definition BitVector.h:370
bool all() const
all - Returns true if all bits are set.
Definition BitVector.h:194
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
This class represents a function call, abstracting a target machine's calling convention.
This is the base class for all instructions that perform data casts.
Definition InstrTypes.h:448
Instruction::CastOps getOpcode() const
Return the opcode of this CastInst.
Definition InstrTypes.h:610
TargetTransformInfo getTargetTransformInfo(const Function &F) const override
Get a TargetTransformInfo implementation for the target.
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static LLVM_ABI Constant * getInfinity(Type *Ty, bool Negative=false)
static LLVM_ABI Constant * getZero(Type *Ty, bool Negative=false)
LLVM_ABI bool isExactlyValue(const APFloat &V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getAllOnesValue(Type *Ty)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
Utility class for floating point operations which can have information about relaxed accuracy require...
Definition Operator.h:200
FastMathFlags getFastMathFlags() const
Convenience function for getting all the fast-math flags.
Definition Operator.h:333
LLVM_ABI float getFPAccuracy() const
Get the maximum error permitted by this operation in ULPs.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
void setFast(bool B=true)
Definition FMF.h:99
bool noInfs() const
Definition FMF.h:69
bool allowReciprocal() const
Definition FMF.h:71
bool approxFunc() const
Definition FMF.h:73
void setNoNaNs(bool B=true)
Definition FMF.h:81
bool noNaNs() const
Definition FMF.h:68
bool allowContract() const
Definition FMF.h:72
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
bool isWave32() const
bool isWaveSizeKnown() const
Returns if the wavesize of this subtarget is known reliable.
bool hasFractBug() const
bool isUniform(ConstValueRefT V) const
Whether V is uniform/non-divergent.
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2584
Value * CreateFDiv(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1696
Value * CreateSIToFP(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2150
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2572
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:592
Value * CreateZExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a ZExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2095
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2631
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFPToUI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2123
Value * CreateSExt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2089
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:579
Value * CreateUIToFP(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2137
void setFastMathFlags(FastMathFlags NewFMF)
Set the fast-math flags to be used with generated fp-math operators.
Definition IRBuilder.h:345
Value * CreateFCmpOLT(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2386
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
Value * CreateNeg(Value *V, const Twine &Name="", bool HasNSW=false)
Definition IRBuilder.h:1811
LLVM_ABI Value * createIsFPClass(Value *FPNum, unsigned Test)
LLVM_ABI CallInst * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > Types, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with Args, mangled using Types.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:522
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1446
Value * CreateFMA(Value *Factor1, Value *Factor2, Value *Summand, FMFSource FMFSource={}, const Twine &Name="")
Create call to the fma intrinsic.
Definition IRBuilder.h:1100
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2199
LLVM_ABI CallInst * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1877
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1518
FastMathFlags getFastMathFlags() const
Get the flags to be applied to created floating point ops.
Definition IRBuilder.h:334
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2077
Value * CreateFCmpOEQ(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2371
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1577
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1429
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:607
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2510
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2063
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1734
Value * CreateICmpUGE(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2343
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateAShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1558
Value * CreateXor(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1625
Value * CreateFMul(Value *L, Value *R, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1677
Value * CreateFNeg(Value *V, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1820
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1599
Value * CreateSExtOrTrunc(Value *V, Type *DestTy, const Twine &Name="")
Create a SExt or Trunc from the integer value V to DestTy.
Definition IRBuilder.h:2110
Value * CreateFMulFMF(Value *L, Value *R, FMFSource FMFSource, const Twine &Name="", MDNode *FPMD=nullptr)
Definition IRBuilder.h:1682
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1463
Value * CreateFCmpOGE(Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2381
Value * CreateFPToSI(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2130
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
Base class for instruction visitors.
Definition InstVisitor.h:78
LLVM_ABI void copyFastMathFlags(FastMathFlags FMF)
Convenience function for transferring all fast-math flag values to this instruction,...
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1572
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses none()
Convenience factory function for the empty preserved set.
Definition Analysis.h:115
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
This class represents the LLVM 'select' instruction.
const Value * getFalseValue() const
const Value * getCondition() const
const Value * getTrueValue() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
const STC & getSubtarget(const Function &F) const
This method returns a pointer to the specified type of TargetSubtargetInfo.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
@ TCK_RecipThroughput
Reciprocal throughput.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
LLVM_ABI unsigned getIntegerBitWidth() const
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:110
Analysis pass which computes UniformityInfo.
Legacy analysis pass which computes a CycleInfo.
void setOperand(unsigned i, Value *Val)
Definition User.h:212
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
Type * getElementType() const
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
Changed
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ PRIVATE_ADDRESS
Address space for private memory.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
CmpClass_match< LHS, RHS, FCmpInst > m_FCmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::FSub > m_FSub(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
ap_match< APFloat > m_APFloatAllowPoison(const APFloat *&Res)
Match APFloat while allowing poison in splat vector constants.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
cstfp_pred_ty< is_nonnan > m_NonNaN()
Match a non-NaN FP constant.
cstfp_pred_ty< is_signed_inf< false > > m_PosInf()
Match a positive infinity FP constant.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
cstfp_pred_ty< is_pos_zero_fp > m_PosZeroFP()
Match a floating-point positive zero.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract(Y &&MD)
Extract a Value from Metadata.
Definition Metadata.h:668
constexpr double ln2
constexpr double ln10
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
LLVM_ABI KnownFPClass computeKnownFPClass(const Value *V, const APInt &DemandedElts, FPClassTest InterestedClasses, const SimplifyQuery &SQ, unsigned Depth=0)
Determine which floating-point classes are valid for V, and return them in KnownFPClass bit sets.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool expandRemainderUpTo64Bits(BinaryOperator *Rem)
Generate code to calculate the remainder of two integers, replacing Rem with the generated code.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
LLVM_ABI void ReplaceInstWithValue(BasicBlock::iterator &BI, Value *V)
Replace all uses of an instruction (specified by BI) with a value, then remove and delete the origina...
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:360
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
LLVM_ABI bool expandDivisionUpTo64Bits(BinaryOperator *Div)
Generate code to divide two integers, replacing Div with the generated code.
FPClassTest
Floating-point class tests, supported by 'is_fpclass' intrinsic.
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
LLVM_ABI Constant * ConstantFoldCastOperand(unsigned Opcode, Constant *C, Type *DestTy, const DataLayout &DL)
Attempt to constant fold a cast with the specified operand.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI Constant * ConstantFoldBinaryOpOperands(unsigned Opcode, Constant *LHS, Constant *RHS, const DataLayout &DL)
Attempt to constant fold a binary operation with the specified operands.
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
FunctionPass * createAMDGPUCodeGenPreparePass()
To bit_cast(const From &from) noexcept
Definition bit.h:90
DWARFExpression::Operation Op
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isKnownNeverNaN(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if the floating-point scalar value is not a NaN or if the floating-point vector value has...
LLVM_ABI unsigned ComputeMaxSignificantBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Get the upper bound on bit size for this Value Op as a signed integer.
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI bool isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL, bool OrZero=false, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return true if the given value is known to have exactly one bit set when defined.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl< const Value * > &Objects, const LoopInfo *LI=nullptr, unsigned MaxLookup=MaxLookupSearchDepth)
This method is similar to getUnderlyingObject except that it can look through phi and select instruct...
LLVM_ABI CGPassBuilderOption getCGPassBuilderOption()
#define N
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
constexpr bool inputsAreZero() const
Return true if input denormals must be implicitly treated as 0.
static constexpr DenormalMode getPreserveSign()
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:264
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
bool isKnownNeverSubnormal() const
Return true if it's known this can never be a subnormal.
LLVM_ABI bool isKnownNeverLogicalZero(DenormalMode Mode) const
Return true if it's known this can never be interpreted as a zero.
bool isKnownNeverPosInfinity() const
Return true if it's known this can never be +infinity.
const DataLayout & DL
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC