//===- AMDGPUInstCombineIntrinsic.cpp - AMDGPU specific InstCombine pass -===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// \file
// This file implements a TargetTransformInfo analysis pass specific to the
// AMDGPU target machine. It uses the target's detailed information to provide
// more precise answers to certain TTI queries, while letting the target
// independent and default TTI implementations handle the rest.
//
//===----------------------------------------------------------------------===//

#include "AMDGPUInstrInfo.h"
#include "AMDGPUTargetTransformInfo.h"
#include "GCNSubtarget.h"
#include "SIDefines.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include <optional>

using namespace llvm;
using namespace llvm::PatternMatch;

#define DEBUG_TYPE "AMDGPUtti"

namespace {

struct AMDGPUImageDMaskIntrinsic {
  unsigned Intr;
};

#define GET_AMDGPUImageDMaskIntrinsicTable_IMPL
#include "AMDGPUGenSearchableTables.inc"

} // end anonymous namespace

// Constant fold llvm.amdgcn.fmed3 intrinsics for standard inputs.
//
// A single NaN input is folded to minnum, so we rely on that folding for
// handling NaNs.
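//
// For example, fmed3(1.0, 5.0, 3.0): Max3 is 5.0 and compares equal to Src1,
// so the result is maxnum(1.0, 3.0) = 3.0. Dropping the overall maximum and
// taking the max of the remaining pair always yields the median.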
static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
                           const APFloat &Src2) {
  APFloat Max3 = maxnum(maxnum(Src0, Src1), Src2);

  APFloat::cmpResult Cmp0 = Max3.compare(Src0);
  assert(Cmp0 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp0 == APFloat::cmpEqual)
    return maxnum(Src1, Src2);

  APFloat::cmpResult Cmp1 = Max3.compare(Src1);
  assert(Cmp1 != APFloat::cmpUnordered && "nans handled separately");
  if (Cmp1 == APFloat::cmpEqual)
    return maxnum(Src0, Src2);

  return maxnum(Src0, Src1);
}

// Check if a value can be converted to a 16-bit value without losing
// precision.
// The value is expected to be either a float (IsFloat = true) or an unsigned
// integer (IsFloat = false).
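// For example, a float constant 2.5 converts to half exactly, while 1.0e10
// (outside half's range) does not; an i32 constant 70000 has more than 16
// active bits and is rejected, while 65535 is accepted.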
static bool canSafelyConvertTo16Bit(Value &V, bool IsFloat) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16)) {
    // The value is already 16-bit, so we don't want to convert to 16-bit again!
    return false;
  }
  if (IsFloat) {
    if (ConstantFP *ConstFloat = dyn_cast<ConstantFP>(&V)) {
      // We need to check that if we cast the index down to a half, we do not
      // lose precision.
      APFloat FloatValue(ConstFloat->getValueAPF());
      bool LosesInfo = true;
      FloatValue.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero,
                         &LosesInfo);
      return !LosesInfo;
    }
  } else {
    if (ConstantInt *ConstInt = dyn_cast<ConstantInt>(&V)) {
      // We need to check that if we cast the index down to an i16, we do not
      // lose precision.
      APInt IntValue(ConstInt->getValue());
      return IntValue.getActiveBits() <= 16;
    }
  }

  Value *CastSrc;
  bool IsExt = IsFloat ? match(&V, m_FPExt(PatternMatch::m_Value(CastSrc)))
                       : match(&V, m_ZExt(PatternMatch::m_Value(CastSrc)));
  if (IsExt) {
    Type *CastSrcTy = CastSrc->getType();
    if (CastSrcTy->isHalfTy() || CastSrcTy->isIntegerTy(16))
      return true;
  }

  return false;
}

// Convert a value to 16-bit.
static Value *convertTo16Bit(Value &V, InstCombiner::BuilderTy &Builder) {
  Type *VTy = V.getType();
  if (VTy->isHalfTy() || VTy->isIntegerTy(16))
    return cast<Instruction>(&V)->getOperand(0);
  if (VTy->isIntegerTy())
    return Builder.CreateIntCast(&V, Type::getInt16Ty(V.getContext()), false);
  if (VTy->isFloatingPointTy())
    return Builder.CreateFPCast(&V, Type::getHalfTy(V.getContext()));

  llvm_unreachable("Should never be called!");
}

/// Applies Func(OldIntr.Args, OldIntr.ArgTys), creates an intrinsic call with
/// the modified arguments (based on OldIntr), and replaces InstToReplace with
/// this newly created intrinsic call.
static std::optional<Instruction *> modifyIntrinsicCall(
    IntrinsicInst &OldIntr, Instruction &InstToReplace, unsigned NewIntr,
    InstCombiner &IC,
    std::function<void(SmallVectorImpl<Value *> &, SmallVectorImpl<Type *> &)>
        Func) {
  SmallVector<Type *, 4> OverloadTys;
  if (!Intrinsic::isSignatureValid(OldIntr.getCalledFunction(), OverloadTys))
    return std::nullopt;

  SmallVector<Value *, 8> Args(OldIntr.args());

  // Modify arguments and types
  Func(Args, OverloadTys);

  CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, OverloadTys, Args);
  NewCall->takeName(&OldIntr);
  NewCall->copyMetadata(OldIntr);
  if (isa<FPMathOperator>(NewCall))
    NewCall->copyFastMathFlags(&OldIntr);

  // Erase and replace uses
  if (!InstToReplace.getType()->isVoidTy())
    IC.replaceInstUsesWith(InstToReplace, NewCall);

  bool RemoveOldIntr = &OldIntr != &InstToReplace;

  auto *RetValue = IC.eraseInstFromFunction(InstToReplace);
  if (RemoveOldIntr)
    IC.eraseInstFromFunction(OldIntr);

  return RetValue;
}

static std::optional<Instruction *>
simplifyAMDGCNImageIntrinsic(const GCNSubtarget *ST,
                             const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
                             IntrinsicInst &II, InstCombiner &IC) {
  // Optimize _L to _LZ when _L is zero
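  // e.g. a call of llvm.amdgcn.image.sample.l.2d with a constant lod of 0.0
  // becomes llvm.amdgcn.image.sample.lz.2d with the lod argument dropped.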
  if (const auto *LZMappingInfo =
          AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantLod =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->LodIndex))) {
      if (ConstantLod->isZero() || ConstantLod->isNegative()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(LZMappingInfo->LZ,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->LodIndex);
            });
      }
    }
  }

  // Optimize _mip away, when 'lod' is zero
  if (const auto *MIPMappingInfo =
          AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantMip =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->MipIndex))) {
      if (ConstantMip->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(MIPMappingInfo->NONMIP,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->MipIndex);
            });
      }
    }
  }

  // Optimize _bias away when 'bias' is zero
  if (const auto *BiasMappingInfo =
          AMDGPU::getMIMGBiasMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantBias =
            dyn_cast<ConstantFP>(II.getOperand(ImageDimIntr->BiasIndex))) {
      if (ConstantBias->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(BiasMappingInfo->NoBias,
                                                     ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->BiasIndex);
              ArgTys.erase(ArgTys.begin() + ImageDimIntr->BiasTyArg);
            });
      }
    }
  }

  // Optimize _offset away when 'offset' is zero
  if (const auto *OffsetMappingInfo =
          AMDGPU::getMIMGOffsetMappingInfo(ImageDimIntr->BaseOpcode)) {
    if (auto *ConstantOffset =
            dyn_cast<ConstantInt>(II.getOperand(ImageDimIntr->OffsetIndex))) {
      if (ConstantOffset->isZero()) {
        const AMDGPU::ImageDimIntrinsicInfo *NewImageDimIntr =
            AMDGPU::getImageDimIntrinsicByBaseOpcode(
                OffsetMappingInfo->NoOffset, ImageDimIntr->Dim);
        return modifyIntrinsicCall(
            II, II, NewImageDimIntr->Intr, IC, [&](auto &Args, auto &ArgTys) {
              Args.erase(Args.begin() + ImageDimIntr->OffsetIndex);
            });
      }
    }
  }

  // Try to use D16
  if (ST->hasD16Images()) {

    const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
        AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);

    if (BaseOpcode->HasD16) {

      // If the only use of image intrinsic is a fptrunc (with conversion to
      // half) then both fptrunc and image intrinsic will be replaced with image
      // intrinsic with D16 flag.
      if (II.hasOneUse()) {
        Instruction *User = II.user_back();

        if (User->getOpcode() == Instruction::FPTrunc &&
            User->getType()->isHalfTy()) {

          return modifyIntrinsicCall(II, *User, ImageDimIntr->Intr, IC,
                                     [&](auto &Args, auto &ArgTys) {
                                       // Change return type of image intrinsic.
                                       // Set it to return type of fptrunc.
                                       ArgTys[0] = User->getType();
                                     });
        }
      }

      // Only perform D16 folding if every user of the image sample is
      // an ExtractElementInst immediately followed by an FPTrunc to half.
      SmallVector<std::pair<ExtractElementInst *, FPTruncInst *>, 4>
          ExtractTruncPairs;
      bool AllHalfExtracts = true;

      for (User *U : II.users()) {
        auto *Ext = dyn_cast<ExtractElementInst>(U);
        if (!Ext || !Ext->hasOneUse()) {
          AllHalfExtracts = false;
          break;
        }

        auto *Tr = dyn_cast<FPTruncInst>(*Ext->user_begin());
        if (!Tr || !Tr->getType()->isHalfTy()) {
          AllHalfExtracts = false;
          break;
        }

        ExtractTruncPairs.emplace_back(Ext, Tr);
      }

      if (!ExtractTruncPairs.empty() && AllHalfExtracts) {
        auto *VecTy = cast<VectorType>(II.getType());
        Type *HalfVecTy =
            VecTy->getWithNewType(Type::getHalfTy(II.getContext()));

        // Obtain the original image sample intrinsic's signature
        // and replace its return type with the half-vector for D16 folding
        SmallVector<Type *, 8> OverloadTys;
        Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys);
        OverloadTys[0] = HalfVecTy;

        Module *M = II.getModule();
        Function *HalfDecl = Intrinsic::getOrInsertDeclaration(
            M, ImageDimIntr->Intr, OverloadTys);

        II.mutateType(HalfVecTy);
        II.setCalledFunction(HalfDecl);

        IRBuilder<> Builder(II.getContext());
        for (auto &[Ext, Tr] : ExtractTruncPairs) {
          Value *Idx = Ext->getIndexOperand();

          Builder.SetInsertPoint(Tr);

          Value *HalfExtract = Builder.CreateExtractElement(&II, Idx);
          HalfExtract->takeName(Tr);

          Tr->replaceAllUsesWith(HalfExtract);
        }

        for (auto &[Ext, Tr] : ExtractTruncPairs) {
          IC.eraseInstFromFunction(*Tr);
          IC.eraseInstFromFunction(*Ext);
        }

        return &II;
      }
    }
  }

  // Try to use A16 or G16
  if (!ST->hasA16() && !ST->hasG16())
    return std::nullopt;

  // Address is interpreted as float if the instruction has a sampler or as
  // unsigned int if there is no sampler.
  bool HasSampler =
      AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode)->Sampler;
  bool FloatCoord = false;
  // true means derivatives can be converted to 16 bit, coordinates not
  bool OnlyDerivatives = false;

  for (unsigned OperandIndex = ImageDimIntr->GradientStart;
       OperandIndex < ImageDimIntr->VAddrEnd; OperandIndex++) {
    Value *Coord = II.getOperand(OperandIndex);
    // If the values are not derived from 16-bit values, we cannot optimize.
    if (!canSafelyConvertTo16Bit(*Coord, HasSampler)) {
      if (OperandIndex < ImageDimIntr->CoordStart ||
          ImageDimIntr->GradientStart == ImageDimIntr->CoordStart) {
        return std::nullopt;
      }
      // All gradients can be converted, so convert only them
      OnlyDerivatives = true;
      break;
    }

    assert(OperandIndex == ImageDimIntr->GradientStart ||
           FloatCoord == Coord->getType()->isFloatingPointTy());
    FloatCoord = Coord->getType()->isFloatingPointTy();
  }

  if (!OnlyDerivatives && !ST->hasA16())
    OnlyDerivatives = true; // Only supports G16

  // Check if there is a bias parameter and if it can be converted to f16
  if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
    Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
    assert(HasSampler &&
           "Only image instructions with a sampler can have a bias");
    if (!canSafelyConvertTo16Bit(*Bias, HasSampler))
      OnlyDerivatives = true;
  }

  if (OnlyDerivatives && (!ST->hasG16() || ImageDimIntr->GradientStart ==
                                               ImageDimIntr->CoordStart))
    return std::nullopt;

  Type *CoordType = FloatCoord ? Type::getHalfTy(II.getContext())
                               : Type::getInt16Ty(II.getContext());

  return modifyIntrinsicCall(
      II, II, II.getIntrinsicID(), IC, [&](auto &Args, auto &ArgTys) {
        ArgTys[ImageDimIntr->GradientTyArg] = CoordType;
        if (!OnlyDerivatives) {
          ArgTys[ImageDimIntr->CoordTyArg] = CoordType;

          // Change the bias type
          if (ImageDimIntr->NumBiasArgs != 0)
            ArgTys[ImageDimIntr->BiasTyArg] = Type::getHalfTy(II.getContext());
        }

        unsigned EndIndex =
            OnlyDerivatives ? ImageDimIntr->CoordStart : ImageDimIntr->VAddrEnd;
        for (unsigned OperandIndex = ImageDimIntr->GradientStart;
             OperandIndex < EndIndex; OperandIndex++) {
          Args[OperandIndex] =
              convertTo16Bit(*II.getOperand(OperandIndex), IC.Builder);
        }

        // Convert the bias
        if (!OnlyDerivatives && ImageDimIntr->NumBiasArgs != 0) {
          Value *Bias = II.getOperand(ImageDimIntr->BiasIndex);
          Args[ImageDimIntr->BiasIndex] = convertTo16Bit(*Bias, IC.Builder);
        }
      });
}

bool GCNTTIImpl::canSimplifyLegacyMulToMul(const Instruction &I,
                                           const Value *Op0, const Value *Op1,
                                           InstCombiner &IC) const {
  // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
  // infinity, gives +0.0. If we can prove we don't have one of the special
  // cases then we can use a normal multiply instead.
  // TODO: Create and use isKnownFiniteNonZero instead of just matching
  // constants here.
  if (match(Op0, PatternMatch::m_FiniteNonZero()) ||
      match(Op1, PatternMatch::m_FiniteNonZero())) {
    // One operand is not zero or infinity or NaN.
    return true;
  }

  SimplifyQuery SQ = IC.getSimplifyQuery().getWithInstruction(&I);
  if (isKnownNeverInfOrNaN(Op0, SQ) && isKnownNeverInfOrNaN(Op1, SQ)) {
    // Neither operand is infinity or NaN.
    return true;
  }
  return false;
}

/// Match an fpext from half to float, or a constant we can convert.
static Value *matchFPExtFromF16(Value *Arg) {
  Value *Src = nullptr;
  ConstantFP *CFP = nullptr;
  if (match(Arg, m_OneUse(m_FPExt(m_Value(Src))))) {
    if (Src->getType()->isHalfTy())
      return Src;
  } else if (match(Arg, m_ConstantFP(CFP))) {
    bool LosesInfo;
    APFloat Val(CFP->getValueAPF());
    Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
    if (!LosesInfo)
      return ConstantFP::get(Type::getHalfTy(Arg->getContext()), Val);
  }
  return nullptr;
}

// Trim all zero components from the end of the vector \p UseV and return
// an appropriate bitset with known elements.
static APInt trimTrailingZerosInVector(InstCombiner &IC, Value *UseV,
                                       Instruction *I) {
  auto *VTy = cast<FixedVectorType>(UseV->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);

  for (int i = VWidth - 1; i > 0; --i) {
    auto *Elt = findScalarElement(UseV, i);
    if (!Elt)
      break;

    if (auto *ConstElt = dyn_cast<Constant>(Elt)) {
      if (!ConstElt->isNullValue() && !isa<UndefValue>(Elt))
        break;
    } else {
      break;
    }

    DemandedElts.clearBit(i);
  }

  return DemandedElts;
}

// Trim elements from the end of the vector \p V, if they are
// equal to the first element of the vector.
static APInt defaultComponentBroadcast(Value *V) {
  auto *VTy = cast<FixedVectorType>(V->getType());
  unsigned VWidth = VTy->getNumElements();
  APInt DemandedElts = APInt::getAllOnes(VWidth);
  Value *FirstComponent = findScalarElement(V, 0);

  SmallVector<int> ShuffleMask;
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
    SVI->getShuffleMask(ShuffleMask);

  for (int I = VWidth - 1; I > 0; --I) {
    if (ShuffleMask.empty()) {
      auto *Elt = findScalarElement(V, I);
      if (!Elt || (Elt != FirstComponent && !isa<UndefValue>(Elt)))
        break;
    } else {
      // Detect identical elements in the shufflevector result, even though
      // findScalarElement cannot tell us what that element is.
      if (ShuffleMask[I] != ShuffleMask[0] && ShuffleMask[I] != PoisonMaskElem)
        break;
    }
    DemandedElts.clearBit(I);
  }

  return DemandedElts;
}

static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
                                                    IntrinsicInst &II,
                                                    APInt DemandedElts,
                                                    int DMaskIdx = -1,
                                                    bool IsLoad = true);

/// Return true if it's legal to contract llvm.amdgcn.rcp(llvm.sqrt)
static bool canContractSqrtToRsq(const FPMathOperator *SqrtOp) {
  return (SqrtOp->getType()->isFloatTy() &&
          (SqrtOp->hasApproxFunc() || SqrtOp->getFPAccuracy() >= 1.0f)) ||
         SqrtOp->getType()->isHalfTy();
}

/// Return true if we can easily prove that use U is uniform.
static bool isTriviallyUniform(const Use &U) {
  Value *V = U.get();
  if (isa<Constant>(V))
    return true;
  if (const auto *A = dyn_cast<Argument>(V))
    return AMDGPU::isArgPassedInSGPR(A);
  if (const auto *II = dyn_cast<IntrinsicInst>(V)) {
    if (!AMDGPU::isIntrinsicAlwaysUniform(II->getIntrinsicID()))
      return false;
    // If II and U are in different blocks then there is a possibility of
    // temporal divergence.
    return II->getParent() == cast<Instruction>(U.getUser())->getParent();
  }
  return false;
}

/// Simplify a lane index operand (e.g. llvm.amdgcn.readlane src1).
///
/// The instruction only reads the low 5 bits for wave32, and 6 bits for wave64.
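/// For example, on a wave64 target a constant lane operand of 65 only
/// contributes 65 & 63 = 1, so the constant is rewritten to 1.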
bool GCNTTIImpl::simplifyDemandedLaneMaskArg(InstCombiner &IC,
                                             IntrinsicInst &II,
                                             unsigned LaneArgIdx) const {
  unsigned MaskBits = ST->getWavefrontSizeLog2();
  APInt DemandedMask(32, maskTrailingOnes<unsigned>(MaskBits));

  KnownBits Known(32);
  if (IC.SimplifyDemandedBits(&II, LaneArgIdx, DemandedMask, Known))
    return true;

  if (!Known.isConstant())
    return false;

  // Out of bounds indexes may appear in wave64 code compiled for wave32.
  // Unlike the DAG version, SimplifyDemandedBits does not change constants, so
  // manually fix it up.

  Value *LaneArg = II.getArgOperand(LaneArgIdx);
  Constant *MaskedConst =
      ConstantInt::get(LaneArg->getType(), Known.getConstant() & DemandedMask);
  if (MaskedConst != LaneArg) {
    II.getOperandUse(LaneArgIdx).set(MaskedConst);
    return true;
  }

  return false;
}

static CallInst *rewriteCall(IRBuilderBase &B, CallBase &Old,
                             Function &NewCallee, ArrayRef<Value *> Ops) {
  SmallVector<OperandBundleDef, 2> OpBundles;
  Old.getOperandBundlesAsDefs(OpBundles);

  CallInst *NewCall = B.CreateCall(&NewCallee, Ops, OpBundles);
  NewCall->takeName(&Old);
  return NewCall;
}

// Return true for sequences of instructions that effectively assign
// each lane to its thread ID.
static bool isThreadID(const GCNSubtarget &ST, Value *V) {
  // Case 1:
  //   wave32: mbcnt_lo(-1, 0)
  //   wave64: mbcnt_hi(-1, mbcnt_lo(-1, 0))
  auto W32Pred =
      m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero());
  auto W64Pred = m_Intrinsic<Intrinsic::amdgcn_mbcnt_hi>(
      m_AllOnes(),
      m_Intrinsic<Intrinsic::amdgcn_mbcnt_lo>(m_AllOnes(), m_Zero()));
  if (ST.isWave32() && match(V, W32Pred))
    return true;
  if (ST.isWave64() && match(V, W64Pred))
    return true;

  return false;
}

Instruction *
GCNTTIImpl::hoistLaneIntrinsicThroughOperand(InstCombiner &IC,
                                             IntrinsicInst &II) const {
  const auto IID = II.getIntrinsicID();
  assert(IID == Intrinsic::amdgcn_readlane ||
         IID == Intrinsic::amdgcn_readfirstlane ||
         IID == Intrinsic::amdgcn_permlane64);

  Instruction *OpInst = dyn_cast<Instruction>(II.getOperand(0));

  // Only do this if both instructions are in the same block
  // (so the exec mask won't change) and the readlane is the only user of its
  // operand.
  if (!OpInst || !OpInst->hasOneUser() || OpInst->getParent() != II.getParent())
    return nullptr;

  const bool IsReadLane = (IID == Intrinsic::amdgcn_readlane);

  // If this is a readlane, check that the second operand is a constant, or is
  // defined before OpInst so we know it's safe to move this intrinsic higher.
  Value *LaneID = nullptr;
  if (IsReadLane) {
    LaneID = II.getOperand(1);

    // readlane takes an extra operand for the lane ID, so we must check if that
    // LaneID value can be used at the point where we want to move the
    // intrinsic.
    if (auto *LaneIDInst = dyn_cast<Instruction>(LaneID)) {
      if (!IC.getDominatorTree().dominates(LaneIDInst, OpInst))
        return nullptr;
    }
  }

  // Hoist the intrinsic (II) through OpInst.
  //
  // (II (OpInst x)) -> (OpInst (II x))
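  //
  // e.g. readfirstlane(zext i16 %x to i32) becomes
  // zext(readfirstlane(i16 %x)), letting the lane read happen on the
  // narrower type.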
  const auto DoIt = [&](unsigned OpIdx,
                        Function *NewIntrinsic) -> Instruction * {
    SmallVector<Value *, 2> Ops{OpInst->getOperand(OpIdx)};
    if (IsReadLane)
      Ops.push_back(LaneID);

    // Rewrite the intrinsic call.
    CallInst *NewII = rewriteCall(IC.Builder, II, *NewIntrinsic, Ops);

    // Rewrite OpInst so it takes the result of the intrinsic now.
    Instruction &NewOp = *OpInst->clone();
    NewOp.setOperand(OpIdx, NewII);
    return &NewOp;
  };

  // TODO(?): Should we do more with permlane64?
  if (IID == Intrinsic::amdgcn_permlane64 && !isa<BitCastInst>(OpInst))
    return nullptr;

  if (isa<UnaryOperator>(OpInst))
    return DoIt(0, II.getCalledFunction());

  if (isa<CastInst>(OpInst)) {
    Value *Src = OpInst->getOperand(0);
    Type *SrcTy = Src->getType();
    if (!isTypeLegal(SrcTy))
      return nullptr;

    Function *Remangled =
        Intrinsic::getOrInsertDeclaration(II.getModule(), IID, {SrcTy});
    return DoIt(0, Remangled);
  }

  // We can also hoist through binary operators if the other operand is uniform.
  if (isa<BinaryOperator>(OpInst)) {
    // FIXME: If we had access to UniformityInfo here we could just check
    // if the operand is uniform.
    if (isTriviallyUniform(OpInst->getOperandUse(0)))
      return DoIt(1, II.getCalledFunction());
    if (isTriviallyUniform(OpInst->getOperandUse(1)))
      return DoIt(0, II.getCalledFunction());
  }

  return nullptr;
}

/// Evaluate V as a function of the lane ID and return its value on Lane, or
/// std::nullopt if V is not a closed-form expression of the lane ID.
static std::optional<unsigned> evalLaneExpr(Value *V, unsigned Lane,
                                            const GCNSubtarget &ST,
                                            const DataLayout &DL,
                                            unsigned Depth = 0) {
  if (Depth == MaxAnalysisRecursionDepth)
    return std::nullopt;

  // Poison/undef in the index expression: bail and let InstCombine fold the
  // intrinsic the usual way.
  if (isa<UndefValue>(V))
    return std::nullopt;

  if (const ConstantInt *CI = dyn_cast<ConstantInt>(V))
    return CI->getZExtValue();

  if (isThreadID(ST, V))
    return Lane;

  auto *BO = dyn_cast<BinaryOperator>(V);
  if (!BO)
    return std::nullopt;

  std::optional<unsigned> LHS =
      evalLaneExpr(BO->getOperand(0), Lane, ST, DL, Depth + 1);
  if (!LHS)
    return std::nullopt;
  std::optional<unsigned> RHS =
      evalLaneExpr(BO->getOperand(1), Lane, ST, DL, Depth + 1);
  if (!RHS)
    return std::nullopt;

  Type *Ty = BO->getType();
  Constant *Ops[] = {ConstantInt::get(Ty, *LHS), ConstantInt::get(Ty, *RHS)};
  auto *CI = dyn_cast_or_null<ConstantInt>(
      ConstantFoldBinaryOpOperands(BO->getOpcode(), Ops[0], Ops[1], DL));
  return CI ? std::optional<unsigned>(CI->getZExtValue()) : std::nullopt;
}

/// Build the per-lane shuffle map by evaluating Index for every lane in the
/// wave. Returns false if any lane index is non-constant or out of range.
static bool tryBuildShuffleMap(Value *Index, const GCNSubtarget &ST,
                               SmallVectorImpl<uint8_t> &Ids,
                               const DataLayout &DL) {
  unsigned WaveSize = ST.getWavefrontSize();
  Ids.resize(WaveSize);
  for (unsigned Lane : seq(WaveSize)) {
    std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
    if (!Val || *Val >= WaveSize)
      return false;
    Ids[Lane] = *Val;
  }
  return true;
}

/// Lanes are partitioned into groups of Period; each group is a translated
/// copy of the first: Ids[I] = Ids[I % Period] + (I & ~(Period - 1)).
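///
/// For example, with Period = 4, Ids = {1, 0, 3, 2, 5, 4, 7, 6, ...} is
/// periodic: each group of four repeats the {1, 0, 3, 2} swap shifted by the
/// group base.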
template <unsigned Period>
static bool hasPeriodicLayout(ArrayRef<uint8_t> Ids) {
  static_assert(isPowerOf2_32(Period), "Period must be a power of two");
  for (unsigned I = Period, E = Ids.size(); I < E; ++I)
    if (Ids[I] != Ids[I % Period] + (I & ~(Period - 1)))
      return false;
  return true;
}

/// Match an N-lane row pattern: each lane in [0, N) reads from a source lane
/// in the same N-lane row, and the pattern repeats periodically across rows.
template <unsigned N> static bool isRowPattern(ArrayRef<uint8_t> Ids) {
  for (unsigned I = 0; I < N; ++I)
    if (Ids[I] >= N)
      return false;
  return hasPeriodicLayout<N>(Ids);
}

static constexpr auto isQuadPattern = isRowPattern<4>;
static constexpr auto isHalfRowPattern = isRowPattern<8>;
static constexpr auto isFullRowPattern = isRowPattern<16>;

/// Match a 4-lane (quad) permutation, encoded as the v_mov_b32_dpp
/// QUAD_PERM control word: bits[1:0]=Ids[0], [3:2]=Ids[1], [5:4]=Ids[2],
/// [7:6]=Ids[3].
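///
/// For example, the pairwise swap {1, 0, 3, 2} encodes as
/// 2 << 6 | 3 << 4 | 0 << 2 | 1 = 0xB1.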
static std::optional<unsigned> matchQuadPermPattern(ArrayRef<uint8_t> Ids) {
  if (!isQuadPattern(Ids))
    return std::nullopt;
  return Ids[3] << 6 | Ids[2] << 4 | Ids[1] << 2 | Ids[0];
}

/// Match an N-lane reversal (mirror) pattern.
template <unsigned N> static bool matchMirrorPattern(ArrayRef<uint8_t> Ids) {
  if (!isRowPattern<N>(Ids))
    return false;
  for (unsigned J = 0; J < N; ++J)
    if (Ids[J] != (N - 1) - J)
      return false;
  return true;
}

static constexpr auto matchHalfMirrorPattern = matchMirrorPattern<8>;
static constexpr auto matchRowMirrorPattern = matchMirrorPattern<16>;

/// Match a 16-lane cyclic rotation; returns the rotation amount in [1, 15].
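/// For example, Ids[J] = (J + 2) % 16 in every row has Ids[0] = 2 and is
/// matched as a rotation by 16 - 2 = 14 lanes.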
static std::optional<unsigned> matchRowRotatePattern(ArrayRef<uint8_t> Ids) {
  if (Ids[0] == 0 || !isFullRowPattern(Ids))
    return std::nullopt;
  for (unsigned J = 1; J < 16; ++J)
    if (Ids[J] != (Ids[0] + J) % 16)
      return std::nullopt;
  return 16u - Ids[0];
}

/// Match a row-share pattern: all 16 lanes of each row read the same source
/// lane. Returns the shared source lane index in [0, 16).
static std::optional<unsigned> matchRowSharePattern(ArrayRef<uint8_t> Ids) {
  if (!isFullRowPattern(Ids))
    return std::nullopt;
  if (!all_equal(Ids.take_front(16)))
    return std::nullopt;
  return Ids[0];
}

/// Match an XOR mask pattern within each 16-lane row: Ids[J] == Mask ^ J,
/// with Mask in [1, 15].
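///
/// For example, Ids[J] = J ^ 1 (an adjacent-lane swap within each row) is
/// matched with Mask = 1.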
static std::optional<unsigned> matchRowXMaskPattern(ArrayRef<uint8_t> Ids) {
  unsigned Mask = Ids[0];
  if (Mask == 0 || !isFullRowPattern(Ids))
    return std::nullopt;
  for (unsigned J = 0; J < 16; ++J)
    if (Ids[J] != (Mask ^ J))
      return std::nullopt;
  return Mask;
}

/// Match an 8-lane arbitrary permutation, encoded as the v_mov_b32_dpp8
/// 24-bit selector (three bits per output lane).
static std::optional<unsigned> matchHalfRowPermPattern(ArrayRef<uint8_t> Ids) {
  if (!isHalfRowPattern(Ids))
    return std::nullopt;
  unsigned Selector = 0;
  for (unsigned J = 0; J < 8; ++J)
    Selector |= Ids[J] << (J * 3);
  return Selector;
}

/// Pack a 16-lane permutation into a single 64-bit value: four bits per output
/// lane, lane J in bits [J*4 + 3 : J*4]. The caller splits it into the low and
/// high 32-bit selector operands of v_permlane16 / v_permlanex16.
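///
/// For example, the identity permutation {0, 1, ..., 15} packs to
/// 0xFEDCBA9876543210.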
static uint64_t packPermlaneSelector(ArrayRef<uint8_t> Ids) {
  uint64_t Sel = 0;
  for (unsigned J = 0; J < 16; ++J)
    Sel |= static_cast<uint64_t>(Ids[J] & 0xF) << (J * 4);
  return Sel;
}

/// Match a half-wave swap: lane J reads from lane J ^ 32. Only meaningful on
/// wave64 targets.
static bool matchHalfWaveSwapPattern(ArrayRef<uint8_t> Ids) {
  if (Ids.size() != 64)
    return false;
  for (unsigned J = 0; J < 64; ++J)
    if (Ids[J] != (J ^ 32))
      return false;
  return true;
}

/// Match a cross-row permutation suitable for v_permlanex16: every lane in
/// the low 16-lane half reads from the high half of its own row, and vice
/// versa.
static bool isCrossRowPattern(ArrayRef<uint8_t> Ids) {
  if (!hasPeriodicLayout<32>(Ids))
    return false;
  for (unsigned J = 0; J < 16; ++J) {
    if (Ids[J] < 16 || Ids[J] >= 32)
      return false;
    if (Ids[J + 16] != Ids[J] - 16)
      return false;
  }
  return true;
}

/// Match a DS_SWIZZLE bitmask-mode permutation:
///   dst_lane = ((src_lane & AND) | OR) ^ XOR
/// with each mask being five bits. Returns the encoded swizzle immediate.
/// The hardware applies the formula independently within each 32-lane group,
/// so on wave64 the high group must replicate the low one (translated by 32).
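///
/// For example, an adjacent-lane swap (dst_lane = src_lane ^ 1) is matched
/// with AND = 0x1F, OR = 0, XOR = 1.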
static std::optional<unsigned>
matchDsSwizzleBitmaskPattern(ArrayRef<uint8_t> Ids) {
  if (!hasPeriodicLayout<32>(Ids))
    return std::nullopt;

  // The formula is per-bit: output bit B depends only on input bit B. Probe
  // each bit with src=0 and src=(1<<B); if the output bit flipped, AND[B]=1
  // and XOR[B] carries the constant offset; otherwise it is a constant bit
  // encoded in OR (with AND[B]=0, XOR[B]=0).
  unsigned AndMask = 0, OrMask = 0, XorMask = 0;
  for (unsigned B = 0; B < 5; ++B) {
    unsigned Bit0 = (Ids[0] >> B) & 1;
    unsigned Bit1 = (Ids[1u << B] >> B) & 1;
    if (Bit0 != Bit1) {
      AndMask |= 1u << B;
      XorMask |= Bit0 << B;
    } else {
      OrMask |= Bit0 << B;
    }
  }

  // The per-bit derivation assumes bit independence; verify the masks
  // actually reproduce every lane in the 32-lane group.
  for (unsigned I : seq(32u)) {
    unsigned Expected = ((I & AndMask) | OrMask) ^ XorMask;
    if (Ids[I] != Expected)
      return std::nullopt;
  }

  // Encode the bitmask-mode immediate: offset[14:10] = XOR, offset[9:5] = OR,
  // offset[4:0] = AND.
  return XorMask << 10 | OrMask << 5 | AndMask;
}

/// Emit v_mov_b32_dpp with the given control word, row/bank masks 0xF, and
/// bound_ctrl=1 so out-of-bounds lanes are well-defined and the DPP mov can
/// be folded into a consuming VALU op by GCNDPPCombine.
static Value *createUpdateDpp(IRBuilderBase &B, Value *Val, unsigned Ctrl) {
  Type *Ty = Val->getType();
  return B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, {Ty},
                           {PoisonValue::get(Ty), Val, B.getInt32(Ctrl),
                            B.getInt32(0xF), B.getInt32(0xF), B.getTrue()});
}

/// Emit v_mov_b32_dpp8 with the given 24-bit lane selector.
static Value *createMovDpp8(IRBuilderBase &B, Value *Val, unsigned Selector) {
  return B.CreateIntrinsic(Intrinsic::amdgcn_mov_dpp8, {Val->getType()},
                           {Val, B.getInt32(Selector)});
}

/// Emit v_permlane16 with the precomputed lane-select halves.
static Value *createPermlane16(IRBuilderBase &B, Value *Val, uint32_t Lo,
                               uint32_t Hi) {
  Type *Ty = Val->getType();
  return B.CreateIntrinsic(Intrinsic::amdgcn_permlane16, {Ty},
                           {PoisonValue::get(Ty), Val, B.getInt32(Lo),
                            B.getInt32(Hi), B.getFalse(), B.getFalse()});
}

/// Emit v_permlanex16 with the precomputed lane-select halves. Each output
/// lane reads from the other 16-lane half of the same row.
static Value *createPermlaneX16(IRBuilderBase &B, Value *Val, uint32_t Lo,
                                uint32_t Hi) {
  Type *Ty = Val->getType();
  return B.CreateIntrinsic(Intrinsic::amdgcn_permlanex16, {Ty},
                           {PoisonValue::get(Ty), Val, B.getInt32(Lo),
                            B.getInt32(Hi), B.getFalse(), B.getFalse()});
}

/// Emit ds_swizzle with the given immediate, bitcasting/converting between
/// pointer/float types and i32 as required by the intrinsic signature.
static Value *createDsSwizzle(IRBuilderBase &B, Value *Val, unsigned Offset,
                              const DataLayout &DL) {
  Type *OrigTy = Val->getType();
  assert(DL.getTypeSizeInBits(OrigTy) == 32 &&
         "ds_swizzle only supports 32-bit operands");
  IntegerType *I32Ty = B.getInt32Ty();
  Value *Src = Val;
  if (OrigTy->isPointerTy())
    Src = B.CreatePtrToInt(Src, I32Ty);
  else if (OrigTy != I32Ty)
    Src = B.CreateBitCast(Src, I32Ty);
  Value *Result = B.CreateIntrinsic(Intrinsic::amdgcn_ds_swizzle, {},
                                    {Src, B.getInt32(Offset)});
  if (OrigTy->isPointerTy())
    return B.CreateIntToPtr(Result, OrigTy);
  if (OrigTy != I32Ty)
    return B.CreateBitCast(Result, OrigTy);
  return Result;
}

/// Emit v_permlane64 (swap of the two 32-lane halves of a wave64).
static Value *createPermlane64(IRBuilderBase &B, Value *Val) {
  return B.CreateIntrinsic(Intrinsic::amdgcn_permlane64, {Val->getType()},
                           {Val});
}

/// Given a shuffle map, try to emit the best hardware intrinsic.
static Value *matchShuffleToHWIntrinsic(IRBuilderBase &B, Value *Src,
                                        ArrayRef<uint8_t> Ids,
                                        const GCNSubtarget &ST,
                                        const DataLayout &DL) {
  // Uniform shuffle (all lanes read the same value) is handled by cheaper
  // broadcast/readlane intrinsics.
  if (all_equal(Ids))
    return nullptr;

  if (std::optional<unsigned> QP = matchQuadPermPattern(Ids)) {
    if (ST.hasDPP())
      return createUpdateDpp(B, Src, *QP);
    // Fall back to DS_SWIZZLE quad-perm mode: offset[15] = 1 with the same
    // 8-bit control word in the low bits.
    return createDsSwizzle(B, Src, 0x8000 | *QP, DL);
  }

  if (ST.hasDPP()) {
    if (matchHalfMirrorPattern(Ids))
      return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_HALF_MIRROR);
    if (matchRowMirrorPattern(Ids))
      return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_MIRROR);
    if (std::optional<unsigned> Amt = matchRowRotatePattern(Ids))
      return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_ROR_FIRST + *Amt - 1);
  }

  // row_share is supported on GFX90A and GFX10+; row_xmask is GFX10+ only.
  if (ST.hasDPPRowShare()) {
    if (std::optional<unsigned> Lane = matchRowSharePattern(Ids))
      return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_SHARE_FIRST + *Lane);
  }

  if (ST.hasDPP() && ST.hasGFX10Insts()) {
    if (std::optional<unsigned> Mask = matchRowXMaskPattern(Ids))
      return createUpdateDpp(B, Src, AMDGPU::DPP::ROW_XMASK_FIRST + *Mask);
  }

  if (ST.hasDPP8()) {
    if (std::optional<unsigned> Sel = matchHalfRowPermPattern(Ids))
      return createMovDpp8(B, Src, *Sel);
  }

  if (ST.hasPermLaneX16()) {
    if (isFullRowPattern(Ids)) {
      uint64_t Sel = packPermlaneSelector(Ids);
      return createPermlane16(B, Src, Lo_32(Sel), Hi_32(Sel));
    }
    // Cross-row shuffles (e.g. XOR 16..31) are covered by permlanex16.
    if (isCrossRowPattern(Ids)) {
      uint64_t Sel = packPermlaneSelector(Ids);
      return createPermlaneX16(B, Src, Lo_32(Sel), Hi_32(Sel));
    }
  }

  // Generic DS_SWIZZLE bitmask-mode fallback: handles any 32-lane shuffle that
  // can be expressed as dst = ((src & AND) | OR) ^ XOR with 5-bit masks. This
  // is available on every target that has ds_swizzle.
  if (std::optional<unsigned> Imm = matchDsSwizzleBitmaskPattern(Ids))
    return createDsSwizzle(B, Src, *Imm, DL);

  if (ST.hasPermLane64() && matchHalfWaveSwapPattern(Ids))
    return createPermlane64(B, Src);

  return nullptr;
}

/// Try to fold a wave_shuffle/ds_bpermute whose lane index is a constant
/// function of the lane ID into a hardware-specific lane permutation intrinsic.
static std::optional<Instruction *>
simplifyWaveShuffle(InstCombiner &IC, IntrinsicInst &II,
                    const GCNSubtarget &ST) {
  const DataLayout &DL = IC.getDataLayout();
  if (DL.getTypeSizeInBits(II.getType()) != 32)
    return std::nullopt;

  if (!ST.isWaveSizeKnown())
    return std::nullopt;

  unsigned WaveSize = ST.getWavefrontSize();
  bool IsBpermute = II.getIntrinsicID() == Intrinsic::amdgcn_ds_bpermute;
  Value *Src = II.getArgOperand(IsBpermute ? 1 : 0);
  Value *Index = II.getArgOperand(IsBpermute ? 0 : 1);

  SmallVector<uint8_t, 64> Ids;
  if (IsBpermute) {
    Ids.resize(WaveSize);
    for (unsigned Lane : seq(WaveSize)) {
      std::optional<unsigned> Val = evalLaneExpr(Index, Lane, ST, DL);
      if (!Val || (*Val & 3) || (*Val >> 2) >= WaveSize)
        return std::nullopt;
      Ids[Lane] = *Val >> 2;
    }
  } else {
    if (!tryBuildShuffleMap(Index, ST, Ids, DL))
      return std::nullopt;
  }

  Value *Result = matchShuffleToHWIntrinsic(IC.Builder, Src, Ids, ST, DL);
  if (!Result)
    return std::nullopt;

  return IC.replaceInstUsesWith(II, Result);
}

std::optional<Instruction *>
GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
  Intrinsic::ID IID = II.getIntrinsicID();
  switch (IID) {
  case Intrinsic::amdgcn_implicitarg_ptr: {
    if (II.getFunction()->hasFnAttribute("amdgpu-no-implicitarg-ptr"))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
    uint64_t ImplicitArgBytes = ST->getImplicitArgNumBytes(*II.getFunction());

    uint64_t CurrentOrNullBytes =
        II.getAttributes().getRetDereferenceableOrNullBytes();
    if (CurrentOrNullBytes != 0) {
      // Refine "dereferenceable(A) meets dereferenceable_or_null(B)"
      // into dereferenceable(max(A, B)).
      uint64_t NewBytes = std::max(CurrentOrNullBytes, ImplicitArgBytes);
      II.addRetAttr(
          Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
      II.removeRetAttr(Attribute::DereferenceableOrNull);
      return &II;
    }

    uint64_t CurrentBytes = II.getAttributes().getRetDereferenceableBytes();
    uint64_t NewBytes = std::max(CurrentBytes, ImplicitArgBytes);
    if (NewBytes != CurrentBytes) {
      II.addRetAttr(
          Attribute::getWithDereferenceableBytes(II.getContext(), NewBytes));
      return &II;
    }

    return std::nullopt;
  }
  case Intrinsic::amdgcn_rcp: {
    Value *Src = II.getArgOperand(0);
    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, Src);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    if (II.isStrictFP())
      break;

    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      const APFloat &ArgVal = C->getValueAPF();
      APFloat Val(ArgVal.getSemantics(), 1);
      Val.divide(ArgVal, APFloat::rmNearestTiesToEven);

      // This is more precise than the instruction may give.
      //
      // TODO: The instruction always flushes denormal results (except for f16),
      // should this also?
      return IC.replaceInstUsesWith(II, ConstantFP::get(II.getContext(), Val));
    }

    FastMathFlags FMF = cast<FPMathOperator>(II).getFastMathFlags();
    if (!FMF.allowContract())
      break;
    auto *SrcCI = dyn_cast<IntrinsicInst>(Src);
    if (!SrcCI)
      break;

    auto IID = SrcCI->getIntrinsicID();
    // llvm.amdgcn.rcp(llvm.amdgcn.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable
    //
    // llvm.amdgcn.rcp(llvm.sqrt(x)) -> llvm.amdgcn.rsq(x) if contractable and
    // relaxed.
    if (IID == Intrinsic::amdgcn_sqrt || IID == Intrinsic::sqrt) {
      const FPMathOperator *SqrtOp = cast<FPMathOperator>(SrcCI);
      FastMathFlags InnerFMF = SqrtOp->getFastMathFlags();
      if (!InnerFMF.allowContract() || !SrcCI->hasOneUse())
        break;

      if (IID == Intrinsic::sqrt && !canContractSqrtToRsq(SqrtOp))
        break;

      Function *NewDecl = Intrinsic::getOrInsertDeclaration(
          SrcCI->getModule(), Intrinsic::amdgcn_rsq, {SrcCI->getType()});

      InnerFMF |= FMF;
      II.setFastMathFlags(InnerFMF);

      II.setCalledFunction(NewDecl);
      return IC.replaceOperand(II, 0, SrcCI->getArgOperand(0));
    }

    break;
  }
  case Intrinsic::amdgcn_sqrt:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_tanh: {
    Value *Src = II.getArgOperand(0);
    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, Src);

    // TODO: Move to ConstantFolding/InstSimplify?
    if (isa<UndefValue>(Src)) {
      Type *Ty = II.getType();
      auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
      return IC.replaceInstUsesWith(II, QNaN);
    }

    // f16 amdgcn.sqrt is identical to regular sqrt.
    if (IID == Intrinsic::amdgcn_sqrt && Src->getType()->isHalfTy()) {
      Function *NewDecl = Intrinsic::getOrInsertDeclaration(
          II.getModule(), Intrinsic::sqrt, {II.getType()});
      II.setCalledFunction(NewDecl);
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_log:
  case Intrinsic::amdgcn_exp2: {
    const bool IsLog = IID == Intrinsic::amdgcn_log;
    const bool IsExp = IID == Intrinsic::amdgcn_exp2;
    Value *Src = II.getArgOperand(0);
    Type *Ty = II.getType();

    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, Src);

    if (IC.getSimplifyQuery().isUndefValue(Src))
      return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

    if (ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      if (C->isInfinity()) {
        // exp2(+inf) -> +inf
        // log2(+inf) -> +inf
        if (!C->isNegative())
          return IC.replaceInstUsesWith(II, C);

        // exp2(-inf) -> 0
        if (IsExp && C->isNegative())
          return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));
      }

      if (II.isStrictFP())
        break;

      if (C->isNaN()) {
        Constant *Quieted = ConstantFP::get(Ty, C->getValue().makeQuiet());
        return IC.replaceInstUsesWith(II, Quieted);
      }

      // f32 instruction doesn't handle denormals, f16 does.
      if (C->isZero() || (C->getValue().isDenormal() && Ty->isFloatTy())) {
        Constant *FoldedValue = IsLog ? ConstantFP::getInfinity(Ty, true)
                                      : ConstantFP::get(Ty, 1.0);
        return IC.replaceInstUsesWith(II, FoldedValue);
      }

      if (IsLog && C->isNegative())
        return IC.replaceInstUsesWith(II, ConstantFP::getNaN(Ty));

      // TODO: Full constant folding matching hardware behavior.
    }

    break;
  }
  case Intrinsic::amdgcn_frexp_mant:
  case Intrinsic::amdgcn_frexp_exp: {
    Value *Src = II.getArgOperand(0);
    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
      int Exp;
      APFloat Significand =
          frexp(C->getValueAPF(), Exp, APFloat::rmNearestTiesToEven);

      if (IID == Intrinsic::amdgcn_frexp_mant) {
        return IC.replaceInstUsesWith(
            II, ConstantFP::get(II.getContext(), Significand));
      }

      // Match instruction special case behavior.
      if (Exp == APFloat::IEK_NaN || Exp == APFloat::IEK_Inf)
        Exp = 0;

      return IC.replaceInstUsesWith(II,
                                    ConstantInt::getSigned(II.getType(), Exp));
    }

    if (isa<PoisonValue>(Src))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_class: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    const ConstantInt *CMask = dyn_cast<ConstantInt>(Src1);
    if (CMask) {
      II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
          II.getModule(), Intrinsic::is_fpclass, Src0->getType()));

      // Clamp any excess bits, as they're illegal for the generic intrinsic.
      II.setArgOperand(1, ConstantInt::get(Src1->getType(),
                                           CMask->getZExtValue() & fcAllFlags));
      return &II;
    }

    // Propagate poison.
    if (isa<PoisonValue>(Src0) || isa<PoisonValue>(Src1))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    // llvm.amdgcn.class(_, undef) -> false
    if (IC.getSimplifyQuery().isUndefValue(Src1))
      return IC.replaceInstUsesWith(II, ConstantInt::get(II.getType(), false));

    // llvm.amdgcn.class(undef, mask) -> mask != 0
    if (IC.getSimplifyQuery().isUndefValue(Src0)) {
      Value *CmpMask = IC.Builder.CreateICmpNE(
          Src1, ConstantInt::getNullValue(Src1->getType()));
      return IC.replaceInstUsesWith(II, CmpMask);
    }
    break;
  }
  case Intrinsic::amdgcn_cvt_pkrtz: {
    auto foldFPTruncToF16RTZ = [](Value *Arg) -> Value * {
      Type *HalfTy = Type::getHalfTy(Arg->getContext());

      if (isa<PoisonValue>(Arg))
        return PoisonValue::get(HalfTy);
      if (isa<UndefValue>(Arg))
        return UndefValue::get(HalfTy);

      ConstantFP *CFP = nullptr;
      if (match(Arg, m_ConstantFP(CFP))) {
        bool LosesInfo;
        APFloat Val(CFP->getValueAPF());
        Val.convert(APFloat::IEEEhalf(), APFloat::rmTowardZero, &LosesInfo);
        return ConstantFP::get(HalfTy, Val);
      }

      Value *Src = nullptr;
      if (match(Arg, m_FPExt(m_Value(Src)))) {
        if (Src->getType()->isHalfTy())
          return Src;
      }

      return nullptr;
    };

    if (Value *Src0 = foldFPTruncToF16RTZ(II.getArgOperand(0))) {
      if (Value *Src1 = foldFPTruncToF16RTZ(II.getArgOperand(1))) {
        Value *V = PoisonValue::get(II.getType());
        V = IC.Builder.CreateInsertElement(V, Src0, (uint64_t)0);
        V = IC.Builder.CreateInsertElement(V, Src1, (uint64_t)1);
        return IC.replaceInstUsesWith(II, V);
      }
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_pknorm_i16:
  case Intrinsic::amdgcn_cvt_pknorm_u16:
  case Intrinsic::amdgcn_cvt_pk_i16:
  case Intrinsic::amdgcn_cvt_pk_u16: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    // TODO: Replace call with scalar operation if only one element is poison.
    if (isa<PoisonValue>(Src0) && isa<PoisonValue>(Src1))
      return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));

    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1)) {
      return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
    }

    break;
  }
  case Intrinsic::amdgcn_cvt_off_f32_i4: {
    Value *Arg = II.getArgOperand(0);
    Type *Ty = II.getType();

    if (isa<PoisonValue>(Arg))
      return IC.replaceInstUsesWith(II, PoisonValue::get(Ty));

    if (IC.getSimplifyQuery().isUndefValue(Arg))
      return IC.replaceInstUsesWith(II, ConstantFP::getZero(Ty));

    ConstantInt *CArg = dyn_cast<ConstantInt>(II.getArgOperand(0));
    if (!CArg)
      break;

    // Tabulated 0.0625 * (sext (CArg & 0xf)).
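    // e.g. CArg = 15: sext(0xF) is -1, so the result is 0.0625 * -1 = -0.0625,
    // matching ResVals[15].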
    constexpr size_t ResValsSize = 16;
    static constexpr float ResVals[ResValsSize] = {
        0.0,  0.0625,  0.125,  0.1875,  0.25,  0.3125,  0.375,  0.4375,
        -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625};
    Constant *Res =
        ConstantFP::get(Ty, ResVals[CArg->getZExtValue() & (ResValsSize - 1)]);
    return IC.replaceInstUsesWith(II, Res);
  }
  case Intrinsic::amdgcn_ubfe:
  case Intrinsic::amdgcn_sbfe: {
    // Decompose simple cases into standard shifts.
    Value *Src = II.getArgOperand(0);
    if (isa<UndefValue>(Src)) {
      return IC.replaceInstUsesWith(II, Src);
    }

    unsigned Width;
    Type *Ty = II.getType();
    unsigned IntSize = Ty->getIntegerBitWidth();

    ConstantInt *CWidth = dyn_cast<ConstantInt>(II.getArgOperand(2));
    if (CWidth) {
      Width = CWidth->getZExtValue();
      if ((Width & (IntSize - 1)) == 0) {
        return IC.replaceInstUsesWith(II, ConstantInt::getNullValue(Ty));
      }

      // Hardware ignores high bits, so remove those.
      if (Width >= IntSize) {
        return IC.replaceOperand(
            II, 2, ConstantInt::get(CWidth->getType(), Width & (IntSize - 1)));
      }
    }

    unsigned Offset;
    ConstantInt *COffset = dyn_cast<ConstantInt>(II.getArgOperand(1));
    if (COffset) {
      Offset = COffset->getZExtValue();
      if (Offset >= IntSize) {
        return IC.replaceOperand(
            II, 1,
            ConstantInt::get(COffset->getType(), Offset & (IntSize - 1)));
      }
    }

    bool Signed = IID == Intrinsic::amdgcn_sbfe;

    if (!CWidth || !COffset)
      break;

    // The case of Width == 0 is handled above, which makes this transformation
    // safe. If Width == 0, then the ashr and lshr instructions become poison
    // value since the shift amount would be equal to the bit size.
    assert(Width != 0);

    // TODO: This allows folding to undef when the hardware has specific
    // behavior?
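    //
    // e.g. for i32 ubfe(%x, 8, 8): shl by 16 then lshr by 24 leaves bits 15:8
    // of %x in the low byte.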
    if (Offset + Width < IntSize) {
      Value *Shl = IC.Builder.CreateShl(Src, IntSize - Offset - Width);
      Value *RightShift = Signed ? IC.Builder.CreateAShr(Shl, IntSize - Width)
                                 : IC.Builder.CreateLShr(Shl, IntSize - Width);
      RightShift->takeName(&II);
      return IC.replaceInstUsesWith(II, RightShift);
    }

    Value *RightShift = Signed ? IC.Builder.CreateAShr(Src, Offset)
                               : IC.Builder.CreateLShr(Src, Offset);

    RightShift->takeName(&II);
    return IC.replaceInstUsesWith(II, RightShift);
  }
  case Intrinsic::amdgcn_exp:
  case Intrinsic::amdgcn_exp_row:
  case Intrinsic::amdgcn_exp_compr: {
    ConstantInt *En = cast<ConstantInt>(II.getArgOperand(1));
    unsigned EnBits = En->getZExtValue();
    if (EnBits == 0xf)
      break; // All inputs enabled.

    bool IsCompr = IID == Intrinsic::amdgcn_exp_compr;
    bool Changed = false;
    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
        Value *Src = II.getArgOperand(I + 2);
        if (!isa<PoisonValue>(Src)) {
          IC.replaceOperand(II, I + 2, PoisonValue::get(Src->getType()));
          Changed = true;
        }
      }
    }

    if (Changed) {
      return &II;
    }

    break;
  }
  case Intrinsic::amdgcn_fmed3: {
    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);
    Value *Src2 = II.getArgOperand(2);

    for (Value *Src : {Src0, Src1, Src2}) {
      if (isa<PoisonValue>(Src))
        return IC.replaceInstUsesWith(II, Src);
    }

    if (II.isStrictFP())
      break;

    // med3 with a nan input acts like
    //   v_min_f32(v_min_f32(s0, s1), s2)
    //
    // Signalingness is ignored with ieee=0, so we fold to
    // minimumnum/maximumnum. With ieee=1, the v_min_f32 acts like llvm.minnum
    // with signaling nan handling. With ieee=0, like llvm.minimumnum except a
    // returned signaling nan will not be quieted.

    // ieee=1
    // s0 snan: s2
    // s1 snan: s2
    // s2 snan: qnan

    // s0 qnan: min(s1, s2)
    // s1 qnan: min(s0, s2)
    // s2 qnan: min(s0, s1)

    // ieee=0
    // s0 _nan: min(s1, s2)
    // s1 _nan: min(s0, s2)
    // s2 _nan: min(s0, s1)

    // med3 behavior with infinity
    // s0 +inf: max(s1, s2)
    // s1 +inf: max(s0, s2)
    // s2 +inf: max(s0, s1)
    // s0 -inf: min(s1, s2)
    // s1 -inf: min(s0, s2)
    // s2 -inf: min(s0, s1)

    // Checking for NaN before canonicalization provides better fidelity when
    // mapping other operations onto fmed3 since the order of operands is
    // unchanged.
    Value *V = nullptr;
    const APFloat *ConstSrc0 = nullptr;
    const APFloat *ConstSrc1 = nullptr;
    const APFloat *ConstSrc2 = nullptr;

    if ((match(Src0, m_APFloat(ConstSrc0)) &&
         (ConstSrc0->isNaN() || ConstSrc0->isInfinity())) ||
        isa<UndefValue>(Src0)) {
      const bool IsPosInfinity = ConstSrc0 && ConstSrc0->isPosInfinity();
      switch (fpenvIEEEMode(II)) {
      case KnownIEEEMode::On:
        // TODO: If Src2 is snan, does it need quieting?
        if (ConstSrc0 && ConstSrc0->isNaN() && ConstSrc0->isSignaling())
          return IC.replaceInstUsesWith(II, Src2);

        V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src1, Src2)
                          : IC.Builder.CreateMinNum(Src1, Src2);
        break;
      case KnownIEEEMode::Off:
        V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src1, Src2)
                          : IC.Builder.CreateMinimumNum(Src1, Src2);
        break;
      case KnownIEEEMode::Unknown:
        break;
      }
    } else if ((match(Src1, m_APFloat(ConstSrc1)) &&
                (ConstSrc1->isNaN() || ConstSrc1->isInfinity())) ||
               isa<UndefValue>(Src1)) {
      const bool IsPosInfinity = ConstSrc1 && ConstSrc1->isPosInfinity();
      switch (fpenvIEEEMode(II)) {
      case KnownIEEEMode::On:
        // TODO: If Src2 is snan, does it need quieting?
        if (ConstSrc1 && ConstSrc1->isNaN() && ConstSrc1->isSignaling())
          return IC.replaceInstUsesWith(II, Src2);

        V = IsPosInfinity ? IC.Builder.CreateMaxNum(Src0, Src2)
                          : IC.Builder.CreateMinNum(Src0, Src2);
        break;
      case KnownIEEEMode::Off:
        V = IsPosInfinity ? IC.Builder.CreateMaximumNum(Src0, Src2)
                          : IC.Builder.CreateMinimumNum(Src0, Src2);
        break;
      case KnownIEEEMode::Unknown:
        break;
      }
    } else if ((match(Src2, m_APFloat(ConstSrc2)) &&
                (ConstSrc2->isNaN() || ConstSrc2->isInfinity())) ||
               isa<UndefValue>(Src2)) {
      switch (fpenvIEEEMode(II)) {
      case KnownIEEEMode::On:
        if (ConstSrc2 && ConstSrc2->isNaN() && ConstSrc2->isSignaling()) {
          auto *Quieted = ConstantFP::get(II.getType(), ConstSrc2->makeQuiet());
          return IC.replaceInstUsesWith(II, Quieted);
        }

        V = (ConstSrc2 && ConstSrc2->isPosInfinity())
                ? IC.Builder.CreateMaxNum(Src0, Src1)
                : IC.Builder.CreateMinNum(Src0, Src1);
        break;
      case KnownIEEEMode::Off:
        V = (ConstSrc2 && ConstSrc2->isNegInfinity())
                ? IC.Builder.CreateMinimumNum(Src0, Src1)
                : IC.Builder.CreateMaximumNum(Src0, Src1);
        break;
      case KnownIEEEMode::Unknown:
        break;
      }
    }

    if (V) {
      if (auto *CI = dyn_cast<CallInst>(V)) {
        CI->copyFastMathFlags(&II);
        CI->takeName(&II);
      }
      return IC.replaceInstUsesWith(II, V);
    }

    bool Swap = false;
    // Canonicalize constants to RHS operands.
    //
    // fmed3(c0, x, c1) -> fmed3(x, c0, c1)
    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
      std::swap(Src1, Src2);
      Swap = true;
    }

    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
      std::swap(Src0, Src1);
      Swap = true;
    }

    if (Swap) {
      II.setArgOperand(0, Src0);
      II.setArgOperand(1, Src1);
      II.setArgOperand(2, Src2);
      return &II;
    }

    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
        if (const ConstantFP *C2 = dyn_cast<ConstantFP>(Src2)) {
          APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
                                       C2->getValueAPF());
          return IC.replaceInstUsesWith(II,
                                        ConstantFP::get(II.getType(), Result));
        }
      }
    }

    if (!ST->hasMed3_16())
      break;

    // Repeat floating-point width reduction done for minnum/maxnum.
    // fmed3((fpext X), (fpext Y), (fpext Z)) -> fpext (fmed3(X, Y, Z))
    if (Value *X = matchFPExtFromF16(Src0)) {
      if (Value *Y = matchFPExtFromF16(Src1)) {
        if (Value *Z = matchFPExtFromF16(Src2)) {
          Value *NewCall = IC.Builder.CreateIntrinsic(
              IID, {X->getType()}, {X, Y, Z}, &II, II.getName());
          return new FPExtInst(NewCall, II.getType());
        }
      }
    }

    break;
  }
  case Intrinsic::amdgcn_icmp:
  case Intrinsic::amdgcn_fcmp: {
    const ConstantInt *CC = cast<ConstantInt>(II.getArgOperand(2));
    // Guard against invalid arguments.
    int64_t CCVal = CC->getZExtValue();
    bool IsInteger = IID == Intrinsic::amdgcn_icmp;
    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
      break;

    Value *Src0 = II.getArgOperand(0);
    Value *Src1 = II.getArgOperand(1);

    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
        Constant *CCmp = ConstantFoldCompareInstOperands(
            (ICmpInst::Predicate)CCVal, CSrc0, CSrc1, DL);
        if (CCmp && CCmp->isNullValue()) {
          return IC.replaceInstUsesWith(
              II, IC.Builder.CreateSExt(CCmp, II.getType()));
        }

        // The result of V_ICMP/V_FCMP assembly instructions (which this
        // intrinsic exposes) is one bit per thread, masked with the EXEC
        // register (which contains the bitmask of live threads). So a
        // comparison that always returns true is the same as a read of the
        // EXEC register.
        Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
        MDNode *MD = MDNode::get(II.getContext(), MDArgs);
        Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
        CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
                                                       II.getType(), Args);
        NewCall->addFnAttr(Attribute::Convergent);
        NewCall->takeName(&II);
        return IC.replaceInstUsesWith(II, NewCall);
      }

      // Canonicalize constants to RHS.
      CmpInst::Predicate SwapPred =
          CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
      II.setArgOperand(0, Src1);
      II.setArgOperand(1, Src0);
      II.setArgOperand(
          2, ConstantInt::get(CC->getType(), static_cast<int>(SwapPred)));
      return &II;
    }

    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
      break;

    // Canonicalize compare eq with true value to compare != 0
    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
    Value *ExtSrc;
    if (CCVal == CmpInst::ICMP_EQ &&
        ((match(Src1, PatternMatch::m_One()) &&
          match(Src0, m_ZExt(PatternMatch::m_Value(ExtSrc)))) ||
         (match(Src1, PatternMatch::m_AllOnes()) &&
          match(Src0, m_SExt(PatternMatch::m_Value(ExtSrc))))) &&
        ExtSrc->getType()->isIntegerTy(1)) {
      IC.replaceOperand(II, 1, ConstantInt::getNullValue(Src1->getType()));
      IC.replaceOperand(II, 2,
                        ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
      return &II;
    }

    CmpPredicate SrcPred;
    Value *SrcLHS;
    Value *SrcRHS;

    // Fold compare eq/ne with 0 from a compare result as the predicate to the
    // intrinsic. The typical use is a wave vote function in the library, which
    // will be fed from a user code condition compared with 0. Fold in the
    // redundant compare.

    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
    //
    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
    if (match(Src1, PatternMatch::m_Zero()) &&
        match(Src0, PatternMatch::m_ZExtOrSExt(
                        m_Cmp(SrcPred, PatternMatch::m_Value(SrcLHS),
                              PatternMatch::m_Value(SrcRHS))))) {
      if (CCVal == CmpInst::ICMP_EQ)
        SrcPred = CmpInst::getInversePredicate(SrcPred);

      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred)
                                 ? Intrinsic::amdgcn_fcmp
                                 : Intrinsic::amdgcn_icmp;

      Type *Ty = SrcLHS->getType();
      if (auto *CmpType = dyn_cast<IntegerType>(Ty)) {
        // Promote to next legal integer type.
        unsigned Width = CmpType->getBitWidth();
        unsigned NewWidth = Width;

        // Don't do anything for i1 comparisons.
        if (Width == 1)
          break;

        if (Width <= 16)
          NewWidth = 16;
        else if (Width <= 32)
          NewWidth = 32;
        else if (Width <= 64)
          NewWidth = 64;
        else
          break; // Can't handle this.

        if (Width != NewWidth) {
          IntegerType *CmpTy = IC.Builder.getIntNTy(NewWidth);
          if (CmpInst::isSigned(SrcPred)) {
            SrcLHS = IC.Builder.CreateSExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateSExt(SrcRHS, CmpTy);
          } else {
            SrcLHS = IC.Builder.CreateZExt(SrcLHS, CmpTy);
            SrcRHS = IC.Builder.CreateZExt(SrcRHS, CmpTy);
          }
        }
      } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
        break;

      Value *Args[] = {SrcLHS, SrcRHS,
                       ConstantInt::get(CC->getType(), SrcPred)};
      CallInst *NewCall = IC.Builder.CreateIntrinsic(
          NewIID, {II.getType(), SrcLHS->getType()}, Args);
      NewCall->takeName(&II);
      return IC.replaceInstUsesWith(II, NewCall);
    }

    break;
  }
1755 case Intrinsic::amdgcn_mbcnt_hi:
1756 // exec_hi is all 0, so this is just a copy.
1757 if (ST->isWave32())
1758 return IC.replaceInstUsesWith(II, II.getArgOperand(1));
1759 [[fallthrough]];
1760 case Intrinsic::amdgcn_mbcnt_lo: {
1761 ConstantRange AccRange =
1762 computeConstantRange(II.getArgOperand(1),
1763 /*ForSigned=*/false, IC.getSimplifyQuery());
1764 if (AccRange.isFullSet())
1765 return nullptr;
1766
1767 // TODO: Can raise lower bound by inspecting first argument.
1768 ConstantRange MbcntRange(APInt(32, 0), APInt(32, 32 + 1));
1769 ConstantRange ComputedRange = AccRange.add(MbcntRange);
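// Worked example (editorial sketch): if the accumulator argument is known to
// lie in [0, 8), adding the [0, 32] mbcnt contribution yields [0, 40), which
// is then attached below as a range attribute on the call.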
1770 if (ComputedRange.isFullSet())
1771 return nullptr;
1772
1773 if (std::optional<ConstantRange> ExistingRange = II.getRange()) {
1774 ComputedRange = ComputedRange.intersectWith(*ExistingRange);
1775 if (ComputedRange == *ExistingRange)
1776 return nullptr;
1777 }
1778
1779 II.addRangeRetAttr(ComputedRange);
1780 return nullptr;
1781 }
1782 case Intrinsic::amdgcn_ballot: {
1783 Value *Arg = II.getArgOperand(0);
1784 if (isa<PoisonValue>(Arg))
1785 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1786
1787 if (auto *Src = dyn_cast<ConstantInt>(Arg)) {
1788 if (Src->isZero()) {
1789 // amdgcn.ballot(i1 0) is zero.
1790 return IC.replaceInstUsesWith(II, Constant::getNullValue(II.getType()));
1791 }
1792 }
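// Editorial note: a constant-true argument has no analogous fold, since
// amdgcn.ballot(i1 true) returns the current exec mask, which is not a
// compile-time constant; only the zero case is foldable.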
1793 if (ST->isWave32() && II.getType()->getIntegerBitWidth() == 64) {
1794 // %b64 = call i64 ballot.i64(...)
1795 // =>
1796 // %b32 = call i32 ballot.i32(...)
1797 // %b64 = zext i32 %b32 to i64
1798 Value *Call = IC.Builder.CreateZExt(
1799 IC.Builder.CreateIntrinsic(Intrinsic::amdgcn_ballot,
1800 {IC.Builder.getInt32Ty()},
1801 {II.getArgOperand(0)}),
1802 II.getType());
1803 Call->takeName(&II);
1804 return IC.replaceInstUsesWith(II, Call);
1805 }
1806 break;
1807 }
1808 case Intrinsic::amdgcn_wavefrontsize: {
1809 if (ST->isWaveSizeKnown())
1810 return IC.replaceInstUsesWith(
1811 II, ConstantInt::get(II.getType(), ST->getWavefrontSize()));
1812 break;
1813 }
1814 case Intrinsic::amdgcn_wqm_vote: {
1815 // wqm_vote is identity when the argument is constant.
1816 if (!isa<Constant>(II.getArgOperand(0)))
1817 break;
1818
1819 return IC.replaceInstUsesWith(II, II.getArgOperand(0));
1820 }
1821 case Intrinsic::amdgcn_kill: {
1822 const ConstantInt *C = dyn_cast<ConstantInt>(II.getArgOperand(0));
1823 if (!C || !C->getZExtValue())
1824 break;
1825
1826 // amdgcn.kill(i1 1) is a no-op
1827 return IC.eraseInstFromFunction(II);
1828 }
1829 case Intrinsic::amdgcn_s_sendmsg:
1830 case Intrinsic::amdgcn_s_sendmsghalt: {
1831 // The second operand is copied to m0, but is only actually used for
1832 // certain message types. For message types that are known to not use m0,
1833 // fold it to poison.
1834 using namespace AMDGPU::SendMsg;
1835
1836 Value *M0Val = II.getArgOperand(1);
1837 if (isa<PoisonValue>(M0Val))
1838 break;
1839
1840 auto *MsgImm = cast<ConstantInt>(II.getArgOperand(0));
1841 uint16_t MsgId, OpId, StreamId;
1842 decodeMsg(MsgImm->getZExtValue(), MsgId, OpId, StreamId, *ST);
1843
1844 if (!msgDoesNotUseM0(MsgId, *ST))
1845 break;
1846
1847 // Drop UB-implying attributes since we're replacing with poison.
1848 II.dropUBImplyingAttrsAndMetadata();
1849 IC.replaceOperand(II, 1, PoisonValue::get(M0Val->getType()));
1850 return nullptr;
1851 }
1852 case Intrinsic::amdgcn_update_dpp: {
1853 Value *Old = II.getArgOperand(0);
1854
1855 auto *BC = cast<ConstantInt>(II.getArgOperand(5));
1856 auto *RM = cast<ConstantInt>(II.getArgOperand(3));
1857 auto *BM = cast<ConstantInt>(II.getArgOperand(4));
1858 if (BC->isNullValue() || RM->getZExtValue() != 0xF ||
1859 BM->getZExtValue() != 0xF || isa<PoisonValue>(Old))
1860 break;
1861
1862 // If bound_ctrl = 1 and row mask = bank mask = 0xf, we can omit the old value.
1863 return IC.replaceOperand(II, 0, PoisonValue::get(Old->getType()));
1864 }
1865 case Intrinsic::amdgcn_permlane16:
1866 case Intrinsic::amdgcn_permlane16_var:
1867 case Intrinsic::amdgcn_permlanex16:
1868 case Intrinsic::amdgcn_permlanex16_var: {
1869 // Discard vdst_in if it's not going to be read.
1870 Value *VDstIn = II.getArgOperand(0);
1871 if (isa<PoisonValue>(VDstIn))
1872 break;
1873
1874 // FetchInvalid operand idx.
1875 unsigned int FiIdx = (IID == Intrinsic::amdgcn_permlane16 ||
1876 IID == Intrinsic::amdgcn_permlanex16)
1877 ? 4 /* for permlane16 and permlanex16 */
1878 : 3; /* for permlane16_var and permlanex16_var */
1879
1880 // BoundCtrl operand idx.
1881 // For permlane16 and permlanex16 it should be 5
1882 // For permlane16_var and permlanex16_var it should be 4
1883 unsigned int BcIdx = FiIdx + 1;
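// Editorial sketch of the operand layouts assumed by the indices above:
//   permlane16(vdst_in, src0, lane_sel_lo, lane_sel_hi, fi, bound_ctrl)
//   permlane16_var(vdst_in, src0, lane_sel, fi, bound_ctrl)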
1884
1885 ConstantInt *FetchInvalid = cast<ConstantInt>(II.getArgOperand(FiIdx));
1886 ConstantInt *BoundCtrl = cast<ConstantInt>(II.getArgOperand(BcIdx));
1887 if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
1888 break;
1889
1890 return IC.replaceOperand(II, 0, PoisonValue::get(VDstIn->getType()));
1891 }
1892 case Intrinsic::amdgcn_wave_shuffle:
1893 return tryOptimizeShufflePattern(IC, II, *ST);
1894 case Intrinsic::amdgcn_permlane64:
1895 case Intrinsic::amdgcn_readfirstlane:
1896 case Intrinsic::amdgcn_readlane:
1897 case Intrinsic::amdgcn_ds_bpermute: {
1898 // If the data argument is uniform these intrinsics return it unchanged.
1899 unsigned SrcIdx = IID == Intrinsic::amdgcn_ds_bpermute ? 1 : 0;
1900 const Use &Src = II.getArgOperandUse(SrcIdx);
1901 if (isTriviallyUniform(Src))
1902 return IC.replaceInstUsesWith(II, Src.get());
1903
1904 if (IID == Intrinsic::amdgcn_readlane &&
1905 simplifyDemandedLaneMaskArg(IC, II, 1))
1906 return &II;
1907
1908 // If the lane argument of bpermute is uniform, change it to readlane. This
1909 // generates better code and can enable further optimizations because
1910 // readlane is AlwaysUniform.
1911 if (IID == Intrinsic::amdgcn_ds_bpermute) {
1912 const Use &Lane = II.getArgOperandUse(0);
1913 if (isTriviallyUniform(Lane)) {
1914 Value *NewLane = IC.Builder.CreateLShr(Lane, 2);
1915 Function *NewDecl = Intrinsic::getOrInsertDeclaration(
1916 II.getModule(), Intrinsic::amdgcn_readlane, II.getType());
1917 II.setCalledFunction(NewDecl);
1918 II.setOperand(0, Src);
1919 II.setOperand(1, NewLane);
1920 return &II;
1921 }
1922 }
1923
1924 if (IID == Intrinsic::amdgcn_ds_bpermute)
1925 return tryOptimizeShufflePattern(IC, II, *ST);
1926
1927 if (Instruction *Res = hoistLaneIntrinsicThroughOperand(IC, II))
1928 return Res;
1929
1930 return std::nullopt;
1931 }
1932 case Intrinsic::amdgcn_writelane: {
1933 // TODO: Fold bitcast like readlane.
1934 if (simplifyDemandedLaneMaskArg(IC, II, 1))
1935 return &II;
1936 return std::nullopt;
1937 }
1938 case Intrinsic::amdgcn_trig_preop: {
1939 // The intrinsic is declared with name mangling, but currently the
1940 // instruction only exists for f64.
1941 if (!II.getType()->isDoubleTy())
1942 break;
1943
1944 Value *Src = II.getArgOperand(0);
1945 Value *Segment = II.getArgOperand(1);
1946 if (isa<PoisonValue>(Src) || isa<PoisonValue>(Segment))
1947 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
1948
1949 if (isa<UndefValue>(Segment))
1950 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1951
1952 // Sign bit is not used.
1953 Value *StrippedSign = InstCombiner::stripSignOnlyFPOps(Src);
1954 if (StrippedSign != Src)
1955 return IC.replaceOperand(II, 0, StrippedSign);
1956
1957 if (II.isStrictFP())
1958 break;
1959
1960 const ConstantFP *CSrc = dyn_cast<ConstantFP>(Src);
1961 if (!CSrc && !isa<UndefValue>(Src))
1962 break;
1963
1964 // The instruction ignores special cases, and literally just extracts the
1965 // exponents. Fold undef to nan, and index the table as normal.
1966 APInt FSrcInt = CSrc ? CSrc->getValueAPF().bitcastToAPInt()
1967 : APFloat::getQNaN(II.getType()->getFltSemantics())
1968 .bitcastToAPInt();
1969
1970 const ConstantInt *Cseg = dyn_cast<ConstantInt>(Segment);
1971 if (!Cseg) {
1972 if (isa<UndefValue>(Src))
1973 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
1974 break;
1975 }
1976
1977 unsigned Exponent = FSrcInt.extractBitsAsZExtValue(11, 52);
1978 unsigned SegmentVal = Cseg->getValue().trunc(5).getZExtValue();
1979 unsigned Shift = SegmentVal * 53;
1980 if (Exponent > 1077)
1981 Shift += Exponent - 1077;
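// Worked example (editorial, not from the source): for Src = 100.0 the
// biased exponent field is 1029 (<= 1077, so no extra term) and Segment = 1
// gives Shift = 53; the indexing below then yields Idx = 53 >> 5 = 1 and
// BShift = 53 & 0x1f = 21.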
1982
1983 // 2.0/PI table.
1984 static const uint32_t TwoByPi[] = {
1985 0xa2f9836e, 0x4e441529, 0xfc2757d1, 0xf534ddc0, 0xdb629599, 0x3c439041,
1986 0xfe5163ab, 0xdebbc561, 0xb7246e3a, 0x424dd2e0, 0x06492eea, 0x09d1921c,
1987 0xfe1deb1c, 0xb129a73e, 0xe88235f5, 0x2ebb4484, 0xe99c7026, 0xb45f7e41,
1988 0x3991d639, 0x835339f4, 0x9c845f8b, 0xbdf9283b, 0x1ff897ff, 0xde05980f,
1989 0xef2f118b, 0x5a0a6d1f, 0x6d367ecf, 0x27cb09b7, 0x4f463f66, 0x9e5fea2d,
1990 0x7527bac7, 0xebe5f17b, 0x3d0739f7, 0x8a5292ea, 0x6bfb5fb1, 0x1f8d5d08,
1991 0x56033046};
1992
1993 // Return 0 for an out-of-bounds segment (matching hardware behavior).
1994 unsigned Idx = Shift >> 5;
1995 if (Idx + 2 >= std::size(TwoByPi)) {
1996 APFloat Zero = APFloat::getZero(II.getType()->getFltSemantics());
1997 return IC.replaceInstUsesWith(II, ConstantFP::get(II.getType(), Zero));
1998 }
1999
2000 unsigned BShift = Shift & 0x1f;
2001 uint64_t Thi = Make_64(TwoByPi[Idx], TwoByPi[Idx + 1]);
2002 uint64_t Tlo = Make_64(TwoByPi[Idx + 2], 0);
2003 if (BShift)
2004 Thi = (Thi << BShift) | (Tlo >> (64 - BShift));
2005 Thi = Thi >> 11;
2006 APFloat Result = APFloat((double)Thi);
2007
2008 int Scale = -53 - Shift;
2009 if (Exponent >= 1968)
2010 Scale += 128;
2011
2012 Result = scalbn(Result, Scale, RoundingMode::NearestTiesToEven);
2013 return IC.replaceInstUsesWith(II, ConstantFP::get(Src->getType(), Result));
2014 }
2015 case Intrinsic::amdgcn_fmul_legacy: {
2016 Value *Op0 = II.getArgOperand(0);
2017 Value *Op1 = II.getArgOperand(1);
2018
2019 for (Value *Src : {Op0, Op1}) {
2020 if (isa<PoisonValue>(Src))
2021 return IC.replaceInstUsesWith(II, Src);
2022 }
2023
2024 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2025 // infinity, gives +0.0.
2026 // TODO: Move to InstSimplify?
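// E.g. (editorial note): fmul_legacy(+0.0, NaN) evaluates to +0.0, so the
// zero test alone justifies folding to +0.0 here, whereas an IEEE fmul of
// the same operands would produce NaN.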
2027 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2028 match(Op1, PatternMatch::m_AnyZeroFP()))
2029 return IC.replaceInstUsesWith(II, ConstantFP::getZero(II.getType()));
2030
2031 // If we can prove we don't have one of the special cases then we can use a
2032 // normal fmul instruction instead.
2033 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2034 auto *FMul = IC.Builder.CreateFMulFMF(Op0, Op1, &II);
2035 FMul->takeName(&II);
2036 return IC.replaceInstUsesWith(II, FMul);
2037 }
2038 break;
2039 }
2040 case Intrinsic::amdgcn_fma_legacy: {
2041 Value *Op0 = II.getArgOperand(0);
2042 Value *Op1 = II.getArgOperand(1);
2043 Value *Op2 = II.getArgOperand(2);
2044
2045 for (Value *Src : {Op0, Op1, Op2}) {
2046 if (isa<PoisonValue>(Src))
2047 return IC.replaceInstUsesWith(II, Src);
2048 }
2049
2050 // The legacy behaviour is that multiplying +/-0.0 by anything, even NaN or
2051 // infinity, gives +0.0.
2052 // TODO: Move to InstSimplify?
2053 if (match(Op0, PatternMatch::m_AnyZeroFP()) ||
2054 match(Op1, PatternMatch::m_AnyZeroFP())) {
2055 // It's tempting to just return Op2 here, but that would give the wrong
2056 // result if Op2 was -0.0.
2057 auto *Zero = ConstantFP::getZero(II.getType());
2058 auto *FAdd = IC.Builder.CreateFAddFMF(Zero, Op2, &II);
2059 FAdd->takeName(&II);
2060 return IC.replaceInstUsesWith(II, FAdd);
2061 }
2062
2063 // If we can prove we don't have one of the special cases then we can use a
2064 // normal fma instead.
2065 if (canSimplifyLegacyMulToMul(II, Op0, Op1, IC)) {
2066 II.setCalledOperand(Intrinsic::getOrInsertDeclaration(
2067 II.getModule(), Intrinsic::fma, II.getType()));
2068 return &II;
2069 }
2070 break;
2071 }
2072 case Intrinsic::amdgcn_is_shared:
2073 case Intrinsic::amdgcn_is_private: {
2074 Value *Src = II.getArgOperand(0);
2075 if (isa<PoisonValue>(Src))
2076 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2077 if (isa<UndefValue>(Src))
2078 return IC.replaceInstUsesWith(II, UndefValue::get(II.getType()));
2079
2080 if (isa<ConstantPointerNull>(II.getArgOperand(0)))
2081 return IC.replaceInstUsesWith(II, ConstantInt::getFalse(II.getType()));
2082 break;
2083 }
2084 case Intrinsic::amdgcn_make_buffer_rsrc: {
2085 Value *Src = II.getArgOperand(0);
2086 if (isa<PoisonValue>(Src))
2087 return IC.replaceInstUsesWith(II, PoisonValue::get(II.getType()));
2088 return std::nullopt;
2089 }
2090 case Intrinsic::amdgcn_raw_buffer_store_format:
2091 case Intrinsic::amdgcn_struct_buffer_store_format:
2092 case Intrinsic::amdgcn_raw_tbuffer_store:
2093 case Intrinsic::amdgcn_struct_tbuffer_store:
2094 case Intrinsic::amdgcn_image_store_1d:
2095 case Intrinsic::amdgcn_image_store_1darray:
2096 case Intrinsic::amdgcn_image_store_2d:
2097 case Intrinsic::amdgcn_image_store_2darray:
2098 case Intrinsic::amdgcn_image_store_2darraymsaa:
2099 case Intrinsic::amdgcn_image_store_2dmsaa:
2100 case Intrinsic::amdgcn_image_store_3d:
2101 case Intrinsic::amdgcn_image_store_cube:
2102 case Intrinsic::amdgcn_image_store_mip_1d:
2103 case Intrinsic::amdgcn_image_store_mip_1darray:
2104 case Intrinsic::amdgcn_image_store_mip_2d:
2105 case Intrinsic::amdgcn_image_store_mip_2darray:
2106 case Intrinsic::amdgcn_image_store_mip_3d:
2107 case Intrinsic::amdgcn_image_store_mip_cube: {
2108 if (!isa<FixedVectorType>(II.getArgOperand(0)->getType()))
2109 break;
2110
2111 APInt DemandedElts;
2112 if (ST->hasDefaultComponentBroadcast())
2113 DemandedElts = defaultComponentBroadcast(II.getArgOperand(0));
2114 else if (ST->hasDefaultComponentZero())
2115 DemandedElts = trimTrailingZerosInVector(IC, II.getArgOperand(0), &II);
2116 else
2117 break;
2118
2119 int DMaskIdx = getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID()) ? 1 : -1;
2120 if (simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, DMaskIdx,
2121 false)) {
2122 return IC.eraseInstFromFunction(II);
2123 }
2124
2125 break;
2126 }
2127 case Intrinsic::amdgcn_prng_b32: {
2128 auto *Src = II.getArgOperand(0);
2129 if (isa<UndefValue>(Src)) {
2130 return IC.replaceInstUsesWith(II, Src);
2131 }
2132 return std::nullopt;
2133 }
2134 case Intrinsic::amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
2135 case Intrinsic::amdgcn_mfma_scale_f32_32x32x64_f8f6f4: {
2136 Value *Src0 = II.getArgOperand(0);
2137 Value *Src1 = II.getArgOperand(1);
2138 uint64_t CBSZ = cast<ConstantInt>(II.getArgOperand(3))->getZExtValue();
2139 uint64_t BLGP = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
2140 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2141 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2142
2143 auto getFormatNumRegs = [](unsigned FormatVal) {
2144 switch (FormatVal) {
2145 case AMDGPU::MFMAScaleFormats::FP6_E2M3:
2146 case AMDGPU::MFMAScaleFormats::FP6_E3M2:
2147 return 6u;
2148 case AMDGPU::MFMAScaleFormats::FP4_E2M1:
2149 return 4u;
2150 case AMDGPU::MFMAScaleFormats::FP8_E4M3:
2151 case AMDGPU::MFMAScaleFormats::FP8_E5M2:
2152 return 8u;
2153 default:
2154 llvm_unreachable("invalid format value");
2155 }
2156 };
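// Editorial sketch, assuming the format encodings above: an FP4-format
// operand occupies only 4 registers, so e.g. a <8 x i32> source is narrowed
// to <4 x i32> below; FP6 formats need 6 registers and FP8 formats all 8.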
2157
2158 bool MadeChange = false;
2159 unsigned Src0NumElts = getFormatNumRegs(CBSZ);
2160 unsigned Src1NumElts = getFormatNumRegs(BLGP);
2161
2162 // Depending on the used format, fewer registers are required so shrink the
2163 // vector type.
2164 if (Src0Ty->getNumElements() > Src0NumElts) {
2165 Src0 = IC.Builder.CreateExtractVector(
2166 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2167 uint64_t(0));
2168 MadeChange = true;
2169 }
2170
2171 if (Src1Ty->getNumElements() > Src1NumElts) {
2172 Src1 = IC.Builder.CreateExtractVector(
2173 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2174 uint64_t(0));
2175 MadeChange = true;
2176 }
2177
2178 if (!MadeChange)
2179 return std::nullopt;
2180
2181 SmallVector<Value *, 10> Args(II.args());
2182 Args[0] = Src0;
2183 Args[1] = Src1;
2184
2185 CallInst *NewII = IC.Builder.CreateIntrinsic(
2186 IID, {Src0->getType(), Src1->getType()}, Args, &II);
2187 NewII->takeName(&II);
2188 return IC.replaceInstUsesWith(II, NewII);
2189 }
2190 case Intrinsic::amdgcn_wmma_f32_16x16x128_f8f6f4:
2191 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
2192 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
2193 Value *Src0 = II.getArgOperand(1);
2194 Value *Src1 = II.getArgOperand(3);
2195 unsigned FmtA = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2196 uint64_t FmtB = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
2197 auto *Src0Ty = cast<FixedVectorType>(Src0->getType());
2198 auto *Src1Ty = cast<FixedVectorType>(Src1->getType());
2199
2200 bool MadeChange = false;
2201 unsigned Src0NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtA);
2202 unsigned Src1NumElts = AMDGPU::wmmaScaleF8F6F4FormatToNumRegs(FmtB);
2203
2204 // Depending on the used format, fewer registers are required so shrink the
2205 // vector type.
2206 if (Src0Ty->getNumElements() > Src0NumElts) {
2207 Src0 = IC.Builder.CreateExtractVector(
2208 FixedVectorType::get(Src0Ty->getElementType(), Src0NumElts), Src0,
2209 IC.Builder.getInt64(0));
2210 MadeChange = true;
2211 }
2212
2213 if (Src1Ty->getNumElements() > Src1NumElts) {
2214 Src1 = IC.Builder.CreateExtractVector(
2215 FixedVectorType::get(Src1Ty->getElementType(), Src1NumElts), Src1,
2216 IC.Builder.getInt64(0));
2217 MadeChange = true;
2218 }
2219
2220 if (!MadeChange)
2221 return std::nullopt;
2222
2223 SmallVector<Value *, 13> Args(II.args());
2224 Args[1] = Src0;
2225 Args[3] = Src1;
2226
2227 CallInst *NewII = IC.Builder.CreateIntrinsic(
2228 IID, {II.getArgOperand(5)->getType(), Src0->getType(), Src1->getType()},
2229 Args, &II);
2230 NewII->takeName(&II);
2231 return IC.replaceInstUsesWith(II, NewII);
2232 }
2233 }
2234 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
2235 AMDGPU::getImageDimIntrinsicInfo(II.getIntrinsicID())) {
2236 return simplifyAMDGCNImageIntrinsic(ST, ImageDimIntr, II, IC);
2237 }
2238 return std::nullopt;
2239}
2240
2241/// Implement SimplifyDemandedVectorElts for amdgcn buffer and image intrinsics.
2242///
2243 /// For amdgcn image and buffer store intrinsics, simplification updates the
2244 /// definition of the intrinsic's vector argument rather than the Uses of the
2245 /// result, as it does for image and buffer loads.
2246/// Note: This only supports non-TFE/LWE image intrinsic calls; those have
2247 /// struct returns.
2248 static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
2249 IntrinsicInst &II,
2250 APInt DemandedElts,
2251 int DMaskIdx, bool IsLoad) {
2252
2253 auto *IIVTy = cast<FixedVectorType>(IsLoad ? II.getType()
2254 : II.getOperand(0)->getType());
2255 unsigned VWidth = IIVTy->getNumElements();
2256 if (VWidth == 1)
2257 return nullptr;
2258 Type *EltTy = IIVTy->getElementType();
2259
2260 IRBuilderBase::InsertPointGuard Guard(IC.Builder);
2261 IC.Builder.SetInsertPoint(&II);
2262
2263 // Assume the arguments are unchanged and later override them, if needed.
2264 SmallVector<Value *, 16> Args(II.args());
2265
2266 if (DMaskIdx < 0) {
2267 // Buffer case.
2268
2269 const unsigned ActiveBits = DemandedElts.getActiveBits();
2270 const unsigned UnusedComponentsAtFront = DemandedElts.countr_zero();
2271
2272 // Start by assuming the whole prefix of elements is demanded, then clear
2273 // the low bits again for any unused components at the front (trailing
2274 // zeros in DemandedElts) and update the offset accordingly.
2275 DemandedElts = (1 << ActiveBits) - 1;
2276
2277 if (UnusedComponentsAtFront > 0) {
2278 static const unsigned InvalidOffsetIdx = 0xf;
2279
2280 unsigned OffsetIdx;
2281 switch (II.getIntrinsicID()) {
2282 case Intrinsic::amdgcn_raw_buffer_load:
2283 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2284 OffsetIdx = 1;
2285 break;
2286 case Intrinsic::amdgcn_s_buffer_load:
2287 // If the resulting type is vec3, there is no point in trimming the
2288 // load with updated offset, as the vec3 would most likely be widened to
2289 // vec4 anyway during lowering.
2290 if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
2291 OffsetIdx = InvalidOffsetIdx;
2292 else
2293 OffsetIdx = 1;
2294 break;
2295 case Intrinsic::amdgcn_struct_buffer_load:
2296 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2297 OffsetIdx = 2;
2298 break;
2299 default:
2300 // TODO: handle tbuffer* intrinsics.
2301 OffsetIdx = InvalidOffsetIdx;
2302 break;
2303 }
2304
2305 if (OffsetIdx != InvalidOffsetIdx) {
2306 // Clear demanded bits and update the offset.
2307 DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
2308 auto *Offset = Args[OffsetIdx];
2309 unsigned SingleComponentSizeInBits =
2310 IC.getDataLayout().getTypeSizeInBits(EltTy);
2311 unsigned OffsetAdd =
2312 UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
2313 auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
2314 Args[OffsetIdx] = IC.Builder.CreateAdd(Offset, OffsetAddVal);
2315 }
2316 }
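// Worked example (editorial): a raw_buffer_load of <4 x float> with only
// elements 1 and 2 used has ActiveBits = 3 and UnusedComponentsAtFront = 1,
// so DemandedElts becomes 0b0110 and the offset is bumped by 32 / 8 = 4
// bytes.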
2317 } else {
2318 // Image case.
2319
2320 ConstantInt *DMask = cast<ConstantInt>(Args[DMaskIdx]);
2321 unsigned DMaskVal = DMask->getZExtValue() & 0xf;
2322
2323 // dmask 0 has special semantics, do not simplify.
2324 if (DMaskVal == 0)
2325 return nullptr;
2326
2327 // Mask off elements that are undefined because the dmask doesn't cover them.
2328 DemandedElts &= (1 << llvm::popcount(DMaskVal)) - 1;
2329
2330 unsigned NewDMaskVal = 0;
2331 unsigned OrigLdStIdx = 0;
2332 for (unsigned SrcIdx = 0; SrcIdx < 4; ++SrcIdx) {
2333 const unsigned Bit = 1 << SrcIdx;
2334 if (!!(DMaskVal & Bit)) {
2335 if (!!DemandedElts[OrigLdStIdx])
2336 NewDMaskVal |= Bit;
2337 OrigLdStIdx++;
2338 }
2339 }
2340
2341 if (DMaskVal != NewDMaskVal)
2342 Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
2343 }
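// Worked example (editorial): dmask = 0b1011 packs components x, y, w into
// three consecutive slots; with DemandedElts = 0b101 only slots 0 and 2
// survive, so NewDMaskVal = 0b1001 (x and w).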
2344
2345 unsigned NewNumElts = DemandedElts.popcount();
2346 if (!NewNumElts)
2347 return PoisonValue::get(IIVTy);
2348
2349 if (NewNumElts >= VWidth && DemandedElts.isMask()) {
2350 if (DMaskIdx >= 0)
2351 II.setArgOperand(DMaskIdx, Args[DMaskIdx]);
2352 return nullptr;
2353 }
2354
2355 // Validate function argument and return types, extracting overloaded types
2356 // along the way.
2357 SmallVector<Type *, 6> OverloadTys;
2358 if (!Intrinsic::isSignatureValid(II.getCalledFunction(), OverloadTys))
2359 return nullptr;
2360
2361 Type *NewTy =
2362 (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
2363 OverloadTys[0] = NewTy;
2364
2365 if (!IsLoad) {
2366 SmallVector<int, 8> EltMask;
2367 for (unsigned OrigStoreIdx = 0; OrigStoreIdx < VWidth; ++OrigStoreIdx)
2368 if (DemandedElts[OrigStoreIdx])
2369 EltMask.push_back(OrigStoreIdx);
2370
2371 if (NewNumElts == 1)
2372 Args[0] = IC.Builder.CreateExtractElement(II.getOperand(0), EltMask[0]);
2373 else
2374 Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
2375 }
2376
2377 CallInst *NewCall =
2378 IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
2379 NewCall->takeName(&II);
2380 NewCall->copyMetadata(II);
2381
2382 if (IsLoad) {
2383 if (NewNumElts == 1) {
2384 return IC.Builder.CreateInsertElement(PoisonValue::get(IIVTy), NewCall,
2385 DemandedElts.countr_zero());
2386 }
2387
2388 SmallVector<int, 8> EltMask;
2389 unsigned NewLoadIdx = 0;
2390 for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
2391 if (!!DemandedElts[OrigLoadIdx])
2392 EltMask.push_back(NewLoadIdx++);
2393 else
2394 EltMask.push_back(NewNumElts);
2395 }
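// E.g. (editorial sketch): VWidth = 4 with elements {0, 2} demanded loads a
// two-element vector, and the mask built above is [0, 2, 1, 2]; entries
// equal to NewNumElts select a poison lane from the implicit second operand.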
2396
2397 auto *Shuffle = IC.Builder.CreateShuffleVector(NewCall, EltMask);
2398
2399 return Shuffle;
2400 }
2401
2402 return NewCall;
2403}
2404
2405 Value *GCNTTIImpl::simplifyAMDGCNLaneIntrinsicDemanded(
2406 InstCombiner &IC, IntrinsicInst &II, const APInt &DemandedElts,
2407 APInt &UndefElts) const {
2408 auto *VT = dyn_cast<FixedVectorType>(II.getType());
2409 if (!VT)
2410 return nullptr;
2411
2412 const unsigned FirstElt = DemandedElts.countr_zero();
2413 const unsigned LastElt = DemandedElts.getActiveBits() - 1;
2414 const unsigned MaskLen = LastElt - FirstElt + 1;
2415
2416 unsigned OldNumElts = VT->getNumElements();
2417 if (MaskLen == OldNumElts && MaskLen != 1)
2418 return nullptr;
2419
2420 Type *EltTy = VT->getElementType();
2421 Type *NewVT = MaskLen == 1 ? EltTy : FixedVectorType::get(EltTy, MaskLen);
2422
2423 // Theoretically we should support these intrinsics for any legal type. Avoid
2424 // introducing cases that aren't direct register types like v3i16.
2425 if (!isTypeLegal(NewVT))
2426 return nullptr;
2427
2428 Value *Src = II.getArgOperand(0);
2429
2430 // Make sure convergence tokens are preserved.
2431 // TODO: CreateIntrinsic should allow directly copying bundles
2432 SmallVector<OperandBundleDef, 2> OpBundles;
2433 II.getOperandBundlesAsDefs(OpBundles);
2434
2435 Module *M = IC.Builder.GetInsertBlock()->getModule();
2436 Function *Remangled =
2437 Intrinsic::getOrInsertDeclaration(M, II.getIntrinsicID(), {NewVT});
2438
2439 if (MaskLen == 1) {
2440 Value *Extract = IC.Builder.CreateExtractElement(Src, FirstElt);
2441
2442 // TODO: Preserve callsite attributes?
2443 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2444
2445 return IC.Builder.CreateInsertElement(PoisonValue::get(II.getType()),
2446 NewCall, FirstElt);
2447 }
2448
2449 SmallVector<int> ExtractMask(MaskLen, -1);
2450 for (unsigned I = 0; I != MaskLen; ++I) {
2451 if (DemandedElts[FirstElt + I])
2452 ExtractMask[I] = FirstElt + I;
2453 }
2454
2455 Value *Extract = IC.Builder.CreateShuffleVector(Src, ExtractMask);
2456
2457 // TODO: Preserve callsite attributes?
2458 CallInst *NewCall = IC.Builder.CreateCall(Remangled, {Extract}, OpBundles);
2459
2460 SmallVector<int> InsertMask(OldNumElts, -1);
2461 for (unsigned I = 0; I != MaskLen; ++I) {
2462 if (DemandedElts[FirstElt + I])
2463 InsertMask[FirstElt + I] = I;
2464 }
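// E.g. (editorial sketch): a <4 x i32> readfirstlane with elements {1, 2}
// demanded has FirstElt = 1 and MaskLen = 2, giving ExtractMask = [1, 2]
// and InsertMask = [-1, 0, 1, -1] (poison in the undemanded positions).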
2465
2466 // FIXME: If the call has a convergence bundle, we end up leaving the dead
2467 // call behind.
2468 return IC.Builder.CreateShuffleVector(NewCall, InsertMask);
2469}
2470
2471 std::optional<Value *> GCNTTIImpl::simplifyDemandedVectorEltsIntrinsic(
2472 InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts,
2473 APInt &UndefElts2, APInt &UndefElts3,
2474 std::function<void(Instruction *, unsigned, APInt, APInt &)>
2475 SimplifyAndSetOp) const {
2476 switch (II.getIntrinsicID()) {
2477 case Intrinsic::amdgcn_readfirstlane:
2478 SimplifyAndSetOp(&II, 0, DemandedElts, UndefElts);
2479 return simplifyAMDGCNLaneIntrinsicDemanded(IC, II, DemandedElts, UndefElts);
2480 case Intrinsic::amdgcn_raw_buffer_load:
2481 case Intrinsic::amdgcn_raw_ptr_buffer_load:
2482 case Intrinsic::amdgcn_raw_buffer_load_format:
2483 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
2484 case Intrinsic::amdgcn_raw_tbuffer_load:
2485 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
2486 case Intrinsic::amdgcn_s_buffer_load:
2487 case Intrinsic::amdgcn_struct_buffer_load:
2488 case Intrinsic::amdgcn_struct_ptr_buffer_load:
2489 case Intrinsic::amdgcn_struct_buffer_load_format:
2490 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
2491 case Intrinsic::amdgcn_struct_tbuffer_load:
2492 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
2493 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts);
2494 default: {
2495 if (getAMDGPUImageDMaskIntrinsic(II.getIntrinsicID())) {
2496 return simplifyAMDGCNMemoryIntrinsicDemanded(IC, II, DemandedElts, 0);
2497 }
2498 break;
2499 }
2500 }
2501 return std::nullopt;
2502}