LLVM 23.0.0git
AMDGPUTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// \file
10// This file implements a TargetTransformInfo analysis pass specific to the
11// AMDGPU target machine. It uses the target's detailed information to provide
12// more precise answers to certain TTI queries, while letting the target
13// independent and default TTI implementations handle the rest.
14//
15//===----------------------------------------------------------------------===//
16
18#include "AMDGPUSubtarget.h"
19#include "AMDGPUTargetMachine.h"
27#include "llvm/IR/Function.h"
28#include "llvm/IR/IRBuilder.h"
29#include "llvm/IR/IntrinsicsAMDGPU.h"
33#include <optional>
34
35using namespace llvm;
36
37#define DEBUG_TYPE "AMDGPUtti"
38
// NOTE(review): this region is a doxygen text dump; the leading numbers are
// original source line numbers. The `static cl::opt<...> Name(` head of most
// option declarations was dropped by the extraction — restore from upstream.
// Where a name is visible elsewhere in this file it is noted below; otherwise
// the name is an assumption to be confirmed.

// Presumably UnrollThresholdPrivate (referenced at original line 135):
// unroll threshold when a loop addresses private (scratch) memory.
40 "amdgpu-unroll-threshold-private",
41 cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
42 cl::init(2700), cl::Hidden);
43
// Presumably UnrollThresholdLocal (referenced at original line 136):
// unroll threshold when a loop addresses local (LDS) memory.
45 "amdgpu-unroll-threshold-local",
46 cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
47 cl::init(1000), cl::Hidden);
48
// Per-"if" unroll threshold increment; variable name not visible here —
// presumably UnrollThresholdIf. TODO confirm against upstream.
50 "amdgpu-unroll-threshold-if",
51 cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
52 cl::init(200), cl::Hidden);
53
// Boolean switch allowing runtime unrolling for LDS loops — presumably
// UnrollRuntimeLocal; note cl::init(true).
55 "amdgpu-unroll-runtime-local",
56 cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
57 cl::init(true), cl::Hidden);
58
// Presumably UnrollMaxBlockToAnalyze (referenced at original line 271).
60 "amdgpu-unroll-max-block-to-analyze",
61 cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
62 cl::init(32), cl::Hidden);
63
// Inline-cost bonus per alloca-typed argument; this declaration is intact.
64static cl::opt<unsigned> ArgAllocaCost("amdgpu-inline-arg-alloca-cost",
65 cl::Hidden, cl::init(4000),
66 cl::desc("Cost of alloca argument"));
67
68// If the amount of scratch memory to eliminate exceeds our ability to allocate
69// it into registers we gain nothing by aggressively inlining functions for that
70// heuristic.
// Head lost — the continuation line below names the variable: ArgAllocaCutoff.
72 ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden,
73 cl::init(256),
74 cl::desc("Maximum alloca size to use for inline cost"));
75
76// Inliner constraint to achieve reasonable compilation time.
// Head lost — presumably `static cl::opt<size_t> InlineMaxBB(`; confirm.
78 "amdgpu-inline-max-bb", cl::Hidden, cl::init(1100),
79 cl::desc("Maximum number of BBs allowed in a function after inlining"
80 " (compile time constraint)"));
81
82// This default unroll factor is based on microbenchmarks on gfx1030.
84 "amdgpu-memcpy-loop-unroll",
85 cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory "
86 "operations when lowering statically-sized memcpy, memmove, or"
87 "memset as a loop"),
88 cl::init(16), cl::Hidden);
89
90static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
91 unsigned Depth = 0) {
93 if (!I)
94 return false;
95
96 for (const Value *V : I->operand_values()) {
97 if (!L->contains(I))
98 continue;
99 if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
100 if (llvm::none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
101 return SubLoop->contains(PHI); }))
102 return true;
103 } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
104 return true;
105 }
106 return false;
107}
108
// AMDGPUTTIImpl constructor: caches the target triple, the GCN subtarget for
// function F, and its TargetLowering.
// NOTE(review): the signature line (original line 109) was lost in
// extraction; only the member-initializer list survives below.
110 : BaseT(TM, F.getDataLayout()),
111 TargetTriple(TM->getTargetTriple()),
112 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
113 TLI(ST->getTargetLowering()) {}
114
// AMDGPUTTIImpl::getUnrollingPreferences: tune loop-unrolling thresholds for
// AMDGPU. Boosts the threshold for loops whose "if"s depend on loop PHIs and
// for loops addressing private (scratch) or local (LDS) memory, so SROA /
// DS-combining get a better chance after unrolling.
// NOTE(review): doxygen extraction dropped the signature (original lines
// 115-116) and several interior lines; each gap is marked below.
117 OptimizationRemarkEmitter *ORE) const {
118 const Function &F = *L->getHeader()->getParent();
// Default threshold 300, overridable per-function via attribute.
119 UP.Threshold =
120 F.getFnAttributeAsParsedInteger("amdgpu-unroll-threshold", 300);
121 UP.MaxCount = std::numeric_limits<unsigned>::max();
122 UP.Partial = true;
123
124 // Conditional branch in a loop back edge needs 3 additional exec
125 // manipulations in average.
126 UP.BEInsns += 3;
127
128 // We want to run unroll even for the loops which have been vectorized.
129 UP.UnrollVectorizedLoop = true;
130
131 // TODO: Do we want runtime unrolling?
132
133 // Maximum alloca size that can fit registers. Reserve 16 registers.
134 const unsigned MaxAlloca = (256 - 16) * 4;
135 unsigned ThresholdPrivate = UnrollThresholdPrivate;
136 unsigned ThresholdLocal = UnrollThresholdLocal;
137
138 // If this loop has the amdgpu.loop.unroll.threshold metadata we will use the
139 // provided threshold value as the default for Threshold
140 if (MDNode *LoopUnrollThreshold =
141 findOptionMDForLoop(L, "amdgpu.loop.unroll.threshold")) {
142 if (LoopUnrollThreshold->getNumOperands() == 2) {
// NOTE(review): original line 143 (mdconst extraction of MetaThresholdValue)
// lost in extraction.
144 LoopUnrollThreshold->getOperand(1));
145 if (MetaThresholdValue) {
146 // We will also use the supplied value for PartialThreshold for now.
147 // We may introduce additional metadata if it becomes necessary in the
148 // future.
149 UP.Threshold = MetaThresholdValue->getSExtValue();
// NOTE(review): original line 150 (presumably UP.PartialThreshold
// assignment) lost in extraction.
151 ThresholdPrivate = std::min(ThresholdPrivate, UP.Threshold);
152 ThresholdLocal = std::min(ThresholdLocal, UP.Threshold);
153 }
154 }
155 }
156
157 unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
158 for (const BasicBlock *BB : L->getBlocks()) {
159 const DataLayout &DL = BB->getDataLayout();
160 unsigned LocalGEPsSeen = 0;
161
162 if (llvm::any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
163 return SubLoop->contains(BB); }))
164 continue; // Block belongs to an inner loop.
165
166 for (const Instruction &I : *BB) {
167 // Unroll a loop which contains an "if" statement whose condition
168 // defined by a PHI belonging to the loop. This may help to eliminate
169 // if region and potentially even PHI itself, saving on both divergence
170 // and registers used for the PHI.
171 // Add a small bonus for each of such "if" statements.
// NOTE(review): `CondBrInst` is not a standard LLVM class name (upstream
// uses BranchInst here) — verify against upstream before trusting this dump.
172 if (const CondBrInst *Br = dyn_cast<CondBrInst>(&I)) {
173 if (UP.Threshold < MaxBoost) {
174 BasicBlock *Succ0 = Br->getSuccessor(0);
175 BasicBlock *Succ1 = Br->getSuccessor(1);
176 if ((L->contains(Succ0) && L->isLoopExiting(Succ0)) ||
177 (L->contains(Succ1) && L->isLoopExiting(Succ1)))
178 continue;
179 if (dependsOnLocalPhi(L, Br->getCondition())) {
// NOTE(review): original line 180 (threshold increment by
// UnrollThresholdIf) lost in extraction.
181 LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
182 << " for loop:\n"
183 << *L << " due to " << *Br << '\n');
184 if (UP.Threshold >= MaxBoost)
185 return;
186 }
187 }
188 continue;
189 }
190
// NOTE(review): original line 191 (dyn_cast of &I to a GEP) lost.
192 if (!GEP)
193 continue;
194
195 unsigned AS = GEP->getAddressSpace();
196 unsigned Threshold = 0;
// NOTE(review): original lines 197/199 (the AS == PRIVATE_ADDRESS /
// LOCAL_ADDRESS-or-REGION conditions) lost in extraction.
198 Threshold = ThresholdPrivate;
200 Threshold = ThresholdLocal;
201 else
202 continue;
203
204 if (UP.Threshold >= Threshold)
205 continue;
206
207 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
208 const Value *Ptr = GEP->getPointerOperand();
209 const AllocaInst *Alloca =
// NOTE(review): original line 210 (getUnderlyingObject-based initializer)
// lost in extraction.
211 if (!Alloca || !Alloca->isStaticAlloca())
212 continue;
213 auto AllocaSize = Alloca->getAllocationSize(DL);
214 if (!AllocaSize || AllocaSize->getFixedValue() > MaxAlloca)
215 continue;
216 } else if (AS == AMDGPUAS::LOCAL_ADDRESS ||
// NOTE(review): original line 217 (REGION_ADDRESS alternative) lost.
218 LocalGEPsSeen++;
219 // Inhibit unroll for local memory if we have seen addressing not to
220 // a variable, most likely we will be unable to combine it.
221 // Do not unroll too deep inner loops for local memory to give a chance
222 // to unroll an outer loop for a more important reason.
223 if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
224 (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
225 !isa<Argument>(GEP->getPointerOperand())))
226 continue;
227 LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
228 << *L << " due to LDS use.\n");
// NOTE(review): original line 229 (UP.Runtime |= UnrollRuntimeLocal,
// presumably) lost in extraction.
230 }
231
232 // Check if GEP depends on a value defined by this loop itself.
233 bool HasLoopDef = false;
234 for (const Value *Op : GEP->operands()) {
235 const Instruction *Inst = dyn_cast<Instruction>(Op);
236 if (!Inst || L->isLoopInvariant(Op))
237 continue;
238
239 if (llvm::any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
240 return SubLoop->contains(Inst); }))
241 continue;
242 HasLoopDef = true;
243 break;
244 }
245 if (!HasLoopDef)
246 continue;
247
248 // We want to do whatever we can to limit the number of alloca
249 // instructions that make it through to the code generator. allocas
250 // require us to use indirect addressing, which is slow and prone to
251 // compiler bugs. If this loop does an address calculation on an
252 // alloca ptr, then we want to use a higher than normal loop unroll
253 // threshold. This will give SROA a better chance to eliminate these
254 // allocas.
255 //
256 // We also want to have more unrolling for local memory to let ds
257 // instructions with different offsets combine.
258 //
259 // Don't use the maximum allowed value here as it will make some
260 // programs way too big.
261 UP.Threshold = Threshold;
262 LLVM_DEBUG(dbgs() << "Set unroll threshold " << Threshold
263 << " for loop:\n"
264 << *L << " due to " << *GEP << '\n');
265 if (UP.Threshold >= MaxBoost)
266 return;
267 }
268
269 // If we got a GEP in a small BB from inner loop then increase max trip
270 // count to analyze for better estimation cost in unroll
271 if (L->isInnermost() && BB->size() < UnrollMaxBlockToAnalyze)
// NOTE(review): original line 272 (UP.MaxIterationsCountToAnalyze
// assignment, presumably) lost in extraction.
273 }
274 // If a user provided an explicit unroll pragma (with or without count),
275 // override expensive trip count checks
276 UnrollPragmaInfo PInfo(L);
277 if (PInfo.PragmaEnableUnroll || PInfo.PragmaCount > 0)
278 UP.AllowExpensiveTripCount = true;
279}
280
285
289
// Subtarget features that are allowed to differ between caller and callee
// without blocking inlining: codegen tuning knobs, properties that cannot
// actually differ within one environment, and perf-only features.
290const FeatureBitset GCNTTIImpl::InlineFeatureIgnoreList = {
291 // Codegen control options which don't matter.
292 AMDGPU::FeatureEnableLoadStoreOpt, AMDGPU::FeatureEnableSIScheduler,
293 AMDGPU::FeatureEnableUnsafeDSOffsetFolding, AMDGPU::FeatureUseFlatForGlobal,
294 AMDGPU::FeatureUnalignedScratchAccess, AMDGPU::FeatureUnalignedAccessMode,
295
296 AMDGPU::FeatureAutoWaitcntBeforeBarrier,
297
298 // Property of the kernel/environment which can't actually differ.
299 AMDGPU::FeatureSGPRInitBug, AMDGPU::FeatureXNACK,
300 AMDGPU::FeatureTrapHandler,
301
302 // The default assumption needs to be ecc is enabled, but no directly
303 // exposed operations depend on it, so it can be safely inlined.
304 AMDGPU::FeatureSRAMECC,
305
306 // Perf-tuning features
307 AMDGPU::FeatureFastFMAF32, AMDGPU::FeatureHalfRate64Ops};
308
// GCNTTIImpl constructor: caches the subtarget, lowering info, a common
// AMDGPU TTI helper, and per-function denormal-mode flags.
// NOTE(review): the signature (original line 309) and the declaration of
// `Mode` (original line 314, presumably `SIModeRegisterDefaults Mode(F, *ST);`)
// were lost in extraction.
310 : BaseT(TM, F.getDataLayout()),
311 ST(static_cast<const GCNSubtarget *>(TM->getSubtargetImpl(F))),
312 TLI(ST->getTargetLowering()), CommonTTI(TM, F),
313 IsGraphics(AMDGPU::isGraphics(F.getCallingConv())) {
// Denormals "enabled" here means anything other than preserve-sign flushing.
315 HasFP32Denormals = Mode.FP32Denormals != DenormalMode::getPreserveSign();
316 HasFP64FP16Denormals =
317 Mode.FP64FP16Denormals != DenormalMode::getPreserveSign();
318}
319
// hasBranchDivergence: divergence is assumed unless the function is known to
// execute in a single lane (null F conservatively reports divergence).
// NOTE(review): signature line (original line 320) lost in extraction.
321 return !F || !ST->isSingleLaneExecution(*F);
322}
323
324unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
325 // NB: RCID is not an RCID. In fact it is 0 or 1 for scalar or vector
326 // registers. See getRegisterClassForType for the implementation.
327 // In this case vector registers are not vector in terms of
328 // VGPRs, but those which can hold multiple values.
329
330 // This is really the number of registers to fill when vectorizing /
331 // interleaving loops, so we lie to avoid trying to use all registers.
332 return 4;
333}
334
// getRegisterBitWidth: 32-bit scalar registers; fixed vectors are 64-bit when
// packed FP32 ops exist, else 32-bit; no scalable vectors.
// NOTE(review): the signature (original line 336) and the three `case
// TargetTransformInfo::RGK_*:` labels (original lines 338/340/342) were lost
// in extraction — each `return` below belongs to one of those cases.
337 switch (K) {
339 return TypeSize::getFixed(32);
341 return TypeSize::getFixed(ST->hasPackedFP32Ops() ? 64 : 32);
343 return TypeSize::getScalable(0);
344 }
345 llvm_unreachable("Unsupported register kind");
346}
347
// Tail of a function whose head (original line 348) was lost in extraction —
// by position, presumably GCNTTIImpl::getMinVectorRegisterBitWidth; confirm
// against upstream. Always reports 32 bits.
349 return 32;
350}
351
352unsigned GCNTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
353 if (Opcode == Instruction::Load || Opcode == Instruction::Store)
354 return 32 * 4 / ElemWidth;
355 // For a given width return the max 0number of elements that can be combined
356 // into a wider bit value:
357 return (ElemWidth == 8 && ST->has16BitInsts()) ? 4
358 : (ElemWidth == 16 && ST->has16BitInsts()) ? 2
359 : (ElemWidth == 32 && ST->hasPackedFP32Ops()) ? 2
360 : 1;
361}
362
363unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
364 unsigned ChainSizeInBytes,
365 VectorType *VecTy) const {
366 unsigned VecRegBitWidth = VF * LoadSize;
367 if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
368 // TODO: Support element-size less than 32bit?
369 return 128 / LoadSize;
370
371 return VF;
372}
373
374unsigned GCNTTIImpl::getStoreVectorFactor(unsigned VF, unsigned StoreSize,
375 unsigned ChainSizeInBytes,
376 VectorType *VecTy) const {
377 unsigned VecRegBitWidth = VF * StoreSize;
378 if (VecRegBitWidth > 128)
379 return 128 / StoreSize;
380
381 return VF;
382}
383
// getLoadStoreVecRegBitWidth: widest load/store (in bits) to vectorize into
// for a given address space. 512 for global-like spaces, limited by the
// subtarget's max scratch element size for private, 128 otherwise.
// NOTE(review): original lines 387 and 390 (two more `AddrSpace ==
// AMDGPUAS::...` alternatives of this condition) were lost in extraction.
384unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
385 if (AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ||
386 AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
388 AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
389 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
391 return 512;
392 }
393
394 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
395 return 8 * ST->getMaxPrivateElementSize();
396
397 // Common to flat, global, local and region. Assume for unknown addrspace.
398 return 128;
399}
400
401bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
402 Align Alignment,
403 unsigned AddrSpace) const {
404 // We allow vectorization of flat stores, even though we may need to decompose
405 // them later if they may access private memory. We don't have enough context
406 // here, and legalization can handle it.
407 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS) {
408 return (Alignment >= 4 || ST->hasUnalignedScratchAccessEnabled()) &&
409 ChainSizeInBytes <= ST->getMaxPrivateElementSize();
410 }
411 return true;
412}
413
414bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
415 Align Alignment,
416 unsigned AddrSpace) const {
417 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
418}
419
420bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
421 Align Alignment,
422 unsigned AddrSpace) const {
423 return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
424}
425
429
// GCNTTIImpl::getMemcpyLoopLoweringType: pick the per-iteration access type
// for lowering memcpy/memmove/memset as a loop. Atomic element size forces a
// matching integer type; otherwise a <N x i32> vector is returned.
// NOTE(review): the signature (original line 430) and original lines 450-451
// (the ConstantInt-length check that widens I32EltsInVector by
// MemcpyLoopUnroll) were lost in extraction.
431 LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
432 unsigned DestAddrSpace, Align SrcAlign, Align DestAlign,
433 std::optional<uint32_t> AtomicElementSize) const {
434
435 if (AtomicElementSize)
436 return Type::getIntNTy(Context, *AtomicElementSize * 8);
437
438 // 16-byte accesses achieve the highest copy throughput.
439 // If the operation has a fixed known length that is large enough, it is
440 // worthwhile to return an even wider type and let legalization lower it into
441 // multiple accesses, effectively unrolling the memcpy loop.
442 // We also rely on legalization to decompose into smaller accesses for
443 // subtargets and address spaces where it is necessary.
444 //
445 // Don't unroll if Length is not a constant, since unrolling leads to worse
446 // performance for length values that are smaller or slightly larger than the
447 // total size of the type returned here. Mitigating that would require a more
448 // complex lowering for variable-length memcpy and memmove.
449 unsigned I32EltsInVector = 4;
452 MemcpyLoopUnroll * I32EltsInVector);
453
454 return FixedVectorType::get(Type::getInt32Ty(Context), I32EltsInVector);
455}
456
// GCNTTIImpl::getMemcpyLoopResidualLoweringType: emit the sequence of types
// used to copy the residual bytes after the main memcpy loop — greedily
// 16-byte <4 x i32> chunks, then i64, i32, i16, and finally single bytes.
// NOTE(review): the signature (original line 457) and original line 464 (the
// `BaseT::getMemcpyLoopResidualLoweringType(` call head for the atomic case)
// were lost in extraction.
458 SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
459 unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
460 Align SrcAlign, Align DestAlign,
461 std::optional<uint32_t> AtomicCpySize) const {
462
463 if (AtomicCpySize)
465 OpsOut, Context, RemainingBytes, SrcAddrSpace, DestAddrSpace, SrcAlign,
466 DestAlign, AtomicCpySize);
467
468 Type *I32x4Ty = FixedVectorType::get(Type::getInt32Ty(Context), 4);
469 while (RemainingBytes >= 16) {
470 OpsOut.push_back(I32x4Ty);
471 RemainingBytes -= 16;
472 }
473
474 Type *I64Ty = Type::getInt64Ty(Context);
475 while (RemainingBytes >= 8) {
476 OpsOut.push_back(I64Ty);
477 RemainingBytes -= 8;
478 }
479
480 Type *I32Ty = Type::getInt32Ty(Context);
481 while (RemainingBytes >= 4) {
482 OpsOut.push_back(I32Ty);
483 RemainingBytes -= 4;
484 }
485
486 Type *I16Ty = Type::getInt16Ty(Context);
487 while (RemainingBytes >= 2) {
488 OpsOut.push_back(I16Ty);
489 RemainingBytes -= 2;
490 }
491
492 Type *I8Ty = Type::getInt8Ty(Context);
493 while (RemainingBytes) {
494 OpsOut.push_back(I8Ty);
495 --RemainingBytes;
496 }
497}
498
// Interleave factor: 1 for scalar (non-vectorized) loops, 8 otherwise.
// NOTE(review): the signature (original line 499, presumably
// `GCNTTIImpl::getMaxInterleaveFactor(ElementCount VF) const {`) was lost in
// extraction — confirm against upstream.
500 // Disable unrolling if the loop is not vectorized.
501 // TODO: Enable this again.
502 if (VF.isScalar())
503 return 1;
504
505 return 8;
506}
507
// GCNTTIImpl::getTgtMemIntrinsic: describe the memory behavior of target
// intrinsics. Only amdgcn.ds.ordered.{add,swap} are handled: their ordering
// (arg 2) and volatile (arg 4) operands must be constants, and the ordering
// must not exceed SequentiallyConsistent. Returns true and fills Info on
// success; false for anything else.
// NOTE(review): the signature's first line (original line 508) was lost in
// extraction.
509 MemIntrinsicInfo &Info) const {
510 switch (Inst->getIntrinsicID()) {
511 case Intrinsic::amdgcn_ds_ordered_add:
512 case Intrinsic::amdgcn_ds_ordered_swap: {
513 auto *Ordering = dyn_cast<ConstantInt>(Inst->getArgOperand(2));
514 auto *Volatile = dyn_cast<ConstantInt>(Inst->getArgOperand(4));
515 if (!Ordering || !Volatile)
516 return false; // Invalid.
517
518 unsigned OrderingVal = Ordering->getZExtValue();
519 if (OrderingVal > static_cast<unsigned>(AtomicOrdering::SequentiallyConsistent))
520 return false;
521
522 Info.PtrVal = Inst->getArgOperand(0);
523 Info.Ordering = static_cast<AtomicOrdering>(OrderingVal);
// These intrinsics both read and write LDS memory.
524 Info.ReadMem = true;
525 Info.WriteMem = true;
526 Info.IsVolatile = !Volatile->isZero();
527 return true;
528 }
529 default:
530 return false;
531 }
532}
533
// GCNTTIImpl::getArithmeticInstrCost: per-opcode cost model based on GCN
// instruction issue rates (full/half/quarter/transcendental rate), with
// legalized-type splitting (LT.first) and packed-16-bit/packed-f32 halving
// of the element count.
// NOTE(review): extraction lost the signature (original lines 534/536, the
// latter carrying the Op1Info/Op2Info parameters used at the bottom) and
// original lines 598/600/606 (presumably the `return
// TargetTransformInfo::TCC_Free;` results of the FMA-fusion checks).
535 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
537 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
538
539 // Legalize the type.
540 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
541 int ISD = TLI->InstructionOpcodeToISD(Opcode);
542
543 // Because we don't have any legal vector operations, but the legal types, we
544 // need to account for split vectors.
545 unsigned NElts = LT.second.isVector() ?
546 LT.second.getVectorNumElements() : 1;
547
548 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
549
550 switch (ISD) {
551 case ISD::SHL:
552 case ISD::SRL:
553 case ISD::SRA:
554 if (SLT == MVT::i64)
555 return get64BitInstrCost(CostKind) * LT.first * NElts;
556
// 16-bit ops can be packed two-per-instruction on subtargets with 16-bit
// instructions.
557 if (ST->has16BitInsts() && SLT == MVT::i16)
558 NElts = (NElts + 1) / 2;
559
560 // i32
561 return getFullRateInstrCost() * LT.first * NElts;
562 case ISD::ADD:
563 case ISD::SUB:
564 case ISD::AND:
565 case ISD::OR:
566 case ISD::XOR:
567 if (SLT == MVT::i64) {
568 // and, or and xor are typically split into 2 VALU instructions.
569 return 2 * getFullRateInstrCost() * LT.first * NElts;
570 }
571
572 if (ST->has16BitInsts() && SLT == MVT::i16)
573 NElts = (NElts + 1) / 2;
574
575 return LT.first * NElts * getFullRateInstrCost();
576 case ISD::MUL: {
577 const int QuarterRateCost = getQuarterRateInstrCost(CostKind);
578 if (SLT == MVT::i64) {
579 const int FullRateCost = getFullRateInstrCost();
// 64-bit multiply decomposes into 4 quarter-rate and 4 full-rate pieces.
580 return (4 * QuarterRateCost + (2 * 2) * FullRateCost) * LT.first * NElts;
581 }
582
583 if (ST->has16BitInsts() && SLT == MVT::i16)
584 NElts = (NElts + 1) / 2;
585
586 // i32
587 return QuarterRateCost * NElts * LT.first;
588 }
589 case ISD::FMUL:
590 // Check possible fuse {fadd|fsub}(a,fmul(b,c)) and return zero cost for
591 // fmul(b,c) supposing the fadd|fsub will get estimated cost for the whole
592 // fused operation.
593 if (CxtI && CxtI->hasOneUse())
594 if (const auto *FAdd = dyn_cast<BinaryOperator>(*CxtI->user_begin())) {
595 const int OPC = TLI->InstructionOpcodeToISD(FAdd->getOpcode());
596 if (OPC == ISD::FADD || OPC == ISD::FSUB) {
597 if (ST->hasMadMacF32Insts() && SLT == MVT::f32 && !HasFP32Denormals)
599 if (ST->has16BitInsts() && SLT == MVT::f16 && !HasFP64FP16Denormals)
601
602 // Estimate all types may be fused with contract/unsafe flags
603 const TargetOptions &Options = TLI->getTargetMachine().Options;
604 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
605 (FAdd->hasAllowContract() && CxtI->hasAllowContract()))
607 }
608 }
// No fusion opportunity — fall into the generic FADD/FSUB costing below.
609 [[fallthrough]];
610 case ISD::FADD:
611 case ISD::FSUB:
612 if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
613 NElts = (NElts + 1) / 2;
614 if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
615 NElts = (NElts + 1) / 2;
616 if (SLT == MVT::f64)
617 return LT.first * NElts * get64BitInstrCost(CostKind);
618
619 if (ST->has16BitInsts() && SLT == MVT::f16)
620 NElts = (NElts + 1) / 2;
621
622 if (SLT == MVT::f32 || SLT == MVT::f16 || SLT == MVT::bf16)
623 return LT.first * NElts * getFullRateInstrCost();
624 break;
625 case ISD::FDIV:
626 case ISD::FREM:
627 // FIXME: frem should be handled separately. The fdiv in it is most of it,
628 // but the current lowering is also not entirely correct.
629 if (SLT == MVT::f64) {
630 int Cost = 7 * get64BitInstrCost(CostKind) +
631 getQuarterRateInstrCost(CostKind) +
632 3 * getHalfRateInstrCost(CostKind);
633 // Add cost of workaround.
634 if (!ST->hasUsableDivScaleConditionOutput())
635 Cost += 3 * getFullRateInstrCost();
636
637 return LT.first * Cost * NElts;
638 }
639
// Reciprocal special case: 1.0 / x lowers to a single transcendental rcp.
640 if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
641 // TODO: This is more complicated, unsafe flags etc.
642 if ((SLT == MVT::f32 && !HasFP32Denormals) ||
643 (SLT == MVT::f16 && ST->has16BitInsts())) {
644 return LT.first * getTransInstrCost(CostKind) * NElts;
645 }
646 }
647
648 if (SLT == MVT::f16 && ST->has16BitInsts()) {
649 // 2 x v_cvt_f32_f16
650 // f32 rcp
651 // f32 fmul
652 // v_cvt_f16_f32
653 // f16 div_fixup
654 int Cost = 4 * getFullRateInstrCost() + 2 * getTransInstrCost(CostKind);
655 return LT.first * Cost * NElts;
656 }
657
658 if (SLT == MVT::f32 && (CxtI && CxtI->hasApproxFunc())) {
659 // Fast unsafe fdiv lowering:
660 // f32 rcp
661 // f32 fmul
662 int Cost = getTransInstrCost(CostKind) + getFullRateInstrCost();
663 return LT.first * Cost * NElts;
664 }
665
666 if (SLT == MVT::f32 || SLT == MVT::f16) {
667 // 4 more v_cvt_* insts without f16 insts support
668 int Cost = (SLT == MVT::f16 ? 14 : 10) * getFullRateInstrCost() +
669 1 * getTransInstrCost(CostKind);
670
671 if (!HasFP32Denormals) {
672 // FP mode switches.
673 Cost += 2 * getFullRateInstrCost();
674 }
675
676 return LT.first * NElts * Cost;
677 }
678 break;
679 case ISD::FNEG:
680 // Use the backend's estimation. If fneg is not free each element will cost
681 // one additional instruction.
682 return TLI->isFNegFree(SLT) ? 0 : NElts;
683 default:
684 break;
685 }
686
687 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
688 Args, CxtI);
689}
690
691// Return true if there's a potential benefit from using v2f16/v2i16
692// instructions for an intrinsic, even if it requires nontrivial legalization.
// NOTE(review): the signature (original line 693, presumably `static bool
// intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {`) was lost in
// extraction.
694 switch (ID) {
695 case Intrinsic::fma:
696 case Intrinsic::fmuladd:
697 case Intrinsic::copysign:
698 case Intrinsic::minimumnum:
699 case Intrinsic::maximumnum:
700 case Intrinsic::canonicalize:
701 // There's a small benefit to using vector ops in the legalized code.
702 case Intrinsic::round:
703 case Intrinsic::uadd_sat:
704 case Intrinsic::usub_sat:
705 case Intrinsic::sadd_sat:
706 case Intrinsic::ssub_sat:
707 case Intrinsic::abs:
708 return true;
709 default:
710 return false;
711 }
712}
713
// GCNTTIImpl::getIntrinsicInstrCost: intrinsic cost model. ID/ABI-register
// reads and fabs are free; exp/log/sin/cos/sqrt f32 costs mirror the
// instruction counts of the corresponding lowerings; remaining intrinsics
// with a packed-vector benefit get rate-based costs.
// NOTE(review): extraction lost the signature (original lines 714-716) and
// several interior lines; each gap is marked below.
717 switch (ICA.getID()) {
718 case Intrinsic::fabs:
719 // Free source modifier in the common case.
720 return 0;
721 case Intrinsic::amdgcn_workitem_id_x:
722 case Intrinsic::amdgcn_workitem_id_y:
723 case Intrinsic::amdgcn_workitem_id_z:
724 // TODO: If hasPackedTID, or if the calling context is not an entry point
725 // there may be a bit instruction.
726 return 0;
727 case Intrinsic::amdgcn_workgroup_id_x:
728 case Intrinsic::amdgcn_workgroup_id_y:
729 case Intrinsic::amdgcn_workgroup_id_z:
730 case Intrinsic::amdgcn_lds_kernel_id:
731 case Intrinsic::amdgcn_dispatch_ptr:
732 case Intrinsic::amdgcn_dispatch_id:
733 case Intrinsic::amdgcn_implicitarg_ptr:
734 case Intrinsic::amdgcn_queue_ptr:
735 // Read from an argument register.
736 return 0;
737 default:
738 break;
739 }
740
741 Type *RetTy = ICA.getReturnType();
742
743 Intrinsic::ID IID = ICA.getID();
744 switch (IID) {
745 case Intrinsic::exp:
746 case Intrinsic::exp2:
747 case Intrinsic::exp10: {
748 // Legalize the type.
749 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
750 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
751 unsigned NElts =
752 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
753
754 if (SLT == MVT::f64) {
755 unsigned NumOps = 20;
756 if (IID == Intrinsic::exp)
757 ++NumOps;
758 else if (IID == Intrinsic::exp10)
759 NumOps += 3;
760
761 return LT.first * NElts * NumOps * get64BitInstrCost(CostKind);
762 }
763
764 if (SLT == MVT::f32) {
765 unsigned NumFullRateOps = 0;
766 // v_exp_f32 (transcendental).
767 unsigned NumTransOps = 1;
768
769 if (!ICA.getFlags().approxFunc() && IID != Intrinsic::exp2) {
770 // Non-AFN exp/exp10: range reduction + v_exp_f32 + ldexp +
771 // overflow/underflow checks (lowerFEXP). Denorm is also handled.
772 // FMA preamble: ~13 full-rate ops; non-FMA: ~17.
773 NumFullRateOps = ST->hasFastFMAF32() ? 13 : 17;
774 } else {
775 if (IID == Intrinsic::exp) {
776 // lowerFEXPUnsafe: fmul (base conversion) + v_exp_f32.
777 NumFullRateOps = 1;
778 } else if (IID == Intrinsic::exp10) {
779 // lowerFEXP10Unsafe: 3 fmul + 2 v_exp_f32 (double-exp2).
780 NumFullRateOps = 3;
781 NumTransOps = 2;
782 }
783 // Denorm scaling adds setcc + select + fadd + select + fmul.
784 if (HasFP32Denormals)
785 NumFullRateOps += 5;
786 }
787
788 InstructionCost Cost = NumFullRateOps * getFullRateInstrCost() +
789 NumTransOps * getTransInstrCost(CostKind);
790 return LT.first * NElts * Cost;
791 }
792
793 break;
794 }
795 case Intrinsic::log:
796 case Intrinsic::log2:
797 case Intrinsic::log10: {
798 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
799 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
800 unsigned NElts =
801 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
802
803 if (SLT == MVT::f32) {
804 unsigned NumFullRateOps = 0;
805
806 if (IID == Intrinsic::log2) {
807 // LowerFLOG2: just v_log_f32.
808 } else if (ICA.getFlags().approxFunc()) {
809 // LowerFLOGUnsafe: v_log_f32 + fmul (base conversion).
810 NumFullRateOps = 1;
811 } else {
812 // LowerFLOGCommon non-AFN: v_log_f32 + extended-precision
813 // multiply + finite check.
814 NumFullRateOps = ST->hasFastFMAF32() ? 8 : 11;
815 }
816
817 if (HasFP32Denormals)
818 NumFullRateOps += 5;
819
// NOTE(review): original line 820 (presumably `InstructionCost Cost =`)
// lost in extraction.
821 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
822 return LT.first * NElts * Cost;
823 }
824
825 break;
826 }
827 case Intrinsic::sin:
828 case Intrinsic::cos: {
829 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
830 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
831 unsigned NElts =
832 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
833
834 if (SLT == MVT::f32) {
835 // LowerTrig: fmul(1/2pi) + v_sin/v_cos.
836 unsigned NumFullRateOps = ST->hasTrigReducedRange() ? 2 : 1;
837
// NOTE(review): original line 838 (presumably `InstructionCost Cost =`)
// lost in extraction.
839 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
840 return LT.first * NElts * Cost;
841 }
842
843 break;
844 }
845 case Intrinsic::sqrt: {
846 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
847 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
848 unsigned NElts =
849 LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
850
851 if (SLT == MVT::f32) {
852 unsigned NumFullRateOps = 0;
853
854 if (!ICA.getFlags().approxFunc()) {
855 // lowerFSQRTF32 non-AFN: v_sqrt_f32 + refinement + scale fixup.
856 NumFullRateOps = HasFP32Denormals ? 17 : 16;
857 }
858
// NOTE(review): original line 859 (presumably `InstructionCost Cost =`)
// lost in extraction.
860 NumFullRateOps * getFullRateInstrCost() + getTransInstrCost(CostKind);
861 return LT.first * NElts * Cost;
862 }
863
864 break;
865 }
866 default:
867 break;
868 }
869
// NOTE(review): original lines 870-871 (presumably the
// intrinsicHasPackedVectorBenefit guard returning
// BaseT::getIntrinsicInstrCost) lost in extraction.
872
873 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
874 MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
875 unsigned NElts = LT.second.isVector() ? LT.second.getVectorNumElements() : 1;
876
// Packable 16-bit (and packed-f32) types process two elements per op.
877 if ((ST->hasVOP3PInsts() &&
878 (SLT == MVT::f16 || SLT == MVT::i16 ||
879 (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
880 (ST->hasPackedFP32Ops() && SLT == MVT::f32))
881 NElts = (NElts + 1) / 2;
882
883 // TODO: Get more refined intrinsic costs?
884 unsigned InstRate = getQuarterRateInstrCost(CostKind);
885
886 switch (ICA.getID()) {
887 case Intrinsic::fma:
888 case Intrinsic::fmuladd:
889 if (SLT == MVT::f64) {
890 InstRate = get64BitInstrCost(CostKind);
891 break;
892 }
893
894 if ((SLT == MVT::f32 && ST->hasFastFMAF32()) || SLT == MVT::f16)
895 InstRate = getFullRateInstrCost();
896 else {
897 InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost(CostKind)
898 : getQuarterRateInstrCost(CostKind);
899 }
900 break;
901 case Intrinsic::copysign:
902 return NElts * getFullRateInstrCost();
903 case Intrinsic::minimumnum:
904 case Intrinsic::maximumnum: {
905 // Instruction + 2 canonicalizes. For cases that need type promotion, the
906 // promotion takes the place of the canonicalize.
907 unsigned NumOps = 3;
908 if (const IntrinsicInst *II = ICA.getInst()) {
909 // Directly legal with ieee=0
910 // TODO: Not directly legal with strictfp
// NOTE(review): original line 911 (the ieee=0 condition guarding this
// assignment) lost in extraction.
912 NumOps = 1;
913 }
914
915 unsigned BaseRate =
916 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
917 InstRate = BaseRate * NumOps;
918 break;
919 }
920 case Intrinsic::canonicalize: {
921 InstRate =
922 SLT == MVT::f64 ? get64BitInstrCost(CostKind) : getFullRateInstrCost();
923 break;
924 }
925 case Intrinsic::uadd_sat:
926 case Intrinsic::usub_sat:
927 case Intrinsic::sadd_sat:
928 case Intrinsic::ssub_sat: {
929 if (SLT == MVT::i16 || SLT == MVT::i32)
930 InstRate = getFullRateInstrCost();
931
932 static const auto ValidSatTys = {MVT::v2i16, MVT::v4i16};
933 if (any_of(ValidSatTys, equal_to(LT.second)))
934 NElts = 1;
935 break;
936 }
937 case Intrinsic::abs:
938 // Expansion takes 2 instructions for VALU
939 if (SLT == MVT::i16 || SLT == MVT::i32)
940 InstRate = 2 * getFullRateInstrCost();
941 break;
942 default:
943 break;
944 }
945
946 return LT.first * NElts * InstRate;
947}
948
// GCNTTIImpl::getCFInstrCost: control-flow instruction costs; SCost
// (size/latency cost kinds) uses cheaper estimates than throughput.
// NOTE(review): extraction lost the signature (original lines 949-950) and
// line 955 (the cost-kind comparison initializing SCost). Also note:
// `Instruction::UncondBr` / `Instruction::CondBr` are not standard LLVM
// opcode names (upstream switches on `Instruction::Br`) — verify this dump
// against upstream before relying on it.
951 const Instruction *I) const {
952 assert((I == nullptr || I->getOpcode() == Opcode) &&
953 "Opcode should reflect passed instruction.");
954 const bool SCost =
956 const int CBrCost = SCost ? 5 : 7;
957 switch (Opcode) {
958 case Instruction::UncondBr:
959 // Branch instruction takes about 4 slots on gfx900.
960 return SCost ? 1 : 4;
961 case Instruction::CondBr:
962 // Suppose conditional branch takes additional 3 exec manipulations
963 // instructions in average.
964 return CBrCost;
965 case Instruction::Switch: {
966 const auto *SI = dyn_cast_or_null<SwitchInst>(I);
967 // Each case (including default) takes 1 cmp + 1 cbr instructions in
968 // average.
969 return (SI ? (SI->getNumCases() + 1) : 4) * (CBrCost + 1);
970 }
971 case Instruction::Ret:
972 return SCost ? 1 : 10;
973 }
974 return BaseT::getCFInstrCost(Opcode, CostKind, I);
975}
976
// GCNTTIImpl::getArithmeticReductionCost: cheaper reduction costing only for
// 16-bit element types on subtargets with packed (VOP3P) math; everything
// else defers to the base implementation.
// NOTE(review): extraction lost the signature (original lines 977-978) and
// lines 980-981 (the condition guarding the first base-class fallback).
979 std::optional<FastMathFlags> FMF,
982 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
983
984 EVT OrigTy = TLI->getValueType(DL, Ty);
985
986 // Computes cost on targets that have packed math instructions(which support
987 // 16-bit types only).
988 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
989 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
990
991 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
992 return LT.first * getFullRateInstrCost();
993}
994
// GCNTTIImpl::getMinMaxReductionCost: analogous to the arithmetic-reduction
// cost above, but min/max reductions are charged at half rate.
// NOTE(review): extraction lost the signature (original lines 995-996) and
// line 998.
997 FastMathFlags FMF,
999 EVT OrigTy = TLI->getValueType(DL, Ty);
1000
1001 // Computes cost on targets that have packed math instructions(which support
1002 // 16-bit types only).
1003 if (!ST->hasVOP3PInsts() || OrigTy.getScalarSizeInBits() != 16)
1004 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1005
1006 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1007 return LT.first * getHalfRateInstrCost(CostKind);
1008}
1009
// GCNTTIImpl::getVectorInstrCost: extract/insert of >= 32-bit elements is
// free (subregister reads / no cross-class copies) unless the index is
// dynamic (~0u), which costs 2. Sub-32-bit elements defer to the base cost,
// except a free hi/lo access of element 0 of 16-bit types with 16-bit insts.
// NOTE(review): the signature's first line (original line 1010) was lost in
// extraction.
1011 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
1012 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
1013 switch (Opcode) {
1014 case Instruction::ExtractElement:
1015 case Instruction::InsertElement: {
1016 unsigned EltSize
1017 = DL.getTypeSizeInBits(cast<VectorType>(ValTy)->getElementType());
1018 if (EltSize < 32) {
1019 if (EltSize == 16 && Index == 0 && ST->has16BitInsts())
1020 return 0;
1021 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1022 VIC);
1023 }
1024
1025 // Extracts are just reads of a subregister, so are free. Inserts are
1026 // considered free because we don't want to have any cost for scalarizing
1027 // operations, and we don't have to copy into a different register class.
1028
1029 // Dynamic indexing isn't free and is best avoided.
1030 return Index == ~0u ? 2 : 0;
1031 }
1032 default:
1033 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
1034 VIC);
1035 }
1036}
1037
1038/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
1039/// this is analyzing the collective result of all output registers. Otherwise,
1040/// this is only querying a specific result index if this returns multiple
1041/// registers in a struct.
1043 const CallInst *CI, ArrayRef<unsigned> Indices) const {
1044 // TODO: Handle complex extract indices
1045 if (Indices.size() > 1)
1046 return true;
1047
1048 const DataLayout &DL = CI->getDataLayout();
1049 const SIRegisterInfo *TRI = ST->getRegisterInfo();
1050 TargetLowering::AsmOperandInfoVector TargetConstraints =
1051 TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
1052
// -1 means "consider every output"; otherwise only the requested index.
1053 const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
1054
1055 int OutputIdx = 0;
1056 for (auto &TC : TargetConstraints) {
1057 if (TC.Type != InlineAsm::isOutput)
1058 continue;
1059
1060 // Skip outputs we don't care about.
1061 if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
1062 continue;
1063
1064 TLI->ComputeConstraintToUse(TC, SDValue());
1065
// Map the constraint to a register class; only SGPR-class outputs are
// treated as uniform below.
1066 const TargetRegisterClass *RC = TLI->getRegForInlineAsmConstraint(
1067 TRI, TC.ConstraintCode, TC.ConstraintVT).second;
1068
1069 // For AGPR constraints null is returned on subtargets without AGPRs, so
1070 // assume divergent for null.
1071 if (!RC || !TRI->isSGPRClass(RC))
1072 return true;
1073 }
1074
1075 return false;
1076}
1077
// GCNTTIImpl::isReadRegisterSourceOfDivergence (signature opening not visible
// in this extract): decide whether a read_register intrinsic result is
// divergent based on the named physical register.
1079 const IntrinsicInst *ReadReg) const {
1080 Metadata *MD =
1081 cast<MetadataAsValue>(ReadReg->getArgOperand(0))->getMetadata();
1083 cast<MDString>(cast<MDNode>(MD)->getOperand(0))->getString();
1084
1085 // Special case registers that look like VCC.
1086 MVT VT = MVT::getVT(ReadReg->getType());
1087 if (VT == MVT::i1)
1088 return true;
1089
1090 // Special case scalar registers that start with 'v'.
1091 if (RegName.starts_with("vcc") || RegName.empty())
1092 return false;
1093
1094 // VGPR or AGPR is divergent. There aren't any specially named vector
1095 // registers.
1096 return RegName[0] == 'v' || RegName[0] == 'a';
1097}
1098
// NOTE(review): this extract is missing a few body lines (some guards and
// returns were dropped by the source listing); comments below describe only
// the visible code.
1099/// \returns true if the result of the value could potentially be
1100/// different across workitems in a wavefront.
1101bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
1102 if (const Argument *A = dyn_cast<Argument>(V))
1104
1105 // Loads from the private and flat address spaces are divergent, because
1106 // threads can execute the load instruction with the same inputs and get
1107 // different results.
1108 //
1109 // All other loads are not divergent, because if threads issue loads with the
1110 // same arguments, they will always get the same result.
1111 if (const LoadInst *Load = dyn_cast<LoadInst>(V))
1112 return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS ||
1113 Load->getPointerAddressSpace() == AMDGPUAS::FLAT_ADDRESS;
1114
1115 // Atomics are divergent because they are executed sequentially: when an
1116 // atomic operation refers to the same address in each thread, then each
1117 // thread after the first sees the value written by the previous thread as
1118 // original value.
1120 return true;
1121
1123 Intrinsic::ID IID = Intrinsic->getIntrinsicID();
1124 switch (IID) {
1125 case Intrinsic::read_register:
// A private->flat cast result depends on the wave's scratch aperture when
// scratch is globally addressable, so it is divergent in that mode.
1127 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1128 unsigned SrcAS =
1129 Intrinsic->getOperand(0)->getType()->getPointerAddressSpace();
1130 unsigned DstAS = Intrinsic->getType()->getPointerAddressSpace();
1131 return SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
1132 DstAS == AMDGPUAS::FLAT_ADDRESS &&
1133 ST->hasGloballyAddressableScratch();
1134 }
1135 case Intrinsic::amdgcn_workitem_id_y:
1136 case Intrinsic::amdgcn_workitem_id_z: {
// workitem id Y/Z is uniform within a wave if waves never straddle Y/Z
// boundaries, or if the required size of that dimension is 1.
1137 const Function *F = Intrinsic->getFunction();
1138 bool HasUniformYZ =
1139 ST->hasWavefrontsEvenlySplittingXDim(*F, /*RequiresUniformYZ=*/true);
1140 std::optional<unsigned> ThisDimSize = ST->getReqdWorkGroupSize(
1141 *F, IID == Intrinsic::amdgcn_workitem_id_y ? 1 : 2);
1142 return !HasUniformYZ && (!ThisDimSize || *ThisDimSize != 1);
1143 }
1144 default:
1146 }
1147 }
1148
1149 // Assume all function calls are a source of divergence.
1150 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1151 if (CI->isInlineAsm())
1153 return true;
1154 }
1155
1156 // Assume all function calls are a source of divergence.
1157 if (isa<InvokeInst>(V))
1158 return true;
1159
1160 // If the target supports globally addressable scratch, the mapping from
1161 // scratch memory to the flat aperture changes therefore an address space cast
1162 // is no longer uniform.
1163 if (auto *CastI = dyn_cast<AddrSpaceCastInst>(V)) {
1164 return CastI->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS &&
1165 CastI->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS &&
1166 ST->hasGloballyAddressableScratch();
1167 }
1168
1169 return false;
1170}
1171
// Returns true when \p V is provably uniform across the wave regardless of
// divergence of its inputs (e.g. wave-level intrinsics, TID/wavesize math,
// SGPR components of inline asm results).
// NOTE(review): a few lines are missing from this extract (the inline-asm
// branch body and the two PatternMatch match() heads).
1172bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
1173 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
1174 return AMDGPU::isIntrinsicAlwaysUniform(Intrinsic->getIntrinsicID());
1175
1176 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
1177 if (CI->isInlineAsm())
1179 return false;
1180 }
1181
1182 // In most cases TID / wavefrontsize is uniform.
1183 //
1184 // However, if a kernel has uneven dimesions we can have a value of
1185 // workitem-id-x divided by the wavefrontsize non-uniform. For example
1186 // dimensions (65, 2) will have workitems with address (64, 0) and (0, 1)
1187 // packed into a same wave which gives 1 and 0 after the division by 64
1188 // respectively.
1189 //
1190 // The X dimension doesn't reset within a wave if either both the Y
1191 // and Z dimensions are of length 1, or if the X dimension's required
1192 // size is a power of 2. Note, however, if the X dimension's maximum
1193 // size is a power of 2 < the wavefront size, division by the wavefront
1194 // size is guaranteed to yield 0, so this is also a no-reset case.
1195 bool XDimDoesntResetWithinWaves = false;
1196 if (auto *I = dyn_cast<Instruction>(V)) {
1197 const Function *F = I->getFunction();
1198 XDimDoesntResetWithinWaves = ST->hasWavefrontsEvenlySplittingXDim(*F);
1199 }
1200 using namespace llvm::PatternMatch;
1201 uint64_t C;
1203 m_ConstantInt(C))) ||
1205 m_ConstantInt(C)))) {
// Shifting/dividing TID by at least log2(wavesize) is uniform when X never
// resets mid-wave.
1206 return C >= ST->getWavefrontSizeLog2() && XDimDoesntResetWithinWaves;
1207 }
1208
1209 Value *Mask;
1211 m_Value(Mask)))) {
// Masking off at least the low log2(wavesize) bits likewise yields a
// wave-uniform value.
1212 return computeKnownBits(Mask, DL).countMinTrailingZeros() >=
1213 ST->getWavefrontSizeLog2() &&
1214 XDimDoesntResetWithinWaves;
1215 }
1216
1217 const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
1218 if (!ExtValue)
1219 return false;
1220
1221 const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
1222 if (!CI)
1223 return false;
1224
1225 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
1226 switch (Intrinsic->getIntrinsicID()) {
1227 default:
1228 return false;
// The second struct member of amdgcn.if/else (the exec mask) is uniform.
1229 case Intrinsic::amdgcn_if:
1230 case Intrinsic::amdgcn_else: {
1231 ArrayRef<unsigned> Indices = ExtValue->getIndices();
1232 return Indices.size() == 1 && Indices[0] == 1;
1233 }
1234 }
1235 }
1236
1237 // If we have inline asm returning mixed SGPR and VGPR results, we inferred
1238 // divergent for the overall struct return. We need to override it in the
1239 // case we're extracting an SGPR component here.
1240 if (CI->isInlineAsm())
1241 return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
1242
1243 return false;
1244}
1245
// GCNTTIImpl::collectFlatAddressOperands (signature opening not visible in
// this extract): report which operand indices of \p IID are flat pointers
// eligible for address-space inference (always operand 0 here).
1247 Intrinsic::ID IID) const {
1248 switch (IID) {
1249 case Intrinsic::amdgcn_is_shared:
1250 case Intrinsic::amdgcn_is_private:
1251 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1252 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1253 case Intrinsic::amdgcn_load_to_lds:
1254 case Intrinsic::amdgcn_make_buffer_rsrc:
1255 OpIndexes.push_back(0);
1256 return true;
1257 default:
1258 return false;
1259 }
1260}
1261
// GCNTTIImpl::rewriteIntrinsicWithAddressSpace (signature opening not visible
// in this extract): rewrite \p II to use \p NewV, a pointer with an inferred
// (non-flat) address space. Returns the replacement value, the mutated call,
// or nullptr when no rewrite applies.
// NOTE(review): several lines were dropped by the listing (the is_shared
// true-AS selection, the constant fold results, and two NewDecl heads).
1263 Value *OldV,
1264 Value *NewV) const {
1265 auto IntrID = II->getIntrinsicID();
1266 switch (IntrID) {
// is_shared/is_private fold to a constant once the address space is known.
1267 case Intrinsic::amdgcn_is_shared:
1268 case Intrinsic::amdgcn_is_private: {
1269 unsigned TrueAS = IntrID == Intrinsic::amdgcn_is_shared ?
1271 unsigned NewAS = NewV->getType()->getPointerAddressSpace();
1272 LLVMContext &Ctx = NewV->getType()->getContext();
1273 ConstantInt *NewVal = (TrueAS == NewAS) ?
1275 return NewVal;
1276 }
// The remaining cases re-mangle the intrinsic declaration for the new
// pointer type and swap in the new pointer operand in place.
1277 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1278 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
1279 Type *DestTy = II->getType();
1280 Type *SrcTy = NewV->getType();
1281 unsigned NewAS = SrcTy->getPointerAddressSpace();
1283 return nullptr;
1284 Module *M = II->getModule();
1286 M, II->getIntrinsicID(), {DestTy, SrcTy, DestTy});
1287 II->setArgOperand(0, NewV);
1288 II->setCalledFunction(NewDecl);
1289 return II;
1290 }
1291 case Intrinsic::amdgcn_load_to_lds: {
1292 Type *SrcTy = NewV->getType();
1293 Module *M = II->getModule();
1294 Function *NewDecl =
1295 Intrinsic::getOrInsertDeclaration(M, II->getIntrinsicID(), {SrcTy});
1296 II->setArgOperand(0, NewV);
1297 II->setCalledFunction(NewDecl);
1298 return II;
1299 }
1300 case Intrinsic::amdgcn_make_buffer_rsrc: {
1301 Type *SrcTy = NewV->getType();
1302 Type *DstTy = II->getType();
1303 Module *M = II->getModule();
1305 M, II->getIntrinsicID(), {DstTy, SrcTy});
1306 II->setArgOperand(0, NewV);
1307 II->setCalledFunction(NewDecl);
1308 return II;
1309 }
1310 default:
1311 return nullptr;
1312 }
1313}
1314
// GCNTTIImpl::getShuffleCost (signature opening not visible in this extract):
// shuffle cost model for sub-dword (8/16-bit) element types, where shuffles
// at 32-bit register granularity are modeled as free and cross-register
// mixing costs one v_perm_b32-style instruction per extra source register.
// NOTE(review): several lines are missing from the listing (two case labels
// and a few fallback returns); comments describe only the visible code.
1316 VectorType *DstTy, VectorType *SrcTy,
1317 ArrayRef<int> Mask,
1319 int Index, VectorType *SubTp,
1321 const Instruction *CxtI) const {
1322 if (!isa<FixedVectorType>(SrcTy))
1323 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1324 SubTp);
1325
1326 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1327
1328 unsigned ScalarSize = DL.getTypeSizeInBits(SrcTy->getElementType());
1329 if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
1330 (ScalarSize == 16 || ScalarSize == 8)) {
1331 // Larger vector widths may require additional instructions, but are
1332 // typically cheaper than scalarized versions.
1333 //
1334 // We assume that shuffling at a register granularity can be done for free.
1335 // This is not true for vectors fed into memory instructions, but it is
1336 // effectively true for all other shuffling. The emphasis of the logic here
1337 // is to assist generic transform in cleaning up / canonicalizing those
1338 // shuffles.
1339
1340 // With op_sel VOP3P instructions freely can access the low half or high
1341 // half of a register, so any swizzle of two elements is free.
1342 if (auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcTy)) {
1343 unsigned NumSrcElts = SrcVecTy->getNumElements();
1344 if (ST->hasVOP3PInsts() && ScalarSize == 16 && NumSrcElts == 2 &&
1345 (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Reverse ||
1346 Kind == TTI::SK_PermuteSingleSrc))
1347 return 0;
1348 }
1349
// Number of elements packed into one 32-bit register (2 for i16, 4 for i8).
1350 unsigned EltsPerReg = 32 / ScalarSize;
1351 switch (Kind) {
1352 case TTI::SK_Broadcast:
1353 // A single v_perm_b32 can be re-used for all destination registers.
1354 return 1;
1355 case TTI::SK_Reverse:
1356 // One instruction per register.
1357 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1358 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1361 if (Index % EltsPerReg == 0)
1362 return 0; // Shuffling at register granularity
1363 if (auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy))
1364 return divideCeil(DstVecTy->getNumElements(), EltsPerReg);
1367 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1368 if (!DstVecTy)
1370 unsigned NumDstElts = DstVecTy->getNumElements();
1371 unsigned NumInsertElts = cast<FixedVectorType>(SubTp)->getNumElements();
1372 unsigned EndIndex = Index + NumInsertElts;
1373 unsigned BeginSubIdx = Index % EltsPerReg;
1374 unsigned EndSubIdx = EndIndex % EltsPerReg;
1375 unsigned Cost = 0;
1376
1377 if (BeginSubIdx != 0) {
1378 // Need to shift the inserted vector into place. The cost is the number
1379 // of destination registers overlapped by the inserted vector.
1380 Cost = divideCeil(EndIndex, EltsPerReg) - (Index / EltsPerReg);
1381 }
1382
1383 // If the last register overlap is partial, there may be three source
1384 // registers feeding into it; that takes an extra instruction.
1385 if (EndIndex < NumDstElts && BeginSubIdx < EndSubIdx)
1386 Cost += 1;
1387
1388 return Cost;
1389 }
1390 case TTI::SK_Splice: {
1391 auto *DstVecTy = dyn_cast<FixedVectorType>(DstTy);
1392 if (!DstVecTy)
1394 unsigned NumElts = DstVecTy->getNumElements();
1395 assert(NumElts == cast<FixedVectorType>(SrcTy)->getNumElements());
1396 // Determine the sub-region of the result vector that requires
1397 // sub-register shuffles / mixing.
1398 unsigned EltsFromLHS = NumElts - Index;
1399 bool LHSIsAligned = (Index % EltsPerReg) == 0;
1400 bool RHSIsAligned = (EltsFromLHS % EltsPerReg) == 0;
1401 if (LHSIsAligned && RHSIsAligned)
1402 return 0;
1403 if (LHSIsAligned && !RHSIsAligned)
1404 return divideCeil(NumElts, EltsPerReg) - (EltsFromLHS / EltsPerReg);
1405 if (!LHSIsAligned && RHSIsAligned)
1406 return divideCeil(EltsFromLHS, EltsPerReg);
1407 return divideCeil(NumElts, EltsPerReg);
1408 }
1409 default:
1410 break;
1411 }
1412
1413 if (!Mask.empty()) {
1414 unsigned NumSrcElts = cast<FixedVectorType>(SrcTy)->getNumElements();
1415
1416 // Generically estimate the cost by assuming that each destination
1417 // register is derived from sources via v_perm_b32 instructions if it
1418 // can't be copied as-is.
1419 //
1420 // For each destination register, derive the cost of obtaining it based
1421 // on the number of source registers that feed into it.
1422 unsigned Cost = 0;
1423 for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx += EltsPerReg) {
1425 bool Aligned = true;
1426 for (unsigned I = 0; I < EltsPerReg && DstIdx + I < Mask.size(); ++I) {
1427 int SrcIdx = Mask[DstIdx + I];
1428 if (SrcIdx == -1)
1429 continue;
1430 int Reg;
// Source registers of the second input vector are numbered after those
// of the first.
1431 if (SrcIdx < (int)NumSrcElts) {
1432 Reg = SrcIdx / EltsPerReg;
1433 if (SrcIdx % EltsPerReg != I)
1434 Aligned = false;
1435 } else {
1436 Reg = NumSrcElts + (SrcIdx - NumSrcElts) / EltsPerReg;
1437 if ((SrcIdx - NumSrcElts) % EltsPerReg != I)
1438 Aligned = false;
1439 }
1440 if (!llvm::is_contained(Regs, Reg))
1441 Regs.push_back(Reg);
1442 }
1443 if (Regs.size() >= 2)
1444 Cost += Regs.size() - 1;
1445 else if (!Aligned)
1446 Cost += 1;
1447 }
1448 return Cost;
1449 }
1450 }
1451
1452 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
1453 SubTp);
1454}
1455
1456/// Whether it is profitable to sink the operands of an
1457/// Instruction I to the basic block of I.
1458/// This helps using several modifiers (like abs and neg) more often.
1460 SmallVectorImpl<Use *> &Ops) const {
1461 using namespace PatternMatch;
1462
1463 for (auto &Op : I->operands()) {
1464 // Ensure we are not already sinking this operand.
1465 if (any_of(Ops, [&](Use *U) { return U->get() == Op.get(); }))
1466 continue;
1467
// fabs/fneg can be folded into source modifiers, so sinking them next to
// their user is always worthwhile.
1468 if (match(&Op, m_FAbs(m_Value())) || match(&Op, m_FNeg(m_Value()))) {
1469 Ops.push_back(&Op);
1470 continue;
1471 }
1472
1473 // Check for zero-cost multiple use InsertElement/ExtractElement
1474 // instructions
1475 if (Instruction *OpInst = dyn_cast<Instruction>(Op.get())) {
1476 if (OpInst->getType()->isVectorTy() && OpInst->getNumOperands() > 1) {
1477 Instruction *VecOpInst = dyn_cast<Instruction>(OpInst->getOperand(0));
1478 if (VecOpInst && VecOpInst->hasOneUse())
1479 continue;
1480
1481 if (getVectorInstrCost(OpInst->getOpcode(), OpInst->getType(),
1483 OpInst->getOperand(0),
1484 OpInst->getOperand(1)) == 0) {
1485 Ops.push_back(&Op);
1486 continue;
1487 }
1488 }
1489 }
1490
1491 if (auto *Shuffle = dyn_cast<ShuffleVectorInst>(Op.get())) {
1492
1493 unsigned EltSize = DL.getTypeSizeInBits(
1494 cast<VectorType>(Shuffle->getType())->getElementType());
1495
1496 // For i32 (or greater) shufflevectors, these will be lowered into a
1497 // series of insert / extract elements, which will be coalesced away.
1498 if (EltSize < 16 || !ST->has16BitInsts())
1499 continue;
1500
1501 int NumSubElts, SubIndex;
1502 if (Shuffle->changesLength()) {
1503 if (Shuffle->increasesLength() && Shuffle->isIdentityWithPadding()) {
1504 Ops.push_back(&Op);
1505 continue;
1506 }
1507
// Only even sub-indices: odd offsets would straddle 32-bit registers.
1508 if ((Shuffle->isExtractSubvectorMask(SubIndex) ||
1509 Shuffle->isInsertSubvectorMask(NumSubElts, SubIndex)) &&
1510 !(SubIndex & 0x1)) {
1511 Ops.push_back(&Op);
1512 continue;
1513 }
1514 }
1515
1516 if (Shuffle->isReverse() || Shuffle->isZeroEltSplat() ||
1517 Shuffle->isSingleSource()) {
1518 Ops.push_back(&Op);
1519 continue;
1520 }
1521 }
1522 }
1523
1524 return !Ops.empty();
1525}
1526
// GCNTTIImpl::areInlineCompatible (signature opening not visible in this
// extract): a callee may be inlined only if the caller's feature set covers
// the callee's and their FP mode-register defaults are compatible.
1528 const Function *Callee) const {
1529 const TargetMachine &TM = getTLI()->getTargetMachine();
1530 const GCNSubtarget *CallerST
1531 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
1532 const GCNSubtarget *CalleeST
1533 = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
1534
1535 const FeatureBitset &CallerBits = CallerST->getFeatureBits();
1536 const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
1537
// Callee features must be a subset of caller features, ignoring features
// that do not affect inlining correctness.
1538 FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
1539 FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
1540 if ((RealCallerBits & RealCalleeBits) != RealCalleeBits)
1541 return false;
1542
1543 // FIXME: dx10_clamp can just take the caller setting, but there seems to be
1544 // no way to support merge for backend defined attributes.
1545 SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
1546 SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
1547 if (!CallerMode.isInlineCompatible(CalleeMode))
1548 return false;
1549
1550 if (Callee->hasFnAttribute(Attribute::AlwaysInline) ||
1551 Callee->hasFnAttribute(Attribute::InlineHint))
1552 return true;
1553
1554 // Hack to make compile times reasonable.
1555 if (InlineMaxBB) {
1556 // Single BB does not increase total BB amount.
1557 if (Callee->size() == 1)
1558 return true;
1559 size_t BBSize = Caller->size() + Callee->size() - 1;
1560 return BBSize <= InlineMaxBB;
1561 }
1562
1563 return true;
1564}
1565
// adjustInliningThresholdUsingCallee (opening of the static signature is not
// visible in this extract): raise the inline threshold in proportion to how
// many argument registers beyond the no-spill budget the call would use.
1567 const SITargetLowering *TLI,
1568 const GCNTTIImpl *TTIImpl) {
// Empirical budgets: args beyond these SGPR/VGPR counts would be passed on
// the stack.
1569 const int NrOfSGPRUntilSpill = 26;
1570 const int NrOfVGPRUntilSpill = 32;
1571
1572 const DataLayout &DL = TTIImpl->getDataLayout();
1573
1574 unsigned adjustThreshold = 0;
1575 int SGPRsInUse = 0;
1576 int VGPRsInUse = 0;
1577 for (const Use &A : CB->args()) {
1578 SmallVector<EVT, 4> ValueVTs;
1579 ComputeValueVTs(*TLI, DL, A.get()->getType(), ValueVTs);
1580 for (auto ArgVT : ValueVTs) {
1581 unsigned CCRegNum = TLI->getNumRegistersForCallingConv(
1582 CB->getContext(), CB->getCallingConv(), ArgVT);
1584 SGPRsInUse += CCRegNum;
1585 else
1586 VGPRsInUse += CCRegNum;
1587 }
1588 }
1589
1590 // The cost of passing function arguments through the stack:
1591 // 1 instruction to put a function argument on the stack in the caller.
1592 // 1 instruction to take a function argument from the stack in callee.
1593 // 1 instruction is explicitly take care of data dependencies in callee
1594 // function.
1595 InstructionCost ArgStackCost(1);
1596 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1597 Instruction::Store, Type::getInt32Ty(CB->getContext()), Align(4),
1599 ArgStackCost += const_cast<GCNTTIImpl *>(TTIImpl)->getMemoryOpCost(
1600 Instruction::Load, Type::getInt32Ty(CB->getContext()), Align(4),
1602
1603 // The penalty cost is computed relative to the cost of instructions and does
1604 // not model any storage costs.
1605 adjustThreshold += std::max(0, SGPRsInUse - NrOfSGPRUntilSpill) *
1606 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1607 adjustThreshold += std::max(0, VGPRsInUse - NrOfVGPRUntilSpill) *
1608 ArgStackCost.getValue() * InlineConstants::getInstrCost();
1609 return adjustThreshold;
1610}
1611
1612static unsigned getCallArgsTotalAllocaSize(const CallBase *CB,
1613 const DataLayout &DL) {
1614 // If we have a pointer to a private array passed into a function
1615 // it will not be optimized out, leaving scratch usage.
1616 // This function calculates the total size in bytes of the memory that would
1617 // end in scratch if the call was not inlined.
1618 unsigned AllocaSize = 0;
1620 for (Value *PtrArg : CB->args()) {
1621 PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
1622 if (!Ty)
1623 continue;
1624
1625 unsigned AddrSpace = Ty->getAddressSpace();
1626 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS &&
1627 AddrSpace != AMDGPUAS::PRIVATE_ADDRESS)
1628 continue;
1629
1631 if (!AI || !AI->isStaticAlloca() || !AIVisited.insert(AI).second)
1632 continue;
1633
1634 if (auto Size = AI->getAllocationSize(DL))
1635 AllocaSize += Size->getFixedValue();
1636 }
1637 return AllocaSize;
1638}
1639
// GCNTTIImpl::adjustInliningThreshold (signature not visible in this
// extract): callee-register-pressure bonus plus an alloca-argument bonus.
1644
1646 unsigned Threshold = adjustInliningThresholdUsingCallee(CB, TLI, this);
1647
1648 // Private object passed as arguments may end up in scratch usage if the call
1649 // is not inlined. Increase the inline threshold to promote inlining.
1650 unsigned AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1651 if (AllocaSize > 0)
1652 Threshold += ArgAllocaCost;
1653 return Threshold;
1654}
1655
// GCNTTIImpl::getCallerAllocaCost (signature opening not visible in this
// extract): per-alloca cost that, summed over all allocas at the call site,
// cancels the ArgAllocaCost bonus added by adjustInliningThreshold.
1657 const AllocaInst *AI) const {
1658
1659 // Below the cutoff, assume that the private memory objects would be
1660 // optimized
1661 auto AllocaSize = getCallArgsTotalAllocaSize(CB, DL);
1662 if (AllocaSize <= ArgAllocaCutoff)
1663 return 0;
1664
1665 // Above the cutoff, we give a cost to each private memory object
1666 // depending its size. If the array can be optimized by SROA this cost is not
1667 // added to the total-cost in the inliner cost analysis.
1668 //
1669 // We choose the total cost of the alloca such that their sum cancels the
1670 // bonus given in the threshold (ArgAllocaCost).
1671 //
1672 // Cost_Alloca_0 + ... + Cost_Alloca_N == ArgAllocaCost
1673 //
1674 // Awkwardly, the ArgAllocaCost bonus is multiplied by threshold-multiplier,
1675 // the single-bb bonus and the vector-bonus.
1676 //
1677 // We compensate the first two multipliers, by repeating logic from the
1678 // inliner-cost in here. The vector-bonus is 0 on AMDGPU.
1679 static_assert(InlinerVectorBonusPercent == 0, "vector bonus assumed to be 0");
1680 unsigned Threshold = ArgAllocaCost * getInliningThresholdMultiplier();
1681
// Mirror the inliner's single-basic-block bonus (+50%).
1682 bool SingleBB = none_of(*CB->getCalledFunction(), [](const BasicBlock &BB) {
1683 return BB.getTerminator()->getNumSuccessors() > 1;
1684 });
1685 if (SingleBB) {
1686 Threshold += Threshold / 2;
1687 }
1688
1689 auto ArgAllocaSize = AI->getAllocationSize(DL);
1690 if (!ArgAllocaSize)
1691 return 0;
1692
1693 // Attribute the bonus proportionally to the alloca size
1694 unsigned AllocaThresholdBonus =
1695 (Threshold * ArgAllocaSize->getFixedValue()) / AllocaSize;
1696
1697 return AllocaThresholdBonus;
1698}
1699
// GCNTTIImpl::getUnrollingPreferences (signature opening not visible in this
// extract): delegates to the shared AMDGPU implementation.
1702 OptimizationRemarkEmitter *ORE) const {
1703 CommonTTI.getUnrollingPreferences(L, SE, UP, ORE);
1704}
1705
// GCNTTIImpl::getPeelingPreferences (signature opening not visible in this
// extract): delegates to the shared AMDGPU implementation.
1707 TTI::PeelingPreferences &PP) const {
1708 CommonTTI.getPeelingPreferences(L, SE, PP);
1709}
1710
1711int GCNTTIImpl::getTransInstrCost(TTI::TargetCostKind CostKind) const {
1712 return getQuarterRateInstrCost(CostKind);
1713}
1714
1715int GCNTTIImpl::get64BitInstrCost(TTI::TargetCostKind CostKind) const {
1716 return ST->hasFullRate64Ops()
1717 ? getFullRateInstrCost()
1718 : ST->hasHalfRate64Ops() ? getHalfRateInstrCost(CostKind)
1719 : getQuarterRateInstrCost(CostKind);
1720}
1721
1722std::pair<InstructionCost, MVT>
1723GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const {
1724 std::pair<InstructionCost, MVT> Cost = BaseT::getTypeLegalizationCost(Ty);
1725 auto Size = DL.getTypeSizeInBits(Ty);
1726 // Maximum load or store can handle 8 dwords for scalar and 4 for
1727 // vector ALU. Let's assume anything above 8 dwords is expensive
1728 // even if legal.
1729 if (Size <= 256)
1730 return Cost;
1731
1732 Cost.first += (Size + 255) / 256;
1733 return Cost;
1734}
1735
// GCNTTIImpl::getPrefetchDistance (signature not visible in this extract):
// 128 when the subtarget supports prefetch instructions, otherwise disabled.
1737 return ST->hasPrefetch() ? 128 : 0;
1738}
1739
1742}
1743
// GCNTTIImpl::collectKernelLaunchBounds (signature opening not visible in
// this extract): export the kernel's launch-bound attributes as
// (name, value) pairs.
1745 const Function &F,
1746 SmallVectorImpl<std::pair<StringRef, int64_t>> &LB) const {
1747 SmallVector<unsigned> MaxNumWorkgroups = ST->getMaxNumWorkGroups(F);
1748 LB.push_back({"amdgpu-max-num-workgroups[0]", MaxNumWorkgroups[0]});
1749 LB.push_back({"amdgpu-max-num-workgroups[1]", MaxNumWorkgroups[1]});
1750 LB.push_back({"amdgpu-max-num-workgroups[2]", MaxNumWorkgroups[2]});
1751 std::pair<unsigned, unsigned> FlatWorkGroupSize =
1752 ST->getFlatWorkGroupSizes(F);
1753 LB.push_back({"amdgpu-flat-work-group-size[0]", FlatWorkGroupSize.first});
1754 LB.push_back({"amdgpu-flat-work-group-size[1]", FlatWorkGroupSize.second});
1755 std::pair<unsigned, unsigned> WavesPerEU = ST->getWavesPerEU(F);
1756 LB.push_back({"amdgpu-waves-per-eu[0]", WavesPerEU.first});
1757 LB.push_back({"amdgpu-waves-per-eu[1]", WavesPerEU.second});
1758}
1759
// GCNTTIImpl::fpenvIEEEMode (signature not visible in this extract): decide
// the known IEEE-mode setting for the function containing \p I.
// NOTE(review): a few return lines were dropped by the listing (the !F and
// valid-attribute results, and the ": KnownIEEEMode::On" arm).
1762 if (!ST->hasFeature(AMDGPU::FeatureDX10ClampAndIEEEMode))
1763 return KnownIEEEMode::On; // Only mode on gfx1170+
1764
1765 const Function *F = I.getFunction();
1766 if (!F)
1768
// An explicit "amdgpu-ieee" attribute overrides the calling-convention
// default.
1769 Attribute IEEEAttr = F->getFnAttribute("amdgpu-ieee");
1770 if (IEEEAttr.isValid())
1772
1773 return AMDGPU::isShader(F->getCallingConv()) ? KnownIEEEMode::Off
1775}
1776
// GCNTTIImpl::getMemoryOpCost (signature opening not visible in this
// extract): special-case i8 vector loads/stores, otherwise use the base
// implementation.
1778 Align Alignment,
1779 unsigned AddressSpace,
1781 TTI::OperandValueInfo OpInfo,
1782 const Instruction *I) const {
1783 if (VectorType *VecTy = dyn_cast<VectorType>(Src)) {
1784 if ((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1785 VecTy->getElementType()->isIntegerTy(8)) {
1786 return divideCeil(DL.getTypeSizeInBits(VecTy) - 1,
1788 }
1789 }
1790 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind,
1791 OpInfo, I);
1792}
1793
// GCNTTIImpl::getNumberOfParts (signature not visible in this extract):
// i8 vectors are packed four elements per 32-bit part; everything else
// defers to the base implementation.
1795 if (VectorType *VecTy = dyn_cast<VectorType>(Tp)) {
1796 if (VecTy->getElementType()->isIntegerTy(8)) {
1797 unsigned ElementCount = VecTy->getElementCount().getFixedValue();
1798 return divideCeil(ElementCount - 1, 4);
1799 }
1800 }
1801 return BaseT::getNumberOfParts(Tp);
1802}
1803
// Tail of a uniformity classification query (signature and several result
// lines are not visible in this extract): intrinsics such as
// amdgcn_wave_shuffle are special-cased, then isAlwaysUniform /
// isSourceOfDivergence decide the answer.
1807 switch (Intrinsic->getIntrinsicID()) {
1808 case Intrinsic::amdgcn_wave_shuffle:
1810 default:
1811 break;
1812 }
1813 }
1814
1815 if (isAlwaysUniform(V))
1817
1818 if (isSourceOfDivergence(V))
1820
1822}
1823
// GCNTTIImpl::getScalingFactorCost (signature opening not visible in this
// extract): base+scale*index is free only when the subtarget can fold the
// scale into the addressing mode (scale_offset), otherwise it costs one add.
1825 StackOffset BaseOffset,
1826 bool HasBaseReg, int64_t Scale,
1827 unsigned AddrSpace) const {
1828 if (HasBaseReg && Scale != 0) {
1829 // gfx1250+ can fold base+scale*index when scale matches the memory access
1830 // size (scale_offset bit). Supported for flat/global/constant/scratch
1831 // (VMEM, max 128 bits) and constant_32bit (SMRD, capped to 128 bits here).
1832 if (getST()->hasScaleOffset() && Ty && Ty->isSized() &&
1834 AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
1835 AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)) {
1836 TypeSize StoreSize = getDataLayout().getTypeStoreSize(Ty);
1837 if (TypeSize::isKnownLE(StoreSize, TypeSize::getFixed(16)) &&
1838 static_cast<int64_t>(StoreSize.getFixedValue()) == Scale)
1839 return 0;
1840 }
1841 return 1;
1842 }
1843 return BaseT::getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg, Scale,
1844 AddrSpace);
1845}
1846
// GCNTTIImpl::isLSRCostLess (signature opening not visible in this extract):
// lexicographic comparison of LSR solutions, led by effective per-iteration
// instruction count.
1848 const TTI::LSRCost &B) const {
1849 // Favor lower per-iteration work over preheader/setup costs.
1850 // AMDGPU lacks rich addressing modes, so ScaleCost is folded into the
1851 // effective instruction count (base+scale*index requires a separate ADD).
1852 unsigned EffInsnsA = A.Insns + A.ScaleCost;
1853 unsigned EffInsnsB = B.Insns + B.ScaleCost;
1854
1855 return std::tie(EffInsnsA, A.NumIVMuls, A.AddRecCost, A.NumBaseAdds,
1856 A.SetupCost, A.ImmCost, A.NumRegs) <
1857 std::tie(EffInsnsB, B.NumIVMuls, B.AddRecCost, B.NumBaseAdds,
1858 B.SetupCost, B.ImmCost, B.NumRegs);
1859}
1860
// LSR tuning predicate (signature not visible in this extract); returns a
// fixed false so register count stays a tie-breaker, not the major cost.
1862 // isLSRCostLess de-prioritizes register count; keep consistent.
1863 return false;
1864}
1865
// LSR tuning predicate (signature not visible in this extract); returns a
// fixed true so less-profitable LSR solutions are discarded.
1867 // Prefer the baseline when LSR cannot clearly reduce per-iteration work.
1868 return true;
1869}
1870
// Intrinsic-result uniformity helper (signature opening not visible in this
// extract): given per-argument uniformity bits, decide whether the intrinsic
// result is uniform. Only amdgcn_wave_shuffle is expected here.
1872 const SmallBitVector &UniformArgs) const {
1874 switch (Intrinsic->getIntrinsicID()) {
1875 case Intrinsic::amdgcn_wave_shuffle:
1876 // wave_shuffle(Value, Index): result is uniform when either Value or Index
1877 // is uniform.
1878 return UniformArgs[0] || UniformArgs[1];
1879 default:
1880 llvm_unreachable("unexpected intrinsic in isUniform");
1881 }
1882}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
aarch64 promote const
Provides AMDGPU specific target descriptions.
Rewrite undef for PHI
Base class for AMDGPU specific classes of TargetSubtarget.
The AMDGPU TargetMachine interface definition for hw codegen targets.
static cl::opt< unsigned > MemcpyLoopUnroll("amdgpu-memcpy-loop-unroll", cl::desc("Unroll factor (affecting 4x32-bit operations) to use for memory " "operations when lowering statically-sized memcpy, memmove, or" "memset as a loop"), cl::init(16), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdIf("amdgpu-unroll-threshold-if", cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"), cl::init(200), cl::Hidden)
static cl::opt< unsigned > ArgAllocaCost("amdgpu-inline-arg-alloca-cost", cl::Hidden, cl::init(4000), cl::desc("Cost of alloca argument"))
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond, unsigned Depth=0)
static cl::opt< bool > UnrollRuntimeLocal("amdgpu-unroll-runtime-local", cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"), cl::init(true), cl::Hidden)
static unsigned adjustInliningThresholdUsingCallee(const CallBase *CB, const SITargetLowering *TLI, const GCNTTIImpl *TTIImpl)
static cl::opt< unsigned > ArgAllocaCutoff("amdgpu-inline-arg-alloca-cutoff", cl::Hidden, cl::init(256), cl::desc("Maximum alloca size to use for inline cost"))
static cl::opt< size_t > InlineMaxBB("amdgpu-inline-max-bb", cl::Hidden, cl::init(1100), cl::desc("Maximum number of BBs allowed in a function after inlining" " (compile time constraint)"))
static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID)
static cl::opt< unsigned > UnrollMaxBlockToAnalyze("amdgpu-unroll-max-block-to-analyze", cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"), cl::init(32), cl::Hidden)
static unsigned getCallArgsTotalAllocaSize(const CallBase *CB, const DataLayout &DL)
static cl::opt< unsigned > UnrollThresholdPrivate("amdgpu-unroll-threshold-private", cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"), cl::init(2700), cl::Hidden)
static cl::opt< unsigned > UnrollThresholdLocal("amdgpu-unroll-threshold-local", cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"), cl::init(1000), cl::Hidden)
This file a TargetTransformInfoImplBase conforming object specific to the AMDGPU target machine.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Hexagon Common GEP
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
uint64_t IntrinsicInst * II
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
static unsigned getNumElements(Type *Ty)
This file implements the SmallBitVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:114
std::optional< unsigned > getReqdWorkGroupSize(const Function &F, unsigned Dim) const
bool hasWavefrontsEvenlySplittingXDim(const Function &F, bool REquiresUniformYZ=false) const
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
AMDGPUTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
an instruction to allocate memory on the stack
LLVM_ABI bool isStaticAlloca() const
Return true if this alloca is in the entry block of the function and is a constant size.
LLVM_ABI std::optional< TypeSize > getAllocationSize(const DataLayout &DL) const
Get allocation size in bytes.
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
bool empty() const
empty - Check if the array is empty.
Definition ArrayRef.h:137
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
unsigned getNumberOfParts(Type *Tp) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool isInlineAsm() const
Check if this call is an inline asm statement.
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
CallingConv::ID getCallingConv() const
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
unsigned getArgOperandNo(const Use *U) const
Given a use for a arg operand, get the arg operand number that corresponds to it.
This class represents a function call, abstracting a target machine's calling convention.
Conditional Branch instruction.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
static LLVM_ABI ConstantInt * getFalse(LLVMContext &Context)
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeStoreSize(Type *Ty) const
Returns the maximum number of bytes that may be overwritten by storing the specified type.
Definition DataLayout.h:572
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
ArrayRef< unsigned > getIndices() const
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool approxFunc() const
Definition FMF.h:73
Container class for subtarget features.
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:873
GCNTTIImpl(const AMDGPUTargetMachine *TM, const Function &F)
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
Account for loads of i8 vector types to have reduced cost.
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
void collectKernelLaunchBounds(const Function &F, SmallVectorImpl< std::pair< StringRef, int64_t > > &LB) const override
bool isUniform(const Instruction *I, const SmallBitVector &UniformArgs) const override
bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
bool isInlineAsmSourceOfDivergence(const CallInst *CI, ArrayRef< unsigned > Indices={}) const
Analyze if the results of inline asm are divergent.
bool isReadRegisterSourceOfDivergence(const IntrinsicInst *ReadReg) const
unsigned getMaximumVF(unsigned ElemWidth, unsigned Opcode) const override
unsigned getNumberOfRegisters(unsigned RCID) const override
bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const override
unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const
bool isLSRCostLess(const TTI::LSRCost &A, const TTI::LSRCost &B) const override
bool shouldPrefetchAddressSpace(unsigned AS) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
bool hasBranchDivergence(const Function *F=nullptr) const override
Value * rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override
unsigned getCallerAllocaCost(const CallBase *CB, const AllocaInst *AI) const override
unsigned getMaxInterleaveFactor(ElementCount VF) const override
void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
Get intrinsic cost based on arguments.
unsigned getInliningThresholdMultiplier() const override
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize, unsigned ChainSizeInBytes, VectorType *VecTy) const override
unsigned getPrefetchDistance() const override
How much before a load we should place the prefetch instruction.
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
KnownIEEEMode fpenvIEEEMode(const Instruction &I) const
Return KnownIEEEMode::On if we know if the use context can assume "amdgpu-ieee"="true" and KnownIEEEM...
unsigned adjustInliningThreshold(const CallBase *CB) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Whether it is profitable to sink the operands of an Instruction I to the basic block of I.
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
Try to calculate op costs for min/max reduction operations.
bool shouldDropLSRSolutionIfLessProfitable() const override
int getInliningLastCallToStaticBonus() const override
bool collectFlatAddressOperands(SmallVectorImpl< int > &OpIndexes, Intrinsic::ID IID) const override
unsigned getNumberOfParts(Type *Tp) const override
When counting parts on AMD GPUs, account for i8s being grouped together under a single i32 value.
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
unsigned getMinVectorRegisterBitWidth() const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind Vector) const override
bool isNumRegsMajorCostOfLSR() const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
Type * getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicElementSize) const override
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
InstructionUniformity getInstructionUniformity(const Value *V) const override
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool hasApproxFunc() const LLVM_READONLY
Determine whether the approximate-math-functions flag is set.
LLVM_ABI bool hasAllowContract() const LLVM_READONLY
Determine whether the allow-contract flag is set.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Metadata node.
Definition Metadata.h:1080
Machine Value Type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
Root of the metadata hierarchy.
Definition Metadata.h:64
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
The optimization diagnostic interface.
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
The main scalar evolution driver.
This is a 'bitvector' (really, a variable-sized bit array), optimized for the case when the array is ...
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
std::vector< AsmOperandInfo > AsmOperandInfoVector
Primary interface to the complete machine description for the target machine.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
virtual const DataLayout & getDataLayout() const
virtual void getMemcpyLoopResidualLoweringType(SmallVectorImpl< Type * > &OpsOut, LLVMContext &Context, unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace, Align SrcAlign, Align DestAlign, std::optional< uint32_t > AtomicCpySize) const
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Free
Expected to fold away in lowering.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:314
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:311
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:312
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
user_iterator user_begin()
Definition Value.h:403
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
Base class of all SIMD vector types.
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
static constexpr bool isKnownLE(const FixedOrScalableQuantity &LHS, const FixedOrScalableQuantity &RHS)
Definition TypeSize.h:230
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isArgPassedInSGPR(const Argument *A)
bool isIntrinsicAlwaysUniform(unsigned IntrID)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
bool isExtendedGlobalAddrSpace(unsigned AS)
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
LLVM_ABI int getInstrCost()
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > Tys={})
Look up the Function declaration of the intrinsic id in the Module M.
BinaryOp_match< LHS, RHS, Instruction::AShr > m_AShr(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
bool match(Val *V, const Pattern &P)
class_match< ConstantInt > m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
specific_fpval m_FPOne()
Match a float 1.0 or vector with all elements equal to 1.0.
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
m_Intrinsic_Ty< Opnd0 >::Ty m_FAbs(const Opnd0 &Op0)
initializer< Ty > init(const Ty &Val)
std::enable_if_t< detail::IsValidPointer< X, Y >::value, X * > extract_or_null(Y &&MD)
Extract a Value from Metadata, allowing null.
Definition Metadata.h:683
This is an optimization pass for GlobalISel generic memory operations.
@ Length
Definition DWP.cpp:532
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
InstructionCost Cost
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI MDNode * findOptionMDForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for a loop.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
InstructionUniformity
Enum describing how instructions behave with respect to uniformity and divergence,...
Definition Uniformity.h:18
@ AlwaysUniform
The result values are always uniform.
Definition Uniformity.h:23
@ NeverUniform
The result values can never be assumed to be uniform.
Definition Uniformity.h:26
@ Default
The result values are uniform if and only if all operands are uniform.
Definition Uniformity.h:20
@ Custom
The result values require a custom uniformity check.
Definition Uniformity.h:31
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
Information about a load/store intrinsic defined by the target.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const
Parameters that control the generic loop unrolling transformation.
unsigned Threshold
The cost threshold for the unrolled loop.
bool UnrollVectorizedLoop
Disable runtime unrolling by default for vectorized loops.
unsigned MaxIterationsCountToAnalyze
Don't allow loop unrolling to simulate more than this number of iterations when checking full unroll ...
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
bool AllowExpensiveTripCount
Allow emitting expensive instructions (such as divisions) when computing the trip count of a loop for...
const unsigned PragmaCount
Definition UnrollLoop.h:131
const bool PragmaEnableUnroll
Definition UnrollLoop.h:132