LLVM 23.0.0git
GCNSubtarget.cpp
Go to the documentation of this file.
1//===-- GCNSubtarget.cpp - GCN Subtarget Information ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Implements the GCN specific subclass of TargetSubtarget.
11//
12//===----------------------------------------------------------------------===//
13
14#include "GCNSubtarget.h"
15#include "AMDGPUCallLowering.h"
17#include "AMDGPULegalizerInfo.h"
20#include "AMDGPUTargetMachine.h"
28#include "llvm/IR/MDBuilder.h"
29#include <algorithm>
30
31using namespace llvm;
32
33#define DEBUG_TYPE "gcn-subtarget"
34
35#define GET_SUBTARGETINFO_TARGET_DESC
36#define GET_SUBTARGETINFO_CTOR
37#define AMDGPUSubtarget GCNSubtarget
38#include "AMDGPUGenSubtargetInfo.inc"
39#undef AMDGPUSubtarget
40
42 "amdgpu-vgpr-index-mode",
43 cl::desc("Use GPR indexing mode instead of movrel for vector indexing"),
44 cl::init(false));
45
46static cl::opt<bool> UseAA("amdgpu-use-aa-in-codegen",
47 cl::desc("Enable the use of AA during codegen."),
48 cl::init(true));
49
51 NSAThreshold("amdgpu-nsa-threshold",
52 cl::desc("Number of addresses from which to enable MIMG NSA."),
54
56
58 StringRef GPU,
59 StringRef FS) {
60 // Determine default and user-specified characteristics
61 //
62 // We want to be able to turn these off, but making this a subtarget feature
63 // for SI has the unhelpful behavior that it unsets everything else if you
64 // disable it.
65 //
66 // Similarly we want enable-prt-strict-null to be on by default and not to
67 // unset everything else if it is disabled
68
69 SmallString<256> FullFS("+load-store-opt,+enable-ds128,");
70
71 // Turn on features that HSA ABI requires. Also turn on FlatForGlobal by
72 // default
73 if (isAmdHsaOS())
74 FullFS += "+flat-for-global,+unaligned-access-mode,+trap-handler,";
75
76 FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
77
78 // Disable mutually exclusive bits.
79 if (FS.contains_insensitive("+wavefrontsize")) {
80 if (!FS.contains_insensitive("wavefrontsize16"))
81 FullFS += "-wavefrontsize16,";
82 if (!FS.contains_insensitive("wavefrontsize32"))
83 FullFS += "-wavefrontsize32,";
84 if (!FS.contains_insensitive("wavefrontsize64"))
85 FullFS += "-wavefrontsize64,";
86 }
87
88 FullFS += FS;
89
90 ParseSubtargetFeatures(GPU, /*TuneCPU*/ GPU, FullFS);
91
92 // Implement the "generic" processors, which act as the default when no
93 // generation features are enabled (e.g. for -mcpu=''). HSA OS defaults to
94 // the first amdgcn target that supports flat addressing. Other OSes default
95 // to the first amdgcn target.
99 // Assume wave64 for the unknown target, if not explicitly set.
100 if (getWavefrontSizeLog2() == 0)
102 } else if (!hasFeature(AMDGPU::FeatureWavefrontSize32) &&
103 !hasFeature(AMDGPU::FeatureWavefrontSize64)) {
104 // If there is no default wave size it must be a generation before gfx10,
105 // these have FeatureWavefrontSize64 in their definition already. For gfx10+
106 // set wave32 as a default.
107 ToggleFeature(AMDGPU::FeatureWavefrontSize32);
109 }
110
111 // We don't support FP64 for EG/NI atm.
113
114 // Targets must either support 64-bit offsets for MUBUF instructions, and/or
115 // support flat operations, otherwise they cannot access a 64-bit global
116 // address space
117 assert(hasAddr64() || hasFlat());
118 // Unless +-flat-for-global is specified, turn on FlatForGlobal for targets
119 // that do not support ADDR64 variants of MUBUF instructions. Such targets
120 // cannot use a 64 bit offset with a MUBUF instruction to access the global
121 // address space
122 if (!hasAddr64() && !FS.contains("flat-for-global") && !UseFlatForGlobal) {
123 ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
124 UseFlatForGlobal = true;
125 }
126 // Unless +-flat-for-global is specified, use MUBUF instructions for global
127 // address space access if flat operations are not available.
128 if (!hasFlat() && !FS.contains("flat-for-global") && UseFlatForGlobal) {
129 ToggleFeature(AMDGPU::FeatureUseFlatForGlobal);
130 UseFlatForGlobal = false;
131 }
132
133 // Set defaults if needed.
134 if (MaxPrivateElementSize == 0)
136
137 if (LDSBankCount == 0)
138 LDSBankCount = 32;
139
142
143 if (FlatOffsetBitWidth == 0)
145
147 // LDS Allocation Granularity calculated in bytes from dwords
149 AMDGPU::getLdsDwGranularity(*this) * sizeof(uint32_t);
150
153
154 // InstCacheLineSize is set from TableGen subtarget features
155 // (FeatureInstCacheLineSize64 / FeatureInstCacheLineSize128).
156 // Fall back to 64 if no feature was specified (e.g. generic targets).
157 if (InstCacheLineSize == 0)
159
161 "InstCacheLineSize must be a power of 2");
162
163 LLVM_DEBUG(dbgs() << "xnack setting for subtarget: "
164 << TargetID.getXnackSetting() << '\n');
165 LLVM_DEBUG(dbgs() << "sramecc setting for subtarget: "
166 << TargetID.getSramEccSetting() << '\n');
167
168 return *this;
169}
170
172 LLVMContext &Ctx = F.getContext();
173 if (hasFeature(AMDGPU::FeatureWavefrontSize32) &&
174 hasFeature(AMDGPU::FeatureWavefrontSize64)) {
175 Ctx.diagnose(DiagnosticInfoUnsupported(
176 F, "must specify exactly one of wavefrontsize32 and wavefrontsize64"));
177 }
178}
179
181 const GCNTargetMachine &TM, bool BufferOOBRelaxed,
183 : // clang-format off
184 AMDGPUGenSubtargetInfo(TT, GPU, /*TuneCPU*/ GPU, FS),
185 AMDGPUSubtarget(TT),
186 TargetID(AMDGPU::createAMDGPUTargetID(*this, FS)),
187 InstrItins(getInstrItineraryForCPU(GPU)),
190 InstrInfo(initializeSubtargetDependencies(TT, GPU, FS)),
191 TLInfo(TM, *this),
192 // Frame index expansion sometimes assumes the low bit of SP is 0
193 FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0,
194 /*TransAl=*/Align(4)) {
195 // clang-format on
198
199 TSInfo = std::make_unique<AMDGPUSelectionDAGInfo>();
200
201 CallLoweringInfo = std::make_unique<AMDGPUCallLowering>(*getTargetLowering());
202 InlineAsmLoweringInfo =
203 std::make_unique<InlineAsmLowering>(getTargetLowering());
204 Legalizer = std::make_unique<AMDGPULegalizerInfo>(*this, TM);
205 RegBankInfo = std::make_unique<AMDGPURegisterBankInfo>(*this);
206 InstSelector =
207 std::make_unique<AMDGPUInstructionSelector>(*this, *RegBankInfo);
208}
209
211 return TSInfo.get();
212}
213
214unsigned GCNSubtarget::getConstantBusLimit(unsigned Opcode) const {
215 if (getGeneration() < GFX10)
216 return 1;
217
218 switch (Opcode) {
219 case AMDGPU::V_LSHLREV_B64_e64:
220 case AMDGPU::V_LSHLREV_B64_gfx10:
221 case AMDGPU::V_LSHLREV_B64_e64_gfx11:
222 case AMDGPU::V_LSHLREV_B64_e32_gfx12:
223 case AMDGPU::V_LSHLREV_B64_e64_gfx12:
224 case AMDGPU::V_LSHL_B64_e64:
225 case AMDGPU::V_LSHRREV_B64_e64:
226 case AMDGPU::V_LSHRREV_B64_gfx10:
227 case AMDGPU::V_LSHRREV_B64_e64_gfx11:
228 case AMDGPU::V_LSHRREV_B64_e64_gfx12:
229 case AMDGPU::V_LSHR_B64_e64:
230 case AMDGPU::V_ASHRREV_I64_e64:
231 case AMDGPU::V_ASHRREV_I64_gfx10:
232 case AMDGPU::V_ASHRREV_I64_e64_gfx11:
233 case AMDGPU::V_ASHRREV_I64_e64_gfx12:
234 case AMDGPU::V_ASHR_I64_e64:
235 return 1;
236 }
237
238 return 2;
239}
240
241/// This list was mostly derived from experimentation.
242bool GCNSubtarget::zeroesHigh16BitsOfDest(unsigned Opcode) const {
243 switch (Opcode) {
244 case AMDGPU::V_CVT_F16_F32_e32:
245 case AMDGPU::V_CVT_F16_F32_e64:
246 case AMDGPU::V_CVT_F16_U16_e32:
247 case AMDGPU::V_CVT_F16_U16_e64:
248 case AMDGPU::V_CVT_F16_I16_e32:
249 case AMDGPU::V_CVT_F16_I16_e64:
250 case AMDGPU::V_RCP_F16_e64:
251 case AMDGPU::V_RCP_F16_e32:
252 case AMDGPU::V_RSQ_F16_e64:
253 case AMDGPU::V_RSQ_F16_e32:
254 case AMDGPU::V_SQRT_F16_e64:
255 case AMDGPU::V_SQRT_F16_e32:
256 case AMDGPU::V_LOG_F16_e64:
257 case AMDGPU::V_LOG_F16_e32:
258 case AMDGPU::V_EXP_F16_e64:
259 case AMDGPU::V_EXP_F16_e32:
260 case AMDGPU::V_SIN_F16_e64:
261 case AMDGPU::V_SIN_F16_e32:
262 case AMDGPU::V_COS_F16_e64:
263 case AMDGPU::V_COS_F16_e32:
264 case AMDGPU::V_FLOOR_F16_e64:
265 case AMDGPU::V_FLOOR_F16_e32:
266 case AMDGPU::V_CEIL_F16_e64:
267 case AMDGPU::V_CEIL_F16_e32:
268 case AMDGPU::V_TRUNC_F16_e64:
269 case AMDGPU::V_TRUNC_F16_e32:
270 case AMDGPU::V_RNDNE_F16_e64:
271 case AMDGPU::V_RNDNE_F16_e32:
272 case AMDGPU::V_FRACT_F16_e64:
273 case AMDGPU::V_FRACT_F16_e32:
274 case AMDGPU::V_FREXP_MANT_F16_e64:
275 case AMDGPU::V_FREXP_MANT_F16_e32:
276 case AMDGPU::V_FREXP_EXP_I16_F16_e64:
277 case AMDGPU::V_FREXP_EXP_I16_F16_e32:
278 case AMDGPU::V_LDEXP_F16_e64:
279 case AMDGPU::V_LDEXP_F16_e32:
280 case AMDGPU::V_LSHLREV_B16_e64:
281 case AMDGPU::V_LSHLREV_B16_e32:
282 case AMDGPU::V_LSHRREV_B16_e64:
283 case AMDGPU::V_LSHRREV_B16_e32:
284 case AMDGPU::V_ASHRREV_I16_e64:
285 case AMDGPU::V_ASHRREV_I16_e32:
286 case AMDGPU::V_ADD_U16_e64:
287 case AMDGPU::V_ADD_U16_e32:
288 case AMDGPU::V_SUB_U16_e64:
289 case AMDGPU::V_SUB_U16_e32:
290 case AMDGPU::V_SUBREV_U16_e64:
291 case AMDGPU::V_SUBREV_U16_e32:
292 case AMDGPU::V_MUL_LO_U16_e64:
293 case AMDGPU::V_MUL_LO_U16_e32:
294 case AMDGPU::V_ADD_F16_e64:
295 case AMDGPU::V_ADD_F16_e32:
296 case AMDGPU::V_SUB_F16_e64:
297 case AMDGPU::V_SUB_F16_e32:
298 case AMDGPU::V_SUBREV_F16_e64:
299 case AMDGPU::V_SUBREV_F16_e32:
300 case AMDGPU::V_MUL_F16_e64:
301 case AMDGPU::V_MUL_F16_e32:
302 case AMDGPU::V_MAX_F16_e64:
303 case AMDGPU::V_MAX_F16_e32:
304 case AMDGPU::V_MIN_F16_e64:
305 case AMDGPU::V_MIN_F16_e32:
306 case AMDGPU::V_MAX_U16_e64:
307 case AMDGPU::V_MAX_U16_e32:
308 case AMDGPU::V_MIN_U16_e64:
309 case AMDGPU::V_MIN_U16_e32:
310 case AMDGPU::V_MAX_I16_e64:
311 case AMDGPU::V_MAX_I16_e32:
312 case AMDGPU::V_MIN_I16_e64:
313 case AMDGPU::V_MIN_I16_e32:
314 case AMDGPU::V_MAD_F16_e64:
315 case AMDGPU::V_MAD_U16_e64:
316 case AMDGPU::V_MAD_I16_e64:
317 case AMDGPU::V_FMA_F16_e64:
318 case AMDGPU::V_DIV_FIXUP_F16_e64:
319 // On gfx10, all 16-bit instructions preserve the high bits.
321 case AMDGPU::V_MADAK_F16:
322 case AMDGPU::V_MADMK_F16:
323 case AMDGPU::V_MAC_F16_e64:
324 case AMDGPU::V_MAC_F16_e32:
325 case AMDGPU::V_FMAMK_F16:
326 case AMDGPU::V_FMAAK_F16:
327 case AMDGPU::V_FMAC_F16_e64:
328 case AMDGPU::V_FMAC_F16_e32:
329 // In gfx9, the preferred handling of the unused high 16-bits changed. Most
330 // instructions maintain the legacy behavior of 0ing. Some instructions
331 // changed to preserving the high bits.
333 case AMDGPU::V_MAD_MIXLO_F16:
334 case AMDGPU::V_MAD_MIXHI_F16:
335 default:
336 return false;
337 }
338}
339
341 const SchedRegion &Region) const {
342 // Track register pressure so the scheduler can try to decrease
343 // pressure once register usage is above the threshold defined by
344 // SIRegisterInfo::getRegPressureSetLimit()
345 Policy.ShouldTrackPressure = true;
346
347 const Function &F = Region.RegionBegin->getMF()->getFunction();
348 if (AMDGPU::getSchedStrategy(F) == "coexec") {
349 Policy.OnlyTopDown = true;
350 Policy.OnlyBottomUp = false;
351 return;
352 }
353
354 // Enabling both top down and bottom up scheduling seems to give us fewer
355 // register spills than just using one of these approaches on its own.
356 Policy.OnlyTopDown = false;
357 Policy.OnlyBottomUp = false;
358
359 // Enabling ShouldTrackLaneMasks crashes the SI Machine Scheduler.
360 if (!enableSIScheduler())
361 Policy.ShouldTrackLaneMasks = true;
362}
363
365 const SchedRegion &Region) const {
366 const Function &F = Region.RegionBegin->getMF()->getFunction();
367 Attribute PostRADirectionAttr = F.getFnAttribute("amdgpu-post-ra-direction");
368 if (!PostRADirectionAttr.isValid())
369 return;
370
371 StringRef PostRADirectionStr = PostRADirectionAttr.getValueAsString();
372 if (PostRADirectionStr == "topdown") {
373 Policy.OnlyTopDown = true;
374 Policy.OnlyBottomUp = false;
375 } else if (PostRADirectionStr == "bottomup") {
376 Policy.OnlyTopDown = false;
377 Policy.OnlyBottomUp = true;
378 } else if (PostRADirectionStr == "bidirectional") {
379 Policy.OnlyTopDown = false;
380 Policy.OnlyBottomUp = false;
381 } else {
383 F, F.getSubprogram(), "invalid value for postRA direction attribute");
384 F.getContext().diagnose(Diag);
385 }
386
387 LLVM_DEBUG({
388 const char *DirStr = "default";
389 if (Policy.OnlyTopDown && !Policy.OnlyBottomUp)
390 DirStr = "topdown";
391 else if (!Policy.OnlyTopDown && Policy.OnlyBottomUp)
392 DirStr = "bottomup";
393 else if (!Policy.OnlyTopDown && !Policy.OnlyBottomUp)
394 DirStr = "bidirectional";
395
396 dbgs() << "Post-MI-sched direction (" << F.getName() << "): " << DirStr
397 << '\n';
398 });
399}
400
402 if (isWave32()) {
403 // Fix implicit $vcc operands after MIParser has verified that they match
404 // the instruction definitions.
405 for (auto &MBB : MF) {
406 for (auto &MI : MBB)
407 InstrInfo.fixImplicitOperands(MI);
408 }
409 }
410}
411
413 return InstrInfo.pseudoToMCOpcode(AMDGPU::V_MAD_F16_e64) != -1;
414}
415
417 return hasVGPRIndexMode() && (!hasMovrel() || EnableVGPRIndexMode);
418}
419
420bool GCNSubtarget::useAA() const { return UseAA; }
421
422unsigned GCNSubtarget::getOccupancyWithNumSGPRs(unsigned SGPRs) const {
424}
425
426unsigned
428 unsigned DynamicVGPRBlockSize) const {
430 DynamicVGPRBlockSize);
431}
432
433unsigned
434GCNSubtarget::getBaseReservedNumSGPRs(const bool HasFlatScratch) const {
436 return 2; // VCC. FLAT_SCRATCH and XNACK are no longer in SGPRs.
437
438 if (HasFlatScratch || HasArchitectedFlatScratch) {
440 return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
442 return 4; // FLAT_SCRATCH, VCC (in that order).
443 }
444
445 if (isXNACKEnabled())
446 return 4; // XNACK, VCC (in that order).
447 return 2; // VCC.
448}
449
454
456 // In principle we do not need to reserve SGPR pair used for flat_scratch if
457 // we know flat instructions do not access the stack anywhere in the
458 // program. For now assume it's needed if we have flat instructions.
459 const bool KernelUsesFlatScratch = hasFlatAddressSpace();
460 return getBaseReservedNumSGPRs(KernelUsesFlatScratch);
461}
462
463std::pair<unsigned, unsigned>
465 unsigned NumSGPRs, unsigned NumVGPRs) const {
466 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
467 // Temporarily check both the attribute and the subtarget feature until the
468 // latter is removed.
469 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
470 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
471
472 auto [MinOcc, MaxOcc] = getOccupancyWithWorkGroupSizes(LDSSize, F);
473 unsigned SGPROcc = getOccupancyWithNumSGPRs(NumSGPRs);
474 unsigned VGPROcc = getOccupancyWithNumVGPRs(NumVGPRs, DynamicVGPRBlockSize);
475
476 // Maximum occupancy may be further limited by high SGPR/VGPR usage.
477 MaxOcc = std::min({MaxOcc, SGPROcc, VGPROcc});
478 return {std::min(MinOcc, MaxOcc), MaxOcc};
479}
480
482 const Function &F, std::pair<unsigned, unsigned> WavesPerEU,
483 unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const {
484 // Compute maximum number of SGPRs function can use using default/requested
485 // minimum number of waves per execution unit.
486 unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
487 unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
488
489 // Check if maximum number of SGPRs was explicitly requested using
490 // "amdgpu-num-sgpr" attribute.
491 unsigned Requested =
492 F.getFnAttributeAsParsedInteger("amdgpu-num-sgpr", MaxNumSGPRs);
493
494 if (Requested != MaxNumSGPRs) {
495 // Make sure requested value does not violate subtarget's specifications.
496 if (Requested && (Requested <= ReservedNumSGPRs))
497 Requested = 0;
498
499 // If more SGPRs are required to support the input user/system SGPRs,
500 // increase to accommodate them.
501 //
502 // FIXME: This really ends up using the requested number of SGPRs + number
503 // of reserved special registers in total. Theoretically you could re-use
504 // the last input registers for these special registers, but this would
505 // require a lot of complexity to deal with the weird aliasing.
506 unsigned InputNumSGPRs = PreloadedSGPRs;
507 if (Requested && Requested < InputNumSGPRs)
508 Requested = InputNumSGPRs;
509
510 // Make sure requested value is compatible with values implied by
511 // default/requested minimum/maximum number of waves per execution unit.
512 if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
513 Requested = 0;
514 if (WavesPerEU.second && Requested &&
515 Requested < getMinNumSGPRs(WavesPerEU.second))
516 Requested = 0;
517
518 if (Requested)
519 MaxNumSGPRs = Requested;
520 }
521
522 if (hasSGPRInitBug())
524
525 return std::min(MaxNumSGPRs - ReservedNumSGPRs, MaxAddressableNumSGPRs);
526}
527
529 const Function &F = MF.getFunction();
533}
534
536 using USI = GCNUserSGPRUsageInfo;
537 // Max number of user SGPRs
538 const unsigned MaxUserSGPRs =
539 USI::getNumUserSGPRForField(USI::PrivateSegmentBufferID) +
540 USI::getNumUserSGPRForField(USI::DispatchPtrID) +
541 USI::getNumUserSGPRForField(USI::QueuePtrID) +
542 USI::getNumUserSGPRForField(USI::KernargSegmentPtrID) +
543 USI::getNumUserSGPRForField(USI::DispatchIdID) +
544 USI::getNumUserSGPRForField(USI::FlatScratchInitID) +
545 USI::getNumUserSGPRForField(USI::ImplicitBufferPtrID);
546
547 // Max number of system SGPRs
548 const unsigned MaxSystemSGPRs = 1 + // WorkGroupIDX
549 1 + // WorkGroupIDY
550 1 + // WorkGroupIDZ
551 1 + // WorkGroupInfo
552 1; // private segment wave byte offset
553
554 // Max number of synthetic SGPRs
555 const unsigned SyntheticSGPRs = 1; // LDSKernelId
556
557 return MaxUserSGPRs + MaxSystemSGPRs + SyntheticSGPRs;
558}
559
564
566 const Function &F, std::pair<unsigned, unsigned> NumVGPRBounds) const {
567 const auto [Min, Max] = NumVGPRBounds;
568
569 // Check if maximum number of VGPRs was explicitly requested using
570 // "amdgpu-num-vgpr" attribute.
571
572 unsigned Requested = F.getFnAttributeAsParsedInteger("amdgpu-num-vgpr", Max);
573 if (Requested != Max && hasGFX90AInsts())
574 Requested *= 2;
575
576 // Make sure requested value is inside the range of possible VGPR usage.
577 return std::clamp(Requested, Min, Max);
578}
579
581 // Temporarily check both the attribute and the subtarget feature, until the
582 // latter is removed.
583 unsigned DynamicVGPRBlockSize = AMDGPU::getDynamicVGPRBlockSize(F);
584 if (DynamicVGPRBlockSize == 0 && isDynamicVGPREnabled())
585 DynamicVGPRBlockSize = getDynamicVGPRBlockSize();
586
587 std::pair<unsigned, unsigned> Waves = getWavesPerEU(F);
588 return getBaseMaxNumVGPRs(
589 F, {getMinNumVGPRs(Waves.second, DynamicVGPRBlockSize),
590 getMaxNumVGPRs(Waves.first, DynamicVGPRBlockSize)});
591}
592
594 return getMaxNumVGPRs(MF.getFunction());
595}
596
597std::pair<unsigned, unsigned>
599 const unsigned MaxVectorRegs = getMaxNumVGPRs(F);
600
601 unsigned MaxNumVGPRs = MaxVectorRegs;
602 unsigned MaxNumAGPRs = 0;
603 unsigned NumArchVGPRs = getAddressableNumArchVGPRs();
604
605 // On GFX90A, the number of VGPRs and AGPRs need not be equal. Theoretically,
606 // a wave may have up to 512 total vector registers combining together both
607 // VGPRs and AGPRs. Hence, in an entry function without calls and without
608 // AGPRs used within it, it is possible to use the whole vector register
609 // budget for VGPRs.
610 //
611 // TODO: it shall be possible to estimate maximum AGPR/VGPR pressure and split
612 // register file accordingly.
613 if (hasGFX90AInsts()) {
614 unsigned MinNumAGPRs = 0;
615 const unsigned TotalNumAGPRs = AMDGPU::AGPR_32RegClass.getNumRegs();
616
617 const std::pair<unsigned, unsigned> DefaultNumAGPR = {~0u, ~0u};
618
619 // TODO: The lower bound should probably force the number of required
620 // registers up, overriding amdgpu-waves-per-eu.
621 std::tie(MinNumAGPRs, MaxNumAGPRs) =
622 AMDGPU::getIntegerPairAttribute(F, "amdgpu-agpr-alloc", DefaultNumAGPR,
623 /*OnlyFirstRequired=*/true);
624
625 if (MinNumAGPRs == DefaultNumAGPR.first) {
626 // Default to splitting half the registers if AGPRs are required.
627 MinNumAGPRs = MaxNumAGPRs = MaxVectorRegs / 2;
628 } else {
629 // Align to accum_offset's allocation granularity.
630 MinNumAGPRs = alignTo(MinNumAGPRs, 4);
631
632 MinNumAGPRs = std::min(MinNumAGPRs, TotalNumAGPRs);
633 }
634
635 // Clamp values to be inbounds of our limits, and ensure min <= max.
636
637 MaxNumAGPRs = std::min(std::max(MinNumAGPRs, MaxNumAGPRs), MaxVectorRegs);
638 MinNumAGPRs = std::min({MinNumAGPRs, TotalNumAGPRs, MaxNumAGPRs});
639
640 MaxNumVGPRs = std::min(MaxVectorRegs - MinNumAGPRs, NumArchVGPRs);
641 MaxNumAGPRs = std::min(MaxVectorRegs - MaxNumVGPRs, MaxNumAGPRs);
642
643 assert(MaxNumVGPRs + MaxNumAGPRs <= MaxVectorRegs &&
644 MaxNumAGPRs <= TotalNumAGPRs && MaxNumVGPRs <= NumArchVGPRs &&
645 "invalid register counts");
646 } else if (hasMAIInsts()) {
647 // On gfx908 the number of AGPRs always equals the number of VGPRs.
648 MaxNumAGPRs = MaxNumVGPRs = MaxVectorRegs;
649 }
650
651 return std::pair(MaxNumVGPRs, MaxNumAGPRs);
652}
653
654// Check which source operand UseOpIdx points to and return a pointer to the
655// operand of the corresponding source modifier.
656// Return nullptr if UseOpIdx either doesn't point to src0/1/2 or if there is no
657// operand for the corresponding source modifier.
658static const MachineOperand *
660 const SIInstrInfo &InstrInfo) {
661 AMDGPU::OpName UseName =
662 AMDGPU::getOperandIdxName(UseI.getOpcode(), UseOpIdx);
663 switch (UseName) {
664 case AMDGPU::OpName::src0:
665 return InstrInfo.getNamedOperand(UseI, AMDGPU::OpName::src0_modifiers);
666 case AMDGPU::OpName::src1:
667 return InstrInfo.getNamedOperand(UseI, AMDGPU::OpName::src1_modifiers);
668 case AMDGPU::OpName::src2:
669 return InstrInfo.getNamedOperand(UseI, AMDGPU::OpName::src2_modifiers);
670 default:
671 return nullptr;
672 }
673}
674
675// Get the subreg idx of the subreg that is used by the given instruction
676// operand, considering the given op_sel modifier.
677// Return 0 if the whole register is used or as a conservative fallback.
679 const SIInstrInfo &InstrInfo,
680 const MachineInstr &I,
681 const MachineOperand &Op) {
682 if (!InstrInfo.isVOP3P(I) || InstrInfo.isWMMA(I) || InstrInfo.isSWMMAC(I))
683 return AMDGPU::NoSubRegister;
684
685 const MachineOperand *OpMod =
686 getVOP3PSourceModifierFromOpIdx(I, Op.getOperandNo(), InstrInfo);
687 if (!OpMod)
688 return AMDGPU::NoSubRegister;
689
690 // Note: the FMA_MIX* and MAD_MIX* instructions have different semantics for
691 // the op_sel and op_sel_hi source modifiers:
692 // - op_sel: selects low/high operand bits as input to the operation;
693 // has only meaning for 16-bit source operands
694 // - op_sel_hi: specifies the size of the source operands (16 or 32 bits);
695 // a value of 0 indicates 32 bit, 1 indicates 16 bit
696 // For the other VOP3P instructions, the semantics are:
697 // - op_sel: selects low/high operand bits as input to the operation which
698 // results in the lower-half of the destination
699 // - op_sel_hi: selects the low/high operand bits as input to the operation
700 // which results in the higher-half of the destination
701 int64_t OpSel = OpMod->getImm() & SISrcMods::OP_SEL_0;
702 int64_t OpSelHi = OpMod->getImm() & SISrcMods::OP_SEL_1;
703
704 // Check if all parts of the register are being used (= op_sel and op_sel_hi
705 // differ for VOP3P or op_sel_hi=0 for VOP3PMix). In that case we can return
706 // early.
707 if ((!InstrInfo.isVOP3PMix(I) && (!OpSel || !OpSelHi) &&
708 (OpSel || OpSelHi)) ||
709 (InstrInfo.isVOP3PMix(I) && !OpSelHi))
710 return AMDGPU::NoSubRegister;
711
712 const MachineRegisterInfo &MRI = I.getParent()->getParent()->getRegInfo();
713 const TargetRegisterClass *RC = TRI.getRegClassForOperandReg(MRI, Op);
714
715 if (unsigned SubRegIdx = OpSel ? AMDGPU::sub1 : AMDGPU::sub0;
716 TRI.getSubClassWithSubReg(RC, SubRegIdx) == RC)
717 return SubRegIdx;
718 if (unsigned SubRegIdx = OpSel ? AMDGPU::hi16 : AMDGPU::lo16;
719 TRI.getSubClassWithSubReg(RC, SubRegIdx) == RC)
720 return SubRegIdx;
721
722 return AMDGPU::NoSubRegister;
723}
724
725Register GCNSubtarget::getRealSchedDependency(const MachineInstr &DefI,
726 int DefOpIdx,
727 const MachineInstr &UseI,
728 int UseOpIdx) const {
729 const SIRegisterInfo *TRI = getRegisterInfo();
730 const MachineOperand &DefOp = DefI.getOperand(DefOpIdx);
731 const MachineOperand &UseOp = UseI.getOperand(UseOpIdx);
732 Register DefReg = DefOp.getReg();
733 Register UseReg = UseOp.getReg();
734
735 // If the registers aren't restricted to a sub-register, there is no point in
736 // further analysis. This check makes only sense for virtual registers because
737 // physical registers may form a tuple and thus be part of a superregister
738 // although they are not a subregister themselves (vgpr0 is a "subreg" of
739 // vgpr0_vgpr1 without being a subreg in itself).
740 unsigned DefSubRegIdx = DefOp.getSubReg();
741 if (DefReg.isVirtual() && DefSubRegIdx == AMDGPU::NoSubRegister)
742 return DefReg;
743 unsigned UseSubRegIdx = getEffectiveSubRegIdx(*TRI, InstrInfo, UseI, UseOp);
744 if (UseReg.isVirtual() && UseSubRegIdx == AMDGPU::NoSubRegister)
745 return DefReg;
746
747 if (!TRI->checkSubRegInterference(DefReg, DefSubRegIdx, UseReg, UseSubRegIdx))
748 return Register(); // No real dependency
749
750 // UseReg might be smaller or larger than DefReg, depending on the subreg and
751 // on whether DefReg is a subreg, too. -> Find the smaller one. This does not
752 // apply to virtual registers because we cannot construct a subreg for them.
753 if (DefReg.isVirtual())
754 return DefReg;
755 MCRegister DefMCReg =
756 DefSubRegIdx ? TRI->getSubReg(DefReg, DefSubRegIdx) : DefReg.asMCReg();
757 MCRegister UseMCReg =
758 UseSubRegIdx ? TRI->getSubReg(UseReg, UseSubRegIdx) : UseReg.asMCReg();
759 return TRI->isSubRegisterEq(DefMCReg, UseMCReg) ? UseMCReg : DefMCReg;
760}
761
763 SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep,
764 const TargetSchedModel *SchedModel) const {
765 if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() || !Def->isInstr() ||
766 !Use->isInstr())
767 return;
768
769 MachineInstr *DefI = Def->getInstr();
770 MachineInstr *UseI = Use->getInstr();
771
772 // Check for false latency on $tensorcnt / $asynccnt dependencies
773 if (Dep.getReg() == AMDGPU::TENSORcnt || Dep.getReg() == AMDGPU::ASYNCcnt) {
774 unsigned UseOp = UseI->getOpcode();
775 // Do not adjust latency for load->s_wait
776 bool IsBarrierCase =
777 InstrInfo.isLDSDMA(*DefI) &&
778 (UseOp == AMDGPU::S_WAIT_TENSORCNT || UseOp == AMDGPU::S_WAIT_ASYNCCNT);
779 if (!IsBarrierCase) {
780 Dep.setLatency(1);
781 return;
782 }
783 }
784
785 if (Register Reg = getRealSchedDependency(*DefI, DefOpIdx, *UseI, UseOpIdx)) {
786 Dep.setReg(Reg);
787 } else {
788 Dep = SDep(Def, SDep::Artificial);
789 return; // This is not a data dependency anymore.
790 }
791
792 if (DefI->isBundle()) {
794 auto Reg = Dep.getReg();
797 unsigned Lat = 0;
798 for (++I; I != E && I->isBundledWithPred(); ++I) {
799 if (I->isMetaInstruction())
800 continue;
801 if (I->modifiesRegister(Reg, TRI))
802 Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *I);
803 else if (Lat)
804 --Lat;
805 }
806 Dep.setLatency(Lat);
807 } else if (UseI->isBundle()) {
809 auto Reg = Dep.getReg();
812 unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
813 for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
814 if (I->isMetaInstruction())
815 continue;
816 if (I->readsRegister(Reg, TRI))
817 break;
818 --Lat;
819 }
820 Dep.setLatency(Lat);
821 } else if (Dep.getLatency() == 0 && Dep.getReg() == AMDGPU::VCC_LO) {
822 // Work around the fact that SIInstrInfo::fixImplicitOperands modifies
823 // implicit operands which come from the MCInstrDesc, which can fool
824 // ScheduleDAGInstrs::addPhysRegDataDeps into treating them as implicit
825 // pseudo operands.
826 Dep.setLatency(InstrInfo.getSchedModel().computeOperandLatency(
827 DefI, DefOpIdx, UseI, UseOpIdx));
828 }
829}
830
833 return 0; // Not MIMG encoding.
834
835 if (NSAThreshold.getNumOccurrences() > 0)
836 return std::max(NSAThreshold.getValue(), 2u);
837
839 "amdgpu-nsa-threshold", -1);
840 if (Value > 0)
841 return std::max(Value, 2);
842
843 return NSAThreshold;
844}
845
847 const GCNSubtarget &ST)
848 : ST(ST) {
849 const CallingConv::ID CC = F.getCallingConv();
850 const bool IsKernel =
852
853 if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
854 KernargSegmentPtr = true;
855
856 bool IsAmdHsaOrMesa = ST.isAmdHsaOrMesa(F);
857 if (IsAmdHsaOrMesa && !ST.hasFlatScratchEnabled())
858 PrivateSegmentBuffer = true;
859 else if (ST.isMesaGfxShader(F))
860 ImplicitBufferPtr = true;
861
862 if (!AMDGPU::isGraphics(CC)) {
863 if (!F.hasFnAttribute("amdgpu-no-dispatch-ptr"))
864 DispatchPtr = true;
865
866 // FIXME: Can this always be disabled with < COv5?
867 if (!F.hasFnAttribute("amdgpu-no-queue-ptr"))
868 QueuePtr = true;
869
870 if (!F.hasFnAttribute("amdgpu-no-dispatch-id"))
871 DispatchID = true;
872 }
873
874 if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
875 (IsAmdHsaOrMesa || ST.hasFlatScratchEnabled()) &&
876 // FlatScratchInit cannot be true for graphics CC if
877 // hasFlatScratchEnabled() is false.
878 (ST.hasFlatScratchEnabled() ||
879 (!AMDGPU::isGraphics(CC) &&
880 !F.hasFnAttribute("amdgpu-no-flat-scratch-init"))) &&
881 !ST.hasArchitectedFlatScratch()) {
882 FlatScratchInit = true;
883 }
884
886 NumUsedUserSGPRs += getNumUserSGPRForField(ImplicitBufferPtrID);
887
890
891 if (hasDispatchPtr())
892 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchPtrID);
893
894 if (hasQueuePtr())
895 NumUsedUserSGPRs += getNumUserSGPRForField(QueuePtrID);
896
898 NumUsedUserSGPRs += getNumUserSGPRForField(KernargSegmentPtrID);
899
900 if (hasDispatchID())
901 NumUsedUserSGPRs += getNumUserSGPRForField(DispatchIdID);
902
903 if (hasFlatScratchInit())
904 NumUsedUserSGPRs += getNumUserSGPRForField(FlatScratchInitID);
905
907 NumUsedUserSGPRs += getNumUserSGPRForField(PrivateSegmentSizeID);
908}
909
911 assert(NumKernargPreloadSGPRs + NumSGPRs <= AMDGPU::getMaxNumUserSGPRs(ST));
912 NumKernargPreloadSGPRs += NumSGPRs;
913 NumUsedUserSGPRs += NumSGPRs;
914}
915
917 return AMDGPU::getMaxNumUserSGPRs(ST) - NumUsedUserSGPRs;
918}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static cl::opt< bool > UseAA("aarch64-use-aa", cl::init(true), cl::desc("Enable the use of AA during codegen."))
This file describes how to lower LLVM calls to machine code calls.
This file declares the targeting of the InstructionSelector class for AMDGPU.
This file declares the targeting of the Machinelegalizer class for AMDGPU.
This file declares the targeting of the RegisterBankInfo class for AMDGPU.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock & MBB
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static cl::opt< unsigned > NSAThreshold("amdgpu-nsa-threshold", cl::desc("Number of addresses from which to enable MIMG NSA."), cl::init(2), cl::Hidden)
static cl::opt< bool > EnableVGPRIndexMode("amdgpu-vgpr-index-mode", cl::desc("Use GPR indexing mode instead of movrel for vector indexing"), cl::init(false))
static cl::opt< bool > UseAA("amdgpu-use-aa-in-codegen", cl::desc("Enable the use of AA during codegen."), cl::init(true))
static const MachineOperand * getVOP3PSourceModifierFromOpIdx(const MachineInstr &UseI, int UseOpIdx, const SIInstrInfo &InstrInfo)
static unsigned getEffectiveSubRegIdx(const SIRegisterInfo &TRI, const SIInstrInfo &InstrInfo, const MachineInstr &I, const MachineOperand &Op)
AMD GCN specific subclass of TargetSubtarget.
static Register UseReg(const MachineOperand &MO)
IRTranslator LLVM IR MI
This file describes how to lower LLVM inline asm to machine code INLINEASM.
static bool hasFeature(StringRef Feature, const FeatureBitset &FeatureBits, ArrayRef< SubtargetFeatureKV > ProcFeatures)
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
if(PassOpts->AAPipeline)
This file defines the SmallString class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
std::pair< unsigned, unsigned > getWavesPerEU(const Function &F) const
std::pair< unsigned, unsigned > getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getWavefrontSizeLog2() const
AMDGPUSubtarget(const Triple &TT)
unsigned AddressableLocalMemorySize
Functions, function parameters, and return types can have attributes to indicate how they should be t...
Definition Attributes.h:105
LLVM_ABI StringRef getValueAsString() const
Return the attribute's value as a string.
bool isValid() const
Return true if the attribute is any kind of attribute.
Definition Attributes.h:261
Diagnostic information for optimization failures.
Diagnostic information for unsupported feature in backend.
uint64_t getFnAttributeAsParsedInteger(StringRef Kind, uint64_t Default=0) const
For a string attribute Kind, parse attribute as an integer.
Definition Function.cpp:770
bool hasFlat() const
InstrItineraryData InstrItins
bool useVGPRIndexMode() const
void mirFileLoaded(MachineFunction &MF) const override
unsigned MaxPrivateElementSize
unsigned getAddressableNumArchVGPRs() const
unsigned getMinNumSGPRs(unsigned WavesPerEU) const
void ParseSubtargetFeatures(StringRef CPU, StringRef TuneCPU, StringRef FS)
GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const GCNTargetMachine &TM, bool BufferOOBRelaxed=false, bool TBufferOOBRelaxed=false)
unsigned getConstantBusLimit(unsigned Opcode) const
const InstrItineraryData * getInstrItineraryData() const override
void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx, SDep &Dep, const TargetSchedModel *SchedModel) const override
void overridePostRASchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
Align getStackAlignment() const
const bool BufferOOBRelaxed
bool hasMadF16() const
unsigned getMinNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
bool isDynamicVGPREnabled() const
const SIRegisterInfo * getRegisterInfo() const override
unsigned getBaseMaxNumVGPRs(const Function &F, std::pair< unsigned, unsigned > NumVGPRBounds) const
bool zeroesHigh16BitsOfDest(unsigned Opcode) const
Returns whether the result of this instruction with a 16-bit result returned in a 32-bit register implicit...
unsigned getBaseMaxNumSGPRs(const Function &F, std::pair< unsigned, unsigned > WavesPerEU, unsigned PreloadedSGPRs, unsigned ReservedNumSGPRs) const
unsigned getMaxNumPreloadedSGPRs() const
GCNSubtarget & initializeSubtargetDependencies(const Triple &TT, StringRef GPU, StringRef FS)
void overrideSchedPolicy(MachineSchedPolicy &Policy, const SchedRegion &Region) const override
std::pair< unsigned, unsigned > computeOccupancy(const Function &F, unsigned LDSSize=0, unsigned NumSGPRs=0, unsigned NumVGPRs=0) const
Subtarget's minimum/maximum occupancy, in number of waves per EU, that can be achieved when the only ...
unsigned getMaxNumVGPRs(unsigned WavesPerEU, unsigned DynamicVGPRBlockSize) const
AMDGPU::TargetID TargetID
const SITargetLowering * getTargetLowering() const override
unsigned getNSAThreshold(const MachineFunction &MF) const
unsigned getReservedNumSGPRs(const MachineFunction &MF) const
const bool TBufferOOBRelaxed
bool useAA() const override
bool isWave32() const
unsigned getOccupancyWithNumVGPRs(unsigned VGPRs, unsigned DynamicVGPRBlockSize) const
Return the maximum number of waves per SIMD for kernels using VGPRs VGPRs.
unsigned InstCacheLineSize
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const
Return the maximum number of waves per SIMD for kernels using SGPRs SGPRs.
Generation getGeneration() const
unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const
std::pair< unsigned, unsigned > getMaxNumVectorRegs(const Function &F) const
Return a pair of maximum numbers of VGPRs and AGPRs that meet the number of waves per execution unit ...
bool isXNACKEnabled() const
unsigned getBaseReservedNumSGPRs(const bool HasFlatScratch) const
bool hasAddr64() const
unsigned getDynamicVGPRBlockSize() const
void checkSubtargetFeatures(const Function &F) const
Diagnose inconsistent subtarget features before attempting to codegen function F.
~GCNSubtarget() override
const SelectionDAGTargetInfo * getSelectionDAGInfo() const override
static unsigned getNumUserSGPRForField(UserSGPRID ID)
void allocKernargPreloadSGPRs(unsigned NumSGPRs)
bool hasPrivateSegmentBuffer() const
GCNUserSGPRUsageInfo(const Function &F, const GCNSubtarget &ST)
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Instructions::const_iterator const_instr_iterator
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
const MachineBasicBlock * getParent() const
bool isBundle() const
const MachineOperand & getOperand(unsigned i) const
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
MCRegister asMCReg() const
Utility to check-convert this value to a MCRegister.
Definition Register.h:107
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
Scheduling dependency.
Definition ScheduleDAG.h:51
Kind getKind() const
Returns an enum value representing the kind of the dependence.
@ Data
Regular data dependence (aka true-dependence).
Definition ScheduleDAG.h:55
void setLatency(unsigned Lat)
Sets the latency for this edge.
@ Artificial
Arbitrary strong DAG edge (no real dependence).
Definition ScheduleDAG.h:74
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must e...
Register getReg() const
Returns the register associated with this edge.
void setReg(Register Reg)
Assigns the associated register for this edge.
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
std::pair< unsigned, unsigned > getWavesPerEU() const
GCNUserSGPRUsageInfo & getUserSGPRInfo()
Scheduling unit. This is a node in the scheduling DAG.
Targets can subclass this to parameterize the SelectionDAG lowering and instruction selection process...
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
Represent a constant reference to a string.
Definition StringRef.h:56
Information about stack frame layout on the target.
Provide an instruction scheduling machine model to CodeGen passes.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
self_iterator getIterator()
Definition ilist_node.h:123
unsigned getNumWavesPerEUWithNumVGPRs(const MCSubtargetInfo &STI, unsigned NumVGPRs, unsigned DynamicVGPRBlockSize)
unsigned getEUsPerCU(const MCSubtargetInfo &STI)
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs, unsigned MaxWaves, unsigned TotalNumSGPRs, unsigned Granule, unsigned TrapReserve)
unsigned getMaxWavesPerEU(const MCSubtargetInfo &STI)
unsigned getLocalMemorySize(const MCSubtargetInfo &STI)
StringRef getSchedStrategy(const Function &F)
unsigned getMaxNumUserSGPRs(const MCSubtargetInfo &STI)
unsigned getLdsDwGranularity(const MCSubtargetInfo &ST)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
unsigned getDynamicVGPRBlockSize(const Function &F)
std::pair< unsigned, unsigned > getIntegerPairAttribute(const Function &F, StringRef Name, std::pair< unsigned, unsigned > Default, bool OnlyFirstRequired)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ SPIR_KERNEL
Used for SPIR kernel functions.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Define a generic scheduling policy for targets that don't provide their own MachineSchedStrategy.
bool ShouldTrackLaneMasks
Track LaneMasks to allow reordering of independent subregister writes of the same vreg.
A region of an MBB for scheduling.