LLVM 23.0.0git
AMDGPUCoExecSchedStrategy.h
Go to the documentation of this file.
1//===- AMDGPUCoExecSchedStrategy.h - CoExec Scheduling Strategy -*- C++ -*-===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Coexecution-focused scheduling strategy for AMDGPU.
11//
12//===----------------------------------------------------------------------===//
13
14#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
15#define LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
16
17#include "GCNSchedStrategy.h"
19
20namespace llvm {
21
22namespace AMDGPU {
23
24//===----------------------------------------------------------------------===//
25// Instruction Flavor Classification
26//===----------------------------------------------------------------------===//
27
29 WMMA, // WMMA/MFMA matrix operations
30 SingleCycleVALU, // Single-cycle VALU (not TRANS32, not multi-cycle CVT)
31 TRANS, // Transcendental ops (v_exp, v_log, etc.)
32 MultiCycleVALU, // VALU instructions with repeat rate > 1
33 VMEM, // FLAT/GLOBAL memory operations
34 DS, // LDS/GDS operations
35 SALU, // Scalar ALU
36 DMA, // Tensor DMA operations
37 Fence, // Fences and waits
38 Other, // Everything else
40};
41
43 switch (F) {
45 return "WMMA";
47 return "VALU(1c)";
49 return "TRANS";
51 return "VALU(Nc)";
53 return "VMEM";
55 return "DS";
57 return "SALU";
59 return "DMA";
61 return "Fence";
63 return "Other";
65 return "???";
66 }
67 llvm_unreachable("Unknown InstructionFlavor");
68}
69
71 switch (F) {
73 return "W";
75 return "V";
77 return "T";
79 return "C";
81 return "M";
83 return "D";
85 return "S";
87 return "X";
89 return "F";
91 return "O";
93 return "?";
94 }
95 llvm_unreachable("Unknown InstructionFlavor");
96}
97
98InstructionFlavor classifyFlavor(const MachineInstr &MI,
99 const SIInstrInfo &SII);
100
102
103namespace FlavorGroups {
113inline FlavorGroup all() {
115 for (unsigned I = 0;
117 G.push_back(static_cast<InstructionFlavor>(I));
118 return G;
119}
120} // namespace FlavorGroups
121
122/// AMDGPU-specific scheduling decision reasons. These provide more granularity
123/// than the generic CandReason enum for debugging purposes.
126 CritResourceBalance, // tryCriticalResource chose based on resource pressure
127 CritResourceDep, // tryCriticalResourceDependency chose based on enabling
129};
130
132 switch (R) {
134 return "None";
136 return "CritResource";
138 return "CritResourceDep";
140 return "???";
141 }
142 llvm_unreachable("Unknown AMDGPUSchedReason");
143}
144
145} // End namespace AMDGPU
146
147//===----------------------------------------------------------------------===//
148// Hardware Unit Information
149//===----------------------------------------------------------------------===//
150
151/// HardwareUnitInfo is a wrapper class which maps to some real hardware
152/// resource. This is used to model hardware resource pressure per region, and
153/// guide scheduling heuristics.
155private:
156 /// PrioritySUs maintains a list of the SUs we want to prioritize scheduling
157 /// for this HardwareUnit. This is used for agreement between
158 /// tryCriticalResourceDependency and tryCriticalResource: we schedule the
159 /// dependencies for a SU on critical resource, then schedule that same SU on
160 /// the critical resource. This agreement results in shorter live ranges and
161 /// more regular HardwareUnit access patterns. SUs are prioritized based on
162 /// depth for top-down scheduling.
163 SmallSetVector<SUnit *, 16> PrioritySUs;
164 /// All the SUs in the region that consume this resource
166 /// The total number of busy cycles for this HardwareUnit for a given region.
167 unsigned TotalCycles = 0;
168 // InstructionFlavor mapping
170 // Whether or not instructions on this HardwareUnit may produce a window in
171 // which instructions in other HardwareUnits can coexecute. For example, WMMA
172 // / MFMA instructions may take multiple cycles, which may be overlapped with
173 // instructions on other HardwareUnits
174 bool ProducesCoexecWindow = false;
175
176public:
178
/// \returns the number of SUs currently held in AllSUs. Const-qualified so it
/// can be called on a const HardwareUnitInfo, like the other accessors.
179 unsigned size() const { return AllSUs.size(); }
180
/// \returns the accumulated TotalCycles for this HardwareUnit. Const-qualified
/// so it can be called on a const HardwareUnitInfo, like the other accessors.
181 unsigned getTotalCycles() const { return TotalCycles; }
182
183 void setType(unsigned TheType) {
185 Type = (AMDGPU::InstructionFlavor)(TheType);
186 }
187
/// \returns the InstructionFlavor stored in Type for this HardwareUnit.
188 AMDGPU::InstructionFlavor getType() const { return Type; }
189
/// \returns the current value of the ProducesCoexecWindow flag.
190 bool producesCoexecWindow() const { return ProducesCoexecWindow; }
191
/// Set the ProducesCoexecWindow flag to \p Val.
192 void setProducesCoexecWindow(bool Val) { ProducesCoexecWindow = Val; }
193
/// \returns true if \p SU is currently present in AllSUs.
194 bool contains(SUnit *SU) const { return AllSUs.contains(SU); }
195
196 /// \returns the SUnit with higher priority between \p SU and \p Other, or
197 /// nullptr if there is no difference in priority between them. This
198 /// method looks through the PrioritySUs to determine if one SU is more
199 /// prioritized than the other. If neither is in the PrioritySUs list, then
200 /// neither has priority over the other.
202 for (auto *SUOrder : PrioritySUs) {
203 if (SUOrder == SU)
204 return SU;
205
206 if (SUOrder == Other)
207 return Other;
208 }
209 return nullptr;
210 }
211
212 void reset() {
213 AllSUs.clear();
214 PrioritySUs.clear();
215 TotalCycles = 0;
217 ProducesCoexecWindow = false;
218 }
219
220 /// \returns the next SU in PrioritySUs that is not ready. If \p LookDeep is
221 /// set, we will look beyond the PrioritySUs (if all the PrioritySUs are
222 /// ready) to AllSUs to attempt to find a target SU. When looking through
223 /// AllSUs we pick the target SU by minimal depth for top-down
224 /// scheduling. getNextTargetSU is useful for determining which SU on this
225 /// HardwareUnit we are trying to schedule - this info helps us determine
226 /// which dependencies to schedule. LookDeep is useful if the dependencies are
227 /// long latency (e.g. memory instructions). If we have many long latency
228 /// dependencies, it is beneficial to enable SUs multiple levels ahead.
229 SUnit *getNextTargetSU(bool LookDeep = false) const;
230 /// Insert the \p SU into the AllSUs and account its \p BlockingCycles into
231 /// the TotalCycles. This maintains the list of PrioritySUs.
232 void insert(SUnit *SU, unsigned BlockingCycles);
233 /// Update the state for \p SU being scheduled by removing it from the AllSUs
234 /// and subtracting its \p BlockingCycles from the TotalCycles. This maintains
235 /// the list of PrioritySUs.
236 void markScheduled(SUnit *SU, unsigned BlockingCycles);
237};
238
239//===----------------------------------------------------------------------===//
240// Candidate Heuristics
241//===----------------------------------------------------------------------===//
242
243/// CandidateHeuristics contains state and implementations to facilitate making
244/// per instruction scheduling decisions; it contains methods used in
245/// tryCandidate to decide which instruction to schedule next.
247protected:
253
254 /// Walk over the region and collect total usage per HardwareUnit
255 void collectHWUIPressure();
256
257 /// Compute the blocking cycles for the appropriate HardwareUnit given an \p
258 /// SU
259 unsigned getHWUICyclesForInst(SUnit *SU);
260
261 /// Given a \p Flavor , find the corresponding HardwareUnit. \returns the
262 /// mapped HardwareUnit.
264
265public:
267
269 const TargetRegisterInfo *TRI);
270
271 /// Update the state to reflect that \p SU is going to be scheduled.
272 void updateForScheduling(SUnit *SU);
273
274 /// Sort the HWUInfo vector. After sorting, the HardwareUnits that are highest
275 /// priority are first. Priority is determined by maximizing coexecution and
276 /// keeping the critical HardwareUnit busy.
277 void sortHWUIResources();
278
279 /// Check for critical resource consumption. Prefer the candidate that uses
280 /// the most prioritized HardwareUnit. If both candidates use the same
281 /// HardwareUnit, prefer the candidate with higher priority on that
282 /// HardwareUnit.
285 SchedBoundary *Zone) const;
286
287 /// Check for dependencies of instructions that use prioritized HardwareUnits.
288 /// Prefer the candidate that is a dependency of an instruction that uses the
289 /// most prioritized HardwareUnit. If both candidates enable the same
290 /// HardwareUnit, prefer the candidate that enables the higher priority
291 /// instruction on that HardwareUnit.
292 bool
295 SchedBoundary *Zone) const;
296
297 void dumpRegionSummary();
298};
299
301protected:
303 SchedBoundary &Zone) const;
306
307#ifndef NDEBUG
308 void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand);
309#endif
310
312 SchedBoundary *Zone);
313 void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy,
314 const RegPressureTracker &RPTracker,
315 SchedCandidate &Cand, bool &PickedPending,
316 bool IsBottomUp);
317
318public:
320
323 unsigned NumRegionInstrs) override;
324 void initialize(ScheduleDAGMI *DAG) override;
325 SUnit *pickNode(bool &IsTopNode) override;
326 void schedNode(SUnit *SU, bool IsTopNode) override;
327};
328
331
332} // End namespace llvm
333
334#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUCOEXECSCHEDSTRATEGY_H
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
Register const TargetRegisterInfo * TRI
bool tryEffectiveStall(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary &Zone) const
void initPolicy(MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End, unsigned NumRegionInstrs) override
Optionally override the per-region scheduling policy.
SUnit * pickNode(bool &IsTopNode) override
Pick the next node to schedule, or return NULL.
void pickNodeFromQueue(SchedBoundary &Zone, const CandPolicy &ZonePolicy, const RegPressureTracker &RPTracker, SchedCandidate &Cand, bool &PickedPending, bool IsBottomUp)
void initialize(ScheduleDAGMI *DAG) override
Initialize the strategy after building the DAG for a new region.
void schedNode(SUnit *SU, bool IsTopNode) override
Notify MachineSchedStrategy that ScheduleDAGMI has scheduled an instruction and updated scheduled/rem...
AMDGPUCoExecSchedStrategy(const MachineSchedContext *C)
void dumpPickSummary(SUnit *SU, bool IsTopNode, SchedCandidate &Cand)
bool tryCandidateCoexec(SchedCandidate &Cand, SchedCandidate &TryCand, SchedBoundary *Zone)
CandidateHeuristics contains state and implementations to facilitate making per instruction schedulin...
void updateForScheduling(SUnit *SU)
Update the state to reflect that SU is going to be scheduled.
HardwareUnitInfo * getHWUIFromFlavor(AMDGPU::InstructionFlavor Flavor)
Given a Flavor , find the corresponding HardwareUnit.
void sortHWUIResources()
Sort the HWUInfo vector.
bool tryCriticalResource(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for critical resource consumption.
bool tryCriticalResourceDependency(GenericSchedulerBase::SchedCandidate &TryCand, GenericSchedulerBase::SchedCandidate &Cand, SchedBoundary *Zone) const
Check for dependencies of instructions that use prioritized HardwareUnits.
SmallVector< HardwareUnitInfo, 8 > HWUInfo
const TargetSchedModel * SchedModel
void collectHWUIPressure()
Walk over the region and collect total usage per HardwareUnit.
void initialize(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel, const TargetRegisterInfo *TRI)
unsigned getHWUICyclesForInst(SUnit *SU)
Compute the blocking cycles for the appropriate HardwareUnit given an SU.
GCNSchedStrategy(const MachineSchedContext *C)
ScheduleDAGMILive * DAG
HardwareUnitInfo is a wrapper class which maps to some real hardware resource.
void markScheduled(SUnit *SU, unsigned BlockingCycles)
Update the state for SU being scheduled by removing it from the AllSus and reducing its BlockingCycle...
SUnit * getNextTargetSU(bool LookDeep=false) const
void insert(SUnit *SU, unsigned BlockingCycles)
Insert the SU into the AllSUs and account its BlockingCycles into the TotalCycles.
AMDGPU::InstructionFlavor getType() const
SUnit * getHigherPriority(SUnit *SU, SUnit *Other) const
MachineInstrBundleIterator< MachineInstr > iterator
Representation of each machine instruction.
Track the current register pressure at some position in the instruction stream, and remember the high...
Scheduling unit. This is a node in the scheduling DAG.
Each Scheduling boundary is associated with ready queues.
A ScheduleDAG for scheduling lists of MachineInstr.
ScheduleDAGMI is an implementation of ScheduleDAGInstrs that simply schedules machine instructions ac...
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Provide an instruction scheduling machine model to CodeGen passes.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
FlavorGroup individual(InstructionFlavor F)
StringRef getFlavorShortName(InstructionFlavor F)
AMDGPUSchedReason
AMDGPU-specific scheduling decision reasons.
InstructionFlavor classifyFlavor(const MachineInstr &MI, const SIInstrInfo &SII)
StringRef getReasonName(AMDGPUSchedReason R)
SmallVector< InstructionFlavor, 4 > FlavorGroup
StringRef getFlavorName(InstructionFlavor F)
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
This is an optimization pass for GlobalISel generic memory operations.
ScheduleDAGInstrs * createGCNNoopPostMachineScheduler(MachineSchedContext *C)
@ Other
Any other memory.
Definition ModRef.h:68
ScheduleDAGInstrs * createGCNCoExecMachineScheduler(MachineSchedContext *C)
Policy for scheduling the next instruction in the candidate's zone.
Store the state used by GenericScheduler heuristics, required for the lifetime of one invocation of p...
MachineSchedContext provides enough context from the MachineScheduler pass for the target to instanti...