LLVM 22.0.0git
AMDGPUBarrierLatency.cpp
Go to the documentation of this file.
1//===--- AMDGPUBarrierLatency.cpp - AMDGPU Barrier Latency ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file This file contains a DAG scheduling mutation to add latency to:
10/// 1. Barrier edges between ATOMIC_FENCE instructions and preceding
11/// memory accesses potentially affected by the fence.
12/// This encourages the scheduling of more instructions before
13/// ATOMIC_FENCE instructions. ATOMIC_FENCE instructions may
14/// introduce wait counting or indicate an impending S_BARRIER
15/// wait. Having more instructions in-flight across these
16/// constructs improves latency hiding.
17/// 2. Barrier edges from S_BARRIER_SIGNAL to S_BARRIER_WAIT.
18/// This encourages independent work to be scheduled between
19/// signal and wait, hiding barrier synchronization latency.
20//
21//===----------------------------------------------------------------------===//
22
25#include "SIInstrInfo.h"
28
29using namespace llvm;
30
32 "amdgpu-barrier-signal-wait-latency",
33 cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT "
34 "to encourage scheduling independent work between them"),
35 cl::init(16), cl::Hidden);
36
37namespace {
38
39class BarrierLatency : public ScheduleDAGMutation {
40private:
41 SmallSet<SyncScope::ID, 4> IgnoredScopes;
42
43public:
44 BarrierLatency(MachineFunction *MF) {
45 LLVMContext &Context = MF->getFunction().getContext();
46 IgnoredScopes.insert(SyncScope::SingleThread);
47 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront"));
48 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("wavefront-one-as"));
49 IgnoredScopes.insert(Context.getOrInsertSyncScopeID("singlethread-one-as"));
50 }
51 void apply(ScheduleDAGInstrs *DAG) override;
52};
53
54void addLatencyToEdge(SDep &PredDep, SUnit &SU, unsigned Latency) {
55 SUnit *PredSU = PredDep.getSUnit();
56 SDep ForwardD = PredDep;
57 ForwardD.setSUnit(&SU);
58 for (SDep &SuccDep : PredSU->Succs) {
59 if (SuccDep == ForwardD) {
60 SuccDep.setLatency(SuccDep.getLatency() + Latency);
61 break;
62 }
63 }
64 PredDep.setLatency(PredDep.getLatency() + Latency);
65 PredSU->setDepthDirty();
66 SU.setDepthDirty();
67}
68
69void BarrierLatency::apply(ScheduleDAGInstrs *DAG) {
70 const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
71 constexpr unsigned FenceLatency = 2000;
72 const unsigned BarrierSignalWaitLatency = BarrierSignalWaitLatencyOpt;
73
74 for (SUnit &SU : DAG->SUnits) {
75 const MachineInstr *MI = SU.getInstr();
76 unsigned Op = MI->getOpcode();
77
78 if (Op == AMDGPU::ATOMIC_FENCE) {
79 // Update latency on barrier edges of ATOMIC_FENCE.
80 // Ignore scopes not expected to have any latency.
81 SyncScope::ID SSID =
82 static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
83 if (IgnoredScopes.contains(SSID))
84 continue;
85
86 for (SDep &PredDep : SU.Preds) {
87 if (!PredDep.isBarrier())
88 continue;
89 SUnit *PredSU = PredDep.getSUnit();
90 MachineInstr *MI = PredSU->getInstr();
91 // Only consider memory loads
92 if (!MI->mayLoad() || MI->mayStore())
93 continue;
94 addLatencyToEdge(PredDep, SU, FenceLatency);
95 }
96 } else if (Op == AMDGPU::S_BARRIER_WAIT) {
97 for (SDep &PredDep : SU.Preds) {
98 SUnit *PredSU = PredDep.getSUnit();
99 const MachineInstr *PredMI = PredSU->getInstr();
100 if (TII->isBarrierStart(PredMI->getOpcode())) {
101 addLatencyToEdge(PredDep, SU, BarrierSignalWaitLatency);
102 }
103 }
104 }
105 }
106}
107
108} // end namespace
109
110std::unique_ptr<ScheduleDAGMutation>
112 return std::make_unique<BarrierLatency>(MF);
113}
static cl::opt< unsigned > BarrierSignalWaitLatencyOpt("amdgpu-barrier-signal-wait-latency", cl::desc("Synthetic latency between S_BARRIER_SIGNAL and S_BARRIER_WAIT " "to encourage scheduling independent work between them"), cl::init(16), cl::Hidden)
Provides AMDGPU specific target descriptions.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Interface definition for SIInstrInfo.
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Function & getFunction()
Return the LLVM function that this machine code represents.
unsigned getOpcode() const
Returns the opcode of this MachineInstr.
Scheduling dependency.
Definition ScheduleDAG.h:51
SUnit * getSUnit() const
void setLatency(unsigned Lat)
Sets the latency for this edge.
unsigned getLatency() const
Returns the latency value for this edge, which roughly means the minimum number of cycles that must elapse before the successor may issue after the predecessor.
void setSUnit(SUnit *SU)
bool isBarrier() const
Tests if this is an Order dependence that is marked as a barrier.
Scheduling unit. This is a node in the scheduling DAG.
SmallVector< SDep, 4 > Succs
All sunit successors.
LLVM_ABI void setDepthDirty()
Sets a flag in this node to indicate that its stored Depth value will require recomputation the next ...
SmallVector< SDep, 4 > Preds
All sunit predecessors.
MachineInstr * getInstr() const
Returns the representative MachineInstr for this SUnit.
A ScheduleDAG for scheduling lists of MachineInstr.
Mutate the DAG as a postpass after normal DAG building.
const TargetInstrInfo * TII
Target instruction information.
std::vector< SUnit > SUnits
The scheduling units.
SmallSet - This maintains a set of unique values, optimizing for the case when the set is small (less...
Definition SmallSet.h:133
bool contains(const T &V) const
Check if the SmallSet contains the given element.
Definition SmallSet.h:228
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:183
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
void apply(Opt *O, const Mod &M, const Mods &... Ms)
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
std::unique_ptr< ScheduleDAGMutation > createAMDGPUBarrierLatencyDAGMutation(MachineFunction *MF)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
DWARFExpression::Operation Op