1//===- SIMemoryLegalizer.cpp ----------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Memory legalizer - implements memory model. More information can be
11/// found here:
12/// http://llvm.org/docs/AMDGPUUsage.html#memory-model
13//
14//===----------------------------------------------------------------------===//
15
16#include "AMDGPU.h"
18#include "GCNSubtarget.h"
27#include "llvm/IR/PassManager.h"
31
32using namespace llvm;
33using namespace llvm::AMDGPU;
34
35#define DEBUG_TYPE "si-memory-legalizer"
36#define PASS_NAME "SI Memory Legalizer"
37
39 "amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden,
40 cl::desc("Use this to skip inserting cache invalidating instructions."));
41
42namespace {
43
44LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE();
45
46/// Memory operation flags. Can be ORed together.
47enum class SIMemOp {
48 NONE = 0u,
49 LOAD = 1u << 0,
50 STORE = 1u << 1,
51 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ STORE)
52};
53
54/// Position to insert a new instruction relative to an existing
55/// instruction.
56enum class Position {
57 BEFORE,
58 AFTER
59};
60
61/// The atomic synchronization scopes supported by the AMDGPU target.
62enum class SIAtomicScope {
63 NONE,
64 SINGLETHREAD,
65 WAVEFRONT,
66 WORKGROUP,
67 CLUSTER, // Promoted to AGENT on targets without workgroup clusters.
68 AGENT,
69 SYSTEM
70};
71
72/// The distinct address spaces supported by the AMDGPU target for
73/// atomic memory operation. Can be ORed together.
74enum class SIAtomicAddrSpace {
75 NONE = 0u,
76 GLOBAL = 1u << 0,
77 LDS = 1u << 1,
78 SCRATCH = 1u << 2,
79 GDS = 1u << 3,
80 OTHER = 1u << 4,
81
82 /// The address spaces that can be accessed by a FLAT instruction.
83 FLAT = GLOBAL | LDS | SCRATCH,
84
85 /// The address spaces that support atomic instructions.
86 ATOMIC = GLOBAL | LDS | SCRATCH | GDS,
87
88 /// All address spaces.
89 ALL = GLOBAL | LDS | SCRATCH | GDS | OTHER,
90
91 LLVM_MARK_AS_BITMASK_ENUM(/* LargestFlag = */ ALL)
92};
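// For illustration, a minimal sketch of how these bitmask enums compose; the
// variable names below are hypothetical and not used by the pass:
//
//   SIAtomicAddrSpace AS = SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::LDS;
//   bool TouchesLDS = (AS & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE;
//   bool IsFlatOnly = (AS & ~SIAtomicAddrSpace::FLAT) == SIAtomicAddrSpace::NONE;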
93
94class SIMemOpInfo final {
95private:
96
97 friend class SIMemOpAccess;
98
99 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
100 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
101 SIAtomicScope Scope = SIAtomicScope::SYSTEM;
102 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
103 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
104 bool IsCrossAddressSpaceOrdering = false;
105 bool IsVolatile = false;
106 bool IsNonTemporal = false;
107 bool IsLastUse = false;
108 bool IsCooperative = false;
109
110 // TODO: Should we assume Cooperative=true if no MMO is present?
111 SIMemOpInfo(
112 const GCNSubtarget &ST,
113 AtomicOrdering Ordering = AtomicOrdering::SequentiallyConsistent,
114 SIAtomicScope Scope = SIAtomicScope::SYSTEM,
115 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::ATOMIC,
116 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::ALL,
117 bool IsCrossAddressSpaceOrdering = true,
118 AtomicOrdering FailureOrdering = AtomicOrdering::SequentiallyConsistent,
119 bool IsVolatile = false, bool IsNonTemporal = false,
120 bool IsLastUse = false, bool IsCooperative = false)
121 : Ordering(Ordering), FailureOrdering(FailureOrdering), Scope(Scope),
122 OrderingAddrSpace(OrderingAddrSpace), InstrAddrSpace(InstrAddrSpace),
123 IsCrossAddressSpaceOrdering(IsCrossAddressSpaceOrdering),
124 IsVolatile(IsVolatile), IsNonTemporal(IsNonTemporal),
125 IsLastUse(IsLastUse), IsCooperative(IsCooperative) {
126
127 if (Ordering == AtomicOrdering::NotAtomic) {
128 assert(!IsCooperative && "Cannot be cooperative & non-atomic!");
129 assert(Scope == SIAtomicScope::NONE &&
130 OrderingAddrSpace == SIAtomicAddrSpace::NONE &&
131 !IsCrossAddressSpaceOrdering &&
132 FailureOrdering == AtomicOrdering::NotAtomic);
133 return;
134 }
135
136 assert(Scope != SIAtomicScope::NONE &&
137 (OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
138 SIAtomicAddrSpace::NONE &&
139 (InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) !=
140 SIAtomicAddrSpace::NONE);
141
142 // There is also no cross address space ordering if the ordering
143 // address space is the same as the instruction address space and
144 // only contains a single address space.
145 if ((OrderingAddrSpace == InstrAddrSpace) &&
146 isPowerOf2_32(uint32_t(InstrAddrSpace)))
147 this->IsCrossAddressSpaceOrdering = false;
148
149 // Limit the scope to the maximum supported by the instruction's address
150 // spaces.
151 if ((InstrAddrSpace & ~SIAtomicAddrSpace::SCRATCH) ==
152 SIAtomicAddrSpace::NONE) {
153 this->Scope = std::min(Scope, SIAtomicScope::SINGLETHREAD);
154 } else if ((InstrAddrSpace &
155 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS)) ==
156 SIAtomicAddrSpace::NONE) {
157 this->Scope = std::min(Scope, SIAtomicScope::WORKGROUP);
158 } else if ((InstrAddrSpace &
159 ~(SIAtomicAddrSpace::SCRATCH | SIAtomicAddrSpace::LDS |
160 SIAtomicAddrSpace::GDS)) == SIAtomicAddrSpace::NONE) {
161 this->Scope = std::min(Scope, SIAtomicScope::AGENT);
162 }
163
164 // On targets that have no concept of a workgroup cluster, use
165 // AGENT scope as a conservatively correct alternative.
166 if (this->Scope == SIAtomicScope::CLUSTER && !ST.hasClusters())
167 this->Scope = SIAtomicScope::AGENT;
168 }
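// Worked example (hypothetical values): for an LDS-only atomic requested at
// agent scope, i.e. InstrAddrSpace == LDS and Scope == AGENT, the second clamp
// above applies because LDS is only visible within a work-group, so the
// effective Scope becomes WORKGROUP. Similarly, CLUSTER requests degrade to
// AGENT on subtargets without workgroup clusters.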
169
170public:
171 /// \returns Atomic synchronization scope of the machine instruction used to
172 /// create this SIMemOpInfo.
173 SIAtomicScope getScope() const {
174 return Scope;
175 }
176
177 /// \returns Ordering constraint of the machine instruction used to
178 /// create this SIMemOpInfo.
179 AtomicOrdering getOrdering() const {
180 return Ordering;
181 }
182
183 /// \returns Failure ordering constraint of the machine instruction used to
184 /// create this SIMemOpInfo.
185 AtomicOrdering getFailureOrdering() const {
186 return FailureOrdering;
187 }
188
189 /// \returns The address spaces accessed by the machine
190 /// instruction used to create this SIMemOpInfo.
191 SIAtomicAddrSpace getInstrAddrSpace() const {
192 return InstrAddrSpace;
193 }
194
195 /// \returns The address spaces that must be ordered by the machine
196 /// instruction used to create this SIMemOpInfo.
197 SIAtomicAddrSpace getOrderingAddrSpace() const {
198 return OrderingAddrSpace;
199 }
200
201 /// \returns True iff memory ordering of operations on
202 /// different address spaces is required.
203 bool getIsCrossAddressSpaceOrdering() const {
204 return IsCrossAddressSpaceOrdering;
205 }
206
207 /// \returns True if memory access of the machine instruction used to
208 /// create this SIMemOpInfo is volatile, false otherwise.
209 bool isVolatile() const {
210 return IsVolatile;
211 }
212
213 /// \returns True if memory access of the machine instruction used to
214 /// create this SIMemOpInfo is nontemporal, false otherwise.
215 bool isNonTemporal() const {
216 return IsNonTemporal;
217 }
218
219 /// \returns True if memory access of the machine instruction used to
220 /// create this SIMemOpInfo is last use, false otherwise.
221 bool isLastUse() const { return IsLastUse; }
222
223 /// \returns True if this is a cooperative load or store atomic.
224 bool isCooperative() const { return IsCooperative; }
225
226 /// \returns True if ordering constraint of the machine instruction used to
227 /// create this SIMemOpInfo is unordered or higher, false otherwise.
228 bool isAtomic() const {
229 return Ordering != AtomicOrdering::NotAtomic;
230 }
231
232};
233
234class SIMemOpAccess final {
235private:
236 const AMDGPUMachineModuleInfo *MMI = nullptr;
237 const GCNSubtarget &ST;
238
239 /// Reports unsupported message \p Msg for \p MI to LLVM context.
240 void reportUnsupported(const MachineBasicBlock::iterator &MI,
241 const char *Msg) const;
242
243 /// Inspects the target synchronization scope \p SSID and determines
244 /// the SI atomic scope it corresponds to, the address spaces it
245 /// covers, and whether the memory ordering applies between address
246 /// spaces.
247 std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
248 toSIAtomicScope(SyncScope::ID SSID, SIAtomicAddrSpace InstrAddrSpace) const;
249
250 /// \returns A bit set of the address spaces accessed by \p AS.
251 SIAtomicAddrSpace toSIAtomicAddrSpace(unsigned AS) const;
252
253 /// \returns Info constructed from \p MI, which has at least one machine
254 /// memory operand.
255 std::optional<SIMemOpInfo>
256 constructFromMIWithMMO(const MachineBasicBlock::iterator &MI) const;
257
258public:
259 /// Construct class to support accessing the machine memory operands
260 /// of instructions in the machine function \p MF.
261 SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI, const GCNSubtarget &ST);
262
263 /// \returns Load info if \p MI is a load operation, "std::nullopt" otherwise.
264 std::optional<SIMemOpInfo>
265 getLoadInfo(const MachineBasicBlock::iterator &MI) const;
266
267 /// \returns Store info if \p MI is a store operation, "std::nullopt"
268 /// otherwise.
269 std::optional<SIMemOpInfo>
270 getStoreInfo(const MachineBasicBlock::iterator &MI) const;
271
272 /// \returns Atomic fence info if \p MI is an atomic fence operation,
273 /// "std::nullopt" otherwise.
274 std::optional<SIMemOpInfo>
275 getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const;
276
277 /// \returns Atomic cmpxchg/rmw info if \p MI is an atomic cmpxchg or
278 /// rmw operation, "std::nullopt" otherwise.
279 std::optional<SIMemOpInfo>
280 getAtomicCmpxchgOrRmwInfo(const MachineBasicBlock::iterator &MI) const;
281
282 /// \returns DMA to LDS info if \p MI is a direct-to/from-LDS load/store,
283 /// along with an indication of whether this is a load or store. If it is not
284 /// a direct-to-LDS operation, returns std::nullopt.
285 std::optional<SIMemOpInfo>
286 getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const;
287};
288
289class SICacheControl {
290protected:
291
292 /// AMDGPU subtarget info.
293 const GCNSubtarget &ST;
294
295 /// Instruction info.
296 const SIInstrInfo *TII = nullptr;
297
298 IsaVersion IV;
299
300 /// Whether to insert cache invalidating instructions.
301 bool InsertCacheInv;
302
303 SICacheControl(const GCNSubtarget &ST);
304
305 /// Sets named bit \p Bit to "true" if present in instruction \p MI.
306 /// \returns Returns true if \p MI is modified, false otherwise.
307 bool enableNamedBit(const MachineBasicBlock::iterator MI,
308 AMDGPU::CPol::CPol Bit) const;
309
310 /// Check if any atomic operation on AS can affect memory accessible via the
311 /// global address space.
312 bool canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const;
313
314public:
315
316 /// Create a cache control for the subtarget \p ST.
317 static std::unique_ptr<SICacheControl> create(const GCNSubtarget &ST);
318
319 /// Update \p MI memory load instruction to bypass any caches up to
320 /// the \p Scope memory scope for address spaces \p
321 /// AddrSpace. Return true iff the instruction was modified.
322 virtual bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
323 SIAtomicScope Scope,
324 SIAtomicAddrSpace AddrSpace) const = 0;
325
326 /// Update \p MI memory store instruction to bypass any caches up to
327 /// the \p Scope memory scope for address spaces \p
328 /// AddrSpace. Return true iff the instruction was modified.
329 virtual bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
330 SIAtomicScope Scope,
331 SIAtomicAddrSpace AddrSpace) const = 0;
332
333 /// Update \p MI memory read-modify-write instruction to bypass any caches up
334 /// to the \p Scope memory scope for address spaces \p AddrSpace. Return true
335 /// iff the instruction was modified.
336 virtual bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
337 SIAtomicScope Scope,
338 SIAtomicAddrSpace AddrSpace) const = 0;
339
340 /// Update \p MI memory instruction of kind \p Op associated with address
341 /// spaces \p AddrSpace to indicate it is volatile and/or
342 /// nontemporal/last-use. Return true iff the instruction was modified.
343 virtual bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
344 SIAtomicAddrSpace AddrSpace,
345 SIMemOp Op, bool IsVolatile,
346 bool IsNonTemporal,
347 bool IsLastUse = false) const = 0;
348
349 /// Add final touches to a `mayStore` instruction \p MI, which may be a
350 /// Store or RMW instruction.
351 /// FIXME: This takes a MI because iterators aren't handled properly. When
352 /// this is called, they often point to entirely different insts. Thus we back
353 /// up the inst early and pass it here instead.
354 virtual bool finalizeStore(MachineInstr &MI, bool Atomic) const {
355 return false;
356 };
357
358 /// Handle cooperative load/store atomics.
359 virtual bool handleCooperativeAtomic(MachineInstr &MI) const {
361 "cooperative atomics are not available on this architecture");
362 }
363
364 /// Inserts any necessary instructions at position \p Pos relative
365 /// to instruction \p MI to ensure memory instructions before \p Pos of kind
366 /// \p Op associated with address spaces \p AddrSpace have completed. Used
367 /// between memory instructions to enforce the order they become visible as
368 /// observed by other memory instructions executing in memory scope \p Scope.
369 /// \p IsCrossAddrSpaceOrdering indicates if the memory ordering is between
370 /// address spaces. If \p AtomicsOnly is true, only insert waits for counters
371 /// that are used by atomic instructions.
372 /// Returns true iff any instructions inserted.
373 virtual bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
374 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
375 bool IsCrossAddrSpaceOrdering, Position Pos,
376 AtomicOrdering Order, bool AtomicsOnly) const = 0;
377
378 /// Inserts any necessary instructions at position \p Pos relative to
379 /// instruction \p MI to ensure any subsequent memory instructions of this
380 /// thread with address spaces \p AddrSpace will observe the previous memory
381 /// operations by any thread for memory scopes up to memory scope \p Scope.
382 /// Returns true iff any instructions inserted.
383 virtual bool insertAcquire(MachineBasicBlock::iterator &MI,
384 SIAtomicScope Scope,
385 SIAtomicAddrSpace AddrSpace,
386 Position Pos) const = 0;
387
388 /// Inserts any necessary instructions at position \p Pos relative to
389 /// instruction \p MI to ensure previous memory instructions by this thread
390 /// with address spaces \p AddrSpace have completed and can be observed by
391 /// subsequent memory instructions by any thread executing in memory scope \p
392 /// Scope. \p IsCrossAddrSpaceOrdering indicates if the memory ordering is
393 /// between address spaces. Returns true iff any instructions inserted.
394 virtual bool insertRelease(MachineBasicBlock::iterator &MI,
395 SIAtomicScope Scope,
396 SIAtomicAddrSpace AddrSpace,
397 bool IsCrossAddrSpaceOrdering,
398 Position Pos) const = 0;
399
400 /// Virtual destructor to allow derivations to be deleted.
401 virtual ~SICacheControl() = default;
402};
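// A rough sketch of how the legalizer drives these hooks for an acquire load
// at agent scope (details differ per subtarget and are handled in expandLoad):
//
//   CC->enableLoadCacheBypass(MI, Scope, AddrSpace);           // e.g. set glc
//   CC->insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD, IsCrossAS,
//                  Position::AFTER, Order, /*AtomicsOnly=*/true);
//   CC->insertAcquire(MI, Scope, AddrSpace, Position::AFTER);  // invalidate
//
// IsCrossAS and Order stand in for the values taken from the SIMemOpInfo.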
403
404class SIGfx6CacheControl : public SICacheControl {
405protected:
406
407 /// Sets GLC bit to "true" if present in \p MI. Returns true if \p MI
408 /// is modified, false otherwise.
409 bool enableGLCBit(const MachineBasicBlock::iterator &MI) const {
410 return enableNamedBit(MI, AMDGPU::CPol::GLC);
411 }
412
413 /// Sets SLC bit to "true" if present in \p MI. Returns true if \p MI
414 /// is modified, false otherwise.
415 bool enableSLCBit(const MachineBasicBlock::iterator &MI) const {
416 return enableNamedBit(MI, AMDGPU::CPol::SLC);
417 }
418
419public:
420
421 SIGfx6CacheControl(const GCNSubtarget &ST) : SICacheControl(ST) {}
422
423 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
424 SIAtomicScope Scope,
425 SIAtomicAddrSpace AddrSpace) const override;
426
427 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
428 SIAtomicScope Scope,
429 SIAtomicAddrSpace AddrSpace) const override;
430
431 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
432 SIAtomicScope Scope,
433 SIAtomicAddrSpace AddrSpace) const override;
434
435 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
436 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
437 bool IsVolatile, bool IsNonTemporal,
438 bool IsLastUse) const override;
439
440 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
441 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
442 bool IsCrossAddrSpaceOrdering, Position Pos,
443 AtomicOrdering Order, bool AtomicsOnly) const override;
444
445 bool insertAcquire(MachineBasicBlock::iterator &MI,
446 SIAtomicScope Scope,
447 SIAtomicAddrSpace AddrSpace,
448 Position Pos) const override;
449
450 bool insertRelease(MachineBasicBlock::iterator &MI,
451 SIAtomicScope Scope,
452 SIAtomicAddrSpace AddrSpace,
453 bool IsCrossAddrSpaceOrdering,
454 Position Pos) const override;
455};
456
457class SIGfx7CacheControl : public SIGfx6CacheControl {
458public:
459
460 SIGfx7CacheControl(const GCNSubtarget &ST) : SIGfx6CacheControl(ST) {}
461
462 bool insertAcquire(MachineBasicBlock::iterator &MI,
463 SIAtomicScope Scope,
464 SIAtomicAddrSpace AddrSpace,
465 Position Pos) const override;
466
467};
468
469class SIGfx90ACacheControl : public SIGfx7CacheControl {
470public:
471
472 SIGfx90ACacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
473
474 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
475 SIAtomicScope Scope,
476 SIAtomicAddrSpace AddrSpace) const override;
477
478 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
479 SIAtomicScope Scope,
480 SIAtomicAddrSpace AddrSpace) const override;
481
482 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
483 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
484 bool IsVolatile, bool IsNonTemporal,
485 bool IsLastUse) const override;
486
487 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
488 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
489 bool IsCrossAddrSpaceOrdering, Position Pos,
490 AtomicOrdering Order, bool AtomicsOnly) const override;
491
492 bool insertAcquire(MachineBasicBlock::iterator &MI,
493 SIAtomicScope Scope,
494 SIAtomicAddrSpace AddrSpace,
495 Position Pos) const override;
496
497 bool insertRelease(MachineBasicBlock::iterator &MI,
498 SIAtomicScope Scope,
499 SIAtomicAddrSpace AddrSpace,
500 bool IsCrossAddrSpaceOrdering,
501 Position Pos) const override;
502};
503
504class SIGfx940CacheControl : public SIGfx90ACacheControl {
505protected:
506
507 /// Sets SC0 bit to "true" if present in \p MI. Returns true if \p MI
508 /// is modified, false otherwise.
509 bool enableSC0Bit(const MachineBasicBlock::iterator &MI) const {
510 return enableNamedBit(MI, AMDGPU::CPol::SC0);
511 }
512
513 /// Sets SC1 bit to "true" if present in \p MI. Returns true if \p MI
514 /// is modified, false otherwise.
515 bool enableSC1Bit(const MachineBasicBlock::iterator &MI) const {
516 return enableNamedBit(MI, AMDGPU::CPol::SC1);
517 }
518
519 /// Sets NT bit to "true" if present in \p MI. Returns true if \p MI
520 /// is modified, false otherwise.
521 bool enableNTBit(const MachineBasicBlock::iterator &MI) const {
522 return enableNamedBit(MI, AMDGPU::CPol::NT);
523 }
524
525public:
526 SIGfx940CacheControl(const GCNSubtarget &ST) : SIGfx90ACacheControl(ST) {};
527
528 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
529 SIAtomicScope Scope,
530 SIAtomicAddrSpace AddrSpace) const override;
531
532 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
533 SIAtomicScope Scope,
534 SIAtomicAddrSpace AddrSpace) const override;
535
536 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
537 SIAtomicScope Scope,
538 SIAtomicAddrSpace AddrSpace) const override;
539
540 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
541 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
542 bool IsVolatile, bool IsNonTemporal,
543 bool IsLastUse) const override;
544
545 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
546 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
547
548 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
549 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
550 Position Pos) const override;
551};
552
553class SIGfx10CacheControl : public SIGfx7CacheControl {
554protected:
555
556 /// Sets DLC bit to "true" if present in \p MI. Returns true if \p MI
557 /// is modified, false otherwise.
558 bool enableDLCBit(const MachineBasicBlock::iterator &MI) const {
559 return enableNamedBit(MI, AMDGPU::CPol::DLC);
560 }
561
562public:
563
564 SIGfx10CacheControl(const GCNSubtarget &ST) : SIGfx7CacheControl(ST) {}
565
566 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
567 SIAtomicScope Scope,
568 SIAtomicAddrSpace AddrSpace) const override;
569
570 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
571 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
572 bool IsVolatile, bool IsNonTemporal,
573 bool IsLastUse) const override;
574
575 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
576 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
577 bool IsCrossAddrSpaceOrdering, Position Pos,
578 AtomicOrdering Order, bool AtomicsOnly) const override;
579
580 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
581 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
582};
583
584class SIGfx11CacheControl : public SIGfx10CacheControl {
585public:
586 SIGfx11CacheControl(const GCNSubtarget &ST) : SIGfx10CacheControl(ST) {}
587
588 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
589 SIAtomicScope Scope,
590 SIAtomicAddrSpace AddrSpace) const override;
591
592 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
593 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
594 bool IsVolatile, bool IsNonTemporal,
595 bool IsLastUse) const override;
596};
597
598class SIGfx12CacheControl : public SIGfx11CacheControl {
599protected:
600 // Sets TH policy to \p Value if CPol operand is present in instruction \p MI.
601 // \returns Returns true if \p MI is modified, false otherwise.
602 bool setTH(const MachineBasicBlock::iterator MI,
603 AMDGPU::CPol::TH Value) const;
604 // Sets Scope policy to \p Value if CPol operand is present in instruction \p
605 // MI. \returns Returns true if \p MI is modified, false otherwise.
606 bool setScope(const MachineBasicBlock::iterator MI,
607 AMDGPU::CPol::Scope Value) const;
608
609 // Stores with system scope (SCOPE_SYS) need to wait for:
610 // - loads or atomics(returning) - wait for {LOAD|SAMPLE|BVH|KM}CNT==0
611 // - non-returning-atomics - wait for STORECNT==0
612 // TODO: SIInsertWaitcnts will not always be able to remove STORECNT waits
613 // since it does not distinguish atomics-with-return from regular stores.
614 // There is no need to wait if memory is cached (mtype != UC).
615 bool
616 insertWaitsBeforeSystemScopeStore(const MachineBasicBlock::iterator MI) const;
617
618 bool setAtomicScope(const MachineBasicBlock::iterator &MI,
619 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const;
620
621public:
622 SIGfx12CacheControl(const GCNSubtarget &ST) : SIGfx11CacheControl(ST) {
623 // GFX12.0 and GFX12.5 memory models greatly overlap, and in some cases
624 // the behavior is the same if assuming GFX12.0 in CU mode.
625 assert(!ST.hasGFX1250Insts() || ST.isCuModeEnabled());
626 }
627
628 bool insertWait(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
629 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
630 bool IsCrossAddrSpaceOrdering, Position Pos,
631 AtomicOrdering Order, bool AtomicsOnly) const override;
632
633 bool insertAcquire(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
634 SIAtomicAddrSpace AddrSpace, Position Pos) const override;
635
636 bool enableVolatileAndOrNonTemporal(MachineBasicBlock::iterator &MI,
637 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
638 bool IsVolatile, bool IsNonTemporal,
639 bool IsLastUse) const override;
640
641 bool finalizeStore(MachineInstr &MI, bool Atomic) const override;
642
643 bool handleCooperativeAtomic(MachineInstr &MI) const override;
644
645 bool insertRelease(MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
646 SIAtomicAddrSpace AddrSpace, bool IsCrossAddrSpaceOrdering,
647 Position Pos) const override;
648
649 bool enableLoadCacheBypass(const MachineBasicBlock::iterator &MI,
650 SIAtomicScope Scope,
651 SIAtomicAddrSpace AddrSpace) const override {
652 return setAtomicScope(MI, Scope, AddrSpace);
653 }
654
655 bool enableStoreCacheBypass(const MachineBasicBlock::iterator &MI,
656 SIAtomicScope Scope,
657 SIAtomicAddrSpace AddrSpace) const override {
658 return setAtomicScope(MI, Scope, AddrSpace);
659 }
660
661 bool enableRMWCacheBypass(const MachineBasicBlock::iterator &MI,
662 SIAtomicScope Scope,
663 SIAtomicAddrSpace AddrSpace) const override {
664 return setAtomicScope(MI, Scope, AddrSpace);
665 }
666};
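// Note on the GFX12 overrides above: cache bypass is no longer expressed with
// separate GLC/SLC/DLC bits but through the CPol scope field, which is why all
// three enable*CacheBypass hooks simply forward to setAtomicScope.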
667
668class SIMemoryLegalizer final {
669private:
670 const MachineModuleInfo &MMI;
671 /// Cache Control.
672 std::unique_ptr<SICacheControl> CC = nullptr;
673
674 /// List of atomic pseudo instructions.
675 std::list<MachineBasicBlock::iterator> AtomicPseudoMIs;
676
677 /// Return true iff instruction \p MI is an atomic instruction that
678 /// returns a result.
679 bool isAtomicRet(const MachineInstr &MI) const {
680 return SIInstrInfo::isAtomicRet(MI);
681 }
682
683 /// Removes all processed atomic pseudo instructions from the current
684 /// function. Returns true if current function is modified, false otherwise.
685 bool removeAtomicPseudoMIs();
686
687 /// Expands load operation \p MI. Returns true if instructions are
688 /// added/deleted or \p MI is modified, false otherwise.
689 bool expandLoad(const SIMemOpInfo &MOI,
690 MachineBasicBlock::iterator &MI);
691 /// Expands store operation \p MI. Returns true if instructions are
692 /// added/deleted or \p MI is modified, false otherwise.
693 bool expandStore(const SIMemOpInfo &MOI,
694 MachineBasicBlock::iterator &MI);
695 /// Expands atomic fence operation \p MI. Returns true if
696 /// instructions are added/deleted or \p MI is modified, false otherwise.
697 bool expandAtomicFence(const SIMemOpInfo &MOI,
698 MachineBasicBlock::iterator &MI);
699 /// Expands atomic cmpxchg or rmw operation \p MI. Returns true if
700 /// instructions are added/deleted or \p MI is modified, false otherwise.
701 bool expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
702 MachineBasicBlock::iterator &MI);
703 /// Expands LDS DMA operation \p MI. Returns true if instructions are
704 /// added/deleted or \p MI is modified, false otherwise.
705 bool expandLDSDMA(const SIMemOpInfo &MOI, MachineBasicBlock::iterator &MI);
706
707public:
708 SIMemoryLegalizer(const MachineModuleInfo &MMI) : MMI(MMI) {};
709 bool run(MachineFunction &MF);
710};
711
712class SIMemoryLegalizerLegacy final : public MachineFunctionPass {
713public:
714 static char ID;
715
716 SIMemoryLegalizerLegacy() : MachineFunctionPass(ID) {}
717
718 void getAnalysisUsage(AnalysisUsage &AU) const override {
719 AU.setPreservesCFG();
720 MachineFunctionPass::getAnalysisUsage(AU);
721 }
722
723 StringRef getPassName() const override {
724 return PASS_NAME;
725 }
726
727 bool runOnMachineFunction(MachineFunction &MF) override;
728};
729
730static const StringMap<SIAtomicAddrSpace> ASNames = {{
731 {"global", SIAtomicAddrSpace::GLOBAL},
732 {"local", SIAtomicAddrSpace::LDS},
733}};
734
735void diagnoseUnknownMMRAASName(const MachineInstr &MI, StringRef AS) {
736 const MachineFunction *MF = MI.getMF();
737 const Function &Fn = MF->getFunction();
738 SmallString<128> Str;
739 raw_svector_ostream OS(Str);
740 OS << "unknown address space '" << AS << "'; expected one of ";
741 ListSeparator LS;
742 for (const auto &[Name, Val] : ASNames)
743 OS << LS << '\'' << Name << '\'';
744 Fn.getContext().diagnose(
745 DiagnosticInfoUnsupported(Fn, Str.str(), MI.getDebugLoc(), DS_Warning));
746}
747
748/// Reads \p MI's MMRAs to parse the "amdgpu-synchronize-as" MMRA.
749 /// If this tag isn't present, or if it has no meaningful values, returns
750 /// std::nullopt; otherwise returns the address spaces specified by the MMRA.
751static std::optional<SIAtomicAddrSpace>
752getSynchronizeAddrSpaceMD(const MachineInstr &MI) {
753 static constexpr StringLiteral FenceASPrefix = "amdgpu-synchronize-as";
754
755 auto MMRA = MMRAMetadata(MI.getMMRAMetadata());
756 if (!MMRA)
757 return std::nullopt;
758
759 SIAtomicAddrSpace Result = SIAtomicAddrSpace::NONE;
760 for (const auto &[Prefix, Suffix] : MMRA) {
761 if (Prefix != FenceASPrefix)
762 continue;
763
764 if (auto It = ASNames.find(Suffix); It != ASNames.end())
765 Result |= It->second;
766 else
767 diagnoseUnknownMMRAASName(MI, Suffix);
768 }
769
770 if (Result == SIAtomicAddrSpace::NONE)
771 return std::nullopt;
772
773 return Result;
774}
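// For illustration, IR that this helper would pick up (assuming the usual MMRA
// encoding as !mmra metadata attached to the fence):
//
//   fence syncscope("workgroup") release, !mmra !0
//   ...
//   !0 = !{!"amdgpu-synchronize-as", !"local"}
//
// which narrows the ordering address space of the fence to LDS only.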
775
776} // end anonymous namespace
777
778void SIMemOpAccess::reportUnsupported(const MachineBasicBlock::iterator &MI,
779 const char *Msg) const {
780 const Function &Func = MI->getParent()->getParent()->getFunction();
781 Func.getContext().diagnose(
782 DiagnosticInfoUnsupported(Func, Msg, MI->getDebugLoc()));
783}
784
785std::optional<std::tuple<SIAtomicScope, SIAtomicAddrSpace, bool>>
786SIMemOpAccess::toSIAtomicScope(SyncScope::ID SSID,
787 SIAtomicAddrSpace InstrAddrSpace) const {
788 if (SSID == SyncScope::System)
789 return std::tuple(SIAtomicScope::SYSTEM, SIAtomicAddrSpace::ATOMIC, true);
790 if (SSID == MMI->getAgentSSID())
791 return std::tuple(SIAtomicScope::AGENT, SIAtomicAddrSpace::ATOMIC, true);
792 if (SSID == MMI->getClusterSSID())
793 return std::tuple(SIAtomicScope::CLUSTER, SIAtomicAddrSpace::ATOMIC, true);
794 if (SSID == MMI->getWorkgroupSSID())
795 return std::tuple(SIAtomicScope::WORKGROUP, SIAtomicAddrSpace::ATOMIC,
796 true);
797 if (SSID == MMI->getWavefrontSSID())
798 return std::tuple(SIAtomicScope::WAVEFRONT, SIAtomicAddrSpace::ATOMIC,
799 true);
800 if (SSID == SyncScope::SingleThread)
801 return std::tuple(SIAtomicScope::SINGLETHREAD, SIAtomicAddrSpace::ATOMIC,
802 true);
803 if (SSID == MMI->getSystemOneAddressSpaceSSID())
804 return std::tuple(SIAtomicScope::SYSTEM,
805 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
806 if (SSID == MMI->getAgentOneAddressSpaceSSID())
807 return std::tuple(SIAtomicScope::AGENT,
808 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
809 if (SSID == MMI->getClusterOneAddressSpaceSSID())
810 return std::tuple(SIAtomicScope::CLUSTER,
811 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
812 if (SSID == MMI->getWorkgroupOneAddressSpaceSSID())
813 return std::tuple(SIAtomicScope::WORKGROUP,
814 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
815 if (SSID == MMI->getWavefrontOneAddressSpaceSSID())
816 return std::tuple(SIAtomicScope::WAVEFRONT,
817 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
818 if (SSID == MMI->getSingleThreadOneAddressSpaceSSID())
819 return std::tuple(SIAtomicScope::SINGLETHREAD,
820 SIAtomicAddrSpace::ATOMIC & InstrAddrSpace, false);
821 return std::nullopt;
822}
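// For reference (assuming the sync scope names documented in AMDGPUUsage): an
// IR syncscope of "" maps to SYSTEM, "agent" to AGENT, "workgroup" to
// WORKGROUP, "wavefront" to WAVEFRONT and "singlethread" to SINGLETHREAD. The
// "-one-as" variants (e.g. "agent-one-as") select the same scope but only
// order the instruction's own address space, hence the false returned for
// IsCrossAddressSpaceOrdering.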
823
824SIAtomicAddrSpace SIMemOpAccess::toSIAtomicAddrSpace(unsigned AS) const {
825 if (AS == AMDGPUAS::FLAT_ADDRESS)
826 return SIAtomicAddrSpace::FLAT;
827 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
828 return SIAtomicAddrSpace::GLOBAL;
829 if (AS == AMDGPUAS::LOCAL_ADDRESS)
830 return SIAtomicAddrSpace::LDS;
831 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
832 return SIAtomicAddrSpace::SCRATCH;
833 if (AS == AMDGPUAS::REGION_ADDRESS)
834 return SIAtomicAddrSpace::GDS;
835 if (AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
836 AS == AMDGPUAS::BUFFER_STRIDED_POINTER)
837 return SIAtomicAddrSpace::GLOBAL;
838
839 return SIAtomicAddrSpace::OTHER;
840}
841
842SIMemOpAccess::SIMemOpAccess(const AMDGPUMachineModuleInfo &MMI_,
843 const GCNSubtarget &ST)
844 : MMI(&MMI_), ST(ST) {}
845
846std::optional<SIMemOpInfo> SIMemOpAccess::constructFromMIWithMMO(
847 const MachineBasicBlock::iterator &MI) const {
848 assert(MI->getNumMemOperands() > 0);
849
850 SyncScope::ID SSID = SyncScope::SingleThread;
851 AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
852 AtomicOrdering FailureOrdering = AtomicOrdering::NotAtomic;
853 SIAtomicAddrSpace InstrAddrSpace = SIAtomicAddrSpace::NONE;
854 bool IsNonTemporal = true;
855 bool IsVolatile = false;
856 bool IsLastUse = false;
857 bool IsCooperative = false;
858
859 // Validator should check whether or not MMOs cover the entire set of
860 // locations accessed by the memory instruction.
861 for (const auto &MMO : MI->memoperands()) {
862 IsNonTemporal &= MMO->isNonTemporal();
863 IsVolatile |= MMO->isVolatile();
864 IsLastUse |= MMO->getFlags() & MOLastUse;
865 IsCooperative |= MMO->getFlags() & MOCooperative;
866 InstrAddrSpace |=
867 toSIAtomicAddrSpace(MMO->getPointerInfo().getAddrSpace());
868 AtomicOrdering OpOrdering = MMO->getSuccessOrdering();
869 if (OpOrdering != AtomicOrdering::NotAtomic) {
870 const auto &IsSyncScopeInclusion =
871 MMI->isSyncScopeInclusion(SSID, MMO->getSyncScopeID());
872 if (!IsSyncScopeInclusion) {
873 reportUnsupported(MI,
874 "Unsupported non-inclusive atomic synchronization scope");
875 return std::nullopt;
876 }
877
878 SSID = *IsSyncScopeInclusion ? SSID : MMO->getSyncScopeID();
879 Ordering = getMergedAtomicOrdering(Ordering, OpOrdering);
880 assert(MMO->getFailureOrdering() != AtomicOrdering::Release &&
881 MMO->getFailureOrdering() != AtomicOrdering::AcquireRelease);
882 FailureOrdering =
883 getMergedAtomicOrdering(FailureOrdering, MMO->getFailureOrdering());
884 }
885 }
886
887 SIAtomicScope Scope = SIAtomicScope::NONE;
888 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
889 bool IsCrossAddressSpaceOrdering = false;
890 if (Ordering != AtomicOrdering::NotAtomic) {
891 auto ScopeOrNone = toSIAtomicScope(SSID, InstrAddrSpace);
892 if (!ScopeOrNone) {
893 reportUnsupported(MI, "Unsupported atomic synchronization scope");
894 return std::nullopt;
895 }
896 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
897 *ScopeOrNone;
898 if ((OrderingAddrSpace == SIAtomicAddrSpace::NONE) ||
899 ((OrderingAddrSpace & SIAtomicAddrSpace::ATOMIC) != OrderingAddrSpace) ||
900 ((InstrAddrSpace & SIAtomicAddrSpace::ATOMIC) == SIAtomicAddrSpace::NONE)) {
901 reportUnsupported(MI, "Unsupported atomic address space");
902 return std::nullopt;
903 }
904 }
905 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace, InstrAddrSpace,
906 IsCrossAddressSpaceOrdering, FailureOrdering, IsVolatile,
907 IsNonTemporal, IsLastUse, IsCooperative);
908}
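// Worked example (hypothetical MMOs): if \p MI carries one monotonic MMO with
// syncscope("workgroup") and one acquire MMO with syncscope("agent"), the loop
// above keeps the more inclusive scope (agent) and the stronger merged
// ordering (acquire) in the resulting SIMemOpInfo.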
909
910std::optional<SIMemOpInfo>
911SIMemOpAccess::getLoadInfo(const MachineBasicBlock::iterator &MI) const {
912 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
913
914 if (!(MI->mayLoad() && !MI->mayStore()))
915 return std::nullopt;
916
917 // Be conservative if there are no memory operands.
918 if (MI->getNumMemOperands() == 0)
919 return SIMemOpInfo(ST);
920
921 return constructFromMIWithMMO(MI);
922}
923
924std::optional<SIMemOpInfo>
925SIMemOpAccess::getStoreInfo(const MachineBasicBlock::iterator &MI) const {
926 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
927
928 if (!(!MI->mayLoad() && MI->mayStore()))
929 return std::nullopt;
930
931 // Be conservative if there are no memory operands.
932 if (MI->getNumMemOperands() == 0)
933 return SIMemOpInfo(ST);
934
935 return constructFromMIWithMMO(MI);
936}
937
938std::optional<SIMemOpInfo>
939SIMemOpAccess::getAtomicFenceInfo(const MachineBasicBlock::iterator &MI) const {
940 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
941
942 if (MI->getOpcode() != AMDGPU::ATOMIC_FENCE)
943 return std::nullopt;
944
945 AtomicOrdering Ordering =
946 static_cast<AtomicOrdering>(MI->getOperand(0).getImm());
947
948 SyncScope::ID SSID = static_cast<SyncScope::ID>(MI->getOperand(1).getImm());
949 auto ScopeOrNone = toSIAtomicScope(SSID, SIAtomicAddrSpace::ATOMIC);
950 if (!ScopeOrNone) {
951 reportUnsupported(MI, "Unsupported atomic synchronization scope");
952 return std::nullopt;
953 }
954
955 SIAtomicScope Scope = SIAtomicScope::NONE;
956 SIAtomicAddrSpace OrderingAddrSpace = SIAtomicAddrSpace::NONE;
957 bool IsCrossAddressSpaceOrdering = false;
958 std::tie(Scope, OrderingAddrSpace, IsCrossAddressSpaceOrdering) =
959 *ScopeOrNone;
960
961 if (OrderingAddrSpace != SIAtomicAddrSpace::ATOMIC) {
962 // We currently expect refineOrderingAS to be the only place that
963 // can refine the AS ordered by the fence.
964 // If that changes, we need to review the semantics of that function
965 // in case it needs to preserve certain address spaces.
966 reportUnsupported(MI, "Unsupported atomic address space");
967 return std::nullopt;
968 }
969
970 auto SynchronizeAS = getSynchronizeAddrSpaceMD(*MI);
971 if (SynchronizeAS)
972 OrderingAddrSpace = *SynchronizeAS;
973
974 return SIMemOpInfo(ST, Ordering, Scope, OrderingAddrSpace,
975 SIAtomicAddrSpace::ATOMIC, IsCrossAddressSpaceOrdering,
976 AtomicOrdering::NotAtomic);
977}
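// For example (illustrative): IR "fence syncscope("agent") acquire" arrives
// here as an ATOMIC_FENCE pseudo whose two immediate operands encode the
// ordering and the SSID, producing a SIMemOpInfo with Scope == AGENT,
// Ordering == Acquire and the full ATOMIC ordering address space, unless an
// "amdgpu-synchronize-as" MMRA narrows it.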
978
979std::optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
980 const MachineBasicBlock::iterator &MI) const {
981 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
982
983 if (!(MI->mayLoad() && MI->mayStore()))
984 return std::nullopt;
985
986 // Be conservative if there are no memory operands.
987 if (MI->getNumMemOperands() == 0)
988 return SIMemOpInfo(ST);
989
990 return constructFromMIWithMMO(MI);
991}
992
993std::optional<SIMemOpInfo>
994SIMemOpAccess::getLDSDMAInfo(const MachineBasicBlock::iterator &MI) const {
995 assert(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic);
996
997 if (!SIInstrInfo::isLDSDMA(*MI))
998 return std::nullopt;
999
1000 return constructFromMIWithMMO(MI);
1001}
1002
1003SICacheControl::SICacheControl(const GCNSubtarget &ST) : ST(ST) {
1004 TII = ST.getInstrInfo();
1005 IV = getIsaVersion(ST.getCPU());
1006 InsertCacheInv = !AmdgcnSkipCacheInvalidations;
1007}
1008
1009bool SICacheControl::enableNamedBit(const MachineBasicBlock::iterator MI,
1010 AMDGPU::CPol::CPol Bit) const {
1011 MachineOperand *CPol = TII->getNamedOperand(*MI, AMDGPU::OpName::cpol);
1012 if (!CPol)
1013 return false;
1014
1015 CPol->setImm(CPol->getImm() | Bit);
1016 return true;
1017}
1018
1019bool SICacheControl::canAffectGlobalAddrSpace(SIAtomicAddrSpace AS) const {
1020 assert((!ST.hasGloballyAddressableScratch() ||
1021 (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE ||
1022 (AS & SIAtomicAddrSpace::SCRATCH) == SIAtomicAddrSpace::NONE) &&
1023 "scratch instructions should already be replaced by flat "
1024 "instructions if GloballyAddressableScratch is enabled");
1025 return (AS & SIAtomicAddrSpace::GLOBAL) != SIAtomicAddrSpace::NONE;
1026}
1027
1028/* static */
1029std::unique_ptr<SICacheControl> SICacheControl::create(const GCNSubtarget &ST) {
1030 GCNSubtarget::Generation Generation = ST.getGeneration();
1031 if (ST.hasGFX940Insts())
1032 return std::make_unique<SIGfx940CacheControl>(ST);
1033 if (ST.hasGFX90AInsts())
1034 return std::make_unique<SIGfx90ACacheControl>(ST);
1035 if (Generation <= AMDGPUSubtarget::SOUTHERN_ISLANDS)
1036 return std::make_unique<SIGfx6CacheControl>(ST);
1037 if (Generation < AMDGPUSubtarget::GFX10)
1038 return std::make_unique<SIGfx7CacheControl>(ST);
1039 if (Generation < AMDGPUSubtarget::GFX11)
1040 return std::make_unique<SIGfx10CacheControl>(ST);
1041 if (Generation < AMDGPUSubtarget::GFX12)
1042 return std::make_unique<SIGfx11CacheControl>(ST);
1043 return std::make_unique<SIGfx12CacheControl>(ST);
1044}
1045
1046bool SIGfx6CacheControl::enableLoadCacheBypass(
1047 const MachineBasicBlock::iterator &MI,
1048 SIAtomicScope Scope,
1049 SIAtomicAddrSpace AddrSpace) const {
1050 assert(MI->mayLoad() && !MI->mayStore());
1051 bool Changed = false;
1052
1053 if (canAffectGlobalAddrSpace(AddrSpace)) {
1054 switch (Scope) {
1055 case SIAtomicScope::SYSTEM:
1056 case SIAtomicScope::AGENT:
1057 // Set L1 cache policy to MISS_EVICT.
1058 // Note: there is no L2 cache bypass policy at the ISA level.
1059 Changed |= enableGLCBit(MI);
1060 break;
1061 case SIAtomicScope::WORKGROUP:
1062 case SIAtomicScope::WAVEFRONT:
1063 case SIAtomicScope::SINGLETHREAD:
1064 // No cache to bypass.
1065 break;
1066 default:
1067 llvm_unreachable("Unsupported synchronization scope");
1068 }
1069 }
1070
1071 /// The scratch address space does not need the global memory caches
1072 /// to be bypassed as all memory operations by the same thread are
1073 /// sequentially consistent, and no other thread can access scratch
1074 /// memory.
1075
1076 /// Other address spaces do not have a cache.
1077
1078 return Changed;
1079}
1080
1081bool SIGfx6CacheControl::enableStoreCacheBypass(
1082 const MachineBasicBlock::iterator &MI,
1083 SIAtomicScope Scope,
1084 SIAtomicAddrSpace AddrSpace) const {
1085 assert(!MI->mayLoad() && MI->mayStore());
1086 bool Changed = false;
1087
1088 /// The L1 cache is write through so does not need to be bypassed. There is no
1089 /// bypass control for the L2 cache at the isa level.
1090
1091 return Changed;
1092}
1093
1094bool SIGfx6CacheControl::enableRMWCacheBypass(
1095 const MachineBasicBlock::iterator &MI,
1096 SIAtomicScope Scope,
1097 SIAtomicAddrSpace AddrSpace) const {
1098 assert(MI->mayLoad() && MI->mayStore());
1099 bool Changed = false;
1100
1101 /// Do not set GLC for RMW atomic operations as L0/L1 cache is automatically
1102 /// bypassed, and the GLC bit is instead used to indicate if they are
1103 /// return or no-return.
1104 /// Note: there is no L2 cache coherent bypass control at the ISA level.
1105
1106 return Changed;
1107}
1108
1109bool SIGfx6CacheControl::enableVolatileAndOrNonTemporal(
1110 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1111 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1112 // Only handle load and store, not atomic read-modify-write instructions. The
1113 // latter use glc to indicate if the atomic returns a result and so must not
1114 // be used for cache control.
1115 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1116
1117 // Only update load and store, not LLVM IR atomic read-modify-write
1118 // instructions. The latter are always marked as volatile, and honoring that
1119 // here would pessimize all atomics, so it is not handled. They also do not
1120 // support the nontemporal attribute.
1121 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1122
1123 bool Changed = false;
1124
1125 if (IsVolatile) {
1126 // Set L1 cache policy to be MISS_EVICT for load instructions
1127 // and MISS_LRU for store instructions.
1128 // Note: there is no L2 cache bypass policy at the ISA level.
1129 if (Op == SIMemOp::LOAD)
1130 Changed |= enableGLCBit(MI);
1131
1132 // Ensure operation has completed at system scope to cause all volatile
1133 // operations to be visible outside the program in a global order. Do not
1134 // request cross address space as only the global address space can be
1135 // observable outside the program, so no need to cause a waitcnt for LDS
1136 // address space operations.
1137 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1138 Position::AFTER, AtomicOrdering::Unordered,
1139 /*AtomicsOnly=*/false);
1140
1141 return Changed;
1142 }
1143
1144 if (IsNonTemporal) {
1145 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1146 // for both loads and stores, and the L2 cache policy to STREAM.
1147 Changed |= enableGLCBit(MI);
1148 Changed |= enableSLCBit(MI);
1149 return Changed;
1150 }
1151
1152 return Changed;
1153}
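// For example (illustrative, GFX6): a volatile global load ends up with glc
// set and is followed by a wait that drains vmcnt, e.g.
//
//   buffer_load_dword v0, ... glc
//   s_waitcnt vmcnt(0)
//
// while a nontemporal access gets both glc and slc and needs no wait.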
1154
1155bool SIGfx6CacheControl::insertWait(MachineBasicBlock::iterator &MI,
1156 SIAtomicScope Scope,
1157 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1158 bool IsCrossAddrSpaceOrdering, Position Pos,
1159 AtomicOrdering Order,
1160 bool AtomicsOnly) const {
1161 bool Changed = false;
1162
1163 MachineBasicBlock &MBB = *MI->getParent();
1164 DebugLoc DL = MI->getDebugLoc();
1165
1166 if (Pos == Position::AFTER)
1167 ++MI;
1168
1169 bool VMCnt = false;
1170 bool LGKMCnt = false;
1171
1172 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
1173 SIAtomicAddrSpace::NONE) {
1174 switch (Scope) {
1175 case SIAtomicScope::SYSTEM:
1176 case SIAtomicScope::AGENT:
1177 VMCnt |= true;
1178 break;
1179 case SIAtomicScope::WORKGROUP:
1180 case SIAtomicScope::WAVEFRONT:
1181 case SIAtomicScope::SINGLETHREAD:
1182 // The L1 cache keeps all memory operations in order for
1183 // wavefronts in the same work-group.
1184 break;
1185 default:
1186 llvm_unreachable("Unsupported synchronization scope");
1187 }
1188 }
1189
1190 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1191 switch (Scope) {
1192 case SIAtomicScope::SYSTEM:
1193 case SIAtomicScope::AGENT:
1194 case SIAtomicScope::WORKGROUP:
1195 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
1196 // not needed as LDS operations for all waves are executed in a total
1197 // global ordering as observed by all waves. Required if also
1198 // synchronizing with global/GDS memory as LDS operations could be
1199 // reordered with respect to later global/GDS memory operations of the
1200 // same wave.
1201 LGKMCnt |= IsCrossAddrSpaceOrdering;
1202 break;
1203 case SIAtomicScope::WAVEFRONT:
1204 case SIAtomicScope::SINGLETHREAD:
1205 // The LDS keeps all memory operations in order for
1206 // the same wavefront.
1207 break;
1208 default:
1209 llvm_unreachable("Unsupported synchronization scope");
1210 }
1211 }
1212
1213 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
1214 switch (Scope) {
1215 case SIAtomicScope::SYSTEM:
1216 case SIAtomicScope::AGENT:
1217 // If no cross address space ordering then an GDS "S_WAITCNT lgkmcnt(0)"
1218 // is not needed as GDS operations for all waves are executed in a total
1219 // global ordering as observed by all waves. Required if also
1220 // synchronizing with global/LDS memory as GDS operations could be
1221 // reordered with respect to later global/LDS memory operations of the
1222 // same wave.
1223 LGKMCnt |= IsCrossAddrSpaceOrdering;
1224 break;
1225 case SIAtomicScope::WORKGROUP:
1226 case SIAtomicScope::WAVEFRONT:
1227 case SIAtomicScope::SINGLETHREAD:
1228 // The GDS keeps all memory operations in order for
1229 // the same work-group.
1230 break;
1231 default:
1232 llvm_unreachable("Unsupported synchronization scope");
1233 }
1234 }
1235
1236 if (VMCnt || LGKMCnt) {
1237 unsigned WaitCntImmediate =
1238 AMDGPU::encodeWaitcnt(IV,
1239 VMCnt ? 0 : getVmcntBitMask(IV),
1240 getExpcntBitMask(IV),
1241 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
1242 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
1243 .addImm(WaitCntImmediate);
1244 Changed = true;
1245 }
1246
1247 // On architectures that support direct loads to LDS, emit an unknown waitcnt
1248 // at workgroup-scoped release operations that specify the LDS address space.
1249 // SIInsertWaitcnts will later replace this with a vmcnt().
1250 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
1251 Scope == SIAtomicScope::WORKGROUP &&
1252 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
1253 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
1254 Changed = true;
1255 }
1256
1257 if (Pos == Position::AFTER)
1258 --MI;
1259
1260 return Changed;
1261}
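// For example (illustrative): an agent-scope release fence that orders both
// the global and LDS address spaces sets VMCnt via the first switch and, being
// cross-address-space, LGKMCnt via the second, so a single soft
// "s_waitcnt vmcnt(0) lgkmcnt(0)" is emitted here.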
1262
1263bool SIGfx6CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1264 SIAtomicScope Scope,
1265 SIAtomicAddrSpace AddrSpace,
1266 Position Pos) const {
1267 if (!InsertCacheInv)
1268 return false;
1269
1270 bool Changed = false;
1271
1272 MachineBasicBlock &MBB = *MI->getParent();
1273 DebugLoc DL = MI->getDebugLoc();
1274
1275 if (Pos == Position::AFTER)
1276 ++MI;
1277
1278 if (canAffectGlobalAddrSpace(AddrSpace)) {
1279 switch (Scope) {
1280 case SIAtomicScope::SYSTEM:
1281 case SIAtomicScope::AGENT:
1282 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBINVL1));
1283 Changed = true;
1284 break;
1285 case SIAtomicScope::WORKGROUP:
1286 case SIAtomicScope::WAVEFRONT:
1287 case SIAtomicScope::SINGLETHREAD:
1288 // No cache to invalidate.
1289 break;
1290 default:
1291 llvm_unreachable("Unsupported synchronization scope");
1292 }
1293 }
1294
1295 /// The scratch address space does not need the global memory cache
1296 /// to be flushed as all memory operations by the same thread are
1297 /// sequentially consistent, and no other thread can access scratch
1298 /// memory.
1299
1300 /// Other address spaces do not have a cache.
1301
1302 if (Pos == Position::AFTER)
1303 --MI;
1304
1305 return Changed;
1306}
1307
1308bool SIGfx6CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1309 SIAtomicScope Scope,
1310 SIAtomicAddrSpace AddrSpace,
1311 bool IsCrossAddrSpaceOrdering,
1312 Position Pos) const {
1313 return insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1314 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
1315 /*AtomicsOnly=*/false);
1316}
1317
1318bool SIGfx7CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1319 SIAtomicScope Scope,
1320 SIAtomicAddrSpace AddrSpace,
1321 Position Pos) const {
1322 if (!InsertCacheInv)
1323 return false;
1324
1325 bool Changed = false;
1326
1327 MachineBasicBlock &MBB = *MI->getParent();
1328 DebugLoc DL = MI->getDebugLoc();
1329
1330 const GCNSubtarget &STM = MBB.getParent()->getSubtarget<GCNSubtarget>();
1331
1332 const unsigned InvalidateL1 = STM.isAmdPalOS() || STM.isMesa3DOS()
1333 ? AMDGPU::BUFFER_WBINVL1
1334 : AMDGPU::BUFFER_WBINVL1_VOL;
1335
1336 if (Pos == Position::AFTER)
1337 ++MI;
1338
1339 if (canAffectGlobalAddrSpace(AddrSpace)) {
1340 switch (Scope) {
1341 case SIAtomicScope::SYSTEM:
1342 case SIAtomicScope::AGENT:
1343 BuildMI(MBB, MI, DL, TII->get(InvalidateL1));
1344 Changed = true;
1345 break;
1346 case SIAtomicScope::WORKGROUP:
1347 case SIAtomicScope::WAVEFRONT:
1348 case SIAtomicScope::SINGLETHREAD:
1349 // No cache to invalidate.
1350 break;
1351 default:
1352 llvm_unreachable("Unsupported synchronization scope");
1353 }
1354 }
1355
1356 /// The scratch address space does not need the global memory cache
1357 /// to be flushed as all memory operations by the same thread are
1358 /// sequentially consistent, and no other thread can access scratch
1359 /// memory.
1360
1361 /// Other address spaces do not have a cache.
1362
1363 if (Pos == Position::AFTER)
1364 --MI;
1365
1366 return Changed;
1367}
1368
1369bool SIGfx90ACacheControl::enableLoadCacheBypass(
1370 const MachineBasicBlock::iterator &MI,
1371 SIAtomicScope Scope,
1372 SIAtomicAddrSpace AddrSpace) const {
1373 assert(MI->mayLoad() && !MI->mayStore());
1374 bool Changed = false;
1375
1376 if (canAffectGlobalAddrSpace(AddrSpace)) {
1377 switch (Scope) {
1378 case SIAtomicScope::SYSTEM:
1379 case SIAtomicScope::AGENT:
1380 // Set the L1 cache policy to MISS_LRU.
1381 // Note: there is no L2 cache bypass policy at the ISA level.
1382 Changed |= enableGLCBit(MI);
1383 break;
1384 case SIAtomicScope::WORKGROUP:
1385 // In threadgroup split mode the waves of a work-group can be executing on
1386 // different CUs. Therefore need to bypass the L1 which is per CU.
1387 // Otherwise in non-threadgroup split mode all waves of a work-group are
1388 // on the same CU, and so the L1 does not need to be bypassed.
1389 if (ST.isTgSplitEnabled())
1390 Changed |= enableGLCBit(MI);
1391 break;
1392 case SIAtomicScope::WAVEFRONT:
1393 case SIAtomicScope::SINGLETHREAD:
1394 // No cache to bypass.
1395 break;
1396 default:
1397 llvm_unreachable("Unsupported synchronization scope");
1398 }
1399 }
1400
1401 /// The scratch address space does not need the global memory caches
1402 /// to be bypassed as all memory operations by the same thread are
1403 /// sequentially consistent, and no other thread can access scratch
1404 /// memory.
1405
1406 /// Other address spaces do not have a cache.
1407
1408 return Changed;
1409}
1410
1411bool SIGfx90ACacheControl::enableRMWCacheBypass(
1412 const MachineBasicBlock::iterator &MI,
1413 SIAtomicScope Scope,
1414 SIAtomicAddrSpace AddrSpace) const {
1415 assert(MI->mayLoad() && MI->mayStore());
1416 bool Changed = false;
1417
1418 if (canAffectGlobalAddrSpace(AddrSpace)) {
1419 switch (Scope) {
1420 case SIAtomicScope::SYSTEM:
1421 case SIAtomicScope::AGENT:
1422 /// Do not set glc for RMW atomic operations as they implicitly bypass
1423 /// the L1 cache, and the glc bit is instead used to indicate if they are
1424 /// return or no-return.
1425 break;
1426 case SIAtomicScope::WORKGROUP:
1427 case SIAtomicScope::WAVEFRONT:
1428 case SIAtomicScope::SINGLETHREAD:
1429 // No cache to bypass. RMW atomics implicitly bypass the L1 cache.
1430 break;
1431 default:
1432 llvm_unreachable("Unsupported synchronization scope");
1433 }
1434 }
1435
1436 return Changed;
1437}
1438
1439bool SIGfx90ACacheControl::enableVolatileAndOrNonTemporal(
1440 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1441 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1442 // Only handle load and store, not atomic read-modify-write instructions. The
1443 // latter use glc to indicate if the atomic returns a result and so must not
1444 // be used for cache control.
1445 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1446
1447 // Only update load and store, not LLVM IR atomic read-modify-write
1448 // instructions. The latter are always marked as volatile, and honoring that
1449 // here would pessimize all atomics, so it is not handled. They also do not
1450 // support the nontemporal attribute.
1451 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1452
1453 bool Changed = false;
1454
1455 if (IsVolatile) {
1456 // Set L1 cache policy to be MISS_EVICT for load instructions
1457 // and MISS_LRU for store instructions.
1458 // Note: there is no L2 cache bypass policy at the ISA level.
1459 if (Op == SIMemOp::LOAD)
1460 Changed |= enableGLCBit(MI);
1461
1462 // Ensure operation has completed at system scope to cause all volatile
1463 // operations to be visible outside the program in a global order. Do not
1464 // request cross address space as only the global address space can be
1465 // observable outside the program, so no need to cause a waitcnt for LDS
1466 // address space operations.
1467 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1468 Position::AFTER, AtomicOrdering::Unordered,
1469 /*AtomicsOnly=*/false);
1470
1471 return Changed;
1472 }
1473
1474 if (IsNonTemporal) {
1475 // Setting both GLC and SLC configures L1 cache policy to MISS_EVICT
1476 // for both loads and stores, and the L2 cache policy to STREAM.
1477 Changed |= enableGLCBit(MI);
1478 Changed |= enableSLCBit(MI);
1479 return Changed;
1480 }
1481
1482 return Changed;
1483}
1484
1485bool SIGfx90ACacheControl::insertWait(MachineBasicBlock::iterator &MI,
1486 SIAtomicScope Scope,
1487 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1488 bool IsCrossAddrSpaceOrdering,
1489 Position Pos, AtomicOrdering Order,
1490 bool AtomicsOnly) const {
1491 if (ST.isTgSplitEnabled()) {
1492 // In threadgroup split mode the waves of a work-group can be executing on
1493 // different CUs. Therefore need to wait for global or GDS memory operations
1494 // to complete to ensure they are visible to waves in the other CUs.
1495 // Otherwise in non-threadgroup split mode all waves of a work-group are on
1496 // the same CU, so no need to wait for global memory as all waves in the
1497 // work-group access the same L1, nor wait for GDS as accesses are ordered
1498 // on a CU.
1499 if (((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH |
1500 SIAtomicAddrSpace::GDS)) != SIAtomicAddrSpace::NONE) &&
1501 (Scope == SIAtomicScope::WORKGROUP)) {
1502 // Same as GFX7 using agent scope.
1503 Scope = SIAtomicScope::AGENT;
1504 }
1505 // In threadgroup split mode LDS cannot be allocated so no need to wait for
1506 // LDS memory operations.
1507 AddrSpace &= ~SIAtomicAddrSpace::LDS;
1508 }
1509 return SIGfx7CacheControl::insertWait(MI, Scope, AddrSpace, Op,
1510 IsCrossAddrSpaceOrdering, Pos, Order,
1511 AtomicsOnly);
1512}
1513
1514bool SIGfx90ACacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1515 SIAtomicScope Scope,
1516 SIAtomicAddrSpace AddrSpace,
1517 Position Pos) const {
1518 if (!InsertCacheInv)
1519 return false;
1520
1521 bool Changed = false;
1522
1523 MachineBasicBlock &MBB = *MI->getParent();
1524 DebugLoc DL = MI->getDebugLoc();
1525
1526 if (Pos == Position::AFTER)
1527 ++MI;
1528
1529 if (canAffectGlobalAddrSpace(AddrSpace)) {
1530 switch (Scope) {
1531 case SIAtomicScope::SYSTEM:
1532 // Ensures that following loads will not see stale remote VMEM data or
1533 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1534 // CC will never be stale due to the local memory probes.
1535 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INVL2));
1536 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1537 // hardware does not reorder memory operations by the same wave with
1538 // respect to a preceding "BUFFER_INVL2". The invalidate is guaranteed to
1539 // remove any cache lines of earlier writes by the same wave and ensures
1540 // later reads by the same wave will refetch the cache lines.
1541 Changed = true;
1542 break;
1543 case SIAtomicScope::AGENT:
1544 // Same as GFX7.
1545 break;
1546 case SIAtomicScope::WORKGROUP:
1547 // In threadgroup split mode the waves of a work-group can be executing on
1548 // different CUs. Therefore need to invalidate the L1 which is per CU.
1549 // Otherwise in non-threadgroup split mode all waves of a work-group are
1550 // on the same CU, and so the L1 does not need to be invalidated.
1551 if (ST.isTgSplitEnabled()) {
1552 // Same as GFX7 using agent scope.
1553 Scope = SIAtomicScope::AGENT;
1554 }
1555 break;
1556 case SIAtomicScope::WAVEFRONT:
1557 case SIAtomicScope::SINGLETHREAD:
1558 // Same as GFX7.
1559 break;
1560 default:
1561 llvm_unreachable("Unsupported synchronization scope");
1562 }
1563 }
1564
1565 /// The scratch address space does not need the global memory cache
1566 /// to be flushed as all memory operations by the same thread are
1567 /// sequentially consistent, and no other thread can access scratch
1568 /// memory.
1569
1570 /// Other address spaces do not have a cache.
1571
1572 if (Pos == Position::AFTER)
1573 --MI;
1574
1575 Changed |= SIGfx7CacheControl::insertAcquire(MI, Scope, AddrSpace, Pos);
1576
1577 return Changed;
1578}
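// Sketch of the effect (illustrative): a system-scope acquire on gfx90a gets a
// "buffer_invl2" inserted here, and the SIGfx7CacheControl::insertAcquire call
// above then adds the per-CU L1 invalidation handled by the GFX7 path; a
// workgroup-scope acquire only reaches that L1 invalidation when threadgroup
// split mode promotes it to agent scope.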
1579
1580bool SIGfx90ACacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1581 SIAtomicScope Scope,
1582 SIAtomicAddrSpace AddrSpace,
1583 bool IsCrossAddrSpaceOrdering,
1584 Position Pos) const {
1585 bool Changed = false;
1586
1587 MachineBasicBlock &MBB = *MI->getParent();
1588 const DebugLoc &DL = MI->getDebugLoc();
1589
1590 if (Pos == Position::AFTER)
1591 ++MI;
1592
1593 if (canAffectGlobalAddrSpace(AddrSpace)) {
1594 switch (Scope) {
1595 case SIAtomicScope::SYSTEM:
1596 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1597 // hardware does not reorder memory operations by the same wave with
1598 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1599 // to initiate writeback of any dirty cache lines of earlier writes by the
1600 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1601 // writeback has completed.
1602 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1603 // Set SC bits to indicate system scope.
1604 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1605 // Followed by same as GFX7, which will ensure the necessary "S_WAITCNT
1606 // vmcnt(0)" needed by the "BUFFER_WBL2".
1607 Changed = true;
1608 break;
1609 case SIAtomicScope::AGENT:
1610 case SIAtomicScope::WORKGROUP:
1611 case SIAtomicScope::WAVEFRONT:
1612 case SIAtomicScope::SINGLETHREAD:
1613 // Same as GFX7.
1614 break;
1615 default:
1616 llvm_unreachable("Unsupported synchronization scope");
1617 }
1618 }
1619
1620 if (Pos == Position::AFTER)
1621 --MI;
1622
1623 Changed |=
1624 SIGfx7CacheControl::insertRelease(MI, Scope, AddrSpace,
1625 IsCrossAddrSpaceOrdering, Pos);
1626
1627 return Changed;
1628}
1629
1630bool SIGfx940CacheControl::enableLoadCacheBypass(
1631 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1632 SIAtomicAddrSpace AddrSpace) const {
1633 assert(MI->mayLoad() && !MI->mayStore());
1634 bool Changed = false;
1635
1636 if (canAffectGlobalAddrSpace(AddrSpace)) {
1637 switch (Scope) {
1638 case SIAtomicScope::SYSTEM:
1639 // Set SC bits to indicate system scope.
1640 Changed |= enableSC0Bit(MI);
1641 Changed |= enableSC1Bit(MI);
1642 break;
1643 case SIAtomicScope::AGENT:
1644 // Set SC bits to indicate agent scope.
1645 Changed |= enableSC1Bit(MI);
1646 break;
1647 case SIAtomicScope::WORKGROUP:
1648 // In threadgroup split mode the waves of a work-group can be executing on
1649 // different CUs. Therefore need to bypass the L1 which is per CU.
1650 // Otherwise in non-threadgroup split mode all waves of a work-group are
1651 // on the same CU, and so the L1 does not need to be bypassed. Setting SC
1652 // bits to indicate work-group scope will do this automatically.
1653 Changed |= enableSC0Bit(MI);
1654 break;
1655 case SIAtomicScope::WAVEFRONT:
1656 case SIAtomicScope::SINGLETHREAD:
1657 // Leave SC bits unset to indicate wavefront scope.
1658 break;
1659 default:
1660 llvm_unreachable("Unsupported synchronization scope");
1661 }
1662 }
1663
1664 /// The scratch address space does not need the global memory caches
1665 /// to be bypassed as all memory operations by the same thread are
1666 /// sequentially consistent, and no other thread can access scratch
1667 /// memory.
1668
1669 /// Other address spaces do not have a cache.
1670
1671 return Changed;
1672}
1673
1674 bool SIGfx940CacheControl::enableStoreCacheBypass(
1675 const MachineBasicBlock::iterator &MI,
1676 SIAtomicScope Scope, SIAtomicAddrSpace AddrSpace) const {
1677 assert(!MI->mayLoad() && MI->mayStore());
1678 bool Changed = false;
1679
1680 if (canAffectGlobalAddrSpace(AddrSpace)) {
1681 switch (Scope) {
1682 case SIAtomicScope::SYSTEM:
1683 // Set SC bits to indicate system scope.
1684 Changed |= enableSC0Bit(MI);
1685 Changed |= enableSC1Bit(MI);
1686 break;
1687 case SIAtomicScope::AGENT:
1688 // Set SC bits to indicate agent scope.
1689 Changed |= enableSC1Bit(MI);
1690 break;
1691 case SIAtomicScope::WORKGROUP:
1692 // Set SC bits to indicate workgroup scope.
1693 Changed |= enableSC0Bit(MI);
1694 break;
1695 case SIAtomicScope::WAVEFRONT:
1696 case SIAtomicScope::SINGLETHREAD:
1697 // Leave SC bits unset to indicate wavefront scope.
1698 break;
1699 default:
1700 llvm_unreachable("Unsupported synchronization scope");
1701 }
1702 }
1703
1704 /// The scratch address space does not need the global memory caches
1705 /// to be bypassed as all memory operations by the same thread are
1706 /// sequentially consistent, and no other thread can access scratch
1707 /// memory.
1708
1709 /// Other address spaces do not have a cache.
1710
1711 return Changed;
1712}
1713
1714bool SIGfx940CacheControl::enableRMWCacheBypass(
1715 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
1716 SIAtomicAddrSpace AddrSpace) const {
1717 assert(MI->mayLoad() && MI->mayStore());
1718 bool Changed = false;
1719
1720 if (canAffectGlobalAddrSpace(AddrSpace)) {
1721 switch (Scope) {
1722 case SIAtomicScope::SYSTEM:
1723 // Set SC1 bit to indicate system scope.
1724 Changed |= enableSC1Bit(MI);
1725 break;
1726 case SIAtomicScope::AGENT:
1727 case SIAtomicScope::WORKGROUP:
1728 case SIAtomicScope::WAVEFRONT:
1729 case SIAtomicScope::SINGLETHREAD:
1730 // RMW atomic operations implicitly bypass the L1 cache and only use SC1
1731 // to indicate system or agent scope. The SC0 bit is used to indicate if
1732 // they are return or no-return. Leave SC1 bit unset to indicate agent
1733 // scope.
1734 break;
1735 default:
1736 llvm_unreachable("Unsupported synchronization scope");
1737 }
1738 }
1739
1740 return Changed;
1741}
1742
1743bool SIGfx940CacheControl::enableVolatileAndOrNonTemporal(
1744 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1745 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1746 // Only handle load and store, not atomic read-modify-write instructions. The
1747 // latter use glc to indicate if the atomic returns a result and so must not
1748 // be used for cache control.
1749 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1750
1751 // Only update load and store, not LLVM IR atomic read-modify-write
1752 // instructions. The latter are always marked as volatile so cannot sensibly
1753 // handle it as do not want to pessimize all atomics. Also they do not support
1754 // the nontemporal attribute.
1755 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1756
1757 bool Changed = false;
1758
1759 if (IsVolatile) {
1760 // Set SC bits to indicate system scope.
1761 Changed |= enableSC0Bit(MI);
1762 Changed |= enableSC1Bit(MI);
1763
1764 // Ensure operation has completed at system scope to cause all volatile
1765 // operations to be visible outside the program in a global order. Do not
1766 // request cross address space as only the global address space can be
1767 // observable outside the program, so no need to cause a waitcnt for LDS
1768 // address space operations.
1769 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
1770 Position::AFTER, AtomicOrdering::Unordered,
1771 /*AtomicsOnly=*/false);
1772
1773 return Changed;
1774 }
1775
1776 if (IsNonTemporal) {
1777 Changed |= enableNTBit(MI);
1778 return Changed;
1779 }
1780
1781 return Changed;
1782}
1783
1784bool SIGfx940CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
1785 SIAtomicScope Scope,
1786 SIAtomicAddrSpace AddrSpace,
1787 Position Pos) const {
1788 if (!InsertCacheInv)
1789 return false;
1790
1791 bool Changed = false;
1792
1793 MachineBasicBlock &MBB = *MI->getParent();
1794 DebugLoc DL = MI->getDebugLoc();
1795
1796 if (Pos == Position::AFTER)
1797 ++MI;
1798
1799 if (canAffectGlobalAddrSpace(AddrSpace)) {
1800 switch (Scope) {
1801 case SIAtomicScope::SYSTEM:
1802 // Ensures that following loads will not see stale remote VMEM data or
1803 // stale local VMEM data with MTYPE NC. Local VMEM data with MTYPE RW and
1804 // CC will never be stale due to the local memory probes.
1805 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1806 // Set SC bits to indicate system scope.
1807 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1808 // Inserting a "S_WAITCNT vmcnt(0)" after is not required because the
1809 // hardware does not reorder memory operations by the same wave with
1810 // respect to a preceding "BUFFER_INV". The invalidate is guaranteed to
1811 // remove any cache lines of earlier writes by the same wave and ensures
1812 // later reads by the same wave will refetch the cache lines.
1813 Changed = true;
1814 break;
1815 case SIAtomicScope::AGENT:
1816 // Ensures that following loads will not see stale remote data or local
1817 // MTYPE NC global data. Local MTYPE RW and CC memory will never be stale
1818 // due to the memory probes.
1819 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1820 // Set SC bits to indicate agent scope.
1821 .addImm(AMDGPU::CPol::SC1);
1822 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1823 // does not reorder memory operations with respect to the preceding buffer
1824 // invalidate. The invalidate is guaranteed to remove any cache lines of
1825 // earlier writes and ensures later reads will refetch the cache lines.
1826 Changed = true;
1827 break;
1828 case SIAtomicScope::WORKGROUP:
1829 // In threadgroup split mode the waves of a work-group can be executing on
1830 // different CUs. Therefore need to invalidate the L1 which is per CU.
1831 // Otherwise in non-threadgroup split mode all waves of a work-group are
1832 // on the same CU, and so the L1 does not need to be invalidated.
1833 if (ST.isTgSplitEnabled()) {
1834 // Ensures L1 is invalidated if in threadgroup split mode. In
1835 // non-threadgroup split mode it is a NOP, but no point generating it in
1836 // that case if we know we are not in that mode.
1837 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_INV))
1838 // Set SC bits to indicate work-group scope.
1839 .addImm(AMDGPU::CPol::SC0);
1840 // Inserting "S_WAITCNT vmcnt(0)" is not required because the hardware
1841 // does not reorder memory operations with respect to the preceding buffer
1842 // invalidate. The invalidate is guaranteed to remove any cache lines of
1843 // earlier writes and ensures later reads will refetch the cache lines.
1844 Changed = true;
1845 }
1846 break;
1847 case SIAtomicScope::WAVEFRONT:
1848 case SIAtomicScope::SINGLETHREAD:
1849 // Could generate "BUFFER_INV" but it would do nothing as there are no
1850 // caches to invalidate.
1851 break;
1852 default:
1853 llvm_unreachable("Unsupported synchronization scope");
1854 }
1855 }
1856
1857 /// The scratch address space does not need the global memory cache
1858 /// to be flushed as all memory operations by the same thread are
1859 /// sequentially consistent, and no other thread can access scratch
1860 /// memory.
1861
1862 /// Other address spaces do not have a cache.
1863
1864 if (Pos == Position::AFTER)
1865 --MI;
1866
1867 return Changed;
1868}
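// In effect (rough sketch): an acquire at system scope emits "buffer_inv sc0 sc1",
// agent scope emits "buffer_inv sc1", and workgroup scope emits "buffer_inv sc0"
// only when threadgroup split mode is enabled; narrower scopes emit nothing.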
1869
1870bool SIGfx940CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
1871 SIAtomicScope Scope,
1872 SIAtomicAddrSpace AddrSpace,
1873 bool IsCrossAddrSpaceOrdering,
1874 Position Pos) const {
1875 bool Changed = false;
1876
1877 MachineBasicBlock &MBB = *MI->getParent();
1878 DebugLoc DL = MI->getDebugLoc();
1879
1880 if (Pos == Position::AFTER)
1881 ++MI;
1882
1883 if (canAffectGlobalAddrSpace(AddrSpace)) {
1884 switch (Scope) {
1885 case SIAtomicScope::SYSTEM:
1886 // Inserting a "S_WAITCNT vmcnt(0)" before is not required because the
1887 // hardware does not reorder memory operations by the same wave with
1888 // respect to a following "BUFFER_WBL2". The "BUFFER_WBL2" is guaranteed
1889 // to initiate writeback of any dirty cache lines of earlier writes by the
1890 // same wave. A "S_WAITCNT vmcnt(0)" is needed after to ensure the
1891 // writeback has completed.
1892 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1893 // Set SC bits to indicate system scope.
1894 .addImm(AMDGPU::CPol::SC0 | AMDGPU::CPol::SC1);
1895 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1896 // SIAtomicScope::SYSTEM, the following insertWait will generate the
1897 // required "S_WAITCNT vmcnt(0)" needed by the "BUFFER_WBL2".
1898 Changed = true;
1899 break;
1900 case SIAtomicScope::AGENT:
1901 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_WBL2))
1902 // Set SC bits to indicate agent scope.
1903 .addImm(AMDGPU::CPol::SC1);
1904
1905 // Since AddrSpace contains SIAtomicAddrSpace::GLOBAL and Scope is
1906 // SIAtomicScope::AGENT, the following insertWait will generate the
1907 // required "S_WAITCNT vmcnt(0)".
1908 Changed = true;
1909 break;
1910 case SIAtomicScope::WORKGROUP:
1911 case SIAtomicScope::WAVEFRONT:
1912 case SIAtomicScope::SINGLETHREAD:
1913 // Do not generate "BUFFER_WBL2" as there are no caches it would
1914 // writeback, and would require an otherwise unnecessary
1915 // "S_WAITCNT vmcnt(0)".
1916 break;
1917 default:
1918 llvm_unreachable("Unsupported synchronization scope");
1919 }
1920 }
1921
1922 if (Pos == Position::AFTER)
1923 --MI;
1924
1925 // Ensure the necessary S_WAITCNT needed by any "BUFFER_WBL2" as well as other
1926 // S_WAITCNT needed.
1927 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
1928 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
1929 /*AtomicsOnly=*/false);
1930
1931 return Changed;
1932}
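// Rough sketch of the result: a system-scope release becomes "buffer_wbl2 sc0 sc1"
// followed by the waits emitted by insertWait (at least a vmcnt(0) wait), an
// agent-scope release becomes "buffer_wbl2 sc1" plus the same wait, and narrower
// scopes only get the wait.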
1933
1934 bool SIGfx10CacheControl::enableLoadCacheBypass(
1935 const MachineBasicBlock::iterator &MI,
1936 SIAtomicScope Scope,
1937 SIAtomicAddrSpace AddrSpace) const {
1938 assert(MI->mayLoad() && !MI->mayStore());
1939 bool Changed = false;
1940
1941 if (canAffectGlobalAddrSpace(AddrSpace)) {
1942 switch (Scope) {
1943 case SIAtomicScope::SYSTEM:
1944 case SIAtomicScope::AGENT:
1945 // Set the L0 and L1 cache policies to MISS_EVICT.
1946 // Note: there is no L2 cache coherent bypass control at the ISA level.
1947 Changed |= enableGLCBit(MI);
1948 Changed |= enableDLCBit(MI);
1949 break;
1950 case SIAtomicScope::WORKGROUP:
1951 // In WGP mode the waves of a work-group can be executing on either CU of
1952 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
1953 // CU mode all waves of a work-group are on the same CU, and so the L0
1954 // does not need to be bypassed.
1955 if (!ST.isCuModeEnabled())
1956 Changed |= enableGLCBit(MI);
1957 break;
1958 case SIAtomicScope::WAVEFRONT:
1959 case SIAtomicScope::SINGLETHREAD:
1960 // No cache to bypass.
1961 break;
1962 default:
1963 llvm_unreachable("Unsupported synchronization scope");
1964 }
1965 }
1966
1967 /// The scratch address space does not need the global memory caches
1968 /// to be bypassed as all memory operations by the same thread are
1969 /// sequentially consistent, and no other thread can access scratch
1970 /// memory.
1971
1972 /// Other address spaces do not have a cache.
1973
1974 return Changed;
1975}
1976
1977bool SIGfx10CacheControl::enableVolatileAndOrNonTemporal(
1978 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
1979 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
1980
1981 // Only handle load and store, not atomic read-modify-write instructions. The
1982 // latter use glc to indicate if the atomic returns a result and so must not
1983 // be used for cache control.
1984 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
1985
1986 // Only update load and store, not LLVM IR atomic read-modify-write
1987 // instructions. The latter are always marked as volatile so cannot sensibly
1988 // handle it as do not want to pessimize all atomics. Also they do not support
1989 // the nontemporal attribute.
1990 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
1991
1992 bool Changed = false;
1993
1994 if (IsVolatile) {
1995 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
1996 // and MISS_LRU for store instructions.
1997 // Note: there is no L2 cache coherent bypass control at the ISA level.
1998 if (Op == SIMemOp::LOAD) {
1999 Changed |= enableGLCBit(MI);
2000 Changed |= enableDLCBit(MI);
2001 }
2002
2003 // Ensure operation has completed at system scope to cause all volatile
2004 // operations to be visible outside the program in a global order. Do not
2005 // request cross address space as only the global address space can be
2006 // observable outside the program, so no need to cause a waitcnt for LDS
2007 // address space operations.
2008 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2009 Position::AFTER, AtomicOrdering::Unordered,
2010 /*AtomicsOnly=*/false);
2011 return Changed;
2012 }
2013
2014 if (IsNonTemporal) {
2015 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2016 // and L2 cache policy to STREAM.
2017 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2018 // to MISS_EVICT and the L2 cache policy to STREAM.
2019 if (Op == SIMemOp::STORE)
2020 Changed |= enableGLCBit(MI);
2021 Changed |= enableSLCBit(MI);
2022
2023 return Changed;
2024 }
2025
2026 return Changed;
2027}
2028
2029bool SIGfx10CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2030 SIAtomicScope Scope,
2031 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2032 bool IsCrossAddrSpaceOrdering,
2033 Position Pos, AtomicOrdering Order,
2034 bool AtomicsOnly) const {
2035 bool Changed = false;
2036
2037 MachineBasicBlock &MBB = *MI->getParent();
2038 DebugLoc DL = MI->getDebugLoc();
2039
2040 if (Pos == Position::AFTER)
2041 ++MI;
2042
2043 bool VMCnt = false;
2044 bool VSCnt = false;
2045 bool LGKMCnt = false;
2046
2047 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2048 SIAtomicAddrSpace::NONE) {
2049 switch (Scope) {
2050 case SIAtomicScope::SYSTEM:
2051 case SIAtomicScope::AGENT:
2052 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2053 VMCnt |= true;
2054 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2055 VSCnt |= true;
2056 break;
2057 case SIAtomicScope::WORKGROUP:
2058 // In WGP mode the waves of a work-group can be executing on either CU of
2059 // the WGP. Therefore need to wait for operations to complete to ensure
2060 // they are visible to waves in the other CU as the L0 is per CU.
2061 // Otherwise in CU mode all waves of a work-group are on the same CU
2062 // which shares the same L0. Note that we still need to wait when
2063 // performing a release in this mode to respect the transitivity of
2064 // happens-before, e.g. other waves of the workgroup must be able to
2065 // release the memory from another wave at a wider scope.
2066 if (!ST.isCuModeEnabled() || isReleaseOrStronger(Order)) {
2067 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2068 VMCnt |= true;
2069 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2070 VSCnt |= true;
2071 }
2072 break;
2073 case SIAtomicScope::WAVEFRONT:
2074 case SIAtomicScope::SINGLETHREAD:
2075 // The L0 cache keeps all memory operations in order for
2076 // work-items in the same wavefront.
2077 break;
2078 default:
2079 llvm_unreachable("Unsupported synchronization scope");
2080 }
2081 }
2082
2083 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2084 switch (Scope) {
2085 case SIAtomicScope::SYSTEM:
2086 case SIAtomicScope::AGENT:
2087 case SIAtomicScope::WORKGROUP:
2088 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2089 // not needed as LDS operations for all waves are executed in a total
2090 // global ordering as observed by all waves. Required if also
2091 // synchronizing with global/GDS memory as LDS operations could be
2092 // reordered with respect to later global/GDS memory operations of the
2093 // same wave.
2094 LGKMCnt |= IsCrossAddrSpaceOrdering;
2095 break;
2096 case SIAtomicScope::WAVEFRONT:
2097 case SIAtomicScope::SINGLETHREAD:
2098 // The LDS keeps all memory operations in order for
2099 // the same wavefront.
2100 break;
2101 default:
2102 llvm_unreachable("Unsupported synchronization scope");
2103 }
2104 }
2105
2106 if ((AddrSpace & SIAtomicAddrSpace::GDS) != SIAtomicAddrSpace::NONE) {
2107 switch (Scope) {
2108 case SIAtomicScope::SYSTEM:
2109 case SIAtomicScope::AGENT:
2110 // If no cross address space ordering then a GDS "S_WAITCNT lgkmcnt(0)"
2111 // is not needed as GDS operations for all waves are executed in a total
2112 // global ordering as observed by all waves. Required if also
2113 // synchronizing with global/LDS memory as GDS operations could be
2114 // reordered with respect to later global/LDS memory operations of the
2115 // same wave.
2116 LGKMCnt |= IsCrossAddrSpaceOrdering;
2117 break;
2118 case SIAtomicScope::WORKGROUP:
2119 case SIAtomicScope::WAVEFRONT:
2120 case SIAtomicScope::SINGLETHREAD:
2121 // The GDS keeps all memory operations in order for
2122 // the same work-group.
2123 break;
2124 default:
2125 llvm_unreachable("Unsupported synchronization scope");
2126 }
2127 }
2128
2129 if (VMCnt || LGKMCnt) {
2130 unsigned WaitCntImmediate =
2131 AMDGPU::encodeWaitcnt(IV,
2132 VMCnt ? 0 : getVmcntBitMask(IV),
2133 getExpcntBitMask(IV),
2134 LGKMCnt ? 0 : getLgkmcntBitMask(IV));
2135 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_soft))
2136 .addImm(WaitCntImmediate);
2137 Changed = true;
2138 }
2139
2140 // On architectures that support direct loads to LDS, emit an unknown waitcnt
2141 // at workgroup-scoped release operations that specify the LDS address space.
2142 // SIInsertWaitcnts will later replace this with a vmcnt().
2143 if (ST.hasVMemToLDSLoad() && isReleaseOrStronger(Order) &&
2144 Scope == SIAtomicScope::WORKGROUP &&
2145 (AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2146 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_lds_direct));
2147 Changed = true;
2148 }
2149
2150 if (VSCnt) {
2151 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT_soft))
2152 .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
2153 .addImm(0);
2154 Changed = true;
2155 }
2156
2157 if (Pos == Position::AFTER)
2158 --MI;
2159
2160 return Changed;
2161}
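// For example, a seq_cst fence at agent scope ordering global and LDS memory
// (with cross address space ordering) emits roughly:
//   s_waitcnt vmcnt(0) lgkmcnt(0)
//   s_waitcnt_vscnt null, 0
// (as soft waitcnts), while wavefront or single-thread scope emits nothing.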
2162
2163bool SIGfx10CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2164 SIAtomicScope Scope,
2165 SIAtomicAddrSpace AddrSpace,
2166 Position Pos) const {
2167 if (!InsertCacheInv)
2168 return false;
2169
2170 bool Changed = false;
2171
2172 MachineBasicBlock &MBB = *MI->getParent();
2173 DebugLoc DL = MI->getDebugLoc();
2174
2175 if (Pos == Position::AFTER)
2176 ++MI;
2177
2178 if (canAffectGlobalAddrSpace(AddrSpace)) {
2179 switch (Scope) {
2180 case SIAtomicScope::SYSTEM:
2181 case SIAtomicScope::AGENT:
2182 // The order of invalidates matters here. We must invalidate "outer in"
2183 // so L1 -> L0 to avoid L0 pulling in stale data from L1 when it is
2184 // invalidated.
2185 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL1_INV));
2186 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2187 Changed = true;
2188 break;
2189 case SIAtomicScope::WORKGROUP:
2190 // In WGP mode the waves of a work-group can be executing on either CU of
2191 // the WGP. Therefore need to invalidate the L0 which is per CU. Otherwise
2192 // in CU mode all waves of a work-group are on the same CU, and so the
2193 // L0 does not need to be invalidated.
2194 if (!ST.isCuModeEnabled()) {
2195 BuildMI(MBB, MI, DL, TII->get(AMDGPU::BUFFER_GL0_INV));
2196 Changed = true;
2197 }
2198 break;
2199 case SIAtomicScope::WAVEFRONT:
2200 case SIAtomicScope::SINGLETHREAD:
2201 // No cache to invalidate.
2202 break;
2203 default:
2204 llvm_unreachable("Unsupported synchronization scope");
2205 }
2206 }
2207
2208 /// The scratch address space does not need the global memory cache
2209 /// to be flushed as all memory operations by the same thread are
2210 /// sequentially consistent, and no other thread can access scratch
2211 /// memory.
2212
2213 /// Other address spaces do not have a cache.
2214
2215 if (Pos == Position::AFTER)
2216 --MI;
2217
2218 return Changed;
2219}
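// For example, an agent- or system-scope acquire expands to "buffer_gl1_inv"
// followed by "buffer_gl0_inv", while a workgroup-scope acquire emits only
// "buffer_gl0_inv", and only in WGP mode.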
2220
2221bool SIGfx11CacheControl::enableLoadCacheBypass(
2222 const MachineBasicBlock::iterator &MI, SIAtomicScope Scope,
2223 SIAtomicAddrSpace AddrSpace) const {
2224 assert(MI->mayLoad() && !MI->mayStore());
2225 bool Changed = false;
2226
2227 if (canAffectGlobalAddrSpace(AddrSpace)) {
2228 switch (Scope) {
2229 case SIAtomicScope::SYSTEM:
2230 case SIAtomicScope::AGENT:
2231 // Set the L0 and L1 cache policies to MISS_EVICT.
2232 // Note: there is no L2 cache coherent bypass control at the ISA level.
2233 Changed |= enableGLCBit(MI);
2234 break;
2235 case SIAtomicScope::WORKGROUP:
2236 // In WGP mode the waves of a work-group can be executing on either CU of
2237 // the WGP. Therefore need to bypass the L0 which is per CU. Otherwise in
2238 // CU mode all waves of a work-group are on the same CU, and so the L0
2239 // does not need to be bypassed.
2240 if (!ST.isCuModeEnabled())
2241 Changed |= enableGLCBit(MI);
2242 break;
2243 case SIAtomicScope::WAVEFRONT:
2244 case SIAtomicScope::SINGLETHREAD:
2245 // No cache to bypass.
2246 break;
2247 default:
2248 llvm_unreachable("Unsupported synchronization scope");
2249 }
2250 }
2251
2252 /// The scratch address space does not need the global memory caches
2253 /// to be bypassed as all memory operations by the same thread are
2254 /// sequentially consistent, and no other thread can access scratch
2255 /// memory.
2256
2257 /// Other address spaces do not have a cache.
2258
2259 return Changed;
2260}
2261
2262bool SIGfx11CacheControl::enableVolatileAndOrNonTemporal(
2263 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2264 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2265
2266 // Only handle load and store, not atomic read-modify-write instructions. The
2267 // latter use glc to indicate if the atomic returns a result and so must not
2268 // be used for cache control.
2269 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
2270
2271 // Only update load and store, not LLVM IR atomic read-modify-write
2272 // instructions. The latter are always marked as volatile so cannot sensibly
2273 // handle it as do not want to pessimize all atomics. Also they do not support
2274 // the nontemporal attribute.
2275 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2276
2277 bool Changed = false;
2278
2279 if (IsVolatile) {
2280 // Set L0 and L1 cache policy to be MISS_EVICT for load instructions
2281 // and MISS_LRU for store instructions.
2282 // Note: there is no L2 cache coherent bypass control at the ISA level.
2283 if (Op == SIMemOp::LOAD)
2284 Changed |= enableGLCBit(MI);
2285
2286 // Set MALL NOALLOC for load and store instructions.
2287 Changed |= enableDLCBit(MI);
2288
2289 // Ensure operation has completed at system scope to cause all volatile
2290 // operations to be visible outside the program in a global order. Do not
2291 // request cross address space as only the global address space can be
2292 // observable outside the program, so no need to cause a waitcnt for LDS
2293 // address space operations.
2294 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2295 Position::AFTER, AtomicOrdering::Unordered,
2296 /*AtomicsOnly=*/false);
2297 return Changed;
2298 }
2299
2300 if (IsNonTemporal) {
2301 // For loads setting SLC configures L0 and L1 cache policy to HIT_EVICT
2302 // and L2 cache policy to STREAM.
2303 // For stores setting both GLC and SLC configures L0 and L1 cache policy
2304 // to MISS_EVICT and the L2 cache policy to STREAM.
2305 if (Op == SIMemOp::STORE)
2306 Changed |= enableGLCBit(MI);
2307 Changed |= enableSLCBit(MI);
2308
2309 // Set MALL NOALLOC for load and store instructions.
2310 Changed |= enableDLCBit(MI);
2311 return Changed;
2312 }
2313
2314 return Changed;
2315}
2316
2317bool SIGfx12CacheControl::setTH(const MachineBasicBlock::iterator MI,
2318 AMDGPU::CPol::CPol Value) const {
2319 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2320 if (!CPol)
2321 return false;
2322
2323 uint64_t NewTH = Value & AMDGPU::CPol::TH;
2324 if ((CPol->getImm() & AMDGPU::CPol::TH) != NewTH) {
2325 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::TH) | NewTH);
2326 return true;
2327 }
2328
2329 return false;
2330}
2331
2332bool SIGfx12CacheControl::setScope(const MachineBasicBlock::iterator MI,
2333 AMDGPU::CPol::CPol Value) const {
2334 MachineOperand *CPol = TII->getNamedOperand(*MI, OpName::cpol);
2335 if (!CPol)
2336 return false;
2337
2338 uint64_t NewScope = Value & AMDGPU::CPol::SCOPE;
2339 if ((CPol->getImm() & AMDGPU::CPol::SCOPE) != NewScope) {
2340 CPol->setImm((CPol->getImm() & ~AMDGPU::CPol::SCOPE) | NewScope);
2341 return true;
2342 }
2343
2344 return false;
2345}
2346
2347bool SIGfx12CacheControl::insertWaitsBeforeSystemScopeStore(
2348 const MachineBasicBlock::iterator MI) const {
2349 // TODO: implement flag for frontend to give us a hint not to insert waits.
2350
2351 MachineBasicBlock &MBB = *MI->getParent();
2352 const DebugLoc &DL = MI->getDebugLoc();
2353
2354 BuildMI(MBB, MI, DL, TII->get(S_WAIT_LOADCNT_soft)).addImm(0);
2355 if (ST.hasImageInsts()) {
2356 BuildMI(MBB, MI, DL, TII->get(S_WAIT_SAMPLECNT_soft)).addImm(0);
2357 BuildMI(MBB, MI, DL, TII->get(S_WAIT_BVHCNT_soft)).addImm(0);
2358 }
2359 BuildMI(MBB, MI, DL, TII->get(S_WAIT_KMCNT_soft)).addImm(0);
2360 BuildMI(MBB, MI, DL, TII->get(S_WAIT_STORECNT_soft)).addImm(0);
2361
2362 return true;
2363}
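// The sequence emitted before the store is therefore roughly:
//   s_wait_loadcnt 0
//   s_wait_samplecnt 0 (only if the target has image instructions)
//   s_wait_bvhcnt 0 (likewise)
//   s_wait_kmcnt 0
//   s_wait_storecnt 0
// using the soft forms that SIInsertWaitcnts may later relax or merge.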
2364
2365bool SIGfx12CacheControl::insertWait(MachineBasicBlock::iterator &MI,
2366 SIAtomicScope Scope,
2367 SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2368 bool IsCrossAddrSpaceOrdering,
2369 Position Pos, AtomicOrdering Order,
2370 bool AtomicsOnly) const {
2371 bool Changed = false;
2372
2373 MachineBasicBlock &MBB = *MI->getParent();
2374 DebugLoc DL = MI->getDebugLoc();
2375
2376 bool LOADCnt = false;
2377 bool DSCnt = false;
2378 bool STORECnt = false;
2379
2380 if (Pos == Position::AFTER)
2381 ++MI;
2382
2383 if ((AddrSpace & (SIAtomicAddrSpace::GLOBAL | SIAtomicAddrSpace::SCRATCH)) !=
2384 SIAtomicAddrSpace::NONE) {
2385 switch (Scope) {
2386 case SIAtomicScope::SYSTEM:
2387 case SIAtomicScope::AGENT:
2388 case SIAtomicScope::CLUSTER:
2389 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2390 LOADCnt |= true;
2391 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2392 STORECnt |= true;
2393 break;
2394 case SIAtomicScope::WORKGROUP:
2395 // GFX12.0:
2396 // In WGP mode the waves of a work-group can be executing on either CU
2397 // of the WGP. Therefore need to wait for operations to complete to
2398 // ensure they are visible to waves in the other CU as the L0 is per CU.
2399 //
2400 // Otherwise in CU mode all waves of a work-group are on the same CU
2401 // which shares the same L0. Note that we still need to wait when
2402 // performing a release in this mode to respect the transitivity of
2403 // happens-before, e.g. other waves of the workgroup must be able to
2404 // release the memory from another wave at a wider scope.
2405 //
2406 // GFX12.5:
2407 // CU$ has two ports. To ensure operations are visible at the workgroup
2408 // level, we need to ensure all operations in this port have completed
2409 // so the other SIMDs in the WG can see them. There is no ordering
2410 // guarantee between the ports.
2411 if (!ST.isCuModeEnabled() || ST.hasGFX1250Insts() ||
2412 isReleaseOrStronger(Order)) {
2413 if ((Op & SIMemOp::LOAD) != SIMemOp::NONE)
2414 LOADCnt |= true;
2415 if ((Op & SIMemOp::STORE) != SIMemOp::NONE)
2416 STORECnt |= true;
2417 }
2418 break;
2419 case SIAtomicScope::WAVEFRONT:
2420 case SIAtomicScope::SINGLETHREAD:
2421 // The L0 cache keeps all memory operations in order for
2422 // work-items in the same wavefront.
2423 break;
2424 default:
2425 llvm_unreachable("Unsupported synchronization scope");
2426 }
2427 }
2428
2429 if ((AddrSpace & SIAtomicAddrSpace::LDS) != SIAtomicAddrSpace::NONE) {
2430 switch (Scope) {
2431 case SIAtomicScope::SYSTEM:
2432 case SIAtomicScope::AGENT:
2433 case SIAtomicScope::CLUSTER:
2434 case SIAtomicScope::WORKGROUP:
2435 // If no cross address space ordering then an "S_WAITCNT lgkmcnt(0)" is
2436 // not needed as LDS operations for all waves are executed in a total
2437 // global ordering as observed by all waves. Required if also
2438 // synchronizing with global/GDS memory as LDS operations could be
2439 // reordered with respect to later global/GDS memory operations of the
2440 // same wave.
2441 DSCnt |= IsCrossAddrSpaceOrdering;
2442 break;
2443 case SIAtomicScope::WAVEFRONT:
2444 case SIAtomicScope::SINGLETHREAD:
2445 // The LDS keeps all memory operations in order for
2446 // the same wavefront.
2447 break;
2448 default:
2449 llvm_unreachable("Unsupported synchronization scope");
2450 }
2451 }
2452
2453 if (LOADCnt) {
2454 // Acquire sequences only need to wait on the previous atomic operation.
2455 // e.g. a typical sequence looks like
2456 // atomic load
2457 // (wait)
2458 // global_inv
2459 //
2460 // We do not have BVH or SAMPLE atomics, so the atomic load is always going
2461 // to be tracked using loadcnt.
2462 //
2463 // This also applies to fences. Fences cannot pair with an instruction
2464 // tracked with bvh/samplecnt as we don't have any atomics that do that.
2465 if (!AtomicsOnly && ST.hasImageInsts()) {
2466 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_BVHCNT_soft)).addImm(0);
2467 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_SAMPLECNT_soft)).addImm(0);
2468 }
2469 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_LOADCNT_soft)).addImm(0);
2470 Changed = true;
2471 }
2472
2473 if (STORECnt) {
2474 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_STORECNT_soft)).addImm(0);
2475 Changed = true;
2476 }
2477
2478 if (DSCnt) {
2479 BuildMI(MBB, MI, DL, TII->get(AMDGPU::S_WAIT_DSCNT_soft)).addImm(0);
2480 Changed = true;
2481 }
2482
2483 if (Pos == Position::AFTER)
2484 --MI;
2485
2486 return Changed;
2487}
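// For example, a release at workgroup scope in WGP mode covering the global and
// LDS address spaces (with cross address space ordering) emits roughly
// "s_wait_loadcnt 0", "s_wait_storecnt 0" and "s_wait_dscnt 0", whereas an
// acquire pairing with an atomic load (AtomicsOnly set) never needs the
// bvhcnt/samplecnt waits.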
2488
2489bool SIGfx12CacheControl::insertAcquire(MachineBasicBlock::iterator &MI,
2490 SIAtomicScope Scope,
2491 SIAtomicAddrSpace AddrSpace,
2492 Position Pos) const {
2493 if (!InsertCacheInv)
2494 return false;
2495
2496 MachineBasicBlock &MBB = *MI->getParent();
2497 DebugLoc DL = MI->getDebugLoc();
2498
2499 /// The scratch address space does not need the global memory cache
2500 /// to be flushed as all memory operations by the same thread are
2501 /// sequentially consistent, and no other thread can access scratch
2502 /// memory.
2503
2504 /// Other address spaces do not have a cache.
2505 if (!canAffectGlobalAddrSpace(AddrSpace))
2506 return false;
2507
2508 AMDGPU::CPol::CPol ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2509 switch (Scope) {
2510 case SIAtomicScope::SYSTEM:
2511 ScopeImm = AMDGPU::CPol::SCOPE_SYS;
2512 break;
2513 case SIAtomicScope::AGENT:
2514 ScopeImm = AMDGPU::CPol::SCOPE_DEV;
2515 break;
2516 case SIAtomicScope::CLUSTER:
2517 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2518 break;
2519 case SIAtomicScope::WORKGROUP:
2520 // GFX12.0:
2521 // In WGP mode the waves of a work-group can be executing on either CU of
2522 // the WGP. Therefore we need to invalidate the L0 which is per CU.
2523 // Otherwise in CU mode all waves of a work-group are on the same CU, and
2524 // so the L0 does not need to be invalidated.
2525 //
2526 // GFX12.5 has a shared WGP$, so no invalidates are required.
2527 if (ST.isCuModeEnabled())
2528 return false;
2529
2530 ScopeImm = AMDGPU::CPol::SCOPE_SE;
2531 break;
2532 case SIAtomicScope::WAVEFRONT:
2533 case SIAtomicScope::SINGLETHREAD:
2534 // No cache to invalidate.
2535 return false;
2536 default:
2537 llvm_unreachable("Unsupported synchronization scope");
2538 }
2539
2540 if (Pos == Position::AFTER)
2541 ++MI;
2542
2543 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_INV)).addImm(ScopeImm);
2544
2545 if (Pos == Position::AFTER)
2546 --MI;
2547
2548 return true;
2549}
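// For example, an acquire at system scope emits "global_inv scope:SCOPE_SYS",
// cluster scope emits "global_inv scope:SCOPE_SE", and workgroup scope emits a
// SCOPE_SE invalidate only when not in CU mode (GFX12.0 WGP mode).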
2550
2551bool SIGfx12CacheControl::insertRelease(MachineBasicBlock::iterator &MI,
2552 SIAtomicScope Scope,
2553 SIAtomicAddrSpace AddrSpace,
2554 bool IsCrossAddrSpaceOrdering,
2555 Position Pos) const {
2556 bool Changed = false;
2557
2558 MachineBasicBlock &MBB = *MI->getParent();
2559 DebugLoc DL = MI->getDebugLoc();
2560
2561 // The scratch address space does not need the global memory cache
2562 // writeback as all memory operations by the same thread are
2563 // sequentially consistent, and no other thread can access scratch
2564 // memory.
2565 if (canAffectGlobalAddrSpace(AddrSpace)) {
2566 if (Pos == Position::AFTER)
2567 ++MI;
2568
2569 // A global_wb is only necessary at system scope for GFX12.0; it is
2570 // also necessary at device scope for GFX12.5 as stores
2571 // cannot report completion earlier than the L2.
2572 //
2573 // Emitting it for lower scopes is a slow no-op, so we omit it
2574 // for performance.
2575 switch (Scope) {
2576 case SIAtomicScope::SYSTEM:
2577 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2578 .addImm(AMDGPU::CPol::SCOPE_SYS);
2579 Changed = true;
2580 break;
2581 case SIAtomicScope::AGENT:
2582 // GFX12.5 may have >1 L2 per device so we must emit a device scope WB.
2583 if (ST.hasGFX1250Insts()) {
2584 BuildMI(MBB, MI, DL, TII->get(AMDGPU::GLOBAL_WB))
2585 .addImm(AMDGPU::CPol::SCOPE_DEV);
2586 Changed = true;
2587 }
2588 break;
2589 case SIAtomicScope::CLUSTER:
2590 case SIAtomicScope::WORKGROUP:
2591 // No WB necessary, but we still have to wait.
2592 case SIAtomicScope::WAVEFRONT:
2593 case SIAtomicScope::SINGLETHREAD:
2594 // No WB or wait necessary here, but insertWait takes care of that.
2595 break;
2596 default:
2597 llvm_unreachable("Unsupported synchronization scope");
2598 }
2599
2600 if (Pos == Position::AFTER)
2601 --MI;
2602 }
2603
2604 // We always have to wait for previous memory operations (load/store) to
2605 // complete, whether we inserted a WB or not. If we inserted a WB (storecnt),
2606 // we of course need to wait for that as well.
2607 Changed |= insertWait(MI, Scope, AddrSpace, SIMemOp::LOAD | SIMemOp::STORE,
2608 IsCrossAddrSpaceOrdering, Pos, AtomicOrdering::Release,
2609 /*AtomicsOnly=*/false);
2610
2611 return Changed;
2612}
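// Rough sketch: a system-scope release emits "global_wb scope:SCOPE_SYS" followed
// by the waits from insertWait; an agent-scope release only emits the writeback
// on GFX12.5; workgroup and narrower scopes skip the writeback and just wait.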
2613
2614bool SIGfx12CacheControl::enableVolatileAndOrNonTemporal(
2615 MachineBasicBlock::iterator &MI, SIAtomicAddrSpace AddrSpace, SIMemOp Op,
2616 bool IsVolatile, bool IsNonTemporal, bool IsLastUse = false) const {
2617
2618 // Only handle load and store, not atomic read-modify-write instructions.
2619 assert((MI->mayLoad() ^ MI->mayStore()) || SIInstrInfo::isLDSDMA(*MI));
2620
2621 // Only update load and store, not LLVM IR atomic read-modify-write
2622 // instructions. The latter are always marked as volatile so cannot sensibly
2623 // handle it as do not want to pessimize all atomics. Also they do not support
2624 // the nontemporal attribute.
2625 assert(Op == SIMemOp::LOAD || Op == SIMemOp::STORE);
2626
2627 bool Changed = false;
2628
2629 if (IsLastUse) {
2630 // Set last-use hint.
2631 Changed |= setTH(MI, AMDGPU::CPol::TH_LU);
2632 } else if (IsNonTemporal) {
2633 // Set non-temporal hint for all cache levels.
2634 Changed |= setTH(MI, AMDGPU::CPol::TH_NT);
2635 }
2636
2637 if (IsVolatile) {
2638 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2639
2640 // Ensure operation has completed at system scope to cause all volatile
2641 // operations to be visible outside the program in a global order. Do not
2642 // request cross address space as only the global address space can be
2643 // observable outside the program, so no need to cause a waitcnt for LDS
2644 // address space operations.
2645 Changed |= insertWait(MI, SIAtomicScope::SYSTEM, AddrSpace, Op, false,
2646 Position::AFTER, AtomicOrdering::Unordered,
2647 /*AtomicsOnly=*/false);
2648 }
2649
2650 return Changed;
2651}
2652
2653bool SIGfx12CacheControl::finalizeStore(MachineInstr &MI, bool Atomic) const {
2654 assert(MI.mayStore() && "Not a Store inst");
2655 const bool IsRMW = (MI.mayLoad() && MI.mayStore());
2656 bool Changed = false;
2657
2658 // GFX12.5 only: an xcnt wait is needed before flat and global atomic
2659 // stores/RMWs.
2660 if (Atomic && ST.requiresWaitXCntBeforeAtomicStores() && TII->isFLAT(MI)) {
2661 MachineBasicBlock &MBB = *MI.getParent();
2662 BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(S_WAIT_XCNT_soft)).addImm(0);
2663 Changed = true;
2664 }
2665
2666 // Remaining fixes do not apply to RMWs.
2667 if (IsRMW)
2668 return Changed;
2669
2670 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2671 if (!CPol) // Some vmem operations do not have a scope and are not concerned.
2672 return Changed;
2673 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2674
2675 // GFX12.0 only: Extra waits needed before system scope stores.
2676 if (ST.requiresWaitsBeforeSystemScopeStores() && !Atomic &&
2677 Scope == CPol::SCOPE_SYS)
2678 Changed |= insertWaitsBeforeSystemScopeStore(MI.getIterator());
2679
2680 return Changed;
2681}
2682
2683bool SIGfx12CacheControl::handleCooperativeAtomic(MachineInstr &MI) const {
2684 if (!ST.hasGFX1250Insts())
2685 return false;
2686
2687 // Cooperative atomics need to be SCOPE_DEV or higher.
2688 MachineOperand *CPol = TII->getNamedOperand(MI, OpName::cpol);
2689 assert(CPol && "No CPol operand?");
2690 const unsigned Scope = CPol->getImm() & CPol::SCOPE;
2691 if (Scope < CPol::SCOPE_DEV)
2692 return setScope(MI, CPol::SCOPE_DEV);
2693 return false;
2694}
2695
2696bool SIGfx12CacheControl::setAtomicScope(const MachineBasicBlock::iterator &MI,
2697 SIAtomicScope Scope,
2698 SIAtomicAddrSpace AddrSpace) const {
2699 bool Changed = false;
2700
2701 if (canAffectGlobalAddrSpace(AddrSpace)) {
2702 switch (Scope) {
2703 case SIAtomicScope::SYSTEM:
2704 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SYS);
2705 break;
2706 case SIAtomicScope::AGENT:
2707 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_DEV);
2708 break;
2709 case SIAtomicScope::CLUSTER:
2710 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2711 break;
2712 case SIAtomicScope::WORKGROUP:
2713 // In workgroup mode, SCOPE_SE is needed as waves can execute on
2714 // different CUs that access different L0s.
2715 if (!ST.isCuModeEnabled())
2716 Changed |= setScope(MI, AMDGPU::CPol::SCOPE_SE);
2717 break;
2718 case SIAtomicScope::WAVEFRONT:
2719 case SIAtomicScope::SINGLETHREAD:
2720 // No cache to bypass.
2721 break;
2722 default:
2723 llvm_unreachable("Unsupported synchronization scope");
2724 }
2725 }
2726
2727 // The scratch address space does not need the global memory caches
2728 // to be bypassed as all memory operations by the same thread are
2729 // sequentially consistent, and no other thread can access scratch
2730 // memory.
2731
2732 // Other address spaces do not have a cache.
2733
2734 return Changed;
2735}
2736
2737bool SIMemoryLegalizer::removeAtomicPseudoMIs() {
2738 if (AtomicPseudoMIs.empty())
2739 return false;
2740
2741 for (auto &MI : AtomicPseudoMIs)
2742 MI->eraseFromParent();
2743
2744 AtomicPseudoMIs.clear();
2745 return true;
2746}
2747
2748 bool SIMemoryLegalizer::expandLoad(const SIMemOpInfo &MOI,
2749 MachineBasicBlock::iterator &MI) {
2750 assert(MI->mayLoad() && !MI->mayStore());
2751
2752 bool Changed = false;
2753
2754 if (MOI.isAtomic()) {
2755 const AtomicOrdering Order = MOI.getOrdering();
2756 if (Order == AtomicOrdering::Monotonic ||
2757 Order == AtomicOrdering::Acquire ||
2758 Order == AtomicOrdering::SequentiallyConsistent) {
2759 Changed |= CC->enableLoadCacheBypass(MI, MOI.getScope(),
2760 MOI.getOrderingAddrSpace());
2761 }
2762
2763 // Handle cooperative atomics after cache bypass step, as it may override
2764 // the scope of the instruction to a greater scope.
2765 if (MOI.isCooperative())
2766 Changed |= CC->handleCooperativeAtomic(*MI);
2767
2768 if (Order == AtomicOrdering::SequentiallyConsistent)
2769 Changed |= CC->insertWait(MI, MOI.getScope(), MOI.getOrderingAddrSpace(),
2770 SIMemOp::LOAD | SIMemOp::STORE,
2771 MOI.getIsCrossAddressSpaceOrdering(),
2772 Position::BEFORE, Order, /*AtomicsOnly=*/false);
2773
2774 if (Order == AtomicOrdering::Acquire ||
2775 Order == AtomicOrdering::SequentiallyConsistent) {
2776 // The wait below only needs to wait on the prior atomic.
2777 Changed |=
2778 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2779 SIMemOp::LOAD, MOI.getIsCrossAddressSpaceOrdering(),
2780 Position::AFTER, Order, /*AtomicsOnly=*/true);
2781 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2782 MOI.getOrderingAddrSpace(),
2783 Position::AFTER);
2784 }
2785
2786 return Changed;
2787 }
2788
2789 // Atomic instructions already bypass caches to the scope specified by the
2790 // SyncScope operand. Only non-atomic volatile and nontemporal/last-use
2791 // instructions need additional treatment.
2792 Changed |= CC->enableVolatileAndOrNonTemporal(
2793 MI, MOI.getInstrAddrSpace(), SIMemOp::LOAD, MOI.isVolatile(),
2794 MOI.isNonTemporal(), MOI.isLastUse());
2795
2796 return Changed;
2797}
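// End-to-end sketch (GFX10, agent-scope acquire load): the load gets GLC and DLC
// set, a vmcnt(0) wait is inserted after it, and the cache invalidation
// (buffer_gl1_inv, buffer_gl0_inv) is placed after that wait so later loads
// refetch from the now-coherent caches.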
2798
2799 bool SIMemoryLegalizer::expandStore(const SIMemOpInfo &MOI,
2800 MachineBasicBlock::iterator &MI) {
2801 assert(!MI->mayLoad() && MI->mayStore());
2802
2803 bool Changed = false;
2804 // FIXME: Necessary hack because iterator can lose track of the store.
2805 MachineInstr &StoreMI = *MI;
2806
2807 if (MOI.isAtomic()) {
2808 if (MOI.getOrdering() == AtomicOrdering::Monotonic ||
2809 MOI.getOrdering() == AtomicOrdering::Release ||
2810 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent) {
2811 Changed |= CC->enableStoreCacheBypass(MI, MOI.getScope(),
2812 MOI.getOrderingAddrSpace());
2813 }
2814
2815 // Handle cooperative atomics after cache bypass step, as it may override
2816 // the scope of the instruction to a greater scope.
2817 if (MOI.isCooperative())
2818 Changed |= CC->handleCooperativeAtomic(*MI);
2819
2820 if (MOI.getOrdering() == AtomicOrdering::Release ||
2821 MOI.getOrdering() == AtomicOrdering::SequentiallyConsistent)
2822 Changed |= CC->insertRelease(MI, MOI.getScope(),
2823 MOI.getOrderingAddrSpace(),
2824 MOI.getIsCrossAddressSpaceOrdering(),
2825 Position::BEFORE);
2826
2827 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/true);
2828 return Changed;
2829 }
2830
2831 // Atomic instructions already bypass caches to the scope specified by the
2832 // SyncScope operand. Only non-atomic volatile and nontemporal instructions
2833 // need additional treatment.
2834 Changed |= CC->enableVolatileAndOrNonTemporal(
2835 MI, MOI.getInstrAddrSpace(), SIMemOp::STORE, MOI.isVolatile(),
2836 MOI.isNonTemporal());
2837
2838 // GFX12 specific: scope (the desired coherence domain in the cache
2839 // hierarchy) is an instruction field; do not confuse it with the atomic scope.
2840 Changed |= CC->finalizeStore(StoreMI, /*Atomic=*/false);
2841 return Changed;
2842}
2843
2844 bool SIMemoryLegalizer::expandAtomicFence(const SIMemOpInfo &MOI,
2845 MachineBasicBlock::iterator &MI) {
2846 assert(MI->getOpcode() == AMDGPU::ATOMIC_FENCE);
2847
2848 AtomicPseudoMIs.push_back(MI);
2849 bool Changed = false;
2850
2851 const SIAtomicAddrSpace OrderingAddrSpace = MOI.getOrderingAddrSpace();
2852
2853 if (MOI.isAtomic()) {
2854 const AtomicOrdering Order = MOI.getOrdering();
2855 if (Order == AtomicOrdering::Acquire) {
2856 // Acquire fences only need to wait on the previous atomic they pair with.
2857 Changed |= CC->insertWait(MI, MOI.getScope(), OrderingAddrSpace,
2858 SIMemOp::LOAD | SIMemOp::STORE,
2859 MOI.getIsCrossAddressSpaceOrdering(),
2860 Position::BEFORE, Order, /*AtomicsOnly=*/true);
2861 }
2862
2863 if (Order == AtomicOrdering::Release ||
2864 Order == AtomicOrdering::AcquireRelease ||
2865 Order == AtomicOrdering::SequentiallyConsistent)
2866 /// TODO: This relies on a barrier always generating a waitcnt
2867 /// for LDS to ensure it is not reordered with the completion of
2868 /// the preceding LDS operations. If the barrier had a memory
2869 /// ordering and memory scope, then the library would not need to
2870 /// generate a fence. Could add support in this file for
2871 /// barrier. SIInsertWaitcnt.cpp could then stop unconditionally
2872 /// adding S_WAITCNT before a S_BARRIER.
2873 Changed |= CC->insertRelease(MI, MOI.getScope(), OrderingAddrSpace,
2874 MOI.getIsCrossAddressSpaceOrdering(),
2875 Position::BEFORE);
2876
2877 // TODO: If both release and invalidate are happening they could be combined
2878 // to use the single "BUFFER_WBINV*" instruction. This could be done by
2879 // reorganizing this code or as part of optimizing SIInsertWaitcnt pass to
2880 // track cache invalidate and write back instructions.
2881
2882 if (Order == AtomicOrdering::Acquire ||
2883 Order == AtomicOrdering::AcquireRelease ||
2884 Order == AtomicOrdering::SequentiallyConsistent)
2885 Changed |= CC->insertAcquire(MI, MOI.getScope(), OrderingAddrSpace,
2886 Position::BEFORE);
2887
2888 return Changed;
2889 }
2890
2891 return Changed;
2892}
2893
2894 bool SIMemoryLegalizer::expandAtomicCmpxchgOrRmw(const SIMemOpInfo &MOI,
2895 MachineBasicBlock::iterator &MI) {
2896 assert(MI->mayLoad() && MI->mayStore());
2897
2898 bool Changed = false;
2899 MachineInstr &RMWMI = *MI;
2900
2901 if (MOI.isAtomic()) {
2902 const AtomicOrdering Order = MOI.getOrdering();
2903 if (Order == AtomicOrdering::Monotonic ||
2904 Order == AtomicOrdering::Acquire || Order == AtomicOrdering::Release ||
2905 Order == AtomicOrdering::AcquireRelease ||
2906 Order == AtomicOrdering::SequentiallyConsistent) {
2907 Changed |= CC->enableRMWCacheBypass(MI, MOI.getScope(),
2908 MOI.getInstrAddrSpace());
2909 }
2910
2911 if (Order == AtomicOrdering::Release ||
2912 Order == AtomicOrdering::AcquireRelease ||
2913 Order == AtomicOrdering::SequentiallyConsistent ||
2914 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent)
2915 Changed |= CC->insertRelease(MI, MOI.getScope(),
2916 MOI.getOrderingAddrSpace(),
2917 MOI.getIsCrossAddressSpaceOrdering(),
2918 Position::BEFORE);
2919
2920 if (Order == AtomicOrdering::Acquire ||
2921 Order == AtomicOrdering::AcquireRelease ||
2922 Order == AtomicOrdering::SequentiallyConsistent ||
2923 MOI.getFailureOrdering() == AtomicOrdering::Acquire ||
2924 MOI.getFailureOrdering() == AtomicOrdering::SequentiallyConsistent) {
2925 // Only wait on the previous atomic.
2926 Changed |=
2927 CC->insertWait(MI, MOI.getScope(), MOI.getInstrAddrSpace(),
2928 isAtomicRet(*MI) ? SIMemOp::LOAD : SIMemOp::STORE,
2929 MOI.getIsCrossAddressSpaceOrdering(), Position::AFTER,
2930 Order, /*AtomicsOnly=*/true);
2931 Changed |= CC->insertAcquire(MI, MOI.getScope(),
2932 MOI.getOrderingAddrSpace(),
2933 Position::AFTER);
2934 }
2935
2936 Changed |= CC->finalizeStore(RMWMI, /*Atomic=*/true);
2937 return Changed;
2938 }
2939
2940 return Changed;
2941}
2942
2943 bool SIMemoryLegalizer::expandLDSDMA(const SIMemOpInfo &MOI,
2944 MachineBasicBlock::iterator &MI) {
2945 assert(MI->mayLoad() && MI->mayStore());
2946
2947 // The volatility or nontemporal-ness of the operation is a
2948 // function of the global memory, not the LDS.
2949 SIMemOp OpKind =
2950 SIInstrInfo::mayWriteLDSThroughDMA(*MI) ? SIMemOp::LOAD : SIMemOp::STORE;
2951
2952 // Handle volatile and/or nontemporal markers on direct-to-LDS loads and
2953 // stores. The operation is treated as a volatile/nontemporal store
2954 // to its second argument.
2955 return CC->enableVolatileAndOrNonTemporal(
2956 MI, MOI.getInstrAddrSpace(), OpKind, MOI.isVolatile(),
2957 MOI.isNonTemporal(), MOI.isLastUse());
2958}
2959
2960bool SIMemoryLegalizerLegacy::runOnMachineFunction(MachineFunction &MF) {
2961 const MachineModuleInfo &MMI =
2962 getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
2963 return SIMemoryLegalizer(MMI).run(MF);
2964}
2965
2966 PreservedAnalyses
2967 SIMemoryLegalizerPass::run(MachineFunction &MF,
2968 MachineFunctionAnalysisManager &MFAM) {
2969 auto *MMI = MFAM.getResult<ModuleAnalysisManagerMachineFunctionProxy>(MF)
2970 .getCachedResult<MachineModuleAnalysis>(
2971 *MF.getFunction().getParent());
2972 assert(MMI && "MachineModuleAnalysis must be available");
2973 if (!SIMemoryLegalizer(MMI->getMMI()).run(MF))
2974 return PreservedAnalyses::all();
2975 return getMachineFunctionPassPreservedAnalyses().preserveSet<CFGAnalyses>();
2976 }
2977
2978bool SIMemoryLegalizer::run(MachineFunction &MF) {
2979 bool Changed = false;
2980
2981 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2982 SIMemOpAccess MOA(MMI.getObjFileInfo<AMDGPUMachineModuleInfo>(), ST);
2983 CC = SICacheControl::create(ST);
2984
2985 for (auto &MBB : MF) {
2986 for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
2987
2988 // Unbundle instructions after the post-RA scheduler.
2989 if (MI->isBundle() && MI->mayLoadOrStore()) {
2990 MachineBasicBlock::instr_iterator II(MI->getIterator());
2991 for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
2992 I != E && I->isBundledWithPred(); ++I) {
2993 I->unbundleFromPred();
2994 for (MachineOperand &MO : I->operands())
2995 if (MO.isReg())
2996 MO.setIsInternalRead(false);
2997 }
2998
2999 MI->eraseFromParent();
3000 MI = II->getIterator();
3001 }
3002
3003 if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
3004 continue;
3005
3006 if (const auto &MOI = MOA.getLoadInfo(MI)) {
3007 Changed |= expandLoad(*MOI, MI);
3008 } else if (const auto &MOI = MOA.getStoreInfo(MI)) {
3009 Changed |= expandStore(*MOI, MI);
3010 } else if (const auto &MOI = MOA.getLDSDMAInfo(MI)) {
3011 Changed |= expandLDSDMA(*MOI, MI);
3012 } else if (const auto &MOI = MOA.getAtomicFenceInfo(MI)) {
3013 Changed |= expandAtomicFence(*MOI, MI);
3014 } else if (const auto &MOI = MOA.getAtomicCmpxchgOrRmwInfo(MI)) {
3015 Changed |= expandAtomicCmpxchgOrRmw(*MOI, MI);
3016 }
3017 }
3018 }
3019
3020 Changed |= removeAtomicPseudoMIs();
3021 return Changed;
3022}
3023
3024INITIALIZE_PASS(SIMemoryLegalizerLegacy, DEBUG_TYPE, PASS_NAME, false, false)
3025
3026char SIMemoryLegalizerLegacy::ID = 0;
3027char &llvm::SIMemoryLegalizerID = SIMemoryLegalizerLegacy::ID;
3028
3029 FunctionPass *llvm::createSIMemoryLegalizerPass() {
3030 return new SIMemoryLegalizerLegacy();
3031}
static std::optional< LoadInfo > getLoadInfo(const MachineInstr &MI)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU address space definition.
Provides AMDGPU specific target descriptions.
AMDGPU Machine Module Info.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Atomic ordering constants.
AMD GCN specific subclass of TargetSubtarget.
#define DEBUG_TYPE
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
This header defines various interfaces for pass management in LLVM.
#define I(x, y, z)
Definition MD5.cpp:58
This file provides utility for Memory Model Relaxation Annotations (MMRAs).
uint64_t IntrinsicInst * II
#define INITIALIZE_PASS(passName, arg, name, cfg, analysis)
Definition PassSupport.h:56
static cl::opt< bool > AmdgcnSkipCacheInvalidations("amdgcn-skip-cache-invalidations", cl::init(false), cl::Hidden, cl::desc("Use this to skip inserting cache invalidating instructions."))
#define PASS_NAME
This file contains some functions that are useful when dealing with strings.
#define PASS_NAME
static const uint32_t IV[8]
Definition blake3_impl.h:83
SyncScope::ID getClusterOneAddressSpaceSSID() const
std::optional< bool > isSyncScopeInclusion(SyncScope::ID A, SyncScope::ID B) const
In AMDGPU target synchronization scopes are inclusive, meaning a larger synchronization scope is incl...
SyncScope::ID getAgentOneAddressSpaceSSID() const
SyncScope::ID getSingleThreadOneAddressSpaceSSID() const
SyncScope::ID getWavefrontOneAddressSpaceSSID() const
SyncScope::ID getSystemOneAddressSpaceSSID() const
SyncScope::ID getWorkgroupOneAddressSpaceSSID() const
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
LLVM_ABI void setPreservesCFG()
This function should be called by the pass, iff they do not:
Definition Pass.cpp:270
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Diagnostic information for unsupported feature in backend.
FunctionPass class - This class is used to implement most global optimizations.
Definition Pass.h:314
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
A helper class to return the specified delimiter string after the first invocation of operator String...
Helper class to manipulate !mmra metadata nodes.
Instructions::iterator instr_iterator
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
MachineFunctionPass - This class adapts the FunctionPass interface to allow convenient creation of pa...
void getAnalysisUsage(AnalysisUsage &AU) const override
getAnalysisUsage - Subclasses that override getAnalysisUsage must call this.
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
Function & getFunction()
Return the LLVM function that this machine code represents.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addReg(Register RegNo, unsigned flags=0, unsigned SubReg=0) const
Add a new virtual register operand.
Representation of each machine instruction.
Ty & getObjFileInfo()
Keep track of various per-module pieces of information for backends that would like to do so.
MachineOperand class - Representation of each machine instruction operand.
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
static bool mayWriteLDSThroughDMA(const MachineInstr &MI)
static bool isAtomicRet(const MachineInstr &MI)
static bool isLDSDMA(const MachineInstr &MI)
PreservedAnalyses run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM)
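The run() signature above is the new pass manager entry point. A minimal sketch, not the actual SIMemoryLegalizerPass, of the usual return-value pattern built from the PreservedAnalyses helpers listed earlier:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachinePassManager.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

// Hypothetical pass: report "all preserved" when nothing changed, otherwise
// preserve only analyses that depend solely on the CFG.
class ExampleMachinePass : public PassInfoMixin<ExampleMachinePass> {
public:
  PreservedAnalyses run(MachineFunction &MF,
                        MachineFunctionAnalysisManager &MFAM) {
    bool Changed = false;
    // ... walk MF and rewrite memory instructions here ...
    if (!Changed)
      return PreservedAnalyses::all();
    PreservedAnalyses PA = getMachineFunctionPassPreservedAnalyses();
    PA.preserveSet<CFGAnalyses>();
    return PA;
  }
};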
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better as a string (e.g. operator+ etc).
Definition SmallString.h:26
A wrapper around a string literal that serves as a proxy for constructing global tables of StringRefs with the length computed at compile time.
Definition StringRef.h:854
StringMap - This is an unconventional map that is specialized for handling keys that are "strings", which are basically ranges of bytes.
Definition StringMap.h:133
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:55
A raw_ostream that writes to an SmallVector or SmallString.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
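The AMDGPUAS enumerators above are the raw LLVM address-space numbers the legalizer has to reason about. A hypothetical classifier, not code from this file, showing how they partition; the helper names and the include path are assumptions:

#include "llvm/Support/AMDGPUAddrSpace.h" // assumed location of the AMDGPUAS enumerators

namespace {
enum class AddrKind { Global, LDS, Scratch, GDS, Flat, Other };

AddrKind classifyAddrSpace(unsigned AS) {
  switch (AS) {
  case llvm::AMDGPUAS::GLOBAL_ADDRESS:  return AddrKind::Global;
  case llvm::AMDGPUAS::LOCAL_ADDRESS:   return AddrKind::LDS;
  case llvm::AMDGPUAS::PRIVATE_ADDRESS: return AddrKind::Scratch;
  case llvm::AMDGPUAS::REGION_ADDRESS:  return AddrKind::GDS;
  case llvm::AMDGPUAS::FLAT_ADDRESS:    return AddrKind::Flat;  // may alias global, LDS and scratch
  default:                              return AddrKind::Other; // buffer pointers, constants, ...
  }
}
} // end anonymous namespace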
LLVM_ABI IsaVersion getIsaVersion(StringRef GPU)
unsigned encodeWaitcnt(const IsaVersion &Version, unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt)
Encodes Vmcnt, Expcnt and Lgkmcnt into Waitcnt for given isa Version.
unsigned getVmcntBitMask(const IsaVersion &Version)
unsigned getLgkmcntBitMask(const IsaVersion &Version)
unsigned getExpcntBitMask(const IsaVersion &Version)
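The waitcnt helpers above encode the separate hardware counters into the single immediate carried by S_WAITCNT. A standalone sketch, with an arbitrary example GPU name: wait for all vector-memory and LDS/GDS/scalar-memory traffic while leaving the export counter alone by passing its all-ones bitmask.

#include "Utils/AMDGPUBaseInfo.h"

using namespace llvm;

unsigned buildFullWaitcnt() {
  AMDGPU::IsaVersion IV = AMDGPU::getIsaVersion("gfx900");
  unsigned Vmcnt = 0;                             // vmcnt(0): wait for vector memory
  unsigned Expcnt = AMDGPU::getExpcntBitMask(IV); // all bits set = do not wait on exports
  unsigned Lgkmcnt = 0;                           // lgkmcnt(0): wait for LDS/GDS/scalar memory
  return AMDGPU::encodeWaitcnt(IV, Vmcnt, Expcnt, Lgkmcnt);
}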
@ Undef
Value of the register doesn't matter.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
OuterAnalysisManagerProxy< ModuleAnalysisManager, MachineFunction > ModuleAnalysisManagerMachineFunctionProxy
Provide the ModuleAnalysisManager to Function proxy.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
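BuildMI() plus addImm() from the entries above is how a new wait instruction ends up in the instruction stream. A minimal sketch; the helper name is illustrative, and Encoded is assumed to come from encodeWaitcnt() as sketched earlier:

#include "GCNSubtarget.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"

using namespace llvm;

// Insert an S_WAITCNT carrying the packed counter immediate before MI.
static void insertWaitcntBefore(MachineBasicBlock::iterator MI, unsigned Encoded) {
  MachineBasicBlock &MBB = *MI->getParent();
  const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
  const SIInstrInfo *TII = ST.getInstrInfo();
  BuildMI(MBB, MI, MI->getDebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(Encoded);
}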
char & SIMemoryLegalizerID
LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE()
AnalysisManager< MachineFunction > MachineFunctionAnalysisManager
bool isReleaseOrStronger(AtomicOrdering AO)
LLVM_ABI PreservedAnalyses getMachineFunctionPassPreservedAnalyses()
Returns the minimum set of Analyses that all machine function passes must preserve.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
AtomicOrdering getMergedAtomicOrdering(AtomicOrdering AO, AtomicOrdering Other)
Return a single atomic ordering that is at least as strong as both the AO and Other orderings for an atomic operation.
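A small, self-contained illustration of the two AtomicOrdering helpers above: merge a cmpxchg's success and failure orderings, then ask whether the result carries release semantics.

#include "llvm/Support/AtomicOrdering.h"
#include <cassert>

using namespace llvm;

int main() {
  AtomicOrdering Success = AtomicOrdering::Release;
  AtomicOrdering Failure = AtomicOrdering::Acquire;
  AtomicOrdering Merged = getMergedAtomicOrdering(Success, Failure);
  assert(Merged == AtomicOrdering::AcquireRelease); // at least as strong as both
  assert(isReleaseOrStronger(Merged));
  return 0;
}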
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
AtomicOrdering
Atomic ordering for LLVM's memory model.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
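MOLastUse and MOCooperative above are target-defined MachineMemOperand flags. A hedged sketch, with an illustrative helper name that is not from this file, of reading one of them back off an instruction's memory operands:

#include "SIInstrInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"

using namespace llvm;

// True if MI has memory operands and every one of them is tagged as last-use.
static bool allOperandsLastUse(const MachineInstr &MI) {
  for (const MachineMemOperand *MMO : MI.memoperands())
    if (!(MMO->getFlags() & MOLastUse))
      return false;
  return !MI.memoperands_empty();
}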
FunctionPass * createSIMemoryLegalizerPass()