LLVM 23.0.0git
AMDGPUSwLowerLDS.cpp
Go to the documentation of this file.
1//===-- AMDGPUSwLowerLDS.cpp -----------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass lowers the local data share (LDS) uses in kernel and non-kernel
10// functions in module to use dynamically allocated global memory.
11// Packed LDS Layout is emulated in the global memory.
12// The lowered memory instructions from LDS to global memory are then
13// instrumented for address sanitizer, to catch addressing errors.
14// This pass only works when address sanitizer has been enabled and has
15// instrumented the IR. It identifies that IR has been instrumented using
16// "nosanitize_address" module flag.
17//
18// Replacement of Kernel LDS accesses:
19// For a kernel, LDS access can be static or dynamic which are direct
20// (accessed within kernel) and indirect (accessed through non-kernels).
21// All these LDS accesses corresponding to kernel will be packed together,
22// where all static LDS accesses will be allocated first and then dynamic
23// LDS follows. The total size with alignment is calculated. A new LDS global
24// will be created for the kernel called "SW LDS" and it will have the
25// attribute "amdgpu-lds-size" attached with value of the size calculated.
26// All the LDS accesses in the module will be replaced by GEP with offset
27// into the "Sw LDS".
28// A new "llvm.amdgcn.<kernel>.dynlds" is created per kernel accessing
29// the dynamic LDS. This will be marked used by kernel and will have
30// MD_absolute_symbol metadata set to total static LDS size, since dynamic
31// LDS allocation starts after all static LDS allocation.
32//
33// A device global memory equal to the total LDS size will be allocated.
34// At the prologue of the kernel, a single work-item from the
35// work-group, does a "malloc" and stores the pointer of the
36// allocation in "SW LDS".
37//
38// To store the offsets corresponding to all LDS accesses, another global
39// variable is created which will be called "SW LDS metadata" in this pass.
40// - SW LDS Global:
41// It is LDS global of ptr type with name
42// "llvm.amdgcn.sw.lds.<kernel-name>".
43// - Metadata Global:
44// It is of struct type, with n members. n equals the number of LDS
45// globals accessed by the kernel(direct and indirect). Each member of
46// struct is another struct of type {i32, i32, i32}. First member
47// corresponds to offset, second member corresponds to size of LDS global
48// being replaced and third represents the total aligned size. It will
49// have name "llvm.amdgcn.sw.lds.<kernel-name>.md". This global will have
50// an initializer with static LDS related offsets and sizes initialized.
51// But for dynamic LDS related entries, offsets will be initialized to
52// previous static LDS allocation end offset. Sizes for them will be zero
53// initially. These dynamic LDS offset and size values will be updated
54// within the kernel, since kernel can read the dynamic LDS size
55// allocation done at runtime with query to "hidden_dynamic_lds_size"
56// hidden kernel argument.
57//
58// At the epilogue of kernel, allocated memory would be made free by the same
59// single work-item.
60//
61// Replacement of non-kernel LDS accesses:
62// Multiple kernels can access the same non-kernel function.
63// All the kernels accessing LDS through non-kernels are sorted and
64// assigned a kernel-id. All the LDS globals accessed by non-kernels
65// are sorted. This information is used to build two tables:
66// - Base table:
67// Base table will have single row, with elements of the row
68// placed as per kernel ID. Each element in the row corresponds
69// to ptr of "SW LDS" variable created for that kernel.
70// - Offset table:
71// Offset table will have multiple rows and columns.
72// Rows are assumed to be from 0 to (n-1). n is total number
73// of kernels accessing the LDS through non-kernels.
74// Each row will have m elements. m is the total number of
75// unique LDS globals accessed by all non-kernels.
76// Each element in the row corresponds to the ptr of
77// the replacement of LDS global done by that particular kernel.
78// An LDS variable in non-kernel will be replaced based on the information
79// from base and offset tables. Based on kernel-id query, ptr of "SW
80// LDS" for that corresponding kernel is obtained from base table.
81// The Offset into the base "SW LDS" is obtained from
82// corresponding element in offset table. With this information, replacement
83// value is obtained.
84//===----------------------------------------------------------------------===//
85
86#include "AMDGPU.h"
88#include "AMDGPUMemoryUtils.h"
89#include "AMDGPUTargetMachine.h"
90#include "llvm/ADT/DenseMap.h"
91#include "llvm/ADT/DenseSet.h"
92#include "llvm/ADT/SetVector.h"
94#include "llvm/ADT/StringRef.h"
98#include "llvm/IR/Constants.h"
99#include "llvm/IR/DIBuilder.h"
100#include "llvm/IR/DebugInfo.h"
102#include "llvm/IR/IRBuilder.h"
103#include "llvm/IR/Instructions.h"
104#include "llvm/IR/IntrinsicsAMDGPU.h"
105#include "llvm/IR/MDBuilder.h"
107#include "llvm/Pass.h"
111
112#include <algorithm>
113
// Name under which the LLVM_DEBUG output of this file is grouped.
114#define DEBUG_TYPE "amdgpu-sw-lower-lds"
// NOTE(review): presumably the position of the "hidden_dynamic_lds_size"
// hidden kernel argument (see the file header) for code object v5 -- the use
// site is not visible here, confirm before relying on it.
115#define COV5_HIDDEN_DYN_LDS_SIZE_ARG 15
116
117using namespace llvm;
118using namespace AMDGPU;
119
120namespace {
121
// Hidden command-line option "amdgpu-asan-instrument-lds"; defaults to true.
// NOTE(review): listing line 122, which should hold the declaration's type
// (presumably `static cl::opt<bool>`), is missing from this copy of the file.
// Restore it from upstream before building.
123 AsanInstrumentLDS("amdgpu-asan-instrument-lds",
124 cl::desc("Run asan instrumentation on LDS instructions "
125 "lowered to global memory"),
126 cl::init(true), cl::Hidden);
127
// Callable that, given a function, returns a pointer to its DominatorTree.
128using DomTreeCallback = function_ref<DominatorTree *(Function &F)>;
129
130struct LDSAccessTypeInfo {
131 SetVector<GlobalVariable *> StaticLDSGlobals;
132 SetVector<GlobalVariable *> DynamicLDSGlobals;
133};
134
135// Struct to hold all the Metadata required for a kernel
136// to replace a LDS global uses with corresponding offset
137// in to device global memory.
138struct KernelLDSParameters {
// "llvm.amdgcn.sw.lds.<kernel>" global created by buildSwLDSGlobal.
139 GlobalVariable *SwLDS = nullptr;
// "llvm.amdgcn.<kernel>.dynlds" global; stays null when the kernel touches no
// dynamic LDS (see buildSwDynLDSGlobal).
140 GlobalVariable *SwDynLDS = nullptr;
// "llvm.amdgcn.sw.lds.<kernel>.md" global created by populateSwMetadataGlobal.
141 GlobalVariable *SwLDSMetadata = nullptr;
// LDS globals accessed within the kernel itself.
142 LDSAccessTypeInfo DirectAccess;
// LDS globals accessed through non-kernel functions.
143 LDSAccessTypeInfo IndirectAccess;
// Per-global GEP indices into SwLDSMetadata, stored as {0, item index, 0}.
// NOTE(review): listing line 144, which should hold this member's type, is
// missing from this copy of the file. Judging by its uses it is presumably a
// map from GlobalVariable * to a small vector of three uint32_t -- confirm
// against upstream.
145 LDSToReplacementIndicesMap;
// Running byte size of the global-memory layout, accumulated by
// populateSwMetadataGlobal.
146 uint32_t MallocSize = 0;
// Aligned allocation size of the SwLDS global itself.
147 uint32_t LDSSize = 0;
// (offset, size) of the redzone following each static LDS global; consumed by
// poisonRedzones.
148 SmallVector<std::pair<uint32_t, uint32_t>, 64> RedzoneOffsetAndSizeVector;
149};
150
151// Struct to store information for creation of offset table
152// for all the non-kernel LDS accesses.
153struct NonKernelLDSParameters {
154 GlobalVariable *LDSBaseTable = nullptr;
155 GlobalVariable *LDSOffsetTable = nullptr;
156 SetVector<Function *> OrderedKernels;
157 SetVector<GlobalVariable *> OrdereLDSGlobals;
158};
159
160struct AsanInstrumentInfo {
161 int Scale = 0;
162 uint32_t Offset = 0;
163 SetVector<Instruction *> Instructions;
164};
165
166struct FunctionsAndLDSAccess {
167 DenseMap<Function *, KernelLDSParameters> KernelToLDSParametersMap;
168 SetVector<Function *> KernelsWithIndirectLDSAccess;
169 SetVector<Function *> NonKernelsWithLDSArgument;
170 SetVector<GlobalVariable *> AllNonKernelLDSAccess;
171 FunctionVariableMap NonKernelToLDSAccessMap;
172};
173
// Driver for the software LDS lowering of one module. It gathers the LDS
// accesses of kernels and non-kernels, creates the per-kernel "SW LDS"
// globals and metadata, and rewrites LDS memory operations so that they
// address global memory instead.
174class AMDGPUSwLowerLDS {
175public:
// Binds the pass to module Mod; Callback is kept to look up dominator trees.
176 AMDGPUSwLowerLDS(Module &Mod, DomTreeCallback Callback)
177 : M(Mod), IRB(M.getContext()), DTCallback(Callback) {}
178 bool run();
179 void getUsesOfLDSByNonKernels();
180 void getNonKernelsWithLDSArguments(const CallGraph &CG);
// NOTE(review): listing line 181 (the return type of the next declaration) is
// missing from this copy; the out-of-line definition returns
// SetVector<Function *>.
182 getOrderedIndirectLDSAccessingKernels(SetVector<Function *> &Kernels);
// NOTE(review): listing line 183 (the return type of the next declaration) is
// missing from this copy; the out-of-line definition returns
// SetVector<GlobalVariable *>.
184 getOrderedNonKernelAllLDSGlobals(SetVector<GlobalVariable *> &Variables);
185 void buildSwLDSGlobal(Function *Func);
186 void buildSwDynLDSGlobal(Function *Func);
187 void populateSwMetadataGlobal(Function *Func);
188 void populateSwLDSAttributeAndMetadata(Function *Func);
189 void populateLDSToReplacementIndicesMap(Function *Func);
190 void getLDSMemoryInstructions(Function *Func,
191 SetVector<Instruction *> &LDSInstructions);
192 void replaceKernelLDSAccesses(Function *Func);
193 Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr);
194 void translateLDSMemoryOperationsToGlobalMemory(
195 Function *Func, Value *LoadMallocPtr,
196 SetVector<Instruction *> &LDSInstructions);
197 void poisonRedzones(Function *Func, Value *MallocPtr);
198 void lowerKernelLDSAccesses(Function *Func, DomTreeUpdater &DTU);
199 void buildNonKernelLDSOffsetTable(NonKernelLDSParameters &NKLDSParams);
200 void buildNonKernelLDSBaseTable(NonKernelLDSParameters &NKLDSParams);
201 Constant *
202 getAddressesOfVariablesInKernel(Function *Func,
203 SetVector<GlobalVariable *> &Variables);
204 void lowerNonKernelLDSAccesses(Function *Func,
205 SetVector<GlobalVariable *> &LDSGlobals,
206 NonKernelLDSParameters &NKLDSParams);
207 void
208 updateMallocSizeForDynamicLDS(Function *Func, Value **CurrMallocSize,
209 Value *HiddenDynLDSSize,
210 SetVector<GlobalVariable *> &DynamicLDSGlobals);
211 void initAsanInfo();
212
213private:
// Module being lowered.
214 Module &M;
// Shared builder; member functions emit IR at its current insertion point.
215 IRBuilder<> IRB;
216 DomTreeCallback DTCallback;
// Which functions access which LDS globals, plus per-kernel lowering state.
217 FunctionsAndLDSAccess FuncLDSAccessInfo;
// Asan parameters and the instructions queued for instrumentation.
218 AsanInstrumentInfo AsanInfo;
219};
220
221template <typename T> SetVector<T> sortByName(std::vector<T> &&V) {
222 // Sort the vector of globals or Functions based on their name.
223 // Returns a SetVector of globals/Functions.
224 sort(V, [](const auto *L, const auto *R) {
225 return L->getName() < R->getName();
226 });
227 return {SetVector<T>(llvm::from_range, V)};
228}
229
230SetVector<GlobalVariable *> AMDGPUSwLowerLDS::getOrderedNonKernelAllLDSGlobals(
231 SetVector<GlobalVariable *> &Variables) {
232 // Sort all the non-kernel LDS accesses based on their name.
233 return sortByName(
234 std::vector<GlobalVariable *>(Variables.begin(), Variables.end()));
235}
236
// Returns Kernels ordered by name and tags each kernel with
// "llvm.amdgcn.lds.kernel.id" metadata. Aborts via report_fatal_error when
// there are more kernels than fit in 32 bits.
237SetVector<Function *> AMDGPUSwLowerLDS::getOrderedIndirectLDSAccessingKernels(
238 SetVector<Function *> &Kernels) {
239 // Sort the kernels accessing LDS indirectly based on their name.
240 // Also assign a kernel ID metadata based on the sorted order.
241 LLVMContext &Ctx = M.getContext();
242 if (Kernels.size() > UINT32_MAX) {
243 report_fatal_error("Unimplemented SW LDS lowering for > 2**32 kernels");
244 }
245 SetVector<Function *> OrderedKernels =
246 sortByName(std::vector<Function *>(Kernels.begin(), Kernels.end()));
// OrderedKernels holds the same elements as Kernels, so both have equal size.
247 for (size_t i = 0; i < Kernels.size(); i++) {
248 Metadata *AttrMDArgs[1] = {
// NOTE(review): listing line 249 (the single initializer of AttrMDArgs) is
// missing from this copy; presumably it wraps the index `i` as constant
// metadata -- confirm against upstream.
250 };
251 Function *Func = OrderedKernels[i];
252 Func->setMetadata("llvm.amdgcn.lds.kernel.id",
253 MDNode::get(Ctx, AttrMDArgs));
254 }
255 return OrderedKernels;
256}
257
// Walks the call graph edges leaving every kernel in KernelToLDSParametersMap
// and records the defined non-kernel callees that take a qualifying pointer
// argument, together with the calling kernel.
258void AMDGPUSwLowerLDS::getNonKernelsWithLDSArguments(const CallGraph &CG) {
259 // Among the kernels accessing LDS, get list of
260 // Non-kernels to which a call is made and a ptr
261 // to addrspace(3) is passed as argument.
262 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
263 Function *Func = K.first;
264 const CallGraphNode *CGN = CG[Func];
265 if (!CGN)
266 continue;
267 for (auto &I : *CGN) {
// Despite its name, CallerCGN is used as the node of the function being
// called from Func.
268 CallGraphNode *CallerCGN = I.second;
269 Function *CalledFunc = CallerCGN->getFunction();
// Skip edges without a known callee and callees that have no body.
270 if (!CalledFunc || CalledFunc->isDeclaration())
271 continue;
272 if (AMDGPU::isKernel(*CalledFunc))
273 continue;
274 for (auto AI = CalledFunc->arg_begin(), E = CalledFunc->arg_end();
275 AI != E; ++AI) {
276 Type *ArgTy = (*AI).getType();
277 if (!ArgTy->isPointerTy())
278 continue;
// NOTE(review): listing line 279 (the condition guarding the `continue`
// below) is missing from this copy; presumably it rejects pointers that are
// not in the LDS address space -- confirm against upstream.
280 continue;
281 FuncLDSAccessInfo.NonKernelsWithLDSArgument.insert(CalledFunc);
282 // Also add the Calling function to KernelsWithIndirectLDSAccess list
283 // so that base table of LDS is generated.
284 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(Func);
285 }
286 }
287 }
288}
289
// Fills NonKernelToLDSAccessMap: for every global in AllNonKernelLDSAccess,
// each defined non-kernel function containing an instruction that uses the
// global gets that global added to its set.
290void AMDGPUSwLowerLDS::getUsesOfLDSByNonKernels() {
291 for (GlobalVariable *GV : FuncLDSAccessInfo.AllNonKernelLDSAccess) {
// NOTE(review): listing line 292 (the condition guarding the `continue`
// below) is missing from this copy; presumably it filters out globals that
// are not LDS variables to be lowered -- confirm against upstream.
293 continue;
294
// Only users that are instructions themselves are considered here; other
// kinds of users are not looked through by this loop.
295 for (User *V : GV->users()) {
296 if (auto *I = dyn_cast<Instruction>(V)) {
297 Function *F = I->getFunction();
298 if (!isKernel(*F) && !F->isDeclaration())
299 FuncLDSAccessInfo.NonKernelToLDSAccessMap[F].insert(GV);
300 }
301 }
302 }
303}
304
305static void recordLDSAbsoluteAddress(Module &M, GlobalVariable *GV,
306 uint32_t Address) {
307 // Write the specified address into metadata where it can be retrieved by
308 // the assembler. Format is a half open range, [Address Address+1)
309 LLVMContext &Ctx = M.getContext();
310 auto *IntTy = M.getDataLayout().getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
311 MDBuilder MDB(Ctx);
312 MDNode *MetadataNode = MDB.createRange(ConstantInt::get(IntTy, Address),
313 ConstantInt::get(IntTy, Address + 1));
314 GV->setMetadata(LLVMContext::MD_absolute_symbol, MetadataNode);
315}
316
317static void addLDSSizeAttribute(Function *Func, uint32_t Offset,
318 bool IsDynLDS) {
319 if (Offset != 0) {
320 std::string Buffer;
321 raw_string_ostream SS{Buffer};
322 SS << Offset;
323 if (IsDynLDS)
324 SS << "," << Offset;
325 Func->addFnAttr("amdgpu-lds-size", Buffer);
326 }
327}
328
329static void markUsedByKernel(Function *Func, GlobalVariable *SGV) {
330 BasicBlock *Entry = &Func->getEntryBlock();
331 IRBuilder<> Builder(Entry, Entry->getFirstNonPHIIt());
332
333 Function *Decl = Intrinsic::getOrInsertDeclaration(Func->getParent(),
334 Intrinsic::donothing, {});
335
336 Value *UseInstance[1] = {
337 Builder.CreateConstInBoundsGEP1_32(SGV->getValueType(), SGV, 0)};
338
339 Builder.CreateCall(Decl, {},
340 {OperandBundleDefT<Value *>("ExplicitUse", UseInstance)});
341}
342
// Creates the "llvm.amdgcn.sw.lds.<kernel>" global for Func: an
// internal-linkage variable of pointer type, initialized to poison, with
// sanitizer metadata NoAddress set. The result is stored in the kernel's
// KernelLDSParameters::SwLDS.
343void AMDGPUSwLowerLDS::buildSwLDSGlobal(Function *Func) {
344 // Create new LDS global required for each kernel to store
345 // device global memory pointer.
346 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
347 // Create new global pointer variable
348 LDSParams.SwLDS = new GlobalVariable(
349 M, IRB.getPtrTy(), false, GlobalValue::InternalLinkage,
350 PoisonValue::get(IRB.getPtrTy()), "llvm.amdgcn.sw.lds." + Func->getName(),
// NOTE(review): listing lines 351-352 are missing from this copy. They should
// hold the remaining GlobalVariable constructor arguments and the declaration
// of the sanitizer-metadata object `MD` used below -- restore from upstream.
353 MD.NoAddress = true;
354 LDSParams.SwLDS->setSanitizerMetadata(MD);
355}
356
// Creates the "llvm.amdgcn.<kernel>.dynlds" global for Func when the kernel
// accesses dynamic LDS directly or indirectly; otherwise does nothing. The
// new global is marked as used by the kernel and gets sanitizer metadata
// NoAddress set.
357void AMDGPUSwLowerLDS::buildSwDynLDSGlobal(Function *Func) {
358 // Create new Dyn LDS global if kernel accesses dyn LDS.
359 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
360 if (LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
361 LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
362 return;
363 // Create new external global of zero-length i8 array type, no initializer.
364 auto *emptyCharArray = ArrayType::get(IRB.getInt8Ty(), 0);
365 LDSParams.SwDynLDS = new GlobalVariable(
366 M, emptyCharArray, false, GlobalValue::ExternalLinkage, nullptr,
367 "llvm.amdgcn." + Func->getName() + ".dynlds", nullptr,
// NOTE(review): listing line 368 (the remaining GlobalVariable constructor
// arguments) is missing from this copy -- restore from upstream.
369 markUsedByKernel(Func, LDSParams.SwDynLDS);
// NOTE(review): listing line 370 (the declaration of the sanitizer-metadata
// object `MD` used below) is missing from this copy -- restore from upstream.
371 MD.NoAddress = true;
372 LDSParams.SwDynLDS->setSanitizerMetadata(MD);
373}
374
375void AMDGPUSwLowerLDS::populateSwLDSAttributeAndMetadata(Function *Func) {
376 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
377 bool IsDynLDSUsed = LDSParams.SwDynLDS;
378 uint32_t Offset = LDSParams.LDSSize;
379 recordLDSAbsoluteAddress(M, LDSParams.SwLDS, 0);
380 addLDSSizeAttribute(Func, Offset, IsDynLDSUsed);
381 if (LDSParams.SwDynLDS)
382 recordLDSAbsoluteAddress(M, LDSParams.SwDynLDS, Offset);
383}
384
// Builds the "llvm.amdgcn.sw.lds.<kernel>.md" metadata global for Func.
// Along the way it computes the kernel's MallocSize, the redzone positions of
// its static LDS globals and LDSSize, and applies the maximum alignment of
// all accessed LDS globals to SwLDS (and SwDynLDS when present).
385void AMDGPUSwLowerLDS::populateSwMetadataGlobal(Function *Func) {
386 // Create new metadata global for every kernel and initialize the
387 // start offsets and sizes corresponding to each LDS accesses.
388 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
389 auto &Ctx = M.getContext();
390 auto &DL = M.getDataLayout();
391 std::vector<Type *> Items;
392 Type *Int32Ty = IRB.getInt32Ty();
393 std::vector<Constant *> Initializers;
// MaxAlignment ends up as the largest alignment over every LDS global the
// kernel accesses, directly or indirectly.
394 Align MaxAlignment(1);
395 auto UpdateMaxAlignment = [&MaxAlignment, &DL](GlobalVariable *GV) {
396 Align GVAlign = AMDGPU::getAlign(DL, GV);
397 MaxAlignment = std::max(MaxAlignment, GVAlign);
398 };
399
400 for (GlobalVariable *GV : LDSParams.DirectAccess.StaticLDSGlobals)
401 UpdateMaxAlignment(GV);
402
403 for (GlobalVariable *GV : LDSParams.DirectAccess.DynamicLDSGlobals)
404 UpdateMaxAlignment(GV);
405
406 for (GlobalVariable *GV : LDSParams.IndirectAccess.StaticLDSGlobals)
407 UpdateMaxAlignment(GV);
408
409 for (GlobalVariable *GV : LDSParams.IndirectAccess.DynamicLDSGlobals)
410 UpdateMaxAlignment(GV);
411
412 // {StartOffset, SizeInBytes, AlignedSizeInBytes}
413 SmallString<128> MDItemStr;
414 raw_svector_ostream MDItemOS(MDItemStr);
415 MDItemOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.item";
416
417 StructType *LDSItemTy =
418 StructType::create(Ctx, {Int32Ty, Int32Ty, Int32Ty}, MDItemOS.str());
// MallocSize aliases the kernel's stored value, so the updates made inside
// the lambda below persist in LDSParams.
419 uint32_t &MallocSize = LDSParams.MallocSize;
420 SetVector<GlobalVariable *> UniqueLDSGlobals;
421 int AsanScale = AsanInfo.Scale;
// Appends one metadata item per global not seen before and advances
// MallocSize past the variable, its redzone and the alignment padding.
422 auto buildInitializerForSwLDSMD =
423 [&](SetVector<GlobalVariable *> &LDSGlobals) {
424 for (auto &GV : LDSGlobals) {
425 if (is_contained(UniqueLDSGlobals, GV))
426 continue;
427 UniqueLDSGlobals.insert(GV);
428
429 Type *Ty = GV->getValueType();
430 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
431 Items.push_back(LDSItemTy);
432 Constant *ItemStartOffset = ConstantInt::get(Int32Ty, MallocSize);
433 Constant *SizeInBytesConst = ConstantInt::get(Int32Ty, SizeInBytes);
434 // Get redzone size corresponding a size.
435 const uint64_t RightRedzoneSize =
436 AMDGPU::getRedzoneSizeForGlobal(AsanScale, SizeInBytes);
437 // Update MallocSize with current size and redzone size.
438 MallocSize += SizeInBytes;
// The redzone begins right after the variable. It is recorded for
// non-dynamic LDS only.
439 if (!AMDGPU::isDynamicLDS(*GV))
440 LDSParams.RedzoneOffsetAndSizeVector.emplace_back(MallocSize,
441 RightRedzoneSize);
442 MallocSize += RightRedzoneSize;
443 // Align current size plus redzone.
444 uint64_t AlignedSize =
445 alignTo(SizeInBytes + RightRedzoneSize, MaxAlignment);
446 Constant *AlignedSizeInBytesConst =
447 ConstantInt::get(Int32Ty, AlignedSize);
448 // Align MallocSize
449 MallocSize = alignTo(MallocSize, MaxAlignment);
450 Constant *InitItem =
451 ConstantStruct::get(LDSItemTy, {ItemStartOffset, SizeInBytesConst,
452 AlignedSizeInBytesConst});
453 Initializers.push_back(InitItem);
454 }
455 };
// Layout order: the SwLDS global itself first, then static LDS (direct,
// indirect), then dynamic LDS (direct, indirect).
456 SetVector<GlobalVariable *> SwLDSVector;
457 SwLDSVector.insert(LDSParams.SwLDS);
458 buildInitializerForSwLDSMD(SwLDSVector);
459 buildInitializerForSwLDSMD(LDSParams.DirectAccess.StaticLDSGlobals);
460 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.StaticLDSGlobals);
461 buildInitializerForSwLDSMD(LDSParams.DirectAccess.DynamicLDSGlobals);
462 buildInitializerForSwLDSMD(LDSParams.IndirectAccess.DynamicLDSGlobals);
463
464 // Update the LDS size used by the kernel.
465 Type *Ty = LDSParams.SwLDS->getValueType();
466 const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
467 uint64_t AlignedSize = alignTo(SizeInBytes, MaxAlignment);
468 LDSParams.LDSSize = AlignedSize;
469 SmallString<128> MDTypeStr;
470 raw_svector_ostream MDTypeOS(MDTypeStr);
471 MDTypeOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md.type";
472 StructType *MetadataStructType =
473 StructType::create(Ctx, Items, MDTypeOS.str());
474 SmallString<128> MDStr;
475 raw_svector_ostream MDOS(MDStr);
476 MDOS << "llvm.amdgcn.sw.lds." << Func->getName() << ".md";
// The global is created with a poison initializer, which is replaced by the
// real one a few lines below.
477 LDSParams.SwLDSMetadata = new GlobalVariable(
478 M, MetadataStructType, false, GlobalValue::InternalLinkage,
479 PoisonValue::get(MetadataStructType), MDOS.str(), nullptr,
// NOTE(review): listing line 480 (the remaining GlobalVariable constructor
// arguments) is missing from this copy -- restore from upstream.
481 Constant *data = ConstantStruct::get(MetadataStructType, Initializers);
482 LDSParams.SwLDSMetadata->setInitializer(data);
483 assert(LDSParams.SwLDS);
484 // Set the alignment to MaxAlignment for SwLDS.
485 LDSParams.SwLDS->setAlignment(MaxAlignment);
486 if (LDSParams.SwDynLDS)
487 LDSParams.SwDynLDS->setAlignment(MaxAlignment);
// NOTE(review): listing line 488 (the declaration of the sanitizer-metadata
// object `MD` used below) is missing from this copy -- restore from upstream.
489 MD.NoAddress = true;
490 LDSParams.SwLDSMetadata->setSanitizerMetadata(MD);
491}
492
493void AMDGPUSwLowerLDS::populateLDSToReplacementIndicesMap(Function *Func) {
494 // Fill the corresponding LDS replacement indices for each LDS access
495 // related to this kernel.
496 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
497 SetVector<GlobalVariable *> UniqueLDSGlobals;
498 auto PopulateIndices = [&](SetVector<GlobalVariable *> &LDSGlobals,
499 uint32_t &Idx) {
500 for (auto &GV : LDSGlobals) {
501 if (is_contained(UniqueLDSGlobals, GV))
502 continue;
503 UniqueLDSGlobals.insert(GV);
504 LDSParams.LDSToReplacementIndicesMap[GV] = {0, Idx, 0};
505 ++Idx;
506 }
507 };
508 uint32_t Idx = 0;
509 SetVector<GlobalVariable *> SwLDSVector;
510 SwLDSVector.insert(LDSParams.SwLDS);
511 PopulateIndices(SwLDSVector, Idx);
512 PopulateIndices(LDSParams.DirectAccess.StaticLDSGlobals, Idx);
513 PopulateIndices(LDSParams.IndirectAccess.StaticLDSGlobals, Idx);
514 PopulateIndices(LDSParams.DirectAccess.DynamicLDSGlobals, Idx);
515 PopulateIndices(LDSParams.IndirectAccess.DynamicLDSGlobals, Idx);
516}
517
518static void replacesUsesOfGlobalInFunction(Function *Func, GlobalVariable *GV,
519 Value *Replacement) {
520 // Replace all uses of LDS global in this Function with a Replacement.
521 auto ReplaceUsesLambda = [Func](const Use &U) -> bool {
522 auto *V = U.getUser();
523 if (auto *Inst = dyn_cast<Instruction>(V)) {
524 auto *Func1 = Inst->getFunction();
525 if (Func == Func1)
526 return true;
527 }
528 return false;
529 };
530 GV->replaceUsesWithIf(Replacement, ReplaceUsesLambda);
531}
532
// Rewrites the uses inside kernel Func of every directly accessed LDS global.
// For each such global, the offset is loaded from the kernel's metadata
// global and the uses are replaced with an i8 GEP of SwLDS by that offset.
// Instructions are emitted at the current insertion point of IRB.
533void AMDGPUSwLowerLDS::replaceKernelLDSAccesses(Function *Func) {
534 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
535 GlobalVariable *SwLDS = LDSParams.SwLDS;
536 assert(SwLDS);
537 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
538 assert(SwLDSMetadata);
539 StructType *SwLDSMetadataStructType =
540 cast<StructType>(SwLDSMetadata->getValueType());
541 Type *Int32Ty = IRB.getInt32Ty();
542 auto &IndirectAccess = LDSParams.IndirectAccess;
543 auto &DirectAccess = LDSParams.DirectAccess;
544 // Replace all uses of LDS global in this Function with a Replacement.
545 SetVector<GlobalVariable *> UniqueLDSGlobals;
546 auto ReplaceLDSGlobalUses = [&](SetVector<GlobalVariable *> &LDSGlobals) {
547 for (auto &GV : LDSGlobals) {
548 // Do not generate instructions if LDS access is in non-kernel
549 // i.e indirect-access.
550 if ((IndirectAccess.StaticLDSGlobals.contains(GV) ||
551 IndirectAccess.DynamicLDSGlobals.contains(GV)) &&
552 (!DirectAccess.StaticLDSGlobals.contains(GV) &&
553 !DirectAccess.DynamicLDSGlobals.contains(GV)))
554 continue;
// Each global is handled once even when it appears in several lists.
555 if (is_contained(UniqueLDSGlobals, GV))
556 continue;
557 UniqueLDSGlobals.insert(GV);
558 auto &Indices = LDSParams.LDSToReplacementIndicesMap[GV];
559 assert(Indices.size() == 3);
560 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
561 ConstantInt::get(Int32Ty, Indices[1]),
562 ConstantInt::get(Int32Ty, Indices[2])};
// NOTE(review): listing line 563 is missing from this copy; it should declare
// `GEP` (used below) and open the call whose arguments follow -- presumably a
// constant GEP expression into the metadata global. Restore from upstream.
564 SwLDSMetadataStructType, SwLDSMetadata, GEPIdx, true);
565 Value *Offset = IRB.CreateLoad(Int32Ty, GEP);
566 Value *BasePlusOffset =
567 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), SwLDS, {Offset});
568 LLVM_DEBUG(GV->printAsOperand(dbgs() << "Sw LDS Lowering, Replacing LDS ",
569 false));
570 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
571 }
572 };
573 ReplaceLDSGlobalUses(DirectAccess.StaticLDSGlobals);
574 ReplaceLDSGlobalUses(IndirectAccess.StaticLDSGlobals);
575 ReplaceLDSGlobalUses(DirectAccess.DynamicLDSGlobals);
576 ReplaceLDSGlobalUses(IndirectAccess.DynamicLDSGlobals);
577}
578
579void AMDGPUSwLowerLDS::updateMallocSizeForDynamicLDS(
580 Function *Func, Value **CurrMallocSize, Value *HiddenDynLDSSize,
581 SetVector<GlobalVariable *> &DynamicLDSGlobals) {
582 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
583 Type *Int32Ty = IRB.getInt32Ty();
584
585 GlobalVariable *SwLDS = LDSParams.SwLDS;
586 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
587 assert(SwLDS && SwLDSMetadata);
588 StructType *MetadataStructType =
589 cast<StructType>(SwLDSMetadata->getValueType());
590 unsigned MaxAlignment = SwLDS->getAlignment();
591 Value *MaxAlignValue = IRB.getInt32(MaxAlignment);
592 Value *MaxAlignValueMinusOne = IRB.getInt32(MaxAlignment - 1);
593
594 for (GlobalVariable *DynGV : DynamicLDSGlobals) {
595 auto &Indices = LDSParams.LDSToReplacementIndicesMap[DynGV];
596 // Update the Offset metadata.
597 Constant *Index0 = ConstantInt::get(Int32Ty, 0);
598 Constant *Index1 = ConstantInt::get(Int32Ty, Indices[1]);
599
600 Constant *Index2Offset = ConstantInt::get(Int32Ty, 0);
601 auto *GEPForOffset = IRB.CreateInBoundsGEP(
602 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2Offset});
603
604 IRB.CreateStore(*CurrMallocSize, GEPForOffset);
605 // Update the size and Aligned Size metadata.
606 Constant *Index2Size = ConstantInt::get(Int32Ty, 1);
607 auto *GEPForSize = IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
608 {Index0, Index1, Index2Size});
609
610 Value *CurrDynLDSSize = IRB.CreateLoad(Int32Ty, HiddenDynLDSSize);
611 IRB.CreateStore(CurrDynLDSSize, GEPForSize);
612 Constant *Index2AlignedSize = ConstantInt::get(Int32Ty, 2);
613 auto *GEPForAlignedSize = IRB.CreateInBoundsGEP(
614 MetadataStructType, SwLDSMetadata, {Index0, Index1, Index2AlignedSize});
615
616 Value *AlignedDynLDSSize =
617 IRB.CreateAdd(CurrDynLDSSize, MaxAlignValueMinusOne);
618 AlignedDynLDSSize = IRB.CreateUDiv(AlignedDynLDSSize, MaxAlignValue);
619 AlignedDynLDSSize = IRB.CreateMul(AlignedDynLDSSize, MaxAlignValue);
620 IRB.CreateStore(AlignedDynLDSSize, GEPForAlignedSize);
621
622 // Update the Current Malloc Size
623 *CurrMallocSize = IRB.CreateAdd(*CurrMallocSize, AlignedDynLDSSize);
624 }
625}
626
627static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
628 DISubprogram *SP) {
629 assert(InsertBefore);
630 if (InsertBefore->getDebugLoc())
631 return InsertBefore->getDebugLoc();
632 if (SP)
633 return DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
634 return DebugLoc();
635}
636
637void AMDGPUSwLowerLDS::getLDSMemoryInstructions(
638 Function *Func, SetVector<Instruction *> &LDSInstructions) {
639 for (BasicBlock &BB : *Func) {
640 for (Instruction &Inst : BB) {
641 if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
642 if (LI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
643 LDSInstructions.insert(&Inst);
644 } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
645 if (SI->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
646 LDSInstructions.insert(&Inst);
647 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&Inst)) {
648 if (RMW->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
649 LDSInstructions.insert(&Inst);
650 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) {
651 if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
652 LDSInstructions.insert(&Inst);
653 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&Inst)) {
654 if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
655 ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS)
656 LDSInstructions.insert(&Inst);
657 } else if (AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(&Inst)) {
658 if (MI->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
659 LDSInstructions.insert(&Inst);
660 } else if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
661 if (MTI->getSourceAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
662 LDSInstructions.insert(&Inst);
663 }
664 } else
665 continue;
666 }
667 }
668}
669
670Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr,
671 Value *LDSPtr) {
672 assert(LDSPtr && "Invalid LDS pointer operand");
673 Type *LDSPtrType = LDSPtr->getType();
674 LLVMContext &Ctx = M.getContext();
675 const DataLayout &DL = M.getDataLayout();
676 Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS);
677 if (auto *VecPtrTy = dyn_cast<VectorType>(LDSPtrType)) {
678 // Handle vector of pointers
679 ElementCount NumElements = VecPtrTy->getElementCount();
680 IntTy = VectorType::get(IntTy, NumElements);
681 }
682 Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy);
683 return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex});
684}
685
// Replaces every instruction in LDSInstructions with an equivalent one that
// addresses global memory relative to LoadMallocPtr. Each replacement is
// built right before the instruction it replaces, takes over its uses, and
// the old instruction is erased. All replacements except address space casts
// are queued in AsanInfo.Instructions.
686void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory(
687 Function *Func, Value *LoadMallocPtr,
688 SetVector<Instruction *> &LDSInstructions) {
689 LLVM_DEBUG(dbgs() << "Translating LDS memory operations to global memory : "
690 << Func->getName());
691 for (Instruction *Inst : LDSInstructions) {
692 IRB.SetInsertPoint(Inst);
693 if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
// Load: keep type, alignment, volatility, ordering and sync scope.
694 Value *LIOperand = LI->getPointerOperand();
695 Value *Replacement =
696 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand);
697 LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement,
698 LI->getAlign(), LI->isVolatile());
699 NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
700 AsanInfo.Instructions.insert(NewLI);
701 LI->replaceAllUsesWith(NewLI);
702 LI->eraseFromParent();
703 } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
// Store: keep value, alignment, volatility, ordering and sync scope.
704 Value *SIOperand = SI->getPointerOperand();
705 Value *Replacement =
706 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand);
707 StoreInst *NewSI = IRB.CreateAlignedStore(
708 SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile());
709 NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
710 AsanInfo.Instructions.insert(NewSI);
711 SI->replaceAllUsesWith(NewSI);
712 SI->eraseFromParent();
713 } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
// atomicrmw: keep operation, value, alignment, ordering, scope, volatility.
714 Value *RMWPtrOperand = RMW->getPointerOperand();
715 Value *RMWValOperand = RMW->getValOperand();
716 Value *Replacement =
717 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand);
718 AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW(
719 RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(),
720 RMW->getOrdering(), RMW->getSyncScopeID());
721 NewRMW->setVolatile(RMW->isVolatile());
722 AsanInfo.Instructions.insert(NewRMW);
723 RMW->replaceAllUsesWith(NewRMW);
724 RMW->eraseFromParent();
725 } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) {
726 Value *XCHGPtrOperand = XCHG->getPointerOperand();
727 Value *Replacement =
728 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand);
// NOTE(review): listing line 729 is missing from this copy; it should declare
// `NewXCHG` (used below) and open the builder call whose arguments follow.
// Restore from upstream.
730 Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(),
731 XCHG->getAlign(), XCHG->getSuccessOrdering(),
732 XCHG->getFailureOrdering(), XCHG->getSyncScopeID());
733 NewXCHG->setVolatile(XCHG->isVolatile());
734 AsanInfo.Instructions.insert(NewXCHG);
735 XCHG->replaceAllUsesWith(NewXCHG);
736 XCHG->eraseFromParent();
737 } else if (AnyMemIntrinsic *MI = dyn_cast<AnyMemIntrinsic>(Inst)) {
// Memory intrinsic: only an operand that lives in LDS is translated; an
// operand in another address space is passed through unchanged.
738 Value *NewDest = MI->getRawDest();
739 if (MI->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
740 NewDest = getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, NewDest);
741 CallInst *NewMI = nullptr;
// NOTE(review): listing line 742 is missing from this copy; it should open
// the memset case and introduce `MSI` (used below). Restore from upstream.
743 if (MI->isAtomic()) {
// NOTE(review): listing line 744 is missing from this copy; it should assign
// `NewMI` from the builder call whose arguments follow (presumably the
// element-wise atomic memset). Restore from upstream.
745 NewDest, MSI->getValue(), MSI->getLength(),
746 MSI->getDestAlign().valueOrOne(), MSI->getElementSizeInBytes());
747 } else {
748 NewMI = IRB.CreateMemSet(NewDest, MSI->getValue(), MSI->getLength(),
749 MSI->getDestAlign(),
750 cast<MemSetInst>(MI)->isVolatile());
751 }
// NOTE(review): listing line 752 is missing from this copy; it should close
// the memset case, open the memory-transfer case and introduce `MTI` (used
// below). Restore from upstream.
753 Value *NewSrc = MTI->getRawSource();
754 if (MTI->getSourceAddressSpace() == AMDGPUAS::LOCAL_ADDRESS)
755 NewSrc = getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, NewSrc);
756 if (MI->isAtomic()) {
757 if (MI->getIntrinsicID() ==
758 Intrinsic::memmove_element_unordered_atomic) {
// NOTE(review): listing line 759 is missing from this copy; it should assign
// `NewMI` from the builder call whose arguments follow (presumably the
// element-wise atomic memmove). Restore from upstream.
760 NewDest, MTI->getDestAlign().valueOrOne(), NewSrc,
761 MTI->getSourceAlign().valueOrOne(), MTI->getLength(),
762 MTI->getElementSizeInBytes());
763 } else {
// NOTE(review): listing line 764 is missing from this copy; it should assign
// `NewMI` from the builder call whose arguments follow (presumably the
// element-wise atomic memcpy). Restore from upstream.
765 NewDest, MTI->getDestAlign().valueOrOne(), NewSrc,
766 MTI->getSourceAlign().valueOrOne(), MTI->getLength(),
767 MTI->getElementSizeInBytes());
768 }
769 } else {
770 NewMI = IRB.CreateMemTransferInst(
771 MI->getIntrinsicID(), NewDest, MTI->getDestAlign(), NewSrc,
772 MTI->getSourceAlign(), MTI->getLength(),
773 cast<MemTransferInst>(MI)->isVolatile());
774 }
775 } else
776 reportFatalUsageError("Unimplemented LDS lowering memory intrinsic");
777 AsanInfo.Instructions.insert(NewMI);
778 MI->replaceAllUsesWith(NewMI);
779 MI->eraseFromParent();
780 } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Inst)) {
// Address space cast: cast the translated global pointer to the original
// result type instead.
781 Value *AIOperand = ASC->getPointerOperand();
782 Value *Replacement =
783 getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand);
784 Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType());
785 // Note: No need to add the instruction to AsanInfo instructions to be
786 // instrumented list. FLAT_ADDRESS ptr would have been already
787 // instrumented by asan pass prior to this pass.
788 ASC->replaceAllUsesWith(NewAI);
789 ASC->eraseFromParent();
790 } else
791 report_fatal_error("Unimplemented LDS lowering instruction");
792 }
793}
794
795void AMDGPUSwLowerLDS::poisonRedzones(Function *Func, Value *MallocPtr) {
796 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
797 Type *Int64Ty = IRB.getInt64Ty();
798 Type *VoidTy = IRB.getVoidTy();
799 FunctionCallee AsanPoisonRegion = M.getOrInsertFunction(
800 "__asan_poison_region",
801 FunctionType::get(VoidTy, {Int64Ty, Int64Ty}, false));
802
803 auto RedzonesVec = LDSParams.RedzoneOffsetAndSizeVector;
804 size_t VecSize = RedzonesVec.size();
805 for (unsigned i = 0; i < VecSize; i++) {
806 auto &RedzonePair = RedzonesVec[i];
807 uint64_t RedzoneOffset = RedzonePair.first;
808 uint64_t RedzoneSize = RedzonePair.second;
809 Value *RedzoneAddrOffset = IRB.CreateInBoundsGEP(
810 IRB.getInt8Ty(), MallocPtr, {IRB.getInt64(RedzoneOffset)});
811 Value *RedzoneAddress = IRB.CreatePtrToInt(RedzoneAddrOffset, Int64Ty);
812 IRB.CreateCall(AsanPoisonRegion,
813 {RedzoneAddress, IRB.getInt64(RedzoneSize)});
814 }
815}
816
817void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func,
818 DomTreeUpdater &DTU) {
819 LLVM_DEBUG(dbgs() << "Sw Lowering Kernel LDS for : " << Func->getName());
820 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
821 auto &Ctx = M.getContext();
822 auto *PrevEntryBlock = &Func->getEntryBlock();
823 SetVector<Instruction *> LDSInstructions;
824 getLDSMemoryInstructions(Func, LDSInstructions);
825 const DataLayout &DL = M.getDataLayout();
826
827 // Create malloc block.
828 auto *MallocBlock = BasicBlock::Create(Ctx, "Malloc", Func, PrevEntryBlock);
829
830 // Create WIdBlock block which has instructions related to selection of
831 // {0,0,0} index work item in the work group.
832 auto *WIdBlock = BasicBlock::Create(Ctx, "WId", Func, MallocBlock);
833
834 // Move constant-size allocas from the original entry block to the new entry
835 // block (WIdBlock) so they remain static allocas. Splice the leading cluster
836 // in bulk, then move any stragglers that are interleaved with other
837 // instructions.
838 auto SplitIt = PrevEntryBlock->getFirstNonPHIOrDbgOrAlloca();
839 WIdBlock->splice(WIdBlock->end(), PrevEntryBlock, PrevEntryBlock->begin(),
840 SplitIt);
841 for (Instruction &I : make_early_inc_range(*PrevEntryBlock))
842 if (auto *AI = dyn_cast<AllocaInst>(&I))
843 if (isa<ConstantInt>(AI->getArraySize()))
844 AI->moveBefore(*WIdBlock, WIdBlock->end());
845
846 IRB.SetInsertPoint(WIdBlock, WIdBlock->end());
847 DebugLoc FirstDL =
848 getOrCreateDebugLoc(&*PrevEntryBlock->begin(), Func->getSubprogram());
849 IRB.SetCurrentDebugLocation(FirstDL);
850 Value *WIdx = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_x, {});
851 Value *WIdy = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_y, {});
852 Value *WIdz = IRB.CreateIntrinsic(Intrinsic::amdgcn_workitem_id_z, {});
853 Value *XYOr = IRB.CreateOr(WIdx, WIdy);
854 Value *XYZOr = IRB.CreateOr(XYOr, WIdz);
855 Value *WIdzCond = IRB.CreateICmpEQ(XYZOr, IRB.getInt32(0));
856
857 // All work items will branch to PrevEntryBlock except {0,0,0} index
858 // work item which will branch to malloc block.
859 IRB.CreateCondBr(WIdzCond, MallocBlock, PrevEntryBlock);
860
861 // Malloc block
862 IRB.SetInsertPoint(MallocBlock, MallocBlock->begin());
863
864 // If Dynamic LDS globals are accessed by the kernel,
865 // Get the size of dyn lds from hidden dyn_lds_size kernel arg.
866 // Update the corresponding metadata global entries for this dyn lds global.
867 GlobalVariable *SwLDS = LDSParams.SwLDS;
868 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
869 assert(SwLDS && SwLDSMetadata);
870 StructType *MetadataStructType =
871 cast<StructType>(SwLDSMetadata->getValueType());
872 uint32_t MallocSize = 0;
873 Value *CurrMallocSize;
874 Type *Int32Ty = IRB.getInt32Ty();
875 Type *Int64Ty = IRB.getInt64Ty();
876
877 SetVector<GlobalVariable *> UniqueLDSGlobals;
878 auto GetUniqueLDSGlobals = [&](SetVector<GlobalVariable *> &LDSGlobals) {
879 for (auto &GV : LDSGlobals) {
880 if (is_contained(UniqueLDSGlobals, GV))
881 continue;
882 UniqueLDSGlobals.insert(GV);
883 }
884 };
885
886 GetUniqueLDSGlobals(LDSParams.DirectAccess.StaticLDSGlobals);
887 GetUniqueLDSGlobals(LDSParams.IndirectAccess.StaticLDSGlobals);
888 unsigned NumStaticLDS = 1 + UniqueLDSGlobals.size();
889 UniqueLDSGlobals.clear();
890
891 if (NumStaticLDS) {
892 auto *GEPForEndStaticLDSOffset =
893 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
894 {ConstantInt::get(Int32Ty, 0),
895 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
896 ConstantInt::get(Int32Ty, 0)});
897
898 auto *GEPForEndStaticLDSSize =
899 IRB.CreateInBoundsGEP(MetadataStructType, SwLDSMetadata,
900 {ConstantInt::get(Int32Ty, 0),
901 ConstantInt::get(Int32Ty, NumStaticLDS - 1),
902 ConstantInt::get(Int32Ty, 2)});
903
904 Value *EndStaticLDSOffset =
905 IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSOffset);
906 Value *EndStaticLDSSize = IRB.CreateLoad(Int32Ty, GEPForEndStaticLDSSize);
907 CurrMallocSize = IRB.CreateAdd(EndStaticLDSOffset, EndStaticLDSSize);
908 } else
909 CurrMallocSize = IRB.getInt32(MallocSize);
910
911 if (LDSParams.SwDynLDS) {
914 "Dynamic LDS size query is only supported for CO V5 and later.");
915 // Get size from hidden dyn_lds_size argument of kernel
917 IRB.CreateIntrinsic(Intrinsic::amdgcn_implicitarg_ptr, {});
918 Value *HiddenDynLDSSize = IRB.CreateInBoundsGEP(
919 ImplicitArg->getType(), ImplicitArg,
920 {ConstantInt::get(Int64Ty, COV5_HIDDEN_DYN_LDS_SIZE_ARG)});
921 UniqueLDSGlobals.clear();
922 GetUniqueLDSGlobals(LDSParams.DirectAccess.DynamicLDSGlobals);
923 GetUniqueLDSGlobals(LDSParams.IndirectAccess.DynamicLDSGlobals);
924 updateMallocSizeForDynamicLDS(Func, &CurrMallocSize, HiddenDynLDSSize,
925 UniqueLDSGlobals);
926 }
927
928 CurrMallocSize = IRB.CreateZExt(CurrMallocSize, Int64Ty);
929
930 // Create a call to malloc function which does device global memory allocation
931 // with size equals to all LDS global accesses size in this kernel.
932 Value *ReturnAddress = IRB.CreateIntrinsic(
933 Intrinsic::returnaddress, IRB.getPtrTy(DL.getProgramAddressSpace()),
934 {IRB.getInt32(0)});
935 FunctionCallee MallocFunc = M.getOrInsertFunction(
936 StringRef("__asan_malloc_impl"),
937 FunctionType::get(Int64Ty, {Int64Ty, Int64Ty}, false));
938 Value *RAPtrToInt = IRB.CreatePtrToInt(ReturnAddress, Int64Ty);
939 Value *MallocCall = IRB.CreateCall(MallocFunc, {CurrMallocSize, RAPtrToInt});
940
941 Value *MallocPtr =
943
944 // Create store of malloc to new global
945 IRB.CreateStore(MallocPtr, SwLDS);
946
947 // Create calls to __asan_poison_region to poison redzones.
948 poisonRedzones(Func, MallocPtr);
949
950 // Create branch to PrevEntryBlock
951 IRB.CreateBr(PrevEntryBlock);
952
953 // Create wave-group barrier at the start of the previous entry block
954 Type *Int1Ty = IRB.getInt1Ty();
955 IRB.SetInsertPoint(PrevEntryBlock, PrevEntryBlock->begin());
956 auto *XYZCondPhi = IRB.CreatePHI(Int1Ty, 2, "xyzCond");
957 XYZCondPhi->addIncoming(IRB.getInt1(0), WIdBlock);
958 XYZCondPhi->addIncoming(IRB.getInt1(1), MallocBlock);
959
960 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
961
962 // Load malloc pointer from Sw LDS.
963 Value *LoadMallocPtr =
965
966 // Replace All uses of LDS globals with new LDS pointers.
967 replaceKernelLDSAccesses(Func);
968
969 // Replace Memory Operations on LDS with corresponding
970 // global memory pointers.
971 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
972 LDSInstructions);
973
974 auto *CondFreeBlock = BasicBlock::Create(Ctx, "CondFree", Func);
975 auto *FreeBlock = BasicBlock::Create(Ctx, "Free", Func);
976 auto *EndBlock = BasicBlock::Create(Ctx, "End", Func);
977 for (BasicBlock &BB : *Func) {
978 if (!BB.empty()) {
979 if (ReturnInst *RI = dyn_cast<ReturnInst>(&BB.back())) {
980 RI->eraseFromParent();
981 IRB.SetInsertPoint(&BB, BB.end());
982 IRB.CreateBr(CondFreeBlock);
983 }
984 }
985 }
986
987 // Cond Free Block
988 IRB.SetInsertPoint(CondFreeBlock, CondFreeBlock->begin());
989 IRB.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {});
990 IRB.CreateCondBr(XYZCondPhi, FreeBlock, EndBlock);
991
992 // Free Block
993 IRB.SetInsertPoint(FreeBlock, FreeBlock->begin());
994
995 // Free the previously allocated device global memory.
996 FunctionCallee AsanFreeFunc = M.getOrInsertFunction(
997 StringRef("__asan_free_impl"),
998 FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false));
999 Value *ReturnAddr = IRB.CreateIntrinsic(
1000 Intrinsic::returnaddress, IRB.getPtrTy(DL.getProgramAddressSpace()),
1001 IRB.getInt32(0));
1002 Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty);
1003 Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty);
1004 IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt});
1005
1006 IRB.CreateBr(EndBlock);
1007
1008 // End Block
1009 IRB.SetInsertPoint(EndBlock, EndBlock->begin());
1010 IRB.CreateRetVoid();
1011 // Update the DomTree with corresponding links to basic blocks.
1012 DTU.applyUpdates({{DominatorTree::Insert, WIdBlock, MallocBlock},
1013 {DominatorTree::Insert, MallocBlock, PrevEntryBlock},
1014 {DominatorTree::Insert, CondFreeBlock, FreeBlock},
1015 {DominatorTree::Insert, FreeBlock, EndBlock}});
1016}
1017
1018Constant *AMDGPUSwLowerLDS::getAddressesOfVariablesInKernel(
1019 Function *Func, SetVector<GlobalVariable *> &Variables) {
1020 Type *Int32Ty = IRB.getInt32Ty();
1021 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1022
1023 GlobalVariable *SwLDSMetadata = LDSParams.SwLDSMetadata;
1024 assert(SwLDSMetadata);
1025 auto *SwLDSMetadataStructType =
1026 cast<StructType>(SwLDSMetadata->getValueType());
1027 ArrayType *KernelOffsetsType =
1029
1030 SmallVector<Constant *> Elements;
1031 for (auto *GV : Variables) {
1032 auto It = LDSParams.LDSToReplacementIndicesMap.find(GV);
1033 if (It == LDSParams.LDSToReplacementIndicesMap.end()) {
1034 Elements.push_back(
1036 continue;
1037 }
1038 auto &Indices = It->second;
1039 Constant *GEPIdx[] = {ConstantInt::get(Int32Ty, Indices[0]),
1040 ConstantInt::get(Int32Ty, Indices[1]),
1041 ConstantInt::get(Int32Ty, Indices[2])};
1042 Constant *GEP = ConstantExpr::getGetElementPtr(SwLDSMetadataStructType,
1043 SwLDSMetadata, GEPIdx, true);
1044 Elements.push_back(GEP);
1045 }
1046 return ConstantArray::get(KernelOffsetsType, Elements);
1047}
1048
1049void AMDGPUSwLowerLDS::buildNonKernelLDSBaseTable(
1050 NonKernelLDSParameters &NKLDSParams) {
1051 // Base table will have single row, with elements of the row
1052 // placed as per kernel ID. Each element in the row corresponds
1053 // to the address of the "SW LDS" global of the kernel.
1054 auto &Kernels = NKLDSParams.OrderedKernels;
1055 if (Kernels.empty())
1056 return;
1057 const size_t NumberKernels = Kernels.size();
1058 ArrayType *AllKernelsOffsetsType =
1059 ArrayType::get(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), NumberKernels);
1060 std::vector<Constant *> OverallConstantExprElts(NumberKernels);
1061 for (size_t i = 0; i < NumberKernels; i++) {
1062 Function *Func = Kernels[i];
1063 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1064 OverallConstantExprElts[i] = LDSParams.SwLDS;
1065 }
1066 Constant *init =
1067 ConstantArray::get(AllKernelsOffsetsType, OverallConstantExprElts);
1068 NKLDSParams.LDSBaseTable = new GlobalVariable(
1069 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, init,
1070 "llvm.amdgcn.sw.lds.base.table", nullptr, GlobalValue::NotThreadLocal,
1073 MD.NoAddress = true;
1074 NKLDSParams.LDSBaseTable->setSanitizerMetadata(MD);
1075}
1076
1077void AMDGPUSwLowerLDS::buildNonKernelLDSOffsetTable(
1078 NonKernelLDSParameters &NKLDSParams) {
1079 // Offset table will have multiple rows and columns.
1080 // Rows are assumed to be from 0 to (n-1). n is total number
1081 // of kernels accessing the LDS through non-kernels.
1082 // Each row will have m elements. m is the total number of
1083 // unique LDS globals accessed by non-kernels.
1084 // Each element in the row corresponds to the address of
1085 // the replacement of LDS global done by that particular kernel.
1086 auto &Variables = NKLDSParams.OrdereLDSGlobals;
1087 auto &Kernels = NKLDSParams.OrderedKernels;
1088 if (Variables.empty() || Kernels.empty())
1089 return;
1090 const size_t NumberVariables = Variables.size();
1091 const size_t NumberKernels = Kernels.size();
1092
1093 ArrayType *KernelOffsetsType =
1094 ArrayType::get(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), NumberVariables);
1095
1096 ArrayType *AllKernelsOffsetsType =
1097 ArrayType::get(KernelOffsetsType, NumberKernels);
1098 std::vector<Constant *> overallConstantExprElts(NumberKernels);
1099 for (size_t i = 0; i < NumberKernels; i++) {
1100 Function *Func = Kernels[i];
1101 overallConstantExprElts[i] =
1102 getAddressesOfVariablesInKernel(Func, Variables);
1103 }
1104 Constant *Init =
1105 ConstantArray::get(AllKernelsOffsetsType, overallConstantExprElts);
1106 NKLDSParams.LDSOffsetTable = new GlobalVariable(
1107 M, AllKernelsOffsetsType, true, GlobalValue::InternalLinkage, Init,
1108 "llvm.amdgcn.sw.lds.offset.table", nullptr, GlobalValue::NotThreadLocal,
1111 MD.NoAddress = true;
1112 NKLDSParams.LDSOffsetTable->setSanitizerMetadata(MD);
1113}
1114
1115void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
1116 Function *Func, SetVector<GlobalVariable *> &LDSGlobals,
1117 NonKernelLDSParameters &NKLDSParams) {
1118 // Replace LDS access in non-kernel with replacement queried from
1119 // Base table and offset from offset table.
1120 LLVM_DEBUG(dbgs() << "Sw LDS lowering, lower non-kernel access for : "
1121 << Func->getName());
1122 auto InsertAt = Func->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
1123 IRB.SetInsertPoint(InsertAt);
1124
1125 // Get LDS memory instructions.
1126 SetVector<Instruction *> LDSInstructions;
1127 getLDSMemoryInstructions(Func, LDSInstructions);
1128
1129 auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {});
1130 GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
1131 GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
1132 auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
1133 Value *BaseGEP = IRB.CreateInBoundsGEP(
1134 LDSBaseTable->getValueType(), LDSBaseTable, {IRB.getInt32(0), KernelId});
1135 Value *BaseLoad =
1136 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::LOCAL_ADDRESS), BaseGEP);
1137 Value *LoadMallocPtr =
1138 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), BaseLoad);
1139
1140 for (GlobalVariable *GV : LDSGlobals) {
1141 const auto *GVIt = llvm::find(OrdereLDSGlobals, GV);
1142 assert(GVIt != OrdereLDSGlobals.end());
1143 uint32_t GVOffset = std::distance(OrdereLDSGlobals.begin(), GVIt);
1144
1145 Value *OffsetGEP = IRB.CreateInBoundsGEP(
1146 LDSOffsetTable->getValueType(), LDSOffsetTable,
1147 {IRB.getInt32(0), KernelId, IRB.getInt32(GVOffset)});
1148 Value *OffsetLoad =
1149 IRB.CreateLoad(IRB.getPtrTy(AMDGPUAS::GLOBAL_ADDRESS), OffsetGEP);
1150 Value *Offset = IRB.CreateLoad(IRB.getInt32Ty(), OffsetLoad);
1151 Value *BasePlusOffset =
1152 IRB.CreateInBoundsGEP(IRB.getInt8Ty(), BaseLoad, {Offset});
1153 LLVM_DEBUG(dbgs() << "Sw LDS Lowering, Replace non-kernel LDS for "
1154 << GV->getName());
1155 replacesUsesOfGlobalInFunction(Func, GV, BasePlusOffset);
1156 }
1157 translateLDSMemoryOperationsToGlobalMemory(Func, LoadMallocPtr,
1158 LDSInstructions);
1159}
1160
1161static void reorderStaticDynamicIndirectLDSSet(KernelLDSParameters &LDSParams) {
1162 // Sort Static, dynamic LDS globals which are either
1163 // direct or indirect access on basis of name.
1164 auto &DirectAccess = LDSParams.DirectAccess;
1165 auto &IndirectAccess = LDSParams.IndirectAccess;
1166 LDSParams.DirectAccess.StaticLDSGlobals = sortByName(
1167 std::vector<GlobalVariable *>(DirectAccess.StaticLDSGlobals.begin(),
1168 DirectAccess.StaticLDSGlobals.end()));
1169 LDSParams.DirectAccess.DynamicLDSGlobals = sortByName(
1170 std::vector<GlobalVariable *>(DirectAccess.DynamicLDSGlobals.begin(),
1171 DirectAccess.DynamicLDSGlobals.end()));
1172 LDSParams.IndirectAccess.StaticLDSGlobals = sortByName(
1173 std::vector<GlobalVariable *>(IndirectAccess.StaticLDSGlobals.begin(),
1174 IndirectAccess.StaticLDSGlobals.end()));
1175 LDSParams.IndirectAccess.DynamicLDSGlobals = sortByName(
1176 std::vector<GlobalVariable *>(IndirectAccess.DynamicLDSGlobals.begin(),
1177 IndirectAccess.DynamicLDSGlobals.end()));
1178}
1179
1180void AMDGPUSwLowerLDS::initAsanInfo() {
1181 // Get Shadow mapping scale and offset.
1182 unsigned LongSize =
1183 M.getDataLayout().getPointerSizeInBits(AMDGPUAS::GLOBAL_ADDRESS);
1185 int Scale;
1186 bool OrShadowOffset;
1187 llvm::getAddressSanitizerParams(M.getTargetTriple(), LongSize, false, &Offset,
1188 &Scale, &OrShadowOffset);
1189 AsanInfo.Scale = Scale;
1190 AsanInfo.Offset = Offset;
1191}
1192
1193static bool hasFnWithSanitizeAddressAttr(FunctionVariableMap &LDSAccesses) {
1194 for (auto &K : LDSAccesses) {
1195 Function *F = K.first;
1196 if (!F)
1197 continue;
1198 if (F->hasFnAttribute(Attribute::SanitizeAddress))
1199 return true;
1200 }
1201 return false;
1202}
1203
1204bool AMDGPUSwLowerLDS::run() {
1205 bool Changed = false;
1206
1207 CallGraph CG = CallGraph(M);
1208
1209 Changed |=
1211
1212 // Get all the direct and indirect access of LDS for all the kernels.
1214
1215 // Flag to decide whether to lower all the LDS accesses
1216 // based on sanitize_address attribute.
1217 bool LowerAllLDS = hasFnWithSanitizeAddressAttr(LDSUsesInfo.DirectAccess) ||
1218 hasFnWithSanitizeAddressAttr(LDSUsesInfo.IndirectAccess);
1219
1220 if (!LowerAllLDS)
1221 return Changed;
1222
1223 // Utility to group LDS access into direct, indirect, static and dynamic.
1224 auto PopulateKernelStaticDynamicLDS = [&](FunctionVariableMap &LDSAccesses,
1225 bool DirectAccess) {
1226 for (auto &K : LDSAccesses) {
1227 Function *F = K.first;
1228 if (!F || K.second.empty())
1229 continue;
1230
1231 assert(isKernel(*F));
1232
1233 // Only inserts if key isn't already in the map.
1234 FuncLDSAccessInfo.KernelToLDSParametersMap.insert(
1235 {F, KernelLDSParameters()});
1236
1237 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[F];
1238 if (!DirectAccess)
1239 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess.insert(F);
1240 for (GlobalVariable *GV : K.second) {
1241 if (!DirectAccess) {
1242 if (AMDGPU::isDynamicLDS(*GV))
1243 LDSParams.IndirectAccess.DynamicLDSGlobals.insert(GV);
1244 else
1245 LDSParams.IndirectAccess.StaticLDSGlobals.insert(GV);
1246 FuncLDSAccessInfo.AllNonKernelLDSAccess.insert(GV);
1247 } else {
1248 if (AMDGPU::isDynamicLDS(*GV))
1249 LDSParams.DirectAccess.DynamicLDSGlobals.insert(GV);
1250 else
1251 LDSParams.DirectAccess.StaticLDSGlobals.insert(GV);
1252 }
1253 }
1254 }
1255 };
1256
1257 PopulateKernelStaticDynamicLDS(LDSUsesInfo.DirectAccess, true);
1258 PopulateKernelStaticDynamicLDS(LDSUsesInfo.IndirectAccess, false);
1259
1260 // Get address sanitizer scale.
1261 initAsanInfo();
1262
1263 for (auto &K : FuncLDSAccessInfo.KernelToLDSParametersMap) {
1264 Function *Func = K.first;
1265 auto &LDSParams = FuncLDSAccessInfo.KernelToLDSParametersMap[Func];
1266 if (LDSParams.DirectAccess.StaticLDSGlobals.empty() &&
1267 LDSParams.DirectAccess.DynamicLDSGlobals.empty() &&
1268 LDSParams.IndirectAccess.StaticLDSGlobals.empty() &&
1269 LDSParams.IndirectAccess.DynamicLDSGlobals.empty()) {
1270 Changed = false;
1271 } else {
1273 CG, Func,
1274 {"amdgpu-no-workitem-id-x", "amdgpu-no-workitem-id-y",
1275 "amdgpu-no-workitem-id-z", "amdgpu-no-heap-ptr"});
1276 if (!LDSParams.IndirectAccess.StaticLDSGlobals.empty() ||
1277 !LDSParams.IndirectAccess.DynamicLDSGlobals.empty())
1278 removeFnAttrFromReachable(CG, Func, {"amdgpu-no-lds-kernel-id"});
1279 reorderStaticDynamicIndirectLDSSet(LDSParams);
1280 buildSwLDSGlobal(Func);
1281 buildSwDynLDSGlobal(Func);
1282 populateSwMetadataGlobal(Func);
1283 populateSwLDSAttributeAndMetadata(Func);
1284 populateLDSToReplacementIndicesMap(Func);
1285 DomTreeUpdater DTU(DTCallback(*Func),
1286 DomTreeUpdater::UpdateStrategy::Lazy);
1287 lowerKernelLDSAccesses(Func, DTU);
1288 Changed = true;
1289 }
1290 }
1291
1292 // Get the Uses of LDS from non-kernels.
1293 getUsesOfLDSByNonKernels();
1294
1295 // Get non-kernels with LDS ptr as argument and called by kernels.
1296 getNonKernelsWithLDSArguments(CG);
1297
1298 // Lower LDS accesses in non-kernels.
1299 if (!FuncLDSAccessInfo.NonKernelToLDSAccessMap.empty() ||
1300 !FuncLDSAccessInfo.NonKernelsWithLDSArgument.empty()) {
1301 NonKernelLDSParameters NKLDSParams;
1302 NKLDSParams.OrderedKernels = getOrderedIndirectLDSAccessingKernels(
1303 FuncLDSAccessInfo.KernelsWithIndirectLDSAccess);
1304 NKLDSParams.OrdereLDSGlobals = getOrderedNonKernelAllLDSGlobals(
1305 FuncLDSAccessInfo.AllNonKernelLDSAccess);
1306 buildNonKernelLDSBaseTable(NKLDSParams);
1307 buildNonKernelLDSOffsetTable(NKLDSParams);
1308 for (auto &K : FuncLDSAccessInfo.NonKernelToLDSAccessMap) {
1309 Function *Func = K.first;
1310 DenseSet<GlobalVariable *> &LDSGlobals = K.second;
1311 SetVector<GlobalVariable *> OrderedLDSGlobals = sortByName(
1312 std::vector<GlobalVariable *>(LDSGlobals.begin(), LDSGlobals.end()));
1313 lowerNonKernelLDSAccesses(Func, OrderedLDSGlobals, NKLDSParams);
1314 }
1315 for (Function *Func : FuncLDSAccessInfo.NonKernelsWithLDSArgument) {
1316 auto &K = FuncLDSAccessInfo.NonKernelToLDSAccessMap;
1317 if (K.contains(Func))
1318 continue;
1320 lowerNonKernelLDSAccesses(Func, Vec, NKLDSParams);
1321 }
1322 Changed = true;
1323 }
1324
1325 if (!Changed)
1326 return Changed;
1327
1328 for (auto &GV : make_early_inc_range(M.globals())) {
1330 // probably want to remove from used lists
1332 if (GV.use_empty())
1333 GV.eraseFromParent();
1334 }
1335 }
1336
1337 if (AsanInstrumentLDS) {
1338 SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
1339 for (Instruction *Inst : AsanInfo.Instructions) {
1340 SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
1341 getInterestingMemoryOperands(M, Inst, InterestingOperands);
1342 llvm::append_range(OperandsToInstrument, InterestingOperands);
1343 }
1344 for (auto &Operand : OperandsToInstrument) {
1345 Value *Addr = Operand.getPtr();
1346 instrumentAddress(M, IRB, Operand.getInsn(), Operand.getInsn(), Addr,
1347 Operand.Alignment.valueOrOne(), Operand.TypeStoreSize,
1348 Operand.IsWrite, nullptr, false, false, AsanInfo.Scale,
1349 AsanInfo.Offset);
1350 Changed = true;
1351 }
1352 }
1353
1354 return Changed;
1355}
1356
1357class AMDGPUSwLowerLDSLegacy : public ModulePass {
1358public:
1359 static char ID;
1360 AMDGPUSwLowerLDSLegacy() : ModulePass(ID) {}
1361 bool runOnModule(Module &M) override;
1362 void getAnalysisUsage(AnalysisUsage &AU) const override {
1364 }
1365};
1366} // namespace
1367
1368char AMDGPUSwLowerLDSLegacy::ID = 0;
1369char &llvm::AMDGPUSwLowerLDSLegacyPassID = AMDGPUSwLowerLDSLegacy::ID;
1370
1371INITIALIZE_PASS_BEGIN(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1372 "AMDGPU Software lowering of LDS", false, false)
1374INITIALIZE_PASS_END(AMDGPUSwLowerLDSLegacy, "amdgpu-sw-lower-lds",
1375 "AMDGPU Software lowering of LDS", false, false)
1376
1377bool AMDGPUSwLowerLDSLegacy::runOnModule(Module &M) {
1378 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1379 // instrumented the IR. Return early if the flag is not present.
1380 if (!M.getModuleFlag("nosanitize_address"))
1381 return false;
1382 DominatorTreeWrapperPass *const DTW =
1383 getAnalysisIfAvailable<DominatorTreeWrapperPass>();
1384 auto DTCallback = [&DTW](Function &F) -> DominatorTree * {
1385 return DTW ? &DTW->getDomTree() : nullptr;
1386 };
1387
1388 AMDGPUSwLowerLDS SwLowerLDSImpl(M, DTCallback);
1389 bool IsChanged = SwLowerLDSImpl.run();
1390 return IsChanged;
1391}
1392
1394 return new AMDGPUSwLowerLDSLegacy();
1395}
1396
1399 // AddressSanitizer pass adds "nosanitize_address" module flag if it has
1400 // instrumented the IR. Return early if the flag is not present.
1401 if (!M.getModuleFlag("nosanitize_address"))
1402 return PreservedAnalyses::all();
1403 auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
1404 auto DTCallback = [&FAM](Function &F) -> DominatorTree * {
1405 return &FAM.getResult<DominatorTreeAnalysis>(F);
1406 };
1407 AMDGPUSwLowerLDS SwLowerLDSImpl(M, DTCallback);
1408 bool IsChanged = SwLowerLDSImpl.run();
1409 if (!IsChanged)
1410 return PreservedAnalyses::all();
1411
1414 return PA;
1415}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
This file provides interfaces used to build and manipulate a call graph, which is a very useful tool ...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file defines the DenseMap class.
This file defines the DenseSet and SmallDenseSet classes.
Hexagon Common GEP
IRTranslator LLVM IR MI
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
if(auto Err=PB.parsePassPipeline(MPM, Passes)) return wrap(std MPM run * Mod
FunctionAnalysisManager FAM
#define INITIALIZE_PASS_DEPENDENCY(depName)
Definition PassSupport.h:42
#define INITIALIZE_PASS_END(passName, arg, name, cfg, analysis)
Definition PassSupport.h:44
#define INITIALIZE_PASS_BEGIN(passName, arg, name, cfg, analysis)
Definition PassSupport.h:39
This file implements a set that has insertion order iteration characteristics.
static Split data
This file contains some functions that are useful when dealing with strings.
#define LLVM_DEBUG(...)
Definition Debug.h:119
Target-Independent Code Generator Pass Configuration Options pass.
static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore, DISubprogram *SP)
This class represents a conversion between pointers from one address space to another.
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
Represent the analysis usage information of a pass.
AnalysisUsage & addPreserved()
Add the specified Pass class to the set of analyses preserved by this pass.
This class represents any memset intrinsic.
static LLVM_ABI ArrayType * get(Type *ElementType, uint64_t NumElements)
This static method is the primary way to construct an ArrayType.
An instruction that atomically checks whether a specified value is in a memory location,...
void setVolatile(bool V)
Specify whether this is a volatile cmpxchg.
an instruction that atomically reads a memory location, combines it with another value,...
void setVolatile(bool V)
Specify whether this is a volatile RMW or not.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
A node in the call graph for a module.
Definition CallGraph.h:162
Function * getFunction() const
Returns the function that this call graph node represents.
Definition CallGraph.h:193
The basic data container for the call graph of a Module of IR.
Definition CallGraph.h:72
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI Constant * get(ArrayType *T, ArrayRef< Constant * > V)
static ConstantAsMetadata * get(Constant *C)
Definition Metadata.h:537
static Constant * getGetElementPtr(Type *Ty, Constant *C, ArrayRef< Constant * > IdxList, GEPNoWrapFlags NW=GEPNoWrapFlags::none(), std::optional< ConstantRange > InRange=std::nullopt, Type *OnlyIfReducedTy=nullptr)
Getelementptr form.
Definition Constants.h:1470
static LLVM_ABI Constant * get(StructType *T, ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI void removeDeadConstantUsers() const
If there are any dead constant users dangling off of this constant, remove them.
Subprogram description. Uses SubclassData1.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
A debug info location.
Definition DebugLoc.h:126
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
Analysis pass which computes a DominatorTree.
Definition Dominators.h:270
Legacy analysis pass which computes a DominatorTree.
Definition Dominators.h:306
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
static LLVM_ABI FunctionType * get(Type *Result, ArrayRef< Type * > Params, bool isVarArg)
This static method is the primary way of constructing a FunctionType.
arg_iterator arg_end()
Definition Function.h:851
arg_iterator arg_begin()
Definition Function.h:842
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set a particular kind of metadata attachment.
LLVM_ABI bool isDeclaration() const
Return true if the primary definition of this global value is outside of the current translation unit...
Definition Globals.cpp:346
LLVM_ABI void setSanitizerMetadata(SanitizerMetadata Meta)
Definition Globals.cpp:262
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
@ ExternalLinkage
Externally visible function.
Definition GlobalValue.h:53
Type * getValueType() const
uint64_t getAlignment() const
FIXME: Remove this function once transition to Align is over.
LLVM_ABI void eraseFromParent()
eraseFromParent - This method unlinks 'this' from the containing module and deletes it.
Definition Globals.cpp:547
ConstantInt * getInt1(bool V)
Get a constant value representing either true or false.
Definition IRBuilder.h:452
AtomicCmpXchgInst * CreateAtomicCmpXchg(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, SyncScope::ID SSID=SyncScope::System)
Definition IRBuilder.h:1957
IntegerType * getInt1Ty()
Fetch the type representing a single bit.
Definition IRBuilder.h:519
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1923
CondBrInst * CreateCondBr(Value *Cond, BasicBlock *True, BasicBlock *False, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a conditional 'br Cond, TrueDest, FalseDest' instruction.
Definition IRBuilder.h:1216
Value * CreateIntToPtr(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2227
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:221
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:534
IntegerType * getInt64Ty()
Fetch the type representing a 64-bit integer.
Definition IRBuilder.h:539
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:2008
LLVM_ABI CallInst * CreateElementUnorderedAtomicMemMove(Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, uint32_t ElementSize, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert an element unordered-atomic memmove between the specified pointers.
Value * CreateUDiv(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1473
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:482
UncondBrInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition IRBuilder.h:1210
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:477
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2529
Value * CreateICmpEQ(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2364
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1906
CallInst * CreateElementUnorderedAtomicMemSet(Value *Ptr, Value *Val, uint64_t Size, Align Alignment, uint32_t ElementSize, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert an element unordered-atomic memset of the region of memory starting at the given po...
Definition IRBuilder.h:629
CallInst * CreateMemSet(Value *Ptr, Value *Val, uint64_t Size, MaybeAlign Align, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert a memset to the specified pointer and the specified value.
Definition IRBuilder.h:608
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2110
ReturnInst * CreateRetVoid()
Create a 'ret void' instruction.
Definition IRBuilder.h:1187
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1919
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1422
Value * CreatePtrToInt(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2222
CallInst * CreateCall(FunctionType *FTy, Value *Callee, ArrayRef< Value * > Args={}, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2543
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:577
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:181
Type * getVoidTy()
Fetch the type representing void.
Definition IRBuilder.h:572
LLVM_ABI CallInst * CreateElementUnorderedAtomicMemCpy(Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size, uint32_t ElementSize, const AAMDNodes &AAInfo=AAMDNodes())
Create and insert an element unordered-atomic memcpy between the specified pointers.
StoreInst * CreateAlignedStore(Value *Val, Value *Ptr, MaybeAlign Align, bool isVolatile=false)
Definition IRBuilder.h:1942
LLVM_ABI CallInst * CreateMemTransferInst(Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size, bool isVolatile=false, const AAMDNodes &AAInfo=AAMDNodes())
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1592
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:524
Value * CreateAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2237
Value * CreateMul(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1456
AtomicRMWInst * CreateAtomicRMW(AtomicRMWInst::BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System, bool Elementwise=false)
Definition IRBuilder.h:1970
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2848
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1069
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1554
Root of the metadata hierarchy.
Definition Metadata.h:64
ModulePass class - This class is used to implement unstructured interprocedural optimizations and ana...
Definition Pass.h:255
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
A container for an operand bundle being viewed as a set of values rather than a set of uses.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
Return a value (possibly void), from a function.
A vector that has set insertion semantics.
Definition SetVector.h:57
size_type size() const
Determine the number of elements in the SetVector.
Definition SetVector.h:103
iterator end()
Get an iterator to the end of the SetVector.
Definition SetVector.h:112
void clear()
Completely clear the SetVector.
Definition SetVector.h:267
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
iterator begin()
Get an iterator to the beginning of the SetVector.
Definition SetVector.h:106
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
SmallString - A SmallString is just a SmallVector with methods and accessors that make it work better...
Definition SmallString.h:26
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this store instruction.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
Class to represent struct types.
static LLVM_ABI StructType * create(LLVMContext &Context, StringRef Name)
This creates an identified struct.
Definition Type.cpp:683
Target-Independent Code Generator Pass Configuration Options.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI void printAsOperand(raw_ostream &O, bool PrintType=true, const Module *M=nullptr) const
Print the name of this Value out to the specified raw_ostream.
bool use_empty() const
Definition Value.h:346
LLVM_ABI bool replaceUsesWithIf(Value *New, llvm::function_ref< bool(Use &U)> ShouldReplace)
Go through the uses list for this definition and make each use point to "V" if the callback ShouldRep...
Definition Value.cpp:561
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
An efficient, type-erasing, non-owning reference to a callable.
A raw_ostream that writes to an std::string.
A raw_ostream that writes to a SmallVector or SmallString.
StringRef str() const
Return a StringRef for the vector contents.
Changed
@ LOCAL_ADDRESS
Address space for local memory.
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
GVUsesInfoTy getTransitiveUsesOfLDSForLowering(const CallGraph &CG, Module &M)
Collects all uses of LDS Global Variables in M using getUsesOfGVByFunction, with isLDSVariableToLower...
void getInterestingMemoryOperands(Module &M, Instruction *I, SmallVectorImpl< InterestingMemoryOperand > &Interesting)
Get all the memory operands from the instruction that need to be instrumented.
bool isDynamicLDS(const GlobalVariable &GV)
unsigned getAMDHSACodeObjectVersion(const Module &M)
void removeFnAttrFromReachable(CallGraph &CG, Function *KernelRoot, ArrayRef< StringRef > FnAttrs)
Strip FnAttr attribute from any functions where we may have introduced its use.
bool eliminateGVConstantExprUsesFromAllInstructions(Module &M, function_ref< bool(const GlobalVariable &)> Filter)
Iterates over all GlobalVariables in M, and whenever Filter returns true, replace all constant users ...
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
DenseMap< Function *, DenseSet< GlobalVariable * > > FunctionVariableMap
bool isLDSVariableToLower(const GlobalVariable &GV)
Align getAlign(const DataLayout &DL, const GlobalVariable *GV)
void instrumentAddress(Module &M, IRBuilder<> &IRB, Instruction *OrigIns, Instruction *InsertBefore, Value *Addr, Align Alignment, TypeSize TypeStoreSize, bool IsWrite, Value *SizeArgument, bool UseCalls, bool Recover, int AsanScale, int AsanOffset)
Instrument the memory operand Addr.
uint64_t getRedzoneSizeForGlobal(int AsanScale, uint64_t SizeInBytes)
Given SizeInBytes of the value to be instrumented, returns the redzone size corresponding to it.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
ModulePass * createAMDGPUSwLowerLDSLegacyPass()
@ Offset
Definition DWP.cpp:573
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
constexpr from_range_t from_range
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
InnerAnalysisManagerProxy< FunctionAnalysisManager, Module > FunctionAnalysisManagerModuleProxy
Provide the FunctionAnalysisManager to Module proxy.
char & AMDGPUSwLowerLDSLegacyPassID
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
LLVM_ABI void getAddressSanitizerParams(const Triple &TargetTriple, int LongSize, bool IsKasan, uint64_t *ShadowBase, int *MappingScale, bool *OrShadowOffset)
AnalysisManager< Module > ModuleAnalysisManager
Convenience typedef for the Module analysis manager.
Definition MIRParser.h:39
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM)
FunctionVariableMap DirectAccess
FunctionVariableMap IndirectAccess
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39