AMDGPULegalizerInfo.cpp
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the MachineLegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
49static cl::opt<bool> EnableNewLegality(
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use "
52 "rules compatible with selection patterns"),
53 cl::init(false),
54 cl::ReallyHidden);
55
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
59static LLT getPow2VectorType(LLT Ty) {
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
66static LLT getPow2ScalarType(LLT Ty) {
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
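// For example, the helpers above turn <3 x s16> into <4 x s16> and a 48-bit
// type into s64: Log2_32_Ceil rounds the element count / bit width up to the
// next power of two.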
71
72/// \returns true if this is an odd-sized vector which should be widened by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
113static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx) {
114 return [=](const LegalityQuery &Query) {
115 const LLT Ty = Query.Types[TypeIdx];
116 const LLT EltTy = Ty.getElementType();
117 unsigned Size = Ty.getSizeInBits();
118 unsigned Pieces = (Size + 63) / 64;
119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
120 return std::pair(TypeIdx, LLT::scalarOrVector(
121 ElementCount::getFixed(NewNumElts), EltTy));
122 };
123}
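// Worked example: for <6 x s32> (192 bits), Pieces = (192 + 63) / 64 = 3 and
// NewNumElts = (6 + 1) / 3 = 2, so the mutation requests <2 x s32> pieces.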
124
125// Increase the number of vector elements so the total size reaches the next
126// multiple of 32 bits.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
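// Worked example: <3 x s8> is 24 bits, so NextMul32 = 1 and
// NewNumElts = (32 + 7) / 8 = 4, giving <4 x s8>, which fills a 32-bit register.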
142
143// Retrieves the scalar type that's the same size as the mem desc
145 return [=](const LegalityQuery &Query) {
146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
148 };
149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
152static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx) {
153 return [=](const LegalityQuery &Query) {
154 const LLT Ty = Query.Types[TypeIdx];
155 const unsigned NumElts = Ty.getNumElements();
156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
158
159 assert(EltSize == 32 || EltSize == 64);
160 assert(Ty.getSizeInBits() < MaxRegisterSize);
161
162 unsigned NewNumElts;
163 // Find the nearest legal RegClass that is larger than the current type.
164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
166 break;
167 }
168 return std::pair(TypeIdx,
169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
170 };
171}
172
173static LLT getBufferRsrcScalarType(const LLT Ty) {
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
180static LLT getBufferRsrcRegisterType(const LLT Ty) {
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
187static LLT getBitcastRegisterType(const LLT Ty) {
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
196 return LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32);
197}
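// For example, <6 x s16> (96 bits) maps to <3 x s32> and <4 x s8> (32 bits)
// maps to s32, so the bitcast target always fills whole 32-bit registers.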
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
206static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx) {
207 return [=](const LegalityQuery &Query) {
208 const LLT Ty = Query.Types[TypeIdx];
209 unsigned Size = Ty.getSizeInBits();
210 assert(Size % 32 == 0);
211 return std::pair(
212 TypeIdx, LLT::scalarOrVector(ElementCount::getFixed(Size / 32), 32));
213 };
214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
239 Size <= MaxRegisterSize;
240}
241
242static bool isRegisterVectorElementType(LLT EltTy) {
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up to the maximum register size, and
266// multiples of v2s16.
267static LegalityPredicate isRegisterType(const GCNSubtarget &ST,
268 unsigned TypeIdx) {
269 return [=, &ST](const LegalityQuery &Query) {
270 return isRegisterType(ST, Query.Types[TypeIdx]);
271 };
272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
277static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST,
278 unsigned TypeIdx) {
279 return [=, &ST](const LegalityQuery &Query) {
280 LLT Ty = Query.Types[TypeIdx];
281 return isRegisterType(ST, Ty) &&
282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
283 };
284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::scalar(32); // TODO: Expected float32
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::scalar(64); // TODO: Expected float64
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
311constexpr LLT MaxScalar = LLT::scalar(MaxRegisterSize);
312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
321
322// TODO: Expected LLT::fixed_vector(2, LLT::float16())
323constexpr LLT V2F16 = LLT::fixed_vector(2, 16);
324constexpr LLT V2BF16 = V2F16; // FIXME
325
326constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
327constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
328constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
329constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
330constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
331constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
332constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
333constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
334constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
335constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
336constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
337constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
338constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
339
340constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
341constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
342constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
343constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
344constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
345constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
346constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
347constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
348
349constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
350constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
351
352constexpr std::initializer_list<LLT> AllScalarTypes = {
353 S32, S64, S96, S128, S160, S192, S224, S256, S512, S1024};
354
355constexpr std::initializer_list<LLT> AllS16Vectors{
356 V2S16, V4S16, V6S16, V8S16, V10S16, V12S16, V16S16, V2S128, V4S128};
357
358constexpr std::initializer_list<LLT> AllS32Vectors = {
359 V2S32, V3S32, V4S32, V5S32, V6S32, V7S32, V8S32,
360 V9S32, V10S32, V11S32, V12S32, V16S32, V32S32};
361
362constexpr std::initializer_list<LLT> AllS64Vectors = {
363 V2S64, V3S64, V4S64, V5S64, V6S64, V7S64, V8S64, V16S64};
364
370
371// Checks whether a type is in the list of legal register types.
372static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
373 if (Ty.isPointerOrPointerVector())
374 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
375
376 return is_contained(AllS32Vectors, Ty) || is_contained(AllS64Vectors, Ty) ||
377 is_contained(AllScalarTypes, Ty) ||
378 (ST.useRealTrue16Insts() && Ty == S16) ||
379 is_contained(AllS16Vectors, Ty);
380}
381
383 unsigned TypeIdx) {
384 return [&ST, TypeIdx](const LegalityQuery &Query) {
385 return isRegisterClassType(ST, Query.Types[TypeIdx]);
386 };
387}
388
389// If we have a truncating store or an extending load with a data size larger
390// than 32-bits, we need to reduce to a 32-bit type.
391static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx) {
392 return [=](const LegalityQuery &Query) {
393 const LLT Ty = Query.Types[TypeIdx];
394 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
395 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
396 };
397}
398
399// If we have a truncating store or an extending load with a data size larger
400// than 32-bits and the memory size is a power of 2.
401static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx) {
402 return [=](const LegalityQuery &Query) {
403 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
404 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
405 isPowerOf2_64(MemSize);
406 };
407}
408
409// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
410// handle some operations by just promoting the register during
411// selection. There are also d16 loads on GFX9+ which preserve the high bits.
412static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
413 bool IsLoad, bool IsAtomic) {
414 switch (AS) {
415 case AMDGPUAS::PRIVATE_ADDRESS:
416 // FIXME: Private element size.
417 return ST.hasFlatScratchEnabled() ? 128 : 32;
418 case AMDGPUAS::LOCAL_ADDRESS:
419 return ST.useDS128() ? 128 : 64;
420 case AMDGPUAS::GLOBAL_ADDRESS:
421 case AMDGPUAS::CONSTANT_ADDRESS:
422 case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
423 case AMDGPUAS::BUFFER_RESOURCE:
424 // Treat constant and global as identical. SMRD loads are sometimes usable for
425 // global loads (ideally constant address space should be eliminated)
426 // depending on the context. Legality cannot be context dependent, but
427 // RegBankSelect can split the load as necessary depending on the pointer
428 // register bank/uniformity and if the memory is invariant or not written in a
429 // kernel.
430 return IsLoad ? 512 : 128;
431 default:
432 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
433 // if they may alias scratch depending on the subtarget. This needs to be
434 // moved to custom handling to use addressMayBeAccessedAsPrivate
435 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
436 }
437}
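// For example, a non-atomic flat access on a subtarget without multi-dword
// flat scratch addressing is capped at 32 bits here, so wider flat loads and
// stores are split by the memory legalization rules below.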
438
439static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
440 const LegalityQuery &Query) {
441 const LLT Ty = Query.Types[0];
442
443 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
444 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
445
446 unsigned RegSize = Ty.getSizeInBits();
447 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
448 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
449 unsigned AS = Query.Types[1].getAddressSpace();
450
451 // All of these need to be custom lowered to cast the pointer operand.
453 return false;
454
455 // Do not handle extending vector loads.
456 if (Ty.isVector() && MemSize != RegSize)
457 return false;
458
459 // TODO: We should be able to widen loads if the alignment is high enough, but
460 // we also need to modify the memory access size.
461#if 0
462 // Accept widening loads based on alignment.
463 if (IsLoad && MemSize < Size)
464 MemSize = std::max(MemSize, Align);
465#endif
466
467 // Only 1-byte and 2-byte to 32-bit extloads are valid.
468 if (MemSize != RegSize && RegSize != 32)
469 return false;
470
471 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
472 Query.MMODescrs[0].Ordering !=
473 AtomicOrdering::NotAtomic))
474 return false;
475
476 switch (MemSize) {
477 case 8:
478 case 16:
479 case 32:
480 case 64:
481 case 128:
482 break;
483 case 96:
484 if (!ST.hasDwordx3LoadStores())
485 return false;
486 break;
487 case 256:
488 case 512:
489 // These may contextually need to be broken down.
490 break;
491 default:
492 return false;
493 }
494
495 assert(RegSize >= MemSize);
496
497 if (AlignBits < MemSize) {
498 const SITargetLowering *TLI = ST.getTargetLowering();
499 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
500 Align(AlignBits / 8)))
501 return false;
502 }
503
504 return true;
505}
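// For example, a 96-bit (dwordx3) global load is only size-legal when the
// subtarget provides dwordx3 load/store instructions; otherwise it is split
// or widened by later rules.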
506
507// The newer buffer intrinsic forms take their resource arguments as
508// pointers in address space 8, aka s128 values. However, in order to not break
509// SelectionDAG, the underlying operations have to continue to take v4i32
510// arguments. Therefore, we convert resource pointers - or vectors of them -
511// to integer values here.
512static bool hasBufferRsrcWorkaround(const LLT Ty) {
513 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
514 return true;
515 if (Ty.isVector()) {
516 const LLT ElemTy = Ty.getElementType();
517 return hasBufferRsrcWorkaround(ElemTy);
518 }
519 return false;
520}
521
522// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc., so
523// work around this. Eventually it should ignore the type for loads and only care
524// about the size. Return true in cases where we will workaround this for now by
525// bitcasting.
526static bool loadStoreBitcastWorkaround(const LLT Ty) {
527 if (EnableNewLegality)
528 return false;
529
530 const unsigned Size = Ty.getSizeInBits();
531 if (Ty.isPointerVector())
532 return true;
533 if (Size <= 64)
534 return false;
535 // Address space 8 pointers get their own workaround.
536 if (hasBufferRsrcWorkaround(Ty))
537 return false;
538 if (!Ty.isVector())
539 return true;
540
541 unsigned EltSize = Ty.getScalarSizeInBits();
542 return EltSize != 32 && EltSize != 64;
543}
544
545static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
546 const LLT Ty = Query.Types[0];
547 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
548 !hasBufferRsrcWorkaround(Ty) && !loadStoreBitcastWorkaround(Ty);
549}
550
551/// Return true if a load or store of the type should be lowered with a bitcast
552/// to a different type.
553static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
554 const LLT MemTy) {
555 const unsigned MemSizeInBits = MemTy.getSizeInBits();
556 const unsigned Size = Ty.getSizeInBits();
557 if (Size != MemSizeInBits)
558 return Size <= 32 && Ty.isVector();
559
560 if (loadStoreBitcastWorkaround(Ty) && isRegisterType(ST, Ty))
561 return true;
562
563 // Don't try to handle bitcasting vector ext loads for now.
564 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
565 (Size <= 32 || isRegisterSize(ST, Size)) &&
566 !isRegisterVectorElementType(Ty.getElementType());
567}
568
569/// Return true if we should legalize a load by widening an odd sized memory
570/// access up to the alignment. Note that in this case the memory access itself
571/// changes, not the size of the result register.
572static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
573 uint64_t AlignInBits, unsigned AddrSpace,
574 unsigned Opcode) {
575 unsigned SizeInBits = MemoryTy.getSizeInBits();
576 // We don't want to widen cases that are naturally legal.
577 if (isPowerOf2_32(SizeInBits))
578 return false;
579
580 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
581 // end up widening these for a scalar load during RegBankSelect, if we don't
582 // have 96-bit scalar loads.
583 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
584 return false;
585
586 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
587 return false;
588
589 // A load is known dereferenceable up to the alignment, so it's legal to widen
590 // to it.
591 //
592 // TODO: Could check dereferenceable for less aligned cases.
593 unsigned RoundedSize = NextPowerOf2(SizeInBits);
594 if (AlignInBits < RoundedSize)
595 return false;
596
597 // Do not widen if it would introduce a slow unaligned load.
598 const SITargetLowering *TLI = ST.getTargetLowering();
599 unsigned Fast = 0;
600 return TLI->allowsMisalignedMemoryAccessesImpl(
601 RoundedSize, AddrSpace, Align(AlignInBits / 8),
602 MachineMemOperand::MOLoad, &Fast) &&
603 Fast;
604}
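// For example, a 96-bit load that is 128-bit aligned on a subtarget without
// dwordx3 load/stores may be widened here to a 128-bit access, since the extra
// bytes are known dereferenceable up to the alignment.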
605
606static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
607 unsigned Opcode) {
608 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
609 return false;
610
611 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
612 Query.MMODescrs[0].AlignInBits,
613 Query.Types[1].getAddressSpace(), Opcode);
614}
615
616/// Mutates IR (typically a load instruction) to use a <4 x s32> as the initial
617/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
618/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
619static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,
620 MachineRegisterInfo &MRI, unsigned Idx) {
621 MachineOperand &MO = MI.getOperand(Idx);
622
623 const LLT PointerTy = MRI.getType(MO.getReg());
624
625 // Paranoidly prevent us from doing this multiple times.
626 if (!hasBufferRsrcWorkaround(PointerTy))
627 return PointerTy;
628
629 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
630 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
631 if (!PointerTy.isVector()) {
632 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
633 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
634 const LLT S32 = LLT::scalar(32);
635
636 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
637 std::array<Register, 4> VectorElems;
638 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
639 for (unsigned I = 0; I < NumParts; ++I)
640 VectorElems[I] =
641 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
642 B.buildMergeValues(MO, VectorElems);
643 MO.setReg(VectorReg);
644 return VectorTy;
645 }
646 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
647 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
648 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
649 B.buildIntToPtr(MO, Scalar);
650 MO.setReg(BitcastReg);
651
652 return VectorTy;
653}
654
655/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
656/// the form in which the value must be in order to be passed to the low-level
657/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
658/// needed in order to account for the fact that we can't define a register
659/// class for s128 without breaking SelectionDAG.
660static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B) {
661 MachineRegisterInfo &MRI = *B.getMRI();
662 const LLT PointerTy = MRI.getType(Pointer);
663 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
664 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
665
666 if (!PointerTy.isVector()) {
667 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
668 SmallVector<Register, 4> PointerParts;
669 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
670 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
671 for (unsigned I = 0; I < NumParts; ++I)
672 PointerParts.push_back(Unmerged.getReg(I));
673 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
674 }
675 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
676 return B.buildBitcast(VectorTy, Scalar).getReg(0);
677}
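// Sketch of the intended transform for a scalar p8 resource: the pointer is
// unmerged into four s32 parts and rebuilt as a <4 x s32> value that the
// MUBUF/MTBUF lowering expects; vectors of p8 instead go through a
// ptrtoint + bitcast of the whole value.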
678
679static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B,
680 unsigned Idx) {
681 MachineOperand &MO = MI.getOperand(Idx);
682
683 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
684 // Paranoidly prevent us from doing this multiple times.
685 if (!hasBufferRsrcWorkaround(PointerTy))
686 return;
687 MO.setReg(castBufferRsrcToV4I32(MO.getReg(), B));
688}
689
690AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
691 const GCNTargetMachine &TM)
692 : ST(ST_) {
693 using namespace TargetOpcode;
694
695 auto GetAddrSpacePtr = [&TM](unsigned AS) {
696 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
697 };
698
699 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
700 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
701 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
702 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
703 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
704 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
705 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
706 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
707 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
708 const LLT BufferStridedPtr =
709 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
710
711 const LLT CodePtr = FlatPtr;
712
713 const std::initializer_list<LLT> AddrSpaces64 = {
714 GlobalPtr, ConstantPtr, FlatPtr
715 };
716
717 const std::initializer_list<LLT> AddrSpaces32 = {
718 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
719 };
720
721 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
722
723 const std::initializer_list<LLT> FPTypesBase = {
724 S32, S64
725 };
726
727 const std::initializer_list<LLT> FPTypes16 = {
728 S32, S64, S16
729 };
730
731 const std::initializer_list<LLT> FPTypesPK16 = {
732 S32, S64, S16, V2S16
733 };
734
735 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
736
737 // s1 for VCC branches, s32 for SCC branches.
738 getActionDefinitionsBuilder(G_BRCOND).legalFor({S1, S32});
739
740 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
741 // elements for v3s16
744 .legalFor(AllS32Vectors)
746 .legalFor(AddrSpaces64)
747 .legalFor(AddrSpaces32)
748 .legalFor(AddrSpaces128)
749 .legalIf(isPointer(0))
750 .clampScalar(0, S16, S256)
752 .clampMaxNumElements(0, S32, 16)
754 .scalarize(0);
755
756 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
757 // Full set of gfx9 features.
758 if (ST.hasScalarAddSub64()) {
759 getActionDefinitionsBuilder({G_ADD, G_SUB})
760 .legalFor({S64, S32, S16, V2S16})
761 .clampMaxNumElementsStrict(0, S16, 2)
762 .scalarize(0)
763 .minScalar(0, S16)
765 .maxScalar(0, S32);
766 } else {
767 getActionDefinitionsBuilder({G_ADD, G_SUB})
768 .legalFor({S32, S16, V2S16})
769 .clampMaxNumElementsStrict(0, S16, 2)
770 .scalarize(0)
771 .minScalar(0, S16)
773 .maxScalar(0, S32);
774 }
775
776 if (ST.hasScalarSMulU64()) {
778 .legalFor({S64, S32, S16, V2S16})
779 .clampMaxNumElementsStrict(0, S16, 2)
780 .scalarize(0)
781 .minScalar(0, S16)
783 .custom();
784 } else {
786 .legalFor({S32, S16, V2S16})
787 .clampMaxNumElementsStrict(0, S16, 2)
788 .scalarize(0)
789 .minScalar(0, S16)
791 .custom();
792 }
793 assert(ST.hasMad64_32());
794
795 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
796 .legalFor({S32, S16, V2S16}) // Clamp modifier
797 .minScalarOrElt(0, S16)
799 .scalarize(0)
801 .lower();
802 } else if (ST.has16BitInsts()) {
803 getActionDefinitionsBuilder({G_ADD, G_SUB})
804 .legalFor({S32, S16})
805 .minScalar(0, S16)
807 .maxScalar(0, S32)
808 .scalarize(0);
809
811 .legalFor({S32, S16})
812 .scalarize(0)
813 .minScalar(0, S16)
815 .custom();
816 assert(ST.hasMad64_32());
817
818 // Technically the saturating operations require clamp bit support, but this
819 // was introduced at the same time as 16-bit operations.
820 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
821 .legalFor({S32, S16}) // Clamp modifier
822 .minScalar(0, S16)
823 .scalarize(0)
825 .lower();
826
827 // We're just lowering this, but it helps get a better result to try to
828 // coerce to the desired type first.
829 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
830 .minScalar(0, S16)
831 .scalarize(0)
832 .lower();
833 } else {
834 getActionDefinitionsBuilder({G_ADD, G_SUB})
835 .legalFor({S32})
836 .widenScalarToNextMultipleOf(0, 32)
837 .clampScalar(0, S32, S32)
838 .scalarize(0);
839
840 auto &Mul = getActionDefinitionsBuilder(G_MUL)
841 .legalFor({S32})
842 .scalarize(0)
843 .minScalar(0, S32)
845
846 if (ST.hasMad64_32())
847 Mul.custom();
848 else
849 Mul.maxScalar(0, S32);
850
851 if (ST.hasIntClamp()) {
852 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
853 .legalFor({S32}) // Clamp modifier.
854 .scalarize(0)
856 .lower();
857 } else {
858 // Clamp bit support was added in VI, along with 16-bit operations.
859 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
860 .minScalar(0, S32)
861 .scalarize(0)
862 .lower();
863 }
864
865 // FIXME: DAG expansion gets better results. The widening uses the smaller
866 // range values and goes for the min/max lowering directly.
867 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
868 .minScalar(0, S32)
869 .scalarize(0)
870 .lower();
871 }
872
874 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
875 .customFor({S32, S64})
876 .clampScalar(0, S32, S64)
878 .scalarize(0);
879
880 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
881 .legalFor({S32})
882 .maxScalar(0, S32);
883
884 if (ST.hasVOP3PInsts()) {
885 Mulh
886 .clampMaxNumElements(0, S8, 2)
887 .lowerFor({V2S8});
888 }
889
890 Mulh
891 .scalarize(0)
892 .lower();
893
894 // Report legal for any types we can handle anywhere. For the cases only legal
895 // on the SALU, RegBankSelect will be able to re-legalize.
896 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
897 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
898 .clampScalar(0, S32, S64)
904 .scalarize(0);
905
907 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
908 .legalFor({{S32, S1}, {S32, S32}})
909 .clampScalar(0, S32, S32)
910 .scalarize(0);
911
913 // Don't worry about the size constraint.
915 .lower();
916
917 getActionDefinitionsBuilder(G_CONSTANT)
918 .legalFor({S1, S32, S64, S16, GlobalPtr,
919 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
920 .legalIf(isPointer(0))
921 .clampScalar(0, S32, S64)
922 .widenScalarToNextPow2(0);
923
924 getActionDefinitionsBuilder(G_FCONSTANT)
925 .legalFor({S32, S64, S16})
926 .clampScalar(0, S16, S64);
927
928 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
929 .legalIf(isRegisterClassType(ST, 0))
930 // s1 and s16 are special cases because they have legal operations on
931 // them, but don't really occupy registers in the normal way.
932 .legalFor({S1, S16})
933 .clampNumElements(0, V16S32, V32S32)
937 .clampMaxNumElements(0, S32, 16);
938
939 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
940
941 // If the amount is divergent, we have to do a wave reduction to get the
942 // maximum value, so this is expanded during RegBankSelect.
943 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
944 .legalFor({{PrivatePtr, S32}});
945
946 getActionDefinitionsBuilder(G_STACKSAVE)
947 .customFor({PrivatePtr});
948 getActionDefinitionsBuilder(G_STACKRESTORE)
949 .legalFor({PrivatePtr});
950
951 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
952
953 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
954 .customIf(typeIsNot(0, PrivatePtr));
955
956 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
957
958 auto &FPOpActions = getActionDefinitionsBuilder(
959 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
960 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
961 .legalFor({S32, S64});
962 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
963 .customFor({S32, S64});
964 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
965 .customFor({S32, S64});
966
967 if (ST.has16BitInsts()) {
968 if (ST.hasVOP3PInsts())
969 FPOpActions.legalFor({S16, V2S16});
970 else
971 FPOpActions.legalFor({S16});
972
973 TrigActions.customFor({S16});
974 FDIVActions.customFor({S16});
975 }
976
977 if (ST.hasPackedFP32Ops()) {
978 FPOpActions.legalFor({V2S32});
979 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
980 }
981
982 auto &MinNumMaxNumIeee =
983 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
984
985 if (ST.hasVOP3PInsts()) {
986 MinNumMaxNumIeee.legalFor(FPTypesPK16)
987 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
988 .clampMaxNumElements(0, S16, 2)
989 .clampScalar(0, S16, S64)
990 .scalarize(0);
991 } else if (ST.has16BitInsts()) {
992 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
993 } else {
994 MinNumMaxNumIeee.legalFor(FPTypesBase)
995 .clampScalar(0, S32, S64)
996 .scalarize(0);
997 }
998
999 auto &MinNumMaxNum = getActionDefinitionsBuilder(
1000 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1001
1002 if (ST.hasVOP3PInsts()) {
1003 MinNumMaxNum.customFor(FPTypesPK16)
1004 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1005 .clampMaxNumElements(0, S16, 2)
1006 .clampScalar(0, S16, S64)
1007 .scalarize(0);
1008 } else if (ST.has16BitInsts()) {
1009 MinNumMaxNum.customFor(FPTypes16)
1010 .clampScalar(0, S16, S64)
1011 .scalarize(0);
1012 } else {
1013 MinNumMaxNum.customFor(FPTypesBase)
1014 .clampScalar(0, S32, S64)
1015 .scalarize(0);
1016 }
1017
1018 if (ST.hasVOP3PInsts())
1019 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1020
1021 FPOpActions
1022 .scalarize(0)
1023 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1024
1025 TrigActions
1026 .scalarize(0)
1027 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1028
1029 FDIVActions
1030 .scalarize(0)
1031 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1032
1033 auto &FNegAbs = getActionDefinitionsBuilder({G_FNEG, G_FABS});
1034 FNegAbs.legalFor(FPTypesPK16)
1035 .legalFor(ST.hasPackedFP32Ops(), {V2S32})
1037 if (ST.hasPackedFP32Ops())
1038 FNegAbs.clampMaxNumElementsStrict(0, S32, 2);
1039 FNegAbs.scalarize(0).clampScalar(0, S16, S64);
1040
1041 if (ST.has16BitInsts()) {
1043 .legalFor({S16})
1044 .customFor({S32, S64})
1045 .scalarize(0)
1046 .unsupported();
1048 .legalFor({S32, S64, S16})
1049 .scalarize(0)
1050 .clampScalar(0, S16, S64);
1051
1052 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1053 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1054 .scalarize(0)
1055 .maxScalarIf(typeIs(0, S16), 1, S16)
1056 .clampScalar(1, S32, S32)
1057 .lower();
1058
1060 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1061 .scalarize(0)
1062 .lower();
1063
1065 .lowerFor({S16, S32, S64})
1066 .scalarize(0)
1067 .lower();
1068 } else {
1070 .customFor({S32, S64, S16})
1071 .scalarize(0)
1072 .unsupported();
1073
1074
1075 if (ST.hasFractBug()) {
1077 .customFor({S64})
1078 .legalFor({S32, S64})
1079 .scalarize(0)
1080 .clampScalar(0, S32, S64);
1081 } else {
1083 .legalFor({S32, S64})
1084 .scalarize(0)
1085 .clampScalar(0, S32, S64);
1086 }
1087
1088 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1089 .legalFor({{S32, S32}, {S64, S32}})
1090 .scalarize(0)
1091 .clampScalar(0, S32, S64)
1092 .clampScalar(1, S32, S32)
1093 .lower();
1094
1096 .customFor({{S32, S32}, {S64, S32}})
1097 .scalarize(0)
1098 .minScalar(0, S32)
1099 .clampScalar(1, S32, S32)
1100 .lower();
1101
1103 .lowerFor({S32, S64})
1104 .scalarize(0)
1105 .lower();
1106 }
1107
1108 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1109 if (ST.hasCvtPkF16F32Inst()) {
1110 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1111 .clampMaxNumElements(0, S16, 2);
1112 } else {
1113 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1114 }
1115 FPTruncActions.scalarize(0).lower();
1116
1118 .legalFor({{S64, S32}, {S32, S16}})
1119 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1120 .scalarize(0);
1121
1122 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1123 if (ST.has16BitInsts()) {
1124 FSubActions
1125 // Use actual fsub instruction
1126 .legalFor({S32, S16})
1127 // Must use fadd + fneg
1128 .lowerFor({S64, V2S16});
1129 } else {
1130 FSubActions
1131 // Use actual fsub instruction
1132 .legalFor({S32})
1133 // Must use fadd + fneg
1134 .lowerFor({S64, S16, V2S16});
1135 }
1136
1137 if (ST.hasPackedFP32Ops())
1138 FSubActions.lowerFor({V2S32}).clampMaxNumElements(0, S32, 2);
1139
1140 FSubActions
1141 .clampMaxNumElements(0, S16, 2)
1142 .scalarize(0)
1143 .clampScalar(0, S32, S64);
1144
1145 // Whether this is legal depends on the floating point mode for the function.
1146 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1147 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1148 FMad.customFor({S32, S16});
1149 else if (ST.hasMadMacF32Insts())
1150 FMad.customFor({S32});
1151 else if (ST.hasMadF16())
1152 FMad.customFor({S16});
1153 FMad.scalarize(0)
1154 .lower();
1155
1156 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1157 if (ST.has16BitInsts()) {
1158 FRem.customFor({S16, S32, S64});
1159 } else {
1160 FRem.minScalar(0, S32)
1161 .customFor({S32, S64});
1162 }
1163 FRem.scalarize(0);
1164
1165 // TODO: Do we need to clamp maximum bitwidth?
1167 .legalIf(isScalar(0))
1168 .legalFor({{V2S16, V2S32}})
1169 .clampMaxNumElements(0, S16, 2)
1170 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1171 // situations (like an invalid implicit use), we don't want to infinite loop
1172 // in the legalizer.
1174 .alwaysLegal();
1175
1176 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1177 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1178 {S32, S1}, {S64, S1}, {S16, S1}})
1179 .scalarize(0)
1180 .clampScalar(0, S32, S64)
1181 .widenScalarToNextPow2(1, 32);
1182
1183 // TODO: Split s1->s64 during regbankselect for VALU.
1184 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1185 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1186 .lowerIf(typeIs(1, S1))
1187 .customFor({{S32, S64}, {S64, S64}});
1188 if (ST.has16BitInsts())
1189 IToFP.legalFor({{S16, S16}});
1190 IToFP.clampScalar(1, S32, S64)
1191 .minScalar(0, S32)
1192 .scalarize(0)
1194
1195 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1196 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1197 .customFor({{S64, S32}, {S64, S64}})
1198 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1199 if (ST.has16BitInsts())
1200 FPToI.legalFor({{S16, S16}});
1201 else
1202 FPToI.minScalar(1, S32);
1203
1204 FPToI.minScalar(0, S32)
1205 .widenScalarToNextPow2(0, 32)
1206 .scalarize(0)
1207 .lower();
1208
1209 // clang-format off
1210 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1211 .legalFor({{S32, S32}, {S32, S64}})
1212 .legalFor(ST.has16BitInsts(),{{S16, S16}})
1213 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1214
1215 // If available, widen width <16 to i16, instead of i32, so v_cvt_i16/u16_f16 can be used.
1216 if (ST.has16BitInsts())
1217 FPToISat.minScalarIf(typeIs(1, S16), 0, S16);
1218
1219 FPToISat.minScalar(1, S32);
1220 FPToISat.minScalar(0, S32)
1221 .widenScalarToNextPow2(0, 32)
1222 .scalarize(0)
1223 .lower();
1224 // clang-format on
1225
1226 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1227 .clampScalar(0, S16, S64)
1228 .scalarize(0)
1229 .lower();
1230
1231 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1232 .legalFor({S16, S32})
1233 .scalarize(0)
1234 .lower();
1235
1236 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1237 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1238 .scalarize(0)
1239 .lower();
1240
1241 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1242 .clampScalar(0, S16, S64)
1243 .scalarize(0)
1244 .lower();
1245
1246 if (ST.has16BitInsts()) {
1247 getActionDefinitionsBuilder(
1248 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1249 .legalFor({S16, S32, S64})
1250 .clampScalar(0, S16, S64)
1251 .scalarize(0);
1252 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1253 getActionDefinitionsBuilder(
1254 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1255 .legalFor({S32, S64})
1256 .clampScalar(0, S32, S64)
1257 .scalarize(0);
1258 } else {
1259 getActionDefinitionsBuilder(
1260 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1261 .legalFor({S32})
1262 .customFor({S64})
1263 .clampScalar(0, S32, S64)
1264 .scalarize(0);
1265 }
1266
1267 getActionDefinitionsBuilder(G_PTR_ADD)
1268 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1269 .legalIf(all(isPointer(0), sameSize(0, 1)))
1270 .scalarize(0)
1271 .scalarSameSizeAs(1, 0);
1272
1273 getActionDefinitionsBuilder(G_PTRMASK)
1274 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1275 .scalarSameSizeAs(1, 0)
1276 .scalarize(0);
1277
1278 auto &CmpBuilder =
1279 getActionDefinitionsBuilder(G_ICMP)
1280 // The compare output type differs based on the register bank of the output,
1281 // so make both s1 and s32 legal.
1282 //
1283 // Scalar compares producing output in scc will be promoted to s32, as that
1284 // is the allocatable register type that will be needed for the copy from
1285 // scc. This will be promoted during RegBankSelect, and we assume something
1286 // before that won't try to use s32 result types.
1287 //
1288 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1289 // bank.
1291 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1292 .legalForCartesianProduct(
1293 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1294 if (ST.has16BitInsts()) {
1295 CmpBuilder.legalFor({{S1, S16}});
1296 }
1297
1298 CmpBuilder
1300 .clampScalar(1, S32, S64)
1301 .scalarize(0)
1302 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1303
1304 auto &FCmpBuilder =
1305 getActionDefinitionsBuilder(G_FCMP).legalForCartesianProduct(
1306 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1307
1308 if (ST.hasSALUFloatInsts())
1309 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1310
1311 FCmpBuilder
1313 .clampScalar(1, S32, S64)
1314 .scalarize(0);
1315
1316 // FIXME: fpow has a selection pattern that should move to custom lowering.
1317 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1318 if (ST.has16BitInsts())
1319 ExpOps.customFor({{S32}, {S16}});
1320 else
1321 ExpOps.customFor({S32});
1322 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1323 .scalarize(0);
1324
1325 getActionDefinitionsBuilder(G_FPOWI)
1326 .clampScalar(0, MinScalarFPTy, S32)
1327 .lower();
1328
1329 getActionDefinitionsBuilder(G_FLOG2)
1330 .legalFor(ST.has16BitInsts(), {S16})
1331 .customFor({S32, S16})
1332 .scalarize(0)
1333 .lower();
1334
1335 getActionDefinitionsBuilder(G_FEXP2)
1336 .legalFor(ST.has16BitInsts(), {S16})
1337 .customFor({S32, S64, S16})
1338 .scalarize(0)
1339 .lower();
1340
1341 auto &LogOps =
1342 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1343 LogOps.customFor({S32, S16, S64});
1344 LogOps.clampScalar(0, MinScalarFPTy, S32)
1345 .scalarize(0);
1346
1347 // The 64-bit versions produce 32-bit results, but only on the SALU.
1348 getActionDefinitionsBuilder(G_CTPOP)
1349 .legalFor({{S32, S32}, {S32, S64}})
1350 .clampScalar(0, S32, S32)
1351 .widenScalarToNextPow2(1, 32)
1352 .clampScalar(1, S32, S64)
1353 .scalarize(0)
1354 .widenScalarToNextPow2(0, 32);
1355
1356 // If no 16-bit instruction is available, lower into different instructions.
1357 if (ST.has16BitInsts())
1358 getActionDefinitionsBuilder(G_IS_FPCLASS)
1359 .legalForCartesianProduct({S1}, FPTypes16)
1360 .widenScalarToNextPow2(1)
1361 .scalarize(0)
1362 .lower();
1363 else
1364 getActionDefinitionsBuilder(G_IS_FPCLASS)
1365 .legalForCartesianProduct({S1}, FPTypesBase)
1366 .lowerFor({S1, S16})
1367 .widenScalarToNextPow2(1)
1368 .scalarize(0)
1369 .lower();
1370
1371 // The hardware instructions return a different result on 0 than the generic
1372 // instructions expect. The hardware produces -1, but these produce the
1373 // bitwidth.
1374 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1375 .scalarize(0)
1376 .clampScalar(0, S32, S32)
1377 .clampScalar(1, S32, S64)
1378 .widenScalarToNextPow2(0, 32)
1379 .widenScalarToNextPow2(1, 32)
1380 .custom();
1381
1382 // The 64-bit versions produce 32-bit results, but only on the SALU.
1383 getActionDefinitionsBuilder(G_CTLZ_ZERO_POISON)
1384 .legalFor({{S32, S32}, {S32, S64}})
1385 .customIf(scalarNarrowerThan(1, 32))
1386 .clampScalar(0, S32, S32)
1387 .clampScalar(1, S32, S64)
1388 .scalarize(0)
1389 .widenScalarToNextPow2(0, 32)
1390 .widenScalarToNextPow2(1, 32);
1391
1392 getActionDefinitionsBuilder(G_CTTZ_ZERO_POISON)
1393 .legalFor({{S32, S32}, {S32, S64}})
1394 .clampScalar(0, S32, S32)
1395 .clampScalar(1, S32, S64)
1396 .scalarize(0)
1397 .widenScalarToNextPow2(0, 32)
1398 .widenScalarToNextPow2(1, 32);
1399
1400 getActionDefinitionsBuilder(G_CTLS)
1401 .customFor({{S32, S32}})
1402 .scalarize(0)
1403 .clampScalar(0, S32, S32)
1404 .clampScalar(1, S32, S32);
1405
1406 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1407 // RegBankSelect.
1408 getActionDefinitionsBuilder(G_BITREVERSE)
1409 .legalFor({S32, S64})
1410 .clampScalar(0, S32, S64)
1411 .scalarize(0)
1412 .widenScalarToNextPow2(0);
1413
1414 if (ST.has16BitInsts()) {
1415 getActionDefinitionsBuilder(G_BSWAP)
1416 .legalFor({S16, S32, V2S16})
1417 .clampMaxNumElementsStrict(0, S16, 2)
1418 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1419 // narrowScalar limitation.
1420 .widenScalarToNextPow2(0)
1421 .clampScalar(0, S16, S32)
1422 .scalarize(0);
1423
1424 if (ST.hasVOP3PInsts()) {
1425 getActionDefinitionsBuilder(G_ABS)
1426 .legalFor({S32, S16, V2S16})
1427 .clampMaxNumElements(0, S16, 2)
1428 .minScalar(0, S16)
1429 .widenScalarToNextPow2(0)
1430 .scalarize(0)
1431 .lower();
1432 if (ST.hasIntMinMax64()) {
1433 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1434 .legalFor({S32, S16, S64, V2S16})
1435 .clampMaxNumElements(0, S16, 2)
1436 .minScalar(0, S16)
1437 .widenScalarToNextPow2(0)
1438 .scalarize(0)
1439 .lower();
1440 } else {
1441 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1442 .legalFor({S32, S16, V2S16})
1443 .clampMaxNumElements(0, S16, 2)
1444 .minScalar(0, S16)
1445 .widenScalarToNextPow2(0)
1446 .scalarize(0)
1447 .lower();
1448 }
1449 } else {
1450 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1451 .legalFor({S32, S16})
1452 .widenScalarToNextPow2(0)
1453 .minScalar(0, S16)
1454 .scalarize(0)
1455 .lower();
1456 }
1457 } else {
1458 // TODO: Should have same legality without v_perm_b32
1459 getActionDefinitionsBuilder(G_BSWAP)
1460 .legalFor({S32})
1461 .lowerIf(scalarNarrowerThan(0, 32))
1462 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1463 // narrowScalar limitation.
1464 .widenScalarToNextPow2(0)
1465 .maxScalar(0, S32)
1466 .scalarize(0)
1467 .lower();
1468
1469 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1470 .legalFor({S32})
1471 .minScalar(0, S32)
1472 .widenScalarToNextPow2(0)
1473 .scalarize(0)
1474 .lower();
1475 }
1476
1477 getActionDefinitionsBuilder(G_INTTOPTR)
1478 // List the common cases
1479 .legalForCartesianProduct(AddrSpaces64, {S64})
1480 .legalForCartesianProduct(AddrSpaces32, {S32})
1481 .scalarize(0)
1482 // Accept any address space as long as the size matches
1483 .legalIf(sameSize(0, 1))
1484 .widenScalarIf(smallerThan(1, 0),
1485 [](const LegalityQuery &Query) {
1486 return std::pair(
1487 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1488 })
1489 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1490 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1491 });
1492
1493 getActionDefinitionsBuilder(G_PTRTOINT)
1494 // List the common cases
1495 .legalForCartesianProduct(AddrSpaces64, {S64})
1496 .legalForCartesianProduct(AddrSpaces32, {S32})
1497 .scalarize(0)
1498 // Accept any address space as long as the size matches
1499 .legalIf(sameSize(0, 1))
1500 .widenScalarIf(smallerThan(0, 1),
1501 [](const LegalityQuery &Query) {
1502 return std::pair(
1503 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1504 })
1505 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1506 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1507 });
1508
1509 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1510 .scalarize(0)
1511 .custom();
1512
1513 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1514 bool IsLoad) -> bool {
1515 const LLT DstTy = Query.Types[0];
1516
1517 // Split vector extloads.
1518 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1519
1520 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1521 return true;
1522
1523 const LLT PtrTy = Query.Types[1];
1524 unsigned AS = PtrTy.getAddressSpace();
1525 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1526 Query.MMODescrs[0].Ordering !=
1528 return true;
1529
1530 // Catch weird sized loads that don't evenly divide into the access sizes
1531 // TODO: May be able to widen depending on alignment etc.
1532 unsigned NumRegs = (MemSize + 31) / 32;
1533 if (NumRegs == 3) {
1534 if (!ST.hasDwordx3LoadStores())
1535 return true;
1536 } else {
1537 // If the alignment allows, these should have been widened.
1538 if (!isPowerOf2_32(NumRegs))
1539 return true;
1540 }
1541
1542 return false;
1543 };
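 // For example, a 96-bit global load reports true here on subtargets without
 // dwordx3 load/stores, and a 512-bit global store exceeds the 128-bit store
 // limit from maxSizeForAddrSpace, so both are narrowed or split below.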
1544
1545 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1546 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1547 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1548
1549 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1550 // LDS
1551 // TODO: Unsupported flat for SI.
1552
1553 for (unsigned Op : {G_LOAD, G_STORE}) {
1554 const bool IsStore = Op == G_STORE;
1555
1556 auto &Actions = getActionDefinitionsBuilder(Op);
1557 // Explicitly list some common cases.
1558 // TODO: Does this help compile time at all?
1559 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1560 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1561 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1562 {S64, GlobalPtr, S64, GlobalAlign32},
1563 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1564 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1565 {S32, GlobalPtr, S8, GlobalAlign8},
1566 {S32, GlobalPtr, S16, GlobalAlign16},
1567
1568 {S32, LocalPtr, S32, 32},
1569 {S64, LocalPtr, S64, 32},
1570 {V2S32, LocalPtr, V2S32, 32},
1571 {S32, LocalPtr, S8, 8},
1572 {S32, LocalPtr, S16, 16},
1573 {V2S16, LocalPtr, S32, 32},
1574
1575 {S32, PrivatePtr, S32, 32},
1576 {S32, PrivatePtr, S8, 8},
1577 {S32, PrivatePtr, S16, 16},
1578 {V2S16, PrivatePtr, S32, 32},
1579
1580 {S32, ConstantPtr, S32, GlobalAlign32},
1581 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1582 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1583 {S64, ConstantPtr, S64, GlobalAlign32},
1584 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1585
1586 Actions.legalForTypesWithMemDesc(ST.useRealTrue16Insts(), /* Pred */
1587 {{S16, GlobalPtr, S8, GlobalAlign8},
1588 {S16, GlobalPtr, S16, GlobalAlign16},
1589 {S16, LocalPtr, S8, 8},
1590 {S16, LocalPtr, S16, 16},
1591 {S16, PrivatePtr, S8, 8},
1592 {S16, PrivatePtr, S16, 16}});
1593
1594 Actions.legalIf(
1595 [=](const LegalityQuery &Query) -> bool {
1596 return isLoadStoreLegal(ST, Query);
1597 });
1598
1599 // The custom pointers (fat pointers, buffer resources) don't work with load
1600 // and store at this level. Fat pointers should have been lowered to
1601 // intrinsics before the translation to MIR.
1602 Actions.unsupportedIf(
1603 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1604
1605 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1606 // ptrtoint. This is needed to account for the fact that we can't have i128
1607 // as a register class for SelectionDAG reasons.
1608 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1609 return hasBufferRsrcWorkaround(Query.Types[0]);
1610 });
1611
1612 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1613 // 64-bits.
1614 //
1615 // TODO: Should generalize bitcast action into coerce, which will also cover
1616 // inserting addrspacecasts.
1617 Actions.customIf(typeIs(1, Constant32Ptr));
1618
1619 // Turn any illegal element vectors into something easier to deal
1620 // with. These will ultimately produce 32-bit scalar shifts to extract the
1621 // parts anyway.
1622 //
1623 // For odd 16-bit element vectors, prefer to split those into pieces with
1624 // 16-bit vector parts.
1625 Actions.bitcastIf(
1626 [=](const LegalityQuery &Query) -> bool {
1627 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1628 Query.MMODescrs[0].MemoryTy);
1629 }, bitcastToRegisterType(0));
1630
1631 if (!IsStore) {
1632 // Widen suitably aligned loads by loading extra bytes. The standard
1633 // legalization actions can't properly express widening memory operands.
1634 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1635 return shouldWidenLoad(ST, Query, G_LOAD);
1636 });
1637 }
1638
1639 // FIXME: load/store narrowing should be moved to lower action
1640 Actions
1641 .narrowScalarIf(
1642 [=](const LegalityQuery &Query) -> bool {
1643 return !Query.Types[0].isVector() &&
1644 needToSplitMemOp(Query, Op == G_LOAD);
1645 },
1646 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1647 const LLT DstTy = Query.Types[0];
1648 const LLT PtrTy = Query.Types[1];
1649
1650 const unsigned DstSize = DstTy.getSizeInBits();
1651 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1652
1653 // Split extloads.
1654 if (DstSize > MemSize)
1655 return std::pair(0, LLT::scalar(MemSize));
1656
1657 unsigned MaxSize = maxSizeForAddrSpace(
1658 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1659 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1660 if (MemSize > MaxSize)
1661 return std::pair(0, LLT::scalar(MaxSize));
1662
1663 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1664 return std::pair(0, LLT::scalar(Align));
1665 })
1666 .fewerElementsIf(
1667 [=](const LegalityQuery &Query) -> bool {
1668 return Query.Types[0].isVector() &&
1669 needToSplitMemOp(Query, Op == G_LOAD);
1670 },
1671 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1672 const LLT DstTy = Query.Types[0];
1673 const LLT PtrTy = Query.Types[1];
1674
1675 LLT EltTy = DstTy.getElementType();
1676 unsigned MaxSize = maxSizeForAddrSpace(
1677 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1678 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1679
1680 // FIXME: Handle widened to power of 2 results better. This ends
1681 // up scalarizing.
1682 // FIXME: 3 element stores scalarized on SI
1683
1684 // Split if it's too large for the address space.
1685 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1686 if (MemSize > MaxSize) {
1687 unsigned NumElts = DstTy.getNumElements();
1688 unsigned EltSize = EltTy.getSizeInBits();
1689
1690 if (MaxSize % EltSize == 0) {
1691 return std::pair(
1693 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1694 }
1695
1696 unsigned NumPieces = MemSize / MaxSize;
1697
1698 // FIXME: Refine when odd breakdowns handled
1699 // The scalars will need to be re-legalized.
1700 if (NumPieces == 1 || NumPieces >= NumElts ||
1701 NumElts % NumPieces != 0)
1702 return std::pair(0, EltTy);
1703
1704 return std::pair(0,
1705 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1706 }
1707
1708 // FIXME: We could probably handle weird extending loads better.
1709 if (DstTy.getSizeInBits() > MemSize)
1710 return std::pair(0, EltTy);
1711
1712 unsigned EltSize = EltTy.getSizeInBits();
1713 unsigned DstSize = DstTy.getSizeInBits();
1714 if (!isPowerOf2_32(DstSize)) {
1715 // We're probably decomposing an odd sized store. Try to split
1716 // to the widest type. TODO: Account for alignment. As-is it
1717 // should be OK, since the new parts will be further legalized.
1718 unsigned FloorSize = llvm::bit_floor(DstSize);
1719 return std::pair(
1721 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1722 }
1723
1724 // May need relegalization for the scalars.
1725 return std::pair(0, EltTy);
1726 })
1727 .minScalar(0, S32)
1728 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1730 .widenScalarToNextPow2(0)
1731 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1732 .lower();
1733 }
1734
1735 // FIXME: Unaligned accesses not lowered.
1736 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1737 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1738 {S32, GlobalPtr, S16, 2 * 8},
1739 {S32, LocalPtr, S8, 8},
1740 {S32, LocalPtr, S16, 16},
1741 {S32, PrivatePtr, S8, 8},
1742 {S32, PrivatePtr, S16, 16},
1743 {S32, ConstantPtr, S8, 8},
1744 {S32, ConstantPtr, S16, 2 * 8}})
1745 .legalIf(
1746 [=](const LegalityQuery &Query) -> bool {
1747 return isLoadStoreLegal(ST, Query);
1748 });
1749
1750 if (ST.hasFlatAddressSpace()) {
1751 ExtLoads.legalForTypesWithMemDesc(
1752 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1753 }
1754
1755 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1756 // 64-bits.
1757 //
1758 // TODO: Should generalize bitcast action into coerce, which will also cover
1759 // inserting addrspacecasts.
1760 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1761
1762 ExtLoads.narrowScalarIf(
1763 [](const LegalityQuery &Query) {
1764 LLT MemTy = Query.MMODescrs[0].MemoryTy;
1765 return MemTy.isAnyScalar() && MemTy.getSizeInBits() > 32 &&
1766 Query.Types[0].getSizeInBits() > MemTy.getSizeInBits();
1767 }, // For large MemSize, narrowscalar to MemSize (load MemSize + ext)
1769 ExtLoads.clampScalar(0, S32, S32)
1770 .widenScalarToNextPow2(0)
1771 .lower();
1772
1773 auto &Atomics = getActionDefinitionsBuilder(
1774 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1775 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1776 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1777 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1778 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1779 {S64, GlobalPtr}, {S64, LocalPtr},
1780 {S32, RegionPtr}, {S64, RegionPtr}});
1781 if (ST.hasFlatAddressSpace()) {
1782 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1783 }
1784
1785 auto &Atomics32 =
1786 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1787 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1788 if (ST.hasFlatAddressSpace()) {
1789 Atomics32.legalFor({{S32, FlatPtr}});
1790 }
1791
1792 // TODO: v2bf16 operations, and fat buffer pointer support.
1793 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1794 if (ST.hasLDSFPAtomicAddF32()) {
1795 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1796 if (ST.hasLdsAtomicAddF64())
1797 Atomic.legalFor({{S64, LocalPtr}});
1798 if (ST.hasAtomicDsPkAdd16Insts())
1799 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1800 }
1801 if (ST.hasAtomicFaddInsts())
1802 Atomic.legalFor({{S32, GlobalPtr}});
1803 if (ST.hasFlatAtomicFaddF32Inst())
1804 Atomic.legalFor({{S32, FlatPtr}});
1805
1806 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1807 // These are legal with some caveats, and should have undergone expansion in
1808 // the IR in most situations
1809 // TODO: Move atomic expansion into legalizer
1810 Atomic.legalFor({
1811 {S32, GlobalPtr},
1812 {S64, GlobalPtr},
1813 {S64, FlatPtr}
1814 });
1815 }
1816
1817 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1818 ST.hasAtomicBufferGlobalPkAddF16Insts())
1819 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1820 if (ST.hasAtomicGlobalPkAddBF16Inst())
1821 Atomic.legalFor({{V2BF16, GlobalPtr}});
1822 if (ST.hasAtomicFlatPkAdd16Insts())
1823 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1824
1825
1826 // Most of the legalization work here is done by AtomicExpand. We could
1827 // probably use a simpler legality rule that just assumes anything is OK.
1828 auto &AtomicFMinFMax =
1829 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1830 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1831
1832 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
 1833 AtomicFMinFMax.legalFor({{F32, GlobalPtr}, {F32, BufferFatPtr}});
1834 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1835 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1836 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1837 AtomicFMinFMax.legalFor({F32, FlatPtr});
1838 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1839 AtomicFMinFMax.legalFor({F64, FlatPtr});
1840
1841 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1842 // demarshalling
1843 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1844 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1845 {S32, FlatPtr}, {S64, FlatPtr}})
1846 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1847 {S32, RegionPtr}, {S64, RegionPtr}});
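// (The custom lowering, legalizeAtomicCmpXChg further down in this file,
// packs the new value and the compare value into a two-element vector
// operand of G_AMDGPU_ATOMIC_CMPXCHG and carries the memory operands over.)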
1848 // TODO: Pointer types, any 32-bit or 64-bit vector
1849
1850 // Condition should be s32 for scalar, s1 for vector.
1851 getActionDefinitionsBuilder(G_SELECT)
1852 .legalForCartesianProduct({S32, S64, S16, V2S32, V2S16, V4S16, GlobalPtr,
1853 LocalPtr, FlatPtr, PrivatePtr,
1854 LLT::fixed_vector(2, LocalPtr),
1855 LLT::fixed_vector(2, PrivatePtr)},
1856 {S1, S32})
1857 .clampScalar(0, S16, S64)
1858 .scalarize(1)
1859 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1860 .fewerElementsIf(numElementsNotEven(0), scalarize(0))
1861 .clampMaxNumElements(0, S32, 2)
1862 .clampMaxNumElements(0, LocalPtr, 2)
1863 .clampMaxNumElements(0, PrivatePtr, 2)
1864 .scalarize(0)
1865 .widenScalarToNextPow2(0)
1866 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1867
1868 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1869 // be more flexible with the shift amount type.
1870 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1871 .legalFor({{S32, S32}, {S64, S32}});
1872 if (ST.has16BitInsts()) {
1873 if (ST.hasVOP3PInsts()) {
1874 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1875 .clampMaxNumElements(0, S16, 2);
1876 } else
1877 Shifts.legalFor({{S16, S16}});
1878
1879 // TODO: Support 16-bit shift amounts for all types
1880 Shifts.widenScalarIf(
1881 [=](const LegalityQuery &Query) {
1882 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1883 // 32-bit amount.
1884 const LLT ValTy = Query.Types[0];
1885 const LLT AmountTy = Query.Types[1];
1886 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1887 AmountTy.getSizeInBits() < 16;
1888 }, changeTo(1, S16));
1889 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1890 Shifts.clampScalar(1, S32, S32);
1891 Shifts.widenScalarToNextPow2(0, 16);
1892 Shifts.clampScalar(0, S16, S64);
1893
1894 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1895 .minScalar(0, S16)
1896 .scalarize(0)
1897 .lower();
1898 } else {
1899 // Make sure we legalize the shift amount type first, as the general
1900 // expansion for the shifted type will produce much worse code if it hasn't
1901 // been truncated already.
1902 Shifts.clampScalar(1, S32, S32);
1903 Shifts.widenScalarToNextPow2(0, 32);
1904 Shifts.clampScalar(0, S32, S64);
1905
1906 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1907 .minScalar(0, S32)
1908 .scalarize(0)
1909 .lower();
1910 }
1911 Shifts.scalarize(0);
1912
1913 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1914 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1915 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1916 unsigned IdxTypeIdx = 2;
1917
1918 getActionDefinitionsBuilder(Op)
1919 .customIf([=](const LegalityQuery &Query) {
1920 const LLT EltTy = Query.Types[EltTypeIdx];
1921 const LLT VecTy = Query.Types[VecTypeIdx];
1922 const LLT IdxTy = Query.Types[IdxTypeIdx];
1923 const unsigned EltSize = EltTy.getSizeInBits();
1924 const bool isLegalVecType =
1926 // Address space 8 pointers are 128-bit wide values, but the logic
1927 // below will try to bitcast them to 2N x s64, which will fail.
 1928 // Therefore, as an intermediate step, wrap such extracts/insertions by
 1929 // ptrtoint-ing the vector and scalar arguments (or inttoptr-ing the
 1930 // extraction result) in order to produce a vector operation that can
 1931 // be handled by the logic below.
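// Illustrative example: an extract from <2 x p8> (two 128-bit buffer
// resource pointers) is rewritten as a ptrtoint to <2 x s128>, an extract
// of s128, and an inttoptr of the result; see legalizeExtractVectorElt /
// legalizeInsertVectorElt below.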
1932 if (EltTy.isPointer() && EltSize > 64)
1933 return true;
1934 return (EltSize == 32 || EltSize == 64) &&
1935 VecTy.getSizeInBits() % 32 == 0 &&
1936 VecTy.getSizeInBits() <= MaxRegisterSize &&
1937 IdxTy.getSizeInBits() == 32 &&
1938 isLegalVecType;
1939 })
1940 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1941 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1942 bitcastToVectorElement32(VecTypeIdx))
1943 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1944 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1945 scalarOrEltWiderThan(VecTypeIdx, 64)),
1946 [=](const LegalityQuery &Query) {
1947 // For > 64-bit element types, try to turn this into a
1948 // 64-bit element vector since we may be able to do better
1949 // indexing if this is scalar. If not, fall back to 32.
1950 const LLT EltTy = Query.Types[EltTypeIdx];
1951 const LLT VecTy = Query.Types[VecTypeIdx];
1952 const unsigned DstEltSize = EltTy.getSizeInBits();
1953 const unsigned VecSize = VecTy.getSizeInBits();
1954
1955 const unsigned TargetEltSize =
1956 DstEltSize % 64 == 0 ? 64 : 32;
1957 return std::pair(VecTypeIdx,
1958 LLT::fixed_vector(VecSize / TargetEltSize,
1959 TargetEltSize));
1960 })
1961 .clampScalar(EltTypeIdx, S32, S64)
1962 .clampScalar(VecTypeIdx, S32, S64)
1963 .clampScalar(IdxTypeIdx, S32, S32)
1964 .clampMaxNumElements(VecTypeIdx, S32, 32)
1965 // TODO: Clamp elements for 64-bit vectors?
1966 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1968 // It should only be necessary with variable indexes.
1969 // As a last resort, lower to the stack
1970 .lower();
1971 }
1972
1973 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1974 .unsupportedIf([=](const LegalityQuery &Query) {
1975 const LLT &EltTy = Query.Types[1].getElementType();
1976 return Query.Types[0] != EltTy;
1977 });
1978
1979 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1980 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1981 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1982 getActionDefinitionsBuilder(Op)
1983 .widenScalarIf(
1984 [=](const LegalityQuery &Query) {
1985 const LLT BigTy = Query.Types[BigTyIdx];
1986 return (BigTy.getScalarSizeInBits() < 16);
1987 },
1989 .widenScalarIf(
1990 [=](const LegalityQuery &Query) {
1991 const LLT LitTy = Query.Types[LitTyIdx];
1992 return (LitTy.getScalarSizeInBits() < 16);
1993 },
1995 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1996 .widenScalarToNextPow2(BigTyIdx, 32)
1997 .customIf([=](const LegalityQuery &Query) {
1998 // Generic lower operates on the full-width value, producing
1999 // shift+trunc/mask sequences. For simple cases where extract/insert
2000 // values are 32-bit aligned, we can instead unmerge/merge and work on
2001 // the 32-bit components. However, we can't check the offset here so
2002 // custom lower function will have to call generic lowering if offset
2003 // is not 32-bit aligned.
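// (The custom handlers, legalizeExtract and legalizeInsert below, unmerge
// the source into 32-bit pieces and rebuild the result with a merge; they
// call back into the generic lowering when the offset is not 32-bit
// aligned.)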
2004 const LLT BigTy = Query.Types[BigTyIdx];
2005 const LLT LitTy = Query.Types[LitTyIdx];
2006 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
2007 LitTy.getSizeInBits() % 32 == 0;
2008 })
2009 .lower();
2010 }
2011
2012 auto &BuildVector =
2013 getActionDefinitionsBuilder(G_BUILD_VECTOR)
2014 .legalForCartesianProduct(AllS32Vectors, {S32})
2015 .legalForCartesianProduct(AllS64Vectors, {S64})
2016 .clampNumElements(0, V16S32, V32S32)
2017 .clampNumElements(0, V2S64, V16S64)
2018 .fewerElementsIf(isWideVec16(0), changeTo(0, V2S16))
2019 .moreElementsIf(isIllegalRegisterType(ST, 0),
2021
2022 if (ST.hasScalarPackInsts()) {
2023 BuildVector
2024 // FIXME: Should probably widen s1 vectors straight to s32
2025 .minScalarOrElt(0, S16)
2026 .minScalar(1, S16);
2027
2028 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2029 .legalFor({V2S16, S32})
2030 .lower();
2031 } else {
2032 BuildVector.customFor({V2S16, S16});
2033 BuildVector.minScalarOrElt(0, S32);
2034
2035 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2036 .customFor({V2S16, S32})
2037 .lower();
2038 }
2039
2040 BuildVector.legalIf(isRegisterType(ST, 0));
2041
2042 // FIXME: Clamp maximum size
2043 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2044 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2045 .clampMaxNumElements(0, S32, 32)
2046 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2047 .clampMaxNumElements(0, S16, 64);
2048
2049 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2050
2051 // Merge/Unmerge
2052 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2053 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2054 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2055
2056 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2057 const LLT Ty = Query.Types[TypeIdx];
2058 if (Ty.isVector()) {
2059 const LLT &EltTy = Ty.getElementType();
2060 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2061 return true;
2063 return true;
2064 }
2065 return false;
2066 };
2067
2068 auto &Builder =
2069 getActionDefinitionsBuilder(Op)
2070 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2071 .lowerFor({{S16, V2S16}})
2072 .lowerIf([=](const LegalityQuery &Query) {
2073 const LLT BigTy = Query.Types[BigTyIdx];
2074 return BigTy.getSizeInBits() == 32;
2075 })
2076 // Try to widen to s16 first for small types.
2077 // TODO: Only do this on targets with legal s16 shifts
2078 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2079 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2080 .moreElementsIf(isSmallOddVector(BigTyIdx),
2081 oneMoreElement(BigTyIdx))
2082 .fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
2083 elementTypeIs(1, S16)),
2084 changeTo(1, V2S16))
2085 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2086 // not worth considering the multiples of 64 since 2*192 and 2*384
2087 // are not valid.
2088 .clampScalar(LitTyIdx, S32, S512)
2089 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2090 // Break up vectors with weird elements into scalars
2091 .fewerElementsIf(
2092 [=](const LegalityQuery &Query) {
2093 return notValidElt(Query, LitTyIdx);
2094 },
2095 scalarize(0))
2096 .fewerElementsIf(
2097 [=](const LegalityQuery &Query) {
2098 return notValidElt(Query, BigTyIdx);
2099 },
2100 scalarize(1))
2101 .clampScalar(BigTyIdx, S32, MaxScalar);
2102
2103 if (Op == G_MERGE_VALUES) {
2104 Builder.widenScalarIf(
2105 // TODO: Use 16-bit shifts if legal for 8-bit values?
2106 [=](const LegalityQuery &Query) {
2107 const LLT Ty = Query.Types[LitTyIdx];
2108 return Ty.getSizeInBits() < 32;
2109 },
2110 changeTo(LitTyIdx, S32));
2111 }
2112
2113 Builder.widenScalarIf(
2114 [=](const LegalityQuery &Query) {
2115 const LLT Ty = Query.Types[BigTyIdx];
2116 return Ty.getSizeInBits() % 16 != 0;
2117 },
2118 [=](const LegalityQuery &Query) {
 2119 // Pick the next power of 2, or a multiple of 64 over 128,
 2120 // whichever is smaller.
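// Illustrative example: for an s144 type the next power of 2 is 256; since
// that is >= 256, alignTo<64>(145) = 192 is also considered and, being
// smaller, wins, so s144 is widened to s192.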
2121 const LLT &Ty = Query.Types[BigTyIdx];
2122 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2123 if (NewSizeInBits >= 256) {
2124 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2125 if (RoundedTo < NewSizeInBits)
2126 NewSizeInBits = RoundedTo;
2127 }
2128 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2129 })
2130 // Any vectors left are the wrong size. Scalarize them.
2131 .scalarize(0)
2132 .scalarize(1);
2133 }
2134
2135 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2136 // RegBankSelect.
2137 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2138 .legalFor({{S32}, {S64}})
2139 .clampScalar(0, S32, S64);
2140
2141 if (ST.hasVOP3PInsts()) {
2142 SextInReg.lowerFor({{V2S16}})
2143 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2144 // get more vector shift opportunities, since we'll get those when
2145 // expanded.
2146 .clampMaxNumElementsStrict(0, S16, 2);
2147 } else if (ST.has16BitInsts()) {
2148 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2149 } else {
2150 // Prefer to promote to s32 before lowering if we don't have 16-bit
 2151 // shifts. This avoids a lot of intermediate truncate and extend operations.
2152 SextInReg.lowerFor({{S32}, {S64}});
2153 }
2154
2155 SextInReg
2156 .scalarize(0)
2157 .clampScalar(0, S32, S64)
2158 .lower();
2159
2160 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2161 .scalarize(0)
2162 .lower();
2163
2164 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2165 FSHRActionDefs.legalFor({{S32, S32}})
2166 .clampMaxNumElementsStrict(0, S16, 2);
2167 if (ST.hasVOP3PInsts())
2168 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2169 FSHRActionDefs.scalarize(0).lower();
2170
2171 if (ST.hasVOP3PInsts()) {
2172 getActionDefinitionsBuilder(G_FSHL)
2173 .lowerFor({{V2S16, V2S16}})
2174 .clampMaxNumElementsStrict(0, S16, 2)
2175 .scalarize(0)
2176 .lower();
2177 } else {
2178 getActionDefinitionsBuilder(G_FSHL)
2179 .scalarize(0)
2180 .lower();
2181 }
2182
2183 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2184 .legalFor({S64});
2185
2186 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2187
2188 getActionDefinitionsBuilder(G_FENCE)
2189 .alwaysLegal();
2190
2191 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2192 .scalarize(0)
2193 .minScalar(0, S32)
2194 .lower();
2195
2196 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2197 .legalFor({{S32, S32}, {S64, S32}})
2198 .clampScalar(1, S32, S32)
2199 .clampScalar(0, S32, S64)
2200 .widenScalarToNextPow2(0)
2201 .scalarize(0);
2202
2203 getActionDefinitionsBuilder(
2204 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2205 G_FCOPYSIGN,
2206
2207 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2208 G_READ_REGISTER, G_WRITE_REGISTER,
2209
2210 G_SADDO, G_SSUBO})
2211 .lower();
2212
2213 if (ST.hasIEEEMinimumMaximumInsts()) {
2214 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2215 .legalFor(FPTypesPK16)
2216 .clampMaxNumElements(0, S16, 2)
2217 .scalarize(0);
2218 } else if (ST.hasVOP3PInsts()) {
2219 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2220 .lowerFor({V2S16})
2221 .clampMaxNumElementsStrict(0, S16, 2)
2222 .scalarize(0)
2223 .lower();
2224 } else {
2225 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2226 .scalarize(0)
2227 .clampScalar(0, S32, S64)
2228 .lower();
2229 }
2230
2231 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2232 .lower();
2233
2234 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2235
2236 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2237 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2238 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2239 .unsupported();
2240
2241 getActionDefinitionsBuilder(G_PREFETCH).alwaysLegal();
2242
2243 getActionDefinitionsBuilder(
2244 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2245 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2246 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2247 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2248 .legalFor(AllVectors)
2249 .scalarize(1)
2250 .lower();
2251
2252 getLegacyLegalizerInfo().computeTables();
2253 verify(*ST.getInstrInfo());
2254}
2255
2258 LostDebugLocObserver &LocObserver) const {
2259 MachineIRBuilder &B = Helper.MIRBuilder;
2260 MachineRegisterInfo &MRI = *B.getMRI();
2261
2262 switch (MI.getOpcode()) {
2263 case TargetOpcode::G_ADDRSPACE_CAST:
2264 return legalizeAddrSpaceCast(MI, MRI, B);
2265 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2266 return legalizeFroundeven(MI, MRI, B);
2267 case TargetOpcode::G_FCEIL:
2268 return legalizeFceil(MI, MRI, B);
2269 case TargetOpcode::G_FREM:
2270 return legalizeFrem(MI, MRI, B);
2271 case TargetOpcode::G_INTRINSIC_TRUNC:
2272 return legalizeIntrinsicTrunc(MI, MRI, B);
2273 case TargetOpcode::G_SITOFP:
2274 return legalizeITOFP(MI, MRI, B, true);
2275 case TargetOpcode::G_UITOFP:
2276 return legalizeITOFP(MI, MRI, B, false);
2277 case TargetOpcode::G_FPTOSI:
2278 return legalizeFPTOI(MI, MRI, B, true);
2279 case TargetOpcode::G_FPTOUI:
2280 return legalizeFPTOI(MI, MRI, B, false);
2281 case TargetOpcode::G_FMINNUM:
2282 case TargetOpcode::G_FMAXNUM:
2283 case TargetOpcode::G_FMINIMUMNUM:
2284 case TargetOpcode::G_FMAXIMUMNUM:
2285 return legalizeMinNumMaxNum(Helper, MI);
2286 case TargetOpcode::G_EXTRACT:
2287 return legalizeExtract(Helper, MI);
2288 case TargetOpcode::G_INSERT:
2289 return legalizeInsert(Helper, MI);
2290 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2291 return legalizeExtractVectorElt(MI, MRI, B);
2292 case TargetOpcode::G_INSERT_VECTOR_ELT:
2293 return legalizeInsertVectorElt(MI, MRI, B);
2294 case TargetOpcode::G_FSIN:
2295 case TargetOpcode::G_FCOS:
2296 return legalizeSinCos(MI, MRI, B);
2297 case TargetOpcode::G_GLOBAL_VALUE:
2298 return legalizeGlobalValue(MI, MRI, B);
2299 case TargetOpcode::G_LOAD:
2300 case TargetOpcode::G_SEXTLOAD:
2301 case TargetOpcode::G_ZEXTLOAD:
2302 return legalizeLoad(Helper, MI);
2303 case TargetOpcode::G_STORE:
2304 return legalizeStore(Helper, MI);
2305 case TargetOpcode::G_FMAD:
2306 return legalizeFMad(MI, MRI, B);
2307 case TargetOpcode::G_FDIV:
2308 return legalizeFDIV(MI, MRI, B);
2309 case TargetOpcode::G_FFREXP:
2310 return legalizeFFREXP(MI, MRI, B);
2311 case TargetOpcode::G_FSQRT:
2312 return legalizeFSQRT(MI, MRI, B);
2313 case TargetOpcode::G_UDIV:
2314 case TargetOpcode::G_UREM:
2315 case TargetOpcode::G_UDIVREM:
2316 return legalizeUnsignedDIV_REM(MI, MRI, B);
2317 case TargetOpcode::G_SDIV:
2318 case TargetOpcode::G_SREM:
2319 case TargetOpcode::G_SDIVREM:
2320 return legalizeSignedDIV_REM(MI, MRI, B);
2321 case TargetOpcode::G_ATOMIC_CMPXCHG:
2322 return legalizeAtomicCmpXChg(MI, MRI, B);
2323 case TargetOpcode::G_FLOG2:
2324 return legalizeFlog2(MI, B);
2325 case TargetOpcode::G_FLOG:
2326 case TargetOpcode::G_FLOG10:
2327 return legalizeFlogCommon(MI, B);
2328 case TargetOpcode::G_FEXP2:
2329 return legalizeFExp2(MI, B);
2330 case TargetOpcode::G_FEXP:
2331 case TargetOpcode::G_FEXP10:
2332 return legalizeFExp(MI, B);
2333 case TargetOpcode::G_FPOW:
2334 return legalizeFPow(MI, B);
2335 case TargetOpcode::G_FFLOOR:
2336 return legalizeFFloor(MI, MRI, B);
2337 case TargetOpcode::G_BUILD_VECTOR:
2338 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2339 return legalizeBuildVector(MI, MRI, B);
2340 case TargetOpcode::G_MUL:
2341 return legalizeMul(Helper, MI);
2342 case TargetOpcode::G_CTLZ:
2343 case TargetOpcode::G_CTTZ:
2344 return legalizeCTLZ_CTTZ(MI, MRI, B);
2345 case TargetOpcode::G_CTLS:
2346 return legalizeCTLS(MI, MRI, B);
2347 case TargetOpcode::G_CTLZ_ZERO_POISON:
2348 return legalizeCTLZ_ZERO_POISON(MI, MRI, B);
2349 case TargetOpcode::G_STACKSAVE:
2350 return legalizeStackSave(MI, B);
2351 case TargetOpcode::G_GET_FPENV:
2352 return legalizeGetFPEnv(MI, MRI, B);
2353 case TargetOpcode::G_SET_FPENV:
2354 return legalizeSetFPEnv(MI, MRI, B);
2355 case TargetOpcode::G_TRAP:
2356 return legalizeTrap(MI, MRI, B);
2357 case TargetOpcode::G_DEBUGTRAP:
2358 return legalizeDebugTrap(MI, MRI, B);
2359 default:
2360 return false;
2361 }
2362
2363 llvm_unreachable("expected switch to return");
2364}
2365
2367 unsigned AS,
2369 MachineIRBuilder &B) const {
2370 MachineFunction &MF = B.getMF();
2371 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2372 const LLT S32 = LLT::scalar(32);
2373 const LLT S64 = LLT::scalar(64);
2374
2376
2377 if (ST.hasApertureRegs()) {
2378 // Note: this register is somewhat broken. When used as a 32-bit operand,
2379 // it only returns zeroes. The real value is in the upper 32 bits.
 2380 // Thus, we must extract the high 32 bits.
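// (getReg(1) on the unmerge below selects that upper 32-bit half.)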
2381 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2382 ? AMDGPU::SRC_SHARED_BASE
2383 : AMDGPU::SRC_PRIVATE_BASE;
2384 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2385 !ST.hasGloballyAddressableScratch()) &&
2386 "Cannot use src_private_base with globally addressable scratch!");
2388 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2389 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2390 return B.buildUnmerge(S32, Dst).getReg(1);
2391 }
2392
2395 // For code object version 5, private_base and shared_base are passed through
2396 // implicit kernargs.
2400
2405 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2406
2407 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2409
2410 if (!loadInputValue(KernargPtrReg, B,
2412 return Register();
2413
2415 PtrInfo.getWithOffset(Offset),
2419
2420 // Pointer address
2421 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2422 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2423 // Load address
2424 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2425 }
2426
2429
2431 return Register();
2432
2433 // TODO: Use custom PseudoSourceValue
2435
2436 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2437 // private_segment_aperture_base_hi.
2438 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2439
2441 PtrInfo,
2444 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2445
2446 B.buildObjectPtrOffset(
2447 LoadAddr, QueuePtr,
2448 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2449 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2450}
2451
2452/// Return true if the value is a known valid address, such that a null check is
2453/// not necessary.
2455 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2456 MachineInstr *Def = MRI.getVRegDef(Val);
2457 switch (Def->getOpcode()) {
2458 case AMDGPU::G_FRAME_INDEX:
2459 case AMDGPU::G_GLOBAL_VALUE:
2460 case AMDGPU::G_BLOCK_ADDR:
2461 return true;
2462 case AMDGPU::G_CONSTANT: {
2463 const ConstantInt *CI = Def->getOperand(1).getCImm();
2464 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2465 }
2466 default:
2467 return false;
2468 }
2469
2470 return false;
2471}
2472
2475 MachineIRBuilder &B) const {
2476 MachineFunction &MF = B.getMF();
2477
2478 // MI can either be a G_ADDRSPACE_CAST or a
2479 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2480 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2481 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2482 Intrinsic::amdgcn_addrspacecast_nonnull));
2483
2484 const LLT S32 = LLT::scalar(32);
2485 Register Dst = MI.getOperand(0).getReg();
2486 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2487 : MI.getOperand(1).getReg();
2488 LLT DstTy = MRI.getType(Dst);
2489 LLT SrcTy = MRI.getType(Src);
2490 unsigned DestAS = DstTy.getAddressSpace();
2491 unsigned SrcAS = SrcTy.getAddressSpace();
2492
2493 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2494 // vector element.
2495 assert(!DstTy.isVector());
2496
2497 const AMDGPUTargetMachine &TM
2498 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2499
2500 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2501 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2502 return true;
2503 }
2504
2505 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2506 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2507 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2508 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2509 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2510 ST.hasGloballyAddressableScratch()) {
2511 // flat -> private with globally addressable scratch: subtract
2512 // src_flat_scratch_base_lo.
2513 const LLT S32 = LLT::scalar(32);
2514 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2515 Register FlatScratchBaseLo =
2516 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2517 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2518 .getReg(0);
2519 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2520 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2521 return B.buildIntToPtr(Dst, Sub).getReg(0);
2522 }
2523
2524 // Extract low 32-bits of the pointer.
2525 return B.buildExtract(Dst, Src, 0).getReg(0);
2526 };
2527
2528 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2529 // G_ADDRSPACE_CAST we need to guess.
2530 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2531 castFlatToLocalOrPrivate(Dst);
2532 MI.eraseFromParent();
2533 return true;
2534 }
2535
2536 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2537
2538 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2539 auto FlatNull = B.buildConstant(SrcTy, 0);
2540
2541 // Extract low 32-bits of the pointer.
2542 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2543
2544 auto CmpRes =
2545 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2546 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2547
2548 MI.eraseFromParent();
2549 return true;
2550 }
2551
2552 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2553 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2554 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2555 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2556 // Coerce the type of the low half of the result so we can use
2557 // merge_values.
2558 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2559
2560 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2561 ST.hasGloballyAddressableScratch()) {
2562 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2563 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
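// The shift amount computed below, 57 - 32 - log2(wavefront size), places
// the thread ID within the high 32-bit word: 20 for wave32 (overall bit 52)
// and 19 for wave64 (overall bit 51), matching the formulas above.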
2564 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2565 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2566 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2567 .addUse(AllOnes)
2568 .addUse(ThreadID)
2569 .getReg(0);
2570 if (ST.isWave64()) {
2571 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2572 .addUse(AllOnes)
2573 .addUse(ThreadID)
2574 .getReg(0);
2575 }
2576 Register ShAmt =
2577 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2578 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2579 Register CvtPtr =
2580 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2581 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2582 // 64-bit hi:lo value.
2583 Register FlatScratchBase =
2584 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2585 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2586 .getReg(0);
2587 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2588 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2589 }
2590
2591 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2592 if (!ApertureReg.isValid())
2593 return false;
2594
2595 // TODO: Should we allow mismatched types but matching sizes in merges to
2596 // avoid the ptrtoint?
2597 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2598 };
2599
2600 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2601 // G_ADDRSPACE_CAST we need to guess.
2602 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2603 castLocalOrPrivateToFlat(Dst);
2604 MI.eraseFromParent();
2605 return true;
2606 }
2607
2608 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2609
2610 auto SegmentNull =
2611 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2612 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2613
2614 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2615 SegmentNull.getReg(0));
2616
2617 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2618
2619 MI.eraseFromParent();
2620 return true;
2621 }
2622
2623 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2624 SrcTy.getSizeInBits() == 64) {
2625 // Truncate.
2626 B.buildExtract(Dst, Src, 0);
2627 MI.eraseFromParent();
2628 return true;
2629 }
2630
2631 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2632 DstTy.getSizeInBits() == 64) {
2634 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2635 auto PtrLo = B.buildPtrToInt(S32, Src);
2636 if (AddrHiVal == 0) {
2637 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2638 B.buildIntToPtr(Dst, Zext);
2639 } else {
2640 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2641 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2642 }
2643
2644 MI.eraseFromParent();
2645 return true;
2646 }
2647
2648 // Invalid casts are poison.
2649 // TODO: Should return poison
2650 B.buildUndef(Dst);
2651 MI.eraseFromParent();
2652 return true;
2653}
2654
2657 MachineIRBuilder &B) const {
2658 Register Src = MI.getOperand(1).getReg();
2659 LLT Ty = MRI.getType(Src);
2660 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2661
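// Rounding trick (sketch): for |src| < 2^52, adding copysign(2^52, src) and
// then subtracting it again rounds src to an integer in the nearest-even
// rounding mode assumed here; magnitudes above 0x1.fffffffffffffp+51 are
// already integral, so the final select passes those through unchanged.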
2662 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2663 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2664
2665 auto C1 = B.buildFConstant(Ty, C1Val);
2666 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2667
2668 // TODO: Should this propagate fast-math-flags?
2669 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2670 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2671
2672 auto C2 = B.buildFConstant(Ty, C2Val);
2673 auto Fabs = B.buildFAbs(Ty, Src);
2674
2675 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2676 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2677 MI.eraseFromParent();
2678 return true;
2679}
2680
2683 MachineIRBuilder &B) const {
2684
2685 const LLT S1 = LLT::scalar(1);
2686 const LLT S64 = LLT::scalar(64);
2687
2688 Register Src = MI.getOperand(1).getReg();
2689 assert(MRI.getType(Src) == S64);
2690
2691 // result = trunc(src)
2692 // if (src > 0.0 && src != result)
2693 // result += 1.0
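// Illustrative example: src = 2.3 gives trunc = 2.0; src > 0.0 and
// src != trunc, so the result is 3.0. For src = -2.3, trunc = -2.0 and the
// condition is false, so the result stays -2.0.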
2694
2695 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2696
2697 const auto Zero = B.buildFConstant(S64, 0.0);
2698 const auto One = B.buildFConstant(S64, 1.0);
2699 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2700 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2701 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2702 auto Add = B.buildSelect(S64, And, One, Zero);
2703
2704 // TODO: Should this propagate fast-math-flags?
2705 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2706 MI.eraseFromParent();
2707 return true;
2708}
2709
2712 MachineIRBuilder &B) const {
2713 Register DstReg = MI.getOperand(0).getReg();
2714 Register Src0Reg = MI.getOperand(1).getReg();
2715 Register Src1Reg = MI.getOperand(2).getReg();
2716 auto Flags = MI.getFlags();
2717 LLT Ty = MRI.getType(DstReg);
2718
2719 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2720 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2721 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2722 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2723 MI.eraseFromParent();
2724 return true;
2725}
2726
2729 const unsigned FractBits = 52;
2730 const unsigned ExpBits = 11;
2731 LLT S32 = LLT::scalar(32);
2732
2733 auto Const0 = B.buildConstant(S32, FractBits - 32);
2734 auto Const1 = B.buildConstant(S32, ExpBits);
2735
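// Illustrative example: for Hi = 0x3ff00000 (the high word of 1.0), the
// ubfe below extracts bits [20, 30] = 1023, and subtracting the bias 1023
// yields an exponent of 0.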
2736 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2737 .addUse(Hi)
2738 .addUse(Const0.getReg(0))
2739 .addUse(Const1.getReg(0));
2740
2741 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2742}
2743
2746 MachineIRBuilder &B) const {
2747 const LLT S1 = LLT::scalar(1);
2748 const LLT S32 = LLT::scalar(32);
2749 const LLT S64 = LLT::scalar(64);
2750
2751 Register Src = MI.getOperand(1).getReg();
2752 assert(MRI.getType(Src) == S64);
2753
2754 // TODO: Should this use extract since the low half is unused?
2755 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2756 Register Hi = Unmerge.getReg(1);
2757
2758 // Extract the upper half, since this is where we will find the sign and
2759 // exponent.
2760 auto Exp = extractF64Exponent(Hi, B);
2761
2762 const unsigned FractBits = 52;
2763
2764 // Extract the sign bit.
2765 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2766 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2767
2768 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2769
2770 const auto Zero32 = B.buildConstant(S32, 0);
2771
2772 // Extend back to 64-bits.
2773 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2774
2775 auto Shr = B.buildAShr(S64, FractMask, Exp);
2776 auto Not = B.buildNot(S64, Shr);
2777 auto Tmp0 = B.buildAnd(S64, Src, Not);
2778 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2779
2780 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2781 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2782
2783 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2784 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2785 MI.eraseFromParent();
2786 return true;
2787}
2788
2791 MachineIRBuilder &B, bool Signed) const {
2792
2793 Register Dst = MI.getOperand(0).getReg();
2794 Register Src = MI.getOperand(1).getReg();
2795
2796 const LLT S64 = LLT::scalar(64);
2797 const LLT S32 = LLT::scalar(32);
2798
2799 assert(MRI.getType(Src) == S64);
2800
2801 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2802 auto ThirtyTwo = B.buildConstant(S32, 32);
2803
2804 if (MRI.getType(Dst) == S64) {
2805 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2806 : B.buildUITOFP(S64, Unmerge.getReg(1));
2807
2808 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2809 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2810
2811 // TODO: Should this propagate fast-math-flags?
2812 B.buildFAdd(Dst, LdExp, CvtLo);
2813 MI.eraseFromParent();
2814 return true;
2815 }
2816
2817 assert(MRI.getType(Dst) == S32);
2818
2819 auto One = B.buildConstant(S32, 1);
2820
2821 MachineInstrBuilder ShAmt;
2822 if (Signed) {
2823 auto ThirtyOne = B.buildConstant(S32, 31);
2824 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2825 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2826 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2827 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2828 .addUse(Unmerge.getReg(1));
2829 auto LS2 = B.buildSub(S32, LS, One);
2830 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2831 } else
2832 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2833 auto Norm = B.buildShl(S64, Src, ShAmt);
2834 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2835 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2836 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2837 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2838 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2839 B.buildFLdexp(Dst, FVal, Scale);
2840 MI.eraseFromParent();
2841 return true;
2842}
2843
2844// TODO: Copied from DAG implementation. Verify logic and document how this
2845// actually works.
2849 bool Signed) const {
2850
2851 Register Dst = MI.getOperand(0).getReg();
2852 Register Src = MI.getOperand(1).getReg();
2853
2854 const LLT S64 = LLT::scalar(64);
2855 const LLT S32 = LLT::scalar(32);
2856
2857 const LLT SrcLT = MRI.getType(Src);
2858 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2859
2860 unsigned Flags = MI.getFlags();
2861
2862 // The basic idea of converting a floating point number into a pair of 32-bit
2863 // integers is illustrated as follows:
2864 //
2865 // tf := trunc(val);
2866 // hif := floor(tf * 2^-32);
2867 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2868 // hi := fptoi(hif);
2869 // lo := fptoi(lof);
2870 //
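// Illustrative example (assuming a positive value): val = 2^33 + 7 gives
// tf = 2^33 + 7, hif = floor(tf * 2^-32) = 2 and lof = tf - 2 * 2^32 = 7,
// so hi = 2 and lo = 7, which reassemble to 2^33 + 7.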
2871 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2873 if (Signed && SrcLT == S32) {
2874 // However, a 32-bit floating point number has only 23 bits mantissa and
2875 // it's not enough to hold all the significant bits of `lof` if val is
 2876 // negative. To avoid the loss of precision, we need to take the absolute
2877 // value after truncating and flip the result back based on the original
2878 // signedness.
2879 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2880 Trunc = B.buildFAbs(S32, Trunc, Flags);
2881 }
2882 MachineInstrBuilder K0, K1;
2883 if (SrcLT == S64) {
2884 K0 = B.buildFConstant(
2885 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2886 K1 = B.buildFConstant(
2887 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2888 } else {
2889 K0 = B.buildFConstant(
2890 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2891 K1 = B.buildFConstant(
2892 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2893 }
2894
2895 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2896 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2897 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2898
2899 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2900 : B.buildFPTOUI(S32, FloorMul);
2901 auto Lo = B.buildFPTOUI(S32, Fma);
2902
2903 if (Signed && SrcLT == S32) {
2904 // Flip the result based on the signedness, which is either all 0s or 1s.
2905 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2906 // r := xor({lo, hi}, sign) - sign;
2907 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2908 Sign);
2909 } else
2910 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2911 MI.eraseFromParent();
2912
2913 return true;
2914}
2915
2917 MachineInstr &MI) const {
2918 MachineFunction &MF = Helper.MIRBuilder.getMF();
2920
2921 // With ieee_mode disabled, the instructions have the correct behavior.
2922 if (!MFI->getMode().IEEE)
2923 return true;
2924
2926}
2927
2929 MachineInstr &MI) const {
2930 MachineIRBuilder &B = Helper.MIRBuilder;
2931 MachineRegisterInfo &MRI = *B.getMRI();
2932 Register DstReg = MI.getOperand(0).getReg();
2933 Register SrcReg = MI.getOperand(1).getReg();
2934 uint64_t Offset = MI.getOperand(2).getImm();
2935
2936 // Fall back to generic lowering for offset 0 (trivial trunc) and
2937 // non-32-bit-aligned cases which require shift+trunc sequences
2938 // that generic code handles correctly.
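// Illustrative example: extracting an s64 at offset 64 from an s128 source
// gives StartIdx = 2 and DstCount = 2, so the result is merged from the
// third and fourth 32-bit pieces of the unmerge below.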
2939 if (Offset == 0 || Offset % 32 != 0)
2940 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2941
2942 const LLT DstTy = MRI.getType(DstReg);
2943 unsigned StartIdx = Offset / 32;
2944 unsigned DstCount = DstTy.getSizeInBits() / 32;
2945 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2946
2947 if (DstCount == 1) {
2948 if (DstTy.isPointer())
2949 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2950 else
2951 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2952 } else {
2953 SmallVector<Register, 8> MergeVec;
2954 for (unsigned I = 0; I < DstCount; ++I)
2955 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
2956 B.buildMergeLikeInstr(DstReg, MergeVec);
2957 }
2958
2959 MI.eraseFromParent();
2960 return true;
2961}
2962
2964 MachineInstr &MI) const {
2965 MachineIRBuilder &B = Helper.MIRBuilder;
2966 MachineRegisterInfo &MRI = *B.getMRI();
2967 Register DstReg = MI.getOperand(0).getReg();
2968 Register SrcReg = MI.getOperand(1).getReg();
2969 Register InsertSrc = MI.getOperand(2).getReg();
2970 uint64_t Offset = MI.getOperand(3).getImm();
2971
2972 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
2973 const LLT InsertTy = MRI.getType(InsertSrc);
2974 unsigned InsertSize = InsertTy.getSizeInBits();
2975
2976 // Fall back to generic lowering for non-32-bit-aligned cases which
2977 // require shift+mask sequences that generic code handles correctly.
2978 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2979 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
2980
2981 const LLT S32 = LLT::scalar(32);
2982 unsigned DstCount = DstSize / 32;
2983 unsigned InsertCount = InsertSize / 32;
2984 unsigned StartIdx = Offset / 32;
2985
2986 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
2987
2988 SmallVector<Register, 8> MergeVec;
2989 for (unsigned I = 0; I < StartIdx; ++I)
2990 MergeVec.push_back(SrcUnmerge.getReg(I));
2991
2992 if (InsertCount == 1) {
 2993 // Merge-like instructions require the same source types. Convert pointer
2994 // to scalar when inserting a pointer value into a scalar.
2995 if (InsertTy.isPointer())
2996 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
2997 MergeVec.push_back(InsertSrc);
2998 } else {
2999 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
3000 for (unsigned I = 0; I < InsertCount; ++I)
3001 MergeVec.push_back(InsertUnmerge.getReg(I));
3002 }
3003
3004 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
3005 MergeVec.push_back(SrcUnmerge.getReg(I));
3006
3007 B.buildMergeLikeInstr(DstReg, MergeVec);
3008
3009 MI.eraseFromParent();
3010 return true;
3011}
3012
3015 MachineIRBuilder &B) const {
3016 // TODO: Should move some of this into LegalizerHelper.
3017
3018 // TODO: Promote dynamic indexing of s16 to s32
3019
3020 Register Dst = MI.getOperand(0).getReg();
3021 Register Vec = MI.getOperand(1).getReg();
3022
3023 LLT VecTy = MRI.getType(Vec);
3024 LLT EltTy = VecTy.getElementType();
3025 assert(EltTy == MRI.getType(Dst));
3026
3027 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
 3028 // but we can't go directly to that logic because you can't bitcast a vector
3029 // of pointers to a vector of integers. Therefore, introduce an intermediate
3030 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3031 // drive the legalization forward.
3032 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3033 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3034 LLT IntVecTy = VecTy.changeElementType(IntTy);
3035
3036 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3037 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3038 B.buildIntToPtr(Dst, IntElt);
3039
3040 MI.eraseFromParent();
3041 return true;
3042 }
3043
3044 // FIXME: Artifact combiner probably should have replaced the truncated
3045 // constant before this, so we shouldn't need
3046 // getIConstantVRegValWithLookThrough.
3047 std::optional<ValueAndVReg> MaybeIdxVal =
3048 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3049 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3050 return true;
3051 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3052
3053 if (IdxVal < VecTy.getNumElements()) {
3054 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3055 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3056 } else {
3057 B.buildUndef(Dst);
3058 }
3059
3060 MI.eraseFromParent();
3061 return true;
3062}
3063
3066 MachineIRBuilder &B) const {
3067 // TODO: Should move some of this into LegalizerHelper.
3068
3069 // TODO: Promote dynamic indexing of s16 to s32
3070
3071 Register Dst = MI.getOperand(0).getReg();
3072 Register Vec = MI.getOperand(1).getReg();
3073 Register Ins = MI.getOperand(2).getReg();
3074
3075 LLT VecTy = MRI.getType(Vec);
3076 LLT EltTy = VecTy.getElementType();
3077 assert(EltTy == MRI.getType(Ins));
3078
3079 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
 3080 // but we can't go directly to that logic because you can't bitcast a vector
3081 // of pointers to a vector of integers. Therefore, make the pointer vector
3082 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3083 // new value, and then inttoptr the result vector back. This will then allow
3084 // the rest of legalization to take over.
3085 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3086 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3087 LLT IntVecTy = VecTy.changeElementType(IntTy);
3088
3089 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3090 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3091 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3092 MI.getOperand(3));
3093 B.buildIntToPtr(Dst, IntVecDest);
3094 MI.eraseFromParent();
3095 return true;
3096 }
3097
3098 // FIXME: Artifact combiner probably should have replaced the truncated
3099 // constant before this, so we shouldn't need
3100 // getIConstantVRegValWithLookThrough.
3101 std::optional<ValueAndVReg> MaybeIdxVal =
3102 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3103 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3104 return true;
3105
3106 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3107
3108 unsigned NumElts = VecTy.getNumElements();
3109 if (IdxVal < NumElts) {
3111 for (unsigned i = 0; i < NumElts; ++i)
3112 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3113 B.buildUnmerge(SrcRegs, Vec);
3114
3115 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3116 B.buildMergeLikeInstr(Dst, SrcRegs);
3117 } else {
3118 B.buildUndef(Dst);
3119 }
3120
3121 MI.eraseFromParent();
3122 return true;
3123}
3124
3127 MachineIRBuilder &B) const {
3128
3129 Register DstReg = MI.getOperand(0).getReg();
3130 Register SrcReg = MI.getOperand(1).getReg();
3131 LLT Ty = MRI.getType(DstReg);
3132 unsigned Flags = MI.getFlags();
3133
3134 Register TrigVal;
3135 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3136 if (ST.hasTrigReducedRange()) {
3137 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3138 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3139 .addUse(MulVal.getReg(0))
3140 .setMIFlags(Flags)
3141 .getReg(0);
3142 } else
3143 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3144
3145 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3146 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3147 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3148 .addUse(TrigVal)
3149 .setMIFlags(Flags);
3150 MI.eraseFromParent();
3151 return true;
3152}
3153
3156 const GlobalValue *GV,
3157 int64_t Offset,
3158 unsigned GAFlags) const {
3159 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3160 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3161 // to the following code sequence:
3162 //
3163 // For constant address space:
3164 // s_getpc_b64 s[0:1]
3165 // s_add_u32 s0, s0, $symbol
3166 // s_addc_u32 s1, s1, 0
3167 //
3168 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3169 // a fixup or relocation is emitted to replace $symbol with a literal
3170 // constant, which is a pc-relative offset from the encoding of the $symbol
3171 // operand to the global variable.
3172 //
3173 // For global address space:
3174 // s_getpc_b64 s[0:1]
3175 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3176 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3177 //
3178 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3179 // fixups or relocations are emitted to replace $symbol@*@lo and
3180 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3181 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3182 // operand to the global variable.
3183
3185
3186 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3187 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3188
3189 if (ST.has64BitLiterals()) {
3190 assert(GAFlags != SIInstrInfo::MO_NONE);
3191
3193 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
3194 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3195 } else {
3197 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3198
3199 MIB.addGlobalAddress(GV, Offset, GAFlags);
3200 if (GAFlags == SIInstrInfo::MO_NONE)
3201 MIB.addImm(0);
3202 else
3203 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3204 }
3205
3206 if (!B.getMRI()->getRegClassOrNull(PCReg))
3207 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3208
3209 if (PtrTy.getSizeInBits() == 32)
3210 B.buildExtract(DstReg, PCReg, 0);
3211 return true;
3212}
3213
3214// Emit a ABS32_LO / ABS32_HI relocation stub.
3216 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3217 MachineRegisterInfo &MRI) const {
3218 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3219
3220 if (RequiresHighHalf && ST.has64BitLiterals()) {
3221 if (!MRI.getRegClassOrNull(DstReg))
3222 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3223 B.buildInstr(AMDGPU::S_MOV_B64)
3224 .addDef(DstReg)
3225 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3226 return;
3227 }
3228
3229 LLT S32 = LLT::scalar(32);
3230
 3231 // Use the destination directly, if and only if we only store the lower
 3232 // address part and no register class has been set.
3233 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3234 ? DstReg
3236
3237 if (!MRI.getRegClassOrNull(AddrLo))
3238 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3239
3240 // Write the lower half.
3241 B.buildInstr(AMDGPU::S_MOV_B32)
3242 .addDef(AddrLo)
3243 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3244
3245 // If required, write the upper half as well.
3246 if (RequiresHighHalf) {
3247 assert(PtrTy.getSizeInBits() == 64 &&
3248 "Must provide a 64-bit pointer type!");
3249
3251 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3252
3253 B.buildInstr(AMDGPU::S_MOV_B32)
3254 .addDef(AddrHi)
3255 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3256
 3257 // Use the destination directly, if and only if no register class has
 3258 // been set.
3259 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3260 ? DstReg
3262
3263 if (!MRI.getRegClassOrNull(AddrDst))
3264 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3265
3266 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3267
3268 // If we created a new register for the destination, cast the result into
3269 // the final output.
3270 if (AddrDst != DstReg)
3271 B.buildCast(DstReg, AddrDst);
3272 } else if (AddrLo != DstReg) {
3273 // If we created a new register for the destination, cast the result into
3274 // the final output.
3275 B.buildCast(DstReg, AddrLo);
3276 }
3277}
3278
3281 MachineIRBuilder &B) const {
3282 Register DstReg = MI.getOperand(0).getReg();
3283 LLT Ty = MRI.getType(DstReg);
3284 unsigned AS = Ty.getAddressSpace();
3285
3286 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3287 MachineFunction &MF = B.getMF();
3289
3291 if (!MFI->isModuleEntryFunction() &&
3292 GV->getName() != "llvm.amdgcn.module.lds" &&
3294 const Function &Fn = MF.getFunction();
3296 Fn, "local memory global used by non-kernel function",
3297 MI.getDebugLoc(), DS_Warning));
3298
3299 // We currently don't have a way to correctly allocate LDS objects that
3300 // aren't directly associated with a kernel. We do force inlining of
3301 // functions that use local objects. However, if these dead functions are
3302 // not eliminated, we don't want a compile time error. Just emit a warning
3303 // and a trap, since there should be no callable path here.
3304 B.buildTrap();
3305 B.buildUndef(DstReg);
3306 MI.eraseFromParent();
3307 return true;
3308 }
3309
3310 // TODO: We could emit code to handle the initialization somewhere.
3311 // We ignore the initializer for now and legalize it to allow selection.
 3312 // The initializer will be rejected with an error during assembly emission anyway.
3313 const SITargetLowering *TLI = ST.getTargetLowering();
3314 if (!TLI->shouldUseLDSConstAddress(GV)) {
3315 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3316 return true; // Leave in place;
3317 }
3318
3319 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3320 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
 3321 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
 3322 // zero-sized type in other languages to declare dynamic shared
 3323 // memory whose size is not known at compile time. Such arrays are
 3324 // allocated by the runtime and placed directly after the statically
 3325 // allocated ones. They all share the same offset.
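// (The amdgcn_groupstaticsize intrinsic used below returns the size of the
// statically allocated LDS, which therefore also serves as the starting
// offset of the dynamic shared memory.)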
3326 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3327 // Adjust alignment for that dynamic shared memory array.
3328 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3329 LLT S32 = LLT::scalar(32);
3330 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3331 B.buildIntToPtr(DstReg, Sz);
3332 MI.eraseFromParent();
3333 return true;
3334 }
3335 }
3336
3337 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3338 MI.eraseFromParent();
3339 return true;
3340 }
3341
3342 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3343 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3344 MI.eraseFromParent();
3345 return true;
3346 }
3347
3348 const SITargetLowering *TLI = ST.getTargetLowering();
3349
3350 if (TLI->shouldEmitFixup(GV)) {
3351 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3352 MI.eraseFromParent();
3353 return true;
3354 }
3355
3356 if (TLI->shouldEmitPCReloc(GV)) {
3357 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3358 MI.eraseFromParent();
3359 return true;
3360 }
3361
3363 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3364
3365 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
3370 LoadTy, Align(8));
3371
3372 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3373
3374 if (Ty.getSizeInBits() == 32) {
3375 // Truncate if this is a 32-bit constant address.
3376 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3377 B.buildExtract(DstReg, Load, 0);
3378 } else
3379 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3380
3381 MI.eraseFromParent();
3382 return true;
3383}
3384
3386 if (Ty.isVector())
3387 return Ty.changeElementCount(
3388 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3389 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3390}
3391
3393 MachineInstr &MI) const {
3394 MachineIRBuilder &B = Helper.MIRBuilder;
3395 MachineRegisterInfo &MRI = *B.getMRI();
3396 GISelChangeObserver &Observer = Helper.Observer;
3397
3398 Register PtrReg = MI.getOperand(1).getReg();
3399 LLT PtrTy = MRI.getType(PtrReg);
3400 unsigned AddrSpace = PtrTy.getAddressSpace();
3401
3402 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
3404 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3405 Observer.changingInstr(MI);
3406 MI.getOperand(1).setReg(Cast.getReg(0));
3407 Observer.changedInstr(MI);
3408 return true;
3409 }
3410
3411 if (MI.getOpcode() != AMDGPU::G_LOAD)
3412 return false;
3413
3414 Register ValReg = MI.getOperand(0).getReg();
3415 LLT ValTy = MRI.getType(ValReg);
3416
3417 if (hasBufferRsrcWorkaround(ValTy)) {
3418 Observer.changingInstr(MI);
3419 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3420 Observer.changedInstr(MI);
3421 return true;
3422 }
3423
3424 MachineMemOperand *MMO = *MI.memoperands_begin();
3425 const unsigned ValSize = ValTy.getSizeInBits();
3426 const LLT MemTy = MMO->getMemoryType();
3427 const Align MemAlign = MMO->getAlign();
3428 const unsigned MemSize = MemTy.getSizeInBits();
3429 const uint64_t AlignInBits = 8 * MemAlign.value();
3430
3431 // Widen non-power-of-2 loads to the alignment if needed
3432 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3433 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3434
3435 // This was already the correct extending load result type, so just adjust
3436 // the memory type.
3437 if (WideMemSize == ValSize) {
3438 MachineFunction &MF = B.getMF();
3439
3440 MachineMemOperand *WideMMO =
3441 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3442 Observer.changingInstr(MI);
3443 MI.setMemRefs(MF, {WideMMO});
3444 Observer.changedInstr(MI);
3445 return true;
3446 }
3447
3448 // Don't bother handling edge case that should probably never be produced.
3449 if (ValSize > WideMemSize)
3450 return false;
3451
3452 LLT WideTy = widenToNextPowerOf2(ValTy);
3453
3454 Register WideLoad;
3455 if (!WideTy.isVector()) {
3456 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3457 B.buildTrunc(ValReg, WideLoad).getReg(0);
3458 } else {
3459 // Extract the subvector.
3460
3461 if (isRegisterType(ST, ValTy)) {
3462 // If this a case where G_EXTRACT is legal, use it.
3463 // (e.g. <3 x s32> -> <4 x s32>)
3464 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3465 B.buildExtract(ValReg, WideLoad, 0);
3466 } else {
3467 // For cases where the widened type isn't a nice register value, unmerge
3468 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3469 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3470 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3471 }
3472 }
3473
3474 MI.eraseFromParent();
3475 return true;
3476 }
3477
3478 return false;
3479}
3480
3481 bool AMDGPULegalizerInfo::legalizeStore(LegalizerHelper &Helper,
3482 MachineInstr &MI) const {
3483 MachineIRBuilder &B = Helper.MIRBuilder;
3484 MachineRegisterInfo &MRI = *B.getMRI();
3485 GISelChangeObserver &Observer = Helper.Observer;
3486
3487 Register DataReg = MI.getOperand(0).getReg();
3488 LLT DataTy = MRI.getType(DataReg);
3489
3490 if (hasBufferRsrcWorkaround(DataTy)) {
3491 Observer.changingInstr(MI);
3492 castBufferRsrcArgToV4I32(MI, B, 0);
3493 Observer.changedInstr(MI);
3494 return true;
3495 }
3496 return false;
3497}
3498
3499 bool AMDGPULegalizerInfo::legalizeFMad(MachineInstr &MI,
3500 MachineRegisterInfo &MRI,
3501 MachineIRBuilder &B) const {
3502 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3503 assert(Ty.isScalar());
3504
3505 MachineFunction &MF = B.getMF();
3506 const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
3507
3508 // TODO: Always legal with future ftz flag.
3509 // TODO: Type is expected to be LLT::float32()/LLT::float16()
3510 // FIXME: Do we need just output?
3511 if (Ty == LLT::scalar(32) &&
3512 MFI->getMode().FP32Denormals == DenormalMode::getPreserveSign())
3513 return true;
3514 if (Ty == LLT::scalar(16) &&
3515 MFI->getMode().FP64FP16Denormals == DenormalMode::getPreserveSign())
3516 return true;
3517
3518 MachineIRBuilder HelperBuilder(MI);
3519 GISelObserverWrapper DummyObserver;
3520 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3521 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3522}
3523
3524 bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
3525 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
3526 Register DstReg = MI.getOperand(0).getReg();
3527 Register PtrReg = MI.getOperand(1).getReg();
3528 Register CmpVal = MI.getOperand(2).getReg();
3529 Register NewVal = MI.getOperand(3).getReg();
3530
3531 assert(AMDGPU::isFlatGlobalAddrSpace(MRI.getType(PtrReg).getAddressSpace()) &&
3532 "this should not have been custom lowered");
3533
3534 LLT ValTy = MRI.getType(CmpVal);
3535 LLT VecTy = LLT::fixed_vector(2, ValTy);
3536
3537 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3538
3539 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3540 .addDef(DstReg)
3541 .addUse(PtrReg)
3542 .addUse(PackedVal)
3543 .setMemRefs(MI.memoperands());
3544
3545 MI.eraseFromParent();
3546 return true;
3547}
3548
3549/// Return true if it's known that \p Src can never be an f32 denormal value.
3550 static bool valueIsKnownNeverF32Denorm(const MachineRegisterInfo &MRI,
3551 Register Src) {
3552 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3553 switch (DefMI->getOpcode()) {
3554 case TargetOpcode::G_INTRINSIC: {
3555 switch (cast<GIntrinsic>(DefMI)->getIntrinsicID()) {
3556 case Intrinsic::amdgcn_frexp_mant:
3557 case Intrinsic::amdgcn_log:
3558 case Intrinsic::amdgcn_log_clamp:
3559 case Intrinsic::amdgcn_exp2:
3560 case Intrinsic::amdgcn_sqrt:
3561 return true;
3562 default:
3563 break;
3564 }
3565
3566 break;
3567 }
3568 case TargetOpcode::G_FSQRT:
3569 return true;
3570 case TargetOpcode::G_FFREXP: {
3571 if (DefMI->getOperand(0).getReg() == Src)
3572 return true;
3573 break;
3574 }
3575 case TargetOpcode::G_FPEXT: {
3576 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3577 }
3578 default:
3579 return false;
3580 }
3581
3582 return false;
3583}
3584
3585static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3586 return Flags & MachineInstr::FmAfn;
3587}
3588
3589 static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src,
3590 unsigned Flags) {
3591 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3592 MF.getDenormalMode(APFloat::IEEEsingle()).Input !=
3593 DenormalMode::PreserveSign;
3594}
3595
3596 std::pair<Register, Register>
3597 AMDGPULegalizerInfo::getScaledLogInput(MachineIRBuilder &B, Register Src,
3598 unsigned Flags) const {
3599 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3600 return {};
3601
3602 const LLT F32 = LLT::scalar(32);
3603 auto SmallestNormal = B.buildFConstant(
3604 F32, APFloat::getSmallestNormalized(APFloat::IEEEsingle()));
3605 auto IsLtSmallestNormal =
3606 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3607
3608 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3609 auto One = B.buildFConstant(F32, 1.0);
3610 auto ScaleFactor =
3611 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3612 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3613
3614 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3615}
3616
3617 bool AMDGPULegalizerInfo::legalizeFlog2(MachineInstr &MI,
3618 MachineIRBuilder &B) const {
3619 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3620 // If we have to handle denormals, scale up the input and adjust the result.
3621
3622 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3623 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
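// For example, if x is the f32 denormal 0x1.0p-130, then x * 0x1.0p+32 is the
// normal value 0x1.0p-98 and log2(x) = log2(x * 2^32) - 32, which is exactly
// what the select and fsub below compute.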
3624
3625 Register Dst = MI.getOperand(0).getReg();
3626 Register Src = MI.getOperand(1).getReg();
3627 LLT Ty = B.getMRI()->getType(Dst);
3628 unsigned Flags = MI.getFlags();
3629
3630 if (Ty == LLT::scalar(16)) {
3631 const LLT F32 = LLT::scalar(32);
3632 // Nothing in half is a denormal when promoted to f32.
3633 auto Ext = B.buildFPExt(F32, Src, Flags);
3634 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3635 .addUse(Ext.getReg(0))
3636 .setMIFlags(Flags);
3637 B.buildFPTrunc(Dst, Log2, Flags);
3638 MI.eraseFromParent();
3639 return true;
3640 }
3641
3642 assert(Ty == LLT::scalar(32));
3643
3644 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3645 if (!ScaledInput) {
3646 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3647 .addUse(Src)
3648 .setMIFlags(Flags);
3649 MI.eraseFromParent();
3650 return true;
3651 }
3652
3653 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3654 .addUse(ScaledInput)
3655 .setMIFlags(Flags);
3656
3657 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3658 auto Zero = B.buildFConstant(Ty, 0.0);
3659 auto ResultOffset =
3660 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3661 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3662
3663 MI.eraseFromParent();
3664 return true;
3665}
3666
3667 static Register getMad(MachineIRBuilder &B, LLT Ty, Register X, Register Y,
3668 Register Z, unsigned Flags) {
3669 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3670 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3671}
3672
3673 bool AMDGPULegalizerInfo::legalizeFlogCommon(MachineInstr &MI,
3674 MachineIRBuilder &B) const {
3675 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3676 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3677
3678 MachineRegisterInfo &MRI = *B.getMRI();
3679 Register Dst = MI.getOperand(0).getReg();
3680 Register X = MI.getOperand(1).getReg();
3681 unsigned Flags = MI.getFlags();
3682 const LLT Ty = MRI.getType(X);
3683
3684 const LLT F32 = LLT::scalar(32);
3685 const LLT F16 = LLT::scalar(16);
3686
3687 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3688 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3689 // depending on !fpmath metadata.
3690 bool PromoteToF32 =
3691 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3692 if (PromoteToF32) {
3693 Register LogVal = MRI.createGenericVirtualRegister(F32);
3694 auto PromoteSrc = B.buildFPExt(F32, X);
3695 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3696 B.buildFPTrunc(Dst, LogVal);
3697 } else {
3698 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3699 }
3700
3701 MI.eraseFromParent();
3702 return true;
3703 }
3704
3705 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3706 if (ScaledInput)
3707 X = ScaledInput;
3708
3709 auto Y =
3710 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3711
3712 Register R;
3713 if (ST.hasFastFMAF32()) {
3714 // c+cc are ln(2)/ln(10) to more than 49 bits
3715 const float c_log10 = 0x1.344134p-2f;
3716 const float cc_log10 = 0x1.09f79ep-26f;
3717
3718 // c + cc is ln(2) to more than 49 bits
3719 const float c_log = 0x1.62e42ep-1f;
3720 const float cc_log = 0x1.efa39ep-25f;
3721
3722 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3723 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3724 // This adds correction terms for which contraction may lead to an increase
3725 // in the error of the approximation, so disable it.
3726 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3727 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3728 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3729 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3730 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3731 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3732 } else {
3733 // ch+ct is ln(2)/ln(10) to more than 36 bits
3734 const float ch_log10 = 0x1.344000p-2f;
3735 const float ct_log10 = 0x1.3509f6p-18f;
3736
3737 // ch + ct is ln(2) to more than 36 bits
3738 const float ch_log = 0x1.62e000p-1f;
3739 const float ct_log = 0x1.0bfbe8p-15f;
3740
3741 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3742 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3743
3744 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3745 auto YH = B.buildAnd(Ty, Y, MaskConst);
3746 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3747 // This adds correction terms for which contraction may lead to an increase
3748 // in the error of the approximation, so disable it.
3749 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3750 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3751
3752 Register Mad0 =
3753 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3754 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3755 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3756 }
3757
3758 const bool IsFiniteOnly =
3760
3761 if (!IsFiniteOnly) {
3762 // Expand isfinite(x) => fabs(x) < inf
3763 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3764 auto Fabs = B.buildFAbs(Ty, Y);
3765 auto IsFinite =
3766 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3767 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3768 }
3769
3770 if (ScaledInput) {
3771 auto Zero = B.buildFConstant(Ty, 0.0);
3772 auto ShiftK =
3773 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3774 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3775 B.buildFSub(Dst, R, Shift, Flags);
3776 } else {
3777 B.buildCopy(Dst, R);
3778 }
3779
3780 MI.eraseFromParent();
3781 return true;
3782}
3783
3784 bool AMDGPULegalizerInfo::legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst,
3785 Register Src, bool IsLog10,
3786 unsigned Flags) const {
3787 const double Log2BaseInverted =
3788 IsLog10 ? numbers::ln2 / numbers::ln10
3789 : numbers::ln2;
3790 LLT Ty = B.getMRI()->getType(Dst);
3790 LLT Ty = B.getMRI()->getType(Dst);
3791
3792 if (Ty == LLT::scalar(32)) {
3793 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3794 if (ScaledInput) {
3795 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3796 .addUse(Src)
3797 .setMIFlags(Flags);
3798 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3799 auto Zero = B.buildFConstant(Ty, 0.0);
3800 auto ResultOffset =
3801 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3802 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3803
3804 if (ST.hasFastFMAF32())
3805 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3806 else {
3807 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3808 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3809 }
3810
3811 return true;
3812 }
3813 }
3814
3815 auto Log2Operand = Ty == LLT::scalar(16)
3816 ? B.buildFLog2(Ty, Src, Flags)
3817 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3818 .addUse(Src)
3819 .setMIFlags(Flags);
3820 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3821 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3822 return true;
3823}
3824
3825 bool AMDGPULegalizerInfo::legalizeFExp2(MachineInstr &MI,
3826 MachineIRBuilder &B) const {
3827 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3828 // If we have to handle denormals, scale up the input and adjust the result.
3829
3830 Register Dst = MI.getOperand(0).getReg();
3831 Register Src = MI.getOperand(1).getReg();
3832 unsigned Flags = MI.getFlags();
3833 LLT Ty = B.getMRI()->getType(Dst);
3834 const LLT F16 = LLT::scalar(16);
3835 const LLT F32 = LLT::scalar(32);
3836 const LLT F64 = LLT::scalar(64);
3837
3838 if (Ty == F64)
3839 return legalizeFEXPF64(MI, B);
3840
3841 if (Ty == F16) {
3842 // Nothing in half is a denormal when promoted to f32.
3843 auto Ext = B.buildFPExt(F32, Src, Flags);
3844 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3845 .addUse(Ext.getReg(0))
3846 .setMIFlags(Flags);
3847 B.buildFPTrunc(Dst, Log2, Flags);
3848 MI.eraseFromParent();
3849 return true;
3850 }
3851
3852 assert(Ty == F32);
3853
3854 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3855 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3856 .addUse(Src)
3857 .setMIFlags(Flags);
3858 MI.eraseFromParent();
3859 return true;
3860 }
3861
3862 // bool needs_scaling = x < -0x1.f80000p+6f;
3863 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
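// Since exp2(x + 64) == exp2(x) * 2^64, inputs below -0x1.f8p+6 = -126 (where
// exp2(x) would land in the denormal range) are offset by 64 before v_exp_f32
// and the result is rescaled by 2^-64 afterwards, so the intermediate value
// stays normal.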
3864
3865 // -nextafter(128.0, -1)
3866 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3867 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3868 RangeCheckConst, Flags);
3869
3870 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3871 auto Zero = B.buildFConstant(Ty, 0.0);
3872 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3873 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3874
3875 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3876 .addUse(AddInput.getReg(0))
3877 .setMIFlags(Flags);
3878
3879 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3880 auto One = B.buildFConstant(Ty, 1.0);
3881 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3882 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3883 MI.eraseFromParent();
3884 return true;
3885}
3886
3887 static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst,
3888 const SrcOp &Src, unsigned Flags) {
3889 LLT Ty = Dst.getLLTTy(*B.getMRI());
3890
3891 if (Ty == LLT::scalar(32)) {
3892 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3893 .addUse(Src.getReg())
3894 .setMIFlags(Flags);
3895 }
3896 return B.buildFExp2(Dst, Src, Flags);
3897}
3898
3899 bool AMDGPULegalizerInfo::legalizeFExpUnsafeImpl(MachineIRBuilder &B,
3900 Register Dst, Register X,
3901 unsigned Flags,
3902 bool IsExp10) const {
3903 LLT Ty = B.getMRI()->getType(X);
3904
3905 // exp(x) -> exp2(M_LOG2E_F * x);
3906 // exp10(x) -> exp2(log2(10) * x);
3907 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3908 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3909 buildExp(B, Dst, Mul, Flags);
3910 return true;
3911}
3912
3913 bool AMDGPULegalizerInfo::legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst,
3914 Register X, unsigned Flags) const {
3915 LLT Ty = B.getMRI()->getType(Dst);
3916 LLT F32 = LLT::scalar(32);
3917
3918 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3919 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3920 }
3921
3922 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3923 auto NeedsScaling =
3924 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3925 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3926 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3927 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3928
3929 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3930 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3931
3932 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3933 .addUse(ExpInput.getReg(0))
3934 .setMIFlags(Flags);
3935
3936 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3937 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3938 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3939 return true;
3940}
3941
3942 bool AMDGPULegalizerInfo::legalizeFExp10Unsafe(MachineIRBuilder &B,
3943 Register Dst, Register X,
3944 unsigned Flags) const {
3945 LLT Ty = B.getMRI()->getType(Dst);
3946 LLT F32 = LLT::scalar(32);
3947
3948 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3949 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3950 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3951 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3952
3953 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3954 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3955 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3956 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3957 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3958 return true;
3959 }
3960
3961 // bool s = x < -0x1.2f7030p+5f;
3962 // x += s ? 0x1.0p+5f : 0.0f;
3963 // exp10 = exp2(x * 0x1.a92000p+1f) *
3964 // exp2(x * 0x1.4f0978p-11f) *
3965 // (s ? 0x1.9f623ep-107f : 1.0f);
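// Rescale check: exp10(x + 32) == exp10(x) * 10^32, and the final factor
// 0x1.9f623ep-107f is approximately 1.0e-32, which undoes that scaling.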
3966
3967 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3968 auto NeedsScaling =
3969 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3970
3971 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3972 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3973 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3974
3975 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3976 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3977
3978 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3979 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3980 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3981 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3982
3983 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
3984 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
3985 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3986
3987 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3988 return true;
3989}
3990
3991// This expansion gives a result slightly better than 1ulp.
3992 bool AMDGPULegalizerInfo::legalizeFEXPF64(MachineInstr &MI,
3993 MachineIRBuilder &B) const {
3994
3995 Register X = MI.getOperand(1).getReg();
3996 LLT S64 = LLT::scalar(64);
3997 LLT S32 = LLT::scalar(32);
3998 LLT S1 = LLT::scalar(1);
3999
4000 // TODO: Check if reassoc is safe. There is an output change in exp2 and
4001 // exp10, which slightly increases ulp.
4002 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
4003
4004 Register Dn, F, T;
4005
4006 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
4007 // Dn = rint(X)
4008 Dn = B.buildFRint(S64, X, Flags).getReg(0);
4009 // F = X - Dn
4010 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
4011 // T = F*C1 + F*C2
4012 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4013 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4014 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
4015 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
4016
4017 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
4018 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
4019 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4020 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4021
4022 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4023 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
4024 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
4025 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4026 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4027
4028 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
4029 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
4030 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
4031 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
4032
4033 } else { // G_FEXP
4034 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
4035 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
4036 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
4037
4038 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
4039 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
4040 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
4041 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
4042 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
4043 }
4044
4045 // Polynomial chain for P
4046 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
4047 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
4048 Flags);
4049 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
4050 Flags);
4051 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
4052 Flags);
4053 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
4054 Flags);
4055 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
4056 Flags);
4057 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
4058 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
4059 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
4060 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
4061
4062 auto One = B.buildFConstant(S64, 1.0);
4063 P = B.buildFMA(S64, T, P, One, Flags);
4064 P = B.buildFMA(S64, T, P, One, Flags);
4065
4066 // Z = FLDEXP(P, (int)Dn)
4067 auto DnInt = B.buildFPTOSI(S32, Dn);
4068 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
4069
4070 if (!(Flags & MachineInstr::FmNoInfs)) {
4071 // Overflow guard: if X <= 1024.0 then Z else +inf
4072 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
4073 B.buildFConstant(S64, APFloat(1024.0)));
4074 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
4075 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
4076 }
4077
4078 // Underflow guard: if X >= -1075.0 then Z else 0.0
4079 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
4080 B.buildFConstant(S64, APFloat(-1075.0)));
4081 auto Zero = B.buildFConstant(S64, APFloat(0.0));
4082 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
4083
4084 MI.eraseFromParent();
4085 return true;
4086}
4087
4088 bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
4089 MachineIRBuilder &B) const {
4090 Register Dst = MI.getOperand(0).getReg();
4091 Register X = MI.getOperand(1).getReg();
4092 const unsigned Flags = MI.getFlags();
4093 MachineFunction &MF = B.getMF();
4094 MachineRegisterInfo &MRI = *B.getMRI();
4095 LLT Ty = MRI.getType(Dst);
4096
4097 const LLT F64 = LLT::scalar(64);
4098
4099 if (Ty == F64)
4100 return legalizeFEXPF64(MI, B);
4101
4102 const LLT F16 = LLT::scalar(16);
4103 const LLT F32 = LLT::scalar(32);
4104 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
4105
4106 if (Ty == F16) {
4107 // v_exp_f16 (fmul x, log2e)
4108 if (allowApproxFunc(MF, Flags)) {
4109 // TODO: Does this really require fast?
4110 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4111 : legalizeFExpUnsafe(B, Dst, X, Flags);
4112 MI.eraseFromParent();
4113 return true;
4114 }
4115
4116 // Nothing in half is a denormal when promoted to f32.
4117 //
4118 // exp(f16 x) ->
4119 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
4120 //
4121 // exp10(f16 x) ->
4122 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
4123 auto Ext = B.buildFPExt(F32, X, Flags);
4124 Register Lowered = MRI.createGenericVirtualRegister(F32);
4125 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
4126 B.buildFPTrunc(Dst, Lowered, Flags);
4127 MI.eraseFromParent();
4128 return true;
4129 }
4130
4131 assert(Ty == F32);
4132
4133 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
4134 // library behavior. Also, is known-not-daz source sufficient?
4135 if (allowApproxFunc(MF, Flags)) {
4136 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
4137 : legalizeFExpUnsafe(B, Dst, X, Flags);
4138 MI.eraseFromParent();
4139 return true;
4140 }
4141
4142 // Algorithm:
4143 //
4144 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
4145 //
4146 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
4147 // n = 64*m + j, 0 <= j < 64
4148 //
4149 // e^x = 2^((64*m + j + f)/64)
4150 // = (2^m) * (2^(j/64)) * 2^(f/64)
4151 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
4152 //
4153 // f = x*(64/ln(2)) - n
4154 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
4155 //
4156 // e^x = (2^m) * (2^(j/64)) * e^r
4157 //
4158 // (2^(j/64)) is precomputed
4159 //
4160 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4161 // e^r = 1 + q
4162 //
4163 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
4164 //
4165 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
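// In the sequence below, the constant C = log2(e) (log2(10) for exp10) is
// split into a high part and a low correction (c/cc with fast FMA, ch/cl
// otherwise), so the product x*C is carried as PH plus an error term PL.
// The integer part E = roundeven(PH) is peeled off and reapplied with ldexp,
// i.e. exp(x) ~= ldexp(exp2(PH - E + PL), E).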
4166 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
4167 Register PH, PL;
4168
4169 if (ST.hasFastFMAF32()) {
4170 const float c_exp = numbers::log2ef;
4171 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
4172 const float c_exp10 = 0x1.a934f0p+1f;
4173 const float cc_exp10 = 0x1.2f346ep-24f;
4174
4175 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
4176 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
4177 auto NegPH = B.buildFNeg(Ty, PH, Flags);
4178 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
4179
4180 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
4181 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
4182 } else {
4183 const float ch_exp = 0x1.714000p+0f;
4184 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
4185
4186 const float ch_exp10 = 0x1.a92000p+1f;
4187 const float cl_exp10 = 0x1.4f0978p-11f;
4188
4189 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
4190 auto XH = B.buildAnd(Ty, X, MaskConst);
4191 auto XL = B.buildFSub(Ty, X, XH, Flags);
4192
4193 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
4194 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
4195
4196 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
4197 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
4198
4199 Register Mad0 =
4200 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
4201 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
4202 }
4203
4204 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
4205
4206 // It is unsafe to contract this fsub into the PH multiply.
4207 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
4208 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
4209 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
4210
4211 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
4212 .addUse(A.getReg(0))
4213 .setMIFlags(Flags);
4214 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
4215
4216 auto UnderflowCheckConst =
4217 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
4218 auto Zero = B.buildFConstant(Ty, 0.0);
4219 auto Underflow =
4220 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
4221
4222 R = B.buildSelect(Ty, Underflow, Zero, R);
4223
4224 if (!(Flags & MachineInstr::FmNoInfs)) {
4225 auto OverflowCheckConst =
4226 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
4227
4228 auto Overflow =
4229 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
4230 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
4231 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
4232 }
4233
4234 B.buildCopy(Dst, R);
4235 MI.eraseFromParent();
4236 return true;
4237}
4238
4239 bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
4240 MachineIRBuilder &B) const {
4241 Register Dst = MI.getOperand(0).getReg();
4242 Register Src0 = MI.getOperand(1).getReg();
4243 Register Src1 = MI.getOperand(2).getReg();
4244 unsigned Flags = MI.getFlags();
4245 LLT Ty = B.getMRI()->getType(Dst);
4246 const LLT F16 = LLT::scalar(16); // TODO: Expected LLT::float16()
4247 const LLT F32 = LLT::scalar(32); // TODO: Expected LLT::float32()
4248
4249 if (Ty == F32) {
4250 auto Log = B.buildFLog2(F32, Src0, Flags);
4251 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4252 .addUse(Log.getReg(0))
4253 .addUse(Src1)
4254 .setMIFlags(Flags);
4255 B.buildFExp2(Dst, Mul, Flags);
4256 } else if (Ty == F16) {
4257 // There's no f16 fmul_legacy, so we need to convert for it.
4258 auto Log = B.buildFLog2(F16, Src0, Flags);
4259 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4260 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4261 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4262 .addUse(Ext0.getReg(0))
4263 .addUse(Ext1.getReg(0))
4264 .setMIFlags(Flags);
4265 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4266 } else
4267 return false;
4268
4269 MI.eraseFromParent();
4270 return true;
4271}
4272
4273// Find a source register, ignoring any possible source modifiers.
4274 static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
4275 Register ModSrc = OrigSrc;
4276 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4277 ModSrc = SrcFNeg->getOperand(1).getReg();
4278 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4279 ModSrc = SrcFAbs->getOperand(1).getReg();
4280 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4281 ModSrc = SrcFAbs->getOperand(1).getReg();
4282 return ModSrc;
4283}
4284
4285 bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
4286 MachineRegisterInfo &MRI,
4287 MachineIRBuilder &B) const {
4288
4289 const LLT S1 = LLT::scalar(1);
4290 const LLT F64 = LLT::scalar(64); // TODO: Expected float64
4291 Register Dst = MI.getOperand(0).getReg();
4292 Register OrigSrc = MI.getOperand(1).getReg();
4293 unsigned Flags = MI.getFlags();
4294 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4295 "this should not have been custom lowered");
4296
4297 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4298 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4299 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4300 // V_FRACT bug is:
4301 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4302 //
4303 // Convert floor(x) to (x - fract(x))
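// Worked example: for x = -0.5, fract(x) = 0.5 and x - fract(x) = -1.0, which
// is floor(-0.5). The clamp constant 0x3fefffffffffffff is the largest double
// strictly below 1.0, so the min keeps a buggy V_FRACT result of 1.0 from
// leaking into the subtraction.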
4304
4305 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4306 .addUse(OrigSrc)
4307 .setMIFlags(Flags);
4308
4309 // Give source modifier matching some assistance before obscuring a foldable
4310 // pattern.
4311
4312 // TODO: We can avoid the neg on the fract? The input sign to fract
4313 // shouldn't matter?
4314 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4315
4316 auto Const =
4317 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4318
4319 Register Min = MRI.createGenericVirtualRegister(F64);
4320
4321 // We don't need to concern ourselves with the snan handling difference, so
4322 // use the one which will directly select.
4323 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4324 if (MFI->getMode().IEEE)
4325 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4326 else
4327 B.buildFMinNum(Min, Fract, Const, Flags);
4328
4329 Register CorrectedFract = Min;
4330 if (!MI.getFlag(MachineInstr::FmNoNans)) {
4331 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
4332 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4333 }
4334
4335 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4336 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4337
4338 MI.eraseFromParent();
4339 return true;
4340}
4341
4342// Turn an illegal packed v2s16 build vector into bit operations.
4343// TODO: This should probably be a bitcast action in LegalizerHelper.
4344 bool AMDGPULegalizerInfo::legalizeBuildVector(
4345 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
4346 Register Dst = MI.getOperand(0).getReg();
4347 const LLT S32 = LLT::scalar(32);
4348 const LLT S16 = LLT::scalar(16);
4349 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4350
4351 Register Src0 = MI.getOperand(1).getReg();
4352 Register Src1 = MI.getOperand(2).getReg();
4353
4354 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4355 assert(MRI.getType(Src0) == S32);
4356 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4357 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4358 }
4359
4360 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4361 B.buildBitcast(Dst, Merge);
4362
4363 MI.eraseFromParent();
4364 return true;
4365}
4366
4367// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4368//
4369// Source and accumulation registers must all be 32-bits.
4370//
4371// TODO: When the multiply is uniform, we should produce a code sequence
4372// that is better suited to instruction selection on the SALU. Instead of
4373// the outer loop going over parts of the result, the outer loop should go
4374// over parts of one of the factors. This should result in instruction
4375// selection that makes full use of S_ADDC_U32 instructions.
4376 void AMDGPULegalizerInfo::buildMultiply(LegalizerHelper &Helper,
4377 MutableArrayRef<Register> Accum,
4378 ArrayRef<Register> Src0,
4379 ArrayRef<Register> Src1,
4380 bool UsePartialMad64_32,
4381 bool SeparateOddAlignedProducts) const {
4382 // Use (possibly empty) vectors of S1 registers to represent the set of
4383 // carries from one pair of positions to the next.
4384 using Carry = SmallVector<Register, 2>;
4385
4386 MachineIRBuilder &B = Helper.MIRBuilder;
4387 GISelValueTracking &VT = *Helper.getValueTracking();
4388
4389 const LLT S1 = LLT::scalar(1);
4390 const LLT S32 = LLT::scalar(32);
4391 const LLT S64 = LLT::scalar(64);
4392
4393 Register Zero32;
4394 Register Zero64;
4395
4396 auto getZero32 = [&]() -> Register {
4397 if (!Zero32)
4398 Zero32 = B.buildConstant(S32, 0).getReg(0);
4399 return Zero32;
4400 };
4401 auto getZero64 = [&]() -> Register {
4402 if (!Zero64)
4403 Zero64 = B.buildConstant(S64, 0).getReg(0);
4404 return Zero64;
4405 };
4406
4407 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
4408 for (unsigned i = 0; i < Src0.size(); ++i) {
4409 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
4410 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
4411 }
4412
4413 // Merge the given carries into the 32-bit LocalAccum, which is modified
4414 // in-place.
4415 //
4416 // Returns the carry-out, which is a single S1 register or null.
4417 auto mergeCarry =
4418 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
4419 if (CarryIn.empty())
4420 return Register();
4421
4422 bool HaveCarryOut = true;
4423 Register CarryAccum;
4424 if (CarryIn.size() == 1) {
4425 if (!LocalAccum) {
4426 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4427 return Register();
4428 }
4429
4430 CarryAccum = getZero32();
4431 } else {
4432 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
4433 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
4434 CarryAccum =
4435 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
4436 .getReg(0);
4437 }
4438
4439 if (!LocalAccum) {
4440 LocalAccum = getZero32();
4441 HaveCarryOut = false;
4442 }
4443 }
4444
4445 auto Add =
4446 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
4447 LocalAccum = Add.getReg(0);
4448 return HaveCarryOut ? Add.getReg(1) : Register();
4449 };
4450
4451 // Build a multiply-add chain to compute
4452 //
4453 // LocalAccum + (partial products at DstIndex)
4454 // + (opportunistic subset of CarryIn)
4455 //
4456 // LocalAccum is an array of one or two 32-bit registers that are updated
4457 // in-place. The incoming registers may be null.
4458 //
4459 // In some edge cases, carry-ins can be consumed "for free". In that case,
4460 // the consumed carry bits are removed from CarryIn in-place.
4461 auto buildMadChain =
4462 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
4463 -> Carry {
4464 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
4465 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
4466
4467 Carry CarryOut;
4468 unsigned j0 = 0;
4469
4470 // Use plain 32-bit multiplication for the most significant part of the
4471 // result by default.
4472 if (LocalAccum.size() == 1 &&
4473 (!UsePartialMad64_32 || !CarryIn.empty())) {
4474 do {
4475 // Skip multiplication if one of the operands is 0
4476 unsigned j1 = DstIndex - j0;
4477 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4478 ++j0;
4479 continue;
4480 }
4481 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
4482 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
4483 LocalAccum[0] = Mul.getReg(0);
4484 } else {
4485 if (CarryIn.empty()) {
4486 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
4487 } else {
4488 LocalAccum[0] =
4489 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
4490 .getReg(0);
4491 CarryIn.pop_back();
4492 }
4493 }
4494 ++j0;
4495 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
4496 }
4497
4498 // Build full 64-bit multiplies.
4499 if (j0 <= DstIndex) {
4500 bool HaveSmallAccum = false;
4501 Register Tmp;
4502
4503 if (LocalAccum[0]) {
4504 if (LocalAccum.size() == 1) {
4505 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
4506 HaveSmallAccum = true;
4507 } else if (LocalAccum[1]) {
4508 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
4509 HaveSmallAccum = false;
4510 } else {
4511 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
4512 HaveSmallAccum = true;
4513 }
4514 } else {
4515 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
4516 Tmp = getZero64();
4517 HaveSmallAccum = true;
4518 }
4519
4520 do {
4521 unsigned j1 = DstIndex - j0;
4522 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
4523 ++j0;
4524 continue;
4525 }
4526 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
4527 {Src0[j0], Src1[j1], Tmp});
4528 Tmp = Mad.getReg(0);
4529 if (!HaveSmallAccum)
4530 CarryOut.push_back(Mad.getReg(1));
4531 HaveSmallAccum = false;
4532
4533 ++j0;
4534 } while (j0 <= DstIndex);
4535
4536 auto Unmerge = B.buildUnmerge(S32, Tmp);
4537 LocalAccum[0] = Unmerge.getReg(0);
4538 if (LocalAccum.size() > 1)
4539 LocalAccum[1] = Unmerge.getReg(1);
4540 }
4541
4542 return CarryOut;
4543 };
4544
4545 // Outer multiply loop, iterating over destination parts from least
4546 // significant to most significant parts.
4547 //
4548 // The columns of the following diagram correspond to the destination parts
4549 // affected by one iteration of the outer loop (ignoring boundary
4550 // conditions).
4551 //
4552 // Dest index relative to 2 * i: 1 0 -1
4553 // ------
4554 // Carries from previous iteration: e o
4555 // Even-aligned partial product sum: E E .
4556 // Odd-aligned partial product sum: O O
4557 //
4558 // 'o' is OddCarry, 'e' is EvenCarry.
4559 // EE and OO are computed from partial products via buildMadChain and use
4560 // accumulation where possible and appropriate.
4561 //
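// Concretely, the 64-bit partial product Src0[j0] * Src1[j1] contributes its
// low 32 bits to destination part j0 + j1 and its high 32 bits to part
// j0 + j1 + 1, so iteration i accumulates the products with j0 + j1 == 2*i
// (even-aligned) and j0 + j1 == 2*i - 1 (odd-aligned).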
4562 Register SeparateOddCarry;
4563 Carry EvenCarry;
4564 Carry OddCarry;
4565
4566 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
4567 Carry OddCarryIn = std::move(OddCarry);
4568 Carry EvenCarryIn = std::move(EvenCarry);
4569 OddCarry.clear();
4570 EvenCarry.clear();
4571
4572 // Partial products at offset 2 * i.
4573 if (2 * i < Accum.size()) {
4574 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
4575 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
4576 }
4577
4578 // Partial products at offset 2 * i - 1.
4579 if (i > 0) {
4580 if (!SeparateOddAlignedProducts) {
4581 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
4582 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4583 } else {
4584 bool IsHighest = 2 * i >= Accum.size();
4585 Register SeparateOddOut[2];
4586 auto LocalAccum = MutableArrayRef(SeparateOddOut)
4587 .take_front(IsHighest ? 1 : 2);
4588 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
4589
4590 MachineInstr *Lo;
4591
4592 if (i == 1) {
4593 if (!IsHighest)
4594 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
4595 else
4596 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
4597 } else {
4598 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
4599 SeparateOddCarry);
4600 }
4601 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
4602
4603 if (!IsHighest) {
4604 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
4605 Lo->getOperand(1).getReg());
4606 Accum[2 * i] = Hi.getReg(0);
4607 SeparateOddCarry = Hi.getReg(1);
4608 }
4609 }
4610 }
4611
4612 // Add in the carries from the previous iteration
4613 if (i > 0) {
4614 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
4615 EvenCarryIn.push_back(CarryOut);
4616
4617 if (2 * i < Accum.size()) {
4618 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
4619 OddCarry.push_back(CarryOut);
4620 }
4621 }
4622 }
4623}
4624
4625// Custom narrowing of wide multiplies using wide multiply-add instructions.
4626//
4627// TODO: If the multiply is followed by an addition, we should attempt to
4628// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4629 bool AMDGPULegalizerInfo::legalizeMul(LegalizerHelper &Helper,
4630 MachineInstr &MI) const {
4631 assert(ST.hasMad64_32());
4632 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4633
4634 MachineIRBuilder &B = Helper.MIRBuilder;
4635 MachineRegisterInfo &MRI = *B.getMRI();
4636
4637 Register DstReg = MI.getOperand(0).getReg();
4638 Register Src0 = MI.getOperand(1).getReg();
4639 Register Src1 = MI.getOperand(2).getReg();
4640
4641 LLT Ty = MRI.getType(DstReg);
4642 assert(Ty.isScalar());
4643
4644 unsigned Size = Ty.getSizeInBits();
4645 if (ST.hasVMulU64Inst() && Size == 64)
4646 return true;
4647
4648 unsigned NumParts = Size / 32;
4649 assert((Size % 32) == 0);
4650 assert(NumParts >= 2);
4651
4652 // Whether to use MAD_64_32 for partial products whose high half is
4653 // discarded. This avoids some ADD instructions but risks false dependency
4654 // stalls on some subtargets in some cases.
4655 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4656
4657 // Whether to compute odd-aligned partial products separately. This is
4658 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4659 // in an even-aligned VGPR.
4660 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4661
4662 LLT S32 = LLT::scalar(32);
4663 SmallVector<Register, 2> Src0Parts, Src1Parts;
4664 for (unsigned i = 0; i < NumParts; ++i) {
4665 Src0Parts.push_back(MRI.createGenericVirtualRegister(S32));
4666 Src1Parts.push_back(MRI.createGenericVirtualRegister(S32));
4667 }
4668 B.buildUnmerge(Src0Parts, Src0);
4669 B.buildUnmerge(Src1Parts, Src1);
4670
4671 SmallVector<Register, 2> AccumRegs(NumParts);
4672 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4673 SeparateOddAlignedProducts);
4674
4675 B.buildMergeLikeInstr(DstReg, AccumRegs);
4676 MI.eraseFromParent();
4677 return true;
4678}
4679
4680// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4681// ctlz/cttz_zero_poison. This allows us to fix up the result for the zero input
4682// case with a single min instruction instead of a compare+select.
4683 bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
4684 MachineRegisterInfo &MRI,
4685 MachineIRBuilder &B) const {
4686 Register Dst = MI.getOperand(0).getReg();
4687 Register Src = MI.getOperand(1).getReg();
4688 LLT DstTy = MRI.getType(Dst);
4689 LLT SrcTy = MRI.getType(Src);
4690
4691 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4692 ? AMDGPU::G_AMDGPU_FFBH_U32
4693 : AMDGPU::G_AMDGPU_FFBL_B32;
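// The ffbh/ffbl pseudos return -1 (all ones) when no bit is set, so clamping
// with umin against the source bit width yields the ctlz(0)/cttz(0) result
// without a compare + select.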
4694 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4695 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4696
4697 MI.eraseFromParent();
4698 return true;
4699}
4700
4701 bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
4702 MachineRegisterInfo &MRI,
4703 MachineIRBuilder &B) const {
4704 Register Dst = MI.getOperand(0).getReg();
4705 Register Src = MI.getOperand(1).getReg();
4706 LLT SrcTy = MRI.getType(Src);
4707 TypeSize NumBits = SrcTy.getSizeInBits();
4708
4709 assert(NumBits < 32u);
4710
4711 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4712 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4713 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4714 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4715 B.buildTrunc(Dst, Ctlz);
4716 MI.eraseFromParent();
4717 return true;
4718}
4719
4720 bool AMDGPULegalizerInfo::legalizeCTLS(MachineInstr &MI,
4721 MachineRegisterInfo &MRI,
4722 MachineIRBuilder &B) const {
4723 Register Dst = MI.getOperand(0).getReg();
4724 Register Src = MI.getOperand(1).getReg();
4725 LLT SrcTy = MRI.getType(Src);
4726 const LLT S32 = LLT::scalar(32);
4727 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4728 unsigned BitWidth = SrcTy.getSizeInBits();
4729
4730 auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
4731 auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth));
4732 B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
4733 MI.eraseFromParent();
4734 return true;
4735}
4736
4737// Check that this is a G_XOR x, -1
4738static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4739 if (MI.getOpcode() != TargetOpcode::G_XOR)
4740 return false;
4741 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4742 return ConstVal == -1;
4743}
4744
4745// Return the use branch instruction, or null if the usage is invalid.
4746 static MachineInstr *
4747 verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br,
4748 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
4749 Register CondDef = MI.getOperand(0).getReg();
4750 if (!MRI.hasOneNonDBGUse(CondDef))
4751 return nullptr;
4752
4753 MachineBasicBlock *Parent = MI.getParent();
4754 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
4755
4756 if (isNot(MRI, *UseMI)) {
4757 Register NegatedCond = UseMI->getOperand(0).getReg();
4758 if (!MRI.hasOneNonDBGUse(NegatedCond))
4759 return nullptr;
4760
4761 // We're deleting the def of this value, so we need to remove it.
4762 eraseInstr(*UseMI, MRI);
4763
4764 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
4765 Negated = true;
4766 }
4767
4768 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
4769 return nullptr;
4770
4771 // Make sure the cond br is followed by a G_BR, or is the last instruction.
4772 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
4773 if (Next == Parent->end()) {
4774 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
4775 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
4776 return nullptr;
4777 UncondBrTarget = &*NextMBB;
4778 } else {
4779 if (Next->getOpcode() != AMDGPU::G_BR)
4780 return nullptr;
4781 Br = &*Next;
4782 UncondBrTarget = Br->getOperand(0).getMBB();
4783 }
4784
4785 return UseMI;
4786}
4787
4788 void AMDGPULegalizerInfo::buildLoadInputValue(Register DstReg,
4789 MachineIRBuilder &B,
4790 const ArgDescriptor *Arg,
4791 const TargetRegisterClass *ArgRC,
4792 LLT ArgTy) const {
4793 MCRegister SrcReg = Arg->getRegister();
4794 assert(SrcReg.isPhysical() && "Physical register expected");
4795 assert(DstReg.isVirtual() && "Virtual register expected");
4796
4797 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4798 *ArgRC, B.getDebugLoc(), ArgTy);
4799 if (Arg->isMasked()) {
4800 // TODO: Should we try to emit this once in the entry block?
4801 const LLT S32 = LLT::scalar(32);
4802 const unsigned Mask = Arg->getMask();
4803 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4804
4805 Register AndMaskSrc = LiveIn;
4806
4807 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4808 // 0.
4809 if (Shift != 0) {
4810 auto ShiftAmt = B.buildConstant(S32, Shift);
4811 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4812 }
4813
4814 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4815 } else {
4816 B.buildCopy(DstReg, LiveIn);
4817 }
4818}
4819
4824 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
4825 Register DstReg = MI.getOperand(0).getReg();
4826 if (!ST.hasClusters()) {
4827 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
4828 return false;
4829 MI.eraseFromParent();
4830 return true;
4831 }
4832
4833 // Clusters are supported. Return the global position in the grid. If clusters
4834 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
4835
4836 // WorkGroupIdXYZ = ClusterId == 0 ?
4837 // ClusterIdXYZ :
4838 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
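// For example, with ClusterMaxIdXYZ = 3 (4 workgroups per cluster in this
// dimension), ClusterIdXYZ = 2 and ClusterWorkGroupIdXYZ = 1 give a global
// workgroup id of 2 * 4 + 1 = 9.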
4839 MachineRegisterInfo &MRI = *B.getMRI();
4840 const LLT S32 = LLT::scalar(32);
4841 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
4842 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
4843 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
4844 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
4845 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
4846 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
4847 return false;
4848
4849 auto One = B.buildConstant(S32, 1);
4850 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
4851 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
4852 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
4853
4854 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4855
4856 switch (MFI->getClusterDims().getKind()) {
4859 B.buildCopy(DstReg, GlobalIdXYZ);
4860 MI.eraseFromParent();
4861 return true;
4862 }
4864 B.buildCopy(DstReg, ClusterIdXYZ);
4865 MI.eraseFromParent();
4866 return true;
4867 }
4869 using namespace AMDGPU::Hwreg;
4870 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
4871 Register ClusterId = MRI.createGenericVirtualRegister(S32);
4872 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
4873 B.buildInstr(AMDGPU::S_GETREG_B32_const)
4874 .addDef(ClusterId)
4875 .addImm(ClusterIdField);
4876 auto Zero = B.buildConstant(S32, 0);
4877 auto NoClusters =
4878 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
4879 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
4880 MI.eraseFromParent();
4881 return true;
4882 }
4883 }
4884
4885 llvm_unreachable("nothing should reach here");
4886}
4887
4888 bool AMDGPULegalizerInfo::loadInputValue(
4889 Register DstReg, MachineIRBuilder &B,
4890 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
4891 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4892 const ArgDescriptor *Arg = nullptr;
4893 const TargetRegisterClass *ArgRC;
4894 LLT ArgTy;
4895
4896 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
4897 const ArgDescriptor WorkGroupIDX =
4898 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4899 // If GridZ is not programmed in an entry function then the hardware will set
4900 // it to all zeros, so there is no need to mask the GridY value in the low
4901 // order bits.
4902 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4903 AMDGPU::TTMP7,
4904 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
4905 const ArgDescriptor WorkGroupIDZ =
4906 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
4907 const ArgDescriptor ClusterWorkGroupIDX =
4908 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4909 const ArgDescriptor ClusterWorkGroupIDY =
4910 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4911 const ArgDescriptor ClusterWorkGroupIDZ =
4912 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4913 const ArgDescriptor ClusterWorkGroupMaxIDX =
4914 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4915 const ArgDescriptor ClusterWorkGroupMaxIDY =
4916 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4917 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4918 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4919 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4920 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4921
4922 auto LoadConstant = [&](unsigned N) {
4923 B.buildConstant(DstReg, N);
4924 return true;
4925 };
4926
4927 if (ST.hasArchitectedSGPRs() &&
4929 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4930 bool HasFixedDims = ClusterDims.isFixedDims();
4931
4932 switch (ArgType) {
4934 Arg = &WorkGroupIDX;
4935 ArgRC = &AMDGPU::SReg_32RegClass;
4936 ArgTy = LLT::scalar(32);
4937 break;
4939 Arg = &WorkGroupIDY;
4940 ArgRC = &AMDGPU::SReg_32RegClass;
4941 ArgTy = LLT::scalar(32);
4942 break;
4944 Arg = &WorkGroupIDZ;
4945 ArgRC = &AMDGPU::SReg_32RegClass;
4946 ArgTy = LLT::scalar(32);
4947 break;
4949 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4950 return LoadConstant(0);
4951 Arg = &ClusterWorkGroupIDX;
4952 ArgRC = &AMDGPU::SReg_32RegClass;
4953 ArgTy = LLT::scalar(32);
4954 break;
4956 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4957 return LoadConstant(0);
4958 Arg = &ClusterWorkGroupIDY;
4959 ArgRC = &AMDGPU::SReg_32RegClass;
4960 ArgTy = LLT::scalar(32);
4961 break;
4963 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4964 return LoadConstant(0);
4965 Arg = &ClusterWorkGroupIDZ;
4966 ArgRC = &AMDGPU::SReg_32RegClass;
4967 ArgTy = LLT::scalar(32);
4968 break;
4970 if (HasFixedDims)
4971 return LoadConstant(ClusterDims.getDims()[0] - 1);
4972 Arg = &ClusterWorkGroupMaxIDX;
4973 ArgRC = &AMDGPU::SReg_32RegClass;
4974 ArgTy = LLT::scalar(32);
4975 break;
4977 if (HasFixedDims)
4978 return LoadConstant(ClusterDims.getDims()[1] - 1);
4979 Arg = &ClusterWorkGroupMaxIDY;
4980 ArgRC = &AMDGPU::SReg_32RegClass;
4981 ArgTy = LLT::scalar(32);
4982 break;
4984 if (HasFixedDims)
4985 return LoadConstant(ClusterDims.getDims()[2] - 1);
4986 Arg = &ClusterWorkGroupMaxIDZ;
4987 ArgRC = &AMDGPU::SReg_32RegClass;
4988 ArgTy = LLT::scalar(32);
4989 break;
4991 Arg = &ClusterWorkGroupMaxFlatID;
4992 ArgRC = &AMDGPU::SReg_32RegClass;
4993 ArgTy = LLT::scalar(32);
4994 break;
4995 default:
4996 break;
4997 }
4998 }
4999
5000 if (!Arg)
5001 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5002
5003 if (!Arg) {
5004 if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
5005 // The intrinsic may appear when we have a 0 sized kernarg segment, in
5006 // which case the pointer argument may be missing and we use null.
5007 return LoadConstant(0);
5008 }
5009
5010 // It's undefined behavior if a function marked with the amdgpu-no-*
5011 // attributes uses the corresponding intrinsic.
5012 B.buildUndef(DstReg);
5013 return true;
5014 }
5015
5016 if (!Arg->isRegister() || !Arg->getRegister().isValid())
5017 return false; // TODO: Handle these
5018 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
5019 return true;
5020}
5021
5022 bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
5023 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
5024 AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
5026 return false;
5027
5028 MI.eraseFromParent();
5029 return true;
5030}
5031
5032 static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI,
5033 int64_t C) {
5034 B.buildConstant(MI.getOperand(0).getReg(), C);
5035 MI.eraseFromParent();
5036 return true;
5037}
5038
5039 bool AMDGPULegalizerInfo::legalizeWorkitemIDIntrinsic(
5040 MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
5041 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
5042 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
5043 if (MaxID == 0)
5044 return replaceWithConstant(B, MI, 0);
5045
5046 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5047 const ArgDescriptor *Arg;
5048 const TargetRegisterClass *ArgRC;
5049 LLT ArgTy;
5050 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5051
5052 Register DstReg = MI.getOperand(0).getReg();
5053 if (!Arg) {
5054 // It's undefined behavior if a function marked with the amdgpu-no-*
5055 // attributes uses the corresponding intrinsic.
5056 B.buildUndef(DstReg);
5057 MI.eraseFromParent();
5058 return true;
5059 }
5060
5061 if (Arg->isMasked()) {
5062 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5063 // masking operations anyway.
5064 //
5065 // TODO: We could assert the top bit is 0 for the source copy.
5066 if (!loadInputValue(DstReg, B, ArgType))
5067 return false;
5068 } else {
5069 Register TmpReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
5070 if (!loadInputValue(TmpReg, B, ArgType))
5071 return false;
5072 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5073 }
5074
5075 MI.eraseFromParent();
5076 return true;
5077}
5078
5081 // This isn't really a constant pool but close enough.
5084 return PtrInfo;
5085}
5086
5087 Register AMDGPULegalizerInfo::getKernargParameterPtr(MachineIRBuilder &B,
5088 int64_t Offset) const {
5089 LLT PtrTy = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
5090 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5091
5092 // TODO: If we passed in the base kernel offset we could have a better
5093 // alignment than 4, but we don't really need it.
5094 if (!loadInputValue(KernArgReg, B,
5095 AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR))
5096 llvm_unreachable("failed to find kernarg segment ptr");
5097
5098 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5099 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5100}
5101
5102/// Legalize a value that's loaded from kernel arguments. This is only used by
5103/// legacy intrinsics.
5107 Align Alignment) const {
5108 Register DstReg = MI.getOperand(0).getReg();
5109
5110 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5111 "unexpected kernarg parameter type");
5112
5115 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
5118 MI.eraseFromParent();
5119 return true;
5120}
5121
5124 MachineIRBuilder &B) const {
5125 Register Dst = MI.getOperand(0).getReg();
5126 LLT DstTy = MRI.getType(Dst);
5127 LLT S16 = LLT::scalar(16);
5128 LLT S32 = LLT::scalar(32);
5129 LLT S64 = LLT::scalar(64);
5130
5131 if (DstTy == S16)
5132 return legalizeFDIV16(MI, MRI, B);
5133 if (DstTy == S32)
5134 return legalizeFDIV32(MI, MRI, B);
5135 if (DstTy == S64)
5136 return legalizeFDIV64(MI, MRI, B);
5137
5138 return false;
5139}
5140
5141 void AMDGPULegalizerInfo::legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B,
5142 Register DstDivReg,
5143 Register DstRemReg,
5144 Register X,
5145 Register Y) const {
5146 const LLT S1 = LLT::scalar(1);
5147 const LLT S32 = LLT::scalar(32);
5148
5149 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5150 // algorithm used here.
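// In short (pseudo-code paraphrase of the sequence below, rounding details
// omitted):
//   z = (uint32_t)(0x1.fffffep+31f * rcp((float)y)); // ~2^32 / y
//   z += umulh(z, (0u - y) * z);                      // one refinement step
//   q = umulh(x, z); r = x - q * y;
//   if (r >= y) { q += 1; r -= y; }                   // estimate is off by at
//   if (r >= y) { q += 1; r -= y; }                   // most 2, so fix up twice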
5151
5152 // Initial estimate of inv(y).
5153 auto FloatY = B.buildUITOFP(S32, Y);
5154 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
5155 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5156 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5157 auto Z = B.buildFPTOUI(S32, ScaledY);
5158
5159 // One round of UNR (unsigned Newton-Raphson refinement).
5160 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5161 auto NegYZ = B.buildMul(S32, NegY, Z);
5162 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5163
5164 // Quotient/remainder estimate.
5165 auto Q = B.buildUMulH(S32, X, Z);
5166 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5167
5168 // First quotient/remainder refinement.
5169 auto One = B.buildConstant(S32, 1);
5170 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5171 if (DstDivReg)
5172 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5173 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5174
5175 // Second quotient/remainder refinement.
5176 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5177 if (DstDivReg)
5178 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5179
5180 if (DstRemReg)
5181 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5182}
5183
5184// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5185//
5186// Return lo, hi of result
5187//
5188// %cvt.lo = G_UITOFP Val.lo
5189// %cvt.hi = G_UITOFP Val.hi
5190// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5191// %rcp = G_AMDGPU_RCP_IFLAG %mad
5192// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5193// %mul2 = G_FMUL %mul1, 2**(-32)
5194// %trunc = G_INTRINSIC_TRUNC %mul2
5195// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5196// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5197static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5198 Register Val) {
5199 const LLT S32 = LLT::scalar(32);
5200 auto Unmerge = B.buildUnmerge(S32, Val);
5201
5202 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5203 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5204
5205 auto Mad = B.buildFMAD(
5206 S32, CvtHi, // 2**32
5207 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5208
5209 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5210 auto Mul1 = B.buildFMul(
5211 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5212
5213 // 2**(-32)
5214 auto Mul2 = B.buildFMul(
5215 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5216 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5217
5218 // -(2**32)
5219 auto Mad2 = B.buildFMAD(
5220 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5221 Mul1);
5222
5223 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5224 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5225
5226 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5227}
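//
// Numerically, {Lo, Hi} above approximate floor(2^64 / Val) split into 32-bit
// halves; the estimate is deliberately a little low (0x5f7ffffc is a few ULPs
// below 2^64 as a float), presumably so the Newton-Raphson rounds in the
// caller only ever have to adjust upward. E.g. for Val = 10 the combined
// result is a slightly-low approximation of 0x1999999999999999
// (= floor(2^64 / 10)).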
5228
5230 Register DstDivReg,
5231 Register DstRemReg,
5232 Register Numer,
5233 Register Denom) const {
5234 const LLT S32 = LLT::scalar(32);
5235 const LLT S64 = LLT::scalar(64);
5236 const LLT S1 = LLT::scalar(1);
5237 Register RcpLo, RcpHi;
5238
5239 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5240
5241 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5242
5243 auto Zero64 = B.buildConstant(S64, 0);
5244 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5245
5246 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5247 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5248
5249 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5250 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5251 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5252
5253 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5254 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5255 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5256
5257 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5258 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5259 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5260 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5261 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5262
5263 auto Zero32 = B.buildConstant(S32, 0);
5264 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5265 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5266 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5267
5268 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5269 Register NumerLo = UnmergeNumer.getReg(0);
5270 Register NumerHi = UnmergeNumer.getReg(1);
5271
5272 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5273 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5274 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5275 Register Mul3_Lo = UnmergeMul3.getReg(0);
5276 Register Mul3_Hi = UnmergeMul3.getReg(1);
5277 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5278 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5279 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5280 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5281
5282 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5283 Register DenomLo = UnmergeDenom.getReg(0);
5284 Register DenomHi = UnmergeDenom.getReg(1);
5285
5286 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5287 auto C1 = B.buildSExt(S32, CmpHi);
5288
5289 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5290 auto C2 = B.buildSExt(S32, CmpLo);
5291
5292 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5293 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5294
5295 // TODO: Here and below, portions of the code could be enclosed in if/endif
5296 // blocks. Currently control flow is unconditional and we have 4 selects
5297 // after the potential endif to substitute for PHIs.
5298
5299 // if C3 != 0 ...
5300 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5301 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5302 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5303 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5304
5305 auto One64 = B.buildConstant(S64, 1);
5306 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5307
5308 auto C4 =
5309 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5310 auto C5 =
5311 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5312 auto C6 = B.buildSelect(
5313 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5314
5315 // if (C6 != 0)
5316 auto Add4 = B.buildAdd(S64, Add3, One64);
5317 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5318
5319 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5320 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5321 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5322
5323 // endif C6
5324 // endif C3
5325
5326 if (DstDivReg) {
5327 auto Sel1 = B.buildSelect(
5328 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5329 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5330 Sel1, MulHi3);
5331 }
5332
5333 if (DstRemReg) {
5334 auto Sel2 = B.buildSelect(
5335 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5336 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5337 Sel2, Sub1);
5338 }
5339}
5340
5343 MachineIRBuilder &B) const {
5344 Register DstDivReg, DstRemReg;
5345 switch (MI.getOpcode()) {
5346 default:
5347 llvm_unreachable("Unexpected opcode!");
5348 case AMDGPU::G_UDIV: {
5349 DstDivReg = MI.getOperand(0).getReg();
5350 break;
5351 }
5352 case AMDGPU::G_UREM: {
5353 DstRemReg = MI.getOperand(0).getReg();
5354 break;
5355 }
5356 case AMDGPU::G_UDIVREM: {
5357 DstDivReg = MI.getOperand(0).getReg();
5358 DstRemReg = MI.getOperand(1).getReg();
5359 break;
5360 }
5361 }
5362
5363 const LLT S64 = LLT::scalar(64);
5364 const LLT S32 = LLT::scalar(32);
5365 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5366 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5367 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5368 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5369
5370 if (Ty == S32)
5371 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5372 else if (Ty == S64)
5373 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5374 else
5375 return false;
5376
5377 MI.eraseFromParent();
5378 return true;
5379}
5380
5383 MachineIRBuilder &B) const {
5384 const LLT S64 = LLT::scalar(64);
5385 const LLT S32 = LLT::scalar(32);
5386
5387 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5388 if (Ty != S32 && Ty != S64)
5389 return false;
5390
5391 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5392 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5393 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5394
5395 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5396 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5397 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5398
5399 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5400 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5401
5402 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5403 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5404
5405 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5406 switch (MI.getOpcode()) {
5407 default:
5408 llvm_unreachable("Unexpected opcode!");
5409 case AMDGPU::G_SDIV: {
5410 DstDivReg = MI.getOperand(0).getReg();
5411 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5412 break;
5413 }
5414 case AMDGPU::G_SREM: {
5415 DstRemReg = MI.getOperand(0).getReg();
5416 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5417 break;
5418 }
5419 case AMDGPU::G_SDIVREM: {
5420 DstDivReg = MI.getOperand(0).getReg();
5421 DstRemReg = MI.getOperand(1).getReg();
5422 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5423 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5424 break;
5425 }
5426 }
5427
5428 if (Ty == S32)
5429 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5430 else
5431 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5432
5433 if (DstDivReg) {
5434 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5435 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5436 B.buildSub(DstDivReg, SignXor, Sign);
5437 }
5438
5439 if (DstRemReg) {
5440 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5441 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5442 B.buildSub(DstRemReg, SignXor, Sign);
5443 }
5444
5445 MI.eraseFromParent();
5446 return true;
5447}
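//
// The sign handling above is the usual absolute-value trick. A hedged
// host-side sketch for the 32-bit quotient (names hypothetical; the INT_MIN
// edge case is glossed over by doing the fixups in unsigned arithmetic):
//
//   static int32_t sdiv32_ref(int32_t L, int32_t R) {
//     uint32_t Ls = (uint32_t)(L >> 31), Rs = (uint32_t)(R >> 31); // all-ones if negative
//     uint32_t AbsL = ((uint32_t)L + Ls) ^ Ls;                     // conditional negate
//     uint32_t AbsR = ((uint32_t)R + Rs) ^ Rs;
//     uint32_t Q = AbsL / AbsR;              // the unsigned expansion above
//     uint32_t Sign = Ls ^ Rs;               // quotient is negative iff signs differ
//     return (int32_t)((Q ^ Sign) - Sign);   // conditional negate again
//   }
//
// The remainder path is the same except its sign follows the dividend,
// i.e. Sign = Ls.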
5448
5451 MachineIRBuilder &B) const {
5452 Register Res = MI.getOperand(0).getReg();
5453 Register LHS = MI.getOperand(1).getReg();
5454 Register RHS = MI.getOperand(2).getReg();
5455 uint16_t Flags = MI.getFlags();
5456 LLT ResTy = MRI.getType(Res);
5457
5458 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5459
5460 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5461 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5462 return false;
5463
5464 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5465 // the CI documentation have a worst case error of 1 ulp.
5466 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5467 // use them as long as we aren't trying to use denormals.
5468 //
5469 // v_rcp_f16 and v_rsq_f16 DO support denormals, with a worst case error of 0.51 ulp.
5470
5471 // 1 / x -> RCP(x)
5472 if (CLHS->isExactlyValue(1.0)) {
5473 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5474 .addUse(RHS)
5475 .setMIFlags(Flags);
5476
5477 MI.eraseFromParent();
5478 return true;
5479 }
5480
5481 // -1 / x -> RCP( FNEG(x) )
5482 if (CLHS->isExactlyValue(-1.0)) {
5483 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5484 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5485 .addUse(FNeg.getReg(0))
5486 .setMIFlags(Flags);
5487
5488 MI.eraseFromParent();
5489 return true;
5490 }
5491 }
5492
5493 // For f16 require afn or arcp.
5494 // For f32 require afn.
5495 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5496 !MI.getFlag(MachineInstr::FmArcp)))
5497 return false;
5498
5499 // x / y -> x * (1.0 / y)
5500 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5501 .addUse(RHS)
5502 .setMIFlags(Flags);
5503 B.buildFMul(Res, LHS, RCP, Flags);
5504
5505 MI.eraseFromParent();
5506 return true;
5507}
5508
5511 MachineIRBuilder &B) const {
5512 Register Res = MI.getOperand(0).getReg();
5513 Register X = MI.getOperand(1).getReg();
5514 Register Y = MI.getOperand(2).getReg();
5515 uint16_t Flags = MI.getFlags();
5516 LLT ResTy = MRI.getType(Res);
5517
5518 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5519
5520 if (!AllowInaccurateRcp)
5521 return false;
5522
5523 const ConstantFP *CLHS = getConstantFPVRegVal(X, MRI);
5524 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
5525
5526 // Pull out the negation so it folds for free into the source modifiers.
5527 if (IsNegRcp)
5528 X = B.buildFConstant(ResTy, 1.0).getReg(0);
5529
5530 Register NegY = IsNegRcp ? Y : B.buildFNeg(ResTy, Y).getReg(0);
5531 auto One = B.buildFConstant(ResTy, 1.0);
5532
5533 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5534 .addUse(Y)
5535 .setMIFlags(Flags);
5536 if (IsNegRcp)
5537 R = B.buildFNeg(ResTy, R);
5538
5539 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5540 R = B.buildFMA(ResTy, Tmp0, R, R);
5541
5542 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5543 R = B.buildFMA(ResTy, Tmp1, R, R);
5544
5545 // Skip the last 2 correction terms for reciprocal.
5546 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0))) {
5547 B.buildCopy(Res, R);
5548 MI.eraseFromParent();
5549 return true;
5550 }
5551
5552 auto Ret = B.buildFMul(ResTy, X, R);
5553 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5554
5555 B.buildFMA(Res, Tmp2, R, Ret);
5556 MI.eraseFromParent();
5557 return true;
5558}
5559
5562 MachineIRBuilder &B) const {
5563 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5564 return true;
5565
5566 Register Res = MI.getOperand(0).getReg();
5567 Register LHS = MI.getOperand(1).getReg();
5568 Register RHS = MI.getOperand(2).getReg();
5569
5570 uint16_t Flags = MI.getFlags();
5571
5572 LLT S16 = LLT::scalar(16);
5573 LLT S32 = LLT::scalar(32);
5574
5575 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5576 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5577 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5578 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5579 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5580 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = e * rcp + q
5581 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5582 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5583 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5584 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5585 // q16.u = opx(V_CVT_F16_F32, q32.u);
5586 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5587
5588 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5589 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5590 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5591 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5592 .addUse(RHSExt.getReg(0))
5593 .setMIFlags(Flags);
5594 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5595 MachineInstrBuilder Err;
5596 if (ST.hasMadMacF32Insts()) {
5597 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5598 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5599 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5600 } else {
5601 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5602 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5603 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5604 }
5605 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5606 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
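  // (0xff800000 keeps only the sign and exponent bits of the f32 error term,
  // i.e. rounds its magnitude down to a power of two; presumably this is
  // precise enough ahead of the final truncation to f16.)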
5607 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5608 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5609 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5610 .addUse(RDst.getReg(0))
5611 .addUse(RHS)
5612 .addUse(LHS)
5613 .setMIFlags(Flags);
5614
5615 MI.eraseFromParent();
5616 return true;
5617}
5618
5619static constexpr unsigned SPDenormModeBitField =
5621
5622// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5623// to enable denorm mode. When 'Enable' is false, disable denorm mode.
5625 const GCNSubtarget &ST,
5627 // Set SP denorm mode to this value.
5628 unsigned SPDenormMode =
5629 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5630
5631 if (ST.hasDenormModeInst()) {
5632 // Preserve the default FP64/FP16 denorm mode while updating the FP32 mode.
5633 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5634
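    // The 4-bit S_DENORM_MODE immediate packs the FP32 denorm mode in bits
    // 1:0 and the FP64/FP16 denorm mode in bits 3:2.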
5635 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5636 B.buildInstr(AMDGPU::S_DENORM_MODE)
5637 .addImm(NewDenormModeValue);
5638
5639 } else {
5640 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5641 .addImm(SPDenormMode)
5642 .addImm(SPDenormModeBitField);
5643 }
5644}
5645
5648 MachineIRBuilder &B) const {
5649 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5650 return true;
5651
5652 Register Res = MI.getOperand(0).getReg();
5653 Register LHS = MI.getOperand(1).getReg();
5654 Register RHS = MI.getOperand(2).getReg();
5655 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5656 SIModeRegisterDefaults Mode = MFI->getMode();
5657
5658 uint16_t Flags = MI.getFlags();
5659
5660 LLT S32 = LLT::scalar(32);
5661 LLT S1 = LLT::scalar(1);
5662
5663 auto One = B.buildFConstant(S32, 1.0f);
5664
5665 auto DenominatorScaled =
5666 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5667 .addUse(LHS)
5668 .addUse(RHS)
5669 .addImm(0)
5670 .setMIFlags(Flags);
5671 auto NumeratorScaled =
5672 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5673 .addUse(LHS)
5674 .addUse(RHS)
5675 .addImm(1)
5676 .setMIFlags(Flags);
5677
5678 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5679 .addUse(DenominatorScaled.getReg(0))
5680 .setMIFlags(Flags);
5681 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5682
5683 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5684 const bool HasDynamicDenormals =
5685 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5686 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5687
5688 Register SavedSPDenormMode;
5689 if (!PreservesDenormals) {
5690 if (HasDynamicDenormals) {
5691 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5692 B.buildInstr(AMDGPU::S_GETREG_B32)
5693 .addDef(SavedSPDenormMode)
5694 .addImm(SPDenormModeBitField);
5695 }
5696 toggleSPDenormMode(true, B, ST, Mode);
5697 }
5698
5699 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5700 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5701 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5702 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5703 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5704 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5705
5706 if (!PreservesDenormals) {
5707 if (HasDynamicDenormals) {
5708 assert(SavedSPDenormMode);
5709 B.buildInstr(AMDGPU::S_SETREG_B32)
5710 .addReg(SavedSPDenormMode)
5711 .addImm(SPDenormModeBitField);
5712 } else
5713 toggleSPDenormMode(false, B, ST, Mode);
5714 }
5715
5716 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5717 .addUse(Fma4.getReg(0))
5718 .addUse(Fma1.getReg(0))
5719 .addUse(Fma3.getReg(0))
5720 .addUse(NumeratorScaled.getReg(1))
5721 .setMIFlags(Flags);
5722
5723 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5724 .addUse(Fmas.getReg(0))
5725 .addUse(RHS)
5726 .addUse(LHS)
5727 .setMIFlags(Flags);
5728
5729 MI.eraseFromParent();
5730 return true;
5731}
5732
5735 MachineIRBuilder &B) const {
5736 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5737 return true;
5738
5739 Register Res = MI.getOperand(0).getReg();
5740 Register LHS = MI.getOperand(1).getReg();
5741 Register RHS = MI.getOperand(2).getReg();
5742
5743 uint16_t Flags = MI.getFlags();
5744
5745 LLT S64 = LLT::scalar(64);
5746 LLT S1 = LLT::scalar(1);
5747
5748 auto One = B.buildFConstant(S64, 1.0);
5749
5750 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5751 .addUse(LHS)
5752 .addUse(RHS)
5753 .addImm(0)
5754 .setMIFlags(Flags);
5755
5756 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5757
5758 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5759 .addUse(DivScale0.getReg(0))
5760 .setMIFlags(Flags);
5761
5762 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5763 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5764 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5765
5766 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5767 .addUse(LHS)
5768 .addUse(RHS)
5769 .addImm(1)
5770 .setMIFlags(Flags);
5771
5772 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5773 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5774 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5775
5776 Register Scale;
5777 if (!ST.hasUsableDivScaleConditionOutput()) {
5778 // Workaround a hardware bug on SI where the condition output from div_scale
5779 // is not usable.
5780
5781 LLT S32 = LLT::scalar(32);
5782
5783 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5784 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5785 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5786 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5787
5788 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5789 Scale1Unmerge.getReg(1));
5790 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5791 Scale0Unmerge.getReg(1));
5792 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5793 } else {
5794 Scale = DivScale1.getReg(1);
5795 }
5796
5797 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5798 .addUse(Fma4.getReg(0))
5799 .addUse(Fma3.getReg(0))
5800 .addUse(Mul.getReg(0))
5801 .addUse(Scale)
5802 .setMIFlags(Flags);
5803
5804 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5805 .addUse(Fmas.getReg(0))
5806 .addUse(RHS)
5807 .addUse(LHS)
5808 .setMIFlags(Flags);
5809
5810 MI.eraseFromParent();
5811 return true;
5812}
5813
5816 MachineIRBuilder &B) const {
5817 Register Res0 = MI.getOperand(0).getReg();
5818 Register Res1 = MI.getOperand(1).getReg();
5819 Register Val = MI.getOperand(2).getReg();
5820 uint16_t Flags = MI.getFlags();
5821
5822 LLT Ty = MRI.getType(Res0);
5823 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5824
5825 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5826 .addUse(Val)
5827 .setMIFlags(Flags);
5828 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5829 .addUse(Val)
5830 .setMIFlags(Flags);
5831
5832 if (ST.hasFractBug()) {
5833 auto Fabs = B.buildFAbs(Ty, Val);
5834 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5835 auto IsFinite =
5836 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5837 auto Zero = B.buildConstant(InstrExpTy, 0);
5838 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5839 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5840 }
5841
5842 B.buildCopy(Res0, Mant);
5843 B.buildSExtOrTrunc(Res1, Exp);
5844
5845 MI.eraseFromParent();
5846 return true;
5847}
5848
5851 MachineIRBuilder &B) const {
5852 Register Res = MI.getOperand(0).getReg();
5853 Register LHS = MI.getOperand(2).getReg();
5854 Register RHS = MI.getOperand(3).getReg();
5855 uint16_t Flags = MI.getFlags();
5856
5857 LLT S32 = LLT::scalar(32);
5858 LLT S1 = LLT::scalar(1);
5859
5860 auto Abs = B.buildFAbs(S32, RHS, Flags);
5861 const APFloat C0Val(1.0f);
5862
5863 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5864 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5865 auto C2 = B.buildFConstant(S32, 1.0f);
5866
5867 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5868 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5869
5870 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5871
5872 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5873 .addUse(Mul0.getReg(0))
5874 .setMIFlags(Flags);
5875
5876 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5877
5878 B.buildFMul(Res, Sel, Mul1, Flags);
5879
5880 MI.eraseFromParent();
5881 return true;
5882}
5883
5886 MachineIRBuilder &B) const {
5887 // Bypass the correct expansion that a standard promotion through G_FSQRT
5888 // would get. The f32 op is accurate enough for the f16 case.
5889 unsigned Flags = MI.getFlags();
5890 assert(!ST.has16BitInsts());
5891 const LLT F32 = LLT::scalar(32);
5892 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5893 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5894 .addUse(Ext.getReg(0))
5895 .setMIFlags(Flags);
5896 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5897 MI.eraseFromParent();
5898 return true;
5899}
5900
5903 MachineIRBuilder &B) const {
5904 MachineFunction &MF = B.getMF();
5905 Register Dst = MI.getOperand(0).getReg();
5906 Register X = MI.getOperand(1).getReg();
5907 const unsigned Flags = MI.getFlags();
5908 const LLT S1 = LLT::scalar(1);
5909 const LLT F32 = LLT::scalar(32);
5910 const LLT I32 = LLT::scalar(32);
5911
5912 if (allowApproxFunc(MF, Flags)) {
5913 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5914 .addUse(X)
5915 .setMIFlags(Flags);
5916 MI.eraseFromParent();
5917 return true;
5918 }
5919
5920 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5921 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5922 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5923 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5924 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
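  // Scaling note: sqrt(x * 2^32) == sqrt(x) * 2^16, so the 2^-16 multiply at
  // the end undoes the scale-up applied here for small inputs.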
5925
5927 if (needsDenormHandlingF32(MF, X, Flags)) {
5928 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5929 .addUse(SqrtX.getReg(0))
5930 .setMIFlags(Flags);
5931
5932 auto NegOne = B.buildConstant(I32, -1);
5933 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5934
5935 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5936 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5937
5938 auto PosOne = B.buildConstant(I32, 1);
5939 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5940
5941 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5942 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5943
5944 auto Zero = B.buildFConstant(F32, 0.0f);
5945 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5946
5947 SqrtS =
5948 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5949
5950 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5951 SqrtS =
5952 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5953 } else {
5954 auto SqrtR =
5955 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5956 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5957
5958 auto Half = B.buildFConstant(F32, 0.5f);
5959 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5960 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5961 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5962 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5963 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5964 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5965 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5966 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5967 }
5968
5969 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5970
5971 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5972
5973 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5974
5975 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5976 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5977
5978 MI.eraseFromParent();
5979 return true;
5980}
5981
5984 MachineIRBuilder &B) const {
5985 // For double type, the SQRT and RSQ instructions don't have the required
5986 // precision, so we apply Goldschmidt's algorithm to improve the result:
5987 //
5988 // y0 = rsq(x)
5989 // g0 = x * y0
5990 // h0 = 0.5 * y0
5991 //
5992 // r0 = 0.5 - h0 * g0
5993 // g1 = g0 * r0 + g0
5994 // h1 = h0 * r0 + h0
5995 //
5996 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5997 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5998 // h2 = h1 * r1 + h1
5999 //
6000 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
6001 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
6002 //
6003 // sqrt(x) = g3
6004
6005 const LLT S1 = LLT::scalar(1);
6006 const LLT S32 = LLT::scalar(32);
6007 const LLT F64 = LLT::scalar(64);
6008
6009 Register Dst = MI.getOperand(0).getReg();
6010 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
6011
6012 Register X = MI.getOperand(1).getReg();
6013 unsigned Flags = MI.getFlags();
6014
6015 Register SqrtX = X;
6016 Register Scaling, ZeroInt;
6017 if (!MI.getFlag(MachineInstr::FmAfn)) {
6018 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
6019
6020 ZeroInt = B.buildConstant(S32, 0).getReg(0);
6021 Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0);
6022
6023 // Scale up input if it is too small.
6024 auto ScaleUpFactor = B.buildConstant(S32, 256);
6025 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
6026 SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
6027 }
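  // Scaling note: sqrt(x * 2^256) == sqrt(x) * 2^128, so the ldexp by -128
  // below restores the unscaled result.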
6028
6029 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
6030
6031 auto Half = B.buildFConstant(F64, 0.5);
6032 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
6033 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
6034
6035 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
6036 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
6037
6038 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
6039 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
6040
6041 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
6042 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6043
6044 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6045
6046 Register SqrtRet = SqrtS2.getReg(0);
6047 if (!MI.getFlag(MachineInstr::FmAfn)) {
6048 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6049 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6050 auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6051
6052 // Scale down the result.
6053 auto ScaleDownFactor = B.buildConstant(S32, -128);
6054 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6055 SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6056 }
6057
6058 Register IsZeroOrInf;
6059 if (MI.getFlag(MachineInstr::FmNoInfs)) {
6060 auto ZeroFP = B.buildFConstant(F64, 0.0);
6061 IsZeroOrInf = B.buildFCmp(FCmpInst::FCMP_OEQ, S1, SqrtX, ZeroFP).getReg(0);
6062 } else {
6063 IsZeroOrInf = B.buildIsFPClass(S1, SqrtX, fcZero | fcPosInf).getReg(0);
6064 }
6065
6066 // TODO: Check for DAZ and expand to subnormals
6067
6068 // If x is +INF, +0, or -0, use its original value
6069 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6070
6071 MI.eraseFromParent();
6072 return true;
6073}
6074
6077 MachineIRBuilder &B) const {
6078 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
6079 if (Ty == LLT::scalar(32))
6080 return legalizeFSQRTF32(MI, MRI, B);
6081 if (Ty == LLT::scalar(64))
6082 return legalizeFSQRTF64(MI, MRI, B);
6083 if (Ty == LLT::scalar(16))
6084 return legalizeFSQRTF16(MI, MRI, B);
6085 return false;
6086}
6087
6088// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6089// FIXME: Why do we handle this one but not other removed instructions?
6090//
6091// Reciprocal square root. The clamp prevents infinite results, clamping
6092// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6093// +-max_float.
6096 MachineIRBuilder &B) const {
6097 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6098 return true;
6099
6100 Register Dst = MI.getOperand(0).getReg();
6101 Register Src = MI.getOperand(2).getReg();
6102 auto Flags = MI.getFlags();
6103
6104 LLT Ty = MRI.getType(Dst);
6105
6106 const fltSemantics *FltSemantics;
6107 if (Ty == LLT::scalar(32))
6108 FltSemantics = &APFloat::IEEEsingle();
6109 else if (Ty == LLT::scalar(64))
6110 FltSemantics = &APFloat::IEEEdouble();
6111 else
6112 return false;
6113
6114 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6115 .addUse(Src)
6116 .setMIFlags(Flags);
6117
6118 // We don't need to concern ourselves with the snan handling difference, since
6119 // the rsq has already quieted it (or not); use the variant which will directly select.
6120 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6121 const bool UseIEEE = MFI->getMode().IEEE;
6122
6123 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
6124 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6125 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6126
6127 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
6128
6129 if (UseIEEE)
6130 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6131 else
6132 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6133 MI.eraseFromParent();
6134 return true;
6135}
6136
6137// TODO: Fix pointer type handling
6140 Intrinsic::ID IID) const {
6141
6142 MachineIRBuilder &B = Helper.MIRBuilder;
6143 MachineRegisterInfo &MRI = *B.getMRI();
6144
6145 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6146 IID == Intrinsic::amdgcn_permlanex16;
6147 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6148 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6149 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
6150 IID == Intrinsic::amdgcn_permlane_up ||
6151 IID == Intrinsic::amdgcn_permlane_down ||
6152 IID == Intrinsic::amdgcn_permlane_xor;
6153
6154 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6155 Register Src2, LLT VT) -> Register {
6156 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6157 switch (IID) {
6158 case Intrinsic::amdgcn_readfirstlane:
6159 case Intrinsic::amdgcn_permlane64:
6160 return LaneOp.getReg(0);
6161 case Intrinsic::amdgcn_readlane:
6162 case Intrinsic::amdgcn_set_inactive:
6163 case Intrinsic::amdgcn_set_inactive_chain_arg:
6164 return LaneOp.addUse(Src1).getReg(0);
6165 case Intrinsic::amdgcn_writelane:
6166 case Intrinsic::amdgcn_permlane_bcast:
6167 case Intrinsic::amdgcn_permlane_up:
6168 case Intrinsic::amdgcn_permlane_down:
6169 case Intrinsic::amdgcn_permlane_xor:
6170 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6171 case Intrinsic::amdgcn_permlane16:
6172 case Intrinsic::amdgcn_permlanex16: {
6173 Register Src3 = MI.getOperand(5).getReg();
6174 int64_t Src4 = MI.getOperand(6).getImm();
6175 int64_t Src5 = MI.getOperand(7).getImm();
6176 return LaneOp.addUse(Src1)
6177 .addUse(Src2)
6178 .addUse(Src3)
6179 .addImm(Src4)
6180 .addImm(Src5)
6181 .getReg(0);
6182 }
6183 case Intrinsic::amdgcn_mov_dpp8:
6184 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6185 case Intrinsic::amdgcn_update_dpp:
6186 return LaneOp.addUse(Src1)
6187 .addImm(MI.getOperand(4).getImm())
6188 .addImm(MI.getOperand(5).getImm())
6189 .addImm(MI.getOperand(6).getImm())
6190 .addImm(MI.getOperand(7).getImm())
6191 .getReg(0);
6192 default:
6193 llvm_unreachable("unhandled lane op");
6194 }
6195 };
6196
6197 Register DstReg = MI.getOperand(0).getReg();
6198 Register Src0 = MI.getOperand(2).getReg();
6199 Register Src1, Src2;
6200 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6201 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
6202 IsPermlaneShuffle) {
6203 Src1 = MI.getOperand(3).getReg();
6204 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16 ||
6205 IsPermlaneShuffle) {
6206 Src2 = MI.getOperand(4).getReg();
6207 }
6208 }
6209
6210 LLT Ty = MRI.getType(DstReg);
6211 unsigned Size = Ty.getSizeInBits();
6212
6213 unsigned SplitSize = 32;
6214 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6215 ST.hasDPALU_DPP() &&
6216 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6217 SplitSize = 64;
6218
6219 if (Size == SplitSize) {
6220 // Already legal
6221 return true;
6222 }
6223
6224 if (Size < 32) {
6225 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6226
6227 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6228 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6229
6230 if (IID == Intrinsic::amdgcn_writelane)
6231 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6232
6233 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6234 B.buildTrunc(DstReg, LaneOpDst);
6235 MI.eraseFromParent();
6236 return true;
6237 }
6238
6239 if (Size % SplitSize != 0)
6240 return false;
6241
6242 LLT PartialResTy = LLT::scalar(SplitSize);
6243 bool NeedsBitcast = false;
6244 if (Ty.isVector()) {
6245 LLT EltTy = Ty.getElementType();
6246 unsigned EltSize = EltTy.getSizeInBits();
6247 if (EltSize == SplitSize) {
6248 PartialResTy = EltTy;
6249 } else if (EltSize == 16 || EltSize == 32) {
6250 unsigned NElem = SplitSize / EltSize;
6251 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6252 } else {
6253 // Handle all other cases via S32/S64 pieces
6254 NeedsBitcast = true;
6255 }
6256 }
6257
6258 SmallVector<Register, 4> PartialRes;
6259 unsigned NumParts = Size / SplitSize;
6260 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6261 MachineInstrBuilder Src1Parts, Src2Parts;
6262
6263 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6264 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6265
6266 if (IID == Intrinsic::amdgcn_writelane)
6267 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6268
6269 for (unsigned i = 0; i < NumParts; ++i) {
6270 Src0 = Src0Parts.getReg(i);
6271
6272 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6273 Src1 = Src1Parts.getReg(i);
6274
6275 if (IID == Intrinsic::amdgcn_writelane)
6276 Src2 = Src2Parts.getReg(i);
6277
6278 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6279 }
6280
6281 if (NeedsBitcast)
6282 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6283 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6284 else
6285 B.buildMergeLikeInstr(DstReg, PartialRes);
6286
6287 MI.eraseFromParent();
6288 return true;
6289}
6290
6293 MachineIRBuilder &B) const {
6295 ST.getTargetLowering()->getImplicitParameterOffset(
6297 LLT DstTy = MRI.getType(DstReg);
6298 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6299
6300 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6301 if (!loadInputValue(KernargPtrReg, B,
6303 return false;
6304
6305 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6306 B.buildConstant(IdxTy, Offset).getReg(0));
6307 return true;
6308}
6309
6310/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6311/// bits of the pointer and replace them with the stride argument, then
6312/// merge_values everything together. In the common case of a raw buffer (the
6313/// stride component is 0), we can just AND off the upper half.
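//
// As a rough sketch of what the non-GFX1250 path below produces, the four
// result dwords are laid out approximately as:
//   { base[31:0], (stride << 16) | base[47:32], num_records, flags }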
6316 Register Result = MI.getOperand(0).getReg();
6317 Register Pointer = MI.getOperand(2).getReg();
6318 Register Stride = MI.getOperand(3).getReg();
6319 Register NumRecords = MI.getOperand(4).getReg();
6320 Register Flags = MI.getOperand(5).getReg();
6321
6322 LLT S32 = LLT::scalar(32);
6323 LLT S64 = LLT::scalar(64);
6324
6325 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6326
6327 auto ExtStride = B.buildAnyExt(S32, Stride);
6328
6329 if (ST.has45BitNumRecordsBufferResource()) {
6330 Register Zero = B.buildConstant(S32, 0).getReg(0);
6331 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6332 // num_records.
6333 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6334 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6335 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6336 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6337 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6338
6339 // Build the upper 64-bit value, which has the upper 38 bits of num_records,
6340 // 6 zero bits (omitted), the 16-bit stride/scale field, and the 4-bit flags.
6341 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6342 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6343 auto ExtShiftedStride =
6344 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6345 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6346 auto ExtShiftedFlags =
6347 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6348 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6349 Register HighHalf =
6350 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6351 B.buildMergeValues(Result, {LowHalf, HighHalf});
6352 } else {
6353 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6354 auto Unmerge = B.buildUnmerge(S32, Pointer);
6355 auto LowHalf = Unmerge.getReg(0);
6356 auto HighHalf = Unmerge.getReg(1);
6357
6358 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6359 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6360 auto ShiftConst = B.buildConstant(S32, 16);
6361 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6362 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6363 Register NewHighHalfReg = NewHighHalf.getReg(0);
6364 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6365 }
6366
6367 MI.eraseFromParent();
6368 return true;
6369}
6370
6373 MachineIRBuilder &B) const {
6374 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6375 if (!MFI->isEntryFunction()) {
6376 return legalizePreloadedArgIntrin(MI, MRI, B,
6378 }
6379
6380 Register DstReg = MI.getOperand(0).getReg();
6381 if (!getImplicitArgPtr(DstReg, MRI, B))
6382 return false;
6383
6384 MI.eraseFromParent();
6385 return true;
6386}
6387
6390 MachineIRBuilder &B) const {
6391 Function &F = B.getMF().getFunction();
6392 std::optional<uint32_t> KnownSize =
6394 if (KnownSize.has_value())
6395 B.buildConstant(DstReg, *KnownSize);
6396 return false;
6397}
6398
6401 MachineIRBuilder &B) const {
6402
6403 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6404 if (!MFI->isEntryFunction()) {
6405 return legalizePreloadedArgIntrin(MI, MRI, B,
6407 }
6408
6409 Register DstReg = MI.getOperand(0).getReg();
6410 if (!getLDSKernelId(DstReg, MRI, B))
6411 return false;
6412
6413 MI.eraseFromParent();
6414 return true;
6415}
6416
6420 unsigned AddrSpace) const {
6421 const LLT S32 = LLT::scalar(32);
6422 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6423 Register Hi32 = Unmerge.getReg(1);
6424
6425 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6426 ST.hasGloballyAddressableScratch()) {
6427 Register FlatScratchBaseHi =
6428 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6429 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6430 .getReg(0);
6431 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6432 // Test bits 63..58 against the aperture address.
6433 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6434 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6435 B.buildConstant(S32, 1u << 26));
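    // The unsigned compare against 1 << 26 succeeds exactly when the top six
    // bits of the XOR are zero, i.e. when bits 63..58 of the address match the
    // flat scratch base.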
6436 } else {
6437 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6438 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6439 }
6440 MI.eraseFromParent();
6441 return true;
6442}
6443
6444// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6445// offset (the offset that is included in bounds checking and swizzling, to be
6446// split between the instruction's voffset and immoffset fields) and soffset
6447// (the offset that is excluded from bounds checking and swizzling, to go in
6448// the instruction's soffset field). This function takes the first kind of
6449// offset and figures out how to split it between voffset and immoffset.
6450std::pair<Register, unsigned>
6452 Register OrigOffset) const {
6453 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6454 Register BaseReg;
6455 unsigned ImmOffset;
6456 const LLT S32 = LLT::scalar(32);
6457 MachineRegisterInfo &MRI = *B.getMRI();
6458
6459 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6460 // being added, so we can only safely match a 32-bit addition with no unsigned
6461 // overflow.
6462 bool CheckNUW = ST.hasGFX1250Insts();
6463 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6464 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6465
6466 // If BaseReg is a pointer, convert it to int.
6467 if (MRI.getType(BaseReg).isPointer())
6468 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6469
6470 // If the immediate value is too big for the immoffset field, put only bits
6471 // that would normally fit in the immoffset field. The remaining value that
6472 // is copied/added for the voffset field is a large power of 2, and it
6473 // stands more chance of being CSEd with the copy/add for another similar
6474 // load/store.
6475 // However, do not do that rounding down if that is a negative
6476 // number, as it appears to be illegal to have a negative offset in the
6477 // vgpr, even if adding the immediate offset makes it positive.
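  //
  // Worked example (assuming a subtarget where the max MUBUF immediate is
  // 4095): a constant offset of 4100 is split into ImmOffset = 4 and an extra
  // add of 4096 into the voffset register.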
6478 unsigned Overflow = ImmOffset & ~MaxImm;
6479 ImmOffset -= Overflow;
6480 if ((int32_t)Overflow < 0) {
6481 Overflow += ImmOffset;
6482 ImmOffset = 0;
6483 }
6484
6485 if (Overflow != 0) {
6486 if (!BaseReg) {
6487 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6488 } else {
6489 auto OverflowVal = B.buildConstant(S32, Overflow);
6490 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6491 }
6492 }
6493
6494 if (!BaseReg)
6495 BaseReg = B.buildConstant(S32, 0).getReg(0);
6496
6497 return std::pair(BaseReg, ImmOffset);
6498}
6499
6500/// Handle register layout difference for f16 images for some subtargets.
6503 Register Reg,
6504 bool ImageStore) const {
6505 const LLT S16 = LLT::scalar(16);
6506 const LLT S32 = LLT::scalar(32);
6507 LLT StoreVT = MRI.getType(Reg);
6508 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6509
6510 if (ST.hasUnpackedD16VMem()) {
6511 auto Unmerge = B.buildUnmerge(S16, Reg);
6512
6513 SmallVector<Register, 4> WideRegs;
6514 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6515 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6516
6517 int NumElts = StoreVT.getNumElements();
6518
6519 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6520 .getReg(0);
6521 }
6522
6523 if (ImageStore && ST.hasImageStoreD16Bug()) {
6524 if (StoreVT.getNumElements() == 2) {
6525 SmallVector<Register, 4> PackedRegs;
6526 Reg = B.buildBitcast(S32, Reg).getReg(0);
6527 PackedRegs.push_back(Reg);
6528 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6529 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6530 .getReg(0);
6531 }
6532
6533 if (StoreVT.getNumElements() == 3) {
6534 SmallVector<Register, 4> PackedRegs;
6535 auto Unmerge = B.buildUnmerge(S16, Reg);
6536 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6537 PackedRegs.push_back(Unmerge.getReg(I));
6538 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6539 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6540 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6541 }
6542
6543 if (StoreVT.getNumElements() == 4) {
6544 SmallVector<Register, 4> PackedRegs;
6545 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6546 auto Unmerge = B.buildUnmerge(S32, Reg);
6547 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6548 PackedRegs.push_back(Unmerge.getReg(I));
6549 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6550 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6551 .getReg(0);
6552 }
6553
6554 llvm_unreachable("invalid data type");
6555 }
6556
6557 if (StoreVT == LLT::fixed_vector(3, S16)) {
6558 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6559 .getReg(0);
6560 }
6561 return Reg;
6562}
6563
6565 Register VData, LLT MemTy,
6566 bool IsFormat) const {
6567 MachineRegisterInfo *MRI = B.getMRI();
6568 LLT Ty = MRI->getType(VData);
6569
6570 const LLT S16 = LLT::scalar(16);
6571
6572 // Fix up stores of buffer resources themselves, which need to be handled as v4i32.
6574 return castBufferRsrcToV4I32(VData, B);
6575
6576 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6577 Ty = getBitcastRegisterType(Ty);
6578 VData = B.buildBitcast(Ty, VData).getReg(0);
6579 }
6580 // Fixup illegal register types for i8 stores.
6581 if (Ty == LLT::scalar(8) || Ty == S16) {
6582 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6583 return AnyExt;
6584 }
6585
6586 if (Ty.isVector()) {
6587 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6588 if (IsFormat)
6589 return handleD16VData(B, *MRI, VData);
6590 }
6591 }
6592
6593 return VData;
6594}
6595
6597 LegalizerHelper &Helper,
6598 bool IsTyped,
6599 bool IsFormat) const {
6600 MachineIRBuilder &B = Helper.MIRBuilder;
6601 MachineRegisterInfo &MRI = *B.getMRI();
6602
6603 Register VData = MI.getOperand(1).getReg();
6604 LLT Ty = MRI.getType(VData);
6605 LLT EltTy = Ty.getScalarType();
6606 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6607 const LLT S32 = LLT::scalar(32);
6608
6609 MachineMemOperand *MMO = *MI.memoperands_begin();
6610 const int MemSize = MMO->getSize().getValue();
6611 LLT MemTy = MMO->getMemoryType();
6612
6613 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6614
6616 Register RSrc = MI.getOperand(2).getReg();
6617
6618 unsigned ImmOffset;
6619
6620 // The typed intrinsics add an immediate after the registers.
6621 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6622
6623 // The struct intrinsic variants add one additional operand over raw.
6624 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6625 Register VIndex;
6626 int OpOffset = 0;
6627 if (HasVIndex) {
6628 VIndex = MI.getOperand(3).getReg();
6629 OpOffset = 1;
6630 } else {
6631 VIndex = B.buildConstant(S32, 0).getReg(0);
6632 }
6633
6634 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6635 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6636
6637 unsigned Format = 0;
6638 if (IsTyped) {
6639 Format = MI.getOperand(5 + OpOffset).getImm();
6640 ++OpOffset;
6641 }
6642
6643 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6644
6645 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6646
6647 unsigned Opc;
6648 if (IsTyped) {
6649 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6650 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6651 } else if (IsFormat) {
6652 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6653 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6654 } else {
6655 switch (MemSize) {
6656 case 1:
6657 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6658 break;
6659 case 2:
6660 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6661 break;
6662 default:
6663 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6664 break;
6665 }
6666 }
6667
6668 auto MIB = B.buildInstr(Opc)
6669 .addUse(VData) // vdata
6670 .addUse(RSrc) // rsrc
6671 .addUse(VIndex) // vindex
6672 .addUse(VOffset) // voffset
6673 .addUse(SOffset) // soffset
6674 .addImm(ImmOffset); // offset(imm)
6675
6676 if (IsTyped)
6677 MIB.addImm(Format);
6678
6679 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6680 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6681 .addMemOperand(MMO);
6682
6683 MI.eraseFromParent();
6684 return true;
6685}
6686
6687static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6688 Register VIndex, Register VOffset, Register SOffset,
6689 unsigned ImmOffset, unsigned Format,
6690 unsigned AuxiliaryData, MachineMemOperand *MMO,
6691 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6692 auto MIB = B.buildInstr(Opc)
6693 .addDef(LoadDstReg) // vdata
6694 .addUse(RSrc) // rsrc
6695 .addUse(VIndex) // vindex
6696 .addUse(VOffset) // voffset
6697 .addUse(SOffset) // soffset
6698 .addImm(ImmOffset); // offset(imm)
6699
6700 if (IsTyped)
6701 MIB.addImm(Format);
6702
6703 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6704 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6705 .addMemOperand(MMO);
6706}
6707
6709 LegalizerHelper &Helper,
6710 bool IsFormat,
6711 bool IsTyped) const {
6712 MachineIRBuilder &B = Helper.MIRBuilder;
6713 MachineRegisterInfo &MRI = *B.getMRI();
6714 GISelChangeObserver &Observer = Helper.Observer;
6715
6716 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6717 MachineMemOperand *MMO = *MI.memoperands_begin();
6718 const LLT MemTy = MMO->getMemoryType();
6719 const LLT S32 = LLT::scalar(32);
6720
6721 Register Dst = MI.getOperand(0).getReg();
6722
6723 Register StatusDst;
6724 int OpOffset = 0;
6725 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6726 bool IsTFE = MI.getNumExplicitDefs() == 2;
6727 if (IsTFE) {
6728 StatusDst = MI.getOperand(1).getReg();
6729 ++OpOffset;
6730 }
6731
6732 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6733 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6734
6735 // The typed intrinsics add an immediate after the registers.
6736 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6737
6738 // The struct intrinsic variants add one additional operand over raw.
6739 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6740 Register VIndex;
6741 if (HasVIndex) {
6742 VIndex = MI.getOperand(3 + OpOffset).getReg();
6743 ++OpOffset;
6744 } else {
6745 VIndex = B.buildConstant(S32, 0).getReg(0);
6746 }
6747
6748 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6749 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6750
6751 unsigned Format = 0;
6752 if (IsTyped) {
6753 Format = MI.getOperand(5 + OpOffset).getImm();
6754 ++OpOffset;
6755 }
6756
6757 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6758 unsigned ImmOffset;
6759
6760 LLT Ty = MRI.getType(Dst);
6761 // Make addrspace(8) pointer loads into 4 x s32 loads here, so the rest of the
6762 // logic doesn't have to handle that case.
6763 if (hasBufferRsrcWorkaround(Ty)) {
6764 Observer.changingInstr(MI);
6765 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6766 Observer.changedInstr(MI);
6767 Dst = MI.getOperand(0).getReg();
6768 B.setInsertPt(B.getMBB(), MI);
6769 }
6770 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6771 Ty = getBitcastRegisterType(Ty);
6772 Observer.changingInstr(MI);
6773 Helper.bitcastDst(MI, Ty, 0);
6774 Observer.changedInstr(MI);
6775 Dst = MI.getOperand(0).getReg();
6776 B.setInsertPt(B.getMBB(), MI);
6777 }
6778
6779 LLT EltTy = Ty.getScalarType();
6780 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6781 const bool Unpacked = ST.hasUnpackedD16VMem();
6782
6783 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6784
6785 unsigned Opc;
6786
6787 // TODO: Support TFE for typed and narrow loads.
6788 if (IsTyped) {
6789 if (IsTFE)
6790 return false;
6791 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6792 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6793 } else if (IsFormat) {
6794 if (IsD16) {
6795 if (IsTFE)
6796 return false;
6797 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6798 } else {
6799 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6800 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6801 }
6802 } else {
6803 switch (MemTy.getSizeInBits()) {
6804 case 8:
6805 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6806 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6807 break;
6808 case 16:
6809 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6810 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6811 break;
6812 default:
6813 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6814 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6815 break;
6816 }
6817 }
6818
6819 if (IsTFE) {
6820 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6821 unsigned NumLoadDWords = NumValueDWords + 1;
6822 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6823 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6824 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6825 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6826 if (MemTy.getSizeInBits() < 32) {
6827 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6828 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6829 B.buildTrunc(Dst, ExtDst);
6830 } else if (NumValueDWords == 1) {
6831 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6832 } else {
6833 SmallVector<Register, 5> LoadElts;
6834 for (unsigned I = 0; I != NumValueDWords; ++I)
6835 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6836 LoadElts.push_back(StatusDst);
6837 B.buildUnmerge(LoadElts, LoadDstReg);
6838 LoadElts.truncate(NumValueDWords);
6839 B.buildMergeLikeInstr(Dst, LoadElts);
6840 }
6841 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6842 (IsD16 && !Ty.isVector())) {
6843 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6844 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6845 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6846 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6847 B.buildTrunc(Dst, LoadDstReg);
6848 } else if (Unpacked && IsD16 && Ty.isVector()) {
6849 LLT UnpackedTy = Ty.changeElementSize(32);
6850 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6851 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6852 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6853 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6854 // FIXME: G_TRUNC should work, but legalization currently fails
6855 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6857 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6858 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6859 B.buildMergeLikeInstr(Dst, Repack);
6860 } else {
6861 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6862 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6863 }
6864
6865 MI.eraseFromParent();
6866 return true;
6867}
6868
6869static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
6870 switch (IntrID) {
6871 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
6872 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
6873 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
6874 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
6875 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
6876 case Intrinsic::amdgcn_raw_buffer_atomic_add:
6877 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
6878 case Intrinsic::amdgcn_struct_buffer_atomic_add:
6879 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
6880 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
6881 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
6882 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
6883 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
6884 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
6885 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
6886 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
6887 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
6888 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
6889 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
6890 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
6891 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
6892 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
6893 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
6894 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
6895 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
6896 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
6897 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
6898 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
6899 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
6900 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
6901 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
6902 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
6903 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
6904 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
6905 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
6906 case Intrinsic::amdgcn_raw_buffer_atomic_and:
6907 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
6908 case Intrinsic::amdgcn_struct_buffer_atomic_and:
6909 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
6910 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
6911 case Intrinsic::amdgcn_raw_buffer_atomic_or:
6912 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
6913 case Intrinsic::amdgcn_struct_buffer_atomic_or:
6914 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
6915 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
6916 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
6917 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
6918 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
6919 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
6920 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
6921 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
6922 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
6923 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
6924 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
6925 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
6926 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
6927 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
6928 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
6929 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
6930 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
6931 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
6932 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
6933 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
6934 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
6935 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
6936 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
6937 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
6938 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
6939 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
6940 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
6941 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
6942 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
6943 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
6944 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
6945 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
6946 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
6947 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
6948 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
6949 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
6950 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
6951 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
6952 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
6953 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
6954 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
6955 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
6956 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
6957 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
6958 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
6959 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
6960 return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
6961 default:
6962 llvm_unreachable("unhandled atomic opcode");
6963 }
6964}
6965
6966bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
6967 MachineIRBuilder &B,
6968 Intrinsic::ID IID) const {
6969 const bool IsCmpSwap =
6970 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6971 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6972 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6973 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6974
6975 Register Dst = MI.getOperand(0).getReg();
6976 // Since we don't have 128-bit atomics, we don't need to handle the case of
6977 // p8 arguments to the atomic itself.
6978 Register VData = MI.getOperand(2).getReg();
6979
6980 Register CmpVal;
6981 int OpOffset = 0;
6982
6983 if (IsCmpSwap) {
6984 CmpVal = MI.getOperand(3).getReg();
6985 ++OpOffset;
6986 }
6987
6988 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6989 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6990 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6991
6992 // The struct intrinsic variants add one additional operand over raw.
6993 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6994 Register VIndex;
6995 if (HasVIndex) {
6996 VIndex = MI.getOperand(4 + OpOffset).getReg();
6997 ++OpOffset;
6998 } else {
6999 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
7000 }
7001
7002 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
7003 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
7004 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
7005
7006 MachineMemOperand *MMO = *MI.memoperands_begin();
7007
7008 unsigned ImmOffset;
7009 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
7010
7011 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
7012 .addDef(Dst)
7013 .addUse(VData); // vdata
7014
7015 if (IsCmpSwap)
7016 MIB.addReg(CmpVal);
7017
7018 MIB.addUse(RSrc) // rsrc
7019 .addUse(VIndex) // vindex
7020 .addUse(VOffset) // voffset
7021 .addUse(SOffset) // soffset
7022 .addImm(ImmOffset) // offset(imm)
7023 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
7024 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
7025 .addMemOperand(MMO);
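  // For example (hypothetical MIR, assuming a raw buffer atomic add of an s32
  // value; the virtual register names are illustrative only):
  //   %dst:_(s32) = G_AMDGPU_BUFFER_ATOMIC_ADD %vdata, %rsrc, %vindex,
  //       %voffset, %soffset, 0, 0, 0 :: (volatile load store ...)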
7026
7027 MI.eraseFromParent();
7028 return true;
7029}
7030
7031/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
7032/// vector with s16 typed elements.
7033static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI,
7034 SmallVectorImpl<Register> &PackedAddrs,
7035 unsigned ArgOffset,
7036 const AMDGPU::ImageDimIntrinsicInfo *Intr,
7037 bool IsA16, bool IsG16) {
7038 const LLT S16 = LLT::scalar(16);
7039 const LLT V2S16 = LLT::fixed_vector(2, 16);
7040 auto EndIdx = Intr->VAddrEnd;
7041
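  // Sketch of the packing (illustrative, not from the source): with A16, two
  // consecutive s16 coordinates are combined into one dword-sized operand,
  //   %xy:_(<2 x s16>) = G_BUILD_VECTOR %x:_(s16), %y:_(s16)
  // and an odd trailing coordinate is padded with an undef element instead.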
7042 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
7043 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7044 if (!SrcOp.isReg())
7045 continue; // _L to _LZ may have eliminated this.
7046
7047 Register AddrReg = SrcOp.getReg();
7048
7049 if ((I < Intr->GradientStart) ||
7050 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
7051 (I >= Intr->CoordStart && !IsA16)) {
7052 if ((I < Intr->GradientStart) && IsA16 &&
7053 (B.getMRI()->getType(AddrReg) == S16)) {
7054 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7055 // Special handling of bias when A16 is on. Bias is of type half but
7056 // occupies a full 32-bit dword.
7057 PackedAddrs.push_back(
7058 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7059 .getReg(0));
7060 } else {
7061 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7062 "Bias needs to be converted to 16 bit in A16 mode");
7063 // Handle any gradient or coordinate operands that should not be packed
7064 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7065 PackedAddrs.push_back(AddrReg);
7066 }
7067 } else {
7068 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7069 // derivatives dx/dh and dx/dv are packed with undef.
7070 if (((I + 1) >= EndIdx) ||
7071 ((Intr->NumGradients / 2) % 2 == 1 &&
7072 (I == static_cast<unsigned>(Intr->GradientStart +
7073 (Intr->NumGradients / 2) - 1) ||
7074 I == static_cast<unsigned>(Intr->GradientStart +
7075 Intr->NumGradients - 1))) ||
7076 // Check for _L to _LZ optimization
7077 !MI.getOperand(ArgOffset + I + 1).isReg()) {
7078 PackedAddrs.push_back(
7079 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7080 .getReg(0));
7081 } else {
7082 PackedAddrs.push_back(
7083 B.buildBuildVector(
7084 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7085 .getReg(0));
7086 ++I;
7087 }
7088 }
7089 }
7090}
7091
7092/// Convert from separate vaddr components to a single vector address register,
7093/// and replace the remaining operands with $noreg.
7095 int DimIdx, int NumVAddrs) {
7096 const LLT S32 = LLT::scalar(32);
7097 (void)S32;
7098 SmallVector<Register, 8> AddrRegs;
7099 for (int I = 0; I != NumVAddrs; ++I) {
7100 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7101 if (SrcOp.isReg()) {
7102 AddrRegs.push_back(SrcOp.getReg());
7103 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7104 }
7105 }
7106
7107 int NumAddrRegs = AddrRegs.size();
7108 if (NumAddrRegs != 1) {
7109 auto VAddr =
7110 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7111 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7112 }
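  // Illustrative example (not from the source): three s32 address operands are
  // collapsed into one vector operand and the freed slots become $noreg below,
  // e.g. vaddr0 = G_BUILD_VECTOR %v0, %v1, %v2 with vaddr1/vaddr2 = $noreg.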
7113
7114 for (int I = 1; I != NumVAddrs; ++I) {
7115 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7116 if (SrcOp.isReg())
7117 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7118 }
7119}
7120
7121/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7122///
7123/// Depending on the subtarget, loads/stores with 16-bit element data need to be
7124/// rewritten to use the low half of 32-bit registers, or directly use a packed
7125/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7126/// registers.
7127///
7128/// We don't want to directly select image instructions just yet, but we also
7129/// want to expose all register repacking to the legalizer/combiners. We also
7130/// don't want a selected instruction entering RegBankSelect. In order to avoid
7131/// defining a multitude of intermediate image instructions, directly hack on
7132/// the intrinsic's arguments. In cases like a16 addresses, this requires
7133/// padding the now-unnecessary arguments with $noreg.
7134bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
7135 MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer,
7136 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7137
7138 const MachineFunction &MF = *MI.getMF();
7139 const unsigned NumDefs = MI.getNumExplicitDefs();
7140 const unsigned ArgOffset = NumDefs + 1;
7141 bool IsTFE = NumDefs == 2;
7142 // We are only processing the operands of d16 image operations on subtargets
7143 // that use the unpacked register layout, or need to repack the TFE result.
7144
7145 // TODO: Do we need to guard against already legalized intrinsics?
7146 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7148
7149 MachineRegisterInfo *MRI = B.getMRI();
7150 const LLT S32 = LLT::scalar(32);
7151 const LLT S16 = LLT::scalar(16);
7152 const LLT V2S16 = LLT::fixed_vector(2, 16);
7153
7154 unsigned DMask = 0;
7155 Register VData;
7156 LLT Ty;
7157
7158 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7159 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7160 Ty = MRI->getType(VData);
7161 }
7162
7163 const bool IsAtomicPacked16Bit =
7164 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7165 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7166
7167 // Check for 16-bit addresses and pack them when present.
7168 LLT GradTy =
7169 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7170 LLT AddrTy =
7171 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
7172 const bool IsG16 =
7173 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7174 const bool IsA16 = AddrTy == S16;
7175 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7176
7177 int DMaskLanes = 0;
7178 if (!BaseOpcode->Atomic) {
7179 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7180 if (BaseOpcode->Gather4) {
7181 DMaskLanes = 4;
7182 } else if (DMask != 0) {
7183 DMaskLanes = llvm::popcount(DMask);
7184 } else if (!IsTFE && !BaseOpcode->Store) {
7185 // If dmask is 0, this is a no-op load. This can be eliminated.
7186 B.buildUndef(MI.getOperand(0));
7187 MI.eraseFromParent();
7188 return true;
7189 }
7190 }
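  // For reference (illustrative): dmask selects which components are
  // transferred, so dmask = 0b0101 gives DMaskLanes = 2 (only x and z),
  // while gather4 operations always produce 4 lanes.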
7191
7192 Observer.changingInstr(MI);
7193 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7194
7195 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7196 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7197 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7198 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7199 unsigned NewOpcode = LoadOpcode;
7200 if (BaseOpcode->Store)
7201 NewOpcode = StoreOpcode;
7202 else if (BaseOpcode->NoReturn)
7203 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7204
7205 // Track that we legalized this
7206 MI.setDesc(B.getTII().get(NewOpcode));
7207
7208 // Expecting to get an error flag since TFC is on - and dmask is 0. Force
7209 // dmask to be at least 1, otherwise the instruction will fail.
7210 if (IsTFE && DMask == 0) {
7211 DMask = 0x1;
7212 DMaskLanes = 1;
7213 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7214 }
7215
7216 if (BaseOpcode->Atomic) {
7217 Register VData0 = MI.getOperand(2).getReg();
7218 LLT Ty = MRI->getType(VData0);
7219
7220 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7221 if (Ty.isVector() && !IsAtomicPacked16Bit)
7222 return false;
7223
7224 if (BaseOpcode->AtomicX2) {
7225 Register VData1 = MI.getOperand(3).getReg();
7226 // The two values are packed in one register.
7227 LLT PackedTy = LLT::fixed_vector(2, Ty);
7228 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7229 MI.getOperand(2).setReg(Concat.getReg(0));
7230 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7231 }
7232 }
7233
7234 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7235
7236 // Rewrite the addressing register layout before doing anything else.
7237 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7238 // 16 bit gradients are supported, but are tied to the A16 control
7239 // so both gradients and addresses must be 16 bit
7240 return false;
7241 }
7242
7243 if (IsA16 && !ST.hasA16()) {
7244 // A16 not supported
7245 return false;
7246 }
7247
7248 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7249 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7250
7251 if (IsA16 || IsG16) {
7252 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7253 // instructions expect VGPR_32
7254 SmallVector<Register, 4> PackedRegs;
7255
7256 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7257
7258 // See also below in the non-a16 branch
7259 const bool UseNSA = ST.hasNSAEncoding() &&
7260 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7261 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7262 const bool UsePartialNSA =
7263 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7264
7265 if (UsePartialNSA) {
7266 // Pack registers that would go over NSAMaxSize into last VAddr register
7267 LLT PackedAddrTy =
7268 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7269 auto Concat = B.buildConcatVectors(
7270 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7271 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7272 PackedRegs.resize(NSAMaxSize);
7273 } else if (!UseNSA && PackedRegs.size() > 1) {
7274 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7275 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7276 PackedRegs[0] = Concat.getReg(0);
7277 PackedRegs.resize(1);
7278 }
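    // Worked example (illustrative): with NSAMaxSize = 5 and 7 packed v2s16
    // registers, the last 3 are concatenated into a single <6 x s16> register
    // that occupies the final VAddr slot, leaving 5 address operands in total.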
7279
7280 const unsigned NumPacked = PackedRegs.size();
7281 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7282 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7283 if (!SrcOp.isReg()) {
7284 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7285 continue;
7286 }
7287
7288 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7289
7290 if (I - Intr->VAddrStart < NumPacked)
7291 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7292 else
7293 SrcOp.setReg(AMDGPU::NoRegister);
7294 }
7295 } else {
7296 // If the register allocator cannot place the address registers contiguously
7297 // without introducing moves, then using the non-sequential address encoding
7298 // is always preferable, since it saves VALU instructions and is usually a
7299 // wash in terms of code size or even better.
7300 //
7301 // However, we currently have no way of hinting to the register allocator
7302 // that MIMG addresses should be placed contiguously when it is possible to
7303 // do so, so force non-NSA for the common 2-address case as a heuristic.
7304 //
7305 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7306 // allocation when possible.
7307 //
7308 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7309 // set of the remaining addresses.
7310 const bool UseNSA = ST.hasNSAEncoding() &&
7311 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7312 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7313 const bool UsePartialNSA =
7314 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7315
7316 if (UsePartialNSA) {
7318 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7319 Intr->NumVAddrs - NSAMaxSize + 1);
7320 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7321 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7322 Intr->NumVAddrs);
7323 }
7324 }
7325
7326 int Flags = 0;
7327 if (IsA16)
7328 Flags |= 1;
7329 if (IsG16)
7330 Flags |= 2;
7331 MI.addOperand(MachineOperand::CreateImm(Flags));
7332
7333 if (BaseOpcode->NoReturn) { // No TFE for stores?
7334 // TODO: Handle dmask trim
7335 if (!Ty.isVector() || !IsD16)
7336 return true;
7337
7338 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7339 if (RepackedReg != VData) {
7340 MI.getOperand(1).setReg(RepackedReg);
7341 }
7342
7343 return true;
7344 }
7345
7346 Register DstReg = MI.getOperand(0).getReg();
7347 const LLT EltTy = Ty.getScalarType();
7348 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7349
7350 // Confirm that the return type is large enough for the dmask specified
7351 if (NumElts < DMaskLanes)
7352 return false;
7353
7354 if (NumElts > 4 || DMaskLanes > 4)
7355 return false;
7356
7357 // Image atomic instructions use DMask to specify how many bits of
7358 // input/output data they have: 32 bits (s32, v2s16) or 64 bits (s64, v4s16).
7359 // DMaskLanes for image atomics defaults to '0'.
7360 // We must make sure that atomic variants (especially packed ones) are not
7361 // truncated from v2s16 or v4s16 to s16.
7362 //
7363 // ChangeElementCount will be needed for image loads, where Ty is always scalar.
7364 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7365 const LLT AdjustedTy =
7366 DMaskLanes == 0
7367 ? Ty
7368 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7369
7370 // The raw dword aligned data component of the load. The only legal cases
7371 // where this matters should be when using the packed D16 format, for
7372 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
7373 LLT RoundedTy;
7374
7375 // S32 vector to cover all data, plus TFE result element.
7376 LLT TFETy;
7377
7378 // Register type to use for each loaded component. Will be S32 or V2S16.
7379 LLT RegTy;
7380
7381 if (IsD16 && ST.hasUnpackedD16VMem()) {
7382 RoundedTy =
7383 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7384 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7385 RegTy = S32;
7386 } else {
7387 unsigned EltSize = EltTy.getSizeInBits();
7388 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7389 unsigned RoundedSize = 32 * RoundedElts;
7390 RoundedTy = LLT::scalarOrVector(
7391 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7392 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7393 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7394 }
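  // Worked example (illustrative): a packed-D16 TFE load of <3 x s16> gives
  // AdjustedTy = <3 x s16> (48 bits), so RoundedTy = <4 x s16>, TFETy =
  // <3 x s32> (two data dwords plus the status dword), and RegTy = s32.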
7395
7396 // The return type does not need adjustment.
7397 // TODO: Should we change s16 case to s32 or <2 x s16>?
7398 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7399 return true;
7400
7401 Register Dst1Reg;
7402
7403 // Insert after the instruction.
7404 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7405
7406 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7407 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7408 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7409 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7410
7411 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7412
7413 MI.getOperand(0).setReg(NewResultReg);
7414
7415 // In the IR, TFE is supposed to be used with a 2 element struct return
7416 // type. The instruction really returns these two values in one contiguous
7417 // register, with one additional dword beyond the loaded data. Rewrite the
7418 // return type to use a single register result.
7419
7420 if (IsTFE) {
7421 Dst1Reg = MI.getOperand(1).getReg();
7422 if (MRI->getType(Dst1Reg) != S32)
7423 return false;
7424
7425 // TODO: Make sure the TFE operand bit is set.
7426 MI.removeOperand(1);
7427
7428 // Handle the easy case that requires no repack instructions.
7429 if (Ty == S32) {
7430 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7431 return true;
7432 }
7433 }
7434
7435 // Now figure out how to copy the new result register back into the old
7436 // result.
7437 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7438
7439 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7440
7441 if (ResultNumRegs == 1) {
7442 assert(!IsTFE);
7443 ResultRegs[0] = NewResultReg;
7444 } else {
7445 // We have to repack into a new vector of some kind.
7446 for (int I = 0; I != NumDataRegs; ++I)
7447 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7448 B.buildUnmerge(ResultRegs, NewResultReg);
7449
7450 // Drop the final TFE element to get the data part. The TFE result is
7451 // directly written to the right place already.
7452 if (IsTFE)
7453 ResultRegs.resize(NumDataRegs);
7454 }
7455
7456 // For an s16 scalar result, we form an s32 result with a truncate regardless
7457 // of packed vs. unpacked.
7458 if (IsD16 && !Ty.isVector()) {
7459 B.buildTrunc(DstReg, ResultRegs[0]);
7460 return true;
7461 }
7462
7463 // Avoid a build/concat_vector of 1 entry.
7464 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7465 B.buildBitcast(DstReg, ResultRegs[0]);
7466 return true;
7467 }
7468
7469 assert(Ty.isVector());
7470
7471 if (IsD16) {
7472 // For packed D16 results with TFE enabled, all the data components are
7473 // S32. Cast back to the expected type.
7474 //
7475 // TODO: We don't really need to load s32 elements. We would only need one
7476 // cast for the TFE result if a multiple of v2s16 was used.
7477 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7478 for (Register &Reg : ResultRegs)
7479 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7480 } else if (ST.hasUnpackedD16VMem()) {
7481 for (Register &Reg : ResultRegs)
7482 Reg = B.buildTrunc(S16, Reg).getReg(0);
7483 }
7484 }
7485
7486 auto padWithUndef = [&](LLT Ty, int NumElts) {
7487 if (NumElts == 0)
7488 return;
7489 Register Undef = B.buildUndef(Ty).getReg(0);
7490 for (int I = 0; I != NumElts; ++I)
7491 ResultRegs.push_back(Undef);
7492 };
7493
7494 // Pad out any elements eliminated due to the dmask.
7495 LLT ResTy = MRI->getType(ResultRegs[0]);
7496 if (!ResTy.isVector()) {
7497 padWithUndef(ResTy, NumElts - ResultRegs.size());
7498 B.buildBuildVector(DstReg, ResultRegs);
7499 return true;
7500 }
7501
7502 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7503 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7504
7505 // Deal with the one annoying legal case.
7506 const LLT V3S16 = LLT::fixed_vector(3, 16);
7507 if (Ty == V3S16) {
7508 if (IsTFE) {
7509 if (ResultRegs.size() == 1) {
7510 NewResultReg = ResultRegs[0];
7511 } else if (ResultRegs.size() == 2) {
7512 LLT V4S16 = LLT::fixed_vector(4, 16);
7513 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7514 } else {
7515 return false;
7516 }
7517 }
7518
7519 if (MRI->getType(DstReg).getNumElements() <
7520 MRI->getType(NewResultReg).getNumElements()) {
7521 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7522 } else {
7523 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7524 }
7525 return true;
7526 }
7527
7528 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7529 B.buildConcatVectors(DstReg, ResultRegs);
7530 return true;
7531}
7532
7533bool AMDGPULegalizerInfo::legalizeSBufferLoad(LegalizerHelper &Helper,
7534 MachineInstr &MI) const {
7535 MachineIRBuilder &B = Helper.MIRBuilder;
7536 GISelChangeObserver &Observer = Helper.Observer;
7537
7538 Register OrigDst = MI.getOperand(0).getReg();
7539 Register Dst;
7540 LLT Ty = B.getMRI()->getType(OrigDst);
7541 unsigned Size = Ty.getSizeInBits();
7542 MachineFunction &MF = B.getMF();
7543 unsigned Opc = 0;
7544 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7545 assert(Size == 8 || Size == 16);
7546 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7547 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7548 // The 8-bit and 16-bit scalar buffer load instructions have a 32-bit
7549 // destination register.
7550 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7551 } else {
7552 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7553 Dst = OrigDst;
7554 }
7555
7556 Observer.changingInstr(MI);
7557
7558 // Handle needing to s.buffer.load() a p8 value.
7559 if (hasBufferRsrcWorkaround(Ty)) {
7560 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7561 B.setInsertPt(B.getMBB(), MI);
7562 }
7563 if (shouldBitcastLoadStoreType(ST, Ty, LLT::scalar(Size))) {
7564 Ty = getBitcastRegisterType(Ty);
7565 Helper.bitcastDst(MI, Ty, 0);
7566 B.setInsertPt(B.getMBB(), MI);
7567 }
7568
7569 // FIXME: We don't really need this intermediate instruction. The intrinsic
7570 // should be fixed to have a memory operand. Since it's readnone, we're not
7571 // allowed to add one.
7572 MI.setDesc(B.getTII().get(Opc));
7573 MI.removeOperand(1); // Remove intrinsic ID
7574
7575 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7576 const unsigned MemSize = (Size + 7) / 8;
7577 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7583 MemSize, MemAlign);
7584 MI.addMemOperand(MF, MMO);
7585 if (Dst != OrigDst) {
7586 MI.getOperand(0).setReg(Dst);
7587 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7588 B.buildTrunc(OrigDst, Dst);
7589 }
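  // Illustrative example (assuming a subtarget with scalar subword loads and
  // hypothetical virtual register names): an s16 s.buffer.load becomes
  //   %wide:_(s32) = G_AMDGPU_S_BUFFER_LOAD_USHORT ...
  //   %dst:_(s16) = G_TRUNC %wide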
7590
7591 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7592 // always be legal. We may need to restore this to a 96-bit result if it turns
7593 // out this needs to be converted to a vector load during RegBankSelect.
7594 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7595 if (Ty.isVector())
7596 Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
7597 else
7598 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7599 }
7600
7601 Observer.changedInstr(MI);
7602 return true;
7603}
7604
7605bool AMDGPULegalizerInfo::legalizeSBufferPrefetch(LegalizerHelper &Helper,
7606 MachineInstr &MI) const {
7607 MachineIRBuilder &B = Helper.MIRBuilder;
7608 GISelChangeObserver &Observer = Helper.Observer;
7609 Observer.changingInstr(MI);
7610 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7611 MI.removeOperand(0); // Remove intrinsic ID
7613 Observer.changedInstr(MI);
7614 return true;
7615}
7616
7617// TODO: Move to selection
7620 MachineIRBuilder &B) const {
7621 if (!ST.hasTrapHandler() ||
7622 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7623 return legalizeTrapEndpgm(MI, MRI, B);
7624
7625 return ST.supportsGetDoorbellID() ?
7626 legalizeTrapHsa(MI, MRI, B) : legalizeTrapHsaQueuePtr(MI, MRI, B);
7627}
7628
7631 const DebugLoc &DL = MI.getDebugLoc();
7632 MachineBasicBlock &BB = B.getMBB();
7633 MachineFunction *MF = BB.getParent();
7634
7635 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7636 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7637 .addImm(0);
7638 MI.eraseFromParent();
7639 return true;
7640 }
7641
7642 // We need a block split to make the real endpgm a terminator. We also don't
7643 // want to break phis in successor blocks, so we can't just delete to the
7644 // end of the block.
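  // Resulting layout (illustrative sketch):
  //   bb.current:
  //     S_CBRANCH_EXECNZ %bb.trap
  //     ; fall through into the split continuation block
  //   bb.trap:
  //     S_ENDPGM 0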
7645 BB.splitAt(MI, false /*UpdateLiveIns*/);
7646 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7647 MF->push_back(TrapBB);
7648 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7649 .addImm(0);
7650 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7651 .addMBB(TrapBB);
7652
7653 BB.addSuccessor(TrapBB);
7654 MI.eraseFromParent();
7655 return true;
7656}
7657
7660 MachineFunction &MF = B.getMF();
7661 const LLT S64 = LLT::scalar(64);
7662
7663 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7664 // For code object version 5, queue_ptr is passed through implicit kernarg.
7670 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7671
7672 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7674
7675 if (!loadInputValue(KernargPtrReg, B,
7677 return false;
7678
7679 // TODO: can we be smarter about machine pointer info?
7682 PtrInfo.getWithOffset(Offset),
7686
7687 // Pointer address
7690 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7691 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7692 // Load address
7693 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7694 B.buildCopy(SGPR01, Temp);
7695 B.buildInstr(AMDGPU::S_TRAP)
7696 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7697 .addReg(SGPR01, RegState::Implicit);
7698 MI.eraseFromParent();
7699 return true;
7700 }
7701
7702 // Pass queue pointer to trap handler as input, and insert trap instruction
7703 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7704 Register LiveIn =
7707 return false;
7708
7709 B.buildCopy(SGPR01, LiveIn);
7710 B.buildInstr(AMDGPU::S_TRAP)
7711 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7712 .addReg(SGPR01, RegState::Implicit);
7713
7714 MI.eraseFromParent();
7715 return true;
7716}
7717
7720 MachineIRBuilder &B) const {
7721 // We need to simulate the 's_trap 2' instruction on targets that run in
7722 // PRIV=1 (where it is treated as a nop).
7723 if (ST.hasPrivEnabledTrap2NopBug()) {
7724 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7725 MI.getDebugLoc());
7726 MI.eraseFromParent();
7727 return true;
7728 }
7729
7730 B.buildInstr(AMDGPU::S_TRAP)
7731 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7732 MI.eraseFromParent();
7733 return true;
7734}
7735
7738 MachineIRBuilder &B) const {
7739 // If this is a non-HSA path or the trap handler is disabled, report a
7740 // warning accordingly.
7741 if (!ST.hasTrapHandler() ||
7742 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7743 Function &Fn = B.getMF().getFunction();
7745 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7746 } else {
7747 // Insert debug-trap instruction
7748 B.buildInstr(AMDGPU::S_TRAP)
7749 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7750 }
7751
7752 MI.eraseFromParent();
7753 return true;
7754}
7755
7757 MachineInstr &MI, MachineIRBuilder &B) const {
7758 MachineRegisterInfo &MRI = *B.getMRI();
7759 const LLT S16 = LLT::scalar(16);
7760 const LLT S32 = LLT::scalar(32);
7761 const LLT V2S16 = LLT::fixed_vector(2, 16);
7762 const LLT V3S32 = LLT::fixed_vector(3, 32);
7763
7764 Register DstReg = MI.getOperand(0).getReg();
7765 Register NodePtr = MI.getOperand(2).getReg();
7766 Register RayExtent = MI.getOperand(3).getReg();
7767 Register RayOrigin = MI.getOperand(4).getReg();
7768 Register RayDir = MI.getOperand(5).getReg();
7769 Register RayInvDir = MI.getOperand(6).getReg();
7770 Register TDescr = MI.getOperand(7).getReg();
7771
7772 if (!ST.hasGFX10_AEncoding()) {
7773 Function &Fn = B.getMF().getFunction();
7775 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7776 return false;
7777 }
7778
7779 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7780 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7781 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7782 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7783 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7784 const unsigned NumVDataDwords = 4;
7785 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7786 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
7787 const bool UseNSA =
7788 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7789
7790 const unsigned BaseOpcodes[2][2] = {
7791 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7792 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7793 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7794 int Opcode;
7795 if (UseNSA) {
7796 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7797 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7798 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7799 : AMDGPU::MIMGEncGfx10NSA,
7800 NumVDataDwords, NumVAddrDwords);
7801 } else {
7802 assert(!IsGFX12Plus);
7803 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7804 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7805 : AMDGPU::MIMGEncGfx10Default,
7806 NumVDataDwords, NumVAddrDwords);
7807 }
7808 assert(Opcode != -1);
7809
7810 SmallVector<Register, 12> Ops;
7811 if (UseNSA && IsGFX11Plus) {
7812 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7813 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7814 auto Merged = B.buildMergeLikeInstr(
7815 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7816 Ops.push_back(Merged.getReg(0));
7817 };
7818
7819 Ops.push_back(NodePtr);
7820 Ops.push_back(RayExtent);
7821 packLanes(RayOrigin);
7822
7823 if (IsA16) {
7824 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7825 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7826 auto MergedDir = B.buildMergeLikeInstr(
7827 V3S32,
7828 {B.buildBitcast(
7829 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7830 UnmergeRayDir.getReg(0)}))
7831 .getReg(0),
7832 B.buildBitcast(
7833 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7834 UnmergeRayDir.getReg(1)}))
7835 .getReg(0),
7836 B.buildBitcast(
7837 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7838 UnmergeRayDir.getReg(2)}))
7839 .getReg(0)});
7840 Ops.push_back(MergedDir.getReg(0));
7841 } else {
7842 packLanes(RayDir);
7843 packLanes(RayInvDir);
7844 }
7845 } else {
7846 if (Is64) {
7847 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7848 Ops.push_back(Unmerge.getReg(0));
7849 Ops.push_back(Unmerge.getReg(1));
7850 } else {
7851 Ops.push_back(NodePtr);
7852 }
7853 Ops.push_back(RayExtent);
7854
7855 auto packLanes = [&Ops, &S32, &B](Register Src) {
7856 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7857 Ops.push_back(Unmerge.getReg(0));
7858 Ops.push_back(Unmerge.getReg(1));
7859 Ops.push_back(Unmerge.getReg(2));
7860 };
7861
7862 packLanes(RayOrigin);
7863 if (IsA16) {
7864 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7865 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7869 B.buildMergeLikeInstr(R1,
7870 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7871 B.buildMergeLikeInstr(
7872 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7873 B.buildMergeLikeInstr(
7874 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7875 Ops.push_back(R1);
7876 Ops.push_back(R2);
7877 Ops.push_back(R3);
7878 } else {
7879 packLanes(RayDir);
7880 packLanes(RayInvDir);
7881 }
7882 }
7883
7884 if (!UseNSA) {
7885 // Build a single vector containing all the operands so far prepared.
7886 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7887 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7888 Ops.clear();
7889 Ops.push_back(MergedOps);
7890 }
7891
7892 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7893 .addDef(DstReg)
7894 .addImm(Opcode);
7895
7896 for (Register R : Ops) {
7897 MIB.addUse(R);
7898 }
7899
7900 MIB.addUse(TDescr)
7901 .addImm(IsA16 ? 1 : 0)
7902 .cloneMemRefs(MI);
7903
7904 MI.eraseFromParent();
7905 return true;
7906}
7907
7909 MachineInstr &MI, MachineIRBuilder &B) const {
7910 const LLT S32 = LLT::scalar(32);
7911 const LLT V2S32 = LLT::fixed_vector(2, 32);
7912
7913 Register DstReg = MI.getOperand(0).getReg();
7914 Register DstOrigin = MI.getOperand(1).getReg();
7915 Register DstDir = MI.getOperand(2).getReg();
7916 Register NodePtr = MI.getOperand(4).getReg();
7917 Register RayExtent = MI.getOperand(5).getReg();
7918 Register InstanceMask = MI.getOperand(6).getReg();
7919 Register RayOrigin = MI.getOperand(7).getReg();
7920 Register RayDir = MI.getOperand(8).getReg();
7921 Register Offsets = MI.getOperand(9).getReg();
7922 Register TDescr = MI.getOperand(10).getReg();
7923
7924 if (!ST.hasBVHDualAndBVH8Insts()) {
7925 Function &Fn = B.getMF().getFunction();
7927 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7928 return false;
7929 }
7930
7931 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7932 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7933 const unsigned NumVDataDwords = 10;
7934 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7935 int Opcode = AMDGPU::getMIMGOpcode(
7936 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7937 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7938 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7939 assert(Opcode != -1);
7940
7941 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7942 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7943
7944 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7945 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7946 .addDef(DstReg)
7947 .addDef(DstOrigin)
7948 .addDef(DstDir)
7949 .addImm(Opcode)
7950 .addUse(NodePtr)
7951 .addUse(RayExtentInstanceMaskVec.getReg(0))
7952 .addUse(RayOrigin)
7953 .addUse(RayDir)
7954 .addUse(Offsets)
7955 .addUse(TDescr)
7956 .cloneMemRefs(MI);
7957
7958 MI.eraseFromParent();
7959 return true;
7960}
7961
7963 MachineIRBuilder &B) const {
7964 const SITargetLowering *TLI = ST.getTargetLowering();
7965 Register StackPtr = TLI->getStackPointerRegisterToSaveRestore();
7966 Register DstReg = MI.getOperand(0).getReg();
7967 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7968 MI.eraseFromParent();
7969 return true;
7970}
7971
7973 MachineIRBuilder &B) const {
7974 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
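  // i.e. (illustrative) wave_id = (TTMP8 >> 25) & 0x1f, built below as a
  // G_UBFX with offset 25 and width 5.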
7975 if (!ST.hasArchitectedSGPRs())
7976 return false;
7977 LLT S32 = LLT::scalar(32);
7978 Register DstReg = MI.getOperand(0).getReg();
7979 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
7980 auto LSB = B.buildConstant(S32, 25);
7981 auto Width = B.buildConstant(S32, 5);
7982 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7983 MI.eraseFromParent();
7984 return true;
7985}
7986
7989 AMDGPU::Hwreg::Id HwReg,
7990 unsigned LowBit,
7991 unsigned Width) const {
7992 MachineRegisterInfo &MRI = *B.getMRI();
7993 Register DstReg = MI.getOperand(0).getReg();
7994 if (!MRI.getRegClassOrNull(DstReg))
7995 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7996 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7997 .addDef(DstReg)
7998 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7999 MI.eraseFromParent();
8000 return true;
8001}
8002
8003static constexpr unsigned FPEnvModeBitField =
8005
8006static constexpr unsigned FPEnvTrapBitField =
8008
8011 MachineIRBuilder &B) const {
8012 Register Src = MI.getOperand(0).getReg();
8013 if (MRI.getType(Src) != S64)
8014 return false;
8015
8016 auto ModeReg =
8017 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8018 /*HasSideEffects=*/true, /*isConvergent=*/false)
8019 .addImm(FPEnvModeBitField);
8020 auto TrapReg =
8021 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
8022 /*HasSideEffects=*/true, /*isConvergent=*/false)
8023 .addImm(FPEnvTrapBitField);
8024 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
8025 MI.eraseFromParent();
8026 return true;
8027}
8028
8031 MachineIRBuilder &B) const {
8032 Register Src = MI.getOperand(0).getReg();
8033 if (MRI.getType(Src) != S64)
8034 return false;
8035
8036 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
8037 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8038 /*HasSideEffects=*/true, /*isConvergent=*/false)
8039 .addImm(static_cast<int16_t>(FPEnvModeBitField))
8040 .addReg(Unmerge.getReg(0));
8041 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
8042 /*HasSideEffects=*/true, /*isConvergent=*/false)
8043 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
8044 .addReg(Unmerge.getReg(1));
8045 MI.eraseFromParent();
8046 return true;
8047}
8048
8049bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
8050 MachineInstr &MI) const {
8051 MachineIRBuilder &B = Helper.MIRBuilder;
8052 MachineRegisterInfo &MRI = *B.getMRI();
8053
8054 // Replace the G_BRCOND use with the exec-manipulating branch pseudos.
8055 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
8056 switch (IntrID) {
8057 case Intrinsic::sponentry:
8058 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8059 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8060 // that we can remove this cast.
8061 const LLT S32 = LLT::scalar(32);
8062 Register TmpReg = MRI.createGenericVirtualRegister(S32);
8063 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8064
8065 Register DstReg = MI.getOperand(0).getReg();
8066 B.buildIntToPtr(DstReg, TmpReg);
8067 MI.eraseFromParent();
8068 } else {
8069 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8070 1, 0, /*IsImmutable=*/false);
8071 B.buildFrameIndex(MI.getOperand(0), FI);
8072 MI.eraseFromParent();
8073 }
8074 return true;
8075 case Intrinsic::amdgcn_if:
8076 case Intrinsic::amdgcn_else: {
8077 MachineInstr *Br = nullptr;
8078 MachineBasicBlock *UncondBrTarget = nullptr;
8079 bool Negated = false;
8080 if (MachineInstr *BrCond =
8081 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8082 const SIRegisterInfo *TRI
8083 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8084
8085 Register Def = MI.getOperand(1).getReg();
8086 Register Use = MI.getOperand(3).getReg();
8087
8088 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8089
8090 if (Negated)
8091 std::swap(CondBrTarget, UncondBrTarget);
8092
8093 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8094 if (IntrID == Intrinsic::amdgcn_if) {
8095 B.buildInstr(AMDGPU::SI_IF)
8096 .addDef(Def)
8097 .addUse(Use)
8098 .addMBB(UncondBrTarget);
8099 } else {
8100 B.buildInstr(AMDGPU::SI_ELSE)
8101 .addDef(Def)
8102 .addUse(Use)
8103 .addMBB(UncondBrTarget);
8104 }
8105
8106 if (Br) {
8107 Br->getOperand(0).setMBB(CondBrTarget);
8108 } else {
8109 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8110 // since we're swapping branch targets it needs to be reinserted.
8111 // FIXME: IRTranslator should probably not do this
8112 B.buildBr(*CondBrTarget);
8113 }
8114
8115 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8116 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8117 MI.eraseFromParent();
8118 BrCond->eraseFromParent();
8119 return true;
8120 }
8121
8122 return false;
8123 }
8124 case Intrinsic::amdgcn_loop: {
8125 MachineInstr *Br = nullptr;
8126 MachineBasicBlock *UncondBrTarget = nullptr;
8127 bool Negated = false;
8128 if (MachineInstr *BrCond =
8129 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8130 const SIRegisterInfo *TRI
8131 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8132
8133 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8134 Register Reg = MI.getOperand(2).getReg();
8135
8136 if (Negated)
8137 std::swap(CondBrTarget, UncondBrTarget);
8138
8139 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8140 B.buildInstr(AMDGPU::SI_LOOP)
8141 .addUse(Reg)
8142 .addMBB(UncondBrTarget);
8143
8144 if (Br)
8145 Br->getOperand(0).setMBB(CondBrTarget);
8146 else
8147 B.buildBr(*CondBrTarget);
8148
8149 MI.eraseFromParent();
8150 BrCond->eraseFromParent();
8151 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8152 return true;
8153 }
8154
8155 return false;
8156 }
8157 case Intrinsic::amdgcn_addrspacecast_nonnull:
8158 return legalizeAddrSpaceCast(MI, MRI, B);
8159 case Intrinsic::amdgcn_make_buffer_rsrc:
8160 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8161 case Intrinsic::amdgcn_kernarg_segment_ptr:
8162 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8163 // This only makes sense to call in a kernel, so just lower to null.
8164 B.buildConstant(MI.getOperand(0).getReg(), 0);
8165 MI.eraseFromParent();
8166 return true;
8167 }
8168
8171 case Intrinsic::amdgcn_implicitarg_ptr:
8172 return legalizeImplicitArgPtr(MI, MRI, B);
8173 case Intrinsic::amdgcn_workitem_id_x:
8174 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8176 case Intrinsic::amdgcn_workitem_id_y:
8177 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8179 case Intrinsic::amdgcn_workitem_id_z:
8180 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8182 case Intrinsic::amdgcn_workgroup_id_x:
8183 return legalizeWorkGroupId(
8187 case Intrinsic::amdgcn_workgroup_id_y:
8188 return legalizeWorkGroupId(
8192 case Intrinsic::amdgcn_workgroup_id_z:
8193 return legalizeWorkGroupId(
8197 case Intrinsic::amdgcn_cluster_id_x:
8198 return ST.hasClusters() &&
8201 case Intrinsic::amdgcn_cluster_id_y:
8202 return ST.hasClusters() &&
8205 case Intrinsic::amdgcn_cluster_id_z:
8206 return ST.hasClusters() &&
8209 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8210 return ST.hasClusters() &&
8213 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8214 return ST.hasClusters() &&
8217 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8218 return ST.hasClusters() &&
8221 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8222 return ST.hasClusters() &&
8224 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8225 return ST.hasClusters() &&
8228 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8229 return ST.hasClusters() &&
8232 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8233 return ST.hasClusters() &&
8236 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8237 return ST.hasClusters() &&
8239 MI, MRI, B,
8241 case Intrinsic::amdgcn_wave_id:
8242 return legalizeWaveID(MI, B);
8243 case Intrinsic::amdgcn_lds_kernel_id:
8244 return legalizePreloadedArgIntrin(MI, MRI, B,
8246 case Intrinsic::amdgcn_dispatch_ptr:
8247 return legalizePreloadedArgIntrin(MI, MRI, B,
8249 case Intrinsic::amdgcn_queue_ptr:
8250 return legalizePreloadedArgIntrin(MI, MRI, B,
8252 case Intrinsic::amdgcn_implicit_buffer_ptr:
8255 case Intrinsic::amdgcn_dispatch_id:
8256 return legalizePreloadedArgIntrin(MI, MRI, B,
8258 case Intrinsic::r600_read_ngroups_x:
8259 // TODO: Emit error for hsa
8262 case Intrinsic::r600_read_ngroups_y:
8265 case Intrinsic::r600_read_ngroups_z:
8268 case Intrinsic::r600_read_local_size_x:
8269 // TODO: Could insert G_ASSERT_ZEXT from s16
8271 case Intrinsic::r600_read_local_size_y:
8272 // TODO: Could insert G_ASSERT_ZEXT from s16
8274 // TODO: Could insert G_ASSERT_ZEXT from s16
8275 case Intrinsic::r600_read_local_size_z:
8278 case Intrinsic::amdgcn_fdiv_fast:
8279 return legalizeFDIVFastIntrin(MI, MRI, B);
8280 case Intrinsic::amdgcn_is_shared:
8282 case Intrinsic::amdgcn_is_private:
8284 case Intrinsic::amdgcn_wavefrontsize: {
8285 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8286 MI.eraseFromParent();
8287 return true;
8288 }
8289 case Intrinsic::amdgcn_s_buffer_load:
8290 return legalizeSBufferLoad(Helper, MI);
8291 case Intrinsic::amdgcn_raw_buffer_store:
8292 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8293 case Intrinsic::amdgcn_struct_buffer_store:
8294 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8295 return legalizeBufferStore(MI, Helper, false, false);
8296 case Intrinsic::amdgcn_raw_buffer_store_format:
8297 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8298 case Intrinsic::amdgcn_struct_buffer_store_format:
8299 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8300 return legalizeBufferStore(MI, Helper, false, true);
8301 case Intrinsic::amdgcn_raw_tbuffer_store:
8302 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8303 case Intrinsic::amdgcn_struct_tbuffer_store:
8304 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8305 return legalizeBufferStore(MI, Helper, true, true);
8306 case Intrinsic::amdgcn_raw_buffer_load:
8307 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8308 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8309 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8310 case Intrinsic::amdgcn_struct_buffer_load:
8311 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8312 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8313 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8314 return legalizeBufferLoad(MI, Helper, false, false);
8315 case Intrinsic::amdgcn_raw_buffer_load_format:
8316 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8317 case Intrinsic::amdgcn_struct_buffer_load_format:
8318 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8319 return legalizeBufferLoad(MI, Helper, true, false);
8320 case Intrinsic::amdgcn_raw_tbuffer_load:
8321 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8322 case Intrinsic::amdgcn_struct_tbuffer_load:
8323 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8324 return legalizeBufferLoad(MI, Helper, true, true);
8325 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8326 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8327 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8328 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8329 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8330 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8331 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8332 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8333 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8334 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8335 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8336 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8337 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8338 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8339 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8340 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8341 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8342 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8343 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8344 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8345 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8346 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8347 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8348 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8349 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8350 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8351 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8352 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8353 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8354 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8355 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8356 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8357 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8358 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8359 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8360 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8361 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8362 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8363 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8364 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8365 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8366 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8367 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8368 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8369 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8370 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8371 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8372 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8373 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8374 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8375 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8376 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8377 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8378 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8379 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8380 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8381 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8382 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8383 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8384 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8385 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8386 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8387 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8388 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8389 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8390 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8391 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8392 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8393 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8394 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8395 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8396 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8397 return legalizeBufferAtomic(MI, B, IntrID);
8398 case Intrinsic::amdgcn_rsq_clamp:
8399 return legalizeRsqClampIntrinsic(MI, MRI, B);
8400 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8402 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8403 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
8405 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8406 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8407 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8408 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8409 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8410 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8411 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8412 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8413 Register Index = MI.getOperand(5).getReg();
8414 LLT S64 = LLT::scalar(64);
8415 LLT IndexArgTy = MRI.getType(Index);
8416 if (IndexArgTy != S64) {
8417 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8418 : B.buildAnyExt(S64, Index);
8419 MI.getOperand(5).setReg(NewIndex.getReg(0));
8420 }
8421 return true;
8422 }
8423 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8424 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8425 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8426 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8427 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8428 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8429 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8430 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8431 Register Index = MI.getOperand(5).getReg();
8432 LLT S32 = LLT::scalar(32);
8433 if (MRI.getType(Index) != S32)
8434 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8435 return true;
8436 }
8437 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8438 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8439 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8440 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8441 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8442 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8443 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8444 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8445 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
8446 Register Index = MI.getOperand(7).getReg();
8447 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8448 ? LLT::scalar(64)
8449 : LLT::scalar(32);
8450 LLT IndexArgTy = MRI.getType(Index);
8451 if (IndexArgTy != IdxTy) {
8452 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8453 : B.buildAnyExt(IdxTy, Index);
8454 MI.getOperand(7).setReg(NewIndex.getReg(0));
8455 }
8456 return true;
8457 }
8458
8459 case Intrinsic::amdgcn_fmed3: {
8460 GISelChangeObserver &Observer = Helper.Observer;
8461
8462 // FIXME: This is to work around the inability of tablegen match combiners to
8463 // match intrinsics in patterns.
8464 Observer.changingInstr(MI);
8465 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8466 MI.removeOperand(1);
8467 Observer.changedInstr(MI);
8468 return true;
8469 }
8470 case Intrinsic::amdgcn_readlane:
8471 case Intrinsic::amdgcn_writelane:
8472 case Intrinsic::amdgcn_readfirstlane:
8473 case Intrinsic::amdgcn_permlane16:
8474 case Intrinsic::amdgcn_permlanex16:
8475 case Intrinsic::amdgcn_permlane64:
8476 case Intrinsic::amdgcn_set_inactive:
8477 case Intrinsic::amdgcn_set_inactive_chain_arg:
8478 case Intrinsic::amdgcn_mov_dpp8:
8479 case Intrinsic::amdgcn_update_dpp:
8480 case Intrinsic::amdgcn_permlane_bcast:
8481 case Intrinsic::amdgcn_permlane_up:
8482 case Intrinsic::amdgcn_permlane_down:
8483 case Intrinsic::amdgcn_permlane_xor:
8484 return legalizeLaneOp(Helper, MI, IntrID);
8485 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8486 return legalizeSBufferPrefetch(Helper, MI);
8487 case Intrinsic::amdgcn_dead: {
8488 // TODO: Use poison instead of undef
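// The results of amdgcn.dead carry no meaningful value; fold each def to undef and drop the call.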
8489 for (const MachineOperand &Def : MI.defs())
8490 B.buildUndef(Def);
8491 MI.eraseFromParent();
8492 return true;
8493 }
8494 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8495 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8496 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
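// Cooperative atomic loads lower to an ordinary G_LOAD reusing the memory operand attached by the IRTranslator.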
8497 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8498 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8499 MI.eraseFromParent();
8500 return true;
8501 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8502 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8503 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
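// Likewise, cooperative atomic stores lower to an ordinary G_STORE with the translated memory operand.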
8504 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8505 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8506 MI.eraseFromParent();
8507 return true;
8508 case Intrinsic::amdgcn_flat_load_monitor_b32:
8509 case Intrinsic::amdgcn_flat_load_monitor_b64:
8510 case Intrinsic::amdgcn_flat_load_monitor_b128:
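// Flat monitor loads are rewritten to the G_AMDGPU_FLAT_LOAD_MONITOR pseudo, forwarding the result, address, and memory operand.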
8511 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8512 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8513 .add(MI.getOperand(0))
8514 .add(MI.getOperand(2))
8515 .addMemOperand(*MI.memoperands_begin());
8516 MI.eraseFromParent();
8517 return true;
8518 case Intrinsic::amdgcn_global_load_monitor_b32:
8519 case Intrinsic::amdgcn_global_load_monitor_b64:
8520 case Intrinsic::amdgcn_global_load_monitor_b128:
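// Global monitor loads get the same treatment via the G_AMDGPU_GLOBAL_LOAD_MONITOR pseudo.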
8521 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8522 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8523 .add(MI.getOperand(0))
8524 .add(MI.getOperand(2))
8525 .addMemOperand(*MI.memoperands_begin());
8526 MI.eraseFromParent();
8527 return true;
8528 default: {
8529 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8530 AMDGPU::getImageDimIntrinsicInfo(IntrID))
8531 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8532 return true;
8533 }
8534 }
8535
8536 return true;
8537}