LLVM 23.0.0git
AMDGPULegalizerInfo.cpp
Go to the documentation of this file.
1//===- AMDGPULegalizerInfo.cpp -----------------------------------*- C++ -*-==//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8/// \file
9/// This file implements the targeting of the Machinelegalizer class for
10/// AMDGPU.
11/// \todo This should be generated by TableGen.
12//===----------------------------------------------------------------------===//
13
14#include "AMDGPULegalizerInfo.h"
15
16#include "AMDGPU.h"
18#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
22#include "SIInstrInfo.h"
24#include "SIRegisterInfo.h"
26#include "llvm/ADT/ScopeExit.h"
37#include "llvm/IR/IntrinsicsAMDGPU.h"
38#include "llvm/IR/IntrinsicsR600.h"
39
40#define DEBUG_TYPE "amdgpu-legalinfo"
41
42using namespace llvm;
43using namespace LegalizeActions;
44using namespace LegalizeMutations;
45using namespace LegalityPredicates;
46using namespace MIPatternMatch;
47
48// Hack until load/store selection patterns support any tuple of legal types.
50 "amdgpu-global-isel-new-legality",
51 cl::desc("Use GlobalISel desired legality, rather than try to use"
52 "rules compatible with selection patterns"),
53 cl::init(false),
55
// Widest register tuple supported by the target, in bits (v32s32 / v16s64).
56static constexpr unsigned MaxRegisterSize = 1024;
57
58// Round the number of elements to the next power of two elements
60 unsigned NElts = Ty.getNumElements();
61 unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
62 return Ty.changeElementCount(ElementCount::getFixed(Pow2NElts));
63}
64
65// Round the number of bits to the next power of two bits
67 unsigned Bits = Ty.getSizeInBits();
68 unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
69 return LLT::scalar(Pow2Bits);
70}
71
72/// \returns true if this is an odd sized vector which should widen by adding an
73/// additional element. This is mostly to handle <3 x s16> -> <4 x s16>. This
74/// excludes s1 vectors, which should always be scalarized.
75static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
76 return [=](const LegalityQuery &Query) {
77 const LLT Ty = Query.Types[TypeIdx];
78 if (!Ty.isVector())
79 return false;
80
81 const LLT EltTy = Ty.getElementType();
82 const unsigned EltSize = EltTy.getSizeInBits();
83 return Ty.getNumElements() % 2 != 0 &&
84 EltSize > 1 && EltSize < 32 &&
85 Ty.getSizeInBits() % 32 != 0;
86 };
87}
88
89static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx) {
90 return [=](const LegalityQuery &Query) {
91 const LLT Ty = Query.Types[TypeIdx];
92 return Ty.getSizeInBits() % 32 == 0;
93 };
94}
95
96static LegalityPredicate isWideVec16(unsigned TypeIdx) {
97 return [=](const LegalityQuery &Query) {
98 const LLT Ty = Query.Types[TypeIdx];
99 const LLT EltTy = Ty.getScalarType();
100 return EltTy.getSizeInBits() == 16 && Ty.getNumElements() > 2;
101 };
102}
103
104static LegalizeMutation oneMoreElement(unsigned TypeIdx) {
105 return [=](const LegalityQuery &Query) {
106 const LLT Ty = Query.Types[TypeIdx];
107 const LLT EltTy = Ty.getElementType();
108 return std::pair(TypeIdx,
109 LLT::fixed_vector(Ty.getNumElements() + 1, EltTy));
110 };
111}
112
 // NOTE(review): the defining line was lost in extraction; in upstream LLVM
 // this is `static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)`
 // — TODO confirm. It shrinks the element count so each resulting piece is at
 // most 64 bits wide.
 114 return [=](const LegalityQuery &Query) {
 115 const LLT Ty = Query.Types[TypeIdx];
 116 const LLT EltTy = Ty.getElementType();
 117 unsigned Size = Ty.getSizeInBits();
 // Number of <=64-bit pieces needed, then elements per piece (rounded up).
 118 unsigned Pieces = (Size + 63) / 64;
 119 unsigned NewNumElts = (Ty.getNumElements() + 1) / Pieces;
 120 return std::pair(TypeIdx, LLT::scalarOrVector(
 121 ElementCount::getFixed(NewNumElts), EltTy));
 122 };
 123}
124
125// Increase the number of vector elements to reach the next multiple of 32-bit
126// type.
127static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
128 return [=](const LegalityQuery &Query) {
129 const LLT Ty = Query.Types[TypeIdx];
130
131 const LLT EltTy = Ty.getElementType();
132 const int Size = Ty.getSizeInBits();
133 const int EltSize = EltTy.getSizeInBits();
134 const int NextMul32 = (Size + 31) / 32;
135
136 assert(EltSize < 32);
137
138 const int NewNumElts = (32 * NextMul32 + EltSize - 1) / EltSize;
139 return std::pair(TypeIdx, LLT::fixed_vector(NewNumElts, EltTy));
140 };
141}
142
143// Retrieves the scalar type that's the same size as the mem desc
 // NOTE(review): the defining line was lost in extraction; this mutation maps
 // TypeIdx to a scalar LLT whose width equals the first memory operand's size.
 145 return [=](const LegalityQuery &Query) {
 146 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
 147 return std::make_pair(TypeIdx, LLT::scalar(MemSize));
 148 };
 149}
150
151// Increase the number of vector elements to reach the next legal RegClass.
 // NOTE(review): the defining line was lost in extraction; presumably
 // `static LegalizeMutation moreElementsToNextExistingRegClass(unsigned
 // TypeIdx)` — TODO confirm against upstream.
 153 return [=](const LegalityQuery &Query) {
 154 const LLT Ty = Query.Types[TypeIdx];
 155 const unsigned NumElts = Ty.getNumElements();
 156 const unsigned EltSize = Ty.getElementType().getSizeInBits();
 157 const unsigned MaxNumElts = MaxRegisterSize / EltSize;
 158
 159 assert(EltSize == 32 || EltSize == 64);
 160 assert(Ty.getSizeInBits() < MaxRegisterSize);
 161
 162 unsigned NewNumElts;
 163 // Find the nearest legal RegClass that is larger than the current type.
 164 for (NewNumElts = NumElts; NewNumElts < MaxNumElts; ++NewNumElts) {
 165 if (SIRegisterInfo::getSGPRClassForBitWidth(NewNumElts * EltSize))
 166 break;
 167 }
 168 return std::pair(TypeIdx,
 169 LLT::fixed_vector(NewNumElts, Ty.getElementType()));
 170 };
 171}
172
174 if (!Ty.isVector())
175 return LLT::scalar(128);
176 const ElementCount NumElems = Ty.getElementCount();
177 return LLT::vector(NumElems, LLT::scalar(128));
178}
179
181 if (!Ty.isVector())
182 return LLT::fixed_vector(4, LLT::scalar(32));
183 const unsigned NumElems = Ty.getElementCount().getFixedValue();
184 return LLT::fixed_vector(NumElems * 4, LLT::scalar(32));
185}
186
188 const unsigned Size = Ty.getSizeInBits();
189
190 if (Size <= 32) {
191 // <2 x s8> -> s16
192 // <4 x s8> -> s32
193 return LLT::scalar(Size);
194 }
195
197}
198
199static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
200 return [=](const LegalityQuery &Query) {
201 const LLT Ty = Query.Types[TypeIdx];
202 return std::pair(TypeIdx, getBitcastRegisterType(Ty));
203 };
204}
205
 // NOTE(review): the defining line and the second half of the return
 // expression were lost in extraction; presumably
 // `static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)`
 // returning a dword vector of Size/32 elements — TODO confirm upstream.
 207 return [=](const LegalityQuery &Query) {
 208 const LLT Ty = Query.Types[TypeIdx];
 209 unsigned Size = Ty.getSizeInBits();
 210 assert(Size % 32 == 0);
 211 return std::pair(
 213 };
 214}
215
216static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
217 return [=](const LegalityQuery &Query) {
218 const LLT QueryTy = Query.Types[TypeIdx];
219 return QueryTy.isVector() && QueryTy.getSizeInBits() < Size;
220 };
221}
222
223static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size) {
224 return [=](const LegalityQuery &Query) {
225 const LLT QueryTy = Query.Types[TypeIdx];
226 return QueryTy.isVector() && QueryTy.getSizeInBits() > Size;
227 };
228}
229
230static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
231 return [=](const LegalityQuery &Query) {
232 const LLT QueryTy = Query.Types[TypeIdx];
233 return QueryTy.isVector() && QueryTy.getNumElements() % 2 != 0;
234 };
235}
236
 // True for total sizes that can occupy a register tuple: a true-16 s16, or
 // any dword multiple.
 // NOTE(review): the second operand of the outer `&&` was lost in extraction;
 // presumably an upper bound such as `Size <= MaxRegisterSize` — TODO confirm.
237static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size) {
 238 return ((ST.useRealTrue16Insts() && Size == 16) || Size % 32 == 0) &&
 240}
241
243 const int EltSize = EltTy.getSizeInBits();
244 return EltSize == 16 || EltSize % 32 == 0;
245}
246
247static bool isRegisterVectorType(LLT Ty) {
248 const int EltSize = Ty.getElementType().getSizeInBits();
249 return EltSize == 32 || EltSize == 64 ||
250 (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
251 EltSize == 128 || EltSize == 256;
252}
253
254// TODO: replace all uses of isRegisterType with isRegisterClassType
255static bool isRegisterType(const GCNSubtarget &ST, LLT Ty) {
256 if (!isRegisterSize(ST, Ty.getSizeInBits()))
257 return false;
258
259 if (Ty.isVector())
260 return isRegisterVectorType(Ty);
261
262 return true;
263}
264
265// Any combination of 32 or 64-bit elements up the maximum register size, and
266// multiples of v2s16.
 // NOTE(review): the first line of the signature was lost in extraction;
 // presumably `static LegalityPredicate isRegisterType(const GCNSubtarget &ST,`
 // — the predicate-returning wrapper around the bool isRegisterType above.
 268 unsigned TypeIdx) {
 269 return [=, &ST](const LegalityQuery &Query) {
 270 return isRegisterType(ST, Query.Types[TypeIdx]);
 271 };
 272}
273
274// RegisterType that doesn't have a corresponding RegClass.
275// TODO: Once `isRegisterType` is replaced with `isRegisterClassType` this
276// should be removed.
 // NOTE(review): the first line of the signature was lost in extraction;
 // presumably `static LegalityPredicate isIllegalRegisterType(const
 // GCNSubtarget &ST,` — TODO confirm upstream.
 278 unsigned TypeIdx) {
 279 return [=, &ST](const LegalityQuery &Query) {
 280 LLT Ty = Query.Types[TypeIdx];
 // A register-sized type whose bit width has no SGPR class is illegal here.
 281 return isRegisterType(ST, Ty) &&
 282 !SIRegisterInfo::getSGPRClassForBitWidth(Ty.getSizeInBits());
 283 };
 284}
285
286static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
287 return [=](const LegalityQuery &Query) {
288 const LLT QueryTy = Query.Types[TypeIdx];
289 if (!QueryTy.isVector())
290 return false;
291 const LLT EltTy = QueryTy.getElementType();
292 return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
293 };
294}
295
// Shorthand LLT constants used by the legalization rules below.
296constexpr LLT S1 = LLT::scalar(1);
297constexpr LLT S8 = LLT::scalar(8);
298constexpr LLT S16 = LLT::scalar(16);
299constexpr LLT S32 = LLT::scalar(32);
300constexpr LLT F32 = LLT::float32();
301constexpr LLT S64 = LLT::scalar(64);
302constexpr LLT F64 = LLT::float64();
303constexpr LLT S96 = LLT::scalar(96);
304constexpr LLT S128 = LLT::scalar(128);
305constexpr LLT S160 = LLT::scalar(160);
306constexpr LLT S192 = LLT::scalar(192);
307constexpr LLT S224 = LLT::scalar(224);
308constexpr LLT S256 = LLT::scalar(256);
309constexpr LLT S512 = LLT::scalar(512);
310constexpr LLT S1024 = LLT::scalar(1024);
 312
313constexpr LLT V2S8 = LLT::fixed_vector(2, 8);
314constexpr LLT V2S16 = LLT::fixed_vector(2, 16);
315constexpr LLT V4S16 = LLT::fixed_vector(4, 16);
316constexpr LLT V6S16 = LLT::fixed_vector(6, 16);
317constexpr LLT V8S16 = LLT::fixed_vector(8, 16);
318constexpr LLT V10S16 = LLT::fixed_vector(10, 16);
319constexpr LLT V12S16 = LLT::fixed_vector(12, 16);
320constexpr LLT V16S16 = LLT::fixed_vector(16, 16);
 // NOTE(review): a line defining V2F16 (presumably LLT::fixed_vector(2,
 // LLT::float16())) was lost in extraction; V2BF16 below aliases it.
323constexpr LLT V2BF16 = V2F16; // FIXME
 324
325constexpr LLT V2S32 = LLT::fixed_vector(2, 32);
326constexpr LLT V3S32 = LLT::fixed_vector(3, 32);
327constexpr LLT V4S32 = LLT::fixed_vector(4, 32);
328constexpr LLT V5S32 = LLT::fixed_vector(5, 32);
329constexpr LLT V6S32 = LLT::fixed_vector(6, 32);
330constexpr LLT V7S32 = LLT::fixed_vector(7, 32);
331constexpr LLT V8S32 = LLT::fixed_vector(8, 32);
332constexpr LLT V9S32 = LLT::fixed_vector(9, 32);
333constexpr LLT V10S32 = LLT::fixed_vector(10, 32);
334constexpr LLT V11S32 = LLT::fixed_vector(11, 32);
335constexpr LLT V12S32 = LLT::fixed_vector(12, 32);
336constexpr LLT V16S32 = LLT::fixed_vector(16, 32);
337constexpr LLT V32S32 = LLT::fixed_vector(32, 32);
 338
339constexpr LLT V2S64 = LLT::fixed_vector(2, 64);
340constexpr LLT V3S64 = LLT::fixed_vector(3, 64);
341constexpr LLT V4S64 = LLT::fixed_vector(4, 64);
342constexpr LLT V5S64 = LLT::fixed_vector(5, 64);
343constexpr LLT V6S64 = LLT::fixed_vector(6, 64);
344constexpr LLT V7S64 = LLT::fixed_vector(7, 64);
345constexpr LLT V8S64 = LLT::fixed_vector(8, 64);
346constexpr LLT V16S64 = LLT::fixed_vector(16, 64);
 347
348constexpr LLT V2S128 = LLT::fixed_vector(2, 128);
349constexpr LLT V4S128 = LLT::fixed_vector(4, 128);
 // NOTE(review): the element lists of the following type collections were lost
 // in extraction; they enumerate the scalar and vector types declared above.
351constexpr std::initializer_list<LLT> AllScalarTypes = {
 353
354constexpr std::initializer_list<LLT> AllS16Vectors{
 356
357constexpr std::initializer_list<LLT> AllS32Vectors = {
 360
361constexpr std::initializer_list<LLT> AllS64Vectors = {
 363
 369
370// Checks whether a type is in the list of legal register types.
371static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty) {
 // Pointers are compared by their integer equivalents.
 372 if (Ty.isPointerOrPointerVector())
 373 Ty = Ty.changeElementType(LLT::scalar(Ty.getScalarSizeInBits()));
 374
 // NOTE(review): the surrounding return expression was lost in extraction;
 // presumably membership tests against the AllScalarTypes/All*Vectors lists
 // above, with s16 only legal on true-16 subtargets — TODO confirm upstream.
 377 (ST.useRealTrue16Insts() && Ty == S16) ||
 379}
380
 // NOTE(review): the first line of the signature was lost in extraction;
 // presumably `static LegalityPredicate isRegisterClassType(const
 // GCNSubtarget &ST,` — the predicate wrapper over the bool overload above.
 382 unsigned TypeIdx) {
 383 return [&ST, TypeIdx](const LegalityQuery &Query) {
 384 return isRegisterClassType(ST, Query.Types[TypeIdx]);
 385 };
 386}
387
388// If we have a truncating store or an extending load with a data size larger
389// than 32-bits, we need to reduce to a 32-bit type.
 // NOTE(review): the defining line was lost in extraction; presumably
 // `static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)`.
 391 return [=](const LegalityQuery &Query) {
 392 const LLT Ty = Query.Types[TypeIdx];
 // Scalar register wider than 32 bits whose memory type is narrower.
 393 return !Ty.isVector() && Ty.getSizeInBits() > 32 &&
 394 Query.MMODescrs[0].MemoryTy.getSizeInBits() < Ty.getSizeInBits();
 395 };
 396}
397
398// If we have a truncating store or an extending load with a data size larger
399// than 32-bits and mem location is a power of 2
 // NOTE(review): the defining line was lost in extraction; a predicate that
 // strengthens isWideScalarExtLoadTruncStore with a power-of-two memory size.
 401 return [=](const LegalityQuery &Query) {
 402 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
 403 return isWideScalarExtLoadTruncStore(TypeIdx)(Query) &&
 404 isPowerOf2_64(MemSize);
 405 };
 406}
407
408// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
409// handle some operations by just promoting the register during
410// selection. There are also d16 loads on GFX9+ which preserve the high bits.
 // Returns the widest memory access (in bits) the legalizer should allow for
 // the given address space.
411static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
 412 bool IsLoad, bool IsAtomic) {
 413 switch (AS) {
 // NOTE(review): the `case AMDGPUAS::...:` labels were lost in extraction;
 // from the bodies these appear to be private/scratch (128 with flat
 // scratch, else 32), local/region (DS128-dependent), and global/constant
 // (512 for loads, 128 for stores) — TODO confirm against upstream.
 415 // FIXME: Private element size.
 416 return ST.hasFlatScratchEnabled() ? 128 : 32;
 418 return ST.useDS128() ? 128 : 64;
 423 // Treat constant and global as identical. SMRD loads are sometimes usable for
 424 // global loads (ideally constant address space should be eliminated)
 425 // depending on the context. Legality cannot be context dependent, but
 426 // RegBankSelect can split the load as necessary depending on the pointer
 427 // register bank/uniformity and if the memory is invariant or not written in a
 428 // kernel.
 429 return IsLoad ? 512 : 128;
 430 default:
 431 // FIXME: Flat addresses may contextually need to be split to 32-bit parts
 432 // if they may alias scratch depending on the subtarget. This needs to be
 433 // moved to custom handling to use addressMayBeAccessedAsPrivate
 434 return ST.hasMultiDwordFlatScratchAddressing() || IsAtomic ? 128 : 32;
 435 }
 436}
437
 // Checks register size, memory size, address-space limits, and alignment for
 // a load/store query. Returns false for cases handled by custom lowering.
438static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
 439 const LegalityQuery &Query) {
 440 const LLT Ty = Query.Types[0];
 441
 442 // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
 443 const bool IsLoad = Query.Opcode != AMDGPU::G_STORE;
 444
 445 unsigned RegSize = Ty.getSizeInBits();
 446 uint64_t MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
 447 uint64_t AlignBits = Query.MMODescrs[0].AlignInBits;
 448 unsigned AS = Query.Types[1].getAddressSpace();
 449
 450 // All of these need to be custom lowered to cast the pointer operand.
 // NOTE(review): the condition guarding this return was lost in extraction;
 // presumably a test for the buffer fat-pointer / resource address spaces.
 452 return false;
 453
 454 // Do not handle extending vector loads.
 455 if (Ty.isVector() && MemSize != RegSize)
 456 return false;
 457
 458 // TODO: We should be able to widen loads if the alignment is high enough, but
 459 // we also need to modify the memory access size.
 460#if 0
 461 // Accept widening loads based on alignment.
 462 if (IsLoad && MemSize < Size)
 463 MemSize = std::max(MemSize, Align);
 464#endif
 465
 466 // Only 1-byte and 2-byte to 32-bit extloads are valid.
 467 if (MemSize != RegSize && RegSize != 32)
 468 return false;
 469
 // NOTE(review): the third argument line of this call (the atomic-ordering
 // comparison operand) was partially lost in extraction.
 470 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
 471 Query.MMODescrs[0].Ordering !=
 473 return false;
 474
 // Accept only the memory sizes the ISA can express directly; 96-bit needs
 // dwordx3 support, and 256/512 may be split later by the legalizer.
 475 switch (MemSize) {
 476 case 8:
 477 case 16:
 478 case 32:
 479 case 64:
 480 case 128:
 481 break;
 482 case 96:
 483 if (!ST.hasDwordx3LoadStores())
 484 return false;
 485 break;
 486 case 256:
 487 case 512:
 488 // These may contextually need to be broken down.
 489 break;
 490 default:
 491 return false;
 492 }
 493
 494 assert(RegSize >= MemSize);
 495
 // Under-aligned accesses are only legal if the target tolerates them.
 496 if (AlignBits < MemSize) {
 497 const SITargetLowering *TLI = ST.getTargetLowering();
 498 if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS,
 499 Align(AlignBits / 8)))
 500 return false;
 501 }
 502
 503 return true;
 504}
505
506// The newer buffer intrinsic forms take their resource arguments as
507// pointers in address space 8, aka s128 values. However, in order to not break
508// SelectionDAG, the underlying operations have to continue to take v4i32
509// arguments. Therefore, we convert resource pointers - or vectors of them
510// to integer values here.
511static bool hasBufferRsrcWorkaround(const LLT Ty) {
512 if (Ty.isPointer() && Ty.getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
513 return true;
514 if (Ty.isVector()) {
515 const LLT ElemTy = Ty.getElementType();
516 return hasBufferRsrcWorkaround(ElemTy);
517 }
518 return false;
519}
520
521// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128 etc, so
522// workaround this. Eventually it should ignore the type for loads and only care
523// about the size. Return true in cases where we will workaround this for now by
524// bitcasting.
525static bool loadStoreBitcastWorkaround(const LLT Ty) {
 // NOTE(review): the condition on this early-out was lost in extraction;
 // presumably `if (EnableNewLegality)` — the workaround is skipped when the
 // new-legality flag is set — TODO confirm upstream.
 527 return false;
 528
 529 const unsigned Size = Ty.getSizeInBits();
 530 if (Ty.isPointerVector())
 531 return true;
 532 if (Size <= 64)
 533 return false;
 534 // Address space 8 pointers get their own workaround.
 // NOTE(review): the guarding condition here was also lost; presumably
 // `if (hasBufferRsrcWorkaround(Ty))`.
 536 return false;
 537 if (!Ty.isVector())
 538 return true;
 539
 // Wide vectors are only directly selectable with dword-multiple elements.
 540 unsigned EltSize = Ty.getScalarSizeInBits();
 541 return EltSize != 32 && EltSize != 64;
 542}
543
 // A load/store is legal when the register type, the size/alignment rules,
 // and the bitcast workarounds all permit it.
544static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query) {
 545 const LLT Ty = Query.Types[0];
 // NOTE(review): the final operand of this conjunction was lost in
 // extraction; presumably `!loadStoreBitcastWorkaround(Ty)` — TODO confirm.
 546 return isRegisterType(ST, Ty) && isLoadStoreSizeLegal(ST, Query) &&
 548}
549
550/// Return true if a load or store of the type should be lowered with a bitcast
551/// to a different type.
552static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty,
 553 const LLT MemTy) {
 554 const unsigned MemSizeInBits = MemTy.getSizeInBits();
 555 const unsigned Size = Ty.getSizeInBits();
 // Extending/truncating case: only bitcast small vectors.
 556 if (Size != MemSizeInBits)
 557 return Size <= 32 && Ty.isVector();
 558
 // NOTE(review): the condition guarding this return was lost in extraction;
 // presumably `if (isRegisterSize(ST, Size) && !isRegisterType(ST, Ty))` or
 // similar — TODO confirm upstream.
 560 return true;
 561
 562 // Don't try to handle bitcasting vector ext loads for now.
 563 return Ty.isVector() && (!MemTy.isVector() || MemTy == Ty) &&
 564 (Size <= 32 || isRegisterSize(ST, Size)) &&
 565 !isRegisterVectorElementType(Ty.getElementType());
 566}
 567
568/// Return true if we should legalize a load by widening an odd sized memory
569/// access up to the alignment. Note this case when the memory access itself
570/// changes, not the size of the result register.
571static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy,
 572 uint64_t AlignInBits, unsigned AddrSpace,
 573 unsigned Opcode) {
 574 unsigned SizeInBits = MemoryTy.getSizeInBits();
 575 // We don't want to widen cases that are naturally legal.
 576 if (isPowerOf2_32(SizeInBits))
 577 return false;
 578
 579 // If we have 96-bit memory operations, we shouldn't touch them. Note we may
 580 // end up widening these for a scalar load during RegBankSelect, if we don't
 581 // have 96-bit scalar loads.
 582 if (SizeInBits == 96 && ST.hasDwordx3LoadStores())
 583 return false;
 584
 585 if (SizeInBits >= maxSizeForAddrSpace(ST, AddrSpace, Opcode, false))
 586 return false;
 587
 588 // A load is known dereferenceable up to the alignment, so it's legal to widen
 589 // to it.
 590 //
 591 // TODO: Could check dereferenceable for less aligned cases.
 592 unsigned RoundedSize = NextPowerOf2(SizeInBits);
 593 if (AlignInBits < RoundedSize)
 594 return false;
 595
 596 // Do not widen if it would introduce a slow unaligned load.
 597 const SITargetLowering *TLI = ST.getTargetLowering();
 598 unsigned Fast = 0;
 // NOTE(review): the call line `return TLI->allowsMisalignedMemoryAccessesImpl(`
 // and the `MachineMemOperand` flags line appear to have been lost in
 // extraction around these argument lines — TODO confirm upstream.
 600 RoundedSize, AddrSpace, Align(AlignInBits / 8),
 602 Fast;
 603}
604
605static bool shouldWidenLoad(const GCNSubtarget &ST, const LegalityQuery &Query,
606 unsigned Opcode) {
607 if (Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic)
608 return false;
609
610 return shouldWidenLoad(ST, Query.MMODescrs[0].MemoryTy,
611 Query.MMODescrs[0].AlignInBits,
612 Query.Types[1].getAddressSpace(), Opcode);
613}
614
615/// Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial
616/// type of the operand `idx` and then to transform it to a `p8` via bitcasts
617/// and inttoptr. In addition, handle vectors of p8. Returns the new type.
 // NOTE(review): the first signature line was lost in extraction; presumably
 // `static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B,`
 // — TODO confirm upstream.
 619 MachineRegisterInfo &MRI, unsigned Idx) {
 620 MachineOperand &MO = MI.getOperand(Idx);
 621
 622 const LLT PointerTy = MRI.getType(MO.getReg());
 623
 624 // Paranoidly prevent us from doing this multiple times.
 // NOTE(review): the guard condition was lost in extraction; presumably
 // `if (!hasBufferRsrcWorkaround(PointerTy))`.
 626 return PointerTy;
 627
 628 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
 629 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
 630 if (!PointerTy.isVector()) {
 631 // Happy path: (4 x s32) -> (s32, s32, s32, s32) -> (p8)
 632 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
 633 const LLT S32 = LLT::scalar(32);
 634
 635 Register VectorReg = MRI.createGenericVirtualRegister(VectorTy);
 636 std::array<Register, 4> VectorElems;
 // Insert the repair code after MI so the rewritten operand is defined
 // before its uses.
 637 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
 638 for (unsigned I = 0; I < NumParts; ++I)
 639 VectorElems[I] =
 640 B.buildExtractVectorElementConstant(S32, VectorReg, I).getReg(0);
 641 B.buildMergeValues(MO, VectorElems);
 642 MO.setReg(VectorReg);
 643 return VectorTy;
 644 }
 // Vector-of-p8 path: bitcast to the scalar s128 form, then inttoptr back.
 645 Register BitcastReg = MRI.createGenericVirtualRegister(VectorTy);
 646 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
 647 auto Scalar = B.buildBitcast(ScalarTy, BitcastReg);
 648 B.buildIntToPtr(MO, Scalar);
 649 MO.setReg(BitcastReg);
 650
 651 return VectorTy;
 652}
653
654/// Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is
655/// the form in which the value must be in order to be passed to the low-level
656/// representations used for MUBUF/MTBUF intrinsics. This is a hack, which is
657/// needed in order to account for the fact that we can't define a register
658/// class for s128 without breaking SelectionDAG.
 // NOTE(review): the signature line was lost in extraction; presumably
 // `static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder
 // &B) {` — TODO confirm upstream.
 660 MachineRegisterInfo &MRI = *B.getMRI();
 661 const LLT PointerTy = MRI.getType(Pointer);
 662 const LLT ScalarTy = getBufferRsrcScalarType(PointerTy);
 663 const LLT VectorTy = getBufferRsrcRegisterType(PointerTy);
 664
 665 if (!PointerTy.isVector()) {
 666 // Special case: p8 -> (s32, s32, s32, s32) -> (4xs32)
 667 SmallVector<Register, 4> PointerParts;
 668 const unsigned NumParts = PointerTy.getSizeInBits() / 32;
 669 auto Unmerged = B.buildUnmerge(LLT::scalar(32), Pointer);
 670 for (unsigned I = 0; I < NumParts; ++I)
 671 PointerParts.push_back(Unmerged.getReg(I));
 672 return B.buildBuildVector(VectorTy, PointerParts).getReg(0);
 673 }
 // Vector of p8: go through the scalar integer form, then bitcast to dwords.
 674 Register Scalar = B.buildPtrToInt(ScalarTy, Pointer).getReg(0);
 675 return B.buildBitcast(VectorTy, Scalar).getReg(0);
 676}
677
679 unsigned Idx) {
680 MachineOperand &MO = MI.getOperand(Idx);
681
682 const LLT PointerTy = B.getMRI()->getType(MO.getReg());
683 // Paranoidly prevent us from doing this multiple times.
685 return;
687}
688
690 const GCNTargetMachine &TM)
691 : ST(ST_) {
692 using namespace TargetOpcode;
693
694 auto GetAddrSpacePtr = [&TM](unsigned AS) {
695 return LLT::pointer(AS, TM.getPointerSizeInBits(AS));
696 };
697
698 const LLT GlobalPtr = GetAddrSpacePtr(AMDGPUAS::GLOBAL_ADDRESS);
699 const LLT ConstantPtr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS);
700 const LLT Constant32Ptr = GetAddrSpacePtr(AMDGPUAS::CONSTANT_ADDRESS_32BIT);
701 const LLT LocalPtr = GetAddrSpacePtr(AMDGPUAS::LOCAL_ADDRESS);
702 const LLT RegionPtr = GetAddrSpacePtr(AMDGPUAS::REGION_ADDRESS);
703 const LLT FlatPtr = GetAddrSpacePtr(AMDGPUAS::FLAT_ADDRESS);
704 const LLT PrivatePtr = GetAddrSpacePtr(AMDGPUAS::PRIVATE_ADDRESS);
705 const LLT BufferFatPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_FAT_POINTER);
706 const LLT RsrcPtr = GetAddrSpacePtr(AMDGPUAS::BUFFER_RESOURCE);
707 const LLT BufferStridedPtr =
708 GetAddrSpacePtr(AMDGPUAS::BUFFER_STRIDED_POINTER);
709
710 const LLT CodePtr = FlatPtr;
711
712 const std::initializer_list<LLT> AddrSpaces64 = {
713 GlobalPtr, ConstantPtr, FlatPtr
714 };
715
716 const std::initializer_list<LLT> AddrSpaces32 = {
717 LocalPtr, PrivatePtr, Constant32Ptr, RegionPtr
718 };
719
720 const std::initializer_list<LLT> AddrSpaces128 = {RsrcPtr};
721
722 const std::initializer_list<LLT> FPTypesBase = {
723 S32, S64
724 };
725
726 const std::initializer_list<LLT> FPTypes16 = {
727 S32, S64, S16
728 };
729
730 const std::initializer_list<LLT> FPTypesPK16 = {
731 S32, S64, S16, V2S16
732 };
733
734 const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
735
736 // s1 for VCC branches, s32 for SCC branches.
738
739 // TODO: All multiples of 32, vectors of pointers, all v2s16 pairs, more
740 // elements for v3s16
743 .legalFor(AllS32Vectors)
745 .legalFor(AddrSpaces64)
746 .legalFor(AddrSpaces32)
747 .legalFor(AddrSpaces128)
748 .legalIf(isPointer(0))
749 .clampScalar(0, S16, S256)
751 .clampMaxNumElements(0, S32, 16)
753 .scalarize(0);
754
755 if (ST.hasVOP3PInsts() && ST.hasAddNoCarryInsts() && ST.hasIntClamp()) {
756 // Full set of gfx9 features.
757 if (ST.hasScalarAddSub64()) {
758 getActionDefinitionsBuilder({G_ADD, G_SUB})
759 .legalFor({S64, S32, S16, V2S16})
760 .clampMaxNumElementsStrict(0, S16, 2)
761 .scalarize(0)
762 .minScalar(0, S16)
764 .maxScalar(0, S32);
765 } else {
766 getActionDefinitionsBuilder({G_ADD, G_SUB})
767 .legalFor({S32, S16, V2S16})
768 .clampMaxNumElementsStrict(0, S16, 2)
769 .scalarize(0)
770 .minScalar(0, S16)
772 .maxScalar(0, S32);
773 }
774
775 if (ST.hasScalarSMulU64()) {
777 .legalFor({S64, S32, S16, V2S16})
778 .clampMaxNumElementsStrict(0, S16, 2)
779 .scalarize(0)
780 .minScalar(0, S16)
782 .custom();
783 } else {
785 .legalFor({S32, S16, V2S16})
786 .clampMaxNumElementsStrict(0, S16, 2)
787 .scalarize(0)
788 .minScalar(0, S16)
790 .custom();
791 }
792 assert(ST.hasMad64_32());
793
794 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT, G_SADDSAT, G_SSUBSAT})
795 .legalFor({S32, S16, V2S16}) // Clamp modifier
796 .minScalarOrElt(0, S16)
798 .scalarize(0)
800 .lower();
801 } else if (ST.has16BitInsts()) {
802 getActionDefinitionsBuilder({G_ADD, G_SUB})
803 .legalFor({S32, S16})
804 .minScalar(0, S16)
806 .maxScalar(0, S32)
807 .scalarize(0);
808
810 .legalFor({S32, S16})
811 .scalarize(0)
812 .minScalar(0, S16)
814 .custom();
815 assert(ST.hasMad64_32());
816
817 // Technically the saturating operations require clamp bit support, but this
818 // was introduced at the same time as 16-bit operations.
819 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
820 .legalFor({S32, S16}) // Clamp modifier
821 .minScalar(0, S16)
822 .scalarize(0)
824 .lower();
825
826 // We're just lowering this, but it helps get a better result to try to
827 // coerce to the desired type first.
828 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
829 .minScalar(0, S16)
830 .scalarize(0)
831 .lower();
832 } else {
833 getActionDefinitionsBuilder({G_ADD, G_SUB})
834 .legalFor({S32})
835 .widenScalarToNextMultipleOf(0, 32)
836 .clampScalar(0, S32, S32)
837 .scalarize(0);
838
839 auto &Mul = getActionDefinitionsBuilder(G_MUL)
840 .legalFor({S32})
841 .scalarize(0)
842 .minScalar(0, S32)
844
845 if (ST.hasMad64_32())
846 Mul.custom();
847 else
848 Mul.maxScalar(0, S32);
849
850 if (ST.hasIntClamp()) {
851 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
852 .legalFor({S32}) // Clamp modifier.
853 .scalarize(0)
855 .lower();
856 } else {
857 // Clamp bit support was added in VI, along with 16-bit operations.
858 getActionDefinitionsBuilder({G_UADDSAT, G_USUBSAT})
859 .minScalar(0, S32)
860 .scalarize(0)
861 .lower();
862 }
863
864 // FIXME: DAG expansion gets better results. The widening uses the smaller
865 // range values and goes for the min/max lowering directly.
866 getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT})
867 .minScalar(0, S32)
868 .scalarize(0)
869 .lower();
870 }
871
873 {G_SDIV, G_UDIV, G_SREM, G_UREM, G_SDIVREM, G_UDIVREM})
874 .customFor({S32, S64})
875 .clampScalar(0, S32, S64)
877 .scalarize(0);
878
879 auto &Mulh = getActionDefinitionsBuilder({G_UMULH, G_SMULH})
880 .legalFor({S32})
881 .maxScalar(0, S32);
882
883 if (ST.hasVOP3PInsts()) {
884 Mulh
885 .clampMaxNumElements(0, S8, 2)
886 .lowerFor({V2S8});
887 }
888
889 Mulh
890 .scalarize(0)
891 .lower();
892
893 // Report legal for any types we can handle anywhere. For the cases only legal
894 // on the SALU, RegBankSelect will be able to re-legalize.
895 getActionDefinitionsBuilder({G_AND, G_OR, G_XOR})
896 .legalFor({S32, S1, S64, V2S32, S16, V2S16, V4S16})
897 .clampScalar(0, S32, S64)
903 .scalarize(0);
904
906 {G_UADDO, G_USUBO, G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
907 .legalFor({{S32, S1}, {S32, S32}})
908 .clampScalar(0, S32, S32)
909 .scalarize(0);
910
912 // Don't worry about the size constraint.
914 .lower();
915
917 .legalFor({S1, S32, S64, S16, GlobalPtr,
918 LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
919 .legalIf(isPointer(0))
920 .clampScalar(0, S32, S64)
922
923 getActionDefinitionsBuilder(G_FCONSTANT)
924 .legalFor({S32, S64, S16})
925 .clampScalar(0, S16, S64);
926
927 getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
928 .legalIf(isRegisterClassType(ST, 0))
929 // s1 and s16 are special cases because they have legal operations on
930 // them, but don't really occupy registers in the normal way.
931 .legalFor({S1, S16})
932 .clampNumElements(0, V16S32, V32S32)
936 .clampMaxNumElements(0, S32, 16);
937
938 getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({PrivatePtr});
939
940 // If the amount is divergent, we have to do a wave reduction to get the
941 // maximum value, so this is expanded during RegBankSelect.
942 getActionDefinitionsBuilder(G_DYN_STACKALLOC)
943 .legalFor({{PrivatePtr, S32}});
944
945 getActionDefinitionsBuilder(G_STACKSAVE)
946 .customFor({PrivatePtr});
947 getActionDefinitionsBuilder(G_STACKRESTORE)
948 .legalFor({PrivatePtr});
949
950 getActionDefinitionsBuilder({G_GET_FPENV, G_SET_FPENV}).customFor({S64});
951
952 getActionDefinitionsBuilder(G_GLOBAL_VALUE)
953 .customIf(typeIsNot(0, PrivatePtr));
954
955 getActionDefinitionsBuilder(G_BLOCK_ADDR).legalFor({CodePtr});
956
957 auto &FPOpActions = getActionDefinitionsBuilder(
958 { G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE,
959 G_STRICT_FADD, G_STRICT_FMUL, G_STRICT_FMA})
960 .legalFor({S32, S64});
961 auto &TrigActions = getActionDefinitionsBuilder({G_FSIN, G_FCOS})
962 .customFor({S32, S64});
963 auto &FDIVActions = getActionDefinitionsBuilder(G_FDIV)
964 .customFor({S32, S64});
965
966 if (ST.has16BitInsts()) {
967 if (ST.hasVOP3PInsts())
968 FPOpActions.legalFor({S16, V2S16});
969 else
970 FPOpActions.legalFor({S16});
971
972 TrigActions.customFor({S16});
973 FDIVActions.customFor({S16});
974 }
975
976 if (ST.hasPackedFP32Ops()) {
977 FPOpActions.legalFor({V2S32});
978 FPOpActions.clampMaxNumElementsStrict(0, S32, 2);
979 }
980
981 auto &MinNumMaxNumIeee =
982 getActionDefinitionsBuilder({G_FMINNUM_IEEE, G_FMAXNUM_IEEE});
983
984 if (ST.hasVOP3PInsts()) {
985 MinNumMaxNumIeee.legalFor(FPTypesPK16)
986 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
987 .clampMaxNumElements(0, S16, 2)
988 .clampScalar(0, S16, S64)
989 .scalarize(0);
990 } else if (ST.has16BitInsts()) {
991 MinNumMaxNumIeee.legalFor(FPTypes16).clampScalar(0, S16, S64).scalarize(0);
992 } else {
993 MinNumMaxNumIeee.legalFor(FPTypesBase)
994 .clampScalar(0, S32, S64)
995 .scalarize(0);
996 }
997
998 auto &MinNumMaxNum = getActionDefinitionsBuilder(
999 {G_FMINNUM, G_FMAXNUM, G_FMINIMUMNUM, G_FMAXIMUMNUM});
1000
1001 if (ST.hasVOP3PInsts()) {
1002 MinNumMaxNum.customFor(FPTypesPK16)
1003 .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
1004 .clampMaxNumElements(0, S16, 2)
1005 .clampScalar(0, S16, S64)
1006 .scalarize(0);
1007 } else if (ST.has16BitInsts()) {
1008 MinNumMaxNum.customFor(FPTypes16)
1009 .clampScalar(0, S16, S64)
1010 .scalarize(0);
1011 } else {
1012 MinNumMaxNum.customFor(FPTypesBase)
1013 .clampScalar(0, S32, S64)
1014 .scalarize(0);
1015 }
1016
1017 if (ST.hasVOP3PInsts())
1018 FPOpActions.clampMaxNumElementsStrict(0, S16, 2);
1019
1020 FPOpActions
1021 .scalarize(0)
1022 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1023
1024 TrigActions
1025 .scalarize(0)
1026 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1027
1028 FDIVActions
1029 .scalarize(0)
1030 .clampScalar(0, ST.has16BitInsts() ? S16 : S32, S64);
1031
1032 getActionDefinitionsBuilder({G_FNEG, G_FABS})
1033 .legalFor(FPTypesPK16)
1035 .scalarize(0)
1036 .clampScalar(0, S16, S64);
1037
1038 if (ST.has16BitInsts()) {
1040 .legalFor({S16})
1041 .customFor({S32, S64})
1042 .scalarize(0)
1043 .unsupported();
1045 .legalFor({S32, S64, S16})
1046 .scalarize(0)
1047 .clampScalar(0, S16, S64);
1048
1049 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1050 .legalFor({{S32, S32}, {S64, S32}, {S16, S16}})
1051 .scalarize(0)
1052 .maxScalarIf(typeIs(0, S16), 1, S16)
1053 .clampScalar(1, S32, S32)
1054 .lower();
1055
1057 .customFor({{S32, S32}, {S64, S32}, {S16, S16}, {S16, S32}})
1058 .scalarize(0)
1059 .lower();
1060
1062 .lowerFor({S16, S32, S64})
1063 .scalarize(0)
1064 .lower();
1065 } else {
1067 .customFor({S32, S64, S16})
1068 .scalarize(0)
1069 .unsupported();
1070
1071
1072 if (ST.hasFractBug()) {
1074 .customFor({S64})
1075 .legalFor({S32, S64})
1076 .scalarize(0)
1077 .clampScalar(0, S32, S64);
1078 } else {
1080 .legalFor({S32, S64})
1081 .scalarize(0)
1082 .clampScalar(0, S32, S64);
1083 }
1084
1085 getActionDefinitionsBuilder({G_FLDEXP, G_STRICT_FLDEXP})
1086 .legalFor({{S32, S32}, {S64, S32}})
1087 .scalarize(0)
1088 .clampScalar(0, S32, S64)
1089 .clampScalar(1, S32, S32)
1090 .lower();
1091
1093 .customFor({{S32, S32}, {S64, S32}})
1094 .scalarize(0)
1095 .minScalar(0, S32)
1096 .clampScalar(1, S32, S32)
1097 .lower();
1098
1100 .lowerFor({S32, S64})
1101 .scalarize(0)
1102 .lower();
1103 }
1104
1105 auto &FPTruncActions = getActionDefinitionsBuilder(G_FPTRUNC);
1106 if (ST.hasCvtPkF16F32Inst()) {
1107 FPTruncActions.legalFor({{S32, S64}, {S16, S32}, {V2S16, V2S32}})
1108 .clampMaxNumElements(0, S16, 2);
1109 } else {
1110 FPTruncActions.legalFor({{S32, S64}, {S16, S32}});
1111 }
1112 FPTruncActions.scalarize(0).lower();
1113
1115 .legalFor({{S64, S32}, {S32, S16}})
1116 .narrowScalarFor({{S64, S16}}, changeTo(0, S32))
1117 .scalarize(0);
1118
1119 auto &FSubActions = getActionDefinitionsBuilder({G_FSUB, G_STRICT_FSUB});
1120 if (ST.has16BitInsts()) {
1121 FSubActions
1122 // Use actual fsub instruction
1123 .legalFor({S32, S16})
1124 // Must use fadd + fneg
1125 .lowerFor({S64, V2S16});
1126 } else {
1127 FSubActions
1128 // Use actual fsub instruction
1129 .legalFor({S32})
1130 // Must use fadd + fneg
1131 .lowerFor({S64, S16, V2S16});
1132 }
1133
1134 FSubActions
1135 .scalarize(0)
1136 .clampScalar(0, S32, S64);
1137
1138 // Whether this is legal depends on the floating point mode for the function.
1139 auto &FMad = getActionDefinitionsBuilder(G_FMAD);
1140 if (ST.hasMadF16() && ST.hasMadMacF32Insts())
1141 FMad.customFor({S32, S16});
1142 else if (ST.hasMadMacF32Insts())
1143 FMad.customFor({S32});
1144 else if (ST.hasMadF16())
1145 FMad.customFor({S16});
1146 FMad.scalarize(0)
1147 .lower();
1148
1149 auto &FRem = getActionDefinitionsBuilder(G_FREM);
1150 if (ST.has16BitInsts()) {
1151 FRem.customFor({S16, S32, S64});
1152 } else {
1153 FRem.minScalar(0, S32)
1154 .customFor({S32, S64});
1155 }
1156 FRem.scalarize(0);
1157
1158 // TODO: Do we need to clamp maximum bitwidth?
1160 .legalIf(isScalar(0))
1161 .legalFor({{V2S16, V2S32}})
1162 .clampMaxNumElements(0, S16, 2)
1163 // Avoid scalarizing in cases that should be truly illegal. In unresolvable
1164 // situations (like an invalid implicit use), we don't want to infinite loop
1165 // in the legalizer.
1167 .alwaysLegal();
1168
1169 getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
1170 .legalFor({{S64, S32}, {S32, S16}, {S64, S16},
1171 {S32, S1}, {S64, S1}, {S16, S1}})
1172 .scalarize(0)
1173 .clampScalar(0, S32, S64)
1174 .widenScalarToNextPow2(1, 32);
1175
1176 // TODO: Split s1->s64 during regbankselect for VALU.
1177 auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
1178 .legalFor({{S32, S32}, {S64, S32}, {S16, S32}})
1179 .lowerIf(typeIs(1, S1))
1180 .customFor({{S32, S64}, {S64, S64}});
1181 if (ST.has16BitInsts())
1182 IToFP.legalFor({{S16, S16}});
1183 IToFP.clampScalar(1, S32, S64)
1184 .minScalar(0, S32)
1185 .scalarize(0)
1187
1188 auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
1189 .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
1190 .customFor({{S64, S32}, {S64, S64}})
1191 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1192 if (ST.has16BitInsts())
1193 FPToI.legalFor({{S16, S16}});
1194 else
1195 FPToI.minScalar(1, S32);
1196
1197 FPToI.minScalar(0, S32)
1198 .widenScalarToNextPow2(0, 32)
1199 .scalarize(0)
1200 .lower();
1201
1202 // clang-format off
1203 auto &FPToISat = getActionDefinitionsBuilder({G_FPTOSI_SAT, G_FPTOUI_SAT})
1204 .legalFor({{S32, S32}, {S32, S64}})
1205 .narrowScalarFor({{S64, S16}}, changeTo(0, S32));
1206 if (ST.has16BitInsts())
1207 FPToISat.legalFor({{S16, S16}});
1208
1209 FPToISat.minScalar(1, S32);
1210 FPToISat.minScalar(0, S32)
1211 .widenScalarToNextPow2(0, 32)
1212 .scalarize(0)
1213 .lower();
1214 // clang-format on
1215
1216 getActionDefinitionsBuilder({G_LROUND, G_LLROUND})
1217 .clampScalar(0, S16, S64)
1218 .scalarize(0)
1219 .lower();
1220
1221 getActionDefinitionsBuilder(G_INTRINSIC_FPTRUNC_ROUND)
1222 .legalFor({S16, S32})
1223 .scalarize(0)
1224 .lower();
1225
1226 // Lower G_FNEARBYINT and G_FRINT into G_INTRINSIC_ROUNDEVEN
1227 getActionDefinitionsBuilder({G_INTRINSIC_ROUND, G_FRINT, G_FNEARBYINT})
1228 .scalarize(0)
1229 .lower();
1230
1231 getActionDefinitionsBuilder({G_INTRINSIC_LRINT, G_INTRINSIC_LLRINT})
1232 .clampScalar(0, S16, S64)
1233 .scalarize(0)
1234 .lower();
1235
1236 if (ST.has16BitInsts()) {
1238 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1239 .legalFor({S16, S32, S64})
1240 .clampScalar(0, S16, S64)
1241 .scalarize(0);
1242 } else if (ST.getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS) {
1244 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1245 .legalFor({S32, S64})
1246 .clampScalar(0, S32, S64)
1247 .scalarize(0);
1248 } else {
1250 {G_INTRINSIC_TRUNC, G_FCEIL, G_INTRINSIC_ROUNDEVEN})
1251 .legalFor({S32})
1252 .customFor({S64})
1253 .clampScalar(0, S32, S64)
1254 .scalarize(0);
1255 }
1256
1258 .unsupportedFor({BufferFatPtr, BufferStridedPtr, RsrcPtr})
1259 .legalIf(all(isPointer(0), sameSize(0, 1)))
1260 .scalarize(0)
1261 .scalarSameSizeAs(1, 0);
1262
1264 .legalIf(all(sameSize(0, 1), typeInSet(1, {S64, S32})))
1265 .scalarSameSizeAs(1, 0)
1266 .scalarize(0);
1267
1268 auto &CmpBuilder =
1270 // The compare output type differs based on the register bank of the output,
1271 // so make both s1 and s32 legal.
1272 //
1273 // Scalar compares producing output in scc will be promoted to s32, as that
1274 // is the allocatable register type that will be needed for the copy from
1275 // scc. This will be promoted during RegBankSelect, and we assume something
1276 // before that won't try to use s32 result types.
1277 //
1278 // Vector compares producing an output in vcc/SGPR will use s1 in VCC reg
1279 // bank.
1281 {S1}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr})
1282 .legalForCartesianProduct(
1283 {S32}, {S32, S64, GlobalPtr, LocalPtr, ConstantPtr, PrivatePtr, FlatPtr});
1284 if (ST.has16BitInsts()) {
1285 CmpBuilder.legalFor({{S1, S16}});
1286 }
1287
1288 CmpBuilder
1290 .clampScalar(1, S32, S64)
1291 .scalarize(0)
1292 .legalIf(all(typeInSet(0, {S1, S32}), isPointer(1)));
1293
1294 auto &FCmpBuilder =
1296 {S1}, ST.has16BitInsts() ? FPTypes16 : FPTypesBase);
1297
1298 if (ST.hasSALUFloatInsts())
1299 FCmpBuilder.legalForCartesianProduct({S32}, {S16, S32});
1300
1301 FCmpBuilder
1303 .clampScalar(1, S32, S64)
1304 .scalarize(0);
1305
1306 // FIXME: fpow has a selection pattern that should move to custom lowering.
1307 auto &ExpOps = getActionDefinitionsBuilder(G_FPOW);
1308 if (ST.has16BitInsts())
1309 ExpOps.customFor({{S32}, {S16}});
1310 else
1311 ExpOps.customFor({S32});
1312 ExpOps.clampScalar(0, MinScalarFPTy, S32)
1313 .scalarize(0);
1314
1316 .clampScalar(0, MinScalarFPTy, S32)
1317 .lower();
1318
1320 .legalFor(ST.has16BitInsts(), {S16})
1321 .customFor({S32, S16})
1322 .scalarize(0)
1323 .lower();
1324
1326 .legalFor(ST.has16BitInsts(), {S16})
1327 .customFor({S32, S64, S16})
1328 .scalarize(0)
1329 .lower();
1330
1331 auto &LogOps =
1332 getActionDefinitionsBuilder({G_FLOG, G_FLOG10, G_FEXP, G_FEXP10});
1333 LogOps.customFor({S32, S16, S64});
1334 LogOps.clampScalar(0, MinScalarFPTy, S32)
1335 .scalarize(0);
1336
1337 // The 64-bit versions produce 32-bit results, but only on the SALU.
1339 .legalFor({{S32, S32}, {S32, S64}})
1340 .clampScalar(0, S32, S32)
1341 .widenScalarToNextPow2(1, 32)
1342 .clampScalar(1, S32, S64)
1343 .scalarize(0)
1344 .widenScalarToNextPow2(0, 32);
1345
1346 // If no 16 bit instr is available, lower into different instructions.
1347 if (ST.has16BitInsts())
1348 getActionDefinitionsBuilder(G_IS_FPCLASS)
1349 .legalForCartesianProduct({S1}, FPTypes16)
1350 .widenScalarToNextPow2(1)
1351 .scalarize(0)
1352 .lower();
1353 else
1354 getActionDefinitionsBuilder(G_IS_FPCLASS)
1355 .legalForCartesianProduct({S1}, FPTypesBase)
1356 .lowerFor({S1, S16})
1357 .widenScalarToNextPow2(1)
1358 .scalarize(0)
1359 .lower();
1360
1361 // The hardware instructions return a different result on 0 than the generic
1362 // instructions expect. The hardware produces -1, but these produce the
1363 // bitwidth.
1364 getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
1365 .scalarize(0)
1366 .clampScalar(0, S32, S32)
1367 .clampScalar(1, S32, S64)
1368 .widenScalarToNextPow2(0, 32)
1369 .widenScalarToNextPow2(1, 32)
1370 .custom();
1371
1372 // The 64-bit versions produce 32-bit results, but only on the SALU.
1373 getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
1374 .legalFor({{S32, S32}, {S32, S64}})
1375 .customIf(scalarNarrowerThan(1, 32))
1376 .clampScalar(0, S32, S32)
1377 .clampScalar(1, S32, S64)
1378 .scalarize(0)
1379 .widenScalarToNextPow2(0, 32)
1380 .widenScalarToNextPow2(1, 32);
1381
1382 getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
1383 .legalFor({{S32, S32}, {S32, S64}})
1384 .clampScalar(0, S32, S32)
1385 .clampScalar(1, S32, S64)
1386 .scalarize(0)
1387 .widenScalarToNextPow2(0, 32)
1388 .widenScalarToNextPow2(1, 32);
1389
1391 .customFor({{S32, S32}})
1392 .scalarize(0)
1393 .clampScalar(0, S32, S32)
1394 .clampScalar(1, S32, S32);
1395
1396 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
1397 // RegBankSelect.
1398 getActionDefinitionsBuilder(G_BITREVERSE)
1399 .legalFor({S32, S64})
1400 .clampScalar(0, S32, S64)
1401 .scalarize(0)
1403
1404 if (ST.has16BitInsts()) {
1406 .legalFor({S16, S32, V2S16})
1407 .clampMaxNumElementsStrict(0, S16, 2)
1408 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1409 // narrowScalar limitation.
1411 .clampScalar(0, S16, S32)
1412 .scalarize(0);
1413
1414 if (ST.hasVOP3PInsts()) {
1416 .legalFor({S32, S16, V2S16})
1417 .clampMaxNumElements(0, S16, 2)
1418 .minScalar(0, S16)
1420 .scalarize(0)
1421 .lower();
1422 if (ST.hasIntMinMax64()) {
1423 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1424 .legalFor({S32, S16, S64, V2S16})
1425 .clampMaxNumElements(0, S16, 2)
1426 .minScalar(0, S16)
1428 .scalarize(0)
1429 .lower();
1430 } else {
1431 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
1432 .legalFor({S32, S16, V2S16})
1433 .clampMaxNumElements(0, S16, 2)
1434 .minScalar(0, S16)
1436 .scalarize(0)
1437 .lower();
1438 }
1439 } else {
1440 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1441 .legalFor({S32, S16})
1442 .widenScalarToNextPow2(0)
1443 .minScalar(0, S16)
1444 .scalarize(0)
1445 .lower();
1446 }
1447 } else {
1448 // TODO: Should have same legality without v_perm_b32
1450 .legalFor({S32})
1451 .lowerIf(scalarNarrowerThan(0, 32))
1452 // FIXME: Fixing non-power-of-2 before clamp is workaround for
1453 // narrowScalar limitation.
1455 .maxScalar(0, S32)
1456 .scalarize(0)
1457 .lower();
1458
1459 getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX, G_ABS})
1460 .legalFor({S32})
1461 .minScalar(0, S32)
1463 .scalarize(0)
1464 .lower();
1465 }
1466
1467 getActionDefinitionsBuilder(G_INTTOPTR)
1468 // List the common cases
1469 .legalForCartesianProduct(AddrSpaces64, {S64})
1470 .legalForCartesianProduct(AddrSpaces32, {S32})
1471 .scalarize(0)
1472 // Accept any address space as long as the size matches
1473 .legalIf(sameSize(0, 1))
1475 [](const LegalityQuery &Query) {
1476 return std::pair(
1477 1, LLT::scalar(Query.Types[0].getSizeInBits()));
1478 })
1479 .narrowScalarIf(largerThan(1, 0), [](const LegalityQuery &Query) {
1480 return std::pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
1481 });
1482
1483 getActionDefinitionsBuilder(G_PTRTOINT)
1484 // List the common cases
1485 .legalForCartesianProduct(AddrSpaces64, {S64})
1486 .legalForCartesianProduct(AddrSpaces32, {S32})
1487 .scalarize(0)
1488 // Accept any address space as long as the size matches
1489 .legalIf(sameSize(0, 1))
1491 [](const LegalityQuery &Query) {
1492 return std::pair(
1493 0, LLT::scalar(Query.Types[1].getSizeInBits()));
1494 })
1495 .narrowScalarIf(largerThan(0, 1), [](const LegalityQuery &Query) {
1496 return std::pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
1497 });
1498
1499 getActionDefinitionsBuilder(G_ADDRSPACE_CAST)
1500 .scalarize(0)
1501 .custom();
1502
1503 const auto needToSplitMemOp = [=](const LegalityQuery &Query,
1504 bool IsLoad) -> bool {
1505 const LLT DstTy = Query.Types[0];
1506
1507 // Split vector extloads.
1508 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1509
1510 if (DstTy.isVector() && DstTy.getSizeInBits() > MemSize)
1511 return true;
1512
1513 const LLT PtrTy = Query.Types[1];
1514 unsigned AS = PtrTy.getAddressSpace();
1515 if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad,
1516 Query.MMODescrs[0].Ordering !=
1518 return true;
1519
1520 // Catch weird sized loads that don't evenly divide into the access sizes
1521 // TODO: May be able to widen depending on alignment etc.
1522 unsigned NumRegs = (MemSize + 31) / 32;
1523 if (NumRegs == 3) {
1524 if (!ST.hasDwordx3LoadStores())
1525 return true;
1526 } else {
1527 // If the alignment allows, these should have been widened.
1528 if (!isPowerOf2_32(NumRegs))
1529 return true;
1530 }
1531
1532 return false;
1533 };
1534
1535 unsigned GlobalAlign32 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 32;
1536 unsigned GlobalAlign16 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 16;
1537 unsigned GlobalAlign8 = ST.hasUnalignedBufferAccessEnabled() ? 0 : 8;
1538
1539 // TODO: Refine based on subtargets which support unaligned access or 128-bit
1540 // LDS
1541 // TODO: Unsupported flat for SI.
1542
1543 for (unsigned Op : {G_LOAD, G_STORE}) {
1544 const bool IsStore = Op == G_STORE;
1545
1546 auto &Actions = getActionDefinitionsBuilder(Op);
1547 // Explicitly list some common cases.
1548 // TODO: Does this help compile time at all?
1549 Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, S32, GlobalAlign32},
1550 {V2S32, GlobalPtr, V2S32, GlobalAlign32},
1551 {V4S32, GlobalPtr, V4S32, GlobalAlign32},
1552 {S64, GlobalPtr, S64, GlobalAlign32},
1553 {V2S64, GlobalPtr, V2S64, GlobalAlign32},
1554 {V2S16, GlobalPtr, V2S16, GlobalAlign32},
1555 {S32, GlobalPtr, S8, GlobalAlign8},
1556 {S32, GlobalPtr, S16, GlobalAlign16},
1557
1558 {S32, LocalPtr, S32, 32},
1559 {S64, LocalPtr, S64, 32},
1560 {V2S32, LocalPtr, V2S32, 32},
1561 {S32, LocalPtr, S8, 8},
1562 {S32, LocalPtr, S16, 16},
1563 {V2S16, LocalPtr, S32, 32},
1564
1565 {S32, PrivatePtr, S32, 32},
1566 {S32, PrivatePtr, S8, 8},
1567 {S32, PrivatePtr, S16, 16},
1568 {V2S16, PrivatePtr, S32, 32},
1569
1570 {S32, ConstantPtr, S32, GlobalAlign32},
1571 {V2S32, ConstantPtr, V2S32, GlobalAlign32},
1572 {V4S32, ConstantPtr, V4S32, GlobalAlign32},
1573 {S64, ConstantPtr, S64, GlobalAlign32},
1574 {V2S32, ConstantPtr, V2S32, GlobalAlign32}});
1575 Actions.legalIf(
1576 [=](const LegalityQuery &Query) -> bool {
1577 return isLoadStoreLegal(ST, Query);
1578 });
1579
1580 // The custom pointers (fat pointers, buffer resources) don't work with load
1581 // and store at this level. Fat pointers should have been lowered to
1582 // intrinsics before the translation to MIR.
1583 Actions.unsupportedIf(
1584 typeInSet(1, {BufferFatPtr, BufferStridedPtr, RsrcPtr}));
1585
1586 // Address space 8 pointers are handled by a 4xs32 load, bitcast, and
1587 // ptrtoint. This is needed to account for the fact that we can't have i128
1588 // as a register class for SelectionDAG reasons.
1589 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1590 return hasBufferRsrcWorkaround(Query.Types[0]);
1591 });
1592
1593 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1594 // 64-bits.
1595 //
1596 // TODO: Should generalize bitcast action into coerce, which will also cover
1597 // inserting addrspacecasts.
1598 Actions.customIf(typeIs(1, Constant32Ptr));
1599
1600 // Turn any illegal element vectors into something easier to deal
1601 // with. These will ultimately produce 32-bit scalar shifts to extract the
1602 // parts anyway.
1603 //
1604 // For odd 16-bit element vectors, prefer to split those into pieces with
1605 // 16-bit vector parts.
1606 Actions.bitcastIf(
1607 [=](const LegalityQuery &Query) -> bool {
1608 return shouldBitcastLoadStoreType(ST, Query.Types[0],
1609 Query.MMODescrs[0].MemoryTy);
1610 }, bitcastToRegisterType(0));
1611
1612 if (!IsStore) {
1613 // Widen suitably aligned loads by loading extra bytes. The standard
1614 // legalization actions can't properly express widening memory operands.
1615 Actions.customIf([=](const LegalityQuery &Query) -> bool {
1616 return shouldWidenLoad(ST, Query, G_LOAD);
1617 });
1618 }
1619
1620 // FIXME: load/store narrowing should be moved to lower action
1621 Actions
1622 .narrowScalarIf(
1623 [=](const LegalityQuery &Query) -> bool {
1624 return !Query.Types[0].isVector() &&
1625 needToSplitMemOp(Query, Op == G_LOAD);
1626 },
1627 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1628 const LLT DstTy = Query.Types[0];
1629 const LLT PtrTy = Query.Types[1];
1630
1631 const unsigned DstSize = DstTy.getSizeInBits();
1632 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1633
1634 // Split extloads.
1635 if (DstSize > MemSize)
1636 return std::pair(0, LLT::scalar(MemSize));
1637
1638 unsigned MaxSize = maxSizeForAddrSpace(
1639 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1640 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1641 if (MemSize > MaxSize)
1642 return std::pair(0, LLT::scalar(MaxSize));
1643
1644 uint64_t Align = Query.MMODescrs[0].AlignInBits;
1645 return std::pair(0, LLT::scalar(Align));
1646 })
1647 .fewerElementsIf(
1648 [=](const LegalityQuery &Query) -> bool {
1649 return Query.Types[0].isVector() &&
1650 needToSplitMemOp(Query, Op == G_LOAD);
1651 },
1652 [=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
1653 const LLT DstTy = Query.Types[0];
1654 const LLT PtrTy = Query.Types[1];
1655
1656 LLT EltTy = DstTy.getElementType();
1657 unsigned MaxSize = maxSizeForAddrSpace(
1658 ST, PtrTy.getAddressSpace(), Op == G_LOAD,
1659 Query.MMODescrs[0].Ordering != AtomicOrdering::NotAtomic);
1660
1661 // FIXME: Handle widened to power of 2 results better. This ends
1662 // up scalarizing.
1663 // FIXME: 3 element stores scalarized on SI
1664
1665 // Split if it's too large for the address space.
1666 unsigned MemSize = Query.MMODescrs[0].MemoryTy.getSizeInBits();
1667 if (MemSize > MaxSize) {
1668 unsigned NumElts = DstTy.getNumElements();
1669 unsigned EltSize = EltTy.getSizeInBits();
1670
1671 if (MaxSize % EltSize == 0) {
1672 return std::pair(
1674 ElementCount::getFixed(MaxSize / EltSize), EltTy));
1675 }
1676
1677 unsigned NumPieces = MemSize / MaxSize;
1678
1679 // FIXME: Refine when odd breakdowns handled
1680 // The scalars will need to be re-legalized.
1681 if (NumPieces == 1 || NumPieces >= NumElts ||
1682 NumElts % NumPieces != 0)
1683 return std::pair(0, EltTy);
1684
1685 return std::pair(0,
1686 LLT::fixed_vector(NumElts / NumPieces, EltTy));
1687 }
1688
1689 // FIXME: We could probably handle weird extending loads better.
1690 if (DstTy.getSizeInBits() > MemSize)
1691 return std::pair(0, EltTy);
1692
1693 unsigned EltSize = EltTy.getSizeInBits();
1694 unsigned DstSize = DstTy.getSizeInBits();
1695 if (!isPowerOf2_32(DstSize)) {
1696 // We're probably decomposing an odd sized store. Try to split
1697 // to the widest type. TODO: Account for alignment. As-is it
1698 // should be OK, since the new parts will be further legalized.
1699 unsigned FloorSize = llvm::bit_floor(DstSize);
1700 return std::pair(
1702 ElementCount::getFixed(FloorSize / EltSize), EltTy));
1703 }
1704
1705 // May need relegalization for the scalars.
1706 return std::pair(0, EltTy);
1707 })
1708 .minScalar(0, S32)
1709 .narrowScalarIf(isTruncStoreToSizePowerOf2(0),
1711 .widenScalarToNextPow2(0)
1712 .moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0))
1713 .lower();
1714 }
1715
1716 // FIXME: Unaligned accesses not lowered.
1717 auto &ExtLoads = getActionDefinitionsBuilder({G_SEXTLOAD, G_ZEXTLOAD})
1718 .legalForTypesWithMemDesc({{S32, GlobalPtr, S8, 8},
1719 {S32, GlobalPtr, S16, 2 * 8},
1720 {S32, LocalPtr, S8, 8},
1721 {S32, LocalPtr, S16, 16},
1722 {S32, PrivatePtr, S8, 8},
1723 {S32, PrivatePtr, S16, 16},
1724 {S32, ConstantPtr, S8, 8},
1725 {S32, ConstantPtr, S16, 2 * 8}})
1726 .legalIf(
1727 [=](const LegalityQuery &Query) -> bool {
1728 return isLoadStoreLegal(ST, Query);
1729 });
1730
1731 if (ST.hasFlatAddressSpace()) {
1732 ExtLoads.legalForTypesWithMemDesc(
1733 {{S32, FlatPtr, S8, 8}, {S32, FlatPtr, S16, 16}});
1734 }
1735
1736 // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
1737 // 64-bits.
1738 //
1739 // TODO: Should generalize bitcast action into coerce, which will also cover
1740 // inserting addrspacecasts.
1741 ExtLoads.customIf(typeIs(1, Constant32Ptr));
1742
1743 ExtLoads.clampScalar(0, S32, S32)
1745 .lower();
1746
1747 auto &Atomics = getActionDefinitionsBuilder(
1748 {G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB,
1749 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
1750 G_ATOMICRMW_MAX, G_ATOMICRMW_MIN, G_ATOMICRMW_UMAX,
1751 G_ATOMICRMW_UMIN, G_ATOMICRMW_UINC_WRAP, G_ATOMICRMW_UDEC_WRAP})
1752 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr},
1753 {S64, GlobalPtr}, {S64, LocalPtr},
1754 {S32, RegionPtr}, {S64, RegionPtr}});
1755 if (ST.hasFlatAddressSpace()) {
1756 Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
1757 }
1758
1759 auto &Atomics32 =
1760 getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
1761 .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
1762 if (ST.hasFlatAddressSpace()) {
1763 Atomics32.legalFor({{S32, FlatPtr}});
1764 }
1765
1766 // TODO: v2bf16 operations, and fat buffer pointer support.
1767 auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
1768 if (ST.hasLDSFPAtomicAddF32()) {
1769 Atomic.legalFor({{S32, LocalPtr}, {S32, RegionPtr}});
1770 if (ST.hasLdsAtomicAddF64())
1771 Atomic.legalFor({{S64, LocalPtr}});
1772 if (ST.hasAtomicDsPkAdd16Insts())
1773 Atomic.legalFor({{V2F16, LocalPtr}, {V2BF16, LocalPtr}});
1774 }
1775 if (ST.hasAtomicFaddInsts())
1776 Atomic.legalFor({{S32, GlobalPtr}});
1777 if (ST.hasFlatAtomicFaddF32Inst())
1778 Atomic.legalFor({{S32, FlatPtr}});
1779
1780 if (ST.hasGFX90AInsts() || ST.hasGFX1250Insts()) {
1781 // These are legal with some caveats, and should have undergone expansion in
1782 // the IR in most situations
1783 // TODO: Move atomic expansion into legalizer
1784 Atomic.legalFor({
1785 {S32, GlobalPtr},
1786 {S64, GlobalPtr},
1787 {S64, FlatPtr}
1788 });
1789 }
1790
1791 if (ST.hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
1792 ST.hasAtomicBufferGlobalPkAddF16Insts())
1793 Atomic.legalFor({{V2F16, GlobalPtr}, {V2F16, BufferFatPtr}});
1794 if (ST.hasAtomicGlobalPkAddBF16Inst())
1795 Atomic.legalFor({{V2BF16, GlobalPtr}});
1796 if (ST.hasAtomicFlatPkAdd16Insts())
1797 Atomic.legalFor({{V2F16, FlatPtr}, {V2BF16, FlatPtr}});
1798
1799
1800 // Most of the legalization work here is done by AtomicExpand. We could
1801 // probably use a simpler legality rule that just assumes anything is OK.
1802 auto &AtomicFMinFMax =
1803 getActionDefinitionsBuilder({G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
1804 .legalFor({{F32, LocalPtr}, {F64, LocalPtr}});
1805
1806 if (ST.hasAtomicFMinFMaxF32GlobalInsts())
1807 AtomicFMinFMax.legalFor({{F32, GlobalPtr},{F32, BufferFatPtr}});
1808 if (ST.hasAtomicFMinFMaxF64GlobalInsts())
1809 AtomicFMinFMax.legalFor({{F64, GlobalPtr}, {F64, BufferFatPtr}});
1810 if (ST.hasAtomicFMinFMaxF32FlatInsts())
1811 AtomicFMinFMax.legalFor({F32, FlatPtr});
1812 if (ST.hasAtomicFMinFMaxF64FlatInsts())
1813 AtomicFMinFMax.legalFor({F64, FlatPtr});
1814
1815 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
1816 // demarshalling
1817 getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
1818 .customFor({{S32, GlobalPtr}, {S64, GlobalPtr},
1819 {S32, FlatPtr}, {S64, FlatPtr}})
1820 .legalFor({{S32, LocalPtr}, {S64, LocalPtr},
1821 {S32, RegionPtr}, {S64, RegionPtr}});
1822 // TODO: Pointer types, any 32-bit or 64-bit vector
1823
1824 // Condition should be s32 for scalar, s1 for vector.
1827 LocalPtr, FlatPtr, PrivatePtr,
1828 LLT::fixed_vector(2, LocalPtr),
1829 LLT::fixed_vector(2, PrivatePtr)},
1830 {S1, S32})
1831 .clampScalar(0, S16, S64)
1832 .scalarize(1)
1835 .clampMaxNumElements(0, S32, 2)
1836 .clampMaxNumElements(0, LocalPtr, 2)
1837 .clampMaxNumElements(0, PrivatePtr, 2)
1838 .scalarize(0)
1840 .legalIf(all(isPointer(0), typeInSet(1, {S1, S32})));
1841
1842 // TODO: Only the low 4/5/6 bits of the shift amount are observed, so we can
1843 // be more flexible with the shift amount type.
1844 auto &Shifts = getActionDefinitionsBuilder({G_SHL, G_LSHR, G_ASHR})
1845 .legalFor({{S32, S32}, {S64, S32}});
1846 if (ST.has16BitInsts()) {
1847 if (ST.hasVOP3PInsts()) {
1848 Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
1849 .clampMaxNumElements(0, S16, 2);
1850 } else
1851 Shifts.legalFor({{S16, S16}});
1852
1853 // TODO: Support 16-bit shift amounts for all types
1854 Shifts.widenScalarIf(
1855 [=](const LegalityQuery &Query) {
1856 // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
1857 // 32-bit amount.
1858 const LLT ValTy = Query.Types[0];
1859 const LLT AmountTy = Query.Types[1];
1860 return ValTy.isScalar() && ValTy.getSizeInBits() <= 16 &&
1861 AmountTy.getSizeInBits() < 16;
1862 }, changeTo(1, S16));
1863 Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
1864 Shifts.clampScalar(1, S32, S32);
1865 Shifts.widenScalarToNextPow2(0, 16);
1866 Shifts.clampScalar(0, S16, S64);
1867
1868 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1869 .minScalar(0, S16)
1870 .scalarize(0)
1871 .lower();
1872 } else {
1873 // Make sure we legalize the shift amount type first, as the general
1874 // expansion for the shifted type will produce much worse code if it hasn't
1875 // been truncated already.
1876 Shifts.clampScalar(1, S32, S32);
1877 Shifts.widenScalarToNextPow2(0, 32);
1878 Shifts.clampScalar(0, S32, S64);
1879
1880 getActionDefinitionsBuilder({G_SSHLSAT, G_USHLSAT})
1881 .minScalar(0, S32)
1882 .scalarize(0)
1883 .lower();
1884 }
1885 Shifts.scalarize(0);
1886
1887 for (unsigned Op : {G_EXTRACT_VECTOR_ELT, G_INSERT_VECTOR_ELT}) {
1888 unsigned VecTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 1 : 0;
1889 unsigned EltTypeIdx = Op == G_EXTRACT_VECTOR_ELT ? 0 : 1;
1890 unsigned IdxTypeIdx = 2;
1891
1893 .customIf([=](const LegalityQuery &Query) {
1894 const LLT EltTy = Query.Types[EltTypeIdx];
1895 const LLT VecTy = Query.Types[VecTypeIdx];
1896 const LLT IdxTy = Query.Types[IdxTypeIdx];
1897 const unsigned EltSize = EltTy.getSizeInBits();
1898 const bool isLegalVecType =
1900 // Address space 8 pointers are 128-bit wide values, but the logic
1901 // below will try to bitcast them to 2N x s64, which will fail.
1902 // Therefore, as an intermediate step, wrap extracts/insertions from a
1903 // ptrtoint-ing the vector and scalar arguments (or inttoptring the
1904 // extraction result) in order to produce a vector operation that can
1905 // be handled by the logic below.
1906 if (EltTy.isPointer() && EltSize > 64)
1907 return true;
1908 return (EltSize == 32 || EltSize == 64) &&
1909 VecTy.getSizeInBits() % 32 == 0 &&
1910 VecTy.getSizeInBits() <= MaxRegisterSize &&
1911 IdxTy.getSizeInBits() == 32 &&
1912 isLegalVecType;
1913 })
1914 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1915 scalarOrEltNarrowerThan(VecTypeIdx, 32)),
1916 bitcastToVectorElement32(VecTypeIdx))
1917 //.bitcastIf(vectorSmallerThan(1, 32), bitcastToScalar(1))
1918 .bitcastIf(all(sizeIsMultipleOf32(VecTypeIdx),
1919 scalarOrEltWiderThan(VecTypeIdx, 64)),
1920 [=](const LegalityQuery &Query) {
1921 // For > 64-bit element types, try to turn this into a
1922 // 64-bit element vector since we may be able to do better
1923 // indexing if this is scalar. If not, fall back to 32.
1924 const LLT EltTy = Query.Types[EltTypeIdx];
1925 const LLT VecTy = Query.Types[VecTypeIdx];
1926 const unsigned DstEltSize = EltTy.getSizeInBits();
1927 const unsigned VecSize = VecTy.getSizeInBits();
1928
1929 const unsigned TargetEltSize =
1930 DstEltSize % 64 == 0 ? 64 : 32;
1931 return std::pair(VecTypeIdx,
1932 LLT::fixed_vector(VecSize / TargetEltSize,
1933 TargetEltSize));
1934 })
1935 .clampScalar(EltTypeIdx, S32, S64)
1936 .clampScalar(VecTypeIdx, S32, S64)
1937 .clampScalar(IdxTypeIdx, S32, S32)
1938 .clampMaxNumElements(VecTypeIdx, S32, 32)
1939 // TODO: Clamp elements for 64-bit vectors?
1940 .moreElementsIf(isIllegalRegisterType(ST, VecTypeIdx),
1942 // It should only be necessary with variable indexes.
1943 // As a last resort, lower to the stack
1944 .lower();
1945 }
1946
1947 getActionDefinitionsBuilder(G_EXTRACT_VECTOR_ELT)
1948 .unsupportedIf([=](const LegalityQuery &Query) {
1949 const LLT &EltTy = Query.Types[1].getElementType();
1950 return Query.Types[0] != EltTy;
1951 });
1952
1953 for (unsigned Op : {G_EXTRACT, G_INSERT}) {
1954 unsigned BigTyIdx = Op == G_EXTRACT ? 1 : 0;
1955 unsigned LitTyIdx = Op == G_EXTRACT ? 0 : 1;
1958 [=](const LegalityQuery &Query) {
1959 const LLT BigTy = Query.Types[BigTyIdx];
1960 return (BigTy.getScalarSizeInBits() < 16);
1961 },
1963 .widenScalarIf(
1964 [=](const LegalityQuery &Query) {
1965 const LLT LitTy = Query.Types[LitTyIdx];
1966 return (LitTy.getScalarSizeInBits() < 16);
1967 },
1969 .moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
1970 .widenScalarToNextPow2(BigTyIdx, 32)
1971 .customIf([=](const LegalityQuery &Query) {
1972 // Generic lower operates on the full-width value, producing
1973 // shift+trunc/mask sequences. For simple cases where extract/insert
1974 // values are 32-bit aligned, we can instead unmerge/merge and work on
1975 // the 32-bit components. However, we can't check the offset here so
1976 // custom lower function will have to call generic lowering if offset
1977 // is not 32-bit aligned.
1978 const LLT BigTy = Query.Types[BigTyIdx];
1979 const LLT LitTy = Query.Types[LitTyIdx];
1980 return !BigTy.isVector() && BigTy.getSizeInBits() % 32 == 0 &&
1981 LitTy.getSizeInBits() % 32 == 0;
1982 })
1983 .lower();
1984 }
1985
1986 auto &BuildVector =
1987 getActionDefinitionsBuilder(G_BUILD_VECTOR)
1989 .legalForCartesianProduct(AllS64Vectors, {S64})
1990 .clampNumElements(0, V16S32, V32S32)
1995
1996 if (ST.hasScalarPackInsts()) {
1997 BuildVector
1998 // FIXME: Should probably widen s1 vectors straight to s32
1999 .minScalarOrElt(0, S16)
2000 .minScalar(1, S16);
2001
2002 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2003 .legalFor({V2S16, S32})
2004 .lower();
2005 } else {
2006 BuildVector.customFor({V2S16, S16});
2007 BuildVector.minScalarOrElt(0, S32);
2008
2009 getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
2010 .customFor({V2S16, S32})
2011 .lower();
2012 }
2013
2014 BuildVector.legalIf(isRegisterType(ST, 0));
2015
2016 // FIXME: Clamp maximum size
2017 getActionDefinitionsBuilder(G_CONCAT_VECTORS)
2018 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2019 .clampMaxNumElements(0, S32, 32)
2020 .clampMaxNumElements(1, S16, 2) // TODO: Make 4?
2021 .clampMaxNumElements(0, S16, 64);
2022
2023 getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
2024
2025 // Merge/Unmerge
2026 for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
2027 unsigned BigTyIdx = Op == G_MERGE_VALUES ? 0 : 1;
2028 unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
2029
2030 auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
2031 const LLT Ty = Query.Types[TypeIdx];
2032 if (Ty.isVector()) {
2033 const LLT &EltTy = Ty.getElementType();
2034 if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
2035 return true;
2037 return true;
2038 }
2039 return false;
2040 };
2041
2042 auto &Builder =
2044 .legalIf(all(isRegisterType(ST, 0), isRegisterType(ST, 1)))
2045 .lowerFor({{S16, V2S16}})
2046 .lowerIf([=](const LegalityQuery &Query) {
2047 const LLT BigTy = Query.Types[BigTyIdx];
2048 return BigTy.getSizeInBits() == 32;
2049 })
2050 // Try to widen to s16 first for small types.
2051 // TODO: Only do this on targets with legal s16 shifts
2052 .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
2053 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
2055 oneMoreElement(BigTyIdx))
2057 elementTypeIs(1, S16)),
2058 changeTo(1, V2S16))
2059 // Clamp the little scalar to s8-s256 and make it a power of 2. It's
2060 // not worth considering the multiples of 64 since 2*192 and 2*384
2061 // are not valid.
2062 .clampScalar(LitTyIdx, S32, S512)
2063 .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
2064 // Break up vectors with weird elements into scalars
2066 [=](const LegalityQuery &Query) {
2067 return notValidElt(Query, LitTyIdx);
2068 },
2069 scalarize(0))
2070 .fewerElementsIf(
2071 [=](const LegalityQuery &Query) {
2072 return notValidElt(Query, BigTyIdx);
2073 },
2074 scalarize(1))
2075 .clampScalar(BigTyIdx, S32, MaxScalar);
2076
2077 if (Op == G_MERGE_VALUES) {
2078 Builder.widenScalarIf(
2079 // TODO: Use 16-bit shifts if legal for 8-bit values?
2080 [=](const LegalityQuery &Query) {
2081 const LLT Ty = Query.Types[LitTyIdx];
2082 return Ty.getSizeInBits() < 32;
2083 },
2084 changeTo(LitTyIdx, S32));
2085 }
2086
2087 Builder.widenScalarIf(
2088 [=](const LegalityQuery &Query) {
2089 const LLT Ty = Query.Types[BigTyIdx];
2090 return Ty.getSizeInBits() % 16 != 0;
2091 },
2092 [=](const LegalityQuery &Query) {
2093 // Pick the next power of 2, or a multiple of 64 over 128.
2094 // Whichever is smaller.
2095 const LLT &Ty = Query.Types[BigTyIdx];
2096 unsigned NewSizeInBits = 1 << Log2_32_Ceil(Ty.getSizeInBits() + 1);
2097 if (NewSizeInBits >= 256) {
2098 unsigned RoundedTo = alignTo<64>(Ty.getSizeInBits() + 1);
2099 if (RoundedTo < NewSizeInBits)
2100 NewSizeInBits = RoundedTo;
2101 }
2102 return std::pair(BigTyIdx, LLT::scalar(NewSizeInBits));
2103 })
2104 // Any vectors left are the wrong size. Scalarize them.
2105 .scalarize(0)
2106 .scalarize(1);
2107 }
2108
2109 // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
2110 // RegBankSelect.
2111 auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
2112 .legalFor({{S32}, {S64}})
2113 .clampScalar(0, S32, S64);
2114
2115 if (ST.hasVOP3PInsts()) {
2116 SextInReg.lowerFor({{V2S16}})
2117 // Prefer to reduce vector widths for 16-bit vectors before lowering, to
2118 // get more vector shift opportunities, since we'll get those when
2119 // expanded.
2120 .clampMaxNumElementsStrict(0, S16, 2);
2121 } else if (ST.has16BitInsts()) {
2122 SextInReg.lowerFor({{S32}, {S64}, {S16}});
2123 } else {
2124 // Prefer to promote to s32 before lowering if we don't have 16-bit
2125 // shifts. This avoid a lot of intermediate truncate and extend operations.
2126 SextInReg.lowerFor({{S32}, {S64}});
2127 }
2128
2129 SextInReg
2130 .scalarize(0)
2131 .clampScalar(0, S32, S64)
2132 .lower();
2133
2134 getActionDefinitionsBuilder({G_ROTR, G_ROTL})
2135 .scalarize(0)
2136 .lower();
2137
2138 auto &FSHRActionDefs = getActionDefinitionsBuilder(G_FSHR);
2139 FSHRActionDefs.legalFor({{S32, S32}})
2140 .clampMaxNumElementsStrict(0, S16, 2);
2141 if (ST.hasVOP3PInsts())
2142 FSHRActionDefs.lowerFor({{V2S16, V2S16}});
2143 FSHRActionDefs.scalarize(0).lower();
2144
2145 if (ST.hasVOP3PInsts()) {
2147 .lowerFor({{V2S16, V2S16}})
2148 .clampMaxNumElementsStrict(0, S16, 2)
2149 .scalarize(0)
2150 .lower();
2151 } else {
2153 .scalarize(0)
2154 .lower();
2155 }
2156
2157 getActionDefinitionsBuilder(G_READCYCLECOUNTER)
2158 .legalFor({S64});
2159
2160 getActionDefinitionsBuilder(G_READSTEADYCOUNTER).legalFor({S64});
2161
2163 .alwaysLegal();
2164
2165 getActionDefinitionsBuilder({G_SMULO, G_UMULO})
2166 .scalarize(0)
2167 .minScalar(0, S32)
2168 .lower();
2169
2170 getActionDefinitionsBuilder({G_SBFX, G_UBFX})
2171 .legalFor({{S32, S32}, {S64, S32}})
2172 .clampScalar(1, S32, S32)
2173 .clampScalar(0, S32, S64)
2175 .scalarize(0);
2176
2178 {// TODO: Verify V_BFI_B32 is generated from expanded bit ops
2179 G_FCOPYSIGN,
2180
2181 G_ATOMIC_CMPXCHG_WITH_SUCCESS, G_ATOMICRMW_NAND, G_ATOMICRMW_FSUB,
2182 G_READ_REGISTER, G_WRITE_REGISTER,
2183
2184 G_SADDO, G_SSUBO})
2185 .lower();
2186
2187 if (ST.hasIEEEMinimumMaximumInsts()) {
2188 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2189 .legalFor(FPTypesPK16)
2190 .clampMaxNumElements(0, S16, 2)
2191 .scalarize(0);
2192 } else if (ST.hasVOP3PInsts()) {
2193 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2194 .lowerFor({V2S16})
2195 .clampMaxNumElementsStrict(0, S16, 2)
2196 .scalarize(0)
2197 .lower();
2198 } else {
2199 getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM})
2200 .scalarize(0)
2201 .clampScalar(0, S32, S64)
2202 .lower();
2203 }
2204
2205 getActionDefinitionsBuilder({G_MEMCPY, G_MEMCPY_INLINE, G_MEMMOVE, G_MEMSET})
2206 .lower();
2207
2208 getActionDefinitionsBuilder({G_TRAP, G_DEBUGTRAP}).custom();
2209
2210 getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
2211 G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
2212 G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
2213 .unsupported();
2214
2216
2218 {G_VECREDUCE_SMIN, G_VECREDUCE_SMAX, G_VECREDUCE_UMIN, G_VECREDUCE_UMAX,
2219 G_VECREDUCE_ADD, G_VECREDUCE_MUL, G_VECREDUCE_FMUL, G_VECREDUCE_FMIN,
2220 G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM,
2221 G_VECREDUCE_OR, G_VECREDUCE_AND, G_VECREDUCE_XOR})
2222 .legalFor(AllVectors)
2223 .scalarize(1)
2224 .lower();
2225
2227 verify(*ST.getInstrInfo());
2228}
2229
2232 LostDebugLocObserver &LocObserver) const {
2233 MachineIRBuilder &B = Helper.MIRBuilder;
2234 MachineRegisterInfo &MRI = *B.getMRI();
2235
2236 switch (MI.getOpcode()) {
2237 case TargetOpcode::G_ADDRSPACE_CAST:
2238 return legalizeAddrSpaceCast(MI, MRI, B);
2239 case TargetOpcode::G_INTRINSIC_ROUNDEVEN:
2240 return legalizeFroundeven(MI, MRI, B);
2241 case TargetOpcode::G_FCEIL:
2242 return legalizeFceil(MI, MRI, B);
2243 case TargetOpcode::G_FREM:
2244 return legalizeFrem(MI, MRI, B);
2245 case TargetOpcode::G_INTRINSIC_TRUNC:
2246 return legalizeIntrinsicTrunc(MI, MRI, B);
2247 case TargetOpcode::G_SITOFP:
2248 return legalizeITOFP(MI, MRI, B, true);
2249 case TargetOpcode::G_UITOFP:
2250 return legalizeITOFP(MI, MRI, B, false);
2251 case TargetOpcode::G_FPTOSI:
2252 return legalizeFPTOI(MI, MRI, B, true);
2253 case TargetOpcode::G_FPTOUI:
2254 return legalizeFPTOI(MI, MRI, B, false);
2255 case TargetOpcode::G_FMINNUM:
2256 case TargetOpcode::G_FMAXNUM:
2257 case TargetOpcode::G_FMINIMUMNUM:
2258 case TargetOpcode::G_FMAXIMUMNUM:
2259 return legalizeMinNumMaxNum(Helper, MI);
2260 case TargetOpcode::G_EXTRACT:
2261 return legalizeExtract(Helper, MI);
2262 case TargetOpcode::G_INSERT:
2263 return legalizeInsert(Helper, MI);
2264 case TargetOpcode::G_EXTRACT_VECTOR_ELT:
2265 return legalizeExtractVectorElt(MI, MRI, B);
2266 case TargetOpcode::G_INSERT_VECTOR_ELT:
2267 return legalizeInsertVectorElt(MI, MRI, B);
2268 case TargetOpcode::G_FSIN:
2269 case TargetOpcode::G_FCOS:
2270 return legalizeSinCos(MI, MRI, B);
2271 case TargetOpcode::G_GLOBAL_VALUE:
2272 return legalizeGlobalValue(MI, MRI, B);
2273 case TargetOpcode::G_LOAD:
2274 case TargetOpcode::G_SEXTLOAD:
2275 case TargetOpcode::G_ZEXTLOAD:
2276 return legalizeLoad(Helper, MI);
2277 case TargetOpcode::G_STORE:
2278 return legalizeStore(Helper, MI);
2279 case TargetOpcode::G_FMAD:
2280 return legalizeFMad(MI, MRI, B);
2281 case TargetOpcode::G_FDIV:
2282 return legalizeFDIV(MI, MRI, B);
2283 case TargetOpcode::G_FFREXP:
2284 return legalizeFFREXP(MI, MRI, B);
2285 case TargetOpcode::G_FSQRT:
2286 return legalizeFSQRT(MI, MRI, B);
2287 case TargetOpcode::G_UDIV:
2288 case TargetOpcode::G_UREM:
2289 case TargetOpcode::G_UDIVREM:
2290 return legalizeUnsignedDIV_REM(MI, MRI, B);
2291 case TargetOpcode::G_SDIV:
2292 case TargetOpcode::G_SREM:
2293 case TargetOpcode::G_SDIVREM:
2294 return legalizeSignedDIV_REM(MI, MRI, B);
2295 case TargetOpcode::G_ATOMIC_CMPXCHG:
2296 return legalizeAtomicCmpXChg(MI, MRI, B);
2297 case TargetOpcode::G_FLOG2:
2298 return legalizeFlog2(MI, B);
2299 case TargetOpcode::G_FLOG:
2300 case TargetOpcode::G_FLOG10:
2301 return legalizeFlogCommon(MI, B);
2302 case TargetOpcode::G_FEXP2:
2303 return legalizeFExp2(MI, B);
2304 case TargetOpcode::G_FEXP:
2305 case TargetOpcode::G_FEXP10:
2306 return legalizeFExp(MI, B);
2307 case TargetOpcode::G_FPOW:
2308 return legalizeFPow(MI, B);
2309 case TargetOpcode::G_FFLOOR:
2310 return legalizeFFloor(MI, MRI, B);
2311 case TargetOpcode::G_BUILD_VECTOR:
2312 case TargetOpcode::G_BUILD_VECTOR_TRUNC:
2313 return legalizeBuildVector(MI, MRI, B);
2314 case TargetOpcode::G_MUL:
2315 return legalizeMul(Helper, MI);
2316 case TargetOpcode::G_CTLZ:
2317 case TargetOpcode::G_CTTZ:
2318 return legalizeCTLZ_CTTZ(MI, MRI, B);
2319 case TargetOpcode::G_CTLS:
2320 return legalizeCTLS(MI, MRI, B);
2321 case TargetOpcode::G_CTLZ_ZERO_UNDEF:
2322 return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
2323 case TargetOpcode::G_STACKSAVE:
2324 return legalizeStackSave(MI, B);
2325 case TargetOpcode::G_GET_FPENV:
2326 return legalizeGetFPEnv(MI, MRI, B);
2327 case TargetOpcode::G_SET_FPENV:
2328 return legalizeSetFPEnv(MI, MRI, B);
2329 case TargetOpcode::G_TRAP:
2330 return legalizeTrap(MI, MRI, B);
2331 case TargetOpcode::G_DEBUGTRAP:
2332 return legalizeDebugTrap(MI, MRI, B);
2333 default:
2334 return false;
2335 }
2336
2337 llvm_unreachable("expected switch to return");
2338}
2339
2341 unsigned AS,
2343 MachineIRBuilder &B) const {
2344 MachineFunction &MF = B.getMF();
2345 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
2346 const LLT S32 = LLT::scalar(32);
2347 const LLT S64 = LLT::scalar(64);
2348
2350
2351 if (ST.hasApertureRegs()) {
2352 // Note: this register is somewhat broken. When used as a 32-bit operand,
2353 // it only returns zeroes. The real value is in the upper 32 bits.
2354 // Thus, we must emit extract the high 32 bits.
2355 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
2356 ? AMDGPU::SRC_SHARED_BASE
2357 : AMDGPU::SRC_PRIVATE_BASE;
2358 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
2359 !ST.hasGloballyAddressableScratch()) &&
2360 "Cannot use src_private_base with globally addressable scratch!");
2362 MRI.setRegClass(Dst, &AMDGPU::SReg_64RegClass);
2363 B.buildCopy({Dst}, {Register(ApertureRegNo)});
2364 return B.buildUnmerge(S32, Dst).getReg(1);
2365 }
2366
2369 // For code object version 5, private_base and shared_base are passed through
2370 // implicit kernargs.
2374
2379 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
2380
2381 Register KernargPtrReg = MRI.createGenericVirtualRegister(
2383
2384 if (!loadInputValue(KernargPtrReg, B,
2386 return Register();
2387
2389 PtrInfo.getWithOffset(Offset),
2393
2394 // Pointer address
2395 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
2396 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
2397 // Load address
2398 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2399 }
2400
2403
2405 return Register();
2406
2407 // TODO: Use custom PseudoSourceValue
2409
2410 // Offset into amd_queue_t for group_segment_aperture_base_hi /
2411 // private_segment_aperture_base_hi.
2412 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
2413
2415 PtrInfo,
2418 LLT::scalar(32), commonAlignment(Align(64), StructOffset));
2419
2420 B.buildObjectPtrOffset(
2421 LoadAddr, QueuePtr,
2422 B.buildConstant(LLT::scalar(64), StructOffset).getReg(0));
2423 return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
2424}
2425
2426/// Return true if the value is a known valid address, such that a null check is
2427/// not necessary.
2429 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
2430 MachineInstr *Def = MRI.getVRegDef(Val);
2431 switch (Def->getOpcode()) {
2432 case AMDGPU::G_FRAME_INDEX:
2433 case AMDGPU::G_GLOBAL_VALUE:
2434 case AMDGPU::G_BLOCK_ADDR:
2435 return true;
2436 case AMDGPU::G_CONSTANT: {
2437 const ConstantInt *CI = Def->getOperand(1).getCImm();
2438 return CI->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
2439 }
2440 default:
2441 return false;
2442 }
2443
2444 return false;
2445}
2446
2449 MachineIRBuilder &B) const {
2450 MachineFunction &MF = B.getMF();
2451
2452 // MI can either be a G_ADDRSPACE_CAST or a
2453 // G_INTRINSIC @llvm.amdgcn.addrspacecast.nonnull
2454 assert(MI.getOpcode() == TargetOpcode::G_ADDRSPACE_CAST ||
2455 (isa<GIntrinsic>(MI) && cast<GIntrinsic>(MI).getIntrinsicID() ==
2456 Intrinsic::amdgcn_addrspacecast_nonnull));
2457
2458 const LLT S32 = LLT::scalar(32);
2459 Register Dst = MI.getOperand(0).getReg();
2460 Register Src = isa<GIntrinsic>(MI) ? MI.getOperand(2).getReg()
2461 : MI.getOperand(1).getReg();
2462 LLT DstTy = MRI.getType(Dst);
2463 LLT SrcTy = MRI.getType(Src);
2464 unsigned DestAS = DstTy.getAddressSpace();
2465 unsigned SrcAS = SrcTy.getAddressSpace();
2466
2467 // TODO: Avoid reloading from the queue ptr for each cast, or at least each
2468 // vector element.
2469 assert(!DstTy.isVector());
2470
2471 const AMDGPUTargetMachine &TM
2472 = static_cast<const AMDGPUTargetMachine &>(MF.getTarget());
2473
2474 if (TM.isNoopAddrSpaceCast(SrcAS, DestAS)) {
2475 MI.setDesc(B.getTII().get(TargetOpcode::G_BITCAST));
2476 return true;
2477 }
2478
2479 if (SrcAS == AMDGPUAS::FLAT_ADDRESS &&
2480 (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
2481 DestAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2482 auto castFlatToLocalOrPrivate = [&](const DstOp &Dst) -> Register {
2483 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2484 ST.hasGloballyAddressableScratch()) {
2485 // flat -> private with globally addressable scratch: subtract
2486 // src_flat_scratch_base_lo.
2487 const LLT S32 = LLT::scalar(32);
2488 Register SrcLo = B.buildExtract(S32, Src, 0).getReg(0);
2489 Register FlatScratchBaseLo =
2490 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
2491 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO)})
2492 .getReg(0);
2493 MRI.setRegClass(FlatScratchBaseLo, &AMDGPU::SReg_32RegClass);
2494 Register Sub = B.buildSub(S32, SrcLo, FlatScratchBaseLo).getReg(0);
2495 return B.buildIntToPtr(Dst, Sub).getReg(0);
2496 }
2497
2498 // Extract low 32-bits of the pointer.
2499 return B.buildExtract(Dst, Src, 0).getReg(0);
2500 };
2501
2502 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2503 // G_ADDRSPACE_CAST we need to guess.
2504 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2505 castFlatToLocalOrPrivate(Dst);
2506 MI.eraseFromParent();
2507 return true;
2508 }
2509
2510 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
2511
2512 auto SegmentNull = B.buildConstant(DstTy, NullVal);
2513 auto FlatNull = B.buildConstant(SrcTy, 0);
2514
2515 // Extract low 32-bits of the pointer.
2516 auto PtrLo32 = castFlatToLocalOrPrivate(DstTy);
2517
2518 auto CmpRes =
2519 B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
2520 B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
2521
2522 MI.eraseFromParent();
2523 return true;
2524 }
2525
2526 if (DestAS == AMDGPUAS::FLAT_ADDRESS &&
2527 (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
2528 SrcAS == AMDGPUAS::PRIVATE_ADDRESS)) {
2529 auto castLocalOrPrivateToFlat = [&](const DstOp &Dst) -> Register {
2530 // Coerce the type of the low half of the result so we can use
2531 // merge_values.
2532 Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
2533
2534 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
2535 ST.hasGloballyAddressableScratch()) {
2536 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
2537 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
2538 Register AllOnes = B.buildConstant(S32, -1).getReg(0);
2539 Register ThreadID = B.buildConstant(S32, 0).getReg(0);
2540 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_lo, {S32})
2541 .addUse(AllOnes)
2542 .addUse(ThreadID)
2543 .getReg(0);
2544 if (ST.isWave64()) {
2545 ThreadID = B.buildIntrinsic(Intrinsic::amdgcn_mbcnt_hi, {S32})
2546 .addUse(AllOnes)
2547 .addUse(ThreadID)
2548 .getReg(0);
2549 }
2550 Register ShAmt =
2551 B.buildConstant(S32, 57 - 32 - ST.getWavefrontSizeLog2()).getReg(0);
2552 Register SrcHi = B.buildShl(S32, ThreadID, ShAmt).getReg(0);
2553 Register CvtPtr =
2554 B.buildMergeLikeInstr(DstTy, {SrcAsInt, SrcHi}).getReg(0);
2555 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
2556 // 64-bit hi:lo value.
2557 Register FlatScratchBase =
2558 B.buildInstr(AMDGPU::S_MOV_B64, {S64},
2559 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE)})
2560 .getReg(0);
2561 MRI.setRegClass(FlatScratchBase, &AMDGPU::SReg_64RegClass);
2562 return B.buildPtrAdd(Dst, CvtPtr, FlatScratchBase).getReg(0);
2563 }
2564
2565 Register ApertureReg = getSegmentAperture(SrcAS, MRI, B);
2566 if (!ApertureReg.isValid())
2567 return false;
2568
2569 // TODO: Should we allow mismatched types but matching sizes in merges to
2570 // avoid the ptrtoint?
2571 return B.buildMergeLikeInstr(Dst, {SrcAsInt, ApertureReg}).getReg(0);
2572 };
2573
2574 // For llvm.amdgcn.addrspacecast.nonnull we can always assume non-null, for
2575 // G_ADDRSPACE_CAST we need to guess.
2576 if (isa<GIntrinsic>(MI) || isKnownNonNull(Src, MRI, TM, SrcAS)) {
2577 castLocalOrPrivateToFlat(Dst);
2578 MI.eraseFromParent();
2579 return true;
2580 }
2581
2582 Register BuildPtr = castLocalOrPrivateToFlat(DstTy);
2583
2584 auto SegmentNull =
2585 B.buildConstant(SrcTy, AMDGPU::getNullPointerValue(SrcAS));
2586 auto FlatNull = B.buildConstant(DstTy, AMDGPU::getNullPointerValue(DestAS));
2587
2588 auto CmpRes = B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src,
2589 SegmentNull.getReg(0));
2590
2591 B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
2592
2593 MI.eraseFromParent();
2594 return true;
2595 }
2596
2597 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2598 SrcTy.getSizeInBits() == 64) {
2599 // Truncate.
2600 B.buildExtract(Dst, Src, 0);
2601 MI.eraseFromParent();
2602 return true;
2603 }
2604
2605 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
2606 DstTy.getSizeInBits() == 64) {
2608 uint32_t AddrHiVal = Info->get32BitAddressHighBits();
2609 auto PtrLo = B.buildPtrToInt(S32, Src);
2610 if (AddrHiVal == 0) {
2611 auto Zext = B.buildZExt(LLT::scalar(64), PtrLo);
2612 B.buildIntToPtr(Dst, Zext);
2613 } else {
2614 auto HighAddr = B.buildConstant(S32, AddrHiVal);
2615 B.buildMergeLikeInstr(Dst, {PtrLo, HighAddr});
2616 }
2617
2618 MI.eraseFromParent();
2619 return true;
2620 }
2621
2622 // Invalid casts are poison.
2623 // TODO: Should return poison
2624 B.buildUndef(Dst);
2625 MI.eraseFromParent();
2626 return true;
2627}
2628
2631 MachineIRBuilder &B) const {
2632 Register Src = MI.getOperand(1).getReg();
2633 LLT Ty = MRI.getType(Src);
2634 assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
2635
2636 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2637 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2638
2639 auto C1 = B.buildFConstant(Ty, C1Val);
2640 auto CopySign = B.buildFCopysign(Ty, C1, Src);
2641
2642 // TODO: Should this propagate fast-math-flags?
2643 auto Tmp1 = B.buildFAdd(Ty, Src, CopySign);
2644 auto Tmp2 = B.buildFSub(Ty, Tmp1, CopySign);
2645
2646 auto C2 = B.buildFConstant(Ty, C2Val);
2647 auto Fabs = B.buildFAbs(Ty, Src);
2648
2649 auto Cond = B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), Fabs, C2);
2650 B.buildSelect(MI.getOperand(0).getReg(), Cond, Src, Tmp2);
2651 MI.eraseFromParent();
2652 return true;
2653}
2654
2657 MachineIRBuilder &B) const {
2658
2659 const LLT S1 = LLT::scalar(1);
2660 const LLT S64 = LLT::scalar(64);
2661
2662 Register Src = MI.getOperand(1).getReg();
2663 assert(MRI.getType(Src) == S64);
2664
2665 // result = trunc(src)
2666 // if (src > 0.0 && src != result)
2667 // result += 1.0
2668
2669 auto Trunc = B.buildIntrinsicTrunc(S64, Src);
2670
2671 const auto Zero = B.buildFConstant(S64, 0.0);
2672 const auto One = B.buildFConstant(S64, 1.0);
2673 auto Lt0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, Src, Zero);
2674 auto NeTrunc = B.buildFCmp(CmpInst::FCMP_ONE, S1, Src, Trunc);
2675 auto And = B.buildAnd(S1, Lt0, NeTrunc);
2676 auto Add = B.buildSelect(S64, And, One, Zero);
2677
2678 // TODO: Should this propagate fast-math-flags?
2679 B.buildFAdd(MI.getOperand(0).getReg(), Trunc, Add);
2680 MI.eraseFromParent();
2681 return true;
2682}
2683
2686 MachineIRBuilder &B) const {
2687 Register DstReg = MI.getOperand(0).getReg();
2688 Register Src0Reg = MI.getOperand(1).getReg();
2689 Register Src1Reg = MI.getOperand(2).getReg();
2690 auto Flags = MI.getFlags();
2691 LLT Ty = MRI.getType(DstReg);
2692
2693 auto Div = B.buildFDiv(Ty, Src0Reg, Src1Reg, Flags);
2694 auto Trunc = B.buildIntrinsicTrunc(Ty, Div, Flags);
2695 auto Neg = B.buildFNeg(Ty, Trunc, Flags);
2696 B.buildFMA(DstReg, Neg, Src1Reg, Src0Reg, Flags);
2697 MI.eraseFromParent();
2698 return true;
2699}
2700
2703 const unsigned FractBits = 52;
2704 const unsigned ExpBits = 11;
2705 LLT S32 = LLT::scalar(32);
2706
2707 auto Const0 = B.buildConstant(S32, FractBits - 32);
2708 auto Const1 = B.buildConstant(S32, ExpBits);
2709
2710 auto ExpPart = B.buildIntrinsic(Intrinsic::amdgcn_ubfe, {S32})
2711 .addUse(Hi)
2712 .addUse(Const0.getReg(0))
2713 .addUse(Const1.getReg(0));
2714
2715 return B.buildSub(S32, ExpPart, B.buildConstant(S32, 1023));
2716}
2717
2720 MachineIRBuilder &B) const {
2721 const LLT S1 = LLT::scalar(1);
2722 const LLT S32 = LLT::scalar(32);
2723 const LLT S64 = LLT::scalar(64);
2724
2725 Register Src = MI.getOperand(1).getReg();
2726 assert(MRI.getType(Src) == S64);
2727
2728 // TODO: Should this use extract since the low half is unused?
2729 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2730 Register Hi = Unmerge.getReg(1);
2731
2732 // Extract the upper half, since this is where we will find the sign and
2733 // exponent.
2734 auto Exp = extractF64Exponent(Hi, B);
2735
2736 const unsigned FractBits = 52;
2737
2738 // Extract the sign bit.
2739 const auto SignBitMask = B.buildConstant(S32, UINT32_C(1) << 31);
2740 auto SignBit = B.buildAnd(S32, Hi, SignBitMask);
2741
2742 const auto FractMask = B.buildConstant(S64, (UINT64_C(1) << FractBits) - 1);
2743
2744 const auto Zero32 = B.buildConstant(S32, 0);
2745
2746 // Extend back to 64-bits.
2747 auto SignBit64 = B.buildMergeLikeInstr(S64, {Zero32, SignBit});
2748
2749 auto Shr = B.buildAShr(S64, FractMask, Exp);
2750 auto Not = B.buildNot(S64, Shr);
2751 auto Tmp0 = B.buildAnd(S64, Src, Not);
2752 auto FiftyOne = B.buildConstant(S32, FractBits - 1);
2753
2754 auto ExpLt0 = B.buildICmp(CmpInst::ICMP_SLT, S1, Exp, Zero32);
2755 auto ExpGt51 = B.buildICmp(CmpInst::ICMP_SGT, S1, Exp, FiftyOne);
2756
2757 auto Tmp1 = B.buildSelect(S64, ExpLt0, SignBit64, Tmp0);
2758 B.buildSelect(MI.getOperand(0).getReg(), ExpGt51, Src, Tmp1);
2759 MI.eraseFromParent();
2760 return true;
2761}
2762
2765 MachineIRBuilder &B, bool Signed) const {
2766
2767 Register Dst = MI.getOperand(0).getReg();
2768 Register Src = MI.getOperand(1).getReg();
2769
2770 const LLT S64 = LLT::scalar(64);
2771 const LLT S32 = LLT::scalar(32);
2772
2773 assert(MRI.getType(Src) == S64);
2774
2775 auto Unmerge = B.buildUnmerge({S32, S32}, Src);
2776 auto ThirtyTwo = B.buildConstant(S32, 32);
2777
2778 if (MRI.getType(Dst) == S64) {
2779 auto CvtHi = Signed ? B.buildSITOFP(S64, Unmerge.getReg(1))
2780 : B.buildUITOFP(S64, Unmerge.getReg(1));
2781
2782 auto CvtLo = B.buildUITOFP(S64, Unmerge.getReg(0));
2783 auto LdExp = B.buildFLdexp(S64, CvtHi, ThirtyTwo);
2784
2785 // TODO: Should this propagate fast-math-flags?
2786 B.buildFAdd(Dst, LdExp, CvtLo);
2787 MI.eraseFromParent();
2788 return true;
2789 }
2790
2791 assert(MRI.getType(Dst) == S32);
2792
2793 auto One = B.buildConstant(S32, 1);
2794
2795 MachineInstrBuilder ShAmt;
2796 if (Signed) {
2797 auto ThirtyOne = B.buildConstant(S32, 31);
2798 auto X = B.buildXor(S32, Unmerge.getReg(0), Unmerge.getReg(1));
2799 auto OppositeSign = B.buildAShr(S32, X, ThirtyOne);
2800 auto MaxShAmt = B.buildAdd(S32, ThirtyTwo, OppositeSign);
2801 auto LS = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32})
2802 .addUse(Unmerge.getReg(1));
2803 auto LS2 = B.buildSub(S32, LS, One);
2804 ShAmt = B.buildUMin(S32, LS2, MaxShAmt);
2805 } else
2806 ShAmt = B.buildCTLZ(S32, Unmerge.getReg(1));
2807 auto Norm = B.buildShl(S64, Src, ShAmt);
2808 auto Unmerge2 = B.buildUnmerge({S32, S32}, Norm);
2809 auto Adjust = B.buildUMin(S32, One, Unmerge2.getReg(0));
2810 auto Norm2 = B.buildOr(S32, Unmerge2.getReg(1), Adjust);
2811 auto FVal = Signed ? B.buildSITOFP(S32, Norm2) : B.buildUITOFP(S32, Norm2);
2812 auto Scale = B.buildSub(S32, ThirtyTwo, ShAmt);
2813 B.buildFLdexp(Dst, FVal, Scale);
2814 MI.eraseFromParent();
2815 return true;
2816}
2817
2818// TODO: Copied from DAG implementation. Verify logic and document how this
2819// actually works.
2823 bool Signed) const {
2824
2825 Register Dst = MI.getOperand(0).getReg();
2826 Register Src = MI.getOperand(1).getReg();
2827
2828 const LLT S64 = LLT::scalar(64);
2829 const LLT S32 = LLT::scalar(32);
2830
2831 const LLT SrcLT = MRI.getType(Src);
2832 assert((SrcLT == S32 || SrcLT == S64) && MRI.getType(Dst) == S64);
2833
2834 unsigned Flags = MI.getFlags();
2835
2836 // The basic idea of converting a floating point number into a pair of 32-bit
2837 // integers is illustrated as follows:
2838 //
2839 // tf := trunc(val);
2840 // hif := floor(tf * 2^-32);
2841 // lof := tf - hif * 2^32; // lof is always positive due to floor.
2842 // hi := fptoi(hif);
2843 // lo := fptoi(lof);
2844 //
2845 auto Trunc = B.buildIntrinsicTrunc(SrcLT, Src, Flags);
2847 if (Signed && SrcLT == S32) {
2848 // However, a 32-bit floating point number has only 23 bits mantissa and
2849 // it's not enough to hold all the significant bits of `lof` if val is
2850 // negative. To avoid the loss of precision, We need to take the absolute
2851 // value after truncating and flip the result back based on the original
2852 // signedness.
2853 Sign = B.buildAShr(S32, Src, B.buildConstant(S32, 31));
2854 Trunc = B.buildFAbs(S32, Trunc, Flags);
2855 }
2856 MachineInstrBuilder K0, K1;
2857 if (SrcLT == S64) {
2858 K0 = B.buildFConstant(
2859 S64, llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)));
2860 K1 = B.buildFConstant(
2861 S64, llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)));
2862 } else {
2863 K0 = B.buildFConstant(
2864 S32, llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)));
2865 K1 = B.buildFConstant(
2866 S32, llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)));
2867 }
2868
2869 auto Mul = B.buildFMul(SrcLT, Trunc, K0, Flags);
2870 auto FloorMul = B.buildFFloor(SrcLT, Mul, Flags);
2871 auto Fma = B.buildFMA(SrcLT, FloorMul, K1, Trunc, Flags);
2872
2873 auto Hi = (Signed && SrcLT == S64) ? B.buildFPTOSI(S32, FloorMul)
2874 : B.buildFPTOUI(S32, FloorMul);
2875 auto Lo = B.buildFPTOUI(S32, Fma);
2876
2877 if (Signed && SrcLT == S32) {
2878 // Flip the result based on the signedness, which is either all 0s or 1s.
2879 Sign = B.buildMergeLikeInstr(S64, {Sign, Sign});
2880 // r := xor({lo, hi}, sign) - sign;
2881 B.buildSub(Dst, B.buildXor(S64, B.buildMergeLikeInstr(S64, {Lo, Hi}), Sign),
2882 Sign);
2883 } else
2884 B.buildMergeLikeInstr(Dst, {Lo, Hi});
2885 MI.eraseFromParent();
2886
2887 return true;
2888}
2889
2891 MachineInstr &MI) const {
2892 MachineFunction &MF = Helper.MIRBuilder.getMF();
2894
2895 // With ieee_mode disabled, the instructions have the correct behavior.
2896 if (!MFI->getMode().IEEE)
2897 return true;
2898
2900}
2901
2903 MachineInstr &MI) const {
2904 MachineIRBuilder &B = Helper.MIRBuilder;
2905 MachineRegisterInfo &MRI = *B.getMRI();
2906 Register DstReg = MI.getOperand(0).getReg();
2907 Register SrcReg = MI.getOperand(1).getReg();
2908 uint64_t Offset = MI.getOperand(2).getImm();
2909
2910 // Fall back to generic lowering for offset 0 (trivial trunc) and
2911 // non-32-bit-aligned cases which require shift+trunc sequences
2912 // that generic code handles correctly.
2913 if (Offset == 0 || Offset % 32 != 0)
2914 return Helper.lowerExtract(MI) == LegalizerHelper::Legalized;
2915
2916 const LLT DstTy = MRI.getType(DstReg);
2917 unsigned StartIdx = Offset / 32;
2918 unsigned DstCount = DstTy.getSizeInBits() / 32;
2919 auto Unmerge = B.buildUnmerge(LLT::scalar(32), SrcReg);
2920
2921 if (DstCount == 1) {
2922 if (DstTy.isPointer())
2923 B.buildIntToPtr(DstReg, Unmerge.getReg(StartIdx));
2924 else
2925 MRI.replaceRegWith(DstReg, Unmerge.getReg(StartIdx));
2926 } else {
2927 SmallVector<Register, 8> MergeVec;
2928 for (unsigned I = 0; I < DstCount; ++I)
2929 MergeVec.push_back(Unmerge.getReg(StartIdx + I));
2930 B.buildMergeLikeInstr(DstReg, MergeVec);
2931 }
2932
2933 MI.eraseFromParent();
2934 return true;
2935}
2936
2938 MachineInstr &MI) const {
2939 MachineIRBuilder &B = Helper.MIRBuilder;
2940 MachineRegisterInfo &MRI = *B.getMRI();
2941 Register DstReg = MI.getOperand(0).getReg();
2942 Register SrcReg = MI.getOperand(1).getReg();
2943 Register InsertSrc = MI.getOperand(2).getReg();
2944 uint64_t Offset = MI.getOperand(3).getImm();
2945
2946 unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
2947 const LLT InsertTy = MRI.getType(InsertSrc);
2948 unsigned InsertSize = InsertTy.getSizeInBits();
2949
2950 // Fall back to generic lowering for non-32-bit-aligned cases which
2951 // require shift+mask sequences that generic code handles correctly.
2952 if (Offset % 32 != 0 || DstSize % 32 != 0 || InsertSize % 32 != 0)
2953 return Helper.lowerInsert(MI) == LegalizerHelper::Legalized;
2954
2955 const LLT S32 = LLT::scalar(32);
2956 unsigned DstCount = DstSize / 32;
2957 unsigned InsertCount = InsertSize / 32;
2958 unsigned StartIdx = Offset / 32;
2959
2960 auto SrcUnmerge = B.buildUnmerge(S32, SrcReg);
2961
2962 SmallVector<Register, 8> MergeVec;
2963 for (unsigned I = 0; I < StartIdx; ++I)
2964 MergeVec.push_back(SrcUnmerge.getReg(I));
2965
2966 if (InsertCount == 1) {
2967 // Merge-like instructions require same source types. Convert pointer
2968 // to scalar when inserting a pointer value into a scalar.
2969 if (InsertTy.isPointer())
2970 InsertSrc = B.buildPtrToInt(S32, InsertSrc).getReg(0);
2971 MergeVec.push_back(InsertSrc);
2972 } else {
2973 auto InsertUnmerge = B.buildUnmerge(S32, InsertSrc);
2974 for (unsigned I = 0; I < InsertCount; ++I)
2975 MergeVec.push_back(InsertUnmerge.getReg(I));
2976 }
2977
2978 for (unsigned I = StartIdx + InsertCount; I < DstCount; ++I)
2979 MergeVec.push_back(SrcUnmerge.getReg(I));
2980
2981 B.buildMergeLikeInstr(DstReg, MergeVec);
2982
2983 MI.eraseFromParent();
2984 return true;
2985}
2986
// Custom legalization of an extract-vector-element: for a constant in-range
// index, unmerge the vector and copy out the selected element; dynamic
// indices are left for selection. NOTE(review): the first signature line is
// outside this view — presumably AMDGPULegalizerInfo::legalizeExtractVectorElt.
2989 MachineIRBuilder &B) const {
2990 // TODO: Should move some of this into LegalizerHelper.
2991
2992 // TODO: Promote dynamic indexing of s16 to s32
2993
2994 Register Dst = MI.getOperand(0).getReg();
2995 Register Vec = MI.getOperand(1).getReg();
2996
2997 LLT VecTy = MRI.getType(Vec);
2998 LLT EltTy = VecTy.getElementType();
// Result type must match the element type of the source vector.
2999 assert(EltTy == MRI.getType(Dst));
3000
3001 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3002 // but we can't go directly to that logic because you can't bitcast a vector
3003 // of pointers to a vector of integers. Therefore, introduce an intermediate
3004 // vector of integers using ptrtoint (and inttoptr on the output) in order to
3005 // drive the legalization forward.
3006 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3007 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3008 LLT IntVecTy = VecTy.changeElementType(IntTy);
3009
3010 auto IntVec = B.buildPtrToInt(IntVecTy, Vec);
3011 auto IntElt = B.buildExtractVectorElement(IntTy, IntVec, MI.getOperand(2));
3012 B.buildIntToPtr(Dst, IntElt);
3013
3014 MI.eraseFromParent();
3015 return true;
3016 }
3017
3018 // FIXME: Artifact combiner probably should have replaced the truncated
3019 // constant before this, so we shouldn't need
3020 // getIConstantVRegValWithLookThrough.
3021 std::optional<ValueAndVReg> MaybeIdxVal =
3022 getIConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
3023 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3024 return true;
3025 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3026
3027 if (IdxVal < VecTy.getNumElements()) {
// In-bounds constant index: split the vector into scalars and copy the one
// we want.
3028 auto Unmerge = B.buildUnmerge(EltTy, Vec);
3029 B.buildCopy(Dst, Unmerge.getReg(IdxVal));
3030 } else {
// Out-of-bounds extract is a poison/undef result.
3031 B.buildUndef(Dst);
3032 }
3033
3034 MI.eraseFromParent();
3035 return true;
3036}
3037
// Custom legalization of an insert-vector-element: for a constant in-range
// index, unmerge the vector, replace one scalar, and re-merge; dynamic
// indices are left for selection. NOTE(review): the first signature line is
// outside this view — presumably AMDGPULegalizerInfo::legalizeInsertVectorElt.
3040 MachineIRBuilder &B) const {
3041 // TODO: Should move some of this into LegalizerHelper.
3042
3043 // TODO: Promote dynamic indexing of s16 to s32
3044
3045 Register Dst = MI.getOperand(0).getReg();
3046 Register Vec = MI.getOperand(1).getReg();
3047 Register Ins = MI.getOperand(2).getReg();
3048
3049 LLT VecTy = MRI.getType(Vec);
3050 LLT EltTy = VecTy.getElementType();
// Inserted value must have the vector's element type.
3051 assert(EltTy == MRI.getType(Ins));
3052
3053 // Other legalization maps vector<? x [type bigger than 64 bits]> via bitcasts
3054 // but we can't go directly to that logic because you can't bitcast a vector
3055 // of pointers to a vector of integers. Therefore, make the pointer vector
3056 // into an equivalent vector of integers with ptrtoint, insert the ptrtoint'd
3057 // new value, and then inttoptr the result vector back. This will then allow
3058 // the rest of legalization to take over.
3059 if (EltTy.isPointer() && EltTy.getSizeInBits() > 64) {
3060 LLT IntTy = LLT::scalar(EltTy.getSizeInBits());
3061 LLT IntVecTy = VecTy.changeElementType(IntTy);
3062
3063 auto IntVecSource = B.buildPtrToInt(IntVecTy, Vec);
3064 auto IntIns = B.buildPtrToInt(IntTy, Ins);
3065 auto IntVecDest = B.buildInsertVectorElement(IntVecTy, IntVecSource, IntIns,
3066 MI.getOperand(3));
3067 B.buildIntToPtr(Dst, IntVecDest);
3068 MI.eraseFromParent();
3069 return true;
3070 }
3071
3072 // FIXME: Artifact combiner probably should have replaced the truncated
3073 // constant before this, so we shouldn't need
3074 // getIConstantVRegValWithLookThrough.
3075 std::optional<ValueAndVReg> MaybeIdxVal =
3076 getIConstantVRegValWithLookThrough(MI.getOperand(3).getReg(), MRI);
3077 if (!MaybeIdxVal) // Dynamic case will be selected to register indexing.
3078 return true;
3079
3080 const uint64_t IdxVal = MaybeIdxVal->Value.getZExtValue();
3081
3082 unsigned NumElts = VecTy.getNumElements();
3083 if (IdxVal < NumElts) {
// NOTE(review): the declaration of SrcRegs (original line 3084) is elided
// in this view; presumably a SmallVector<Register, N>.
3085 for (unsigned i = 0; i < NumElts; ++i)
3086 SrcRegs.push_back(MRI.createGenericVirtualRegister(EltTy));
3087 B.buildUnmerge(SrcRegs, Vec);
3088
// Swap in the inserted value at the constant index, then rebuild the vector.
3089 SrcRegs[IdxVal] = MI.getOperand(2).getReg();
3090 B.buildMergeLikeInstr(Dst, SrcRegs);
3091 } else {
// Out-of-bounds insert produces an undefined vector.
3092 B.buildUndef(Dst);
3093 }
3094
3095 MI.eraseFromParent();
3096 return true;
3097}
3098
// Lower G_FSIN/G_FCOS to the amdgcn_sin/amdgcn_cos intrinsics. The hardware
// ops take an argument pre-scaled by 1/(2*pi); subtargets with a reduced trig
// range additionally need an amdgcn_fract on the scaled value. NOTE(review):
// the first signature line is outside this view.
3101 MachineIRBuilder &B) const {
3102
3103 Register DstReg = MI.getOperand(0).getReg();
3104 Register SrcReg = MI.getOperand(1).getReg();
3105 LLT Ty = MRI.getType(DstReg);
// Preserve the fast-math flags from the original instruction.
3106 unsigned Flags = MI.getFlags();
3107
3108 Register TrigVal;
// 0.5 * (1/pi) == 1/(2*pi): convert radians to the hardware's unit period.
3109 auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
3110 if (ST.hasTrigReducedRange()) {
3111 auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
3112 TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty})
3113 .addUse(MulVal.getReg(0))
3114 .setMIFlags(Flags)
3115 .getReg(0);
3116 } else
3117 TrigVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags).getReg(0);
3118
3119 Intrinsic::ID TrigIntrin = MI.getOpcode() == AMDGPU::G_FSIN ?
3120 Intrinsic::amdgcn_sin : Intrinsic::amdgcn_cos;
3121 B.buildIntrinsic(TrigIntrin, ArrayRef<Register>(DstReg))
3122 .addUse(TrigVal)
3123 .setMIFlags(Flags);
3124 MI.eraseFromParent();
3125 return true;
3126}
3127
// Materialize a pc-relative global address via SI_PC_ADD_REL_OFFSET (or the
// 64-bit-literal variant), as described in the sequence comments below. For
// 32-bit pointer results, the low half of the computed 64-bit address is
// extracted at the end. NOTE(review): the first signature line is outside
// this view — presumably AMDGPULegalizerInfo::buildPCRelGlobalAddress.
3130 const GlobalValue *GV,
3131 int64_t Offset,
3132 unsigned GAFlags) const {
3133 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
3134 // In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
3135 // to the following code sequence:
3136 //
3137 // For constant address space:
3138 // s_getpc_b64 s[0:1]
3139 // s_add_u32 s0, s0, $symbol
3140 // s_addc_u32 s1, s1, 0
3141 //
3142 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3143 // a fixup or relocation is emitted to replace $symbol with a literal
3144 // constant, which is a pc-relative offset from the encoding of the $symbol
3145 // operand to the global variable.
3146 //
3147 // For global address space:
3148 // s_getpc_b64 s[0:1]
3149 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
3150 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
3151 //
3152 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
3153 // fixups or relocations are emitted to replace $symbol@*@lo and
3154 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
3155 // which is a 64-bit pc-relative offset from the encoding of the $symbol
3156 // operand to the global variable.
3157
// NOTE(review): original line 3158 (presumably the ConstPtrTy declaration)
// is elided in this view.
3159
// Compute into DstReg directly for 64-bit results; otherwise use a scratch
// 64-bit register and extract the low 32 bits below.
3160 Register PCReg = PtrTy.getSizeInBits() != 32 ? DstReg :
3161 B.getMRI()->createGenericVirtualRegister(ConstPtrTy);
3162
3163 if (ST.has64BitLiterals()) {
3164 assert(GAFlags != SIInstrInfo::MO_NONE);
3165
// NOTE(review): the MIB declaration lines (3166/3170) are elided here.
3167 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET64).addDef(PCReg);
// GAFlags + 2 presumably selects the 64-bit relocation variant of the
// target flag — confirm against SIInstrInfo's MO_* ordering.
3168 MIB.addGlobalAddress(GV, Offset, GAFlags + 2);
3169 } else {
3171 B.buildInstr(AMDGPU::SI_PC_ADD_REL_OFFSET).addDef(PCReg);
3172
3173 MIB.addGlobalAddress(GV, Offset, GAFlags);
3174 if (GAFlags == SIInstrInfo::MO_NONE)
3175 MIB.addImm(0);
3176 else
// GAFlags + 1 is the matching @hi relocation flag for the second operand.
3177 MIB.addGlobalAddress(GV, Offset, GAFlags + 1);
3178 }
3179
3180 if (!B.getMRI()->getRegClassOrNull(PCReg))
3181 B.getMRI()->setRegClass(PCReg, &AMDGPU::SReg_64RegClass);
3182
3183 if (PtrTy.getSizeInBits() == 32)
3184 B.buildExtract(DstReg, PCReg, 0);
3185 return true;
3186}
3187
3188// Emit an ABS32_LO / ABS32_HI relocation stub.
// Materializes an absolute global address with S_MOV_B32 moves carrying
// MO_ABS32_LO/MO_ABS32_HI relocations (or a single S_MOV_B64 with MO_ABS64
// when 64-bit literals are available), then merges/casts into DstReg.
3190 Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
3191 MachineRegisterInfo &MRI) const {
// A 32-bit pointer only needs the low half; anything wider needs both halves.
3192 bool RequiresHighHalf = PtrTy.getSizeInBits() != 32;
3193
// Fast path: one 64-bit move with a single ABS64 relocation.
3194 if (RequiresHighHalf && ST.has64BitLiterals()) {
3195 if (!MRI.getRegClassOrNull(DstReg))
3196 MRI.setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
3197 B.buildInstr(AMDGPU::S_MOV_B64)
3198 .addDef(DstReg)
3199 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS64);
3200 return;
3201 }
3202
3203 LLT S32 = LLT::scalar(32);
3204
3205 // Use the destination directly, if and only if we store the lower address
3206 // part only and we don't have a register class being set.
// NOTE(review): the fallback operand of this conditional (original line
// 3209, presumably a fresh 32-bit virtual register) is elided in this view.
3207 Register AddrLo = !RequiresHighHalf && !MRI.getRegClassOrNull(DstReg)
3208 ? DstReg
3210
3211 if (!MRI.getRegClassOrNull(AddrLo))
3212 MRI.setRegClass(AddrLo, &AMDGPU::SReg_32RegClass);
3213
3214 // Write the lower half.
3215 B.buildInstr(AMDGPU::S_MOV_B32)
3216 .addDef(AddrLo)
3217 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_LO);
3218
3219 // If required, write the upper half as well.
3220 if (RequiresHighHalf) {
3221 assert(PtrTy.getSizeInBits() == 64 &&
3222 "Must provide a 64-bit pointer type!");
3223
// NOTE(review): the AddrHi declaration (original line 3224) is elided here.
3225 MRI.setRegClass(AddrHi, &AMDGPU::SReg_32RegClass);
3226
3227 B.buildInstr(AMDGPU::S_MOV_B32)
3228 .addDef(AddrHi)
3229 .addGlobalAddress(GV, 0, SIInstrInfo::MO_ABS32_HI);
3230
3231 // Use the destination directly, if and only if we don't have a register
3232 // class being set.
// NOTE(review): the fallback operand (original line 3235) is elided here.
3233 Register AddrDst = !MRI.getRegClassOrNull(DstReg)
3234 ? DstReg
3236
3237 if (!MRI.getRegClassOrNull(AddrDst))
3238 MRI.setRegClass(AddrDst, &AMDGPU::SReg_64RegClass);
3239
3240 B.buildMergeValues(AddrDst, {AddrLo, AddrHi});
3241
3242 // If we created a new register for the destination, cast the result into
3243 // the final output.
3244 if (AddrDst != DstReg)
3245 B.buildCast(DstReg, AddrDst);
3246 } else if (AddrLo != DstReg) {
3247 // If we created a new register for the destination, cast the result into
3248 // the final output.
3249 B.buildCast(DstReg, AddrLo);
3250 }
3251}
3252
// Legalize G_GLOBAL_VALUE. LDS globals become frame-relative constants (or
// the groupstaticsize intrinsic for dynamic shared memory); PAL/Mesa use
// absolute relocations; otherwise a pc-relative fixup, pc-relative
// relocation, or a load from the GOT is emitted. NOTE(review): the first
// signature line is outside this view.
3255 MachineIRBuilder &B) const {
3256 Register DstReg = MI.getOperand(0).getReg();
3257 LLT Ty = MRI.getType(DstReg);
3258 unsigned AS = Ty.getAddressSpace();
3259
3260 const GlobalValue *GV = MI.getOperand(1).getGlobal();
3261 MachineFunction &MF = B.getMF();
// NOTE(review): lines 3262/3264 are elided — presumably the address-space
// guard for the LDS branch below and the SIMachineFunctionInfo (MFI) lookup.
3263
// LDS globals are only directly addressable from kernels (module entry
// functions); the named module/kernel LDS aggregates are exempt.
3265 if (!MFI->isModuleEntryFunction() &&
3266 GV->getName() != "llvm.amdgcn.module.lds" &&
3268 const Function &Fn = MF.getFunction();
// NOTE(review): line 3269 (start of the diagnostic emission, presumably
// DiagnosticInfoUnsupported) is elided in this view.
3270 Fn, "local memory global used by non-kernel function",
3271 MI.getDebugLoc(), DS_Warning));
3272
3273 // We currently don't have a way to correctly allocate LDS objects that
3274 // aren't directly associated with a kernel. We do force inlining of
3275 // functions that use local objects. However, if these dead functions are
3276 // not eliminated, we don't want a compile time error. Just emit a warning
3277 // and a trap, since there should be no callable path here.
3278 B.buildTrap();
3279 B.buildUndef(DstReg);
3280 MI.eraseFromParent();
3281 return true;
3282 }
3283
3284 // TODO: We could emit code to handle the initialization somewhere.
3285 // We ignore the initializer for now and legalize it to allow selection.
3286 // The initializer will anyway get errored out during assembly emission.
3287 const SITargetLowering *TLI = ST.getTargetLowering();
3288 if (!TLI->shouldUseLDSConstAddress(GV)) {
// Retag the operand and keep the instruction; selection handles the
// ABS32_LO form directly.
3289 MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
3290 return true; // Leave in place;
3291 }
3292
3293 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
3294 if (AS == AMDGPUAS::LOCAL_ADDRESS && GV->hasExternalLinkage()) {
3295 // HIP uses an unsized array `extern __shared__ T s[]` or similar
3296 // zero-sized type in other languages to declare the dynamic shared
3297 // memory which size is not known at the compile time. They will be
3298 // allocated by the runtime and placed directly after the static
3299 // allocated ones. They all share the same offset.
3300 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
3301 // Adjust alignment for that dynamic shared memory array.
3302 MFI->setDynLDSAlign(MF.getFunction(), GVar);
3303 LLT S32 = LLT::scalar(32);
// The dynamic LDS block starts right after all static LDS, whose total
// size is what groupstaticsize reports.
3304 auto Sz = B.buildIntrinsic(Intrinsic::amdgcn_groupstaticsize, {S32});
3305 B.buildIntToPtr(DstReg, Sz);
3306 MI.eraseFromParent();
3307 return true;
3308 }
3309 }
3310
// Static LDS: the address is just the allocated offset within the LDS block.
3311 B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), GVar));
3312 MI.eraseFromParent();
3313 return true;
3314 }
3315
3316 if (ST.isAmdPalOS() || ST.isMesa3DOS()) {
3317 buildAbsGlobalAddress(DstReg, Ty, B, GV, MRI);
3318 MI.eraseFromParent();
3319 return true;
3320 }
3321
3322 const SITargetLowering *TLI = ST.getTargetLowering();
3323
3324 if (TLI->shouldEmitFixup(GV)) {
3325 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0);
3326 MI.eraseFromParent();
3327 return true;
3328 }
3329
3330 if (TLI->shouldEmitPCReloc(GV)) {
3331 buildPCRelGlobalAddress(DstReg, Ty, B, GV, 0, SIInstrInfo::MO_REL32);
3332 MI.eraseFromParent();
3333 return true;
3334 }
3335
// Fall back to loading the address from the GOT.
// NOTE(review): the PtrTy declaration (original line 3336) is elided here.
3337 Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
3338
3339 LLT LoadTy = Ty.getSizeInBits() == 32 ? PtrTy : Ty;
// NOTE(review): lines 3340-3343 (the GOT MachineMemOperand construction)
// are elided in this view.
3344 LoadTy, Align(8));
3345
3346 buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
3347
3348 if (Ty.getSizeInBits() == 32) {
3349 // Truncate if this is a 32-bit constant address.
3350 auto Load = B.buildLoad(PtrTy, GOTAddr, *GOTMMO);
3351 B.buildExtract(DstReg, Load, 0);
3352 } else
3353 B.buildLoad(DstReg, GOTAddr, *GOTMMO);
3354
3355 MI.eraseFromParent();
3356 return true;
3357}
3358
// Round a type up to the next power of two: for vectors, round the element
// count; for scalars, round the bit width. NOTE(review): the signature line
// (original 3359) is elided — presumably `static LLT widenToNextPowerOf2(LLT Ty) {`.
3360 if (Ty.isVector())
3361 return Ty.changeElementCount(
3362 ElementCount::getFixed(PowerOf2Ceil(Ty.getNumElements())));
3363 return LLT::scalar(PowerOf2Ceil(Ty.getSizeInBits()));
3364}
3365
// Custom legalization for loads: casts 32-bit constant-address pointers to
// the 64-bit constant space, applies the buffer-resource v4i32 workaround,
// and widens non-power-of-2 loads up to the alignment when profitable.
// NOTE(review): the first signature line is outside this view.
3367 MachineInstr &MI) const {
3368 MachineIRBuilder &B = Helper.MIRBuilder;
3369 MachineRegisterInfo &MRI = *B.getMRI();
3370 GISelChangeObserver &Observer = Helper.Observer;
3371
3372 Register PtrReg = MI.getOperand(1).getReg();
3373 LLT PtrTy = MRI.getType(PtrReg);
3374 unsigned AddrSpace = PtrTy.getAddressSpace();
3375
3376 if (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
// NOTE(review): the ConstPtr type declaration (original line 3377) is
// elided in this view.
3378 auto Cast = B.buildAddrSpaceCast(ConstPtr, PtrReg);
3379 Observer.changingInstr(MI);
3380 MI.getOperand(1).setReg(Cast.getReg(0));
3381 Observer.changedInstr(MI);
3382 return true;
3383 }
3384
// The remaining transforms only apply to plain loads.
3385 if (MI.getOpcode() != AMDGPU::G_LOAD)
3386 return false;
3387
3388 Register ValReg = MI.getOperand(0).getReg();
3389 LLT ValTy = MRI.getType(ValReg);
3390
3391 if (hasBufferRsrcWorkaround(ValTy)) {
3392 Observer.changingInstr(MI);
3393 castBufferRsrcFromV4I32(MI, B, MRI, 0);
3394 Observer.changedInstr(MI);
3395 return true;
3396 }
3397
3398 MachineMemOperand *MMO = *MI.memoperands_begin();
3399 const unsigned ValSize = ValTy.getSizeInBits();
3400 const LLT MemTy = MMO->getMemoryType();
3401 const Align MemAlign = MMO->getAlign();
3402 const unsigned MemSize = MemTy.getSizeInBits();
3403 const uint64_t AlignInBits = 8 * MemAlign.value();
3404
3405 // Widen non-power-of-2 loads to the alignment if needed
3406 if (shouldWidenLoad(ST, MemTy, AlignInBits, AddrSpace, MI.getOpcode())) {
3407 const unsigned WideMemSize = PowerOf2Ceil(MemSize);
3408
3409 // This was already the correct extending load result type, so just adjust
3410 // the memory type.
3411 if (WideMemSize == ValSize) {
3412 MachineFunction &MF = B.getMF();
3413
3414 MachineMemOperand *WideMMO =
3415 MF.getMachineMemOperand(MMO, 0, WideMemSize / 8);
3416 Observer.changingInstr(MI);
3417 MI.setMemRefs(MF, {WideMMO});
3418 Observer.changedInstr(MI);
3419 return true;
3420 }
3421
3422 // Don't bother handling edge case that should probably never be produced.
3423 if (ValSize > WideMemSize)
3424 return false;
3425
3426 LLT WideTy = widenToNextPowerOf2(ValTy);
3427
3428 Register WideLoad;
3429 if (!WideTy.isVector()) {
// Scalar: load wide, then truncate back to the requested width.
3430 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3431 B.buildTrunc(ValReg, WideLoad).getReg(0);
3432 } else {
3433 // Extract the subvector.
3434
3435 if (isRegisterType(ST, ValTy)) {
3436 // If this a case where G_EXTRACT is legal, use it.
3437 // (e.g. <3 x s32> -> <4 x s32>)
3438 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3439 B.buildExtract(ValReg, WideLoad, 0);
3440 } else {
3441 // For cases where the widened type isn't a nice register value, unmerge
3442 // from a widened register (e.g. <3 x s16> -> <4 x s16>)
3443 WideLoad = B.buildLoadFromOffset(WideTy, PtrReg, *MMO, 0).getReg(0);
3444 B.buildDeleteTrailingVectorElements(ValReg, WideLoad);
3445 }
3446 }
3447
3448 MI.eraseFromParent();
3449 return true;
3450 }
3451
3452 return false;
3453}
3454
// Custom legalization for stores: applies the buffer-resource v4i32
// workaround to the stored value when needed; otherwise leaves the store
// alone. NOTE(review): the first signature line is outside this view, and
// the workaround-call line (original 3466) is elided below.
3456 MachineInstr &MI) const {
3457 MachineIRBuilder &B = Helper.MIRBuilder;
3458 MachineRegisterInfo &MRI = *B.getMRI();
3459 GISelChangeObserver &Observer = Helper.Observer;
3460
3461 Register DataReg = MI.getOperand(0).getReg();
3462 LLT DataTy = MRI.getType(DataReg);
3463
3464 if (hasBufferRsrcWorkaround(DataTy)) {
3465 Observer.changingInstr(MI);
3467 Observer.changedInstr(MI);
3468 return true;
3469 }
3470 return false;
3471}
3472
// Legalize G_FMAD: legal as-is when the hardware mad can be used for the
// type (the elided conditions presumably check denormal mode / subtarget
// support — confirm against the full source); otherwise expand via
// LegalizerHelper::lowerFMad. NOTE(review): the first signature line and
// lines 3480/3485/3488 are outside this view.
3475 MachineIRBuilder &B) const {
3476 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
3477 assert(Ty.isScalar());
3478
3479 MachineFunction &MF = B.getMF();
3481
3482 // TODO: Always legal with future ftz flag.
3483 // FIXME: Do we need just output?
3484 if (Ty == LLT::float32() &&
3486 return true;
3487 if (Ty == LLT::float16() &&
3489 return true;
3490
// Build a throwaway helper to reuse the generic fmad lowering.
3491 MachineIRBuilder HelperBuilder(MI);
3492 GISelObserverWrapper DummyObserver;
3493 LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
3494 return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
3495}
3496
// Lower G_ATOMIC_CMPXCHG to the target G_AMDGPU_ATOMIC_CMPXCHG, which takes
// the (new, cmp) pair packed into a 2-element vector operand. NOTE(review):
// the first signature line and the assert's first line (original 3504,
// presumably an address-space check) are outside this view.
3499 Register DstReg = MI.getOperand(0).getReg();
3500 Register PtrReg = MI.getOperand(1).getReg();
3501 Register CmpVal = MI.getOperand(2).getReg();
3502 Register NewVal = MI.getOperand(3).getReg();
3503
3505 "this should not have been custom lowered");
3506
3507 LLT ValTy = MRI.getType(CmpVal);
3508 LLT VecTy = LLT::fixed_vector(2, ValTy);
3509
// Hardware operand order: element 0 = new value, element 1 = compare value.
3510 Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
3511
3512 B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
3513 .addDef(DstReg)
3514 .addUse(PtrReg)
3515 .addUse(PackedVal)
3516 .setMemRefs(MI.memoperands());
3517
3518 MI.eraseFromParent();
3519 return true;
3520}
3521
3522/// Return true if it's known that \p Src can never be an f32 denormal value.
/// Walks to the defining instruction of \p Src and whitelists producers whose
/// f32 results are never denormal (several AMDGPU intrinsics, fsqrt, the
/// mantissa result of ffrexp, and values extended from f16).
3524 Register Src) {
3525 const MachineInstr *DefMI = MRI.getVRegDef(Src);
3526 switch (DefMI->getOpcode()) {
3527 case TargetOpcode::G_INTRINSIC: {
// NOTE(review): the inner switch header (original line 3528, presumably
// switching on the intrinsic ID) is elided in this view.
3529 case Intrinsic::amdgcn_frexp_mant:
3530 case Intrinsic::amdgcn_log:
3531 case Intrinsic::amdgcn_log_clamp:
3532 case Intrinsic::amdgcn_exp2:
3533 case Intrinsic::amdgcn_sqrt:
3534 return true;
3535 default:
3536 break;
3537 }
3538
3539 break;
3540 }
3541 case TargetOpcode::G_FSQRT:
3542 return true;
3543 case TargetOpcode::G_FFREXP: {
// Only the first result (the mantissa) is known never-denormal.
3544 if (DefMI->getOperand(0).getReg() == Src)
3545 return true;
3546 break;
3547 }
3548 case TargetOpcode::G_FPEXT: {
// f16 values extended to f32 always land in the f32 normal range.
3549 return MRI.getType(DefMI->getOperand(1).getReg()) == LLT::scalar(16);
3550 }
3551 default:
3552 return false;
3553 }
3554
3555 return false;
3556}
3557
// Whether an approximate lowering of a math function is permitted for this
// instruction, i.e. whether its fast-math flags carry 'afn'. The
// MachineFunction parameter is currently unused.
3558static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags) {
3559  return (Flags & MachineInstr::FmAfn) != 0;
3560}
3561
// Whether an f32 log/exp lowering must insert the denormal scaling sequence:
// required unless the value is proven never-denormal. NOTE(review): the
// first signature line and the trailing conjuncts (original lines 3565-3566,
// presumably approx-func / denormal-mode checks) are outside this view.
3563 unsigned Flags) {
3564 return !valueIsKnownNeverF32Denorm(MF.getRegInfo(), Src) &&
3567}
3568
3569std::pair<Register, Register>
// Scale a possibly-denormal f32 log input into the normal range. Returns the
// scaled input and the i1 "was scaled" condition, or an empty pair when no
// denormal handling is required. Callers must subtract 32.0 (times the log
// base factor) from the result when the condition is set.
3571 unsigned Flags) const {
3572 if (!needsDenormHandlingF32(B.getMF(), Src, Flags))
3573 return {};
3574
3575 const LLT F32 = LLT::scalar(32);
3576 auto SmallestNormal = B.buildFConstant(
// NOTE(review): the constant argument line (original 3577, presumably
// APFloat::getSmallestNormalized(IEEEsingle)) is elided in this view.
3578 auto IsLtSmallestNormal =
3579 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src, SmallestNormal);
3580
// Multiply denormal inputs by 2^32 to bring them into the normal range;
// leave everything else untouched (factor 1.0).
3581 auto Scale32 = B.buildFConstant(F32, 0x1.0p+32);
3582 auto One = B.buildFConstant(F32, 1.0);
3583 auto ScaleFactor =
3584 B.buildSelect(F32, IsLtSmallestNormal, Scale32, One, Flags);
3585 auto ScaledInput = B.buildFMul(F32, Src, ScaleFactor, Flags);
3586
3587 return {ScaledInput.getReg(0), IsLtSmallestNormal.getReg(0)};
3588}
3589
// Lower G_FLOG2 to amdgcn_log, with f16 promoted through f32 and f32
// denormal inputs pre-scaled (and the result re-biased by 32.0).
// NOTE(review): the first signature line is outside this view.
3591 MachineIRBuilder &B) const {
3592 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
3593 // If we have to handle denormals, scale up the input and adjust the result.
3594
3595 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
3596 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
3597
3598 Register Dst = MI.getOperand(0).getReg();
3599 Register Src = MI.getOperand(1).getReg();
3600 LLT Ty = B.getMRI()->getType(Dst);
3601 unsigned Flags = MI.getFlags();
3602
3603 if (Ty == LLT::scalar(16)) {
3604 const LLT F32 = LLT::scalar(32);
3605 // Nothing in half is a denormal when promoted to f32.
3606 auto Ext = B.buildFPExt(F32, Src, Flags);
3607 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {F32})
3608 .addUse(Ext.getReg(0))
3609 .setMIFlags(Flags)
3610 B.buildFPTrunc(Dst, Log2, Flags);
3611 MI.eraseFromParent();
3612 return true;
3613 }
3614
3615 assert(Ty == LLT::scalar(32));
3616
3617 auto [ScaledInput, IsLtSmallestNormal] = getScaledLogInput(B, Src, Flags);
3618 if (!ScaledInput) {
// No denormal handling needed — emit the raw hardware log.
3619 B.buildIntrinsic(Intrinsic::amdgcn_log, {MI.getOperand(0)})
3620 .addUse(Src)
3621 .setMIFlags(Flags);
3622 MI.eraseFromParent();
3623 return true;
3624 }
3625
3626 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3627 .addUse(ScaledInput)
3628 .setMIFlags(Flags);
3629
// Undo the 2^32 input scaling: log2(x * 2^32) - 32 == log2(x).
3630 auto ThirtyTwo = B.buildFConstant(Ty, 32.0);
3631 auto Zero = B.buildFConstant(Ty, 0.0);
3632 auto ResultOffset =
3633 B.buildSelect(Ty, IsLtSmallestNormal, ThirtyTwo, Zero, Flags);
3634 B.buildFSub(Dst, Log2, ResultOffset, Flags);
3635
3636 MI.eraseFromParent();
3637 return true;
3638}
3639
// Build an unfused multiply-add: X * Y + Z (separate fmul + fadd, not FMA).
// NOTE(review): the signature's first line (original 3640) is outside this
// view.
3641 Register Z, unsigned Flags) {
3642 auto FMul = B.buildFMul(Ty, X, Y, Flags);
3643 return B.buildFAdd(Ty, FMul, Z, Flags).getReg(0);
3644}
3645
// Shared lowering for G_FLOG / G_FLOG10: computes log2 via amdgcn_log, then
// rescales by ln(2) or ln(2)/ln(10) using a split-constant correction
// (FMA-based when fast f32 FMA is available, mask-and-madd otherwise), with
// denormal pre-scaling and a finite-only fast path. NOTE(review): the first
// signature line is outside this view.
3647 MachineIRBuilder &B) const {
3648 const bool IsLog10 = MI.getOpcode() == TargetOpcode::G_FLOG10;
3649 assert(IsLog10 || MI.getOpcode() == TargetOpcode::G_FLOG);
3650
3651 MachineRegisterInfo &MRI = *B.getMRI();
3652 Register Dst = MI.getOperand(0).getReg();
3653 Register X = MI.getOperand(1).getReg();
3654 unsigned Flags = MI.getFlags();
3655 const LLT Ty = MRI.getType(X);
3656
3657 const LLT F32 = LLT::scalar(32);
3658 const LLT F16 = LLT::scalar(16);
3659
// Fast/approximate path: f16 always, or any type with the 'afn' flag.
3660 if (Ty == F16 || MI.getFlag(MachineInstr::FmAfn)) {
3661 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
3662 // depending on !fpmath metadata.
3663 bool PromoteToF32 =
3664 Ty == F16 && (!MI.getFlag(MachineInstr::FmAfn) || !ST.has16BitInsts());
3665 if (PromoteToF32) {
// NOTE(review): the LogVal register creation (original line 3666) is
// elided in this view.
3667 auto PromoteSrc = B.buildFPExt(F32, X);
3668 legalizeFlogUnsafe(B, LogVal, PromoteSrc.getReg(0), IsLog10, Flags);
3669 B.buildFPTrunc(Dst, LogVal);
3670 } else {
3671 legalizeFlogUnsafe(B, Dst, X, IsLog10, Flags);
3672 }
3673
3674 MI.eraseFromParent();
3675 return true;
3676 }
3677
3678 auto [ScaledInput, IsScaled] = getScaledLogInput(B, X, Flags);
3679 if (ScaledInput)
3680 X = ScaledInput;
3681
3682 auto Y =
3683 B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty}).addUse(X).setMIFlags(Flags);
3684
3685 Register R;
3686 if (ST.hasFastFMAF32()) {
3687 // c+cc are ln(2)/ln(10) to more than 49 bits
3688 const float c_log10 = 0x1.344134p-2f;
3689 const float cc_log10 = 0x1.09f79ep-26f;
3690
3691 // c + cc is ln(2) to more than 49 bits
3692 const float c_log = 0x1.62e42ep-1f;
3693 const float cc_log = 0x1.efa39ep-25f;
3694
3695 auto C = B.buildFConstant(Ty, IsLog10 ? c_log10 : c_log);
3696 auto CC = B.buildFConstant(Ty, IsLog10 ? cc_log10 : cc_log);
3697 // This adds correction terms for which contraction may lead to an increase
3698 // in the error of the approximation, so disable it.
3699 auto NewFlags = Flags & ~(MachineInstr::FmContract);
// Compute Y*C, then recover the rounding error of that product with FMA
// and fold in the low-order constant CC.
3700 R = B.buildFMul(Ty, Y, C, NewFlags).getReg(0);
3701 auto NegR = B.buildFNeg(Ty, R, NewFlags);
3702 auto FMA0 = B.buildFMA(Ty, Y, C, NegR, NewFlags);
3703 auto FMA1 = B.buildFMA(Ty, Y, CC, FMA0, NewFlags);
3704 R = B.buildFAdd(Ty, R, FMA1, NewFlags).getReg(0);
3705 } else {
3706 // ch+ct is ln(2)/ln(10) to more than 36 bits
3707 const float ch_log10 = 0x1.344000p-2f;
3708 const float ct_log10 = 0x1.3509f6p-18f;
3709
3710 // ch + ct is ln(2) to more than 36 bits
3711 const float ch_log = 0x1.62e000p-1f;
3712 const float ct_log = 0x1.0bfbe8p-15f;
3713
3714 auto CH = B.buildFConstant(Ty, IsLog10 ? ch_log10 : ch_log);
3715 auto CT = B.buildFConstant(Ty, IsLog10 ? ct_log10 : ct_log);
3716
// Split Y into a high part (mantissa bits masked) and a low remainder so
// the products with the split constants stay exact.
3717 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
3718 auto YH = B.buildAnd(Ty, Y, MaskConst);
3719 auto YT = B.buildFSub(Ty, Y, YH, Flags);
3720 // This adds correction terms for which contraction may lead to an increase
3721 // in the error of the approximation, so disable it.
3722 auto NewFlags = Flags & ~(MachineInstr::FmContract);
3723 auto YTCT = B.buildFMul(Ty, YT, CT, NewFlags);
3724
3725 Register Mad0 =
3726 getMad(B, Ty, YH.getReg(0), CT.getReg(0), YTCT.getReg(0), NewFlags);
3727 Register Mad1 = getMad(B, Ty, YT.getReg(0), CH.getReg(0), Mad0, NewFlags);
3728 R = getMad(B, Ty, YH.getReg(0), CH.getReg(0), Mad1, NewFlags);
3729 }
3730
3731 const bool IsFiniteOnly =
// NOTE(review): the condition operands (original lines 3732-3733,
// presumably no-infs/no-nans flag checks) are elided in this view.
3734 if (!IsFiniteOnly) {
3735 // Expand isfinite(x) => fabs(x) < inf
// Pass non-finite log results (inf/nan) through unchanged.
3736 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
3737 auto Fabs = B.buildFAbs(Ty, Y);
3738 auto IsFinite =
3739 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
3740 R = B.buildSelect(Ty, IsFinite, R, Y, Flags).getReg(0);
3741 }
3742
3743 if (ScaledInput) {
// Undo the 2^32 input scaling: subtract 32*ln(2) or 32*ln(2)/ln(10).
3744 auto Zero = B.buildFConstant(Ty, 0.0);
3745 auto ShiftK =
3746 B.buildFConstant(Ty, IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f);
3747 auto Shift = B.buildSelect(Ty, IsScaled, ShiftK, Zero, Flags);
3748 B.buildFSub(Dst, R, Shift, Flags);
3749 } else {
3750 B.buildCopy(Dst, R);
3751 }
3752
3753 MI.eraseFromParent();
3754 return true;
3755}
3756
// Approximate lowering of log/log10: log(x) = log2(x) * (1/log2(base)),
// with f32 denormal inputs pre-scaled and the offset folded into the final
// multiply-add. NOTE(review): the first signature line and the
// Log2BaseInverted initializer (original line 3761) are outside this view.
3758 Register Src, bool IsLog10,
3759 unsigned Flags) const {
3760 const double Log2BaseInverted =
3762
3763 LLT Ty = B.getMRI()->getType(Dst);
3764
3765 if (Ty == LLT::scalar(32)) {
3766 auto [ScaledInput, IsScaled] = getScaledLogInput(B, Src, Flags);
3767 if (ScaledInput) {
3768 auto LogSrc = B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3769 .addUse(Src)
3770 .setMIFlags(Flags);
// Fold the -32*log2inv denormal correction into the rescale step.
3771 auto ScaledResultOffset = B.buildFConstant(Ty, -32.0 * Log2BaseInverted);
3772 auto Zero = B.buildFConstant(Ty, 0.0);
3773 auto ResultOffset =
3774 B.buildSelect(Ty, IsScaled, ScaledResultOffset, Zero, Flags);
3775 auto Log2Inv = B.buildFConstant(Ty, Log2BaseInverted);
3776
3777 if (ST.hasFastFMAF32())
3778 B.buildFMA(Dst, LogSrc, Log2Inv, ResultOffset, Flags);
3779 else {
3780 auto Mul = B.buildFMul(Ty, LogSrc, Log2Inv, Flags);
3781 B.buildFAdd(Dst, Mul, ResultOffset, Flags);
3782 }
3783
3784 return true;
3785 }
3786 }
3787
// No scaling needed: take log2 (generic for f16, hardware intrinsic
// otherwise) and multiply by the inverted base.
3788 auto Log2Operand = Ty == LLT::scalar(16)
3789 ? B.buildFLog2(Ty, Src, Flags)
3790 : B.buildIntrinsic(Intrinsic::amdgcn_log, {Ty})
3791 .addUse(Src)
3792 .setMIFlags(Flags);
3793 auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
3794 B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
3795 return true;
3796}
3797
// Lower G_FEXP2 to amdgcn_exp2: f64 defers to the polynomial expansion, f16
// promotes through f32, and f32 adds input-offset/result-scale fixups when
// denormal results must be produced correctly. NOTE(review): the first
// signature line is outside this view.
3799 MachineIRBuilder &B) const {
3800 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3801 // If we have to handle denormals, scale up the input and adjust the result.
3802
3803 Register Dst = MI.getOperand(0).getReg();
3804 Register Src = MI.getOperand(1).getReg();
3805 unsigned Flags = MI.getFlags();
3806 LLT Ty = B.getMRI()->getType(Dst);
3807 const LLT F16 = LLT::scalar(16);
3808 const LLT F32 = LLT::scalar(32);
3809 const LLT F64 = LLT::scalar(64);
3810
3811 if (Ty == F64)
3812 return legalizeFEXPF64(MI, B);
3813
3814 if (Ty == F16) {
3815 // Nothing in half is a denormal when promoted to f32.
3816 auto Ext = B.buildFPExt(F32, Src, Flags);
3817 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {F32})
3818 .addUse(Ext.getReg(0))
3819 .setMIFlags(Flags);
3820 B.buildFPTrunc(Dst, Log2, Flags);
3821 MI.eraseFromParent();
3822 return true;
3823 }
3824
3825 assert(Ty == F32);
3826
3827 if (!needsDenormHandlingF32(B.getMF(), Src, Flags)) {
3828 B.buildIntrinsic(Intrinsic::amdgcn_exp2, ArrayRef<Register>{Dst})
3829 .addUse(Src)
3830 .setMIFlags(Flags);
3831 MI.eraseFromParent();
3832 return true;
3833 }
3834
3835 // bool needs_scaling = x < -0x1.f80000p+6f;
3836 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3837
3838 // -nextafter(128.0, -1)
3839 auto RangeCheckConst = B.buildFConstant(Ty, -0x1.f80000p+6f);
3840 auto NeedsScaling = B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Src,
3841 RangeCheckConst, Flags);
3842
// Add 64 to inputs that would produce a denormal, run the hardware exp2,
// then multiply by 2^-64 to compensate.
3843 auto SixtyFour = B.buildFConstant(Ty, 0x1.0p+6f);
3844 auto Zero = B.buildFConstant(Ty, 0.0);
3845 auto AddOffset = B.buildSelect(F32, NeedsScaling, SixtyFour, Zero, Flags);
3846 auto AddInput = B.buildFAdd(F32, Src, AddOffset, Flags);
3847
3848 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3849 .addUse(AddInput.getReg(0))
3850 .setMIFlags(Flags);
3851
3852 auto TwoExpNeg64 = B.buildFConstant(Ty, 0x1.0p-64f);
3853 auto One = B.buildFConstant(Ty, 1.0);
3854 auto ResultScale = B.buildSelect(F32, NeedsScaling, TwoExpNeg64, One, Flags);
3855 B.buildFMul(Dst, Exp2, ResultScale, Flags);
3856 MI.eraseFromParent();
3857 return true;
3858}
3859
// Build a base-2 exponential of Src into Dst: the amdgcn_exp2 intrinsic for
// f32, the generic G_FEXP2 otherwise. NOTE(review): the signature's first
// line (original 3860) is outside this view.
3861 const SrcOp &Src, unsigned Flags) {
3862 LLT Ty = Dst.getLLTTy(*B.getMRI());
3863
3864 if (Ty == LLT::scalar(32)) {
3865 return B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Dst})
3866 .addUse(Src.getReg())
3867 .setMIFlags(Flags);
3868 }
3869 return B.buildFExp2(Dst, Src, Flags);
3870}
3871
// Approximate exp/exp10 via a single rescaled exp2:
// exp(x) = exp2(log2(e) * x), exp10(x) = exp2(log2(10) * x).
// NOTE(review): the first signature line is outside this view.
3873 Register Dst, Register X,
3874 unsigned Flags,
3875 bool IsExp10) const {
3876 LLT Ty = B.getMRI()->getType(X);
3877
3878 // exp(x) -> exp2(M_LOG2E_F * x);
3879 // exp10(x) -> exp2(log2(10) * x);
// 0x1.a934f0p+1f is log2(10) rounded to f32.
3880 auto Const = B.buildFConstant(Ty, IsExp10 ? 0x1.a934f0p+1f : numbers::log2e);
3881 auto Mul = B.buildFMul(Ty, X, Const, Flags);
3882 buildExp(B, Dst, Mul, Flags);
3883 return true;
3884}
3885
// Approximate exp lowering. Non-f32, or f32 without denormal concerns, uses
// the simple exp2 rescale; otherwise small inputs are offset by 64 before
// exp2 and the result multiplied back down so denormal results survive.
// NOTE(review): the first signature line is outside this view.
3887 Register X, unsigned Flags) const {
3888 LLT Ty = B.getMRI()->getType(Dst);
3889 LLT F32 = LLT::scalar(32);
3890
3891 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3892 return legalizeFExpUnsafeImpl(B, Dst, X, Flags, /*IsExp10=*/false);
3893 }
3894
// Threshold below which exp(x) would be denormal in f32.
3895 auto Threshold = B.buildFConstant(Ty, -0x1.5d58a0p+6f);
3896 auto NeedsScaling =
3897 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold, Flags);
3898 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+6f);
3899 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3900 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X, Flags);
3901
3902 auto Log2E = B.buildFConstant(Ty, numbers::log2e);
3903 auto ExpInput = B.buildFMul(Ty, AdjustedX, Log2E, Flags);
3904
3905 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
3906 .addUse(ExpInput.getReg(0))
3907 .setMIFlags(Flags);
3908
// 0x1.969d48p-93f presumably compensates the +64 offset (~exp(-64)) —
// confirm against the scalar lowering in SITargetLowering.
3909 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.969d48p-93f);
3910 auto AdjustedResult = B.buildFMul(Ty, Exp2, ResultScaleFactor, Flags);
3911 B.buildSelect(Dst, NeedsScaling, AdjustedResult, Exp2, Flags);
3912 return true;
3913}
3914
// Approximate exp10 lowering: log2(10) is split into a high part K0 and a
// low correction K1 so that exp10(x) = exp2(x*K0) * exp2(x*K1) stays
// accurate; when f32 denormal results matter, small inputs are offset by 32
// and the product rescaled. NOTE(review): the first signature line is
// outside this view.
3916 Register Dst, Register X,
3917 unsigned Flags) const {
3918 LLT Ty = B.getMRI()->getType(Dst);
3919 LLT F32 = LLT::scalar(32);
3920
3921 if (Ty != F32 || !needsDenormHandlingF32(B.getMF(), X, Flags)) {
3922 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3923 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3924 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3925
3926 auto Mul1 = B.buildFMul(Ty, X, K1, Flags);
3927 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3928 auto Mul0 = B.buildFMul(Ty, X, K0, Flags);
3929 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3930 B.buildFMul(Dst, Exp2_0, Exp2_1, Flags);
3931 return true;
3932 }
3933
3934 // bool s = x < -0x1.2f7030p+5f;
3935 // x += s ? 0x1.0p+5f : 0.0f;
3936 // exp10 = exp2(x * 0x1.a92000p+1f) *
3937 // exp2(x * 0x1.4f0978p-11f) *
3938 // (s ? 0x1.9f623ep-107f : 1.0f);
3939
3940 auto Threshold = B.buildFConstant(Ty, -0x1.2f7030p+5f);
3941 auto NeedsScaling =
3942 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, Threshold);
3943
3944 auto ScaleOffset = B.buildFConstant(Ty, 0x1.0p+5f);
3945 auto ScaledX = B.buildFAdd(Ty, X, ScaleOffset, Flags);
3946 auto AdjustedX = B.buildSelect(Ty, NeedsScaling, ScaledX, X);
3947
3948 auto K0 = B.buildFConstant(Ty, 0x1.a92000p+1f);
3949 auto K1 = B.buildFConstant(Ty, 0x1.4f0978p-11f);
3950
3951 auto Mul1 = B.buildFMul(Ty, AdjustedX, K1, Flags);
3952 auto Exp2_1 = buildExp(B, Ty, Mul1, Flags);
3953 auto Mul0 = B.buildFMul(Ty, AdjustedX, K0, Flags);
3954 auto Exp2_0 = buildExp(B, Ty, Mul0, Flags);
3955
3956 auto MulExps = B.buildFMul(Ty, Exp2_0, Exp2_1, Flags);
// 0x1.9f623ep-107f presumably compensates the +32 input offset
// (~10^-32) — confirm against the scalar lowering.
3957 auto ResultScaleFactor = B.buildFConstant(Ty, 0x1.9f623ep-107f);
3958 auto AdjustedResult = B.buildFMul(Ty, MulExps, ResultScaleFactor, Flags);
3959
3960 B.buildSelect(Dst, NeedsScaling, AdjustedResult, MulExps);
3961 return true;
3962}
3963
3964// This expansion gives a result slightly better than 1ulp.
// NOTE(review): the source line carrying this function's name was lost in
// extraction; this is the f64 lowering shared by G_FEXP, G_FEXP2 and
// G_FEXP10. Strategy: split the input as x = Dn*log_b + T where Dn is an
// integer-valued rint() of a scaled input, evaluate e^T with a Horner
// polynomial, then scale by 2^Dn via ldexp and clamp over/underflow.
 3966 MachineIRBuilder &B) const {
 3967
 3968 Register X = MI.getOperand(1).getReg();
 3969 LLT S64 = LLT::scalar(64);
 3970 LLT S32 = LLT::scalar(32);
 3971 LLT S1 = LLT::scalar(1);
 3972
 3973 // TODO: Check if reassoc is safe. There is an output change in exp2 and
 3974 // exp10, which slightly increases ulp.
 3975 unsigned Flags = MI.getFlags() & ~MachineInstr::FmReassoc;
 3976
 3977 Register Dn, F, T;
 3978
 3979 if (MI.getOpcode() == TargetOpcode::G_FEXP2) {
 3980 // Dn = rint(X)
 3981 Dn = B.buildFRint(S64, X, Flags).getReg(0);
 3982 // F = X - Dn
 3983 F = B.buildFSub(S64, X, Dn, Flags).getReg(0);
 3984 // T = F*C1 + F*C2
 // C1 + C2 form an extended-precision (hi/lo) split of ln(2), mapping the
 // fractional part onto the natural-exp polynomial domain.
 3985 auto C1 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
 3986 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
 3987 auto Mul2 = B.buildFMul(S64, F, C2, Flags).getReg(0);
 3988 T = B.buildFMA(S64, F, C1, Mul2, Flags).getReg(0);
 3989
 3990 } else if (MI.getOpcode() == TargetOpcode::G_FEXP10) {
 // Dn = rint(x * log2(10)); the residual F is recovered with two FMAs
 // using a hi/lo split (C3, C2) so no precision is lost in x - Dn/log2(10).
 3991 auto C1 = B.buildFConstant(S64, APFloat(0x1.a934f0979a371p+1));
 3992 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
 3993 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
 3994
 3995 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
 3996 auto C2 = B.buildFConstant(S64, APFloat(-0x1.9dc1da994fd21p-59));
 3997 auto C3 = B.buildFConstant(S64, APFloat(0x1.34413509f79ffp-2));
 3998 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
 3999 F = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
 4000
 // Rescale the residual into the shared polynomial's domain:
 // T = F * ln(10), again as a hi/lo (C4, C5) product.
 4001 auto C4 = B.buildFConstant(S64, APFloat(0x1.26bb1bbb55516p+1));
 4002 auto C5 = B.buildFConstant(S64, APFloat(-0x1.f48ad494ea3e9p-53));
 4003 auto MulF = B.buildFMul(S64, F, C5, Flags).getReg(0);
 4004 T = B.buildFMA(S64, F, C4, MulF, Flags).getReg(0);
 4005
 4006 } else { // G_FEXP
 // Dn = rint(x * log2(e)); T = x - Dn*ln(2), via hi/lo split of ln(2).
 4007 auto C1 = B.buildFConstant(S64, APFloat(0x1.71547652b82fep+0));
 4008 auto Mul = B.buildFMul(S64, X, C1, Flags).getReg(0);
 4009 Dn = B.buildFRint(S64, Mul, Flags).getReg(0);
 4010
 4011 auto NegDn = B.buildFNeg(S64, Dn, Flags).getReg(0);
 4012 auto C2 = B.buildFConstant(S64, APFloat(0x1.abc9e3b39803fp-56));
 4013 auto C3 = B.buildFConstant(S64, APFloat(0x1.62e42fefa39efp-1));
 4014 auto Inner = B.buildFMA(S64, NegDn, C3, X, Flags).getReg(0);
 4015 T = B.buildFMA(S64, NegDn, C2, Inner, Flags).getReg(0);
 4016 }
 4017
 4018 // Polynomial chain for P
 // Horner evaluation of a Taylor-like series for e^T; the coefficients read
 // top-to-bottom are approximately 1/12!, 1/11!, ..., 1/3!, 1/2!.
 4019 auto P = B.buildFConstant(S64, 0x1.ade156a5dcb37p-26);
 4020 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.28af3fca7ab0cp-22),
 4021 Flags);
 4022 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.71dee623fde64p-19),
 4023 Flags);
 4024 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01997c89e6b0p-16),
 4025 Flags);
 4026 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.a01a014761f6ep-13),
 4027 Flags);
 4028 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.6c16c1852b7b0p-10),
 4029 Flags);
 4030 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.1111111122322p-7), Flags);
 4031 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.55555555502a1p-5), Flags);
 4032 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.5555555555511p-3), Flags);
 4033 P = B.buildFMA(S64, T, P, B.buildFConstant(S64, 0x1.000000000000bp-1), Flags);
 4034
 // Finish the series with the 1/1! and constant terms: P = T*(T*P + 1) + 1.
 4035 auto One = B.buildFConstant(S64, 1.0);
 4036 P = B.buildFMA(S64, T, P, One, Flags);
 4037 P = B.buildFMA(S64, T, P, One, Flags);
 4038
 4039 // Z = FLDEXP(P, (int)Dn)
 4040 auto DnInt = B.buildFPTOSI(S32, Dn);
 4041 auto Z = B.buildFLdexp(S64, P, DnInt, Flags);
 4042
 4043 if (!(Flags & MachineInstr::FmNoInfs)) {
 4044 // Overflow guard: if X <= 1024.0 then Z else +inf
 // NOTE(review): 1024/-1075 are the exp2 over/underflow bounds; for exp and
 // exp10 they look like safe outer bounds (ldexp saturates earlier) — confirm
 // against the reference this expansion was derived from. ULE/UGE keep NaN
 // inputs on the Z path.
 4045 auto CondHi = B.buildFCmp(CmpInst::FCMP_ULE, S1, X,
 4046 B.buildFConstant(S64, APFloat(1024.0)));
 4047 auto PInf = B.buildFConstant(S64, APFloat::getInf(APFloat::IEEEdouble()));
 4048 Z = B.buildSelect(S64, CondHi, Z, PInf, Flags);
 4049 }
 4050
 4051 // Underflow guard: if X >= -1075.0 then Z else 0.0
 4052 auto CondLo = B.buildFCmp(CmpInst::FCMP_UGE, S1, X,
 4053 B.buildFConstant(S64, APFloat(-1075.0)));
 4054 auto Zero = B.buildFConstant(S64, APFloat(0.0));
 4055 B.buildSelect(MI.getOperand(0).getReg(), CondLo, Z, Zero, Flags);
 4056
 4057 MI.eraseFromParent();
 4058 return true;
 4059}
4060
// NOTE(review): the signature line naming this function was lost in
// extraction; this is the f16/f32 (and dispatch-to-f64) lowering for
// G_FEXP / G_FEXP10.
4062 MachineIRBuilder &B) const {
 4063 Register Dst = MI.getOperand(0).getReg();
 4064 Register X = MI.getOperand(1).getReg();
 4065 const unsigned Flags = MI.getFlags();
 4066 MachineFunction &MF = B.getMF();
 4067 MachineRegisterInfo &MRI = *B.getMRI();
 4068 LLT Ty = MRI.getType(Dst);
 4069
 4070 const LLT F64 = LLT::scalar(64);
 4071
 // f64 has its own polynomial expansion.
 4072 if (Ty == F64)
 4073 return legalizeFEXPF64(MI, B);
 4074
 4075 const LLT F16 = LLT::scalar(16);
 4076 const LLT F32 = LLT::scalar(32);
 4077 const bool IsExp10 = MI.getOpcode() == TargetOpcode::G_FEXP10;
 4078
 4079 if (Ty == F16) {
 4080 // v_exp_f16 (fmul x, log2e)
 4081 if (allowApproxFunc(MF, Flags)) {
 4082 // TODO: Does this really require fast?
 4083 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
 4084 : legalizeFExpUnsafe(B, Dst, X, Flags);
 4085 MI.eraseFromParent();
 4086 return true;
 4087 }
 4088
 4089 // Nothing in half is a denormal when promoted to f32.
 4090 //
 4091 // exp(f16 x) ->
 4092 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
 4093 //
 4094 // exp10(f16 x) ->
 4095 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
 4096 auto Ext = B.buildFPExt(F32, X, Flags);
 // NOTE(review): extraction dropped the declaration of 'Lowered' here
 // (an F32 generic virtual register created from MRI) — restore from
 // upstream before building.
 4098 legalizeFExpUnsafeImpl(B, Lowered, Ext.getReg(0), Flags, IsExp10);
 4099 B.buildFPTrunc(Dst, Lowered, Flags);
 4100 MI.eraseFromParent();
 4101 return true;
 4102 }
 4103
 4104 assert(Ty == F32);
 4105
 4106 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
 4107 // library behavior. Also, is known-not-daz source sufficient?
 4108 if (allowApproxFunc(MF, Flags)) {
 4109 IsExp10 ? legalizeFExp10Unsafe(B, Dst, X, Flags)
 4110 : legalizeFExpUnsafe(B, Dst, X, Flags);
 4111 MI.eraseFromParent();
 4112 return true;
 4113 }
 4114
 4115 // Algorithm:
 4116 //
 4117 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
 4118 //
 4119 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
 4120 // n = 64*m + j, 0 <= j < 64
 4121 //
 4122 // e^x = 2^((64*m + j + f)/64)
 4123 // = (2^m) * (2^(j/64)) * 2^(f/64)
 4124 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
 4125 //
 4126 // f = x*(64/ln(2)) - n
 4127 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
 4128 //
 4129 // e^x = (2^m) * (2^(j/64)) * e^r
 4130 //
 4131 // (2^(j/64)) is precomputed
 4132 //
 4133 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
 4134 // e^r = 1 + q
 4135 //
 4136 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
 4137 //
 4138 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
 4139 const unsigned FlagsNoContract = Flags & ~MachineInstr::FmContract;
 // PH/PL are the high/low parts of an extended-precision x*log2(b).
 4140 Register PH, PL;
 4141
 4142 if (ST.hasFastFMAF32()) {
 // With fast FMA, split the constant log2(b) into c (+ cc correction) and
 // recover the multiply's rounding error via FMA(x, c, -PH).
 4143 const float c_exp = numbers::log2ef;
 4144 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
 4145 const float c_exp10 = 0x1.a934f0p+1f;
 4146 const float cc_exp10 = 0x1.2f346ep-24f;
 4147
 4148 auto C = B.buildFConstant(Ty, IsExp10 ? c_exp10 : c_exp);
 4149 PH = B.buildFMul(Ty, X, C, Flags).getReg(0);
 4150 auto NegPH = B.buildFNeg(Ty, PH, Flags);
 4151 auto FMA0 = B.buildFMA(Ty, X, C, NegPH, Flags);
 4152
 4153 auto CC = B.buildFConstant(Ty, IsExp10 ? cc_exp10 : cc_exp);
 4154 PL = B.buildFMA(Ty, X, CC, FMA0, Flags).getReg(0);
 4155 } else {
 // Without fast FMA, split x itself (mask off 12 mantissa bits) and the
 // constant, and combine the four partial products with mads.
 4156 const float ch_exp = 0x1.714000p+0f;
 4157 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
 4158
 4159 const float ch_exp10 = 0x1.a92000p+1f;
 4160 const float cl_exp10 = 0x1.4f0978p-11f;
 4161
 4162 auto MaskConst = B.buildConstant(Ty, 0xfffff000);
 4163 auto XH = B.buildAnd(Ty, X, MaskConst);
 4164 auto XL = B.buildFSub(Ty, X, XH, Flags);
 4165
 4166 auto CH = B.buildFConstant(Ty, IsExp10 ? ch_exp10 : ch_exp);
 4167 PH = B.buildFMul(Ty, XH, CH, Flags).getReg(0);
 4168
 4169 auto CL = B.buildFConstant(Ty, IsExp10 ? cl_exp10 : cl_exp);
 4170 auto XLCL = B.buildFMul(Ty, XL, CL, Flags);
 4171
 4172 Register Mad0 =
 4173 getMad(B, Ty, XL.getReg(0), CH.getReg(0), XLCL.getReg(0), Flags);
 4174 PL = getMad(B, Ty, XH.getReg(0), CL.getReg(0), Mad0, Flags);
 4175 }
 4176
 // E = the integer part (round-to-even) of the scaled exponent.
 4177 auto E = B.buildIntrinsicRoundeven(Ty, PH, Flags);
 4178
 4179 // It is unsafe to contract this fsub into the PH multiply.
 4180 auto PHSubE = B.buildFSub(Ty, PH, E, FlagsNoContract);
 4181 auto A = B.buildFAdd(Ty, PHSubE, PL, Flags);
 4182 auto IntE = B.buildFPTOSI(LLT::scalar(32), E);
 4183
 // exp2 of the fractional remainder, then scale by 2^E via ldexp.
 4184 auto Exp2 = B.buildIntrinsic(Intrinsic::amdgcn_exp2, {Ty})
 4185 .addUse(A.getReg(0))
 4186 .setMIFlags(Flags);
 4187 auto R = B.buildFLdexp(Ty, Exp2, IntE, Flags);
 4188
 // Clamp results below the smallest representable output to exactly 0.
 4189 auto UnderflowCheckConst =
 4190 B.buildFConstant(Ty, IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f);
 4191 auto Zero = B.buildFConstant(Ty, 0.0);
 4192 auto Underflow =
 4193 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), X, UnderflowCheckConst);
 4194
 4195 R = B.buildSelect(Ty, Underflow, Zero, R);
 4196
 // Unless infs are assumed absent, clamp overflowing inputs to +inf.
 4197 if (!(Flags & MachineInstr::FmNoInfs)) {
 4198 auto OverflowCheckConst =
 4199 B.buildFConstant(Ty, IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f);
 4200
 4201 auto Overflow =
 4202 B.buildFCmp(CmpInst::FCMP_OGT, LLT::scalar(1), X, OverflowCheckConst);
 4203 auto Inf = B.buildFConstant(Ty, APFloat::getInf(APFloat::IEEEsingle()));
 4204 R = B.buildSelect(Ty, Overflow, Inf, R, Flags);
 4205 }
 4206
 4207 B.buildCopy(Dst, R);
 4208 MI.eraseFromParent();
 4209 return true;
 4210}
4211
4213 MachineIRBuilder &B) const {
4214 Register Dst = MI.getOperand(0).getReg();
4215 Register Src0 = MI.getOperand(1).getReg();
4216 Register Src1 = MI.getOperand(2).getReg();
4217 unsigned Flags = MI.getFlags();
4218 LLT Ty = B.getMRI()->getType(Dst);
4219 const LLT F16 = LLT::float16();
4220 const LLT F32 = LLT::float32();
4221
4222 if (Ty == F32) {
4223 auto Log = B.buildFLog2(F32, Src0, Flags);
4224 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4225 .addUse(Log.getReg(0))
4226 .addUse(Src1)
4227 .setMIFlags(Flags);
4228 B.buildFExp2(Dst, Mul, Flags);
4229 } else if (Ty == F16) {
4230 // There's no f16 fmul_legacy, so we need to convert for it.
4231 auto Log = B.buildFLog2(F16, Src0, Flags);
4232 auto Ext0 = B.buildFPExt(F32, Log, Flags);
4233 auto Ext1 = B.buildFPExt(F32, Src1, Flags);
4234 auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {F32})
4235 .addUse(Ext0.getReg(0))
4236 .addUse(Ext1.getReg(0))
4237 .setMIFlags(Flags);
4238 B.buildFExp2(Dst, B.buildFPTrunc(F16, Mul), Flags);
4239 } else
4240 return false;
4241
4242 MI.eraseFromParent();
4243 return true;
4244}
4245
4246// Find a source register, ignoring any possible source modifiers.
4248 Register ModSrc = OrigSrc;
4249 if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
4250 ModSrc = SrcFNeg->getOperand(1).getReg();
4251 if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4252 ModSrc = SrcFAbs->getOperand(1).getReg();
4253 } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
4254 ModSrc = SrcFAbs->getOperand(1).getReg();
4255 return ModSrc;
4256}
4257
4260 MachineIRBuilder &B) const {
4261
4262 const LLT S1 = LLT::scalar(1);
4263 const LLT F64 = LLT::float64();
4264 Register Dst = MI.getOperand(0).getReg();
4265 Register OrigSrc = MI.getOperand(1).getReg();
4266 unsigned Flags = MI.getFlags();
4267 assert(ST.hasFractBug() && MRI.getType(Dst) == F64 &&
4268 "this should not have been custom lowered");
4269
4270 // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
4271 // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
4272 // efficient way to implement it is using V_FRACT_F64. The workaround for the
4273 // V_FRACT bug is:
4274 // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
4275 //
4276 // Convert floor(x) to (x - fract(x))
4277
4278 auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {F64})
4279 .addUse(OrigSrc)
4280 .setMIFlags(Flags);
4281
4282 // Give source modifier matching some assistance before obscuring a foldable
4283 // pattern.
4284
4285 // TODO: We can avoid the neg on the fract? The input sign to fract
4286 // shouldn't matter?
4287 Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
4288
4289 auto Const =
4290 B.buildFConstant(F64, llvm::bit_cast<double>(0x3fefffffffffffff));
4291
4293
4294 // We don't need to concern ourselves with the snan handling difference, so
4295 // use the one which will directly select.
4296 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4297 if (MFI->getMode().IEEE)
4298 B.buildFMinNumIEEE(Min, Fract, Const, Flags);
4299 else
4300 B.buildFMinNum(Min, Fract, Const, Flags);
4301
4302 Register CorrectedFract = Min;
4303 if (!MI.getFlag(MachineInstr::FmNoNans)) {
4304 auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
4305 CorrectedFract = B.buildSelect(F64, IsNan, ModSrc, Min, Flags).getReg(0);
4306 }
4307
4308 auto NegFract = B.buildFNeg(F64, CorrectedFract, Flags);
4309 B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
4310
4311 MI.eraseFromParent();
4312 return true;
4313}
4314
4315// Turn an illegal packed v2s16 build vector into bit operations.
4316// TODO: This should probably be a bitcast action in LegalizerHelper.
4319 Register Dst = MI.getOperand(0).getReg();
4320 const LLT S32 = LLT::scalar(32);
4321 const LLT S16 = LLT::scalar(16);
4322 assert(MRI.getType(Dst) == LLT::fixed_vector(2, 16));
4323
4324 Register Src0 = MI.getOperand(1).getReg();
4325 Register Src1 = MI.getOperand(2).getReg();
4326
4327 if (MI.getOpcode() == AMDGPU::G_BUILD_VECTOR_TRUNC) {
4328 assert(MRI.getType(Src0) == S32);
4329 Src0 = B.buildTrunc(S16, MI.getOperand(1).getReg()).getReg(0);
4330 Src1 = B.buildTrunc(S16, MI.getOperand(2).getReg()).getReg(0);
4331 }
4332
4333 auto Merge = B.buildMergeLikeInstr(S32, {Src0, Src1});
4334 B.buildBitcast(Dst, Merge);
4335
4336 MI.eraseFromParent();
4337 return true;
4338}
4339
4340// Build a big integer multiply or multiply-add using MAD_64_32 instructions.
4341//
4342// Source and accumulation registers must all be 32-bits.
4343//
4344// TODO: When the multiply is uniform, we should produce a code sequence
4345// that is better suited to instruction selection on the SALU. Instead of
4346// the outer loop going over parts of the result, the outer loop should go
4347// over parts of one of the factors. This should result in instruction
4348// selection that makes full use of S_ADDC_U32 instructions.
// NOTE(review): the lines carrying the function name and the first
// parameters (Helper, the MutableArrayRef<Register> Accum) were lost in
// extraction; see the uses of Helper/Accum below.
 4351 ArrayRef<Register> Src0,
 4352 ArrayRef<Register> Src1,
 4353 bool UsePartialMad64_32,
 4354 bool SeparateOddAlignedProducts) const {
 4355 // Use (possibly empty) vectors of S1 registers to represent the set of
 4356 // carries from one pair of positions to the next.
 4357 using Carry = SmallVector<Register, 2>;
 4358
 4359 MachineIRBuilder &B = Helper.MIRBuilder;
 4360 GISelValueTracking &VT = *Helper.getValueTracking();
 4361
 4362 const LLT S1 = LLT::scalar(1);
 4363 const LLT S32 = LLT::scalar(32);
 4364 const LLT S64 = LLT::scalar(64);
 4365
 // Lazily-materialized zero constants, so they are only emitted if needed.
 4366 Register Zero32;
 4367 Register Zero64;
 4368
 4369 auto getZero32 = [&]() -> Register {
 4370 if (!Zero32)
 4371 Zero32 = B.buildConstant(S32, 0).getReg(0);
 4372 return Zero32;
 4373 };
 4374 auto getZero64 = [&]() -> Register {
 4375 if (!Zero64)
 4376 Zero64 = B.buildConstant(S64, 0).getReg(0);
 4377 return Zero64;
 4378 };
 4379
 // Precompute which 32-bit source parts are provably zero so their partial
 // products can be skipped entirely.
 4380 SmallVector<bool, 2> Src0KnownZeros, Src1KnownZeros;
 4381 for (unsigned i = 0; i < Src0.size(); ++i) {
 4382 Src0KnownZeros.push_back(VT.getKnownBits(Src0[i]).isZero());
 4383 Src1KnownZeros.push_back(VT.getKnownBits(Src1[i]).isZero());
 4384 }
 4385
 4386 // Merge the given carries into the 32-bit LocalAccum, which is modified
 4387 // in-place.
 4388 //
 4389 // Returns the carry-out, which is a single S1 register or null.
 4390 auto mergeCarry =
 4391 [&](Register &LocalAccum, const Carry &CarryIn) -> Register {
 4392 if (CarryIn.empty())
 4393 return Register();
 4394
 4395 bool HaveCarryOut = true;
 4396 Register CarryAccum;
 4397 if (CarryIn.size() == 1) {
 4398 if (!LocalAccum) {
 // A single carry into an empty accumulator is just its zero-extension.
 4399 LocalAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
 4400 return Register();
 4401 }
 4402
 4403 CarryAccum = getZero32();
 4404 } else {
 // Fold all but the last carry into a running 32-bit sum; the adds
 // cannot themselves overflow since each term is 0 or 1.
 4405 CarryAccum = B.buildZExt(S32, CarryIn[0]).getReg(0);
 4406 for (unsigned i = 1; i + 1 < CarryIn.size(); ++i) {
 4407 CarryAccum =
 4408 B.buildUAdde(S32, S1, CarryAccum, getZero32(), CarryIn[i])
 4409 .getReg(0);
 4410 }
 4411
 4412 if (!LocalAccum) {
 4413 LocalAccum = getZero32();
 4414 HaveCarryOut = false;
 4415 }
 4416 }
 4417
 4418 auto Add =
 4419 B.buildUAdde(S32, S1, CarryAccum, LocalAccum, CarryIn.back());
 4420 LocalAccum = Add.getReg(0);
 4421 return HaveCarryOut ? Add.getReg(1) : Register();
 4422 };
 4423
 4424 // Build a multiply-add chain to compute
 4425 //
 4426 // LocalAccum + (partial products at DstIndex)
 4427 // + (opportunistic subset of CarryIn)
 4428 //
 4429 // LocalAccum is an array of one or two 32-bit registers that are updated
 4430 // in-place. The incoming registers may be null.
 4431 //
 4432 // In some edge cases, carry-ins can be consumed "for free". In that case,
 4433 // the consumed carry bits are removed from CarryIn in-place.
 4434 auto buildMadChain =
 4435 [&](MutableArrayRef<Register> LocalAccum, unsigned DstIndex, Carry &CarryIn)
 4436 -> Carry {
 4437 assert((DstIndex + 1 < Accum.size() && LocalAccum.size() == 2) ||
 4438 (DstIndex + 1 >= Accum.size() && LocalAccum.size() == 1));
 4439
 4440 Carry CarryOut;
 4441 unsigned j0 = 0;
 4442
 4443 // Use plain 32-bit multiplication for the most significant part of the
 4444 // result by default.
 4445 if (LocalAccum.size() == 1 &&
 4446 (!UsePartialMad64_32 || !CarryIn.empty())) {
 4447 do {
 4448 // Skip multiplication if one of the operands is 0
 4449 unsigned j1 = DstIndex - j0;
 4450 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
 4451 ++j0;
 4452 continue;
 4453 }
 4454 auto Mul = B.buildMul(S32, Src0[j0], Src1[j1]);
 4455 if (!LocalAccum[0] || VT.getKnownBits(LocalAccum[0]).isZero()) {
 4456 LocalAccum[0] = Mul.getReg(0);
 4457 } else {
 4458 if (CarryIn.empty()) {
 4459 LocalAccum[0] = B.buildAdd(S32, LocalAccum[0], Mul).getReg(0);
 4460 } else {
 // Consume one pending carry "for free" by folding it into this add.
 4461 LocalAccum[0] =
 4462 B.buildUAdde(S32, S1, LocalAccum[0], Mul, CarryIn.back())
 4463 .getReg(0);
 4464 CarryIn.pop_back();
 4465 }
 4466 }
 4467 ++j0;
 4468 } while (j0 <= DstIndex && (!UsePartialMad64_32 || !CarryIn.empty()));
 4469 }
 4470
 4471 // Build full 64-bit multiplies.
 4472 if (j0 <= DstIndex) {
 // A "small" accumulator is one whose high 32 bits are known zero, so
 // the first MAD_64_32 into it cannot produce a carry-out.
 4473 bool HaveSmallAccum = false;
 4474 Register Tmp;
 4475
 4476 if (LocalAccum[0]) {
 4477 if (LocalAccum.size() == 1) {
 4478 Tmp = B.buildAnyExt(S64, LocalAccum[0]).getReg(0);
 4479 HaveSmallAccum = true;
 4480 } else if (LocalAccum[1]) {
 4481 Tmp = B.buildMergeLikeInstr(S64, LocalAccum).getReg(0);
 4482 HaveSmallAccum = false;
 4483 } else {
 4484 Tmp = B.buildZExt(S64, LocalAccum[0]).getReg(0);
 4485 HaveSmallAccum = true;
 4486 }
 4487 } else {
 4488 assert(LocalAccum.size() == 1 || !LocalAccum[1]);
 4489 Tmp = getZero64();
 4490 HaveSmallAccum = true;
 4491 }
 4492
 4493 do {
 4494 unsigned j1 = DstIndex - j0;
 4495 if (Src0KnownZeros[j0] || Src1KnownZeros[j1]) {
 4496 ++j0;
 4497 continue;
 4498 }
 4499 auto Mad = B.buildInstr(AMDGPU::G_AMDGPU_MAD_U64_U32, {S64, S1},
 4500 {Src0[j0], Src1[j1], Tmp});
 4501 Tmp = Mad.getReg(0);
 4502 if (!HaveSmallAccum)
 4503 CarryOut.push_back(Mad.getReg(1));
 4504 HaveSmallAccum = false;
 4505
 4506 ++j0;
 4507 } while (j0 <= DstIndex);
 4508
 4509 auto Unmerge = B.buildUnmerge(S32, Tmp);
 4510 LocalAccum[0] = Unmerge.getReg(0);
 4511 if (LocalAccum.size() > 1)
 4512 LocalAccum[1] = Unmerge.getReg(1);
 4513 }
 4514
 4515 return CarryOut;
 4516 };
 4517
 4518 // Outer multiply loop, iterating over destination parts from least
 4519 // significant to most significant parts.
 4520 //
 4521 // The columns of the following diagram correspond to the destination parts
 4522 // affected by one iteration of the outer loop (ignoring boundary
 4523 // conditions).
 4524 //
 4525 // Dest index relative to 2 * i: 1 0 -1
 4526 // ------
 4527 // Carries from previous iteration: e o
 4528 // Even-aligned partial product sum: E E .
 4529 // Odd-aligned partial product sum: O O
 4530 //
 4531 // 'o' is OddCarry, 'e' is EvenCarry.
 4532 // EE and OO are computed from partial products via buildMadChain and use
 4533 // accumulation where possible and appropriate.
 4534 //
 4535 Register SeparateOddCarry;
 4536 Carry EvenCarry;
 4537 Carry OddCarry;
 4538
 4539 for (unsigned i = 0; i <= Accum.size() / 2; ++i) {
 4540 Carry OddCarryIn = std::move(OddCarry);
 4541 Carry EvenCarryIn = std::move(EvenCarry);
 4542 OddCarry.clear();
 4543 EvenCarry.clear();
 4544
 4545 // Partial products at offset 2 * i.
 4546 if (2 * i < Accum.size()) {
 4547 auto LocalAccum = Accum.drop_front(2 * i).take_front(2);
 4548 EvenCarry = buildMadChain(LocalAccum, 2 * i, EvenCarryIn);
 4549 }
 4550
 4551 // Partial products at offset 2 * i - 1.
 4552 if (i > 0) {
 4553 if (!SeparateOddAlignedProducts) {
 4554 auto LocalAccum = Accum.drop_front(2 * i - 1).take_front(2);
 4555 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
 4556 } else {
 // Compute the odd-aligned products into scratch registers, then fold
 // them into Accum with an explicit add chain (keeps the MAD
 // accumulator even-aligned on targets that need it).
 4557 bool IsHighest = 2 * i >= Accum.size();
 4558 Register SeparateOddOut[2];
 4559 auto LocalAccum = MutableArrayRef(SeparateOddOut)
 4560 .take_front(IsHighest ? 1 : 2);
 4561 OddCarry = buildMadChain(LocalAccum, 2 * i - 1, OddCarryIn);
 4562
 // NOTE(review): extraction dropped the declaration of 'Lo' here (a
 // MachineInstrBuilder used by both branches below) — restore from
 // upstream.
 4564
 4565 if (i == 1) {
 4566 if (!IsHighest)
 4567 Lo = B.buildUAddo(S32, S1, Accum[2 * i - 1], SeparateOddOut[0]);
 4568 else
 4569 Lo = B.buildAdd(S32, Accum[2 * i - 1], SeparateOddOut[0]);
 4570 } else {
 4571 Lo = B.buildUAdde(S32, S1, Accum[2 * i - 1], SeparateOddOut[0],
 4572 SeparateOddCarry);
 4573 }
 4574 Accum[2 * i - 1] = Lo->getOperand(0).getReg();
 4575
 4576 if (!IsHighest) {
 4577 auto Hi = B.buildUAdde(S32, S1, Accum[2 * i], SeparateOddOut[1],
 4578 Lo->getOperand(1).getReg());
 4579 Accum[2 * i] = Hi.getReg(0);
 4580 SeparateOddCarry = Hi.getReg(1);
 4581 }
 4582 }
 4583 }
 4584
 4585 // Add in the carries from the previous iteration
 4586 if (i > 0) {
 4587 if (Register CarryOut = mergeCarry(Accum[2 * i - 1], OddCarryIn))
 4588 EvenCarryIn.push_back(CarryOut);
 4589
 4590 if (2 * i < Accum.size()) {
 4591 if (Register CarryOut = mergeCarry(Accum[2 * i], EvenCarryIn))
 4592 OddCarry.push_back(CarryOut);
 4593 }
 4594 }
 4595 }
 4596}
4597
4598// Custom narrowing of wide multiplies using wide multiply-add instructions.
4599//
4600// TODO: If the multiply is followed by an addition, we should attempt to
4601// integrate it to make better use of V_MAD_U64_U32's multiply-add capabilities.
4603 MachineInstr &MI) const {
4604 assert(ST.hasMad64_32());
4605 assert(MI.getOpcode() == TargetOpcode::G_MUL);
4606
4607 MachineIRBuilder &B = Helper.MIRBuilder;
4608 MachineRegisterInfo &MRI = *B.getMRI();
4609
4610 Register DstReg = MI.getOperand(0).getReg();
4611 Register Src0 = MI.getOperand(1).getReg();
4612 Register Src1 = MI.getOperand(2).getReg();
4613
4614 LLT Ty = MRI.getType(DstReg);
4615 assert(Ty.isScalar());
4616
4617 unsigned Size = Ty.getSizeInBits();
4618 if (ST.hasVectorMulU64() && Size == 64)
4619 return true;
4620
4621 unsigned NumParts = Size / 32;
4622 assert((Size % 32) == 0);
4623 assert(NumParts >= 2);
4624
4625 // Whether to use MAD_64_32 for partial products whose high half is
4626 // discarded. This avoids some ADD instructions but risks false dependency
4627 // stalls on some subtargets in some cases.
4628 const bool UsePartialMad64_32 = ST.getGeneration() < AMDGPUSubtarget::GFX10;
4629
4630 // Whether to compute odd-aligned partial products separately. This is
4631 // advisable on subtargets where the accumulator of MAD_64_32 must be placed
4632 // in an even-aligned VGPR.
4633 const bool SeparateOddAlignedProducts = ST.hasFullRate64Ops();
4634
4635 LLT S32 = LLT::scalar(32);
4636 SmallVector<Register, 2> Src0Parts, Src1Parts;
4637 for (unsigned i = 0; i < NumParts; ++i) {
4640 }
4641 B.buildUnmerge(Src0Parts, Src0);
4642 B.buildUnmerge(Src1Parts, Src1);
4643
4644 SmallVector<Register, 2> AccumRegs(NumParts);
4645 buildMultiply(Helper, AccumRegs, Src0Parts, Src1Parts, UsePartialMad64_32,
4646 SeparateOddAlignedProducts);
4647
4648 B.buildMergeLikeInstr(DstReg, AccumRegs);
4649 MI.eraseFromParent();
4650 return true;
4651}
4652
4653// Legalize ctlz/cttz to ffbh/ffbl instead of the default legalization to
4654// ctlz/cttz_zero_undef. This allows us to fix up the result for the zero input
4655// case with a single min instruction instead of a compare+select.
4658 MachineIRBuilder &B) const {
4659 Register Dst = MI.getOperand(0).getReg();
4660 Register Src = MI.getOperand(1).getReg();
4661 LLT DstTy = MRI.getType(Dst);
4662 LLT SrcTy = MRI.getType(Src);
4663
4664 unsigned NewOpc = MI.getOpcode() == AMDGPU::G_CTLZ
4665 ? AMDGPU::G_AMDGPU_FFBH_U32
4666 : AMDGPU::G_AMDGPU_FFBL_B32;
4667 auto Tmp = B.buildInstr(NewOpc, {DstTy}, {Src});
4668 B.buildUMin(Dst, Tmp, B.buildConstant(DstTy, SrcTy.getSizeInBits()));
4669
4670 MI.eraseFromParent();
4671 return true;
4672}
4673
4676 MachineIRBuilder &B) const {
4677 Register Dst = MI.getOperand(0).getReg();
4678 Register Src = MI.getOperand(1).getReg();
4679 LLT SrcTy = MRI.getType(Src);
4680 TypeSize NumBits = SrcTy.getSizeInBits();
4681
4682 assert(NumBits < 32u);
4683
4684 auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
4685 auto Extend = B.buildAnyExt(S32, {Src}).getReg(0u);
4686 auto Shift = B.buildShl(S32, Extend, ShiftAmt);
4687 auto Ctlz = B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {S32}, {Shift});
4688 B.buildTrunc(Dst, Ctlz);
4689 MI.eraseFromParent();
4690 return true;
4691}
4692
4695 MachineIRBuilder &B) const {
4696 Register Dst = MI.getOperand(0).getReg();
4697 Register Src = MI.getOperand(1).getReg();
4698 LLT SrcTy = MRI.getType(Src);
4699 const LLT S32 = LLT::scalar(32);
4700 assert(SrcTy == S32 && "legalizeCTLS only supports s32");
4701 unsigned BitWidth = SrcTy.getSizeInBits();
4702
4703 auto Sffbh = B.buildIntrinsic(Intrinsic::amdgcn_sffbh, {S32}).addUse(Src);
4704 auto Clamped = B.buildUMin(S32, Sffbh, B.buildConstant(S32, BitWidth));
4705 B.buildSub(Dst, Clamped, B.buildConstant(S32, 1));
4706 MI.eraseFromParent();
4707 return true;
4708}
4709
4710// Check that this is a G_XOR x, -1
4711static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
4712 if (MI.getOpcode() != TargetOpcode::G_XOR)
4713 return false;
4714 auto ConstVal = getIConstantVRegSExtVal(MI.getOperand(2).getReg(), MRI);
4715 return ConstVal == -1;
4716}
4717
4718// Return the use branch instruction, otherwise null if the usage is invalid.
4719static MachineInstr *
// NOTE(review): the continuation line carrying the function name and its
// first parameters (the intrinsic MI, MRI, and the MachineInstr *&Br
// out-parameter used below) was lost in extraction.
 4721 MachineBasicBlock *&UncondBrTarget, bool &Negated) {
 // The intrinsic's boolean result must have exactly one (non-debug) use for
 // the control-flow pattern to be rewritable.
 4722 Register CondDef = MI.getOperand(0).getReg();
 4723 if (!MRI.hasOneNonDBGUse(CondDef))
 4724 return nullptr;
 4725
 4726 MachineBasicBlock *Parent = MI.getParent();
 4727 MachineInstr *UseMI = &*MRI.use_instr_nodbg_begin(CondDef);
 4728
 // Look through a single negation (xor -1) of the condition; the branch
 // sense is reported to the caller via Negated.
 4729 if (isNot(MRI, *UseMI)) {
 4730 Register NegatedCond = UseMI->getOperand(0).getReg();
 4731 if (!MRI.hasOneNonDBGUse(NegatedCond))
 4732 return nullptr;
 4733
 4734 // We're deleting the def of this value, so we need to remove it.
 4735 eraseInstr(*UseMI, MRI);
 4736
 4737 UseMI = &*MRI.use_instr_nodbg_begin(NegatedCond);
 4738 Negated = true;
 4739 }
 4740
 // The (possibly negated) condition must feed a G_BRCOND in the same block.
 4741 if (UseMI->getParent() != Parent || UseMI->getOpcode() != AMDGPU::G_BRCOND)
 4742 return nullptr;
 4743
 4744 // Make sure the cond br is followed by a G_BR, or is the last instruction.
 4745 MachineBasicBlock::iterator Next = std::next(UseMI->getIterator());
 4746 if (Next == Parent->end()) {
 // Fallthrough case: the implicit unconditional target is the next block.
 4747 MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
 4748 if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
 4749 return nullptr;
 4750 UncondBrTarget = &*NextMBB;
 4751 } else {
 4752 if (Next->getOpcode() != AMDGPU::G_BR)
 4753 return nullptr;
 4754 Br = &*Next;
 4755 UncondBrTarget = Br->getOperand(0).getMBB();
 4756 }
 4757
 // Return the G_BRCOND that consumes the intrinsic's condition.
 4758 return UseMI;
 4759}
4760
4763 const ArgDescriptor *Arg,
4764 const TargetRegisterClass *ArgRC,
4765 LLT ArgTy) const {
4766 MCRegister SrcReg = Arg->getRegister();
4767 assert(SrcReg.isPhysical() && "Physical register expected");
4768 assert(DstReg.isVirtual() && "Virtual register expected");
4769
4770 Register LiveIn = getFunctionLiveInPhysReg(B.getMF(), B.getTII(), SrcReg,
4771 *ArgRC, B.getDebugLoc(), ArgTy);
4772 if (Arg->isMasked()) {
4773 // TODO: Should we try to emit this once in the entry block?
4774 const LLT S32 = LLT::scalar(32);
4775 const unsigned Mask = Arg->getMask();
4776 const unsigned Shift = llvm::countr_zero<unsigned>(Mask);
4777
4778 Register AndMaskSrc = LiveIn;
4779
4780 // TODO: Avoid clearing the high bits if we know workitem id y/z are always
4781 // 0.
4782 if (Shift != 0) {
4783 auto ShiftAmt = B.buildConstant(S32, Shift);
4784 AndMaskSrc = B.buildLShr(S32, LiveIn, ShiftAmt).getReg(0);
4785 }
4786
4787 B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
4788 } else {
4789 B.buildCopy(DstReg, LiveIn);
4790 }
4791}
4792
// NOTE(review): the lines carrying this function's name and its leading
// parameters (MI, B, and the WorkGroupIdPV / ClusterMaxIdPV preloaded-value
// selectors used below) were lost in extraction. It computes the global
// workgroup id for one dimension when clusters may be in use.
 4797 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
 4798 Register DstReg = MI.getOperand(0).getReg();
 // Without cluster support, the preloaded value already is the workgroup id.
 4799 if (!ST.hasClusters()) {
 4800 if (!loadInputValue(DstReg, B, WorkGroupIdPV))
 4801 return false;
 4802 MI.eraseFromParent();
 4803 return true;
 4804 }
 4805
 4806 // Clusters are supported. Return the global position in the grid. If clusters
 4807 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
 4808
 4809 // WorkGroupIdXYZ = ClusterId == 0 ?
 4810 // ClusterIdXYZ :
 4811 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
 4812 MachineRegisterInfo &MRI = *B.getMRI();
 4813 const LLT S32 = LLT::scalar(32);
 4814 Register ClusterIdXYZ = MRI.createGenericVirtualRegister(S32);
 4815 Register ClusterMaxIdXYZ = MRI.createGenericVirtualRegister(S32);
 4816 Register ClusterWorkGroupIdXYZ = MRI.createGenericVirtualRegister(S32);
 4817 if (!loadInputValue(ClusterIdXYZ, B, WorkGroupIdPV) ||
 4818 !loadInputValue(ClusterWorkGroupIdXYZ, B, ClusterWorkGroupIdPV) ||
 4819 !loadInputValue(ClusterMaxIdXYZ, B, ClusterMaxIdPV))
 4820 return false;
 4821
 // ClusterSize = ClusterMaxId + 1; GlobalId = ClusterId*Size + WGIdInCluster.
 4822 auto One = B.buildConstant(S32, 1);
 4823 auto ClusterSizeXYZ = B.buildAdd(S32, ClusterMaxIdXYZ, One);
 4824 auto GlobalIdXYZ = B.buildAdd(S32, ClusterWorkGroupIdXYZ,
 4825 B.buildMul(S32, ClusterIdXYZ, ClusterSizeXYZ));
 4826
 4827 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
 4828
 4829 switch (MFI->getClusterDims().getKind()) {
 // NOTE(review): the case labels for this switch were dropped by
 // extraction. From the bodies: the first arm (clusters known in use)
 // returns the computed global id; the second (clusters known absent)
 // returns the raw preloaded id; the third (unknown at compile time)
 // reads the cluster id from a hardware register and selects at runtime.
 4832 B.buildCopy(DstReg, GlobalIdXYZ);
 4833 MI.eraseFromParent();
 4834 return true;
 4835 }
 4837 B.buildCopy(DstReg, ClusterIdXYZ);
 4838 MI.eraseFromParent();
 4839 return true;
 4840 }
 4842 using namespace AMDGPU::Hwreg;
 // Read the cluster-id field from IB_STS2 to detect whether clusters are
 // enabled for this dispatch.
 4843 unsigned ClusterIdField = HwregEncoding::encode(ID_IB_STS2, 6, 4);
 4844 Register ClusterId = MRI.createGenericVirtualRegister(S32);
 4845 MRI.setRegClass(ClusterId, &AMDGPU::SReg_32RegClass);
 4846 B.buildInstr(AMDGPU::S_GETREG_B32_const)
 4847 .addDef(ClusterId)
 4848 .addImm(ClusterIdField);
 4849 auto Zero = B.buildConstant(S32, 0);
 4850 auto NoClusters =
 4851 B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1), ClusterId, Zero);
 4852 B.buildSelect(DstReg, NoClusters, ClusterIdXYZ, GlobalIdXYZ);
 4853 MI.eraseFromParent();
 4854 return true;
 4855 }
 4856 }
 4857
 4858 llvm_unreachable("nothing should reach here");
 4859}
4860
// Materialize the value of an implicitly-preloaded function argument
// (workgroup IDs, cluster workgroup IDs and limits, kernarg segment pointer,
// ...) into DstReg. On subtargets with architected SGPRs these live in fixed
// TTMP registers (with per-field masks); otherwise the argument descriptor
// comes from SIMachineFunctionInfo::getPreloadedValue().
4862 Register DstReg, MachineIRBuilder &B,
4864 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
4865 const ArgDescriptor *Arg = nullptr;
4866 const TargetRegisterClass *ArgRC;
4867 LLT ArgTy;
4868
4869 CallingConv::ID CC = B.getMF().getFunction().getCallingConv();
// Workgroup ID X occupies all of TTMP9.
4870 const ArgDescriptor WorkGroupIDX =
4871 ArgDescriptor::createRegister(AMDGPU::TTMP9);
4872 // If GridZ is not programmed in an entry function then the hardware will set
4873 // it to all zeros, so there is no need to mask the GridY value in the low
4874 // order bits.
4875 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
4876 AMDGPU::TTMP7,
4877 AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu)
// Workgroup IDs Y and Z are packed into the low/high halves of TTMP7.
4878 const ArgDescriptor WorkGroupIDZ =
4879 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
// Cluster workgroup IDs and max IDs are packed as 4-bit nibbles of TTMP6.
4880 const ArgDescriptor ClusterWorkGroupIDX =
4881 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
4882 const ArgDescriptor ClusterWorkGroupIDY =
4883 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
4884 const ArgDescriptor ClusterWorkGroupIDZ =
4885 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
4886 const ArgDescriptor ClusterWorkGroupMaxIDX =
4887 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
4888 const ArgDescriptor ClusterWorkGroupMaxIDY =
4889 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
4890 const ArgDescriptor ClusterWorkGroupMaxIDZ =
4891 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
4892 const ArgDescriptor ClusterWorkGroupMaxFlatID =
4893 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
4894
// Helper: fold a compile-time-known answer directly into DstReg.
4895 auto LoadConstant = [&](unsigned N) {
4896 B.buildConstant(DstReg, N);
4897 return true;
4898 };
4899
4900 if (ST.hasArchitectedSGPRs() &&
4902 AMDGPU::ClusterDimsAttr ClusterDims = MFI->getClusterDims();
4903 bool HasFixedDims = ClusterDims.isFixedDims();
4904
// NOTE(review): the case labels below were dropped by the extraction; each
// group selects the matching TTMP-based descriptor, and cluster-related
// queries fold to a constant when the cluster dimensions are statically known.
4905 switch (ArgType) {
4907 Arg = &WorkGroupIDX;
4908 ArgRC = &AMDGPU::SReg_32RegClass;
4909 ArgTy = LLT::scalar(32);
4910 break;
4912 Arg = &WorkGroupIDY;
4913 ArgRC = &AMDGPU::SReg_32RegClass;
4914 ArgTy = LLT::scalar(32);
4915 break;
4917 Arg = &WorkGroupIDZ;
4918 ArgRC = &AMDGPU::SReg_32RegClass;
4919 ArgTy = LLT::scalar(32);
4920 break;
4922 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
4923 return LoadConstant(0);
4924 Arg = &ClusterWorkGroupIDX;
4925 ArgRC = &AMDGPU::SReg_32RegClass;
4926 ArgTy = LLT::scalar(32);
4927 break;
4929 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
4930 return LoadConstant(0);
4931 Arg = &ClusterWorkGroupIDY;
4932 ArgRC = &AMDGPU::SReg_32RegClass;
4933 ArgTy = LLT::scalar(32);
4934 break;
4936 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
4937 return LoadConstant(0);
4938 Arg = &ClusterWorkGroupIDZ;
4939 ArgRC = &AMDGPU::SReg_32RegClass;
4940 ArgTy = LLT::scalar(32);
4941 break;
// Max IDs are dimension - 1 when the cluster shape is fixed.
4943 if (HasFixedDims)
4944 return LoadConstant(ClusterDims.getDims()[0] - 1);
4945 Arg = &ClusterWorkGroupMaxIDX;
4946 ArgRC = &AMDGPU::SReg_32RegClass;
4947 ArgTy = LLT::scalar(32);
4948 break;
4950 if (HasFixedDims)
4951 return LoadConstant(ClusterDims.getDims()[1] - 1);
4952 Arg = &ClusterWorkGroupMaxIDY;
4953 ArgRC = &AMDGPU::SReg_32RegClass;
4954 ArgTy = LLT::scalar(32);
4955 break;
4957 if (HasFixedDims)
4958 return LoadConstant(ClusterDims.getDims()[2] - 1);
4959 Arg = &ClusterWorkGroupMaxIDZ;
4960 ArgRC = &AMDGPU::SReg_32RegClass;
4961 ArgTy = LLT::scalar(32);
4962 break;
4964 Arg = &ClusterWorkGroupMaxFlatID;
4965 ArgRC = &AMDGPU::SReg_32RegClass;
4966 ArgTy = LLT::scalar(32);
4967 break;
4968 default:
4969 break;
4970 }
4971 }
4972
// Fall back to the normal preloaded-argument bookkeeping.
4973 if (!Arg)
4974 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
4975
4976 if (!Arg) {
4978 // The intrinsic may appear when we have a 0 sized kernarg segment, in
4979 // which case the pointer argument may be missing and we use null.
4980 return LoadConstant(0);
4981 }
4982
4983 // It's undefined behavior if a function marked with the amdgpu-no-*
4984 // attributes uses the corresponding intrinsic.
4985 B.buildUndef(DstReg);
4986 return true;
4987 }
4988
4989 if (!Arg->isRegister() || !Arg->getRegister().isValid())
4990 return false; // TODO: Handle these
4991 buildLoadInputValue(DstReg, B, Arg, ArgRC, ArgTy);
4992 return true;
4993}
4994
// MachineInstr-level wrapper: materialize the preloaded argument into the
// instruction's def register, then delete the (intrinsic) instruction.
// Returns false (leaving MI in place) if the value could not be loaded.
4998 if (!loadInputValue(MI.getOperand(0).getReg(), B, ArgType))
4999 return false;
5000
5001 MI.eraseFromParent();
5002 return true;
5003}
5004
// Replace MI entirely with the constant C written to its destination register.
5006 int64_t C) {
5007 B.buildConstant(MI.getOperand(0).getReg(), C);
5008 MI.eraseFromParent();
5009 return true;
5010}
5011
// Lower a workitem-ID intrinsic for dimension Dim. Folds to the constant 0
// when the workgroup is known to be degenerate in that dimension; otherwise
// loads the preloaded value, asserting the known zero high bits when the ID
// is not a packed/masked field.
5014 unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
5015 unsigned MaxID = ST.getMaxWorkitemID(B.getMF().getFunction(), Dim);
5016 if (MaxID == 0)
5017 return replaceWithConstant(B, MI, 0);
5018
5019 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5020 const ArgDescriptor *Arg;
5021 const TargetRegisterClass *ArgRC;
5022 LLT ArgTy;
5023 std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType);
5024
5025 Register DstReg = MI.getOperand(0).getReg();
5026 if (!Arg) {
5027 // It's undefined behavior if a function marked with the amdgpu-no-*
5028 // attributes uses the corresponding intrinsic.
5029 B.buildUndef(DstReg);
5030 MI.eraseFromParent();
5031 return true;
5032 }
5033
5034 if (Arg->isMasked()) {
5035 // Don't bother inserting AssertZext for packed IDs since we're emitting the
5036 // masking operations anyway.
5037 //
5038 // TODO: We could assert the top bit is 0 for the source copy.
5039 if (!loadInputValue(DstReg, B, ArgType))
5040 return false;
5041 } else {
// Load into a temporary, then assert that only bit_width(MaxID) low bits
// can be set so later combines can exploit the zero-extension.
5043 if (!loadInputValue(TmpReg, B, ArgType))
5044 return false;
5045 B.buildAssertZExt(DstReg, TmpReg, llvm::bit_width(MaxID));
5046 }
5047
5048 MI.eraseFromParent();
5049 return true;
5050}
5051
5054 // This isn't really a constant pool but close enough.
5057 return PtrInfo;
5058}
5059
// Compute a pointer Offset bytes into the kernarg segment by loading the
// preloaded kernarg segment pointer and adding a 64-bit constant offset.
5061 int64_t Offset) const {
5063 Register KernArgReg = B.getMRI()->createGenericVirtualRegister(PtrTy);
5064
5065 // TODO: If we passed in the base kernel offset we could have a better
5066 // alignment than 4, but we don't really need it.
5067 if (!loadInputValue(KernArgReg, B,
5069 llvm_unreachable("failed to find kernarg segment ptr");
5070
5071 auto COffset = B.buildConstant(LLT::scalar(64), Offset);
5072 return B.buildObjectPtrOffset(PtrTy, KernArgReg, COffset).getReg(0);
5073}
5074
5075/// Legalize a value that's loaded from kernel arguments. This is only used by
5076/// legacy intrinsics.
///
/// Replaces MI with a 4-byte-aligned s32 load from the kernarg segment at the
/// given byte Offset. Only 32-bit destinations are expected (asserted).
5080 Align Alignment) const {
5081 Register DstReg = MI.getOperand(0).getReg();
5082
5083 assert(B.getMRI()->getType(DstReg) == LLT::scalar(32) &&
5084 "unexpected kernarg parameter type");
5085
5088 B.buildLoad(DstReg, Ptr, PtrInfo.getWithOffset(Offset), Align(4),
5091 MI.eraseFromParent();
5092 return true;
5093}
5094
// Dispatch G_FDIV legalization by scalar width (f16/f32/f64). Any other
// destination type is not handled here and returns false.
5097 MachineIRBuilder &B) const {
5098 Register Dst = MI.getOperand(0).getReg();
5099 LLT DstTy = MRI.getType(Dst);
5100 LLT S16 = LLT::scalar(16);
5101 LLT S32 = LLT::scalar(32);
5102 LLT S64 = LLT::scalar(64);
5103
5104 if (DstTy == S16)
5105 return legalizeFDIV16(MI, MRI, B);
5106 if (DstTy == S32)
5107 return legalizeFDIV32(MI, MRI, B);
5108 if (DstTy == S64)
5109 return legalizeFDIV64(MI, MRI, B);
5110
5111 return false;
5112}
5113
// Expand 32-bit unsigned division/remainder X / Y. Builds a float-based
// reciprocal estimate (G_AMDGPU_RCP_IFLAG), one Newton-Raphson round, then
// two conditional quotient/remainder refinement steps. Either DstDivReg or
// DstRemReg may be invalid to skip producing that result.
5115 Register DstDivReg,
5116 Register DstRemReg,
5117 Register X,
5118 Register Y) const {
5119 const LLT S1 = LLT::scalar(1);
5120 const LLT S32 = LLT::scalar(32);
5121
5122 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
5123 // algorithm used here.
5124
5125 // Initial estimate of inv(y).
5126 auto FloatY = B.buildUITOFP(S32, Y);
5127 auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
5128 auto Scale = B.buildFConstant(S32, llvm::bit_cast<float>(0x4f7ffffe));
5129 auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
5130 auto Z = B.buildFPTOUI(S32, ScaledY);
5131
5132 // One round of UNR.
5133 auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
5134 auto NegYZ = B.buildMul(S32, NegY, Z);
5135 Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
5136
5137 // Quotient/remainder estimate.
5138 auto Q = B.buildUMulH(S32, X, Z);
5139 auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
5140
5141 // First quotient/remainder refinement.
5142 auto One = B.buildConstant(S32, 1);
5143 auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5144 if (DstDivReg)
5145 Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
5146 R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
5147
5148 // Second quotient/remainder refinement.
5149 Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
5150 if (DstDivReg)
5151 B.buildSelect(DstDivReg, Cond, B.buildAdd(S32, Q, One), Q);
5152
5153 if (DstRemReg)
5154 B.buildSelect(DstRemReg, Cond, B.buildSub(S32, R, Y), R);
5155}
5156
5157// Build integer reciprocal sequence around V_RCP_IFLAG_F32
5158//
5159// Return lo, hi of result
5160//
5161// %cvt.lo = G_UITOFP Val.lo
5162// %cvt.hi = G_UITOFP Val.hi
5163// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
5164// %rcp = G_AMDGPU_RCP_IFLAG %mad
5165// %mul1 = G_FMUL %rcp, 0x5f7ffffc
5166// %mul2 = G_FMUL %mul1, 2**(-32)
5167// %trunc = G_INTRINSIC_TRUNC %mul2
5168// %mad2 = G_FMAD %trunc, -(2**32), %mul1
5169// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
5170static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
5171 Register Val) {
5172 const LLT S32 = LLT::scalar(32);
5173 auto Unmerge = B.buildUnmerge(S32, Val);
5174
5175 auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
5176 auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
5177
5178 auto Mad = B.buildFMAD(
5179 S32, CvtHi, // 2**32
5180 B.buildFConstant(S32, llvm::bit_cast<float>(0x4f800000)), CvtLo);
5181
5182 auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
5183 auto Mul1 = B.buildFMul(
5184 S32, Rcp, B.buildFConstant(S32, llvm::bit_cast<float>(0x5f7ffffc)));
5185
5186 // 2**(-32)
5187 auto Mul2 = B.buildFMul(
5188 S32, Mul1, B.buildFConstant(S32, llvm::bit_cast<float>(0x2f800000)));
5189 auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
5190
5191 // -(2**32)
5192 auto Mad2 = B.buildFMAD(
5193 S32, Trunc, B.buildFConstant(S32, llvm::bit_cast<float>(0xcf800000)),
5194 Mul1);
5195
5196 auto ResultLo = B.buildFPTOUI(S32, Mad2);
5197 auto ResultHi = B.buildFPTOUI(S32, Trunc);
5198
5199 return {ResultLo.getReg(0), ResultHi.getReg(0)};
5200}
5201
// Expand 64-bit unsigned division/remainder Numer / Denom. Uses the float
// reciprocal seed from emitReciprocalU64, two Newton iterations carried out
// in split 32-bit arithmetic (with explicit carry chains), then up to two
// conditional +1/-Denom corrections selected via C3/C6. Either DstDivReg or
// DstRemReg may be invalid to skip that result.
5203 Register DstDivReg,
5204 Register DstRemReg,
5205 Register Numer,
5206 Register Denom) const {
5207 const LLT S32 = LLT::scalar(32);
5208 const LLT S64 = LLT::scalar(64);
5209 const LLT S1 = LLT::scalar(1);
5210 Register RcpLo, RcpHi;
5211
5212 std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
5213
5214 auto Rcp = B.buildMergeLikeInstr(S64, {RcpLo, RcpHi});
5215
5216 auto Zero64 = B.buildConstant(S64, 0);
5217 auto NegDenom = B.buildSub(S64, Zero64, Denom);
5218
// First Newton-Raphson iteration: Rcp += Rcp * umulh(Rcp, -Denom * Rcp).
5219 auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
5220 auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
5221
5222 auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
5223 Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
5224 Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
5225
5226 auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
5227 auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
5228 auto Add1 = B.buildMergeLikeInstr(S64, {Add1_Lo, Add1_Hi});
5229
// Second Newton-Raphson iteration on the refined estimate.
5230 auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
5231 auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
5232 auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
5233 Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
5234 Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
5235
5236 auto Zero32 = B.buildConstant(S32, 0);
5237 auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
5238 auto Add2_Hi = B.buildUAdde(S32, S1, Add1_Hi, MulHi2_Hi, Add2_Lo.getReg(1));
5239 auto Add2 = B.buildMergeLikeInstr(S64, {Add2_Lo, Add2_Hi});
5240
5241 auto UnmergeNumer = B.buildUnmerge(S32, Numer);
5242 Register NumerLo = UnmergeNumer.getReg(0);
5243 Register NumerHi = UnmergeNumer.getReg(1);
5244
// Quotient estimate and remainder = Numer - Denom * quotient.
5245 auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
5246 auto Mul3 = B.buildMul(S64, Denom, MulHi3);
5247 auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
5248 Register Mul3_Lo = UnmergeMul3.getReg(0);
5249 Register Mul3_Hi = UnmergeMul3.getReg(1);
5250 auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
5251 auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
5252 auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
5253 auto Sub1 = B.buildMergeLikeInstr(S64, {Sub1_Lo, Sub1_Hi});
5254
5255 auto UnmergeDenom = B.buildUnmerge(S32, Denom);
5256 Register DenomLo = UnmergeDenom.getReg(0);
5257 Register DenomHi = UnmergeDenom.getReg(1);
5258
// C3 is the 64-bit unsigned "remainder >= Denom" test built from 32-bit
// halves: compare high halves, falling back to the low compare on equality.
5259 auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
5260 auto C1 = B.buildSExt(S32, CmpHi);
5261
5262 auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
5263 auto C2 = B.buildSExt(S32, CmpLo);
5264
5265 auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
5266 auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
5267
5268 // TODO: Here and below portions of the code can be enclosed into if/endif.
5269 // Currently control flow is unconditional and we have 4 selects after
5270 // potential endif to substitute PHIs.
5271
5272 // if C3 != 0 ...
5273 auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
5274 auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
5275 auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
5276 auto Sub2 = B.buildMergeLikeInstr(S64, {Sub2_Lo, Sub2_Hi});
5277
5278 auto One64 = B.buildConstant(S64, 1);
5279 auto Add3 = B.buildAdd(S64, MulHi3, One64);
5280
// C6 repeats the ">= Denom" test on the once-corrected remainder.
5281 auto C4 =
5282 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
5283 auto C5 =
5284 B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
5285 auto C6 = B.buildSelect(
5286 S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
5287
5288 // if (C6 != 0)
5289 auto Add4 = B.buildAdd(S64, Add3, One64);
5290 auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
5291
5292 auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
5293 auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
5294 auto Sub3 = B.buildMergeLikeInstr(S64, {Sub3_Lo, Sub3_Hi});
5295
5296 // endif C6
5297 // endif C3
5298
5299 if (DstDivReg) {
5300 auto Sel1 = B.buildSelect(
5301 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
5302 B.buildSelect(DstDivReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5303 Sel1, MulHi3);
5304 }
5305
5306 if (DstRemReg) {
5307 auto Sel2 = B.buildSelect(
5308 S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
5309 B.buildSelect(DstRemReg, B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32),
5310 Sel2, Sub1);
5311 }
5312}
5313
5316 MachineIRBuilder &B) const {
5317 Register DstDivReg, DstRemReg;
5318 switch (MI.getOpcode()) {
5319 default:
5320 llvm_unreachable("Unexpected opcode!");
5321 case AMDGPU::G_UDIV: {
5322 DstDivReg = MI.getOperand(0).getReg();
5323 break;
5324 }
5325 case AMDGPU::G_UREM: {
5326 DstRemReg = MI.getOperand(0).getReg();
5327 break;
5328 }
5329 case AMDGPU::G_UDIVREM: {
5330 DstDivReg = MI.getOperand(0).getReg();
5331 DstRemReg = MI.getOperand(1).getReg();
5332 break;
5333 }
5334 }
5335
5336 const LLT S64 = LLT::scalar(64);
5337 const LLT S32 = LLT::scalar(32);
5338 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5339 Register Num = MI.getOperand(FirstSrcOpIdx).getReg();
5340 Register Den = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5341 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5342
5343 if (Ty == S32)
5344 legalizeUnsignedDIV_REM32Impl(B, DstDivReg, DstRemReg, Num, Den);
5345 else if (Ty == S64)
5346 legalizeUnsignedDIV_REM64Impl(B, DstDivReg, DstRemReg, Num, Den);
5347 else
5348 return false;
5349
5350 MI.eraseFromParent();
5351 return true;
5352}
5353
5356 MachineIRBuilder &B) const {
5357 const LLT S64 = LLT::scalar(64);
5358 const LLT S32 = LLT::scalar(32);
5359
5360 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
5361 if (Ty != S32 && Ty != S64)
5362 return false;
5363
5364 const unsigned FirstSrcOpIdx = MI.getNumExplicitDefs();
5365 Register LHS = MI.getOperand(FirstSrcOpIdx).getReg();
5366 Register RHS = MI.getOperand(FirstSrcOpIdx + 1).getReg();
5367
5368 auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
5369 auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
5370 auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
5371
5372 LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
5373 RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
5374
5375 LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
5376 RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
5377
5378 Register DstDivReg, DstRemReg, TmpDivReg, TmpRemReg;
5379 switch (MI.getOpcode()) {
5380 default:
5381 llvm_unreachable("Unexpected opcode!");
5382 case AMDGPU::G_SDIV: {
5383 DstDivReg = MI.getOperand(0).getReg();
5384 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5385 break;
5386 }
5387 case AMDGPU::G_SREM: {
5388 DstRemReg = MI.getOperand(0).getReg();
5389 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5390 break;
5391 }
5392 case AMDGPU::G_SDIVREM: {
5393 DstDivReg = MI.getOperand(0).getReg();
5394 DstRemReg = MI.getOperand(1).getReg();
5395 TmpDivReg = MRI.createGenericVirtualRegister(Ty);
5396 TmpRemReg = MRI.createGenericVirtualRegister(Ty);
5397 break;
5398 }
5399 }
5400
5401 if (Ty == S32)
5402 legalizeUnsignedDIV_REM32Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5403 else
5404 legalizeUnsignedDIV_REM64Impl(B, TmpDivReg, TmpRemReg, LHS, RHS);
5405
5406 if (DstDivReg) {
5407 auto Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
5408 auto SignXor = B.buildXor(Ty, TmpDivReg, Sign).getReg(0);
5409 B.buildSub(DstDivReg, SignXor, Sign);
5410 }
5411
5412 if (DstRemReg) {
5413 auto Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
5414 auto SignXor = B.buildXor(Ty, TmpRemReg, Sign).getReg(0);
5415 B.buildSub(DstRemReg, SignXor, Sign);
5416 }
5417
5418 MI.eraseFromParent();
5419 return true;
5420}
5421
5424 MachineIRBuilder &B) const {
5425 Register Res = MI.getOperand(0).getReg();
5426 Register LHS = MI.getOperand(1).getReg();
5427 Register RHS = MI.getOperand(2).getReg();
5428 uint16_t Flags = MI.getFlags();
5429 LLT ResTy = MRI.getType(Res);
5430
5431 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5432
5433 if (const auto *CLHS = getConstantFPVRegVal(LHS, MRI)) {
5434 if (!AllowInaccurateRcp && ResTy != LLT::scalar(16))
5435 return false;
5436
5437 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
5438 // the CI documentation has a worst case error of 1 ulp.
5439 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
5440 // use it as long as we aren't trying to use denormals.
5441 //
5442 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
5443
5444 // 1 / x -> RCP(x)
5445 if (CLHS->isExactlyValue(1.0)) {
5446 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5447 .addUse(RHS)
5448 .setMIFlags(Flags);
5449
5450 MI.eraseFromParent();
5451 return true;
5452 }
5453
5454 // -1 / x -> RCP( FNEG(x) )
5455 if (CLHS->isExactlyValue(-1.0)) {
5456 auto FNeg = B.buildFNeg(ResTy, RHS, Flags);
5457 B.buildIntrinsic(Intrinsic::amdgcn_rcp, Res)
5458 .addUse(FNeg.getReg(0))
5459 .setMIFlags(Flags);
5460
5461 MI.eraseFromParent();
5462 return true;
5463 }
5464 }
5465
5466 // For f16 require afn or arcp.
5467 // For f32 require afn.
5468 if (!AllowInaccurateRcp && (ResTy != LLT::scalar(16) ||
5469 !MI.getFlag(MachineInstr::FmArcp)))
5470 return false;
5471
5472 // x / y -> x * (1.0 / y)
5473 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5474 .addUse(RHS)
5475 .setMIFlags(Flags);
5476 B.buildFMul(Res, LHS, RCP, Flags);
5477
5478 MI.eraseFromParent();
5479 return true;
5480}
5481
5484 MachineIRBuilder &B) const {
5485 Register Res = MI.getOperand(0).getReg();
5486 Register X = MI.getOperand(1).getReg();
5487 Register Y = MI.getOperand(2).getReg();
5488 uint16_t Flags = MI.getFlags();
5489 LLT ResTy = MRI.getType(Res);
5490
5491 bool AllowInaccurateRcp = MI.getFlag(MachineInstr::FmAfn);
5492
5493 if (!AllowInaccurateRcp)
5494 return false;
5495
5496 auto NegY = B.buildFNeg(ResTy, Y);
5497 auto One = B.buildFConstant(ResTy, 1.0);
5498
5499 auto R = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy})
5500 .addUse(Y)
5501 .setMIFlags(Flags);
5502
5503 auto Tmp0 = B.buildFMA(ResTy, NegY, R, One);
5504 R = B.buildFMA(ResTy, Tmp0, R, R);
5505
5506 auto Tmp1 = B.buildFMA(ResTy, NegY, R, One);
5507 R = B.buildFMA(ResTy, Tmp1, R, R);
5508
5509 auto Ret = B.buildFMul(ResTy, X, R);
5510 auto Tmp2 = B.buildFMA(ResTy, NegY, Ret, X);
5511
5512 B.buildFMA(Res, Tmp2, R, Ret);
5513 MI.eraseFromParent();
5514 return true;
5515}
5516
5519 MachineIRBuilder &B) const {
5520 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5521 return true;
5522
5523 Register Res = MI.getOperand(0).getReg();
5524 Register LHS = MI.getOperand(1).getReg();
5525 Register RHS = MI.getOperand(2).getReg();
5526
5527 uint16_t Flags = MI.getFlags();
5528
5529 LLT S16 = LLT::scalar(16);
5530 LLT S32 = LLT::scalar(32);
5531
5532 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
5533 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
5534 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
5535 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
5536 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5537 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
5538 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
5539 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
5540 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
5541 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
5542 // q16.u = opx(V_CVT_F16_F32, q32.u);
5543 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
5544
5545 auto LHSExt = B.buildFPExt(S32, LHS, Flags);
5546 auto RHSExt = B.buildFPExt(S32, RHS, Flags);
5547 auto NegRHSExt = B.buildFNeg(S32, RHSExt);
5548 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5549 .addUse(RHSExt.getReg(0))
5550 .setMIFlags(Flags);
5551 auto Quot = B.buildFMul(S32, LHSExt, Rcp, Flags);
5553 if (ST.hasMadMacF32Insts()) {
5554 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5555 Quot = B.buildFMAD(S32, Err, Rcp, Quot, Flags);
5556 Err = B.buildFMAD(S32, NegRHSExt, Quot, LHSExt, Flags);
5557 } else {
5558 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5559 Quot = B.buildFMA(S32, Err, Rcp, Quot, Flags);
5560 Err = B.buildFMA(S32, NegRHSExt, Quot, LHSExt, Flags);
5561 }
5562 auto Tmp = B.buildFMul(S32, Err, Rcp, Flags);
5563 Tmp = B.buildAnd(S32, Tmp, B.buildConstant(S32, 0xff800000));
5564 Quot = B.buildFAdd(S32, Tmp, Quot, Flags);
5565 auto RDst = B.buildFPTrunc(S16, Quot, Flags);
5566 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5567 .addUse(RDst.getReg(0))
5568 .addUse(RHS)
5569 .addUse(LHS)
5570 .setMIFlags(Flags);
5571
5572 MI.eraseFromParent();
5573 return true;
5574}
5575
5576static constexpr unsigned SPDenormModeBitField =
5578
5579// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
5580// to enable denorm mode. When 'Enable' is false, disable denorm mode.
// On targets with S_DENORM_MODE the FP32 bits are written directly (while
// re-asserting the function's default FP64/FP16 mode); otherwise the MODE
// register field is updated via S_SETREG.
5582 const GCNSubtarget &ST,
5584 // Set SP denorm mode to this value.
5585 unsigned SPDenormMode =
5586 Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
5587
5588 if (ST.hasDenormModeInst()) {
5589 // Preserve default FP64FP16 denorm mode while updating FP32 mode.
5590 uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
5591
5592 uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
5593 B.buildInstr(AMDGPU::S_DENORM_MODE)
5594 .addImm(NewDenormModeValue);
5595
5596 } else {
5597 B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
5598 .addImm(SPDenormMode)
5599 .addImm(SPDenormModeBitField);
5600 }
5601}
5602
5605 MachineIRBuilder &B) const {
5606 if (legalizeFastUnsafeFDIV(MI, MRI, B))
5607 return true;
5608
5609 Register Res = MI.getOperand(0).getReg();
5610 Register LHS = MI.getOperand(1).getReg();
5611 Register RHS = MI.getOperand(2).getReg();
5612 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
5613 SIModeRegisterDefaults Mode = MFI->getMode();
5614
5615 uint16_t Flags = MI.getFlags();
5616
5617 LLT S32 = LLT::scalar(32);
5618 LLT S1 = LLT::scalar(1);
5619
5620 auto One = B.buildFConstant(S32, 1.0f);
5621
5622 auto DenominatorScaled =
5623 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5624 .addUse(LHS)
5625 .addUse(RHS)
5626 .addImm(0)
5627 .setMIFlags(Flags);
5628 auto NumeratorScaled =
5629 B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1})
5630 .addUse(LHS)
5631 .addUse(RHS)
5632 .addImm(1)
5633 .setMIFlags(Flags);
5634
5635 auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5636 .addUse(DenominatorScaled.getReg(0))
5637 .setMIFlags(Flags);
5638 auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
5639
5640 const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
5641 const bool HasDynamicDenormals =
5642 (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
5643 (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
5644
5645 Register SavedSPDenormMode;
5646 if (!PreservesDenormals) {
5647 if (HasDynamicDenormals) {
5648 SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5649 B.buildInstr(AMDGPU::S_GETREG_B32)
5650 .addDef(SavedSPDenormMode)
5651 .addImm(SPDenormModeBitField);
5652 }
5653 toggleSPDenormMode(true, B, ST, Mode);
5654 }
5655
5656 auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
5657 auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
5658 auto Mul = B.buildFMul(S32, NumeratorScaled, Fma1, Flags);
5659 auto Fma2 = B.buildFMA(S32, NegDivScale0, Mul, NumeratorScaled, Flags);
5660 auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
5661 auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
5662
5663 if (!PreservesDenormals) {
5664 if (HasDynamicDenormals) {
5665 assert(SavedSPDenormMode);
5666 B.buildInstr(AMDGPU::S_SETREG_B32)
5667 .addReg(SavedSPDenormMode)
5668 .addImm(SPDenormModeBitField);
5669 } else
5670 toggleSPDenormMode(false, B, ST, Mode);
5671 }
5672
5673 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
5674 .addUse(Fma4.getReg(0))
5675 .addUse(Fma1.getReg(0))
5676 .addUse(Fma3.getReg(0))
5677 .addUse(NumeratorScaled.getReg(1))
5678 .setMIFlags(Flags);
5679
5680 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, Res)
5681 .addUse(Fmas.getReg(0))
5682 .addUse(RHS)
5683 .addUse(LHS)
5684 .setMIFlags(Flags);
5685
5686 MI.eraseFromParent();
5687 return true;
5688}
5689
5692 MachineIRBuilder &B) const {
5693 if (legalizeFastUnsafeFDIV64(MI, MRI, B))
5694 return true;
5695
5696 Register Res = MI.getOperand(0).getReg();
5697 Register LHS = MI.getOperand(1).getReg();
5698 Register RHS = MI.getOperand(2).getReg();
5699
5700 uint16_t Flags = MI.getFlags();
5701
5702 LLT S64 = LLT::scalar(64);
5703 LLT S1 = LLT::scalar(1);
5704
5705 auto One = B.buildFConstant(S64, 1.0);
5706
5707 auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5708 .addUse(LHS)
5709 .addUse(RHS)
5710 .addImm(0)
5711 .setMIFlags(Flags);
5712
5713 auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
5714
5715 auto Rcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S64})
5716 .addUse(DivScale0.getReg(0))
5717 .setMIFlags(Flags);
5718
5719 auto Fma0 = B.buildFMA(S64, NegDivScale0, Rcp, One, Flags);
5720 auto Fma1 = B.buildFMA(S64, Rcp, Fma0, Rcp, Flags);
5721 auto Fma2 = B.buildFMA(S64, NegDivScale0, Fma1, One, Flags);
5722
5723 auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1})
5724 .addUse(LHS)
5725 .addUse(RHS)
5726 .addImm(1)
5727 .setMIFlags(Flags);
5728
5729 auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
5730 auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
5731 auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
5732
5733 Register Scale;
5734 if (!ST.hasUsableDivScaleConditionOutput()) {
5735 // Workaround a hardware bug on SI where the condition output from div_scale
5736 // is not usable.
5737
5738 LLT S32 = LLT::scalar(32);
5739
5740 auto NumUnmerge = B.buildUnmerge(S32, LHS);
5741 auto DenUnmerge = B.buildUnmerge(S32, RHS);
5742 auto Scale0Unmerge = B.buildUnmerge(S32, DivScale0);
5743 auto Scale1Unmerge = B.buildUnmerge(S32, DivScale1);
5744
5745 auto CmpNum = B.buildICmp(ICmpInst::ICMP_EQ, S1, NumUnmerge.getReg(1),
5746 Scale1Unmerge.getReg(1));
5747 auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
5748 Scale0Unmerge.getReg(1));
5749 Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
5750 } else {
5751 Scale = DivScale1.getReg(1);
5752 }
5753
5754 auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S64})
5755 .addUse(Fma4.getReg(0))
5756 .addUse(Fma3.getReg(0))
5757 .addUse(Mul.getReg(0))
5758 .addUse(Scale)
5759 .setMIFlags(Flags);
5760
5761 B.buildIntrinsic(Intrinsic::amdgcn_div_fixup, ArrayRef(Res))
5762 .addUse(Fmas.getReg(0))
5763 .addUse(RHS)
5764 .addUse(LHS)
5765 .setMIFlags(Flags);
5766
5767 MI.eraseFromParent();
5768 return true;
5769}
5770
5773 MachineIRBuilder &B) const {
5774 Register Res0 = MI.getOperand(0).getReg();
5775 Register Res1 = MI.getOperand(1).getReg();
5776 Register Val = MI.getOperand(2).getReg();
5777 uint16_t Flags = MI.getFlags();
5778
5779 LLT Ty = MRI.getType(Res0);
5780 LLT InstrExpTy = Ty == LLT::scalar(16) ? LLT::scalar(16) : LLT::scalar(32);
5781
5782 auto Mant = B.buildIntrinsic(Intrinsic::amdgcn_frexp_mant, {Ty})
5783 .addUse(Val)
5784 .setMIFlags(Flags);
5785 auto Exp = B.buildIntrinsic(Intrinsic::amdgcn_frexp_exp, {InstrExpTy})
5786 .addUse(Val)
5787 .setMIFlags(Flags);
5788
5789 if (ST.hasFractBug()) {
5790 auto Fabs = B.buildFAbs(Ty, Val);
5791 auto Inf = B.buildFConstant(Ty, APFloat::getInf(getFltSemanticForLLT(Ty)));
5792 auto IsFinite =
5793 B.buildFCmp(CmpInst::FCMP_OLT, LLT::scalar(1), Fabs, Inf, Flags);
5794 auto Zero = B.buildConstant(InstrExpTy, 0);
5795 Exp = B.buildSelect(InstrExpTy, IsFinite, Exp, Zero);
5796 Mant = B.buildSelect(Ty, IsFinite, Mant, Val);
5797 }
5798
5799 B.buildCopy(Res0, Mant);
5800 B.buildSExtOrTrunc(Res1, Exp);
5801
5802 MI.eraseFromParent();
5803 return true;
5804}
5805
5808 MachineIRBuilder &B) const {
5809 Register Res = MI.getOperand(0).getReg();
5810 Register LHS = MI.getOperand(2).getReg();
5811 Register RHS = MI.getOperand(3).getReg();
5812 uint16_t Flags = MI.getFlags();
5813
5814 LLT S32 = LLT::scalar(32);
5815 LLT S1 = LLT::scalar(1);
5816
5817 auto Abs = B.buildFAbs(S32, RHS, Flags);
5818 const APFloat C0Val(1.0f);
5819
5820 auto C0 = B.buildFConstant(S32, 0x1p+96f);
5821 auto C1 = B.buildFConstant(S32, 0x1p-32f);
5822 auto C2 = B.buildFConstant(S32, 1.0f);
5823
5824 auto CmpRes = B.buildFCmp(CmpInst::FCMP_OGT, S1, Abs, C0, Flags);
5825 auto Sel = B.buildSelect(S32, CmpRes, C1, C2, Flags);
5826
5827 auto Mul0 = B.buildFMul(S32, RHS, Sel, Flags);
5828
5829 auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32})
5830 .addUse(Mul0.getReg(0))
5831 .setMIFlags(Flags);
5832
5833 auto Mul1 = B.buildFMul(S32, LHS, RCP, Flags);
5834
5835 B.buildFMul(Res, Sel, Mul1, Flags);
5836
5837 MI.eraseFromParent();
5838 return true;
5839}
5840
5843 MachineIRBuilder &B) const {
5844 // Bypass the correct expansion a standard promotion through G_FSQRT would
5845 // get. The f32 op is accurate enough for the f16 cas.
5846 unsigned Flags = MI.getFlags();
5847 assert(!ST.has16BitInsts());
5848 const LLT F32 = LLT::scalar(32);
5849 auto Ext = B.buildFPExt(F32, MI.getOperand(1), Flags);
5850 auto Log2 = B.buildIntrinsic(Intrinsic::amdgcn_sqrt, {F32})
5851 .addUse(Ext.getReg(0))
5852 .setMIFlags(Flags);
5853 B.buildFPTrunc(MI.getOperand(0), Log2, Flags);
5854 MI.eraseFromParent();
5855 return true;
5856}
5857
5860 MachineIRBuilder &B) const {
5861 MachineFunction &MF = B.getMF();
5862 Register Dst = MI.getOperand(0).getReg();
5863 Register X = MI.getOperand(1).getReg();
5864 const unsigned Flags = MI.getFlags();
5865 const LLT S1 = LLT::scalar(1);
5866 const LLT F32 = LLT::scalar(32);
5867 const LLT I32 = LLT::scalar(32);
5868
5869 if (allowApproxFunc(MF, Flags)) {
5870 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({Dst}))
5871 .addUse(X)
5872 .setMIFlags(Flags);
5873 MI.eraseFromParent();
5874 return true;
5875 }
5876
5877 auto ScaleThreshold = B.buildFConstant(F32, 0x1.0p-96f);
5878 auto NeedScale = B.buildFCmp(CmpInst::FCMP_OGT, S1, ScaleThreshold, X, Flags);
5879 auto ScaleUpFactor = B.buildFConstant(F32, 0x1.0p+32f);
5880 auto ScaledX = B.buildFMul(F32, X, ScaleUpFactor, Flags);
5881 auto SqrtX = B.buildSelect(F32, NeedScale, ScaledX, X, Flags);
5882
5884 if (needsDenormHandlingF32(MF, X, Flags)) {
5885 B.buildIntrinsic(Intrinsic::amdgcn_sqrt, ArrayRef<Register>({SqrtS}))
5886 .addUse(SqrtX.getReg(0))
5887 .setMIFlags(Flags);
5888
5889 auto NegOne = B.buildConstant(I32, -1);
5890 auto SqrtSNextDown = B.buildAdd(I32, SqrtS, NegOne);
5891
5892 auto NegSqrtSNextDown = B.buildFNeg(F32, SqrtSNextDown, Flags);
5893 auto SqrtVP = B.buildFMA(F32, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
5894
5895 auto PosOne = B.buildConstant(I32, 1);
5896 auto SqrtSNextUp = B.buildAdd(I32, SqrtS, PosOne);
5897
5898 auto NegSqrtSNextUp = B.buildFNeg(F32, SqrtSNextUp, Flags);
5899 auto SqrtVS = B.buildFMA(F32, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
5900
5901 auto Zero = B.buildFConstant(F32, 0.0f);
5902 auto SqrtVPLE0 = B.buildFCmp(CmpInst::FCMP_OLE, S1, SqrtVP, Zero, Flags);
5903
5904 SqrtS =
5905 B.buildSelect(F32, SqrtVPLE0, SqrtSNextDown, SqrtS, Flags).getReg(0);
5906
5907 auto SqrtVPVSGT0 = B.buildFCmp(CmpInst::FCMP_OGT, S1, SqrtVS, Zero, Flags);
5908 SqrtS =
5909 B.buildSelect(F32, SqrtVPVSGT0, SqrtSNextUp, SqrtS, Flags).getReg(0);
5910 } else {
5911 auto SqrtR =
5912 B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F32}).addReg(SqrtX.getReg(0));
5913 B.buildFMul(SqrtS, SqrtX, SqrtR, Flags);
5914
5915 auto Half = B.buildFConstant(F32, 0.5f);
5916 auto SqrtH = B.buildFMul(F32, SqrtR, Half, Flags);
5917 auto NegSqrtH = B.buildFNeg(F32, SqrtH, Flags);
5918 auto SqrtE = B.buildFMA(F32, NegSqrtH, SqrtS, Half, Flags);
5919 SqrtH = B.buildFMA(F32, SqrtH, SqrtE, SqrtH, Flags);
5920 SqrtS = B.buildFMA(F32, SqrtS, SqrtE, SqrtS, Flags).getReg(0);
5921 auto NegSqrtS = B.buildFNeg(F32, SqrtS, Flags);
5922 auto SqrtD = B.buildFMA(F32, NegSqrtS, SqrtS, SqrtX, Flags);
5923 SqrtS = B.buildFMA(F32, SqrtD, SqrtH, SqrtS, Flags).getReg(0);
5924 }
5925
5926 auto ScaleDownFactor = B.buildFConstant(F32, 0x1.0p-16f);
5927
5928 auto ScaledDown = B.buildFMul(F32, SqrtS, ScaleDownFactor, Flags);
5929
5930 SqrtS = B.buildSelect(F32, NeedScale, ScaledDown, SqrtS, Flags).getReg(0);
5931
5932 auto IsZeroOrInf = B.buildIsFPClass(LLT::scalar(1), SqrtX, fcZero | fcPosInf);
5933 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtS, Flags);
5934
5935 MI.eraseFromParent();
5936 return true;
5937}
5938
// NOTE(review): the opening signature line(s) of this function are missing
// from this listing; from the assert below this is the f64 G_FSQRT
// legalization (presumably AMDGPULegalizerInfo::legalizeFSQRTF64 — confirm
// against the upstream source).
5941 MachineIRBuilder &B) const {
5942 // For double type, the SQRT and RSQ instructions don't have required
5943 // precision, we apply Goldschmidt's algorithm to improve the result:
5944 //
5945 // y0 = rsq(x)
5946 // g0 = x * y0
5947 // h0 = 0.5 * y0
5948 //
5949 // r0 = 0.5 - h0 * g0
5950 // g1 = g0 * r0 + g0
5951 // h1 = h0 * r0 + h0
5952 //
5953 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
5954 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
5955 // h2 = h1 * r1 + h1
5956 //
5957 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
5958 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
5959 //
5960 // sqrt(x) = g3
5961
5962 const LLT S1 = LLT::scalar(1);
5963 const LLT S32 = LLT::scalar(32);
5964 const LLT F64 = LLT::scalar(64);
5965
5966 Register Dst = MI.getOperand(0).getReg();
5967 assert(MRI.getType(Dst) == F64 && "only expect to lower f64 sqrt");
5968
5969 Register X = MI.getOperand(1).getReg();
5970 unsigned Flags = MI.getFlags();
5971
5972 Register SqrtX = X;
5973 Register Scaling, ZeroInt;
// Without the approximate-function flag we must pre-scale tiny inputs so
// the rsq-based iteration keeps precision.
5974 if (!MI.getFlag(MachineInstr::FmAfn)) {
5975 auto ScaleConstant = B.buildFConstant(F64, 0x1.0p-767);
5976
5977 ZeroInt = B.buildConstant(S32, 0).getReg(0);
5978 Scaling = B.buildFCmp(FCmpInst::FCMP_OLT, S1, X, ScaleConstant).getReg(0);
5979
5980 // Scale up input if it is too small.
// ldexp by +256 here; the matching post-scale below is -128 because
// sqrt halves the exponent.
5981 auto ScaleUpFactor = B.buildConstant(S32, 256);
5982 auto ScaleUp = B.buildSelect(S32, Scaling, ScaleUpFactor, ZeroInt);
5983 SqrtX = B.buildFLdexp(F64, X, ScaleUp, Flags).getReg(0);
5984 }
5985
// Initial estimate y0 = rsq(x), then Goldschmidt g/h refinement per the
// comment block above.
5986 auto SqrtY = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {F64}).addReg(SqrtX);
5987
5988 auto Half = B.buildFConstant(F64, 0.5);
5989 auto SqrtH0 = B.buildFMul(F64, SqrtY, Half);
5990 auto SqrtS0 = B.buildFMul(F64, SqrtX, SqrtY);
5991
5992 auto NegSqrtH0 = B.buildFNeg(F64, SqrtH0);
5993 auto SqrtR0 = B.buildFMA(F64, NegSqrtH0, SqrtS0, Half);
5994
5995 auto SqrtS1 = B.buildFMA(F64, SqrtS0, SqrtR0, SqrtS0);
5996 auto SqrtH1 = B.buildFMA(F64, SqrtH0, SqrtR0, SqrtH0);
5997
5998 auto NegSqrtS1 = B.buildFNeg(F64, SqrtS1);
5999 auto SqrtD0 = B.buildFMA(F64, NegSqrtS1, SqrtS1, SqrtX);
6000
6001 auto SqrtS2 = B.buildFMA(F64, SqrtD0, SqrtH1, SqrtS1);
6002
6003 Register SqrtRet = SqrtS2.getReg(0);
// One extra refinement step plus the ldexp(-128) de-scale, skipped when
// 'afn' allows the cheaper approximation.
6004 if (!MI.getFlag(MachineInstr::FmAfn)) {
6005 auto NegSqrtS2 = B.buildFNeg(F64, SqrtS2);
6006 auto SqrtD1 = B.buildFMA(F64, NegSqrtS2, SqrtS2, SqrtX);
6007 auto SqrtD2 = B.buildFMA(F64, SqrtD1, SqrtH1, SqrtS2);
6008
6009 // Scale down the result.
6010 auto ScaleDownFactor = B.buildConstant(S32, -128);
6011 auto ScaleDown = B.buildSelect(S32, Scaling, ScaleDownFactor, ZeroInt);
6012 SqrtRet = B.buildFLdexp(F64, SqrtD2, ScaleDown, Flags).getReg(0);
6013 }
6014
// With 'ninf' only the zero case needs special handling; otherwise use a
// class test for zero or +inf.
6015 Register IsZeroOrInf;
6016 if (MI.getFlag(MachineInstr::FmNoInfs)) {
6017 auto ZeroFP = B.buildFConstant(F64, 0.0);
6018 IsZeroOrInf = B.buildFCmp(FCmpInst::FCMP_OEQ, S1, SqrtX, ZeroFP).getReg(0);
6019 } else {
6020 IsZeroOrInf = B.buildIsFPClass(S1, SqrtX, fcZero | fcPosInf).getReg(0);
6021 }
6022
6023 // TODO: Check for DAZ and expand to subnormals
6024
6025 // If x is +INF, +0, or -0, use its original value
6026 B.buildSelect(Dst, IsZeroOrInf, SqrtX, SqrtRet, Flags);
6027
6028 MI.eraseFromParent();
6029 return true;
6030}
6031
// Dispatch G_FSQRT legalization by the destination scalar width (f16/f32/f64).
// NOTE(review): the signature line above this brace is missing from this
// listing (presumably AMDGPULegalizerInfo::legalizeFSQRT).
6034 MachineIRBuilder &B) const {
6035 LLT Ty = MRI.getType(MI.getOperand(0).getReg());
6036 if (Ty == LLT::scalar(32))
6037 return legalizeFSQRTF32(MI, MRI, B);
6038 if (Ty == LLT::scalar(64))
6039 return legalizeFSQRTF64(MI, MRI, B);
6040 if (Ty == LLT::scalar(16))
6041 return legalizeFSQRTF16(MI, MRI, B);
// Unsupported width: report failure so the generic legalizer can handle it.
6042 return false;
6043}
6044
6045// Expand llvm.amdgcn.rsq.clamp on targets that don't support the instruction.
6046// FIXME: Why do we handle this one but not other removed instructions?
6047//
6048// Reciprocal square root. The clamp prevents infinite results, clamping
6049// infinities to max_float. D.f = 1.0 / sqrt(S0.f), result clamped to
6050// +-max_float.
// Expansion of llvm.amdgcn.rsq.clamp (see the comment above): rsq(x) clamped
// to +/-max_float. NOTE(review): the signature line is missing from this
// listing (presumably AMDGPULegalizerInfo::legalizeRsqClampIntrinsic).
6053 MachineIRBuilder &B) const {
// Pre-VI targets have the native clamped instruction, nothing to expand.
6054 if (ST.getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
6055 return true;
6056
6057 Register Dst = MI.getOperand(0).getReg();
6058 Register Src = MI.getOperand(2).getReg();
6059 auto Flags = MI.getFlags();
6060
6061 LLT Ty = MRI.getType(Dst);
6062
// Only f32 and f64 are expanded; other types are rejected.
6063 const fltSemantics *FltSemantics;
6064 if (Ty == LLT::scalar(32))
6065 FltSemantics = &APFloat::IEEEsingle();
6066 else if (Ty == LLT::scalar(64))
6067 FltSemantics = &APFloat::IEEEdouble();
6068 else
6069 return false;
6070
6071 auto Rsq = B.buildIntrinsic(Intrinsic::amdgcn_rsq, {Ty})
6072 .addUse(Src)
6073 .setMIFlags(Flags);
6074
6075 // We don't need to concern ourselves with the snan handling difference, since
6076 // the rsq quieted (or not) so use the one which will directly select.
6077 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6078 const bool UseIEEE = MFI->getMode().IEEE;
6079
// min(rsq, +largest) then max(..., -largest); the IEEE variants are used
// when the function runs in IEEE mode so selection is direct.
6080 auto MaxFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics));
6081 auto ClampMax = UseIEEE ? B.buildFMinNumIEEE(Ty, Rsq, MaxFlt, Flags) :
6082 B.buildFMinNum(Ty, Rsq, MaxFlt, Flags);
6083
6084 auto MinFlt = B.buildFConstant(Ty, APFloat::getLargest(*FltSemantics, true));
6085
6086 if (UseIEEE)
6087 B.buildFMaxNumIEEE(Dst, ClampMax, MinFlt, Flags);
6088 else
6089 B.buildFMaxNum(Dst, ClampMax, MinFlt, Flags);
6090 MI.eraseFromParent();
6091 return true;
6092}
6093
6094// TODO: Fix pointer type handling
// Legalize cross-lane intrinsics (readlane/writelane/permlane/set.inactive/
// update.dpp/...) whose value type is not natively 32 (or 64) bits wide:
// small types are any-extended to 32 bits, large types are split into
// 32/64-bit pieces, one lane op per piece. NOTE(review): the signature lines
// are missing from this listing (presumably
// AMDGPULegalizerInfo::legalizeLaneOp(LegalizerHelper &, MachineInstr &,
// Intrinsic::ID)).
6097 Intrinsic::ID IID) const {
6098
6099 MachineIRBuilder &B = Helper.MIRBuilder;
6100 MachineRegisterInfo &MRI = *B.getMRI();
6101
6102 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
6103 IID == Intrinsic::amdgcn_permlanex16;
6104 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
6105 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
6106
// Rebuild one lane-op instance for a (possibly split/extended) value of
// type VT, re-attaching the per-intrinsic extra register/immediate operands.
6107 auto createLaneOp = [&IID, &B, &MI](Register Src0, Register Src1,
6108 Register Src2, LLT VT) -> Register {
6109 auto LaneOp = B.buildIntrinsic(IID, {VT}).addUse(Src0);
6110 switch (IID) {
6111 case Intrinsic::amdgcn_readfirstlane:
6112 case Intrinsic::amdgcn_permlane64:
6113 return LaneOp.getReg(0);
6114 case Intrinsic::amdgcn_readlane:
6115 case Intrinsic::amdgcn_set_inactive:
6116 case Intrinsic::amdgcn_set_inactive_chain_arg:
6117 return LaneOp.addUse(Src1).getReg(0);
6118 case Intrinsic::amdgcn_writelane:
6119 return LaneOp.addUse(Src1).addUse(Src2).getReg(0);
6120 case Intrinsic::amdgcn_permlane16:
6121 case Intrinsic::amdgcn_permlanex16: {
// permlane16 carries two extra lane-select registers and two immediates
// (fi/bound_ctrl-style controls) taken directly from the original MI.
6122 Register Src3 = MI.getOperand(5).getReg();
6123 int64_t Src4 = MI.getOperand(6).getImm();
6124 int64_t Src5 = MI.getOperand(7).getImm();
6125 return LaneOp.addUse(Src1)
6126 .addUse(Src2)
6127 .addUse(Src3)
6128 .addImm(Src4)
6129 .addImm(Src5)
6130 .getReg(0);
6131 }
6132 case Intrinsic::amdgcn_mov_dpp8:
6133 return LaneOp.addImm(MI.getOperand(3).getImm()).getReg(0);
6134 case Intrinsic::amdgcn_update_dpp:
6135 return LaneOp.addUse(Src1)
6136 .addImm(MI.getOperand(4).getImm())
6137 .addImm(MI.getOperand(5).getImm())
6138 .addImm(MI.getOperand(6).getImm())
6139 .addImm(MI.getOperand(7).getImm())
6140 .getReg(0);
6141 default:
6142 llvm_unreachable("unhandled lane op");
6143 }
6144 };
6145
6146 Register DstReg = MI.getOperand(0).getReg();
6147 Register Src0 = MI.getOperand(2).getReg();
// Src1/Src2 stay invalid (null Register) for intrinsics that don't use them.
6148 Register Src1, Src2;
6149 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
6150 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
6151 Src1 = MI.getOperand(3).getReg();
6152 if (IID == Intrinsic::amdgcn_writelane || IsPermLane16) {
6153 Src2 = MI.getOperand(4).getReg();
6154 }
6155 }
6156
6157 LLT Ty = MRI.getType(DstReg);
6158 unsigned Size = Ty.getSizeInBits();
6159
// update.dpp can operate on 64-bit pieces when the DP ALU supports DPP and
// the dpp_ctrl immediate is legal for it; everything else splits at 32.
6160 unsigned SplitSize = 32;
6161 if (IID == Intrinsic::amdgcn_update_dpp && (Size % 64 == 0) &&
6162 ST.hasDPALU_DPP() &&
6163 AMDGPU::isLegalDPALU_DPPControl(ST, MI.getOperand(4).getImm()))
6164 SplitSize = 64;
6165
6166 if (Size == SplitSize) {
6167 // Already legal
6168 return true;
6169 }
6170
// Sub-32-bit values: any-extend operands, do one 32-bit lane op, truncate.
6171 if (Size < 32) {
6172 Src0 = B.buildAnyExt(S32, Src0).getReg(0);
6173
6174 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6175 Src1 = B.buildAnyExt(LLT::scalar(32), Src1).getReg(0);
6176
6177 if (IID == Intrinsic::amdgcn_writelane)
6178 Src2 = B.buildAnyExt(LLT::scalar(32), Src2).getReg(0);
6179
6180 Register LaneOpDst = createLaneOp(Src0, Src1, Src2, S32);
6181 B.buildTrunc(DstReg, LaneOpDst);
6182 MI.eraseFromParent();
6183 return true;
6184 }
6185
6186 if (Size % SplitSize != 0)
6187 return false;
6188
// Pick the piece type: reuse the vector element type when it matches the
// split size, regroup 16/32-bit elements into SplitSize-wide subvectors,
// and fall back to plain scalar pieces + a final bitcast otherwise.
6189 LLT PartialResTy = LLT::scalar(SplitSize);
6190 bool NeedsBitcast = false;
6191 if (Ty.isVector()) {
6192 LLT EltTy = Ty.getElementType();
6193 unsigned EltSize = EltTy.getSizeInBits();
6194 if (EltSize == SplitSize) {
6195 PartialResTy = EltTy;
6196 } else if (EltSize == 16 || EltSize == 32) {
6197 unsigned NElem = SplitSize / EltSize;
6198 PartialResTy = Ty.changeElementCount(ElementCount::getFixed(NElem));
6199 } else {
6200 // Handle all other cases via S32/S64 pieces
6201 NeedsBitcast = true;
6202 }
6203 }
6204
6205 SmallVector<Register, 4> PartialRes;
6206 unsigned NumParts = Size / SplitSize;
6207 MachineInstrBuilder Src0Parts = B.buildUnmerge(PartialResTy, Src0);
6208 MachineInstrBuilder Src1Parts, Src2Parts;
6209
6210 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6211 Src1Parts = B.buildUnmerge(PartialResTy, Src1);
6212
6213 if (IID == Intrinsic::amdgcn_writelane)
6214 Src2Parts = B.buildUnmerge(PartialResTy, Src2);
6215
// Emit one lane op per piece, then merge the pieces back together.
6216 for (unsigned i = 0; i < NumParts; ++i) {
6217 Src0 = Src0Parts.getReg(i);
6218
6219 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
6220 Src1 = Src1Parts.getReg(i);
6221
6222 if (IID == Intrinsic::amdgcn_writelane)
6223 Src2 = Src2Parts.getReg(i);
6224
6225 PartialRes.push_back(createLaneOp(Src0, Src1, Src2, PartialResTy));
6226 }
6227
6228 if (NeedsBitcast)
6229 B.buildBitcast(DstReg, B.buildMergeLikeInstr(
6230 LLT::scalar(Ty.getSizeInBits()), PartialRes));
6231 else
6232 B.buildMergeLikeInstr(DstReg, PartialRes);
6233
6234 MI.eraseFromParent();
6235 return true;
6236}
6237
// Materialize the implicit-argument pointer: kernarg segment pointer plus the
// implicit-parameter offset. NOTE(review): several lines are missing from this
// listing — the signature (presumably AMDGPULegalizerInfo::getImplicitArgPtr),
// the assignment target of getImplicitParameterOffset (presumably a local
// 'Offset'), the closing arguments of that call, and the ArgInfo enum argument
// to loadInputValue (presumably KERNARG_SEGMENT_PTR). Confirm against the
// upstream source before editing.
6240 MachineIRBuilder &B) const {
6242 ST.getTargetLowering()->getImplicitParameterOffset(
6244 LLT DstTy = MRI.getType(DstReg);
6245 LLT IdxTy = LLT::scalar(DstTy.getSizeInBits());
6246
6247 Register KernargPtrReg = MRI.createGenericVirtualRegister(DstTy);
6248 if (!loadInputValue(KernargPtrReg, B,
6250 return false;
6251
// Result = kernarg base + implicit-parameter byte offset.
6252 B.buildObjectPtrOffset(DstReg, KernargPtrReg,
6253 B.buildConstant(IdxTy, Offset).getReg(0));
6254 return true;
6255}
6256
6257/// To create a buffer resource from a 64-bit pointer, mask off the upper 32
6258/// bits of the pointer and replace them with the stride argument, then
6259/// merge_values everything together. In the common case of a raw buffer (the
6260/// stride component is 0), we can just AND off the upper half.
// Build a buffer resource descriptor from (pointer, stride, num_records,
// flags) per the comment above. NOTE(review): the signature lines are missing
// from this listing (presumably
// AMDGPULegalizerInfo::legalizePointerAsRsrcIntrin).
6263 Register Result = MI.getOperand(0).getReg();
6264 Register Pointer = MI.getOperand(2).getReg();
6265 Register Stride = MI.getOperand(3).getReg();
6266 Register NumRecords = MI.getOperand(4).getReg();
6267 Register Flags = MI.getOperand(5).getReg();
6268
6269 LLT S32 = LLT::scalar(32);
6270 LLT S64 = LLT::scalar(64);
6271
// Insert the expansion after the intrinsic so its operands are still defined.
6272 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6273
6274 auto ExtStride = B.buildAnyExt(S32, Stride);
6275
// Newer layout: 45-bit num_records split across both 64-bit halves.
6276 if (ST.has45BitNumRecordsBufferResource()) {
6277 Register Zero = B.buildConstant(S32, 0).getReg(0);
6278 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
6279 // num_records.
6280 LLT PtrIntTy = LLT::scalar(MRI.getType(Pointer).getSizeInBits());
6281 auto PointerInt = B.buildPtrToInt(PtrIntTy, Pointer);
6282 auto ExtPointer = B.buildAnyExtOrTrunc(S64, PointerInt);
6283 auto NumRecordsLHS = B.buildShl(S64, NumRecords, B.buildConstant(S32, 57));
6284 Register LowHalf = B.buildOr(S64, ExtPointer, NumRecordsLHS).getReg(0);
6285
6286 // Build the higher 64-bit value, which has the higher 38-bit num_records,
6287 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
6288 auto NumRecordsRHS = B.buildLShr(S64, NumRecords, B.buildConstant(S32, 7));
6289 auto ShiftedStride = B.buildShl(S32, ExtStride, B.buildConstant(S32, 12));
6290 auto ExtShiftedStride =
6291 B.buildMergeValues(S64, {Zero, ShiftedStride.getReg(0)});
6292 auto ShiftedFlags = B.buildShl(S32, Flags, B.buildConstant(S32, 28));
6293 auto ExtShiftedFlags =
6294 B.buildMergeValues(S64, {Zero, ShiftedFlags.getReg(0)});
6295 auto CombinedFields = B.buildOr(S64, NumRecordsRHS, ExtShiftedStride);
6296 Register HighHalf =
6297 B.buildOr(S64, CombinedFields, ExtShiftedFlags).getReg(0);
6298 B.buildMergeValues(Result, {LowHalf, HighHalf});
6299 } else {
// Classic v4i32 layout: replace the pointer's upper 16 bits with the stride
// and append num_records and flags as the last two dwords.
6300 NumRecords = B.buildTrunc(S32, NumRecords).getReg(0);
6301 auto Unmerge = B.buildUnmerge(S32, Pointer);
6302 auto LowHalf = Unmerge.getReg(0);
6303 auto HighHalf = Unmerge.getReg(1);
6304
6305 auto AndMask = B.buildConstant(S32, 0x0000ffff);
6306 auto Masked = B.buildAnd(S32, HighHalf, AndMask);
6307 auto ShiftConst = B.buildConstant(S32, 16);
6308 auto ShiftedStride = B.buildShl(S32, ExtStride, ShiftConst);
6309 auto NewHighHalf = B.buildOr(S32, Masked, ShiftedStride);
6310 Register NewHighHalfReg = NewHighHalf.getReg(0);
6311 B.buildMergeValues(Result, {LowHalf, NewHighHalfReg, NumRecords, Flags});
6312 }
6313
6314 MI.eraseFromParent();
6315 return true;
6316}
6317
// Legalize the implicit-arg-ptr intrinsic: non-entry functions receive it as a
// preloaded argument; entry functions compute it from the kernarg pointer.
// NOTE(review): the signature lines and the ArgInfo enum argument (presumably
// IMPLICIT_ARG_PTR) are missing from this listing.
6320 MachineIRBuilder &B) const {
6321 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6322 if (!MFI->isEntryFunction()) {
6323 return legalizePreloadedArgIntrin(MI, MRI, B,
6325 }
6326
6327 Register DstReg = MI.getOperand(0).getReg();
6328 if (!getImplicitArgPtr(DstReg, MRI, B))
6329 return false;
6330
6331 MI.eraseFromParent();
6332 return true;
6333}
6334
// Emit the LDS kernel id as a constant when it is known from function
// metadata. NOTE(review): the signature lines and the metadata-query line
// (presumably AMDGPUMachineFunction::getLDSKernelIdMetadata(F)) are missing
// from this listing. As written the function always returns false even after
// emitting the constant — matches the visible code; verify intent upstream.
6337 MachineIRBuilder &B) const {
6338 Function &F = B.getMF().getFunction();
6339 std::optional<uint32_t> KnownSize =
6341 if (KnownSize.has_value())
6342 B.buildConstant(DstReg, *KnownSize);
6343 return false;
6344}
6345
// Legalize the LDS-kernel-id intrinsic: non-entry functions take it as a
// preloaded argument; entry functions try to materialize it as a constant.
// NOTE(review): the signature lines and the ArgInfo enum argument (presumably
// LDS_KERNEL_ID) are missing from this listing.
6348 MachineIRBuilder &B) const {
6349
6350 const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
6351 if (!MFI->isEntryFunction()) {
6352 return legalizePreloadedArgIntrin(MI, MRI, B,
6354 }
6355
6356 Register DstReg = MI.getOperand(0).getReg();
6357 if (!getLDSKernelId(DstReg, MRI, B))
6358 return false;
6359
6360 MI.eraseFromParent();
6361 return true;
6362}
6363
// Legalize an is-address-space query by comparing the high 32 bits of the flat
// pointer against the segment aperture. NOTE(review): the signature lines are
// missing from this listing (presumably
// AMDGPULegalizerInfo::legalizeIsAddrSpace).
6367 unsigned AddrSpace) const {
6368 const LLT S32 = LLT::scalar(32);
6369 auto Unmerge = B.buildUnmerge(S32, MI.getOperand(2).getReg());
6370 Register Hi32 = Unmerge.getReg(1);
6371
// Globally addressable scratch has no fixed aperture constant: compare the
// pointer's top bits against the flat-scratch base register instead.
6372 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS &&
6373 ST.hasGloballyAddressableScratch()) {
6374 Register FlatScratchBaseHi =
6375 B.buildInstr(AMDGPU::S_MOV_B32, {S32},
6376 {Register(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI)})
6377 .getReg(0);
6378 MRI.setRegClass(FlatScratchBaseHi, &AMDGPU::SReg_32RegClass);
6379 // Test bits 63..58 against the aperture address.
6380 Register XOR = B.buildXor(S32, Hi32, FlatScratchBaseHi).getReg(0);
6381 B.buildICmp(ICmpInst::ICMP_ULT, MI.getOperand(0), XOR,
6382 B.buildConstant(S32, 1u << 26));
6383 } else {
6384 Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
6385 B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
6386 }
6387 MI.eraseFromParent();
6388 return true;
6389}
6390
6391// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
6392// offset (the offset that is included in bounds checking and swizzling, to be
6393// split between the instruction's voffset and immoffset fields) and soffset
6394// (the offset that is excluded from bounds checking and swizzling, to go in
6395// the instruction's soffset field). This function takes the first kind of
6396// offset and figures out how to split it between voffset and immoffset.
6397std::pair<Register, unsigned>
// Split a buffer offset into (voffset register, immediate offset) — see the
// comment block above. NOTE(review): the signature line is missing from this
// listing (presumably AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder
// &B, ...)).
6399 Register OrigOffset) const {
6400 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(ST);
6401 Register BaseReg;
6402 unsigned ImmOffset;
6403 const LLT S32 = LLT::scalar(32);
6404 MachineRegisterInfo &MRI = *B.getMRI();
6405
6406 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
6407 // being added, so we can only safely match a 32-bit addition with no unsigned
6408 // overflow.
6409 bool CheckNUW = ST.hasGFX1250Insts();
6410 std::tie(BaseReg, ImmOffset) = AMDGPU::getBaseWithConstantOffset(
6411 MRI, OrigOffset, /*KnownBits=*/nullptr, CheckNUW);
6412
6413 // If BaseReg is a pointer, convert it to int.
6414 if (MRI.getType(BaseReg).isPointer())
6415 BaseReg = B.buildPtrToInt(MRI.getType(OrigOffset), BaseReg).getReg(0);
6416
6417 // If the immediate value is too big for the immoffset field, put only bits
6418 // that would normally fit in the immoffset field. The remaining value that
6419 // is copied/added for the voffset field is a large power of 2, and it
6420 // stands more chance of being CSEd with the copy/add for another similar
6421 // load/store.
6422 // However, do not do that rounding down if that is a negative
6423 // number, as it appears to be illegal to have a negative offset in the
6424 // vgpr, even if adding the immediate offset makes it positive.
6425 unsigned Overflow = ImmOffset & ~MaxImm;
6426 ImmOffset -= Overflow;
6427 if ((int32_t)Overflow < 0) {
// Negative overflow: move the entire offset into the register operand.
6428 Overflow += ImmOffset;
6429 ImmOffset = 0;
6430 }
6431
// Fold the overflow back into the base register (or materialize it when
// there was no base).
6432 if (Overflow != 0) {
6433 if (!BaseReg) {
6434 BaseReg = B.buildConstant(S32, Overflow).getReg(0);
6435 } else {
6436 auto OverflowVal = B.buildConstant(S32, Overflow);
6437 BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
6438 }
6439 }
6440
// The caller always needs a valid voffset register, even if it is zero.
6441 if (!BaseReg)
6442 BaseReg = B.buildConstant(S32, 0).getReg(0);
6443
6444 return std::pair(BaseReg, ImmOffset);
6445}
6446
6447/// Handle register layout difference for f16 images for some subtargets.
// Repack a <N x s16> value into the register layout the subtarget expects for
// d16 image/buffer operations (see the comment above). NOTE(review): the
// signature lines are missing from this listing (presumably
// AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &, MachineRegisterInfo
// &, ...)).
6450 Register Reg,
6451 bool ImageStore) const {
6452 const LLT S16 = LLT::scalar(16);
6453 const LLT S32 = LLT::scalar(32);
6454 LLT StoreVT = MRI.getType(Reg);
6455 assert(StoreVT.isVector() && StoreVT.getElementType() == S16);
6456
// Unpacked d16: each 16-bit element occupies its own 32-bit register.
6457 if (ST.hasUnpackedD16VMem()) {
6458 auto Unmerge = B.buildUnmerge(S16, Reg);
6459
6460 SmallVector<Register, 4> WideRegs;
6461 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6462 WideRegs.push_back(B.buildAnyExt(S32, Unmerge.getReg(I)).getReg(0));
6463
6464 int NumElts = StoreVT.getNumElements();
6465
6466 return B.buildBuildVector(LLT::fixed_vector(NumElts, S32), WideRegs)
6467 .getReg(0);
6468 }
6469
// Workaround path: pad the payload so the buggy hardware sees a larger
// register tuple (2 elts -> 2 dwords, 3 -> 3 dwords via v6s16, 4 -> 4).
6470 if (ImageStore && ST.hasImageStoreD16Bug()) {
6471 if (StoreVT.getNumElements() == 2) {
6472 SmallVector<Register, 4> PackedRegs;
6473 Reg = B.buildBitcast(S32, Reg).getReg(0);
6474 PackedRegs.push_back(Reg);
6475 PackedRegs.resize(2, B.buildUndef(S32).getReg(0));
6476 return B.buildBuildVector(LLT::fixed_vector(2, S32), PackedRegs)
6477 .getReg(0);
6478 }
6479
6480 if (StoreVT.getNumElements() == 3) {
6481 SmallVector<Register, 4> PackedRegs;
6482 auto Unmerge = B.buildUnmerge(S16, Reg);
6483 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6484 PackedRegs.push_back(Unmerge.getReg(I));
6485 PackedRegs.resize(6, B.buildUndef(S16).getReg(0));
6486 Reg = B.buildBuildVector(LLT::fixed_vector(6, S16), PackedRegs).getReg(0);
6487 return B.buildBitcast(LLT::fixed_vector(3, S32), Reg).getReg(0);
6488 }
6489
6490 if (StoreVT.getNumElements() == 4) {
6491 SmallVector<Register, 4> PackedRegs;
6492 Reg = B.buildBitcast(LLT::fixed_vector(2, S32), Reg).getReg(0);
6493 auto Unmerge = B.buildUnmerge(S32, Reg);
6494 for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
6495 PackedRegs.push_back(Unmerge.getReg(I));
6496 PackedRegs.resize(4, B.buildUndef(S32).getReg(0));
6497 return B.buildBuildVector(LLT::fixed_vector(4, S32), PackedRegs)
6498 .getReg(0);
6499 }
6500
6501 llvm_unreachable("invalid data type");
6502 }
6503
// Packed d16: only v3s16 needs adjustment, padded out to a legal v4s16.
6504 if (StoreVT == LLT::fixed_vector(3, S16)) {
6505 Reg = B.buildPadVectorWithUndefElements(LLT::fixed_vector(4, S16), Reg)
6506 .getReg(0);
6507 }
6508 return Reg;
6509}
6510
// Rewrite a buffer-store source value into a register type the store pseudos
// accept. NOTE(review): two lines are missing from this listing — the
// signature (presumably AMDGPULegalizerInfo::fixStoreSourceType) and the guard
// condition before the castBufferRsrcToV4I32 return (presumably
// 'if (hasBufferRsrcWorkaround(Ty))'). Confirm against the upstream source.
6512 Register VData, LLT MemTy,
6513 bool IsFormat) const {
6514 MachineRegisterInfo *MRI = B.getMRI();
6515 LLT Ty = MRI->getType(VData);
6516
6517 const LLT S16 = LLT::scalar(16);
6518
6519 // Fixup buffer resources themselves needing to be v4i128.
6521 return castBufferRsrcToV4I32(VData, B);
6522
6523 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6524 Ty = getBitcastRegisterType(Ty);
6525 VData = B.buildBitcast(Ty, VData).getReg(0);
6526 }
6527 // Fixup illegal register types for i8 stores.
6528 if (Ty == LLT::scalar(8) || Ty == S16) {
6529 Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
6530 return AnyExt;
6531 }
6532
// Small f16 vectors may need repacking for the subtarget's d16 layout.
6533 if (Ty.isVector()) {
6534 if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
6535 if (IsFormat)
6536 return handleD16VData(B, *MRI, VData);
6537 }
6538 }
6539
6540 return VData;
6541}
6542
// Lower a raw/struct (t)buffer store intrinsic into the corresponding
// G_AMDGPU_*BUFFER_STORE* pseudo. NOTE(review): the signature lines and one
// statement after fixStoreSourceType (presumably
// 'castBufferRsrcArgToV4I32(MI, B, 2);') are missing from this listing
// (presumably AMDGPULegalizerInfo::legalizeBufferStore).
6544 LegalizerHelper &Helper,
6545 bool IsTyped,
6546 bool IsFormat) const {
6547 MachineIRBuilder &B = Helper.MIRBuilder;
6548 MachineRegisterInfo &MRI = *B.getMRI();
6549
6550 Register VData = MI.getOperand(1).getReg();
6551 LLT Ty = MRI.getType(VData);
6552 LLT EltTy = Ty.getScalarType();
6553 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6554 const LLT S32 = LLT::scalar(32);
6555
6556 MachineMemOperand *MMO = *MI.memoperands_begin();
6557 const int MemSize = MMO->getSize().getValue();
6558 LLT MemTy = MMO->getMemoryType();
6559
6560 VData = fixStoreSourceType(B, VData, MemTy, IsFormat);
6561
6563 Register RSrc = MI.getOperand(2).getReg();
6564
6565 unsigned ImmOffset;
6566
6567 // The typed intrinsics add an immediate after the registers.
6568 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6569
6570 // The struct intrinsic variants add one additional operand over raw.
6571 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6572 Register VIndex;
6573 int OpOffset = 0;
6574 if (HasVIndex) {
6575 VIndex = MI.getOperand(3).getReg();
6576 OpOffset = 1;
6577 } else {
// Raw variants have no vindex operand; use an explicit zero.
6578 VIndex = B.buildConstant(S32, 0).getReg(0);
6579 }
6580
6581 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6582 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6583
6584 unsigned Format = 0;
6585 if (IsTyped) {
6586 Format = MI.getOperand(5 + OpOffset).getImm();
6587 ++OpOffset;
6588 }
6589
6590 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6591
// Split the bounds-checked offset between the voffset register and the
// instruction's immediate offset field.
6592 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6593
// Pick the store pseudo: typed > format > plain (sized by memory width).
6594 unsigned Opc;
6595 if (IsTyped) {
6596 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
6597 AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
6598 } else if (IsFormat) {
6599 Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
6600 AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
6601 } else {
6602 switch (MemSize) {
6603 case 1:
6604 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
6605 break;
6606 case 2:
6607 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
6608 break;
6609 default:
6610 Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
6611 break;
6612 }
6613 }
6614
6615 auto MIB = B.buildInstr(Opc)
6616 .addUse(VData) // vdata
6617 .addUse(RSrc) // rsrc
6618 .addUse(VIndex) // vindex
6619 .addUse(VOffset) // voffset
6620 .addUse(SOffset) // soffset
6621 .addImm(ImmOffset); // offset(imm)
6622
6623 if (IsTyped)
6624 MIB.addImm(Format);
6625
6626 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6627 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6628 .addMemOperand(MMO);
6629
6630 MI.eraseFromParent();
6631 return true;
6632}
6633
6634static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc,
6635 Register VIndex, Register VOffset, Register SOffset,
6636 unsigned ImmOffset, unsigned Format,
6637 unsigned AuxiliaryData, MachineMemOperand *MMO,
6638 bool IsTyped, bool HasVIndex, MachineIRBuilder &B) {
6639 auto MIB = B.buildInstr(Opc)
6640 .addDef(LoadDstReg) // vdata
6641 .addUse(RSrc) // rsrc
6642 .addUse(VIndex) // vindex
6643 .addUse(VOffset) // voffset
6644 .addUse(SOffset) // soffset
6645 .addImm(ImmOffset); // offset(imm)
6646
6647 if (IsTyped)
6648 MIB.addImm(Format);
6649
6650 MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6651 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6652 .addMemOperand(MMO);
6653}
6654
// Lower a raw/struct (t)buffer load intrinsic (optionally with a TFE status
// result) into the matching G_AMDGPU_*BUFFER_LOAD* pseudo, widening or
// repacking the result as required. NOTE(review): the signature lines are
// missing from this listing (presumably
// AMDGPULegalizerInfo::legalizeBufferLoad), as is the declaration of 'Repack'
// near the end (presumably 'SmallVector<Register, 4> Repack;').
6656 LegalizerHelper &Helper,
6657 bool IsFormat,
6658 bool IsTyped) const {
6659 MachineIRBuilder &B = Helper.MIRBuilder;
6660 MachineRegisterInfo &MRI = *B.getMRI();
6661 GISelChangeObserver &Observer = Helper.Observer;
6662
6663 // FIXME: Verifier should enforce 1 MMO for these intrinsics.
6664 MachineMemOperand *MMO = *MI.memoperands_begin();
6665 const LLT MemTy = MMO->getMemoryType();
6666 const LLT S32 = LLT::scalar(32);
6667
6668 Register Dst = MI.getOperand(0).getReg();
6669
// A second def means this is the TFE form with an extra status result.
6670 Register StatusDst;
6671 int OpOffset = 0;
6672 assert(MI.getNumExplicitDefs() == 1 || MI.getNumExplicitDefs() == 2);
6673 bool IsTFE = MI.getNumExplicitDefs() == 2;
6674 if (IsTFE) {
6675 StatusDst = MI.getOperand(1).getReg();
6676 ++OpOffset;
6677 }
6678
6679 castBufferRsrcArgToV4I32(MI, B, 2 + OpOffset);
6680 Register RSrc = MI.getOperand(2 + OpOffset).getReg();
6681
6682 // The typed intrinsics add an immediate after the registers.
6683 const unsigned NumVIndexOps = IsTyped ? 8 : 7;
6684
6685 // The struct intrinsic variants add one additional operand over raw.
6686 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps + OpOffset;
6687 Register VIndex;
6688 if (HasVIndex) {
6689 VIndex = MI.getOperand(3 + OpOffset).getReg();
6690 ++OpOffset;
6691 } else {
6692 VIndex = B.buildConstant(S32, 0).getReg(0);
6693 }
6694
6695 Register VOffset = MI.getOperand(3 + OpOffset).getReg();
6696 Register SOffset = MI.getOperand(4 + OpOffset).getReg();
6697
6698 unsigned Format = 0;
6699 if (IsTyped) {
6700 Format = MI.getOperand(5 + OpOffset).getImm();
6701 ++OpOffset;
6702 }
6703
6704 unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
6705 unsigned ImmOffset;
6706
6707 LLT Ty = MRI.getType(Dst);
6708 // Make addrspace 8 pointers loads into 4xs32 loads here, so the rest of the
6709 // logic doesn't have to handle that case.
6710 if (hasBufferRsrcWorkaround(Ty)) {
6711 Observer.changingInstr(MI);
6712 Ty = castBufferRsrcFromV4I32(MI, B, MRI, 0);
6713 Observer.changedInstr(MI);
6714 Dst = MI.getOperand(0).getReg();
6715 B.setInsertPt(B.getMBB(), MI);
6716 }
6717 if (shouldBitcastLoadStoreType(ST, Ty, MemTy)) {
6718 Ty = getBitcastRegisterType(Ty);
6719 Observer.changingInstr(MI);
6720 Helper.bitcastDst(MI, Ty, 0);
6721 Observer.changedInstr(MI);
6722 Dst = MI.getOperand(0).getReg();
6723 B.setInsertPt(B.getMBB(), MI);
6724 }
6725
6726 LLT EltTy = Ty.getScalarType();
6727 const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
6728 const bool Unpacked = ST.hasUnpackedD16VMem();
6729
6730 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6731
// Select the load pseudo: typed > format > plain (sized by memory width),
// each with a _TFE variant where supported.
6732 unsigned Opc;
6733
6734 // TODO: Support TFE for typed and narrow loads.
6735 if (IsTyped) {
6736 if (IsTFE)
6737 return false;
6738 Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
6739 AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
6740 } else if (IsFormat) {
6741 if (IsD16) {
6742 if (IsTFE)
6743 return false;
6744 Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16;
6745 } else {
6746 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_TFE
6747 : AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
6748 }
6749 } else {
6750 switch (MemTy.getSizeInBits()) {
6751 case 8:
6752 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE_TFE
6753 : AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
6754 break;
6755 case 16:
6756 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT_TFE
6757 : AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
6758 break;
6759 default:
6760 Opc = IsTFE ? AMDGPU::G_AMDGPU_BUFFER_LOAD_TFE
6761 : AMDGPU::G_AMDGPU_BUFFER_LOAD;
6762 break;
6763 }
6764 }
6765
// TFE: load value dwords + 1 status dword, then unmerge into Dst/StatusDst.
6766 if (IsTFE) {
6767 unsigned NumValueDWords = divideCeil(Ty.getSizeInBits(), 32);
6768 unsigned NumLoadDWords = NumValueDWords + 1;
6769 LLT LoadTy = LLT::fixed_vector(NumLoadDWords, S32);
6770 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(LoadTy);
6771 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6772 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6773 if (MemTy.getSizeInBits() < 32) {
6774 Register ExtDst = B.getMRI()->createGenericVirtualRegister(S32);
6775 B.buildUnmerge({ExtDst, StatusDst}, LoadDstReg);
6776 B.buildTrunc(Dst, ExtDst);
6777 } else if (NumValueDWords == 1) {
6778 B.buildUnmerge({Dst, StatusDst}, LoadDstReg);
6779 } else {
6780 SmallVector<Register, 5> LoadElts;
6781 for (unsigned I = 0; I != NumValueDWords; ++I)
6782 LoadElts.push_back(B.getMRI()->createGenericVirtualRegister(S32));
6783 LoadElts.push_back(StatusDst);
6784 B.buildUnmerge(LoadElts, LoadDstReg);
6785 LoadElts.truncate(NumValueDWords);
6786 B.buildMergeLikeInstr(Dst, LoadElts);
6787 }
// Narrow scalar results: load into a full 32-bit register and truncate.
6788 } else if ((!IsD16 && MemTy.getSizeInBits() < 32) ||
6789 (IsD16 && !Ty.isVector())) {
6790 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
6791 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6792 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6793 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6794 B.buildTrunc(Dst, LoadDstReg);
// Unpacked d16 vectors: load 32-bit lanes, truncate each, and re-merge.
6795 } else if (Unpacked && IsD16 && Ty.isVector()) {
6796 LLT UnpackedTy = Ty.changeElementSize(32);
6797 Register LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
6798 buildBufferLoad(Opc, LoadDstReg, RSrc, VIndex, VOffset, SOffset, ImmOffset,
6799 Format, AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6800 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
6801 // FIXME: G_TRUNC should work, but legalization currently fails
6802 auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
6804 for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
6805 Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
6806 B.buildMergeLikeInstr(Dst, Repack);
6807 } else {
6808 buildBufferLoad(Opc, Dst, RSrc, VIndex, VOffset, SOffset, ImmOffset, Format,
6809 AuxiliaryData, MMO, IsTyped, HasVIndex, B);
6810 }
6811
6812 MI.eraseFromParent();
6813 return true;
6814}
6815
/// Map a raw/struct buffer atomic intrinsic ID (including the *_ptr_* variants
/// that take a p8 buffer resource) to the corresponding generic
/// G_AMDGPU_BUFFER_ATOMIC_* pseudo opcode. All four addressing flavors of each
/// operation (raw, raw_ptr, struct, struct_ptr) share one pseudo; the
/// raw-vs-struct distinction is carried by the idxen operand built elsewhere.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
  switch (IntrID) {
  case Intrinsic::amdgcn_raw_buffer_atomic_swap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_buffer_atomic_swap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_add:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_buffer_atomic_add:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
  case Intrinsic::amdgcn_raw_buffer_atomic_smin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_buffer_atomic_smin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_umin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_buffer_atomic_umin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_smax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_buffer_atomic_smax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_umax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_buffer_atomic_umax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_and:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_buffer_atomic_and:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
  case Intrinsic::amdgcn_raw_buffer_atomic_or:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_buffer_atomic_or:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
  case Intrinsic::amdgcn_raw_buffer_atomic_xor:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_buffer_atomic_xor:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
  case Intrinsic::amdgcn_raw_buffer_atomic_inc:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_buffer_atomic_inc:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
  case Intrinsic::amdgcn_raw_buffer_atomic_dec:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_buffer_atomic_dec:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
  case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
  case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN;
  case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
  case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
  case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
  default:
    llvm_unreachable("unhandled atomic opcode");
  }
}
6912
6915 Intrinsic::ID IID) const {
6916 const bool IsCmpSwap =
6917 IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
6918 IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap ||
6919 IID == Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap ||
6920 IID == Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap;
6921
6922 Register Dst = MI.getOperand(0).getReg();
6923 // Since we don't have 128-bit atomics, we don't need to handle the case of
6924 // p8 argmunents to the atomic itself
6925 Register VData = MI.getOperand(2).getReg();
6926
6927 Register CmpVal;
6928 int OpOffset = 0;
6929
6930 if (IsCmpSwap) {
6931 CmpVal = MI.getOperand(3).getReg();
6932 ++OpOffset;
6933 }
6934
6935 castBufferRsrcArgToV4I32(MI, B, 3 + OpOffset);
6936 Register RSrc = MI.getOperand(3 + OpOffset).getReg();
6937 const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
6938
6939 // The struct intrinsic variants add one additional operand over raw.
6940 const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
6941 Register VIndex;
6942 if (HasVIndex) {
6943 VIndex = MI.getOperand(4 + OpOffset).getReg();
6944 ++OpOffset;
6945 } else {
6946 VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
6947 }
6948
6949 Register VOffset = MI.getOperand(4 + OpOffset).getReg();
6950 Register SOffset = MI.getOperand(5 + OpOffset).getReg();
6951 unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
6952
6953 MachineMemOperand *MMO = *MI.memoperands_begin();
6954
6955 unsigned ImmOffset;
6956 std::tie(VOffset, ImmOffset) = splitBufferOffsets(B, VOffset);
6957
6958 auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
6959 .addDef(Dst)
6960 .addUse(VData); // vdata
6961
6962 if (IsCmpSwap)
6963 MIB.addReg(CmpVal);
6964
6965 MIB.addUse(RSrc) // rsrc
6966 .addUse(VIndex) // vindex
6967 .addUse(VOffset) // voffset
6968 .addUse(SOffset) // soffset
6969 .addImm(ImmOffset) // offset(imm)
6970 .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
6971 .addImm(HasVIndex ? -1 : 0) // idxen(imm)
6972 .addMemOperand(MMO);
6973
6974 MI.eraseFromParent();
6975 return true;
6976}
6977
6978/// Turn a set of s16 typed registers in \p AddrRegs into a dword sized
6979/// vector with s16 typed elements.
6981 SmallVectorImpl<Register> &PackedAddrs,
6982 unsigned ArgOffset,
6984 bool IsA16, bool IsG16) {
6985 const LLT S16 = LLT::scalar(16);
6986 const LLT V2S16 = LLT::fixed_vector(2, 16);
6987 auto EndIdx = Intr->VAddrEnd;
6988
6989 for (unsigned I = Intr->VAddrStart; I < EndIdx; I++) {
6990 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
6991 if (!SrcOp.isReg())
6992 continue; // _L to _LZ may have eliminated this.
6993
6994 Register AddrReg = SrcOp.getReg();
6995
6996 if ((I < Intr->GradientStart) ||
6997 (I >= Intr->GradientStart && I < Intr->CoordStart && !IsG16) ||
6998 (I >= Intr->CoordStart && !IsA16)) {
6999 if ((I < Intr->GradientStart) && IsA16 &&
7000 (B.getMRI()->getType(AddrReg) == S16)) {
7001 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
7002 // Special handling of bias when A16 is on. Bias is of type half but
7003 // occupies full 32-bit.
7004 PackedAddrs.push_back(
7005 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7006 .getReg(0));
7007 } else {
7008 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
7009 "Bias needs to be converted to 16 bit in A16 mode");
7010 // Handle any gradient or coordinate operands that should not be packed
7011 AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
7012 PackedAddrs.push_back(AddrReg);
7013 }
7014 } else {
7015 // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
7016 // derivatives dx/dh and dx/dv are packed with undef.
7017 if (((I + 1) >= EndIdx) ||
7018 ((Intr->NumGradients / 2) % 2 == 1 &&
7019 (I == static_cast<unsigned>(Intr->GradientStart +
7020 (Intr->NumGradients / 2) - 1) ||
7021 I == static_cast<unsigned>(Intr->GradientStart +
7022 Intr->NumGradients - 1))) ||
7023 // Check for _L to _LZ optimization
7024 !MI.getOperand(ArgOffset + I + 1).isReg()) {
7025 PackedAddrs.push_back(
7026 B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
7027 .getReg(0));
7028 } else {
7029 PackedAddrs.push_back(
7030 B.buildBuildVector(
7031 V2S16, {AddrReg, MI.getOperand(ArgOffset + I + 1).getReg()})
7032 .getReg(0));
7033 ++I;
7034 }
7035 }
7036 }
7037}
7038
7039/// Convert from separate vaddr components to a single vector address register,
7040/// and replace the remaining operands with $noreg.
7042 int DimIdx, int NumVAddrs) {
7043 const LLT S32 = LLT::scalar(32);
7044 (void)S32;
7045 SmallVector<Register, 8> AddrRegs;
7046 for (int I = 0; I != NumVAddrs; ++I) {
7047 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7048 if (SrcOp.isReg()) {
7049 AddrRegs.push_back(SrcOp.getReg());
7050 assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
7051 }
7052 }
7053
7054 int NumAddrRegs = AddrRegs.size();
7055 if (NumAddrRegs != 1) {
7056 auto VAddr =
7057 B.buildBuildVector(LLT::fixed_vector(NumAddrRegs, 32), AddrRegs);
7058 MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
7059 }
7060
7061 for (int I = 1; I != NumVAddrs; ++I) {
7062 MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
7063 if (SrcOp.isReg())
7064 MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
7065 }
7066}
7067
7068/// Rewrite image intrinsics to use register layouts expected by the subtarget.
7069///
7070/// Depending on the subtarget, load/store with 16-bit element data need to be
7071/// rewritten to use the low half of 32-bit registers, or directly use a packed
7072/// layout. 16-bit addresses should also sometimes be packed into 32-bit
7073/// registers.
7074///
7075/// We don't want to directly select image instructions just yet, but also want
7076/// to expose all register repacking to the legalizer/combiners. We also don't
7077/// want a selected instruction entering RegBankSelect. In order to avoid
7078/// defining a multitude of intermediate image instructions, directly hack on
7079/// the intrinsic's arguments. In cases like a16 addresses, this requires
7080/// padding now unnecessary arguments with $noreg.
7083 const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
7084
7085 const MachineFunction &MF = *MI.getMF();
7086 const unsigned NumDefs = MI.getNumExplicitDefs();
7087 const unsigned ArgOffset = NumDefs + 1;
7088 bool IsTFE = NumDefs == 2;
7089 // We are only processing the operands of d16 image operations on subtargets
7090 // that use the unpacked register layout, or need to repack the TFE result.
7091
7092 // TODO: Do we need to guard against already legalized intrinsics?
7093 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
7095
7096 MachineRegisterInfo *MRI = B.getMRI();
7097 const LLT S32 = LLT::scalar(32);
7098 const LLT S16 = LLT::scalar(16);
7099 const LLT V2S16 = LLT::fixed_vector(2, 16);
7100
7101 unsigned DMask = 0;
7102 Register VData;
7103 LLT Ty;
7104
7105 if (!BaseOpcode->NoReturn || BaseOpcode->Store) {
7106 VData = MI.getOperand(NumDefs == 0 ? 1 : 0).getReg();
7107 Ty = MRI->getType(VData);
7108 }
7109
7110 const bool IsAtomicPacked16Bit =
7111 (BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
7112 BaseOpcode->BaseOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16);
7113
7114 // Check for 16 bit addresses and pack if true.
7115 LLT GradTy =
7116 MRI->getType(MI.getOperand(ArgOffset + Intr->GradientStart).getReg());
7117 LLT AddrTy =
7118 MRI->getType(MI.getOperand(ArgOffset + Intr->CoordStart).getReg());
7119 const bool IsG16 =
7120 ST.hasG16() ? (BaseOpcode->Gradients && GradTy == S16) : GradTy == S16;
7121 const bool IsA16 = AddrTy == S16;
7122 const bool IsD16 = !IsAtomicPacked16Bit && Ty.getScalarType() == S16;
7123
7124 int DMaskLanes = 0;
7125 if (!BaseOpcode->Atomic) {
7126 DMask = MI.getOperand(ArgOffset + Intr->DMaskIndex).getImm();
7127 if (BaseOpcode->Gather4) {
7128 DMaskLanes = 4;
7129 } else if (DMask != 0) {
7130 DMaskLanes = llvm::popcount(DMask);
7131 } else if (!IsTFE && !BaseOpcode->Store) {
7132 // If dmask is 0, this is a no-op load. This can be eliminated.
7133 B.buildUndef(MI.getOperand(0));
7134 MI.eraseFromParent();
7135 return true;
7136 }
7137 }
7138
7139 Observer.changingInstr(MI);
7140 scope_exit ChangedInstr([&] { Observer.changedInstr(MI); });
7141
7142 const unsigned StoreOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE_D16
7143 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE;
7144 const unsigned LoadOpcode = IsD16 ? AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_D16
7145 : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
7146 unsigned NewOpcode = LoadOpcode;
7147 if (BaseOpcode->Store)
7148 NewOpcode = StoreOpcode;
7149 else if (BaseOpcode->NoReturn)
7150 NewOpcode = AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD_NORET;
7151
7152 // Track that we legalized this
7153 MI.setDesc(B.getTII().get(NewOpcode));
7154
7155 // Expecting to get an error flag since TFC is on - and dmask is 0 Force
7156 // dmask to be at least 1 otherwise the instruction will fail
7157 if (IsTFE && DMask == 0) {
7158 DMask = 0x1;
7159 DMaskLanes = 1;
7160 MI.getOperand(ArgOffset + Intr->DMaskIndex).setImm(DMask);
7161 }
7162
7163 if (BaseOpcode->Atomic) {
7164 Register VData0 = MI.getOperand(2).getReg();
7165 LLT Ty = MRI->getType(VData0);
7166
7167 // TODO: Allow atomic swap and bit ops for v2s16/v4s16
7168 if (Ty.isVector() && !IsAtomicPacked16Bit)
7169 return false;
7170
7171 if (BaseOpcode->AtomicX2) {
7172 Register VData1 = MI.getOperand(3).getReg();
7173 // The two values are packed in one register.
7174 LLT PackedTy = LLT::fixed_vector(2, Ty);
7175 auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
7176 MI.getOperand(2).setReg(Concat.getReg(0));
7177 MI.getOperand(3).setReg(AMDGPU::NoRegister);
7178 }
7179 }
7180
7181 unsigned CorrectedNumVAddrs = Intr->NumVAddrs;
7182
7183 // Rewrite the addressing register layout before doing anything else.
7184 if (BaseOpcode->Gradients && !ST.hasG16() && (IsA16 != IsG16)) {
7185 // 16 bit gradients are supported, but are tied to the A16 control
7186 // so both gradients and addresses must be 16 bit
7187 return false;
7188 }
7189
7190 if (IsA16 && !ST.hasA16()) {
7191 // A16 not supported
7192 return false;
7193 }
7194
7195 const unsigned NSAMaxSize = ST.getNSAMaxSize(BaseOpcode->Sampler);
7196 const unsigned HasPartialNSA = ST.hasPartialNSAEncoding();
7197
7198 if (IsA16 || IsG16) {
7199 // Even if NumVAddrs == 1 we should pack it into a 32-bit value, because the
7200 // instructions expect VGPR_32
7201 SmallVector<Register, 4> PackedRegs;
7202
7203 packImage16bitOpsToDwords(B, MI, PackedRegs, ArgOffset, Intr, IsA16, IsG16);
7204
7205 // See also below in the non-a16 branch
7206 const bool UseNSA = ST.hasNSAEncoding() &&
7207 PackedRegs.size() >= ST.getNSAThreshold(MF) &&
7208 (PackedRegs.size() <= NSAMaxSize || HasPartialNSA);
7209 const bool UsePartialNSA =
7210 UseNSA && HasPartialNSA && PackedRegs.size() > NSAMaxSize;
7211
7212 if (UsePartialNSA) {
7213 // Pack registers that would go over NSAMaxSize into last VAddr register
7214 LLT PackedAddrTy =
7215 LLT::fixed_vector(2 * (PackedRegs.size() - NSAMaxSize + 1), 16);
7216 auto Concat = B.buildConcatVectors(
7217 PackedAddrTy, ArrayRef(PackedRegs).slice(NSAMaxSize - 1));
7218 PackedRegs[NSAMaxSize - 1] = Concat.getReg(0);
7219 PackedRegs.resize(NSAMaxSize);
7220 } else if (!UseNSA && PackedRegs.size() > 1) {
7221 LLT PackedAddrTy = LLT::fixed_vector(2 * PackedRegs.size(), 16);
7222 auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
7223 PackedRegs[0] = Concat.getReg(0);
7224 PackedRegs.resize(1);
7225 }
7226
7227 const unsigned NumPacked = PackedRegs.size();
7228 for (unsigned I = Intr->VAddrStart; I < Intr->VAddrEnd; I++) {
7229 MachineOperand &SrcOp = MI.getOperand(ArgOffset + I);
7230 if (!SrcOp.isReg()) {
7231 assert(SrcOp.isImm() && SrcOp.getImm() == 0);
7232 continue;
7233 }
7234
7235 assert(SrcOp.getReg() != AMDGPU::NoRegister);
7236
7237 if (I - Intr->VAddrStart < NumPacked)
7238 SrcOp.setReg(PackedRegs[I - Intr->VAddrStart]);
7239 else
7240 SrcOp.setReg(AMDGPU::NoRegister);
7241 }
7242 } else {
7243 // If the register allocator cannot place the address registers contiguously
7244 // without introducing moves, then using the non-sequential address encoding
7245 // is always preferable, since it saves VALU instructions and is usually a
7246 // wash in terms of code size or even better.
7247 //
7248 // However, we currently have no way of hinting to the register allocator
7249 // that MIMG addresses should be placed contiguously when it is possible to
7250 // do so, so force non-NSA for the common 2-address case as a heuristic.
7251 //
7252 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
7253 // allocation when possible.
7254 //
7255 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
7256 // set of the remaining addresses.
7257 const bool UseNSA = ST.hasNSAEncoding() &&
7258 CorrectedNumVAddrs >= ST.getNSAThreshold(MF) &&
7259 (CorrectedNumVAddrs <= NSAMaxSize || HasPartialNSA);
7260 const bool UsePartialNSA =
7261 UseNSA && HasPartialNSA && CorrectedNumVAddrs > NSAMaxSize;
7262
7263 if (UsePartialNSA) {
7265 ArgOffset + Intr->VAddrStart + NSAMaxSize - 1,
7266 Intr->NumVAddrs - NSAMaxSize + 1);
7267 } else if (!UseNSA && Intr->NumVAddrs > 1) {
7268 convertImageAddrToPacked(B, MI, ArgOffset + Intr->VAddrStart,
7269 Intr->NumVAddrs);
7270 }
7271 }
7272
7273 int Flags = 0;
7274 if (IsA16)
7275 Flags |= 1;
7276 if (IsG16)
7277 Flags |= 2;
7278 MI.addOperand(MachineOperand::CreateImm(Flags));
7279
7280 if (BaseOpcode->NoReturn) { // No TFE for stores?
7281 // TODO: Handle dmask trim
7282 if (!Ty.isVector() || !IsD16)
7283 return true;
7284
7285 Register RepackedReg = handleD16VData(B, *MRI, VData, true);
7286 if (RepackedReg != VData) {
7287 MI.getOperand(1).setReg(RepackedReg);
7288 }
7289
7290 return true;
7291 }
7292
7293 Register DstReg = MI.getOperand(0).getReg();
7294 const LLT EltTy = Ty.getScalarType();
7295 const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
7296
7297 // Confirm that the return type is large enough for the dmask specified
7298 if (NumElts < DMaskLanes)
7299 return false;
7300
7301 if (NumElts > 4 || DMaskLanes > 4)
7302 return false;
7303
7304 // Image atomic instructions are using DMask to specify how many bits
7305 // input/output data will have. 32-bits (s32, v2s16) or 64-bits (s64, v4s16).
7306 // DMaskLanes for image atomic has default value '0'.
7307 // We must be sure that atomic variants (especially packed) will not be
7308 // truncated from v2s16 or v4s16 to s16 type.
7309 //
7310 // ChangeElementCount will be needed for image load where Ty is always scalar.
7311 const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
7312 const LLT AdjustedTy =
7313 DMaskLanes == 0
7314 ? Ty
7315 : Ty.changeElementCount(ElementCount::getFixed(AdjustedNumElts));
7316
7317 // The raw dword aligned data component of the load. The only legal cases
7318 // where this matters should be when using the packed D16 format, for
7319 // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>,
7320 LLT RoundedTy;
7321
7322 // S32 vector to cover all data, plus TFE result element.
7323 LLT TFETy;
7324
7325 // Register type to use for each loaded component. Will be S32 or V2S16.
7326 LLT RegTy;
7327
7328 if (IsD16 && ST.hasUnpackedD16VMem()) {
7329 RoundedTy =
7330 LLT::scalarOrVector(ElementCount::getFixed(AdjustedNumElts), 32);
7331 TFETy = LLT::fixed_vector(AdjustedNumElts + 1, 32);
7332 RegTy = S32;
7333 } else {
7334 unsigned EltSize = EltTy.getSizeInBits();
7335 unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
7336 unsigned RoundedSize = 32 * RoundedElts;
7337 RoundedTy = LLT::scalarOrVector(
7338 ElementCount::getFixed(RoundedSize / EltSize), EltSize);
7339 TFETy = LLT::fixed_vector(RoundedSize / 32 + 1, S32);
7340 RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
7341 }
7342
7343 // The return type does not need adjustment.
7344 // TODO: Should we change s16 case to s32 or <2 x s16>?
7345 if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
7346 return true;
7347
7348 Register Dst1Reg;
7349
7350 // Insert after the instruction.
7351 B.setInsertPt(*MI.getParent(), ++MI.getIterator());
7352
7353 // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
7354 // s16> instead of s32, we would only need 1 bitcast instead of multiple.
7355 const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
7356 const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
7357
7358 Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
7359
7360 MI.getOperand(0).setReg(NewResultReg);
7361
7362 // In the IR, TFE is supposed to be used with a 2 element struct return
7363 // type. The instruction really returns these two values in one contiguous
7364 // register, with one additional dword beyond the loaded data. Rewrite the
7365 // return type to use a single register result.
7366
7367 if (IsTFE) {
7368 Dst1Reg = MI.getOperand(1).getReg();
7369 if (MRI->getType(Dst1Reg) != S32)
7370 return false;
7371
7372 // TODO: Make sure the TFE operand bit is set.
7373 MI.removeOperand(1);
7374
7375 // Handle the easy case that requires no repack instructions.
7376 if (Ty == S32) {
7377 B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
7378 return true;
7379 }
7380 }
7381
7382 // Now figure out how to copy the new result register back into the old
7383 // result.
7384 SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
7385
7386 const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
7387
7388 if (ResultNumRegs == 1) {
7389 assert(!IsTFE);
7390 ResultRegs[0] = NewResultReg;
7391 } else {
7392 // We have to repack into a new vector of some kind.
7393 for (int I = 0; I != NumDataRegs; ++I)
7394 ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
7395 B.buildUnmerge(ResultRegs, NewResultReg);
7396
7397 // Drop the final TFE element to get the data part. The TFE result is
7398 // directly written to the right place already.
7399 if (IsTFE)
7400 ResultRegs.resize(NumDataRegs);
7401 }
7402
7403 // For an s16 scalar result, we form an s32 result with a truncate regardless
7404 // of packed vs. unpacked.
7405 if (IsD16 && !Ty.isVector()) {
7406 B.buildTrunc(DstReg, ResultRegs[0]);
7407 return true;
7408 }
7409
7410 // Avoid a build/concat_vector of 1 entry.
7411 if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
7412 B.buildBitcast(DstReg, ResultRegs[0]);
7413 return true;
7414 }
7415
7416 assert(Ty.isVector());
7417
7418 if (IsD16) {
7419 // For packed D16 results with TFE enabled, all the data components are
7420 // S32. Cast back to the expected type.
7421 //
7422 // TODO: We don't really need to use load s32 elements. We would only need one
7423 // cast for the TFE result if a multiple of v2s16 was used.
7424 if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
7425 for (Register &Reg : ResultRegs)
7426 Reg = B.buildBitcast(V2S16, Reg).getReg(0);
7427 } else if (ST.hasUnpackedD16VMem()) {
7428 for (Register &Reg : ResultRegs)
7429 Reg = B.buildTrunc(S16, Reg).getReg(0);
7430 }
7431 }
7432
7433 auto padWithUndef = [&](LLT Ty, int NumElts) {
7434 if (NumElts == 0)
7435 return;
7436 Register Undef = B.buildUndef(Ty).getReg(0);
7437 for (int I = 0; I != NumElts; ++I)
7438 ResultRegs.push_back(Undef);
7439 };
7440
7441 // Pad out any elements eliminated due to the dmask.
7442 LLT ResTy = MRI->getType(ResultRegs[0]);
7443 if (!ResTy.isVector()) {
7444 padWithUndef(ResTy, NumElts - ResultRegs.size());
7445 B.buildBuildVector(DstReg, ResultRegs);
7446 return true;
7447 }
7448
7449 assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
7450 const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
7451
7452 // Deal with the one annoying legal case.
7453 const LLT V3S16 = LLT::fixed_vector(3, 16);
7454 if (Ty == V3S16) {
7455 if (IsTFE) {
7456 if (ResultRegs.size() == 1) {
7457 NewResultReg = ResultRegs[0];
7458 } else if (ResultRegs.size() == 2) {
7459 LLT V4S16 = LLT::fixed_vector(4, 16);
7460 NewResultReg = B.buildConcatVectors(V4S16, ResultRegs).getReg(0);
7461 } else {
7462 return false;
7463 }
7464 }
7465
7466 if (MRI->getType(DstReg).getNumElements() <
7467 MRI->getType(NewResultReg).getNumElements()) {
7468 B.buildDeleteTrailingVectorElements(DstReg, NewResultReg);
7469 } else {
7470 B.buildPadVectorWithUndefElements(DstReg, NewResultReg);
7471 }
7472 return true;
7473 }
7474
7475 padWithUndef(ResTy, RegsToCover - ResultRegs.size());
7476 B.buildConcatVectors(DstReg, ResultRegs);
7477 return true;
7478}
7479
7481 MachineInstr &MI) const {
7482 MachineIRBuilder &B = Helper.MIRBuilder;
7483 GISelChangeObserver &Observer = Helper.Observer;
7484
7485 Register OrigDst = MI.getOperand(0).getReg();
7486 Register Dst;
7487 LLT Ty = B.getMRI()->getType(OrigDst);
7488 unsigned Size = Ty.getSizeInBits();
7489 MachineFunction &MF = B.getMF();
7490 unsigned Opc = 0;
7491 if (Size < 32 && ST.hasScalarSubwordLoads()) {
7492 assert(Size == 8 || Size == 16);
7493 Opc = Size == 8 ? AMDGPU::G_AMDGPU_S_BUFFER_LOAD_UBYTE
7494 : AMDGPU::G_AMDGPU_S_BUFFER_LOAD_USHORT;
7495 // The 8-bit and 16-bit scalar buffer load instructions have 32-bit
7496 // destination register.
7497 Dst = B.getMRI()->createGenericVirtualRegister(LLT::scalar(32));
7498 } else {
7499 Opc = AMDGPU::G_AMDGPU_S_BUFFER_LOAD;
7500 Dst = OrigDst;
7501 }
7502
7503 Observer.changingInstr(MI);
7504
7505 // Handle needing to s.buffer.load() a p8 value.
7506 if (hasBufferRsrcWorkaround(Ty)) {
7507 Ty = castBufferRsrcFromV4I32(MI, B, *B.getMRI(), 0);
7508 B.setInsertPt(B.getMBB(), MI);
7509 }
7511 Ty = getBitcastRegisterType(Ty);
7512 Helper.bitcastDst(MI, Ty, 0);
7513 B.setInsertPt(B.getMBB(), MI);
7514 }
7515
7516 // FIXME: We don't really need this intermediate instruction. The intrinsic
7517 // should be fixed to have a memory operand. Since it's readnone, we're not
7518 // allowed to add one.
7519 MI.setDesc(B.getTII().get(Opc));
7520 MI.removeOperand(1); // Remove intrinsic ID
7521
7522 // FIXME: When intrinsic definition is fixed, this should have an MMO already.
7523 const unsigned MemSize = (Size + 7) / 8;
7524 const Align MemAlign = B.getDataLayout().getABITypeAlign(
7530 MemSize, MemAlign);
7531 MI.addMemOperand(MF, MMO);
7532 if (Dst != OrigDst) {
7533 MI.getOperand(0).setReg(Dst);
7534 B.setInsertPt(B.getMBB(), ++B.getInsertPt());
7535 B.buildTrunc(OrigDst, Dst);
7536 }
7537
7538 // If we don't have 96-bit result scalar loads, widening to 128-bit should
7539 // always be legal. We may need to restore this to a 96-bit result if it turns
7540 // out this needs to be converted to a vector load during RegBankSelect.
7541 if (!isPowerOf2_32(Size) && (Size != 96 || !ST.hasScalarDwordx3Loads())) {
7542 if (Ty.isVector())
7544 else
7545 Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
7546 }
7547
7548 Observer.changedInstr(MI);
7549 return true;
7550}
7551
7553 MachineInstr &MI) const {
7554 MachineIRBuilder &B = Helper.MIRBuilder;
7555 GISelChangeObserver &Observer = Helper.Observer;
7556 Observer.changingInstr(MI);
7557 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_PREFETCH));
7558 MI.removeOperand(0); // Remove intrinsic ID
7560 Observer.changedInstr(MI);
7561 return true;
7562}
7563
7564// TODO: Move to selection
7567 MachineIRBuilder &B) const {
7568 if (!ST.hasTrapHandler() ||
7569 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
7570 return legalizeTrapEndpgm(MI, MRI, B);
7571
7572 return ST.supportsGetDoorbellID() ?
7574}
7575
7578 const DebugLoc &DL = MI.getDebugLoc();
7579 MachineBasicBlock &BB = B.getMBB();
7580 MachineFunction *MF = BB.getParent();
7581
7582 if (BB.succ_empty() && std::next(MI.getIterator()) == BB.end()) {
7583 BuildMI(BB, BB.end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7584 .addImm(0);
7585 MI.eraseFromParent();
7586 return true;
7587 }
7588
7589 // We need a block split to make the real endpgm a terminator. We also don't
7590 // want to break phis in successor blocks, so we can't just delete to the
7591 // end of the block.
7592 BB.splitAt(MI, false /*UpdateLiveIns*/);
7594 MF->push_back(TrapBB);
7595 BuildMI(*TrapBB, TrapBB->end(), DL, B.getTII().get(AMDGPU::S_ENDPGM))
7596 .addImm(0);
7597 BuildMI(BB, &MI, DL, B.getTII().get(AMDGPU::S_CBRANCH_EXECNZ))
7598 .addMBB(TrapBB);
7599
7600 BB.addSuccessor(TrapBB);
7601 MI.eraseFromParent();
7602 return true;
7603}
7604
7607 MachineFunction &MF = B.getMF();
7608 const LLT S64 = LLT::scalar(64);
7609
7610 Register SGPR01(AMDGPU::SGPR0_SGPR1);
7611 // For code object version 5, queue_ptr is passed through implicit kernarg.
7617 ST.getTargetLowering()->getImplicitParameterOffset(B.getMF(), Param);
7618
7619 Register KernargPtrReg = MRI.createGenericVirtualRegister(
7621
7622 if (!loadInputValue(KernargPtrReg, B,
7624 return false;
7625
7626 // TODO: can we be smarter about machine pointer info?
7629 PtrInfo.getWithOffset(Offset),
7633
7634 // Pointer address
7637 B.buildObjectPtrOffset(LoadAddr, KernargPtrReg,
7638 B.buildConstant(LLT::scalar(64), Offset).getReg(0));
7639 // Load address
7640 Register Temp = B.buildLoad(S64, LoadAddr, *MMO).getReg(0);
7641 B.buildCopy(SGPR01, Temp);
7642 B.buildInstr(AMDGPU::S_TRAP)
7643 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7644 .addReg(SGPR01, RegState::Implicit);
7645 MI.eraseFromParent();
7646 return true;
7647 }
7648
7649 // Pass queue pointer to trap handler as input, and insert trap instruction
7650 // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
7651 Register LiveIn =
7654 return false;
7655
7656 B.buildCopy(SGPR01, LiveIn);
7657 B.buildInstr(AMDGPU::S_TRAP)
7658 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap))
7659 .addReg(SGPR01, RegState::Implicit);
7660
7661 MI.eraseFromParent();
7662 return true;
7663}
7664
7667 MachineIRBuilder &B) const {
7668 // We need to simulate the 's_trap 2' instruction on targets that run in
7669 // PRIV=1 (where it is treated as a nop).
7670 if (ST.hasPrivEnabledTrap2NopBug()) {
7671 ST.getInstrInfo()->insertSimulatedTrap(MRI, B.getMBB(), MI,
7672 MI.getDebugLoc());
7673 MI.eraseFromParent();
7674 return true;
7675 }
7676
7677 B.buildInstr(AMDGPU::S_TRAP)
7678 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSATrap));
7679 MI.eraseFromParent();
7680 return true;
7681}
7682
7685 MachineIRBuilder &B) const {
7686 // Is non-HSA path or trap-handler disabled? Then, report a warning
7687 // accordingly
7688 if (!ST.hasTrapHandler() ||
7689 ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
7690 Function &Fn = B.getMF().getFunction();
7692 Fn, "debugtrap handler not supported", MI.getDebugLoc(), DS_Warning));
7693 } else {
7694 // Insert debug-trap instruction
7695 B.buildInstr(AMDGPU::S_TRAP)
7696 .addImm(static_cast<unsigned>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap));
7697 }
7698
7699 MI.eraseFromParent();
7700 return true;
7701}
7702
// Lower amdgcn.image.bvh.intersect.ray to G_AMDGPU_BVH_INTERSECT_RAY,
// packing the ray operands into the VADDR layout the selected MIMG
// encoding expects (NSA vs. single merged vector, a16 vs. a32 ray dirs).
// NOTE(review): extraction dropped the signature opening (orig 7703),
// the diagnostic-emission line (orig 7721), the `Ops` vector declaration
// (orig 7757, presumably SmallVector<Register, 12> Ops -- confirm), and
// the R1/R2/R3 register creations (orig 7813-7815). Code below is
// byte-for-byte as extracted.
7704 MachineInstr &MI, MachineIRBuilder &B) const {
7705 MachineRegisterInfo &MRI = *B.getMRI();
7706 const LLT S16 = LLT::scalar(16);
7707 const LLT S32 = LLT::scalar(32);
7708 const LLT V2S16 = LLT::fixed_vector(2, 16);
7709 const LLT V3S32 = LLT::fixed_vector(3, 32);
7710
// Intrinsic operand layout: 0 = dst, 2..7 = node ptr, ray extent,
// origin, dir, inverse dir, texture descriptor.
7711 Register DstReg = MI.getOperand(0).getReg();
7712 Register NodePtr = MI.getOperand(2).getReg();
7713 Register RayExtent = MI.getOperand(3).getReg();
7714 Register RayOrigin = MI.getOperand(4).getReg();
7715 Register RayDir = MI.getOperand(5).getReg();
7716 Register RayInvDir = MI.getOperand(6).getReg();
7717 Register TDescr = MI.getOperand(7).getReg();
7718
// Ray tracing requires the GFX10.3+ ("GFX10_A") encoding.
7719 if (!ST.hasGFX10_AEncoding()) {
7720 Function &Fn = B.getMF().getFunction();
// (dropped line 7721 emitted the DiagnosticInfoUnsupported error.)
7722 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7723 return false;
7724 }
7725
// A16: 16-bit ray direction components; Is64: 64-bit node pointer.
// These select both the base opcode and the VADDR dword count.
7726 const bool IsGFX11 = AMDGPU::isGFX11(ST);
7727 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(ST);
7728 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(ST);
7729 const bool IsA16 = MRI.getType(RayDir).getElementType().getSizeInBits() == 16;
7730 const bool Is64 = MRI.getType(NodePtr).getSizeInBits() == 64;
7731 const unsigned NumVDataDwords = 4;
7732 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
7733 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
// NSA (non-sequential address) is mandatory on GFX12+, otherwise only
// used when the subtarget supports enough NSA address registers.
7734 const bool UseNSA =
7735 IsGFX12Plus || (ST.hasNSAEncoding() && NumVAddrs <= ST.getNSAMaxSize());
7736
// Indexed as BaseOpcodes[Is64][IsA16].
7737 const unsigned BaseOpcodes[2][2] = {
7738 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
7739 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
7740 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
7741 int Opcode;
7742 if (UseNSA) {
7743 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7744 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
7745 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
7746 : AMDGPU::MIMGEncGfx10NSA,
7747 NumVDataDwords, NumVAddrDwords);
7748 } else {
7749 assert(!IsGFX12Plus);
7750 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
7751 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
7752 : AMDGPU::MIMGEncGfx10Default,
7753 NumVDataDwords, NumVAddrDwords);
7754 }
7755 assert(Opcode != -1);
7756
// GFX11+ NSA form: operands are grouped into <3 x s32> registers.
// (dropped line 7757 declared the Ops operand list.)
7758 if (UseNSA && IsGFX11Plus) {
// Repack a 3-element vector source into a single V3S32 operand.
7759 auto packLanes = [&Ops, &S32, &V3S32, &B](Register Src) {
7760 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7761 auto Merged = B.buildMergeLikeInstr(
7762 V3S32, {Unmerge.getReg(0), Unmerge.getReg(1), Unmerge.getReg(2)});
7763 Ops.push_back(Merged.getReg(0));
7764 };
7765
7766 Ops.push_back(NodePtr);
7767 Ops.push_back(RayExtent);
7768 packLanes(RayOrigin);
7769
7770 if (IsA16) {
// Interleave inv-dir and dir s16 halves into three packed s32 lanes,
// then merge into one V3S32 operand.
7771 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7772 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7773 auto MergedDir = B.buildMergeLikeInstr(
7774 V3S32,
7775 {B.buildBitcast(
7776 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(0),
7777 UnmergeRayDir.getReg(0)}))
7778 .getReg(0),
7779 B.buildBitcast(
7780 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(1),
7781 UnmergeRayDir.getReg(1)}))
7782 .getReg(0),
7783 B.buildBitcast(
7784 S32, B.buildMergeLikeInstr(V2S16, {UnmergeRayInvDir.getReg(2),
7785 UnmergeRayDir.getReg(2)}))
7786 .getReg(0)});
7787 Ops.push_back(MergedDir.getReg(0));
7788 } else {
7789 packLanes(RayDir);
7790 packLanes(RayInvDir);
7791 }
7792 } else {
// Pre-GFX11 / non-NSA form: operands are flat scalar dwords.
7793 if (Is64) {
7794 auto Unmerge = B.buildUnmerge({S32, S32}, NodePtr);
7795 Ops.push_back(Unmerge.getReg(0));
7796 Ops.push_back(Unmerge.getReg(1));
7797 } else {
7798 Ops.push_back(NodePtr);
7799 }
7800 Ops.push_back(RayExtent);
7801
7802 auto packLanes = [&Ops, &S32, &B](Register Src) {
7803 auto Unmerge = B.buildUnmerge({S32, S32, S32}, Src);
7804 Ops.push_back(Unmerge.getReg(0));
7805 Ops.push_back(Unmerge.getReg(1));
7806 Ops.push_back(Unmerge.getReg(2));
7807 };
7808
7809 packLanes(RayOrigin);
7810 if (IsA16) {
// Pack the six s16 dir/inv-dir components into three s32 registers.
// (dropped lines 7813-7815 created registers R1, R2, R3.)
7811 auto UnmergeRayDir = B.buildUnmerge({S16, S16, S16}, RayDir);
7812 auto UnmergeRayInvDir = B.buildUnmerge({S16, S16, S16}, RayInvDir);
7816 B.buildMergeLikeInstr(R1,
7817 {UnmergeRayDir.getReg(0), UnmergeRayDir.getReg(1)});
7818 B.buildMergeLikeInstr(
7819 R2, {UnmergeRayDir.getReg(2), UnmergeRayInvDir.getReg(0)});
7820 B.buildMergeLikeInstr(
7821 R3, {UnmergeRayInvDir.getReg(1), UnmergeRayInvDir.getReg(2)});
7822 Ops.push_back(R1);
7823 Ops.push_back(R2);
7824 Ops.push_back(R3);
7825 } else {
7826 packLanes(RayDir);
7827 packLanes(RayInvDir);
7828 }
7829 }
7830
7831 if (!UseNSA) {
7832 // Build a single vector containing all the operands so far prepared.
7833 LLT OpTy = LLT::fixed_vector(Ops.size(), 32);
7834 Register MergedOps = B.buildMergeLikeInstr(OpTy, Ops).getReg(0);
7835 Ops.clear();
7836 Ops.push_back(MergedOps);
7837 }
7838
// Emit the target pseudo; the real MIMG opcode travels as an immediate.
7839 auto MIB = B.buildInstr(AMDGPU::G_AMDGPU_BVH_INTERSECT_RAY)
7840 .addDef(DstReg)
7841 .addImm(Opcode);
7842
7843 for (Register R : Ops) {
7844 MIB.addUse(R);
7845 }
7846
7847 MIB.addUse(TDescr)
7848 .addImm(IsA16 ? 1 : 0)
7849 .cloneMemRefs(MI);
7850
7851 MI.eraseFromParent();
7852 return true;
7853}
7854
// Lower amdgcn.image.bvh.dual.intersect.ray / amdgcn.image.bvh8.intersect.ray
// (GFX12 dual-ray / 8-wide BVH variants) to the corresponding target pseudo.
// NOTE(review): extraction dropped the signature opening (orig 7855) and
// the diagnostic-emission line (orig 7873). Code preserved byte-for-byte.
7856 MachineInstr &MI, MachineIRBuilder &B) const {
7857 const LLT S32 = LLT::scalar(32);
7858 const LLT V2S32 = LLT::fixed_vector(2, 32);
7859
// Operand layout: 0-2 = results (hit data, origin, dir),
// 4-10 = node ptr, ray extent, instance mask, origin, dir, offsets, tdescr.
7860 Register DstReg = MI.getOperand(0).getReg();
7861 Register DstOrigin = MI.getOperand(1).getReg();
7862 Register DstDir = MI.getOperand(2).getReg();
7863 Register NodePtr = MI.getOperand(4).getReg();
7864 Register RayExtent = MI.getOperand(5).getReg();
7865 Register InstanceMask = MI.getOperand(6).getReg();
7866 Register RayOrigin = MI.getOperand(7).getReg();
7867 Register RayDir = MI.getOperand(8).getReg();
7868 Register Offsets = MI.getOperand(9).getReg();
7869 Register TDescr = MI.getOperand(10).getReg();
7870
7871 if (!ST.hasBVHDualAndBVH8Insts()) {
7872 Function &Fn = B.getMF().getFunction();
// (dropped line 7873 emitted the DiagnosticInfoUnsupported error.)
7874 Fn, "intrinsic not supported on subtarget", MI.getDebugLoc()));
7875 return false;
7876 }
7877
7878 bool IsBVH8 = cast<GIntrinsic>(MI).getIntrinsicID() ==
7879 Intrinsic::amdgcn_image_bvh8_intersect_ray;
7880 const unsigned NumVDataDwords = 10;
7881 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
7882 int Opcode = AMDGPU::getMIMGOpcode(
7883 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
7884 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
7885 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
7886 assert(Opcode != -1);
7887
// The hardware takes ray extent and instance mask packed in one
// 64-bit (2 x s32) operand; any-extend the mask to 32 bits first.
7888 auto RayExtentInstanceMaskVec = B.buildMergeLikeInstr(
7889 V2S32, {RayExtent, B.buildAnyExt(S32, InstanceMask)});
7890
7891 B.buildInstr(IsBVH8 ? AMDGPU::G_AMDGPU_BVH8_INTERSECT_RAY
7892 : AMDGPU::G_AMDGPU_BVH_DUAL_INTERSECT_RAY)
7893 .addDef(DstReg)
7894 .addDef(DstOrigin)
7895 .addDef(DstDir)
7896 .addImm(Opcode)
7897 .addUse(NodePtr)
7898 .addUse(RayExtentInstanceMaskVec.getReg(0))
7899 .addUse(RayOrigin)
7900 .addUse(RayDir)
7901 .addUse(Offsets)
7902 .addUse(TDescr)
7903 .cloneMemRefs(MI);
7904
7905 MI.eraseFromParent();
7906 return true;
7907}
7908
// Lower llvm.stacksave: wrap the stack-pointer register in
// G_AMDGPU_WAVE_ADDRESS (the SP is a per-lane byte address; the wave
// address pseudo produces the value visible to the program).
// NOTE(review): extraction dropped the signature opening (orig 7909)
// and the line defining StackPtr (orig 7912, presumably via
// TLI->getStackPointerRegisterToSaveRestore() -- confirm upstream).
7910 MachineIRBuilder &B) const {
7911 const SITargetLowering *TLI = ST.getTargetLowering();
7913 Register DstReg = MI.getOperand(0).getReg();
7914 B.buildInstr(AMDGPU::G_AMDGPU_WAVE_ADDRESS, {DstReg}, {StackPtr});
7915 MI.eraseFromParent();
7916 return true;
7917}
7918
// Lower amdgcn.wave.id by extracting bits [29:25] of TTMP8, which hold
// the wave ID within the workgroup on subtargets with architected SGPRs.
// NOTE(review): extraction dropped the signature opening (orig 7919).
7920 MachineIRBuilder &B) const {
7921 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
7922 if (!ST.hasArchitectedSGPRs())
7923 return false;
7924 LLT S32 = LLT::scalar(32);
7925 Register DstReg = MI.getOperand(0).getReg();
7926 auto TTMP8 = B.buildCopy(S32, Register(AMDGPU::TTMP8));
// Unsigned bitfield extract: 5 bits starting at bit 25.
7927 auto LSB = B.buildConstant(S32, 25);
7928 auto Width = B.buildConstant(S32, 5);
7929 B.buildUbfx(DstReg, TTMP8, LSB, Width);
7930 MI.eraseFromParent();
7931 return true;
7932}
7933
// Replace MI's result with a constant-operand S_GETREG_B32 read of the
// given hardware register bitfield (HwReg, starting at LowBit, Width bits).
// NOTE(review): extraction dropped the start of this signature (orig
// 7934-7935); the exact function name is not visible here -- confirm
// against upstream.
7936 AMDGPU::Hwreg::Id HwReg,
7937 unsigned LowBit,
7938 unsigned Width) const {
7939 MachineRegisterInfo &MRI = *B.getMRI();
7940 Register DstReg = MI.getOperand(0).getReg();
// S_GETREG writes an SGPR; constrain the dst if not already classed.
7941 if (!MRI.getRegClassOrNull(DstReg))
7942 MRI.setRegClass(DstReg, &AMDGPU::SReg_32RegClass);
7943 B.buildInstr(AMDGPU::S_GETREG_B32_const)
7944 .addDef(DstReg)
7945 .addImm(AMDGPU::Hwreg::HwregEncoding::encode(HwReg, LowBit, Width));
7946 MI.eraseFromParent();
7947 return true;
7948}
7949
// Encoded hwreg operands for reading/writing the FP environment:
// the MODE register (rounding/denorm mode bits) and the TRAPSTS-related
// trap-enable field, used by legalizeGetFPEnv/legalizeSetFPEnv below.
// NOTE(review): extraction dropped both initializer expressions (orig
// 7951 and 7954, presumably AMDGPU::Hwreg::HwregEncoding::encode(...)
// calls -- confirm against upstream).
7950 static constexpr unsigned FPEnvModeBitField =
7952
7953 static constexpr unsigned FPEnvTrapBitField =
7955
// Lower llvm.get.fpenv: read the mode and trap hwreg fields via two
// amdgcn.s.getreg intrinsic calls and merge them into the s64 result.
// NOTE(review): extraction dropped the signature opening (orig
// 7956-7957); S32/S64 here refer to file-scope constexpr LLTs.
7958 MachineIRBuilder &B) const {
7959 Register Src = MI.getOperand(0).getReg();
// Only the s64 form is handled here.
7960 if (MRI.getType(Src) != S64)
7961 return false;
7962
// s.getreg has side effects (reads mutable hardware state), so mark it.
7963 auto ModeReg =
7964 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7965 /*HasSideEffects=*/true, /*isConvergent=*/false)
7966 .addImm(FPEnvModeBitField);
7967 auto TrapReg =
7968 B.buildIntrinsic(Intrinsic::amdgcn_s_getreg, {S32},
7969 /*HasSideEffects=*/true, /*isConvergent=*/false)
7970 .addImm(FPEnvTrapBitField);
// Result layout: low 32 bits = mode field, high 32 bits = trap field.
7971 B.buildMergeLikeInstr(Src, {ModeReg, TrapReg});
7972 MI.eraseFromParent();
7973 return true;
7974}
7975
// Lower llvm.set.fpenv: split the s64 value and write the two halves
// back to the mode and trap hwreg fields via amdgcn.s.setreg.
// Mirrors the layout produced by legalizeGetFPEnv above.
// NOTE(review): extraction dropped the signature opening (orig
// 7976-7977).
7978 MachineIRBuilder &B) const {
7979 Register Src = MI.getOperand(0).getReg();
7980 if (MRI.getType(Src) != S64)
7981 return false;
7982
7983 auto Unmerge = B.buildUnmerge({S32, S32}, MI.getOperand(0));
// Low half -> MODE field.
7984 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7985 /*HasSideEffects=*/true, /*isConvergent=*/false)
7986 .addImm(static_cast<int16_t>(FPEnvModeBitField))
7987 .addReg(Unmerge.getReg(0));
// High half -> trap field.
7988 B.buildIntrinsic(Intrinsic::amdgcn_s_setreg, ArrayRef<DstOp>(),
7989 /*HasSideEffects=*/true, /*isConvergent=*/false)
7990 .addImm(static_cast<int16_t>(FPEnvTrapBitField))
7991 .addReg(Unmerge.getReg(1));
7992 MI.eraseFromParent();
7993 return true;
7994}
7995
// Main intrinsic-legalization dispatch: maps each AMDGPU/R600 intrinsic
// to its dedicated legalize* helper, a target pseudo, or an in-place
// rewrite. Returns true when the intrinsic was handled (or needs no
// change), false to report a legalization failure.
// NOTE(review): extraction dropped the signature opening (orig 7996)
// and many hyperlinked case-body lines throughout the switch (e.g.
// 8009, 8116-8117, 8122, 8125, 8128, 8131-8133, 8136-8138, 8141-8143,
// 8146-8187, 8192-8224, 8228-8230, 8348-8351, 8473). Surviving lines
// are preserved byte-for-byte; dropped spans are flagged inline.
7997 MachineInstr &MI) const {
7998 MachineIRBuilder &B = Helper.MIRBuilder;
7999 MachineRegisterInfo &MRI = *B.getMRI();
8000
8001 // Replace the use G_BRCOND with the exec manipulate and branch pseudos.
8002 auto IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
8003 switch (IntrID) {
8004 case Intrinsic::sponentry:
8005 if (B.getMF().getInfo<SIMachineFunctionInfo>()->isBottomOfStack()) {
8006 // FIXME: The imported pattern checks for i32 instead of p5; if we fix
8007 // that we can remove this cast.
8008 const LLT S32 = LLT::scalar(32);
// (dropped line 8009 created TmpReg -- presumably a virtual s32 reg.)
8010 B.buildInstr(AMDGPU::G_AMDGPU_SPONENTRY).addDef(TmpReg);
8011
8012 Register DstReg = MI.getOperand(0).getReg();
8013 B.buildIntToPtr(DstReg, TmpReg);
8014 MI.eraseFromParent();
8015 } else {
// Not bottom-of-stack: model sponentry as a fixed frame object.
8016 int FI = B.getMF().getFrameInfo().CreateFixedObject(
8017 1, 0, /*IsImmutable=*/false);
8018 B.buildFrameIndex(MI.getOperand(0), FI);
8019 MI.eraseFromParent();
8020 }
8021 return true;
// Structurized control-flow intrinsics: rewrite the paired G_BRCOND
// into SI_IF/SI_ELSE/SI_LOOP exec-mask pseudos.
8022 case Intrinsic::amdgcn_if:
8023 case Intrinsic::amdgcn_else: {
8024 MachineInstr *Br = nullptr;
8025 MachineBasicBlock *UncondBrTarget = nullptr;
8026 bool Negated = false;
8027 if (MachineInstr *BrCond =
8028 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8029 const SIRegisterInfo *TRI
8030 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8031
8032 Register Def = MI.getOperand(1).getReg();
8033 Register Use = MI.getOperand(3).getReg();
8034
8035 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8036
// A negated condition flips which successor is the "taken" path.
8037 if (Negated)
8038 std::swap(CondBrTarget, UncondBrTarget);
8039
8040 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8041 if (IntrID == Intrinsic::amdgcn_if) {
8042 B.buildInstr(AMDGPU::SI_IF)
8043 .addDef(Def)
8044 .addUse(Use)
8045 .addMBB(UncondBrTarget);
8046 } else {
8047 B.buildInstr(AMDGPU::SI_ELSE)
8048 .addDef(Def)
8049 .addUse(Use)
8050 .addMBB(UncondBrTarget);
8051 }
8052
8053 if (Br) {
8054 Br->getOperand(0).setMBB(CondBrTarget);
8055 } else {
8056 // The IRTranslator skips inserting the G_BR for fallthrough cases, but
8057 // since we're swapping branch targets it needs to be reinserted.
8058 // FIXME: IRTranslator should probably not do this
8059 B.buildBr(*CondBrTarget);
8060 }
8061
// The def/use carry the exec mask; pin them to the wave-mask class.
8062 MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
8063 MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
8064 MI.eraseFromParent();
8065 BrCond->eraseFromParent();
8066 return true;
8067 }
8068
8069 return false;
8070 }
8071 case Intrinsic::amdgcn_loop: {
8072 MachineInstr *Br = nullptr;
8073 MachineBasicBlock *UncondBrTarget = nullptr;
8074 bool Negated = false;
8075 if (MachineInstr *BrCond =
8076 verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget, Negated)) {
8077 const SIRegisterInfo *TRI
8078 = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
8079
8080 MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
8081 Register Reg = MI.getOperand(2).getReg();
8082
8083 if (Negated)
8084 std::swap(CondBrTarget, UncondBrTarget);
8085
8086 B.setInsertPt(B.getMBB(), BrCond->getIterator());
8087 B.buildInstr(AMDGPU::SI_LOOP)
8088 .addUse(Reg)
8089 .addMBB(UncondBrTarget);
8090
8091 if (Br)
8092 Br->getOperand(0).setMBB(CondBrTarget);
8093 else
8094 B.buildBr(*CondBrTarget);
8095
8096 MI.eraseFromParent();
8097 BrCond->eraseFromParent();
8098 MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
8099 return true;
8100 }
8101
8102 return false;
8103 }
8104 case Intrinsic::amdgcn_addrspacecast_nonnull:
8105 return legalizeAddrSpaceCast(MI, MRI, B);
8106 case Intrinsic::amdgcn_make_buffer_rsrc:
8107 return legalizePointerAsRsrcIntrin(MI, MRI, B);
8108 case Intrinsic::amdgcn_kernarg_segment_ptr:
8109 if (!AMDGPU::isKernel(B.getMF().getFunction())) {
8110 // This only makes sense to call in a kernel, so just lower to null.
8111 B.buildConstant(MI.getOperand(0).getReg(), 0);
8112 MI.eraseFromParent();
8113 return true;
8114 }
8115
// Preloaded-argument and work-item/group ID intrinsics. NOTE(review):
// many of the following case bodies lost their hyperlinked
// legalizePreloadedArgIntrin/legalizeWorkitemIDIntrinsic argument lines
// (the AMDGPUFunctionArgInfo enum operands) to extraction; only the
// surviving fragments appear below.
8118 case Intrinsic::amdgcn_implicitarg_ptr:
8119 return legalizeImplicitArgPtr(MI, MRI, B);
8120 case Intrinsic::amdgcn_workitem_id_x:
8121 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 0,
8123 case Intrinsic::amdgcn_workitem_id_y:
8124 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 1,
8126 case Intrinsic::amdgcn_workitem_id_z:
8127 return legalizeWorkitemIDIntrinsic(MI, MRI, B, 2,
8129 case Intrinsic::amdgcn_workgroup_id_x:
8130 return legalizeWorkGroupId(
8134 case Intrinsic::amdgcn_workgroup_id_y:
8135 return legalizeWorkGroupId(
8139 case Intrinsic::amdgcn_workgroup_id_z:
8140 return legalizeWorkGroupId(
// Cluster-related IDs are only legal on subtargets with clusters.
8144 case Intrinsic::amdgcn_cluster_id_x:
8145 return ST.hasClusters() &&
8148 case Intrinsic::amdgcn_cluster_id_y:
8149 return ST.hasClusters() &&
8152 case Intrinsic::amdgcn_cluster_id_z:
8153 return ST.hasClusters() &&
8156 case Intrinsic::amdgcn_cluster_workgroup_id_x:
8157 return ST.hasClusters() &&
8160 case Intrinsic::amdgcn_cluster_workgroup_id_y:
8161 return ST.hasClusters() &&
8164 case Intrinsic::amdgcn_cluster_workgroup_id_z:
8165 return ST.hasClusters() &&
8168 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
8169 return ST.hasClusters() &&
8171 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
8172 return ST.hasClusters() &&
8175 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
8176 return ST.hasClusters() &&
8179 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
8180 return ST.hasClusters() &&
8183 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
8184 return ST.hasClusters() &&
8186 MI, MRI, B,
8188 case Intrinsic::amdgcn_wave_id:
8189 return legalizeWaveID(MI, B);
8190 case Intrinsic::amdgcn_lds_kernel_id:
8191 return legalizePreloadedArgIntrin(MI, MRI, B,
8193 case Intrinsic::amdgcn_dispatch_ptr:
8194 return legalizePreloadedArgIntrin(MI, MRI, B,
8196 case Intrinsic::amdgcn_queue_ptr:
8197 return legalizePreloadedArgIntrin(MI, MRI, B,
8199 case Intrinsic::amdgcn_implicit_buffer_ptr:
8202 case Intrinsic::amdgcn_dispatch_id:
8203 return legalizePreloadedArgIntrin(MI, MRI, B,
8205 case Intrinsic::r600_read_ngroups_x:
8206 // TODO: Emit error for hsa
8209 case Intrinsic::r600_read_ngroups_y:
8212 case Intrinsic::r600_read_ngroups_z:
8215 case Intrinsic::r600_read_local_size_x:
8216 // TODO: Could insert G_ASSERT_ZEXT from s16
8218 case Intrinsic::r600_read_local_size_y:
8219 // TODO: Could insert G_ASSERT_ZEXT from s16
8221 // TODO: Could insert G_ASSERT_ZEXT from s16
8222 case Intrinsic::r600_read_local_size_z:
8225 case Intrinsic::amdgcn_fdiv_fast:
8226 return legalizeFDIVFastIntrin(MI, MRI, B);
8228 case Intrinsic::amdgcn_is_shared:
8229 case Intrinsic::amdgcn_is_private:
8231 case Intrinsic::amdgcn_wavefrontsize: {
// Wavefront size is a subtarget constant; fold it immediately.
8232 B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
8233 MI.eraseFromParent();
8234 return true;
8235 }
8236 case Intrinsic::amdgcn_s_buffer_load:
8237 return legalizeSBufferLoad(Helper, MI);
// Buffer store variants: (IsTyped, IsFormat) flags select the lowering.
8238 case Intrinsic::amdgcn_raw_buffer_store:
8239 case Intrinsic::amdgcn_raw_ptr_buffer_store:
8240 case Intrinsic::amdgcn_struct_buffer_store:
8241 case Intrinsic::amdgcn_struct_ptr_buffer_store:
8242 return legalizeBufferStore(MI, Helper, false, false);
8243 case Intrinsic::amdgcn_raw_buffer_store_format:
8244 case Intrinsic::amdgcn_raw_ptr_buffer_store_format:
8245 case Intrinsic::amdgcn_struct_buffer_store_format:
8246 case Intrinsic::amdgcn_struct_ptr_buffer_store_format:
8247 return legalizeBufferStore(MI, Helper, false, true);
8248 case Intrinsic::amdgcn_raw_tbuffer_store:
8249 case Intrinsic::amdgcn_raw_ptr_tbuffer_store:
8250 case Intrinsic::amdgcn_struct_tbuffer_store:
8251 case Intrinsic::amdgcn_struct_ptr_tbuffer_store:
8252 return legalizeBufferStore(MI, Helper, true, true);
8253 case Intrinsic::amdgcn_raw_buffer_load:
8254 case Intrinsic::amdgcn_raw_ptr_buffer_load:
8255 case Intrinsic::amdgcn_raw_atomic_buffer_load:
8256 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
8257 case Intrinsic::amdgcn_struct_buffer_load:
8258 case Intrinsic::amdgcn_struct_ptr_buffer_load:
8259 case Intrinsic::amdgcn_struct_atomic_buffer_load:
8260 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load:
8261 return legalizeBufferLoad(MI, Helper, false, false);
8262 case Intrinsic::amdgcn_raw_buffer_load_format:
8263 case Intrinsic::amdgcn_raw_ptr_buffer_load_format:
8264 case Intrinsic::amdgcn_struct_buffer_load_format:
8265 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
8266 return legalizeBufferLoad(MI, Helper, true, false);
8267 case Intrinsic::amdgcn_raw_tbuffer_load:
8268 case Intrinsic::amdgcn_raw_ptr_tbuffer_load:
8269 case Intrinsic::amdgcn_struct_tbuffer_load:
8270 case Intrinsic::amdgcn_struct_ptr_tbuffer_load:
8271 return legalizeBufferLoad(MI, Helper, true, true);
// All buffer atomic variants share one helper keyed off the intrinsic.
8272 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
8273 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
8274 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
8275 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
8276 case Intrinsic::amdgcn_raw_buffer_atomic_add:
8277 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
8278 case Intrinsic::amdgcn_struct_buffer_atomic_add:
8279 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
8280 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
8281 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
8282 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
8283 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
8284 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
8285 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
8286 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
8287 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
8288 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
8289 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
8290 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
8291 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
8292 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
8293 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
8294 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
8295 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
8296 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
8297 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
8298 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
8299 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
8300 case Intrinsic::amdgcn_raw_buffer_atomic_and:
8301 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
8302 case Intrinsic::amdgcn_struct_buffer_atomic_and:
8303 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
8304 case Intrinsic::amdgcn_raw_buffer_atomic_or:
8305 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
8306 case Intrinsic::amdgcn_struct_buffer_atomic_or:
8307 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
8308 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
8309 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
8310 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
8311 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
8312 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
8313 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
8314 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
8315 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
8316 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
8317 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
8318 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
8319 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
8320 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
8321 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap:
8322 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
8323 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap:
8324 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
8325 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
8326 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
8327 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
8328 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
8329 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
8330 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
8331 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
8332 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
8333 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
8334 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
8335 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
8336 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
8337 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
8338 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
8339 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
8340 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
8341 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
8342 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
8343 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
8344 return legalizeBufferAtomic(MI, B, IntrID);
8345 case Intrinsic::amdgcn_rsq_clamp:
8346 return legalizeRsqClampIntrinsic(MI, MRI, B);
// (dropped lines 8348 and 8351: the return calls into the BVH
// legalizers defined earlier in this file.)
8347 case Intrinsic::amdgcn_image_bvh_intersect_ray:
8349 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
8350 case Intrinsic::amdgcn_image_bvh8_intersect_ray:
// SWMMAC sparse-matrix intrinsics: the sparsity index operand must be
// widened/bitcast to the register-sized scalar the instruction expects.
8352 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
8353 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
8354 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
8355 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
8356 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
8357 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
8358 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
8359 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
8360 Register Index = MI.getOperand(5).getReg();
8361 LLT S64 = LLT::scalar(64);
8362 LLT IndexArgTy = MRI.getType(Index);
8363 if (IndexArgTy != S64) {
8364 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(S64, Index)
8365 : B.buildAnyExt(S64, Index);
8366 MI.getOperand(5).setReg(NewIndex.getReg(0));
8367 }
8368 return true;
8369 }
8370 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
8371 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
8372 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
8373 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
8374 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
8375 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
8376 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
8377 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
8378 Register Index = MI.getOperand(5).getReg();
8379 LLT S32 = LLT::scalar(32);
8380 if (MRI.getType(Index) != S32)
8381 MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0));
8382 return true;
8383 }
8384 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
8385 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
8386 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
8387 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
8388 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
8389 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8:
8390 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
8391 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
8392 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
// Here the index is operand 7; only the 16x16x128_iu8 form takes s64.
8393 Register Index = MI.getOperand(7).getReg();
8394 LLT IdxTy = IntrID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
8395 ? LLT::scalar(64)
8396 : LLT::scalar(32);
8397 LLT IndexArgTy = MRI.getType(Index);
8398 if (IndexArgTy != IdxTy) {
8399 auto NewIndex = IndexArgTy.isVector() ? B.buildBitcast(IdxTy, Index)
8400 : B.buildAnyExt(IdxTy, Index);
8401 MI.getOperand(7).setReg(NewIndex.getReg(0));
8402 }
8403 return true;
8404 }
8405
8406 case Intrinsic::amdgcn_fmed3: {
8407 GISelChangeObserver &Observer = Helper.Observer;
8408
8409 // FIXME: This is to workaround the inability of tablegen match combiners to
8410 // match intrinsics in patterns.
8411 Observer.changingInstr(MI);
8412 MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_FMED3));
8413 MI.removeOperand(1);
8414 Observer.changedInstr(MI);
8415 return true;
8416 }
// Cross-lane operations share a single legalizer.
8417 case Intrinsic::amdgcn_readlane:
8418 case Intrinsic::amdgcn_writelane:
8419 case Intrinsic::amdgcn_readfirstlane:
8420 case Intrinsic::amdgcn_permlane16:
8421 case Intrinsic::amdgcn_permlanex16:
8422 case Intrinsic::amdgcn_permlane64:
8423 case Intrinsic::amdgcn_set_inactive:
8424 case Intrinsic::amdgcn_set_inactive_chain_arg:
8425 case Intrinsic::amdgcn_mov_dpp8:
8426 case Intrinsic::amdgcn_update_dpp:
8427 return legalizeLaneOp(Helper, MI, IntrID);
8428 case Intrinsic::amdgcn_s_buffer_prefetch_data:
8429 return legalizeSBufferPrefetch(Helper, MI);
8430 case Intrinsic::amdgcn_dead: {
8431 // TODO: Use poison instead of undef
8432 for (const MachineOperand &Def : MI.defs())
8433 B.buildUndef(Def);
8434 MI.eraseFromParent();
8435 return true;
8436 }
// Cooperative atomics become plain G_LOAD/G_STORE carrying the
// intrinsic's memory operand.
8437 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
8438 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
8439 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
8440 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8441 B.buildLoad(MI.getOperand(0), MI.getOperand(2), **MI.memoperands_begin());
8442 MI.eraseFromParent();
8443 return true;
8444 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
8445 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
8446 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
8447 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8448 B.buildStore(MI.getOperand(2), MI.getOperand(1), **MI.memoperands_begin());
8449 MI.eraseFromParent();
8450 return true;
8451 case Intrinsic::amdgcn_flat_load_monitor_b32:
8452 case Intrinsic::amdgcn_flat_load_monitor_b64:
8453 case Intrinsic::amdgcn_flat_load_monitor_b128:
8454 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8455 B.buildInstr(AMDGPU::G_AMDGPU_FLAT_LOAD_MONITOR)
8456 .add(MI.getOperand(0))
8457 .add(MI.getOperand(2))
8458 .addMemOperand(*MI.memoperands_begin());
8459 MI.eraseFromParent();
8460 return true;
8461 case Intrinsic::amdgcn_global_load_monitor_b32:
8462 case Intrinsic::amdgcn_global_load_monitor_b64:
8463 case Intrinsic::amdgcn_global_load_monitor_b128:
8464 assert(MI.hasOneMemOperand() && "Expected IRTranslator to set MemOp!");
8465 B.buildInstr(AMDGPU::G_AMDGPU_GLOBAL_LOAD_MONITOR)
8466 .add(MI.getOperand(0))
8467 .add(MI.getOperand(2))
8468 .addMemOperand(*MI.memoperands_begin())
8469 MI.eraseFromParent();
8470 return true;
8471 default: {
// Image-dimension intrinsics are table-driven; everything else is
// already legal and needs no change.
// (dropped line 8473: the AMDGPU::getImageDimIntrinsicInfo lookup.)
8472 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
8474 return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
8475 return true;
8476 }
8477 }
8478
8479 return true;
8480}
MachineInstrBuilder & UseMI
MachineInstrBuilder MachineInstrBuilder & DefMI
static unsigned getIntrinsicID(const SDNode *N)
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID)
static LLT getBufferRsrcScalarType(const LLT Ty)
static LegalityPredicate isIllegalRegisterType(const GCNSubtarget &ST, unsigned TypeIdx)
static cl::opt< bool > EnableNewLegality("amdgpu-global-isel-new-legality", cl::desc("Use GlobalISel desired legality, rather than try to use" "rules compatible with selection patterns"), cl::init(false), cl::ReallyHidden)
static MachineInstrBuilder buildExp(MachineIRBuilder &B, const DstOp &Dst, const SrcOp &Src, unsigned Flags)
static bool needsDenormHandlingF32(const MachineFunction &MF, Register Src, unsigned Flags)
constexpr std::initializer_list< LLT > AllVectors
static LegalizeMutation bitcastToVectorElement32(unsigned TypeIdx)
static LegalityPredicate isSmallOddVector(unsigned TypeIdx)
static LegalizeMutation oneMoreElement(unsigned TypeIdx)
constexpr LLT F64
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size)
constexpr LLT V2S8
static bool allowApproxFunc(const MachineFunction &MF, unsigned Flags)
constexpr LLT V4S128
constexpr LLT S16
constexpr LLT S1
static bool shouldBitcastLoadStoreType(const GCNSubtarget &ST, const LLT Ty, const LLT MemTy)
Return true if a load or store of the type should be lowered with a bitcast to a different type.
constexpr LLT S1024
static constexpr unsigned FPEnvModeBitField
constexpr LLT V7S64
static LegalizeMutation getScalarTypeFromMemDesc(unsigned TypeIdx)
static LegalityPredicate vectorWiderThan(unsigned TypeIdx, unsigned Size)
static bool shouldWidenLoad(const GCNSubtarget &ST, LLT MemoryTy, uint64_t AlignInBits, unsigned AddrSpace, unsigned Opcode)
Return true if we should legalize a load by widening an odd sized memory access up to the alignment.
static bool isRegisterVectorElementType(LLT EltTy)
static LegalizeMutation fewerEltsToSize64Vector(unsigned TypeIdx)
static LegalityPredicate isWideVec16(unsigned TypeIdx)
constexpr std::initializer_list< LLT > AllScalarTypes
static LegalityPredicate isTruncStoreToSizePowerOf2(unsigned TypeIdx)
constexpr LLT V2S16
constexpr LLT V8S16
constexpr LLT V9S32
constexpr std::initializer_list< LLT > AllS32Vectors
constexpr LLT S224
static LegalizeMutation moreElementsToNextExistingRegClass(unsigned TypeIdx)
constexpr LLT S512
constexpr LLT MaxScalar
static Register castBufferRsrcToV4I32(Register Pointer, MachineIRBuilder &B)
Cast a buffer resource (an address space 8 pointer) into a 4xi32, which is the form in which the valu...
constexpr LLT V11S32
static bool isRegisterClassType(const GCNSubtarget &ST, LLT Ty)
constexpr LLT V6S64
constexpr LLT V2S64
static std::pair< Register, Register > emitReciprocalU64(MachineIRBuilder &B, Register Val)
static LLT getBitcastRegisterType(const LLT Ty)
static LLT getBufferRsrcRegisterType(const LLT Ty)
constexpr LLT S32
constexpr LLT V2F16
static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx)
static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI)
constexpr LLT V8S32
constexpr LLT V2BF16
constexpr LLT S192
static LLT castBufferRsrcFromV4I32(MachineInstr &MI, MachineIRBuilder &B, MachineRegisterInfo &MRI, unsigned Idx)
Mutates IR (typicaly a load instruction) to use a <4 x s32> as the initial type of the operand idx an...
static bool replaceWithConstant(MachineIRBuilder &B, MachineInstr &MI, int64_t C)
static constexpr unsigned SPDenormModeBitField
constexpr LLT F32
static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS, bool IsLoad, bool IsAtomic)
constexpr LLT V6S32
static bool isLoadStoreSizeLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
constexpr LLT S160
static MachineInstr * verifyCFIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineInstr *&Br, MachineBasicBlock *&UncondBrTarget, bool &Negated)
constexpr LLT V4S16
constexpr LLT V2S128
constexpr LLT V10S16
static LegalityPredicate numElementsNotEven(unsigned TypeIdx)
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT V6S16
constexpr std::initializer_list< LLT > AllS64Vectors
constexpr LLT S256
static void castBufferRsrcArgToV4I32(MachineInstr &MI, MachineIRBuilder &B, unsigned Idx)
constexpr LLT V4S64
static constexpr unsigned FPEnvTrapBitField
constexpr LLT V10S32
constexpr LLT V16S32
static constexpr unsigned MaxRegisterSize
constexpr LLT V7S32
constexpr LLT S96
constexpr LLT V12S16
constexpr LLT V16S64
static bool isRegisterSize(const GCNSubtarget &ST, unsigned Size)
static LegalityPredicate isWideScalarExtLoadTruncStore(unsigned TypeIdx)
static bool hasBufferRsrcWorkaround(const LLT Ty)
constexpr LLT V32S32
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B, const GCNSubtarget &ST, SIModeRegisterDefaults Mode)
constexpr LLT S64
constexpr std::initializer_list< LLT > AllS16Vectors
static bool loadStoreBitcastWorkaround(const LLT Ty)
static LLT widenToNextPowerOf2(LLT Ty)
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI)
constexpr LLT V16S16
static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI, int DimIdx, int NumVAddrs)
Convert from separate vaddr components to a single vector address register, and replace the remaining...
static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query)
static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx)
constexpr LLT V5S32
constexpr LLT V5S64
constexpr LLT V3S64
static LLT getPow2VectorType(LLT Ty)
static void buildBufferLoad(unsigned Opc, Register LoadDstReg, Register RSrc, Register VIndex, Register VOffset, Register SOffset, unsigned ImmOffset, unsigned Format, unsigned AuxiliaryData, MachineMemOperand *MMO, bool IsTyped, bool HasVIndex, MachineIRBuilder &B)
constexpr LLT V8S64
static LLT getPow2ScalarType(LLT Ty)
static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx)
constexpr LLT V2S32
static bool isRegisterVectorType(LLT Ty)
constexpr LLT V12S32
constexpr LLT S128
static LegalityPredicate sizeIsMultipleOf32(unsigned TypeIdx)
constexpr LLT S8
static bool isRegisterType(const GCNSubtarget &ST, LLT Ty)
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
This file declares the targeting of the MachineLegalizer class for AMDGPU.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static Error unsupported(const char *Str, const Triple &T)
Definition MachO.cpp:71
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
@ Enable
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
Interface for Targets to specify which operations they can successfully select and how the others sho...
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
This file declares the MachineIRBuilder class.
Register const TargetRegisterInfo * TRI
#define R2(n)
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
static MCRegister getReg(const MCDisassembler *D, unsigned RC, unsigned RegNo)
#define P(N)
ppc ctr loops verify
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
#define CH(x, y, z)
Definition SHA256.cpp:34
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1269
Interface definition for SIInstrInfo.
Interface definition for SIRegisterInfo.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static constexpr int Concat[]
bool legalizeConstHwRegRead(MachineInstr &MI, MachineIRBuilder &B, AMDGPU::Hwreg::Id HwReg, unsigned LowBit, unsigned Width) const
void buildMultiply(LegalizerHelper &Helper, MutableArrayRef< Register > Accum, ArrayRef< Register > Src0, ArrayRef< Register > Src1, bool UsePartialMad64_32, bool SeparateOddAlignedProducts) const
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeIntrinsicTrunc(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeInsert(LegalizerHelper &Helper, MachineInstr &MI) const
std::pair< Register, unsigned > splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const
bool legalizeBVHIntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned AddrSpace) const
bool legalizeUnsignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeTrapHsa(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferStore(MachineInstr &MI, LegalizerHelper &Helper, bool IsTyped, bool IsFormat) const
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFFREXP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getSegmentAperture(unsigned AddrSpace, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePointerAsRsrcIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
To create a buffer resource from a 64-bit pointer, mask off the upper 32 bits of the pointer and repl...
bool legalizeFlogCommon(MachineInstr &MI, MachineIRBuilder &B) const
bool getLDSKernelId(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExp2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B, Intrinsic::ID IID) const
void legalizeUnsignedDIV_REM32Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI, Register Reg, bool ImageStore=false) const
Handle register layout difference for f16 images for some subtargets.
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
AMDGPULegalizerInfo(const GCNSubtarget &ST, const GCNTargetMachine &TM)
bool legalizeFDIV32(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFMad(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSBufferPrefetch(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFExp10Unsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeIntrinsic(LegalizerHelper &Helper, MachineInstr &MI) const override
bool legalizeFrem(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizePreloadedArgIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeStore(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI, LostDebugLocObserver &LocObserver) const override
Called for instructions with the Custom LegalizationAction.
bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, int64_t Offset, unsigned GAFlags=SIInstrInfo::MO_NONE) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeRsqClampIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafeImpl(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags, bool IsExp10) const
std::pair< Register, Register > getScaledLogInput(MachineIRBuilder &B, Register Src, unsigned Flags) const
bool legalizeFDIVFastIntrin(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool loadInputValue(Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
bool legalizeBVHDualOrBVH8IntersectRayIntrinsic(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFExpUnsafe(MachineIRBuilder &B, Register Dst, Register Src, unsigned Flags) const
bool legalizeFEXPF64(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeAddrSpaceCast(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtract(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeBufferLoad(MachineInstr &MI, LegalizerHelper &Helper, bool IsFormat, bool IsTyped) const
bool legalizeImplicitArgPtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const
void legalizeUnsignedDIV_REM64Impl(MachineIRBuilder &B, Register DstDivReg, Register DstRemReg, Register Num, Register Den) const
bool legalizeDebugTrap(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeCTLS(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWaveID(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFroundeven(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLDSKernelId(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkGroupId(MachineInstr &MI, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ClusterIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV, AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const
bool legalizeSignedDIV_REM(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFastUnsafeFDIV64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, bool Signed) const
bool legalizeStackSave(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeFlogUnsafe(MachineIRBuilder &B, Register Dst, Register Src, bool IsLog10, unsigned Flags) const
bool legalizeKernargMemParameter(MachineInstr &MI, MachineIRBuilder &B, uint64_t Offset, Align Alignment=Align(4)) const
Legalize a value that's loaded from kernel arguments.
bool legalizeImageIntrinsic(MachineInstr &MI, MachineIRBuilder &B, GISelChangeObserver &Observer, const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const
Rewrite image intrinsics to use register layouts expected by the subtarget.
void buildAbsGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV, MachineRegisterInfo &MRI) const
bool legalizeGetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool getImplicitArgPtr(Register DstReg, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRT(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register getKernargParameterPtr(MachineIRBuilder &B, int64_t Offset) const
bool legalizeSBufferLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeFceil(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFSQRTF64(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeLoad(LegalizerHelper &Helper, MachineInstr &MI) const
bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
Register fixStoreSourceType(MachineIRBuilder &B, Register VData, LLT MemTy, bool IsFormat) const
bool legalizeLaneOp(LegalizerHelper &Helper, MachineInstr &MI, Intrinsic::ID IID) const
bool legalizeSetFPEnv(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeWorkitemIDIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B, unsigned Dim, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const
void buildLoadInputValue(Register DstReg, MachineIRBuilder &B, const ArgDescriptor *Arg, const TargetRegisterClass *ArgRC, LLT ArgTy) const
bool legalizeTrapHsaQueuePtr(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
bool legalizeFlog2(MachineInstr &MI, MachineIRBuilder &B) const
bool legalizeTrapEndpgm(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1213
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1193
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1153
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:679
@ ICMP_SLT
signed less than
Definition InstrTypes.h:705
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:682
@ FCMP_ULE
1 1 0 1 True if unordered, less than, or equal
Definition InstrTypes.h:691
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:680
@ ICMP_UGE
unsigned greater or equal
Definition InstrTypes.h:700
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:703
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:684
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:683
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:685
@ ICMP_NE
not equal
Definition InstrTypes.h:698
@ FCMP_UGE
1 0 1 1 True if unordered, greater than, or equal
Definition InstrTypes.h:689
This is the shared class of boolean and integer constants.
Definition Constants.h:87
int64_t getSExtValue() const
Return the constant as a 64-bit integer value after it has been sign extended as appropriate for the ...
Definition Constants.h:174
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
Abstract class that contains various methods for clients to notify about changes.
virtual void changingInstr(MachineInstr &MI)=0
This instruction is about to be mutated in some way.
virtual void changedInstr(MachineInstr &MI)=0
This instruction was mutated in some way.
Simple wrapper observer that takes several observers, and calls each one for each event.
KnownBits getKnownBits(Register R)
bool hasExternalLinkage() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:135
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:563
static constexpr LLT float64()
Get a 64-bit IEEE double value.
constexpr unsigned getScalarSizeInBits() const
constexpr bool isScalar() const
constexpr LLT changeElementType(LLT NewEltTy) const
If this type is a vector, return a vector with the same number of elements but the new element type.
static constexpr LLT vector(ElementCount EC, unsigned ScalarSizeInBits)
Get a low-level vector of some number of elements and element width.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
constexpr uint16_t getNumElements() const
Returns the number of elements in a vector LLT.
constexpr bool isVector() const
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
constexpr bool isPointer() const
constexpr LLT getElementType() const
Returns the vector's element type. Only valid for vector types.
constexpr LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
static constexpr LLT float16()
Get a 16-bit IEEE half value.
constexpr unsigned getAddressSpace() const
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
constexpr LLT changeElementCount(ElementCount EC) const
Return a vector or scalar with the same element type and the new element count.
constexpr LLT getScalarType() const
static constexpr LLT scalarOrVector(ElementCount EC, LLT ScalarTy)
static constexpr LLT float32()
Get a 32-bit IEEE float value.
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI void computeTables()
Compute any ancillary tables needed to quickly decide how an operation should be handled.
LegalizeRuleSet & minScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at least as wide as Ty.
LegalizeRuleSet & legalFor(std::initializer_list< LLT > Types)
The instruction is legal when type index 0 is any type in the given list.
LegalizeRuleSet & unsupported()
The instruction is unsupported.
LegalizeRuleSet & scalarSameSizeAs(unsigned TypeIdx, unsigned SameSizeIdx)
Change the type TypeIdx to have the same scalar size as type SameSizeIdx.
LegalizeRuleSet & fewerElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Remove elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & clampScalarOrElt(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & bitcastIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
The specified type index is coerced if predicate is true.
LegalizeRuleSet & maxScalar(unsigned TypeIdx, const LLT Ty)
Ensure the scalar is at most as wide as Ty.
LegalizeRuleSet & minScalarOrElt(unsigned TypeIdx, const LLT Ty)
Ensure the scalar or element is at least as wide as Ty.
LegalizeRuleSet & clampMaxNumElements(unsigned TypeIdx, const LLT EltTy, unsigned MaxElements)
Limit the number of elements in EltTy vectors to at most MaxElements.
LegalizeRuleSet & unsupportedFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & lower()
The instruction is lowered.
LegalizeRuleSet & moreElementsIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Add more elements to reach the type selected by the mutation if the predicate is true.
LegalizeRuleSet & lowerFor(std::initializer_list< LLT > Types)
The instruction is lowered when type index 0 is any type in the given list.
LegalizeRuleSet & clampScalar(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the range of scalar sizes to MinTy and MaxTy.
LegalizeRuleSet & custom()
Unconditionally custom lower.
LegalizeRuleSet & clampMaxNumElementsStrict(unsigned TypeIdx, const LLT EltTy, unsigned NumElts)
Express EltTy vectors strictly using vectors with NumElts elements (or scalars when NumElts equals 1)...
LegalizeRuleSet & unsupportedIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarIf(LegalityPredicate Predicate, LegalizeMutation Mutation)
Widen the scalar to the one selected by the mutation if the predicate is true.
LegalizeRuleSet & alwaysLegal()
LegalizeRuleSet & clampNumElements(unsigned TypeIdx, const LLT MinTy, const LLT MaxTy)
Limit the number of elements for the given vectors to at least MinTy's number of elements and at most...
LegalizeRuleSet & maxScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, const LLT Ty)
Conditionally limit the maximum size of the scalar.
LegalizeRuleSet & customIf(LegalityPredicate Predicate)
LegalizeRuleSet & widenScalarToNextPow2(unsigned TypeIdx, unsigned MinSize=0)
Widen the scalar to the next power of two that is at least MinSize.
LegalizeRuleSet & scalarize(unsigned TypeIdx)
LegalizeRuleSet & legalForCartesianProduct(std::initializer_list< LLT > Types)
The instruction is legal when type indexes 0 and 1 are both in the given list.
LegalizeRuleSet & legalIf(LegalityPredicate Predicate)
The instruction is legal if predicate is true.
LegalizeRuleSet & customFor(std::initializer_list< LLT > Types)
LegalizeRuleSet & widenScalarToNextMultipleOf(unsigned TypeIdx, unsigned Size)
Widen the scalar to the next multiple of Size.
LLVM_ABI LegalizeResult lowerFMinNumMaxNum(MachineInstr &MI)
LLVM_ABI void moreElementsVectorDst(MachineInstr &MI, LLT MoreTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a Def by performing it with addition...
LLVM_ABI LegalizeResult lowerInsert(MachineInstr &MI)
LLVM_ABI LegalizeResult lowerExtract(MachineInstr &MI)
GISelValueTracking * getValueTracking() const
@ Legalized
Instruction has been legalized and the MachineFunction changed.
GISelChangeObserver & Observer
To keep track of changes made by the LegalizerHelper.
LLVM_ABI void bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx)
Legalize a single operand OpIdx of the machine instruction MI as a def by inserting a G_BITCAST from ...
LLVM_ABI LegalizeResult lowerFMad(MachineInstr &MI)
MachineIRBuilder & MIRBuilder
Expose MIRBuilder so clients can set their own RecordInsertInstruction functions.
LLVM_ABI void widenScalarDst(MachineInstr &MI, LLT WideTy, unsigned OpIdx=0, unsigned TruncOpcode=TargetOpcode::G_TRUNC)
Legalize a single operand OpIdx of the machine instruction MI as a Def by extending the operand's typ...
LegalizeRuleSet & getActionDefinitionsBuilder(unsigned Opcode)
Get the action definition builder for the given opcode.
const LegacyLegalizerInfo & getLegacyLegalizerInfo() const
TypeSize getValue() const
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition MCRegister.h:72
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
MachineInstrBundleIterator< MachineInstr > iterator
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
Helper class to build MachineInstr.
MachineFunction & getMF()
Getter for the function we currently build.
Register getReg(unsigned Idx) const
Get the register for the operand index.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & addGlobalAddress(const GlobalValue *GV, int64_t Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
LLT getMemoryType() const
Return the memory type of the memory reference.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
MachineBasicBlock * getMBB() const
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
void setMBB(MachineBasicBlock *MBB)
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
use_instr_nodbg_iterator use_instr_nodbg_begin(Register RegNo) const
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register createGenericVirtualRegister(LLT Ty, StringRef Name="")
Create and return a new generic virtual register with low-level type Ty.
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
const TargetRegisterInfo * getTargetRegisterInfo() const
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
MutableArrayRef - Represent a mutable reference to an array (0 or more elements consecutively in memo...
Definition ArrayRef.h:298
MutableArrayRef< T > drop_front(size_t N=1) const
Drop the first N elements of the array.
Definition ArrayRef.h:387
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
constexpr bool isValid() const
Definition Register.h:112
constexpr bool isVirtual() const
Return true if the specified register number is in the virtual register namespace.
Definition Register.h:79
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
bool shouldEmitFixup(const GlobalValue *GV) const
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool shouldEmitPCReloc(const GlobalValue *GV) const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void truncate(size_type N)
Like resize, but requires that N is less than size().
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
int64_t getImm() const
Register getReg() const
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.stacksave/llvm.stackrestore should save...
unsigned getPointerSizeInBits(unsigned AS) const
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
self_iterator getIterator()
Definition ilist_node.h:123
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
bool isFlatGlobalAddrSpace(unsigned AS)
bool isGFX12Plus(const MCSubtargetInfo &STI)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isGFX11Plus(const MCSubtargetInfo &STI)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
std::pair< Register, unsigned > getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg, GISelValueTracking *ValueTracking=nullptr, bool CheckNUW=false)
Returns base register and constant offset.
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI LegalityPredicate scalarOrEltWiderThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or a vector with an element type that's wider than the ...
LLVM_ABI LegalityPredicate isScalar(unsigned TypeIdx)
True iff the specified type index is a scalar.
LLVM_ABI LegalityPredicate isPointer(unsigned TypeIdx)
True iff the specified type index is a pointer (with any address space).
LLVM_ABI LegalityPredicate typeInSet(unsigned TypeIdx, std::initializer_list< LLT > TypesInit)
True iff the given type index is one of the specified types.
LLVM_ABI LegalityPredicate smallerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a smaller total bit size than second type index.
LLVM_ABI LegalityPredicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the first type index has a larger total bit size than second type index.
LLVM_ABI LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT EltTy)
True if the type index is a vector with element type EltTy.
LLVM_ABI LegalityPredicate sameSize(unsigned TypeIdx0, unsigned TypeIdx1)
True iff the specified type indices are both the same bit size.
LLVM_ABI LegalityPredicate scalarOrEltNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar or vector with an element type that's narrower than the...
LegalityPredicate typeIsNot(unsigned TypeIdx, LLT Type)
True iff the given type index is not the specified type.
Predicate all(Predicate P0, Predicate P1)
True iff P0 and P1 are true.
LLVM_ABI LegalityPredicate typeIs(unsigned TypeIdx, LLT TypesInit)
True iff the given type index is the specified type.
LLVM_ABI LegalityPredicate scalarNarrowerThan(unsigned TypeIdx, unsigned Size)
True iff the specified type index is a scalar that's narrower than the given size.
LLVM_ABI LegalizeMutation scalarize(unsigned TypeIdx)
Break up the vector type for the given type index into the element type.
LLVM_ABI LegalizeMutation widenScalarOrEltToNextPow2(unsigned TypeIdx, unsigned Min=0)
Widen the scalar type or vector element type for the given type index to the next power of 2.
LLVM_ABI LegalizeMutation changeTo(unsigned TypeIdx, LLT Ty)
Select this specific type for the given type index.
Invariant opcodes: All instruction sets have these as their low opcodes.
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
LLVM_ABI Register getFunctionLiveInPhysReg(MachineFunction &MF, const TargetInstrInfo &TII, MCRegister PhysReg, const TargetRegisterClass &RC, const DebugLoc &DL, LLT RegTy=LLT())
Return a virtual register corresponding to the incoming argument register PhysReg.
Definition Utils.cpp:916
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:532
LLVM_ABI Type * getTypeForLLT(LLT Ty, LLVMContext &C)
Get the type back from LLT.
Definition Utils.cpp:2036
LLVM_ABI MachineInstr * getOpcodeDef(unsigned Opcode, Register Reg, const MachineRegisterInfo &MRI)
See if Reg is defined by an single def instruction that is Opcode.
Definition Utils.cpp:652
LLVM_ABI const ConstantFP * getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI)
Definition Utils.cpp:460
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Undef
Value of the register doesn't matter.
LLVM_ABI const llvm::fltSemantics & getFltSemanticForLLT(LLT Ty)
Get the appropriate floating point arithmetic semantic based on the bit size of the given scalar LLT.
std::function< std::pair< unsigned, LLT >(const LegalityQuery &)> LegalizeMutation
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:323
void * PointerTy
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:154
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
LLVM_ABI std::optional< int64_t > getIConstantVRegSExtVal(Register VReg, const MachineRegisterInfo &MRI)
If VReg is defined by a G_CONSTANT fits in int64_t returns it.
Definition Utils.cpp:313
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:147
std::function< bool(const LegalityQuery &)> LegalityPredicate
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ FMul
Product of floats.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
LLVM_ABI void eraseInstr(MachineInstr &MI, MachineRegisterInfo &MRI, LostDebugLocObserver *LocObserver=nullptr)
Definition Utils.cpp:1720
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:432
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
T bit_floor(T Value)
Returns the largest integral power of two no greater than Value if Value is nonzero.
Definition bit.h:345
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
static constexpr uint64_t encode(Fields... Values)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
MCRegister getRegister() const
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
bool isZero() const
Returns true if value is all zero.
Definition KnownBits.h:80
The LegalityQuery object bundles together all the information that's needed to decide whether a given...
ArrayRef< MemDesc > MMODescrs
Operations which require memory can use this to place requirements on the memory type for each MMO.
ArrayRef< LLT > Types
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
bool IEEE
Floating point opcodes that support exception flag gathering quiet and propagate signaling NaN inputs...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.