LLVM 23.0.0git
NVPTXISelLowering.cpp
Go to the documentation of this file.
1//===-- NVPTXISelLowering.cpp - NVPTX DAG Lowering Implementation ---------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that NVPTX uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "NVPTXISelLowering.h"
16#include "NVPTX.h"
17#include "NVPTXISelDAGToDAG.h"
20#include "NVPTXSubtarget.h"
21#include "NVPTXTargetMachine.h"
23#include "NVPTXUtilities.h"
24#include "NVVMProperties.h"
25#include "llvm/ADT/APFloat.h"
26#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/STLExtras.h"
29#include "llvm/ADT/StringRef.h"
42#include "llvm/IR/Argument.h"
43#include "llvm/IR/Attributes.h"
44#include "llvm/IR/Constants.h"
45#include "llvm/IR/DataLayout.h"
48#include "llvm/IR/FPEnv.h"
49#include "llvm/IR/Function.h"
50#include "llvm/IR/GlobalValue.h"
51#include "llvm/IR/IRBuilder.h"
52#include "llvm/IR/Instruction.h"
54#include "llvm/IR/IntrinsicsNVPTX.h"
55#include "llvm/IR/Module.h"
56#include "llvm/IR/Type.h"
57#include "llvm/IR/Value.h"
69#include <algorithm>
70#include <cassert>
71#include <cmath>
72#include <cstdint>
73#include <iterator>
74#include <optional>
75#include <string>
76#include <tuple>
77#include <utility>
78#include <vector>
79
80#define DEBUG_TYPE "nvptx-lower"
81
82using namespace llvm;
83
85 "nvptx-sched4reg",
86 cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false));
87
89 "nvptx-fma-level", cl::Hidden,
90 cl::desc("NVPTX Specific: FMA contraction (0: don't do it"
91 " 1: do it 2: do it aggressively"),
92 cl::init(2));
93
95 "nvptx-prec-divf32", cl::Hidden,
97 "NVPTX Specific: Override the precision of the lowering for f32 fdiv"),
99 clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"),
100 clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"),
102 "Use IEEE Compliant F32 div.rnd if available (default)"),
104 "Use IEEE Compliant F32 div.rnd if available, no FTZ")),
106
108 "nvptx-prec-sqrtf32", cl::Hidden,
109 cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
110 cl::init(true));
111
112// PTX atom.add.f32 has fixed FTZ behavior that may not match the function's
113// (see shouldExpandAtomicRMWInIR), so by default we fall back to a CAS loop
114// when they disagree. This flag is an escape hatch to use atom.add anyway,
115// trading correct denormal handling for the speed of the native instruction.
117 "nvptx-allow-ftz-atomics", cl::Hidden,
118 cl::desc("NVPTX Specific: Lower atomicrmw fadd to atom.add even when its "
119 "FTZ behavior does not match the function's denormal mode."),
120 cl::init(false));
121
122/// Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it
123/// does NOT use lg2.approx for log2, so this is disabled by default.
125 "nvptx-approx-log2f32",
126 cl::desc("NVPTX Specific: whether to use lg2.approx for log2"),
127 cl::init(false));
128
131 const SDNode &N) const {
132 // If nvptx-prec-divf32=N is used on the command-line, always honor it
133 if (UsePrecDivF32.getNumOccurrences() > 0)
134 return UsePrecDivF32;
135
136 const SDNodeFlags Flags = N.getFlags();
137 if (Flags.hasApproximateFuncs())
139
141}
142
144 // If nvptx-prec-sqrtf32 is used on the command-line, always honor it
145 if (UsePrecSqrtF32.getNumOccurrences() > 0)
146 return UsePrecSqrtF32;
147
148 if (N) {
149 const SDNodeFlags Flags = N->getFlags();
150 if (Flags.hasApproximateFuncs())
151 return false;
152 }
153
154 return true;
155}
156
161
// Returns true if \p VT is one of the vector value types enumerated in the
// switch below, and false for every other type (the default case).
// The trailing comments on some cases show how that type decomposes into
// packed sub-vectors (e.g. v8i8 is viewed as <2 x i8x4>).
162static bool IsPTXVectorType(MVT VT) {
163 switch (VT.SimpleTy) {
164 default:
165 return false;
166 case MVT::v2i1:
167 case MVT::v4i1:
168 case MVT::v2i8:
169 case MVT::v4i8:
170 case MVT::v8i8: // <2 x i8x4>
171 case MVT::v16i8: // <4 x i8x4>
172 case MVT::v2i16:
173 case MVT::v4i16:
174 case MVT::v8i16: // <4 x i16x2>
175 case MVT::v2i32:
176 case MVT::v4i32:
177 case MVT::v2i64:
178 case MVT::v2f16:
179 case MVT::v4f16:
180 case MVT::v8f16: // <4 x f16x2>
181 case MVT::v2bf16:
182 case MVT::v4bf16:
183 case MVT::v8bf16: // <4 x bf16x2>
184 case MVT::v2f32:
185 case MVT::v4f32:
186 case MVT::v2f64:
187 case MVT::v4i64:
188 case MVT::v4f64:
189 case MVT::v8i32:
190 case MVT::v8f32:
191 case MVT::v16f16: // <8 x f16x2>
192 case MVT::v16bf16: // <8 x bf16x2>
193 case MVT::v16i16: // <8 x i16x2>
194 case MVT::v32i8: // <8 x i8x4>
195 return true;
196 }
197}
198
199// When legalizing vector loads/stores, this function is called, which does two
200// things:
201// 1. Determines whether the vector is something we want to custom lower;
202// std::nullopt is returned if we do not want to custom lower it.
203// 2. If we do want to handle it, returns a pair of:
204// - unsigned int NumElts - The number of elements in the final vector
205// - MVT EltVT - The type of the elements in the final vector
206static std::optional<std::pair<unsigned int, MVT>>
208 unsigned AddressSpace) {
209 const bool CanLowerTo256Bit = STI.has256BitVectorLoadStore(AddressSpace);
210
211 if (CanLowerTo256Bit && VectorEVT.isScalarInteger() &&
212 VectorEVT.getSizeInBits() == 256)
213 return {{4, MVT::i64}};
214
215 if (!VectorEVT.isSimple())
216 return std::nullopt;
217 const MVT VectorVT = VectorEVT.getSimpleVT();
218
219 if (!VectorVT.isVector()) {
220 if (VectorVT == MVT::i128 || VectorVT == MVT::f128)
221 return {{2, MVT::i64}};
222 return std::nullopt;
223 }
224
225 const MVT EltVT = VectorVT.getVectorElementType();
226 const unsigned NumElts = VectorVT.getVectorNumElements();
227
228 // The size of the PTX virtual register that holds a packed type.
229 unsigned PackRegSize;
230
231 // We only handle "native" vector sizes for now, e.g. <4 x double> is not
232 // legal. We can (and should) split that into 2 stores of <2 x double> here
233 // but I'm leaving that as a TODO for now.
234 switch (VectorVT.SimpleTy) {
235 default:
236 return std::nullopt;
237
238 case MVT::v4i64:
239 case MVT::v4f64:
240 // This is a "native" vector type iff the address space is global and the
241 // target supports 256-bit loads/stores
242 if (!CanLowerTo256Bit)
243 return std::nullopt;
244 [[fallthrough]];
245 case MVT::v2i8:
246 case MVT::v2i64:
247 case MVT::v2f64:
248 // This is a "native" vector type
249 return std::pair(NumElts, EltVT);
250
251 case MVT::v16f16: // <8 x f16x2>
252 case MVT::v16bf16: // <8 x bf16x2>
253 case MVT::v16i16: // <8 x i16x2>
254 case MVT::v32i8: // <8 x i8x4>
255 // This can be upsized into a "native" vector type iff the address space is
256 // global and the target supports 256-bit loads/stores.
257 if (!CanLowerTo256Bit)
258 return std::nullopt;
259 [[fallthrough]];
260 case MVT::v2i16: // <1 x i16x2>
261 case MVT::v2f16: // <1 x f16x2>
262 case MVT::v2bf16: // <1 x bf16x2>
263 case MVT::v4i8: // <1 x i8x4>
264 case MVT::v4i16: // <2 x i16x2>
265 case MVT::v4f16: // <2 x f16x2>
266 case MVT::v4bf16: // <2 x bf16x2>
267 case MVT::v8i8: // <2 x i8x4>
268 case MVT::v8f16: // <4 x f16x2>
269 case MVT::v8bf16: // <4 x bf16x2>
270 case MVT::v8i16: // <4 x i16x2>
271 case MVT::v16i8: // <4 x i8x4>
272 PackRegSize = 32;
273 break;
274
275 case MVT::v8f32: // <4 x f32x2>
276 case MVT::v8i32: // <4 x i32x2>
277 // This is a "native" vector type iff the address space is global and the
278 // target supports 256-bit loads/stores
279 if (!CanLowerTo256Bit)
280 return std::nullopt;
281 [[fallthrough]];
282 case MVT::v2f32: // <1 x f32x2>
283 case MVT::v4f32: // <2 x f32x2>
284 case MVT::v2i32: // <1 x i32x2>
285 case MVT::v4i32: // <2 x i32x2>
286 if (!STI.hasF32x2Instructions())
287 return std::pair(NumElts, EltVT);
288 PackRegSize = 64;
289 break;
290 }
291
292 // If we reach here, then we can pack 2 or more elements into a single 32-bit
293 // or 64-bit PTX register and treat the vector as a new vector containing
294 // packed elements.
295
296 // Number of elements to pack in one word.
297 const unsigned NPerReg = PackRegSize / EltVT.getSizeInBits();
298
299 return std::pair(NumElts / NPerReg, MVT::getVectorVT(EltVT, NPerReg));
300}
301
302/// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
303/// legal-ish MVTs that compose it. Unlike ComputeValueVTs, this will legalize
304/// the types as required by the calling convention (with special handling for
305/// i8s).
306/// NOTE: This is a band-aid for code that expects ComputeValueVTs to return the
307/// same number of types as the Ins/Outs arrays in LowerFormalArguments,
308/// LowerCall, and LowerReturn.
309static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
310 LLVMContext &Ctx, CallingConv::ID CallConv,
311 Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
313 uint64_t StartingOffset = 0) {
314 SmallVector<EVT, 16> TempVTs;
315 SmallVector<uint64_t, 16> TempOffsets;
316 ComputeValueVTs(TLI, DL, Ty, TempVTs, /*MemVTs=*/nullptr, &TempOffsets,
317 StartingOffset);
318
319 for (const auto [VT, Off] : zip(TempVTs, TempOffsets)) {
320 MVT RegisterVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
321 unsigned NumRegs = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
322
323 // Since we actually can load/store b8, we need to ensure that we'll use
324 // the original sized type for any i8s or i8 vectors.
325 if (VT.getScalarType() == MVT::i8) {
326 if (RegisterVT == MVT::i16)
327 RegisterVT = MVT::i8;
328 else if (RegisterVT == MVT::v2i16)
329 RegisterVT = MVT::v2i8;
330 else
331 assert(RegisterVT == MVT::v4i8 &&
332 "Expected v4i8, v2i16, or i16 for i8 RegisterVT");
333 }
334
335 // TODO: This is horribly incorrect for cases where the vector elements are
336 // not a multiple of bytes (ex i1) and legal or i8. However, this problem
337 // has existed for as long as NVPTX has and no one has complained, so we'll
338 // leave it for now.
339 for (unsigned I : seq(NumRegs)) {
340 ValueVTs.push_back(RegisterVT);
341 Offsets.push_back(Off + I * RegisterVT.getStoreSize());
342 }
343 }
344}
345
346// Returns an EVT that can hold N values of type VT (VT itself when N == 1).
347// If VT is a vector, the result is a flat vector with VT's element type and
348// N times as many elements; otherwise it is an N-element vector of VT.
349static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C) {
350 if (N == 1)
351 return VT;
352
353 return VT.isVector() ? EVT::getVectorVT(C, VT.getScalarType(),
354 VT.getVectorNumElements() * N)
355 : EVT::getVectorVT(C, VT, N);
356}
357
359 const SDLoc &dl, SelectionDAG &DAG) {
360 if (V.getValueType() == VT) {
361 assert(I == 0 && "Index must be 0 for scalar value");
362 return V;
363 }
364
365 if (!VT.isVector())
366 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, V,
367 DAG.getVectorIdxConstant(I, dl));
368
369 return DAG.getNode(
370 ISD::EXTRACT_SUBVECTOR, dl, VT, V,
372}
373
374template <typename T>
375static inline SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl,
376 SelectionDAG &DAG, T GetElement) {
377 if (N == 1)
378 return GetElement(0);
379
381 for (const unsigned I : llvm::seq(N)) {
382 SDValue Val = GetElement(I);
383 if (Val.getValueType().isVector())
385 else
386 Values.push_back(Val);
387 }
388
389 EVT VT = EVT::getVectorVT(*DAG.getContext(), Values[0].getValueType(),
390 Values.size());
391 return DAG.getBuildVector(VT, dl, Values);
392}
393
394/// PromoteScalarIntegerPTX
395/// Used to make sure the arguments/returns are suitable for passing
396/// and promote them to a larger size if they're not.
397///
398/// Returns the promoted type; types that are not scalar integers are
398/// returned unchanged.
400 if (VT.isScalarInteger()) {
401 switch (PowerOf2Ceil(VT.getFixedSizeInBits())) {
402 default:
404 "Promotion is not suitable for scalars of size larger than 64-bits");
405 case 1:
406 return MVT::i1;
407 case 2:
408 case 4:
409 case 8:
410 return MVT::i8;
411 case 16:
412 return MVT::i16;
413 case 32:
414 return MVT::i32;
415 case 64:
416 return MVT::i64;
417 }
418 }
419 return VT;
420}
421
422// Check whether we can merge loads/stores of some of the pieces of a
423// flattened function parameter or return value into a single vector
424// load/store.
425//
426// The flattened parameter is represented as a list of EVTs and
427// offsets, and the whole structure is aligned to ParamAlignment. This
428// function determines whether we can load/store pieces of the
429// parameter starting at index Idx using a single vectorized op of
430// size AccessSize. If so, it returns the number of param pieces
431// covered by the vector op. Otherwise, it returns 1.
432template <typename T>
434 unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
435 const SmallVectorImpl<T> &Offsets, Align ParamAlignment) {
436
437 // Can't vectorize if param alignment is not sufficient.
438 if (ParamAlignment < AccessSize)
439 return 1;
440 // Can't vectorize if offset is not aligned.
441 if (Offsets[Idx] & (AccessSize - 1))
442 return 1;
443
444 EVT EltVT = ValueVTs[Idx];
445 unsigned EltSize = EltVT.getStoreSize();
446
447 // Element is too large to vectorize.
448 if (EltSize >= AccessSize)
449 return 1;
450
451 unsigned NumElts = AccessSize / EltSize;
452 // Can't vectorize if AccessSize is not a multiple of EltSize.
453 if (AccessSize != EltSize * NumElts)
454 return 1;
455
456 // We don't have enough elements to vectorize.
457 if (Idx + NumElts > ValueVTs.size())
458 return 1;
459
460 // PTX ISA can only deal with 2- and 4-element vector ops.
461 if (NumElts != 4 && NumElts != 2)
462 return 1;
463
464 for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
465 // Types do not match.
466 if (ValueVTs[j] != EltVT)
467 return 1;
468
469 // Elements are not contiguous.
470 if (Offsets[j] - Offsets[j - 1] != EltSize)
471 return 1;
472 }
473 // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
474 return NumElts;
475}
476
477// Computes whether and how we can vectorize the loads/stores of a
478// flattened function parameter or return value.
479//
480// The flattened parameter is represented as the list of ValueVTs and
481// Offsets, and is aligned to ParamAlignment bytes. We return a vector
482// of the same size as ValueVTs indicating how each piece should be
483// loaded/stored (i.e. as a scalar, or as part of a vector
484// load/store).
485template <typename T>
488 const SmallVectorImpl<T> &Offsets, Align ParamAlignment,
489 bool IsVAArg = false) {
490 // Set vector size to match ValueVTs and mark all elements as
491 // scalars by default.
492
493 if (IsVAArg)
494 return SmallVector<unsigned>(ValueVTs.size(), 1);
495
496 SmallVector<unsigned, 16> VectorInfo;
497
498 const auto GetNumElts = [&](unsigned I) -> unsigned {
499 for (const unsigned AccessSize : {16, 8, 4, 2}) {
500 const unsigned NumElts = canMergeParamLoadStoresStartingAt(
501 I, AccessSize, ValueVTs, Offsets, ParamAlignment);
502 assert((NumElts == 1 || NumElts == 2 || NumElts == 4) &&
503 "Unexpected vectorization size");
504 if (NumElts != 1)
505 return NumElts;
506 }
507 return 1;
508 };
509
510 // Check what we can vectorize using 128/64/32/16-bit accesses.
511 for (unsigned I = 0, E = ValueVTs.size(); I != E;) {
512 const unsigned NumElts = GetNumElts(I);
513 VectorInfo.push_back(NumElts);
514 I += NumElts;
515 }
516 assert(std::accumulate(VectorInfo.begin(), VectorInfo.end(), 0u) ==
517 ValueVTs.size());
518 return VectorInfo;
519}
520
521// NVPTXTargetLowering Constructor.
523 const NVPTXSubtarget &STI)
524 : TargetLowering(TM, STI), nvTM(&TM), STI(STI), GlobalUniqueCallSite(0) {
525 // Always lower memset, memcpy, and memmove intrinsics to load/store
526 // instructions, rather
527 // than generating calls to memset, memcpy or memmove.
531
534
535 // Jump is Expensive. Don't create extra control flow for 'and', 'or'
536 // condition branches.
537 setJumpIsExpensive(true);
538
539 // Wide divides are _very_ slow. Try to reduce the width of the divide if
540 // possible.
541 addBypassSlowDiv(64, 32);
542
543 // By default, use the Source scheduling
544 if (sched4reg)
546 else
548
549 auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
550 LegalizeAction NoF16Action) {
551 bool IsOpSupported = STI.allowFP16Math();
552 switch (Op) {
553 // Several FP16 instructions are only available on sm_80+ with PTX 7.0+.
554 case ISD::FMINNUM:
555 case ISD::FMAXNUM:
558 case ISD::FMAXIMUM:
559 case ISD::FMINIMUM:
560 case ISD::FMAXIMUMNUM:
561 case ISD::FMINIMUMNUM:
562 IsOpSupported &= STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
563 break;
564 case ISD::FEXP2:
565 IsOpSupported &= STI.getSmVersion() >= 75 && STI.getPTXVersion() >= 70;
566 break;
567 }
568 setOperationAction(Op, VT, IsOpSupported ? Action : NoF16Action);
569 };
570
571 auto setBF16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
572 LegalizeAction NoBF16Action) {
573 bool IsOpSupported = STI.hasNativeBF16Support(Op);
575 Op, VT, IsOpSupported ? Action : NoBF16Action);
576 };
577
578 auto setI16x2OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
579 LegalizeAction NoI16x2Action) {
580 bool IsOpSupported = false;
581 // These instructions are only available on sm_90+ with PTX 8.0+.
582 switch (Op) {
583 case ISD::ADD:
584 case ISD::SMAX:
585 case ISD::SMIN:
586 case ISD::UMIN:
587 case ISD::UMAX:
588 IsOpSupported = STI.getSmVersion() >= 90 && STI.getPTXVersion() >= 80;
589 break;
590 }
591 setOperationAction(Op, VT, IsOpSupported ? Action : NoI16x2Action);
592 };
593
594 addRegisterClass(MVT::i1, &NVPTX::B1RegClass);
595 addRegisterClass(MVT::i16, &NVPTX::B16RegClass);
596 addRegisterClass(MVT::v2i16, &NVPTX::B32RegClass);
597 addRegisterClass(MVT::v4i8, &NVPTX::B32RegClass);
598 addRegisterClass(MVT::i32, &NVPTX::B32RegClass);
599 addRegisterClass(MVT::i64, &NVPTX::B64RegClass);
600 addRegisterClass(MVT::f32, &NVPTX::B32RegClass);
601 addRegisterClass(MVT::f64, &NVPTX::B64RegClass);
602 addRegisterClass(MVT::f16, &NVPTX::B16RegClass);
603 addRegisterClass(MVT::v2f16, &NVPTX::B32RegClass);
604 addRegisterClass(MVT::bf16, &NVPTX::B16RegClass);
605 addRegisterClass(MVT::v2bf16, &NVPTX::B32RegClass);
606
607 if (STI.hasF32x2Instructions()) {
608 addRegisterClass(MVT::v2f32, &NVPTX::B64RegClass);
609 addRegisterClass(MVT::v2i32, &NVPTX::B64RegClass);
610 }
611
612 // Conversion to/from FP16/FP16x2 is always legal.
617
619 if (STI.getSmVersion() >= 30 && STI.getPTXVersion() > 31)
621
622 setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
623 setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
624
625 // Conversion to/from BFP16/BFP16x2 is always legal.
630
631 setBF16OperationAction(ISD::SETCC, MVT::v2bf16, Legal, Expand);
632 setBF16OperationAction(ISD::SETCC, MVT::bf16, Legal, Promote);
633 if (getOperationAction(ISD::SETCC, MVT::bf16) == Promote)
634 AddPromotedToType(ISD::SETCC, MVT::bf16, MVT::f32);
635
636 // Conversion to/from i16/i16x2 is always legal.
641
646
647 // No support for these operations with v2f32/v2i32
648 setOperationAction(ISD::INSERT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32}, Expand);
649 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2f32, MVT::v2i32}, Expand);
650
653 MVT::v2i32, Expand);
654
655 // Need custom lowering in case the index is dynamic.
656 if (STI.hasF32x2Instructions())
657 setOperationAction(ISD::EXTRACT_VECTOR_ELT, {MVT::v2f32, MVT::v2i32},
658 Custom);
659
660 // Custom conversions to/from v2i8.
662
663 // Only logical ops can be done on v4i8/v2i32 directly, others must be done
664 // elementwise.
681 {MVT::v4i8, MVT::v2i32}, Expand);
682
683 // Operations not directly supported by NVPTX.
684 for (MVT VT : {MVT::bf16, MVT::f16, MVT::v2bf16, MVT::v2f16, MVT::f32,
685 MVT::v2f32, MVT::f64, MVT::i1, MVT::i8, MVT::i16, MVT::v2i16,
686 MVT::v4i8, MVT::i32, MVT::v2i32, MVT::i64}) {
689 }
690
691 // We don't want ops like FMINIMUM or UMAX to be lowered to SETCC+VSELECT.
692 setOperationAction(ISD::VSELECT, {MVT::v2f32, MVT::v2i32}, Expand);
693
694 // Some SIGN_EXTEND_INREG can be done using cvt instruction.
695 // For others we will expand to a SHL/SRA pair.
701 setOperationAction(ISD::SIGN_EXTEND_INREG, {MVT::v2i16, MVT::v2i32}, Expand);
702
709
712
714 {MVT::i8, MVT::i16, MVT::v2i16, MVT::i32, MVT::i64},
715 Expand);
716
717 if (STI.hasHWROT32()) {
720 Custom);
721 }
722
723 setOperationAction(ISD::BR_JT, MVT::Other, STI.hasBrx() ? Legal : Expand);
725
726 // We want to legalize constant related memmove and memcpy
727 // intrinsics.
729
730 // FP extload/truncstore is not legal in PTX. We need to expand all these.
731 for (auto FloatVTs :
733 for (MVT ValVT : FloatVTs) {
734 for (MVT MemVT : FloatVTs) {
735 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Expand);
736 setTruncStoreAction(ValVT, MemVT, Expand);
737 }
738 }
739 }
740
741 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
742 // how they'll be lowered in ISel anyway, and by doing this a little earlier
743 // we allow for more DAG combine opportunities.
744 for (auto IntVTs :
746 for (MVT ValVT : IntVTs)
747 for (MVT MemVT : IntVTs)
748 if (isTypeLegal(ValVT))
749 setLoadExtAction(ISD::EXTLOAD, ValVT, MemVT, Custom);
750
751 // PTX does not support load / store predicate registers
753 for (MVT VT : MVT::integer_valuetypes()) {
755 Promote);
756 setTruncStoreAction(VT, MVT::i1, Expand);
757 }
758
759 // Disable generations of extload/truncstore for v2i32/v2i16/v2i8. The generic
760 // expansion for these nodes when they are unaligned is incorrect if the
761 // type is a vector.
762 //
763 // TODO: Fix the generic expansion for these nodes found in
764 // TargetLowering::expandUnalignedLoad/Store.
766 MVT::v2i8, Expand);
768 {MVT::v2i8, MVT::v2i16}, Expand);
769 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
770 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
771 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
772
773 // Register custom handling for illegal type loads/stores. We'll try to custom
774 // lower almost all illegal types and logic in the lowering will discard cases
775 // we can't handle.
776 setOperationAction({ISD::LOAD, ISD::STORE}, {MVT::i128, MVT::i256, MVT::f128},
777 Custom);
779 if (!isTypeLegal(VT) && VT.getStoreSizeInBits() <= 256)
781 Custom);
782
783 // Custom legalization for LDU intrinsics.
784 // TODO: The logic to lower these is not very robust and we should rewrite it.
785 // Perhaps LDU should not be represented as an intrinsic at all.
788 if (IsPTXVectorType(VT))
790
794 MVT::i1, Expand);
795
796 // This is legal in NVPTX
801
802 setOperationAction(ISD::DYNAMIC_STACKALLOC, {MVT::i32, MVT::i64}, Custom);
804
805 // TRAP can be lowered to PTX trap
806 setOperationAction(ISD::TRAP, MVT::Other, Legal);
807 // DEBUGTRAP can be lowered to PTX brkpt
809
810 // Support varargs.
815
817 {MVT::i16, MVT::i32, MVT::i64}, Legal);
818 // PTX abs.s is undefined for INT_MIN, so ISD::ABS (which requires
819 // abs(INT_MIN) == INT_MIN) must be expanded. ABS_MIN_POISON matches
820 // PTX abs semantics since INT_MIN input is poison/undefined.
821 setOperationAction(ISD::ABS, {MVT::i16, MVT::i32, MVT::i64}, Expand);
822 setOperationAction(ISD::ABS_MIN_POISON, {MVT::i16, MVT::i32, MVT::i64},
823 Legal);
824
826 Promote);
829
830 setI16x2OperationAction(ISD::ABS_MIN_POISON, MVT::v2i16, Legal, Custom);
831 setI16x2OperationAction(ISD::SMIN, MVT::v2i16, Legal, Custom);
832 setI16x2OperationAction(ISD::SMAX, MVT::v2i16, Legal, Custom);
833 setI16x2OperationAction(ISD::UMIN, MVT::v2i16, Legal, Custom);
834 setI16x2OperationAction(ISD::UMAX, MVT::v2i16, Legal, Custom);
835 setI16x2OperationAction(ISD::CTPOP, MVT::v2i16, Legal, Expand);
836 setI16x2OperationAction(ISD::CTLZ, MVT::v2i16, Legal, Expand);
837
838 setI16x2OperationAction(ISD::ADD, MVT::v2i16, Legal, Custom);
839 setI16x2OperationAction(ISD::SUB, MVT::v2i16, Legal, Custom);
840 setI16x2OperationAction(ISD::MUL, MVT::v2i16, Legal, Custom);
841 setI16x2OperationAction(ISD::SHL, MVT::v2i16, Legal, Custom);
842 setI16x2OperationAction(ISD::SREM, MVT::v2i16, Legal, Custom);
843 setI16x2OperationAction(ISD::UREM, MVT::v2i16, Legal, Custom);
844
845 // Other arithmetic and logic ops are unsupported.
849 {MVT::v2i16, MVT::v2i32}, Expand);
850
851 // v2i32 is not supported for any arithmetic operations
856 MVT::v2i32, Expand);
857
862 if (STI.getPTXVersion() >= 43) {
867 }
868
870 setOperationAction(ISD::CTTZ, {MVT::v2i16, MVT::v2i32}, Expand);
873
874 // PTX does not directly support SELP of i1, so promote to i32 first
876
877 // PTX cannot multiply two i64s in a single instruction.
880
881 // We have some custom DAG combine patterns for these nodes
883 ISD::AND,
885 ISD::FADD,
892 ISD::MUL,
894 ISD::SHL,
895 ISD::SREM,
896 ISD::UREM,
900 ISD::LOAD,
905
906 // If the vector operands require register coalescing, scalarize instead
907 if (STI.hasF32x2Instructions())
909
910 // setcc for f16x2 and bf16x2 needs special handling to prevent
911 // legalizer's attempt to scalarize it due to v2i1 not being legal.
912 if (STI.allowFP16Math() || STI.hasBF16Math())
914
915 // Vector reduction operations. These may be turned into shuffle or tree
916 // reductions depending on what instructions are available for each type.
918 MVT EltVT = VT.getVectorElementType();
919 if (EltVT == MVT::f32 || EltVT == MVT::f64) {
922 VT, Custom);
923 }
924 }
925
926 // Promote fp16 arithmetic if fp16 hardware isn't available or the
927 // user passed --nvptx-no-fp16-math. The flag is useful because,
928 // although sm_53+ GPUs have some sort of FP16 support in
929 // hardware, only sm_53 and sm_60 have full implementation. Others
930 // only have token amount of hardware and are likely to run faster
931 // by using fp32 units instead.
932 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
933 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
934 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
935 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
936 // bf16 must be promoted to f32.
937 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
938 if (getOperationAction(Op, MVT::bf16) == Promote)
939 AddPromotedToType(Op, MVT::bf16, MVT::f32);
940 setOperationAction(Op, MVT::v2f32,
941 STI.hasF32x2Instructions() ? Legal : Expand);
942 }
943
944 // On SM80, we select add/mul/sub as fma to avoid promotion to float
945 for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB}) {
946 for (const auto &VT : {MVT::bf16, MVT::v2bf16}) {
947 if (!STI.hasNativeBF16Support(Op) && STI.hasNativeBF16Support(ISD::FMA)) {
949 }
950 }
951 }
952
953 // f16/f16x2 neg was introduced in PTX 60, SM_53.
954 const bool IsFP16FP16x2NegAvailable = STI.getSmVersion() >= 53 &&
955 STI.getPTXVersion() >= 60 &&
956 STI.allowFP16Math();
957 for (const auto &VT : {MVT::f16, MVT::v2f16})
959 IsFP16FP16x2NegAvailable ? Legal : Expand);
960
961 setBF16OperationAction(ISD::FNEG, MVT::bf16, Legal, Expand);
962 setBF16OperationAction(ISD::FNEG, MVT::v2bf16, Legal, Expand);
963 setOperationAction(ISD::FNEG, MVT::v2f32, Expand);
964 // (would be) Library functions.
965
966 // These map to conversion instructions for scalar FP types.
967 for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
969 setOperationAction(Op, MVT::f16, Legal);
970 setOperationAction(Op, MVT::f32, Legal);
971 setOperationAction(Op, MVT::f64, Legal);
972 setOperationAction(Op, MVT::v2f16, Expand);
973 setOperationAction(Op, MVT::v2bf16, Expand);
974 setOperationAction(Op, MVT::v2f32, Expand);
975 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
976 if (getOperationAction(Op, MVT::bf16) == Promote)
977 AddPromotedToType(Op, MVT::bf16, MVT::f32);
978 }
979
980 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71) {
982 }
983 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
984 for (MVT VT : {MVT::bf16, MVT::f32, MVT::f64}) {
987 }
988 }
989
990 // Expand v2f32 = fp_extend
992 // Expand v2[b]f16 = fp_round v2f32
993 setOperationAction(ISD::FP_ROUND, {MVT::v2bf16, MVT::v2f16}, Expand);
994
995 // sm_80 only has conversions between f32 and bf16. Custom lower all other
996 // bf16 conversions.
997 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
998 for (MVT VT : {MVT::i1, MVT::i16, MVT::i32, MVT::i64}) {
1001 VT, Custom);
1002 }
1005 MVT::bf16, Custom);
1006 }
1007
1011 setOperationAction(ISD::FROUND, MVT::v2bf16, Expand);
1015 AddPromotedToType(ISD::FROUND, MVT::bf16, MVT::f32);
1016
1017 setOperationAction({ISD::LROUND, ISD::LLROUND}, {MVT::f32, MVT::f64}, Expand);
1018
1019 // 'Expand' implements FCOPYSIGN without calling an external library.
1026
1027 // These map to corresponding instructions for f32/f64. f16 must be
1028 // promoted to f32. v2f16 is expanded to f16, which is then promoted
1029 // to f32.
1030 for (const auto &Op :
1032 setOperationAction(Op, MVT::f16, Promote);
1033 setOperationAction(Op, MVT::f32, Legal);
1034 // only div/rem/sqrt are legal for f64
1035 if (Op == ISD::FDIV || Op == ISD::FREM || Op == ISD::FSQRT) {
1036 setOperationAction(Op, MVT::f64, Legal);
1037 }
1038 setOperationAction(Op, {MVT::v2f16, MVT::v2bf16, MVT::v2f32}, Expand);
1039 setOperationAction(Op, MVT::bf16, Promote);
1040 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1041 }
1042 setOperationAction(ISD::FREM, {MVT::f32, MVT::f64}, Custom);
1043
1044 setOperationAction(ISD::FABS, {MVT::f32, MVT::f64}, Legal);
1045 setOperationAction(ISD::FABS, MVT::v2f32, Expand);
1046 if (STI.getPTXVersion() >= 65) {
1047 setFP16OperationAction(ISD::FABS, MVT::f16, Legal, Promote);
1048 setFP16OperationAction(ISD::FABS, MVT::v2f16, Legal, Expand);
1049 } else {
1051 setOperationAction(ISD::FABS, MVT::v2f16, Expand);
1052 }
1053 setBF16OperationAction(ISD::FABS, MVT::v2bf16, Legal, Expand);
1054 setBF16OperationAction(ISD::FABS, MVT::bf16, Legal, Promote);
1055 if (getOperationAction(ISD::FABS, MVT::bf16) == Promote)
1056 AddPromotedToType(ISD::FABS, MVT::bf16, MVT::f32);
1057
1058 for (const auto &Op :
1060 setOperationAction(Op, MVT::f32, Legal);
1061 setOperationAction(Op, MVT::f64, Legal);
1062 setFP16OperationAction(Op, MVT::f16, Legal, Promote);
1063 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1064 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1065 setBF16OperationAction(Op, MVT::bf16, Legal, Promote);
1066 if (getOperationAction(Op, MVT::bf16) == Promote)
1067 AddPromotedToType(Op, MVT::bf16, MVT::f32);
1068 setOperationAction(Op, MVT::v2f32, Expand);
1069 }
1070 bool SupportsF32MinMaxNaN =
1071 STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70;
1072 for (const auto &Op : {ISD::FMINIMUM, ISD::FMAXIMUM}) {
1073 setOperationAction(Op, MVT::f32, SupportsF32MinMaxNaN ? Legal : Expand);
1074 setFP16OperationAction(Op, MVT::f16, Legal, Expand);
1075 setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
1076 setBF16OperationAction(Op, MVT::bf16, Legal, Expand);
1077 setBF16OperationAction(Op, MVT::v2bf16, Legal, Expand);
1078 setOperationAction(Op, MVT::v2f32, Expand);
1079 }
1080
1081 // Custom lowering for inline asm with 128-bit operands
1084
1085 // FEXP2 support:
1086 // - f32
1087 // - f16/f16x2 (sm_75+, PTX 7.0+)
1088 // - bf16/bf16x2 (sm_90+, PTX 7.8+)
1089 // When f16/bf16 types aren't supported, they are promoted/expanded to f32.
1091 setOperationAction(ISD::FEXP2, MVT::v2f32, Expand);
1092 setFP16OperationAction(ISD::FEXP2, MVT::f16, Legal, Promote);
1093 setFP16OperationAction(ISD::FEXP2, MVT::v2f16, Legal, Expand);
1094 setBF16OperationAction(ISD::FEXP2, MVT::bf16, Legal, Promote);
1095 setBF16OperationAction(ISD::FEXP2, MVT::v2bf16, Legal, Expand);
1096
1097 // FLOG2 supports f32 only
1098 // f16/bf16 types aren't supported, but they are promoted/expanded to f32.
1099 if (UseApproxLog2F32) {
1101 setOperationPromotedToType(ISD::FLOG2, MVT::f16, MVT::f32);
1102 setOperationPromotedToType(ISD::FLOG2, MVT::bf16, MVT::f32);
1103 setOperationAction(ISD::FLOG2, {MVT::v2f16, MVT::v2bf16, MVT::v2f32},
1104 Expand);
1105 }
1106
1107 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
1108
1109 setOperationAction(ISD::ATOMIC_LOAD_SUB, {MVT::i32, MVT::i64}, Expand);
1110
1111 // atom.b128 is legal in PTX but since we don't represent i128 as a legal
1112 // type, we need to custom lower it.
1114 Custom);
1115
1116 // Now deduce the information based on the above mentioned
1117 // actions
1118 computeRegisterProperties(STI.getRegisterInfo());
1119
1120 // PTX support for 16-bit CAS is emulated. Only use 32+
1121 setMinCmpXchgSizeInBits(STI.getMinCmpXchgSizeInBits());
1122 setMaxAtomicSizeInBitsSupported(STI.hasAtomSwap128() ? 128 : 64);
1124
1125 // Custom lowering for tcgen05.ld vector operands
1127 {MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1128 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::v2f32,
1129 MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32,
1130 MVT::v64f32, MVT::v128f32},
1131 Custom);
1132
1133 // Custom lowering for tcgen05.st vector operands and the st.async
1134 // i128 (.b128) operand. MVT::i8 is needed for the st.async.{sys,gpu} b8
1135 // variant.
1137 {MVT::i8, MVT::v2i32, MVT::v4i32, MVT::v8i32, MVT::v16i32,
1138 MVT::v32i32, MVT::v64i32, MVT::v128i32, MVT::i128,
1139 MVT::Other},
1140 Custom);
1141
1142 // Enable custom lowering for the following:
1143 // * MVT::i128 - clusterlaunchcontrol
1144 // * MVT::i32 - prmt
1145 // * MVT::v4f32 - cvt_rs fp{4/6/8}x4 intrinsics
1146 // * MVT::Other - internal.addrspace.wrap
1148 {MVT::i32, MVT::i128, MVT::v4f32, MVT::Other}, Custom);
1149
1150 // Custom lowering for bswap
1151 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::i32, MVT::i64, MVT::v2i16},
1152 Custom);
1153}
1154
1157 if (!VT.isScalableVector() && VT.getVectorNumElements() != 1 &&
1158 VT.getScalarType() == MVT::i1)
1159 return TypeSplitVector;
1161}
1162
1164 int Enabled, int &ExtraSteps,
1165 bool &UseOneConst,
1166 bool Reciprocal) const {
1169 return SDValue();
1170
1171 if (ExtraSteps == ReciprocalEstimate::Unspecified)
1172 ExtraSteps = 0;
1173
1174 SDLoc DL(Operand);
1175 EVT VT = Operand.getValueType();
1176 bool Ftz = useF32FTZ(DAG.getMachineFunction());
1177
1178 auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
1179 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
1180 DAG.getConstant(IID, DL, MVT::i32), Operand);
1181 };
1182
1183 // The sqrt and rsqrt refinement processes assume we always start out with an
1184 // approximation of the rsqrt. Therefore, if we're going to do any refinement
1185 // (i.e. ExtraSteps > 0), we must return an rsqrt. But if we're *not* doing
1186 // any refinement, we must return a regular sqrt.
1187 if (Reciprocal || ExtraSteps > 0) {
1188 if (VT == MVT::f32)
1189 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
1190 : Intrinsic::nvvm_rsqrt_approx_f);
1191 else if (VT == MVT::f64)
1192 return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
1193 else
1194 return SDValue();
1195 } else {
1196 if (VT == MVT::f32)
1197 return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
1198 : Intrinsic::nvvm_sqrt_approx_f);
1199 else {
1200 // There's no sqrt.approx.f64 instruction, so we emit
1201 // reciprocal(rsqrt(x)). This is faster than
1202 // select(x == 0, 0, x * rsqrt(x)). (In fact, it's faster than plain
1203 // x * rsqrt(x).)
1204 return DAG.getNode(
1206 DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
1207 MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
1208 }
1209 }
1210}
1211
1213 const DataLayout &DL,
1214 const TargetLowering &TL) {
1215 if (Ptr->getOpcode() == ISD::FrameIndex) {
1216 auto Ty = TL.getPointerTy(DL, ADDRESS_SPACE_LOCAL);
1217 Ptr = DAG.getAddrSpaceCast(SDLoc(), Ty, Ptr, ADDRESS_SPACE_GENERIC,
1219
1221 }
1222
1223 // Peel off an addrspacecast to generic and load directly from the specific
1224 // address space.
1225 if (Ptr->getOpcode() == ISD::ADDRSPACECAST) {
1226 const auto *ASC = cast<AddrSpaceCastSDNode>(Ptr);
1227 if (ASC->getDestAddressSpace() == ADDRESS_SPACE_GENERIC) {
1228 Ptr = ASC->getOperand(0);
1229 return MachinePointerInfo(ASC->getSrcAddressSpace());
1230 }
1231 }
1232
1233 return MachinePointerInfo();
1234}
1235
1237 if (Flags.isSExt())
1238 return ISD::SIGN_EXTEND;
1239 if (Flags.isZExt())
1240 return ISD::ZERO_EXTEND;
1241 return ISD::ANY_EXTEND;
1242}
1243
1245 ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
1246 SDLoc dl) {
1247 const EVT ActualVT = V.getValueType();
1248 assert((ActualVT == ExpectedVT ||
1249 (ExpectedVT.isInteger() && ActualVT.isInteger())) &&
1250 "Non-integer argument type size mismatch");
1251 if (ExpectedVT.bitsGT(ActualVT))
1252 return DAG.getNode(getExtOpcode(Flags), dl, ExpectedVT, V);
1253 if (ExpectedVT.bitsLT(ActualVT))
1254 return DAG.getNode(ISD::TRUNCATE, dl, ExpectedVT, V);
1255
1256 return V;
1257}
1258
1260 SmallVectorImpl<SDValue> &InVals) const {
1261
1262 if (CLI.IsVarArg && (STI.getPTXVersion() < 60 || STI.getSmVersion() < 30))
1264 "Support for variadic functions (unsized array parameter) introduced "
1265 "in PTX ISA version 6.0 and requires target sm_30.");
1266
1267 SelectionDAG &DAG = CLI.DAG;
1268 SDLoc dl = CLI.DL;
1269 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
1270 SDValue Callee = CLI.Callee;
1271 ArgListTy &Args = CLI.getArgs();
1272 Type *RetTy = CLI.RetTy;
1273 const CallBase *CB = CLI.CB;
1274 const DataLayout &DL = DAG.getDataLayout();
1275 LLVMContext &Ctx = *DAG.getContext();
1276
1277 const auto GetI32 = [&](const unsigned I) {
1278 return DAG.getConstant(I, dl, MVT::i32);
1279 };
1280
1281 const unsigned UniqueCallSite = GlobalUniqueCallSite++;
1282 const SDValue CallChain = CLI.Chain;
1283 const SDValue StartChain =
1284 DAG.getCALLSEQ_START(CallChain, UniqueCallSite, 0, dl);
1285 SDValue DeclareGlue = StartChain.getValue(1);
1286
1287 SmallVector<SDValue, 16> CallPrereqs{StartChain};
1288
1289 const auto MakeDeclareScalarParam = [&](SDValue Symbol, unsigned Size) {
1290 // PTX ABI requires integral types to be at least 32 bits in size. FP16 is
1291 // loaded/stored using i16, so it's handled here as well.
1292 const unsigned SizeBits = promoteScalarArgumentSize(Size * 8);
1293 SDValue Declare =
1294 DAG.getNode(NVPTXISD::DeclareScalarParam, dl, {MVT::Other, MVT::Glue},
1295 {StartChain, Symbol, GetI32(SizeBits), DeclareGlue});
1296 CallPrereqs.push_back(Declare);
1297 DeclareGlue = Declare.getValue(1);
1298 return Declare;
1299 };
1300
1301 const auto MakeDeclareArrayParam = [&](SDValue Symbol, Align Align,
1302 unsigned Size) {
1303 SDValue Declare = DAG.getNode(
1304 NVPTXISD::DeclareArrayParam, dl, {MVT::Other, MVT::Glue},
1305 {StartChain, Symbol, GetI32(Align.value()), GetI32(Size), DeclareGlue});
1306 CallPrereqs.push_back(Declare);
1307 DeclareGlue = Declare.getValue(1);
1308 return Declare;
1309 };
1310
1311 // Variadic arguments.
1312 //
1313 // Normally, for each argument, we declare a param scalar or a param
1314 // byte array in the .param space, and store the argument value to that
1315 // param scalar or array starting at offset 0.
1316 //
1317 // In the case of the first variadic argument, we declare a vararg byte array
1318 // with size 0. The exact size of this array isn't known at this point, so
1319 // it'll be patched later. All the variadic arguments will be stored to this
1320 // array at a certain offset (which gets tracked by 'VAOffset'). The offset is
1321 // initially set to 0, so it can be used for non-variadic arguments (which use
1322 // 0 offset) to simplify the code.
1323 //
1324 // After all varargs are processed, 'VAOffset' holds the size of the
1325 // vararg byte array.
1326 assert((CLI.IsVarArg || CLI.Args.size() <= CLI.NumFixedArgs) &&
1327 "Non-VarArg function with extra arguments");
1328
1329 const unsigned FirstVAArg = CLI.NumFixedArgs; // position of first variadic
1330 unsigned VAOffset = 0; // current offset in the param array
1331
1332 const SDValue VADeclareParam =
1333 CLI.Args.size() > FirstVAArg
1334 ? MakeDeclareArrayParam(getCallParamSymbol(DAG, FirstVAArg, MVT::i32),
1335 Align(STI.getMaxRequiredAlignment()), 0)
1336 : SDValue();
1337
1338 // Args.size() and Outs.size() need not match.
1339 // Outs.size() will be larger
1340 // * if there is an aggregate argument with multiple fields (each field
1341 // showing up separately in Outs)
1342 // * if there is a vector argument with more than typical vector-length
1343 // elements (generally if more than 4) where each vector element is
1344 // individually present in Outs.
1345 // So a different index should be used for indexing into Outs/OutVals.
1346 // See similar issue in LowerFormalArguments.
1347 auto AllOuts = ArrayRef(CLI.Outs);
1348 auto AllOutVals = ArrayRef(CLI.OutVals);
1349 assert(AllOuts.size() == AllOutVals.size() &&
1350 "Outs and OutVals must be the same size");
1351 // Declare the .params or .regs needed to pass values
1352 // to the function
1353 for (const auto E : llvm::enumerate(Args)) {
1354 const auto ArgI = E.index();
1355 const auto Arg = E.value();
1356 const auto ArgOuts =
1357 AllOuts.take_while([&](auto O) { return O.OrigArgIndex == ArgI; });
1358 const auto ArgOutVals = AllOutVals.take_front(ArgOuts.size());
1359 AllOuts = AllOuts.drop_front(ArgOuts.size());
1360 AllOutVals = AllOutVals.drop_front(ArgOuts.size());
1361
1362 const bool IsVAArg = (ArgI >= FirstVAArg);
1363 const bool IsByVal = Arg.IsByVal;
1364
1365 const SDValue ParamSymbol =
1366 getCallParamSymbol(DAG, IsVAArg ? FirstVAArg : ArgI, MVT::i32);
1367
1368 assert((!IsByVal || Arg.IndirectType) &&
1369 "byval arg must have indirect type");
1370 Type *ETy = (IsByVal ? Arg.IndirectType : Arg.Ty);
1371
1372 const Align ArgAlign = [&]() {
1373 if (IsByVal) {
1374 // The ByValAlign in the Outs[OIdx].Flags is always set at this point,
1375 // so we don't need to worry whether it's naturally aligned or not.
1376 // See TargetLowering::LowerCallTo().
1377 const Align InitialAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1379 InitialAlign, DL);
1380 }
1381 return getPTXParamAlign(CB, Arg.Ty, ArgI + AttributeList::FirstArgIndex,
1382 DL);
1383 }();
1384
1385 const unsigned TySize = DL.getTypeAllocSize(ETy);
1386 assert((!IsByVal || TySize == ArgOuts[0].Flags.getByValSize()) &&
1387 "type size mismatch");
1388
1389 const SDValue ArgDeclare = [&]() {
1390 if (IsVAArg)
1391 return VADeclareParam;
1392
1393 if (IsByVal || shouldPassAsArray(Arg.Ty))
1394 return MakeDeclareArrayParam(ParamSymbol, ArgAlign, TySize);
1395
1396 assert(ArgOuts.size() == 1 && "We must pass only one value as non-array");
1397 assert((ArgOuts[0].VT.isInteger() || ArgOuts[0].VT.isFloatingPoint()) &&
1398 "Only int and float types are supported as non-array arguments");
1399
1400 return MakeDeclareScalarParam(ParamSymbol, TySize);
1401 }();
1402
1403 if (IsByVal) {
1404 assert(ArgOutVals.size() == 1 && "We must pass only one value as byval");
1405 SDValue SrcPtr = ArgOutVals[0];
1406 const auto PointerInfo = refinePtrAS(SrcPtr, DAG, DL, *this);
1407 const Align BaseSrcAlign = ArgOuts[0].Flags.getNonZeroByValAlign();
1408
1409 if (IsVAArg)
1410 VAOffset = alignTo(VAOffset, ArgAlign);
1411
1412 SmallVector<EVT, 4> ValueVTs, MemVTs;
1414 ComputeValueVTs(*this, DL, ETy, ValueVTs, &MemVTs, &Offsets);
1415
1416 unsigned J = 0;
1417 const auto VI = VectorizePTXValueVTs(MemVTs, Offsets, ArgAlign, IsVAArg);
1418 for (const unsigned NumElts : VI) {
1419 EVT LoadVT = getVectorizedVT(MemVTs[J], NumElts, Ctx);
1420 Align SrcAlign = commonAlignment(BaseSrcAlign, Offsets[J]);
1421 SDValue SrcAddr = DAG.getObjectPtrOffset(dl, SrcPtr, Offsets[J]);
1422 SDValue SrcLoad =
1423 DAG.getLoad(LoadVT, dl, CallChain, SrcAddr, PointerInfo, SrcAlign);
1424
1425 TypeSize ParamOffset = Offsets[J].getWithIncrement(VAOffset);
1426 Align ParamAlign = commonAlignment(ArgAlign, ParamOffset);
1427 SDValue ParamAddr =
1428 DAG.getObjectPtrOffset(dl, ParamSymbol, ParamOffset);
1429 SDValue StoreParam = DAG.getStore(
1430 ArgDeclare, dl, SrcLoad, ParamAddr,
1432 CallPrereqs.push_back(StoreParam);
1433
1434 J += NumElts;
1435 }
1436 if (IsVAArg)
1437 VAOffset += TySize;
1438 } else {
1441 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, Arg.Ty, VTs, Offsets,
1442 VAOffset);
1443 assert(VTs.size() == Offsets.size() && "Size mismatch");
1444 assert(VTs.size() == ArgOuts.size() && "Size mismatch");
1445
1446 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
1447 // than 32-bits are sign extended or zero extended, depending on
1448 // whether they are signed or unsigned types. This case applies
1449 // only to scalar parameters and not to aggregate values.
1450 const bool ExtendIntegerParam =
1451 Arg.Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Arg.Ty) < 32;
1452
1453 const auto GetStoredValue = [&](const unsigned I) {
1454 SDValue StVal = ArgOutVals[I];
1456 StVal.getValueType() &&
1457 "OutVal type should always be legal");
1458
1459 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1460 const EVT StoreVT =
1461 ExtendIntegerParam ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1462
1463 return correctParamType(StVal, StoreVT, ArgOuts[I].Flags, DAG, dl);
1464 };
1465
1466 unsigned J = 0;
1467 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign, IsVAArg);
1468 for (const unsigned NumElts : VI) {
1469 const EVT EltVT = promoteScalarIntegerPTX(VTs[J]);
1470
1471 unsigned Offset;
1472 if (IsVAArg) {
1473 // TODO: We may need to support vector types that can be passed
1474 // as scalars in variadic arguments.
1475 assert(NumElts == 1 &&
1476 "Vectorization should be disabled for vaargs.");
1477
1478 // Align each part of the variadic argument to their type.
1479 VAOffset = alignTo(VAOffset, DAG.getEVTAlign(EltVT));
1480 Offset = VAOffset;
1481
1482 const EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : EltVT;
1483 VAOffset += DL.getTypeAllocSize(TheStoreType.getTypeForEVT(Ctx));
1484 } else {
1485 assert(VAOffset == 0 && "VAOffset must be 0 for non-VA args");
1486 Offset = Offsets[J];
1487 }
1488
1489 SDValue Ptr =
1490 DAG.getObjectPtrOffset(dl, ParamSymbol, TypeSize::getFixed(Offset));
1491
1492 const MaybeAlign CurrentAlign = ExtendIntegerParam
1493 ? MaybeAlign(std::nullopt)
1494 : commonAlignment(ArgAlign, Offset);
1495
1496 SDValue Val =
1497 getBuildVectorizedValue(NumElts, dl, DAG, [&](unsigned K) {
1498 return GetStoredValue(J + K);
1499 });
1500
1501 SDValue StoreParam = DAG.getStore(
1502 ArgDeclare, dl, Val, Ptr,
1504 CallPrereqs.push_back(StoreParam);
1505
1506 J += NumElts;
1507 }
1508 }
1509 }
1510
1511 // Handle Result
1512 if (!Ins.empty()) {
1513 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1514 const unsigned ResultSize = DL.getTypeAllocSize(RetTy);
1515 if (shouldPassAsArray(RetTy)) {
1516 const Align RetAlign =
1517 getPTXParamAlign(CB, RetTy, AttributeList::ReturnIndex, DL);
1518 MakeDeclareArrayParam(RetSymbol, RetAlign, ResultSize);
1519 } else {
1520 MakeDeclareScalarParam(RetSymbol, ResultSize);
1521 }
1522 }
1523
1524 // Set the size of the vararg param byte array if the callee is a variadic
1525 // function and the variadic part is not empty.
1526 if (VADeclareParam) {
1527 SDValue DeclareParamOps[] = {VADeclareParam.getOperand(0),
1528 VADeclareParam.getOperand(1),
1529 VADeclareParam.getOperand(2), GetI32(VAOffset),
1530 VADeclareParam.getOperand(4)};
1531 DAG.MorphNodeTo(VADeclareParam.getNode(), VADeclareParam.getOpcode(),
1532 VADeclareParam->getVTList(), DeclareParamOps);
1533 }
1534
1535 const auto *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
1536 const auto *CalleeF = Func ? dyn_cast<Function>(Func->getGlobal()) : nullptr;
1537
1538 // If the type of the callsite does not match that of the function, convert
1539 // the callsite to an indirect call.
1540 const bool ConvertToIndirectCall =
1541 CalleeF && CB->getFunctionType() != CalleeF->getFunctionType();
1542
1543 // Both indirect calls and libcalls have nullptr Func. In order to distinguish
1544 // between them we must rely on the call site value which is valid for
1545 // indirect calls but is always null for libcalls.
1546 const bool IsIndirectCall = (!Func && CB) || ConvertToIndirectCall;
1547
1548 if (isa<ExternalSymbolSDNode>(Callee)) {
1549 Function* CalleeFunc = nullptr;
1550
1551 // Try to find the callee in the current module.
1552 Callee = DAG.getSymbolFunctionGlobalAddress(Callee, &CalleeFunc);
1553 assert(CalleeFunc != nullptr && "Libcall callee must be set.");
1554
1555 // Set the "libcall callee" attribute to indicate that the function
1556 // must always have a declaration.
1557 CalleeFunc->addFnAttr("nvptx-libcall-callee", "true");
1558 }
1559
1560 // In the indirect function call case, PTX requires a prototype of the form:
1561 // proto_0 : .callprototype(.param .b32 _) _ (.param .b32 _);
1562 // Where the label is to be used as the last arg of the call instruction.
1563 // We record the call site here and emit all prototypes at the
1564 // start of the function in the AsmPrinter.
1565 if (IsIndirectCall)
1566 DAG.getMachineFunction()
1568 ->addCallPrototype(UniqueCallSite, CB);
1569
1570 const bool IsUnknownIntrinsic =
1571 CalleeF && CalleeF->isIntrinsic() &&
1572 CalleeF->getIntrinsicID() == Intrinsic::not_intrinsic;
1573 if (IsUnknownIntrinsic) {
1576 "call to unknown intrinsic '" + CalleeF->getName() +
1577 "' cannot be lowered by the NVPTX backend",
1578 dl.getDebugLoc()));
1579 }
1580
1581 const unsigned Proto = IsIndirectCall ? UniqueCallSite : 0;
1582 const unsigned NumArgs =
1583 std::min<unsigned>(CLI.NumFixedArgs + 1, Args.size());
1584 /// CALL(Chain, IsConvergent, IsIndirectCall/IsUniform, NumReturns,
1585 /// NumParams, Callee, Proto)
1586 const SDValue CallToken = DAG.getTokenFactor(dl, CallPrereqs);
1587 const SDValue Call = DAG.getNode(
1588 NVPTXISD::CALL, dl, MVT::Other,
1589 {CallToken, GetI32(CLI.IsConvergent), GetI32(IsIndirectCall),
1590 GetI32(Ins.empty() ? 0 : 1), GetI32(NumArgs), Callee, GetI32(Proto)});
1591
1592 SmallVector<SDValue, 16> LoadChains{Call};
1593 SmallVector<SDValue, 16> ProxyRegOps;
1594 if (!Ins.empty()) {
1597 ComputePTXValueVTs(*this, DL, Ctx, CLI.CallConv, RetTy, VTs, Offsets);
1598 assert(VTs.size() == Ins.size() && "Bad value decomposition");
1599
1600 const Align RetAlign =
1601 getPTXParamAlign(CB, RetTy, AttributeList::ReturnIndex, DL);
1602 const SDValue RetSymbol = DAG.getExternalSymbol("retval0", MVT::i32);
1603
1604 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
1605 // 32-bits are sign extended or zero extended, depending on whether
1606 // they are signed or unsigned types.
1607 const bool ExtendIntegerRetVal =
1608 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
1609
1610 unsigned I = 0;
1611 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
1612 for (const unsigned NumElts : VI) {
1613 const MaybeAlign CurrentAlign =
1614 ExtendIntegerRetVal ? MaybeAlign(std::nullopt)
1615 : commonAlignment(RetAlign, Offsets[I]);
1616
1617 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
1618 const EVT LoadVT =
1619 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
1620 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
1621 SDValue Ptr =
1622 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
1623
1624 SDValue R = DAG.getLoad(
1625 VecVT, dl, Call, Ptr,
1627
1628 LoadChains.push_back(R.getValue(1));
1629 for (const unsigned J : llvm::seq(NumElts))
1630 ProxyRegOps.push_back(getExtractVectorizedValue(R, J, LoadVT, dl, DAG));
1631 I += NumElts;
1632 }
1633 }
1634
1635 const SDValue EndToken = DAG.getTokenFactor(dl, LoadChains);
1636 const SDValue CallEnd = DAG.getCALLSEQ_END(EndToken, UniqueCallSite,
1637 UniqueCallSite + 1, SDValue(), dl);
1638
1639 // Append ProxyReg instructions to the chain to make sure that `callseq_end`
1640 // will not get lost. Otherwise, during libcalls expansion, the nodes can become
1641 // dangling.
1642 for (const auto [I, Reg] : llvm::enumerate(ProxyRegOps)) {
1643 SDValue Proxy =
1644 DAG.getNode(NVPTXISD::ProxyReg, dl, Reg.getValueType(), {CallEnd, Reg});
1645 SDValue Ret = correctParamType(Proxy, Ins[I].VT, Ins[I].Flags, DAG, dl);
1646 InVals.push_back(Ret);
1647 }
1648
1649 // set IsTailCall to false for now, until we figure out how to express
1650 // tail call optimization in PTX
1651 CLI.IsTailCall = false;
1652 return CallEnd;
1653}
1654
1656 SelectionDAG &DAG) const {
1657
1658 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1659 const Function &Fn = DAG.getMachineFunction().getFunction();
1660
1662 Fn,
1663 "Support for dynamic alloca introduced in PTX ISA version 7.3 and "
1664 "requires target sm_52.",
1665 SDLoc(Op).getDebugLoc()));
1666 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()),
1667 Op.getOperand(0)};
1668 return DAG.getMergeValues(Ops, SDLoc());
1669 }
1670
1671 SDLoc DL(Op.getNode());
1672 SDValue Chain = Op.getOperand(0);
1673 SDValue Size = Op.getOperand(1);
1674 uint64_t Align = Op.getConstantOperandVal(2);
1675
1676 // The alignment on a ISD::DYNAMIC_STACKALLOC node may be 0 to indicate that
1677 // the default stack alignment should be used.
1678 if (Align == 0)
1680
1681 // The size for ptx alloca instruction is 64-bit for m64 and 32-bit for m32.
1682 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1683
1684 SDValue Alloc =
1685 DAG.getNode(NVPTXISD::DYNAMIC_STACKALLOC, DL, {LocalVT, MVT::Other},
1686 {Chain, DAG.getZExtOrTrunc(Size, DL, LocalVT),
1687 DAG.getTargetConstant(Align, DL, MVT::i32)});
1688
1689 SDValue ASC = DAG.getAddrSpaceCast(
1691
1692 return DAG.getMergeValues({ASC, SDValue(Alloc.getNode(), 1)}, DL);
1693}
1694
1696 SelectionDAG &DAG) const {
1697 SDLoc DL(Op.getNode());
1698 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1699 const Function &Fn = DAG.getMachineFunction().getFunction();
1700
1702 Fn,
1703 "Support for stackrestore requires PTX ISA version >= 7.3 and target "
1704 ">= sm_52.",
1705 DL.getDebugLoc()));
1706 return Op.getOperand(0);
1707 }
1708
1709 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1710 SDValue Chain = Op.getOperand(0);
1711 SDValue Ptr = Op.getOperand(1);
1712 SDValue ASC = DAG.getAddrSpaceCast(DL, LocalVT, Ptr, ADDRESS_SPACE_GENERIC,
1714 return DAG.getNode(NVPTXISD::STACKRESTORE, DL, MVT::Other, {Chain, ASC});
1715}
1716
1718 SelectionDAG &DAG) const {
1719 SDLoc DL(Op.getNode());
1720 if (STI.getPTXVersion() < 73 || STI.getSmVersion() < 52) {
1721 const Function &Fn = DAG.getMachineFunction().getFunction();
1722
1724 Fn,
1725 "Support for stacksave requires PTX ISA version >= 7.3 and target >= "
1726 "sm_52.",
1727 DL.getDebugLoc()));
1728 auto Ops = {DAG.getConstant(0, DL, Op.getValueType()), Op.getOperand(0)};
1729 return DAG.getMergeValues(Ops, DL);
1730 }
1731
1732 const MVT LocalVT = getPointerTy(DAG.getDataLayout(), ADDRESS_SPACE_LOCAL);
1733 SDValue Chain = Op.getOperand(0);
1734 SDValue SS =
1735 DAG.getNode(NVPTXISD::STACKSAVE, DL, {LocalVT, MVT::Other}, Chain);
1736 SDValue ASC = DAG.getAddrSpaceCast(
1737 DL, Op.getValueType(), SS, ADDRESS_SPACE_LOCAL, ADDRESS_SPACE_GENERIC);
1738 return DAG.getMergeValues({ASC, SDValue(SS.getNode(), 1)}, DL);
1739}
1740
1741// By default CONCAT_VECTORS is lowered by ExpandVectorBuildThroughStack()
1742// (see LegalizeDAG.cpp). This is slow and uses local memory.
1743// We use extract/insert/build vector just as what LegalizeOp() does in llvm 2.5
1744SDValue
1745NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
1746 SDNode *Node = Op.getNode();
1747 SDLoc dl(Node);
1749 unsigned NumOperands = Node->getNumOperands();
1750 for (unsigned i = 0; i < NumOperands; ++i) {
1751 SDValue SubOp = Node->getOperand(i);
1752 EVT VVT = SubOp.getNode()->getValueType(0);
1753 EVT EltVT = VVT.getVectorElementType();
1754 unsigned NumSubElem = VVT.getVectorNumElements();
1755 for (unsigned j = 0; j < NumSubElem; ++j) {
1756 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, SubOp,
1757 DAG.getIntPtrConstant(j, dl)));
1758 }
1759 }
1760 return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
1761}
1762
1764 SelectionDAG &DAG,
1765 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1766 assert(A.getValueType() == MVT::i32 && B.getValueType() == MVT::i32 &&
1767 Selector.getValueType() == MVT::i32 && "PRMT must have i32 operands");
1768 return DAG.getNode(NVPTXISD::PRMT, DL, MVT::i32,
1769 {A, B, Selector, DAG.getConstant(Mode, DL, MVT::i32)});
1770}
1771
1773 SelectionDAG &DAG,
1774 unsigned Mode = NVPTX::PTXPrmtMode::NONE) {
1775 return getPRMT(A, B, DAG.getConstant(Selector, DL, MVT::i32), DL, DAG, Mode);
1776}
1777
1778/// Reduces the elements using the scalar operations provided. The operations
1779/// are sorted descending in number of inputs they take. The flags on the
1780/// original reduction operation will be propagated to each scalar operation.
1781/// Nearby elements are grouped in tree reduction, unlike the shuffle reduction
1782/// used in ExpandReductions and SelectionDAG.
1784 const SmallVector<SDValue> &Elements, EVT EltTy,
1785 ArrayRef<std::pair<unsigned /*NodeType*/, unsigned /*NumInputs*/>> Ops,
1786 const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG) {
1787 // Build the reduction tree at each level, starting with all the elements.
1788 SmallVector<SDValue> Level = Elements;
1789
1790 unsigned OpIdx = 0;
1791 while (Level.size() > 1) {
1792 // Try to reduce this level using the current operator.
1793 const auto [Op, NumInputs] = Ops[OpIdx];
1794
1795 // Build the next level by partially reducing all elements.
1796 SmallVector<SDValue> ReducedLevel;
1797 unsigned I = 0, E = Level.size();
1798 for (; I + NumInputs <= E; I += NumInputs) {
1799 // Reduce elements in groups of [NumInputs], as much as possible.
1800 ReducedLevel.push_back(DAG.getNode(
1801 Op, DL, EltTy, ArrayRef<SDValue>(Level).slice(I, NumInputs), Flags));
1802 }
1803
1804 if (I < E) {
1805 // Handle leftover elements.
1806
1807 if (ReducedLevel.empty()) {
1808 // We didn't reduce anything at this level. We need to pick a smaller
1809 // operator.
1810 ++OpIdx;
1811 assert(OpIdx < Ops.size() && "no smaller operators for reduction");
1812 continue;
1813 }
1814
1815 // We reduced some things but there's still more left, meaning the
1816 // operator's number of inputs doesn't evenly divide this level size. Move
1817 // these elements to the next level.
1818 for (; I < E; ++I)
1819 ReducedLevel.push_back(Level[I]);
1820 }
1821
1822 // Process the next level.
1823 Level = ReducedLevel;
1824 }
1825
1826 return *Level.begin();
1827}
1828
1829// Get scalar reduction opcode
1830static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode) {
1831 switch (ReductionOpcode) {
1833 return ISD::FMAXNUM;
1835 return ISD::FMINNUM;
1837 return ISD::FMAXIMUM;
1839 return ISD::FMINIMUM;
1840 default:
1841 llvm_unreachable("unhandled reduction opcode");
1842 }
1843}
1844
1845/// Get 3-input scalar reduction opcode
1846static std::optional<unsigned>
1847getScalar3OpcodeForReduction(unsigned ReductionOpcode) {
1848 switch (ReductionOpcode) {
1850 return NVPTXISD::FMAXNUM3;
1852 return NVPTXISD::FMINNUM3;
1854 return NVPTXISD::FMAXIMUM3;
1856 return NVPTXISD::FMINIMUM3;
1857 default:
1858 return std::nullopt;
1859 }
1860}
1861
1862/// Lower reductions to either a sequence of operations or a tree if
1863/// reassociations are allowed. This method will use larger operations like
1864/// max3/min3 when the target supports them.
1865SDValue NVPTXTargetLowering::LowerVECREDUCE(SDValue Op,
1866 SelectionDAG &DAG) const {
1867 SDLoc DL(Op);
1868 const SDNodeFlags Flags = Op->getFlags();
1869 SDValue Vector = Op.getOperand(0);
1870
1871 const unsigned Opcode = Op->getOpcode();
1872 const EVT EltTy = Vector.getValueType().getVectorElementType();
1873
1874 // Whether we can use 3-input min/max when expanding the reduction.
1875 const bool CanUseMinMax3 =
1876 EltTy == MVT::f32 && STI.getSmVersion() >= 100 &&
1877 STI.getPTXVersion() >= 88 &&
1878 (Opcode == ISD::VECREDUCE_FMAX || Opcode == ISD::VECREDUCE_FMIN ||
1879 Opcode == ISD::VECREDUCE_FMAXIMUM || Opcode == ISD::VECREDUCE_FMINIMUM);
1880
1881 // A list of SDNode opcodes with equivalent semantics, sorted descending by
1882 // number of inputs they take.
1883 SmallVector<std::pair<unsigned /*Op*/, unsigned /*NumIn*/>, 2> ScalarOps;
1884
1885 if (auto Opcode3Elem = getScalar3OpcodeForReduction(Opcode);
1886 CanUseMinMax3 && Opcode3Elem)
1887 ScalarOps.push_back({*Opcode3Elem, 3});
1888 ScalarOps.push_back({getScalarOpcodeForReduction(Opcode), 2});
1889
1891 DAG.ExtractVectorElements(Vector, Elements);
1892
1893 return buildTreeReduction(Elements, EltTy, ScalarOps, DL, Flags, DAG);
1894}
1895
1896SDValue NVPTXTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
1897 // Handle bitcasting from v2i8 without hitting the default promotion
1898 // strategy which goes through stack memory.
1899 EVT FromVT = Op->getOperand(0)->getValueType(0);
1900 if (FromVT != MVT::v2i8) {
1901 return Op;
1902 }
1903
1904 // Pack vector elements into i16 and bitcast to final type
1905 SDLoc DL(Op);
1906 SDValue Vec0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
1907 Op->getOperand(0), DAG.getIntPtrConstant(0, DL));
1908 SDValue Vec1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8,
1909 Op->getOperand(0), DAG.getIntPtrConstant(1, DL));
1910 SDValue Extend0 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec0);
1911 SDValue Extend1 = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Vec1);
1912 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
1913 SDValue AsInt = DAG.getNode(
1914 ISD::OR, DL, MVT::i16,
1915 {Extend0, DAG.getNode(ISD::SHL, DL, MVT::i16, {Extend1, Const8})});
1916 EVT ToVT = Op->getValueType(0);
1917 return DAG.getBitcast(ToVT, AsInt);
1918}
1919
1920// We can init constant f16x2/v2i16/v4i8 with a single .b32 move. Normally it
1921// would get lowered as two constant loads and vector-packing move.
1922// Instead we want just a constant move:
1923// mov.b32 %r2, 0x40003C00
1924SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
1925 SelectionDAG &DAG) const {
1926 EVT VT = Op->getValueType(0);
1927 if (!(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector()))
1928 return Op;
1929 SDLoc DL(Op);
1930
1931 if (!llvm::all_of(Op->ops(), [](SDValue Operand) {
1932 return Operand->isUndef() || isa<ConstantSDNode>(Operand) ||
1933 isa<ConstantFPSDNode>(Operand);
1934 })) {
1935 if (VT != MVT::v4i8)
1936 return Op;
1937 // Lower non-const v4i8 vector as byte-wise constructed i32, which allows us
1938 // to optimize calculation of constant parts.
1939 auto GetPRMT = [&](const SDValue Left, const SDValue Right, bool Cast,
1940 uint64_t SelectionValue) -> SDValue {
1941 SDValue L = Left;
1942 SDValue R = Right;
1943 if (Cast) {
1944 L = DAG.getAnyExtOrTrunc(L, DL, MVT::i32);
1945 R = DAG.getAnyExtOrTrunc(R, DL, MVT::i32);
1946 }
1947 return getPRMT(L, R, SelectionValue, DL, DAG);
1948 };
1949 auto PRMT__10 = GetPRMT(Op->getOperand(0), Op->getOperand(1), true, 0x3340);
1950 auto PRMT__32 = GetPRMT(Op->getOperand(2), Op->getOperand(3), true, 0x3340);
1951 auto PRMT3210 = GetPRMT(PRMT__10, PRMT__32, false, 0x5410);
1952 return DAG.getBitcast(VT, PRMT3210);
1953 }
1954
1955 // Get value or the Nth operand as an APInt(32). Undef values treated as 0.
1956 auto GetOperand = [](SDValue Op, int N) -> APInt {
1957 const SDValue &Operand = Op->getOperand(N);
1958 EVT VT = Op->getValueType(0);
1959 if (Operand->isUndef())
1960 return APInt(32, 0);
1961 APInt Value;
1962 if (VT == MVT::v2f16 || VT == MVT::v2bf16)
1963 Value = cast<ConstantFPSDNode>(Operand)->getValueAPF().bitcastToAPInt();
1964 else if (VT == MVT::v2i16 || VT == MVT::v4i8)
1965 Value = Operand->getAsAPIntVal();
1966 else
1967 llvm_unreachable("Unsupported type");
1968 // i8 values are carried around as i16, so we need to zero out upper bits,
1969 // so they do not get in the way of combining individual byte values
1970 if (VT == MVT::v4i8)
1971 Value = Value.trunc(8);
1972 return Value.zext(32);
1973 };
1974
1975 // Construct a 32-bit constant by shifting into place smaller values
1976 // (elements of the vector type VT).
1977 // For example, if VT has 2 elements, then N == 2:
1978 // ShiftAmount = 32 / N = 16
1979 // Value |= Op0 (b16) << 0
1980 // Value |= Op1 (b16) << 16
1981 // If N == 4:
1982 // ShiftAmount = 32 / N = 8
1983 // Value |= Op0 (b8) << 0
1984 // Value |= Op1 (b8) << 8
1985 // Value |= Op2 (b8) << 16
1986 // Value |= Op3 (b8) << 24
1987 // ...etc
1988 APInt Value(32, 0);
1989 const unsigned NumElements = VT.getVectorNumElements();
1990 assert(32 % NumElements == 0 && "must evenly divide bit length");
1991 const unsigned ShiftAmount = 32 / NumElements;
1992 for (unsigned ElementNo : seq(NumElements))
1993 Value |= GetOperand(Op, ElementNo).shl(ElementNo * ShiftAmount);
1994 SDValue Const = DAG.getConstant(Value, DL, MVT::i32);
1995 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), Const);
1996}
1997
1998SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
1999 SelectionDAG &DAG) const {
2000 SDValue Index = Op->getOperand(1);
2001 SDValue Vector = Op->getOperand(0);
2002 SDLoc DL(Op);
2003 EVT VectorVT = Vector.getValueType();
2004
2005 if (VectorVT == MVT::v4i8) {
2006 SDValue Selector = DAG.getNode(ISD::OR, DL, MVT::i32,
2007 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2008 DAG.getConstant(0x7770, DL, MVT::i32));
2009 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, Vector),
2010 DAG.getConstant(0, DL, MVT::i32), Selector, DL, DAG);
2011 SDValue Ext = DAG.getAnyExtOrTrunc(PRMT, DL, Op->getValueType(0));
2012 SDNodeFlags Flags;
2013 Flags.setNoSignedWrap(Ext.getScalarValueSizeInBits() > 8);
2014 Flags.setNoUnsignedWrap(Ext.getScalarValueSizeInBits() >= 8);
2015 Ext->setFlags(Flags);
2016 return Ext;
2017 }
2018
2019 // Constant index will be matched by tablegen.
2020 if (isa<ConstantSDNode>(Index.getNode()))
2021 return Op;
2022
2023 // Extract individual elements and select one of them.
2024 assert(NVPTX::isPackedVectorTy(VectorVT) &&
2025 VectorVT.getVectorNumElements() == 2 && "Unexpected vector type.");
2026 EVT EltVT = VectorVT.getVectorElementType();
2027
2028 SDLoc dl(Op.getNode());
2029 SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2030 DAG.getIntPtrConstant(0, dl));
2031 SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
2032 DAG.getIntPtrConstant(1, dl));
2033 return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
2035}
2036
2037SDValue NVPTXTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
2038 SelectionDAG &DAG) const {
2039 SDValue Vector = Op->getOperand(0);
2040 EVT VectorVT = Vector.getValueType();
2041
2042 if (VectorVT != MVT::v4i8)
2043 return Op;
2044 SDLoc DL(Op);
2045 SDValue Value = Op->getOperand(1);
2046 if (Value->isUndef())
2047 return Vector;
2048
2049 SDValue Index = Op->getOperand(2);
2050
2051 SDValue BFI =
2052 DAG.getNode(NVPTXISD::BFI, DL, MVT::i32,
2053 {DAG.getZExtOrTrunc(Value, DL, MVT::i32), Vector,
2054 DAG.getNode(ISD::MUL, DL, MVT::i32,
2055 DAG.getZExtOrTrunc(Index, DL, MVT::i32),
2056 DAG.getConstant(8, DL, MVT::i32)),
2057 DAG.getConstant(8, DL, MVT::i32)});
2058 return DAG.getNode(ISD::BITCAST, DL, Op->getValueType(0), BFI);
2059}
2060
2061SDValue NVPTXTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
2062 SelectionDAG &DAG) const {
2063 SDValue V1 = Op.getOperand(0);
2064 EVT VectorVT = V1.getValueType();
2065 if (VectorVT != MVT::v4i8 || Op.getValueType() != MVT::v4i8)
2066 return Op;
2067
2068 // Lower shuffle to PRMT instruction.
2069 const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op.getNode());
2070 SDValue V2 = Op.getOperand(1);
2071 uint32_t Selector = 0;
2072 for (auto I : llvm::enumerate(SVN->getMask())) {
2073 if (I.value() != -1) // -1 is a placeholder for undef.
2074 Selector |= (I.value() << (I.index() * 4));
2075 }
2076
2077 SDLoc DL(Op);
2078 SDValue PRMT = getPRMT(DAG.getBitcast(MVT::i32, V1),
2079 DAG.getBitcast(MVT::i32, V2), Selector, DL, DAG);
2080 return DAG.getBitcast(Op.getValueType(), PRMT);
2081}
2082/// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
2083/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2084/// amount, or
2085/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2086/// amount.
2087SDValue NVPTXTargetLowering::LowerShiftRightParts(SDValue Op,
2088 SelectionDAG &DAG) const {
2089 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2090 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
2091
2092 EVT VT = Op.getValueType();
2093 unsigned VTBits = VT.getSizeInBits();
2094 SDLoc dl(Op);
2095 SDValue ShOpLo = Op.getOperand(0);
2096 SDValue ShOpHi = Op.getOperand(1);
2097 SDValue ShAmt = Op.getOperand(2);
2098 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
2099
2100 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2101 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2102 // {dHi, dLo} = {aHi, aLo} >> Amt
2103 // dHi = aHi >> Amt
2104 // dLo = shf.r.clamp aLo, aHi, Amt
2105
2106 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2107 SDValue Lo =
2108 DAG.getNode(NVPTXISD::FSHR_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2109
2110 SDValue Ops[2] = { Lo, Hi };
2111 return DAG.getMergeValues(Ops, dl);
2112 }
2113 else {
2114 // {dHi, dLo} = {aHi, aLo} >> Amt
2115 // - if (Amt>=size) then
2116 // dLo = aHi >> (Amt-size)
2117 // dHi = aHi >> Amt (this is either all 0 or all 1)
2118 // else
2119 // dLo = (aLo >>logic Amt) | (aHi << (size-Amt))
2120 // dHi = aHi >> Amt
2121
2122 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2123 DAG.getConstant(VTBits, dl, MVT::i32),
2124 ShAmt);
2125 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
2126 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2127 DAG.getConstant(VTBits, dl, MVT::i32));
2128 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
2129 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2130 SDValue TrueVal = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
2131
2132 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2133 DAG.getConstant(VTBits, dl, MVT::i32),
2134 ISD::SETGE);
2135 SDValue Hi = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
2136 SDValue Lo = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2137
2138 SDValue Ops[2] = { Lo, Hi };
2139 return DAG.getMergeValues(Ops, dl);
2140 }
2141}
2142
2143/// LowerShiftLeftParts - Lower SHL_PARTS, which
2144/// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
2145/// amount, or
2146/// 2) returns two i64 values and take a 2 x i64 value to shift plus a shift
2147/// amount.
2148SDValue NVPTXTargetLowering::LowerShiftLeftParts(SDValue Op,
2149 SelectionDAG &DAG) const {
2150 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
2151 assert(Op.getOpcode() == ISD::SHL_PARTS);
2152
2153 EVT VT = Op.getValueType();
2154 unsigned VTBits = VT.getSizeInBits();
2155 SDLoc dl(Op);
2156 SDValue ShOpLo = Op.getOperand(0);
2157 SDValue ShOpHi = Op.getOperand(1);
2158 SDValue ShAmt = Op.getOperand(2);
2159
2160 if (VTBits == 32 && STI.getSmVersion() >= 35) {
2161 // For 32bit and sm35, we can use the funnel shift 'shf' instruction.
2162 // {dHi, dLo} = {aHi, aLo} << Amt
2163 // dHi = shf.l.clamp aLo, aHi, Amt
2164 // dLo = aLo << Amt
2165
2166 SDValue Hi =
2167 DAG.getNode(NVPTXISD::FSHL_CLAMP, dl, VT, ShOpHi, ShOpLo, ShAmt);
2168 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2169
2170 SDValue Ops[2] = { Lo, Hi };
2171 return DAG.getMergeValues(Ops, dl);
2172 }
2173 else {
2174 // {dHi, dLo} = {aHi, aLo} << Amt
2175 // - if (Amt>=size) then
2176 // dLo = aLo << Amt (all 0)
2177 // dLo = aLo << (Amt-size)
2178 // else
2179 // dLo = aLo << Amt
2180 // dHi = (aHi << Amt) | (aLo >> (size-Amt))
2181
2182 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
2183 DAG.getConstant(VTBits, dl, MVT::i32),
2184 ShAmt);
2185 SDValue Tmp1 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
2186 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
2187 DAG.getConstant(VTBits, dl, MVT::i32));
2188 SDValue Tmp2 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
2189 SDValue FalseVal = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
2190 SDValue TrueVal = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
2191
2192 SDValue Cmp = DAG.getSetCC(dl, MVT::i1, ShAmt,
2193 DAG.getConstant(VTBits, dl, MVT::i32),
2194 ISD::SETGE);
2195 SDValue Lo = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
2196 SDValue Hi = DAG.getNode(ISD::SELECT, dl, VT, Cmp, TrueVal, FalseVal);
2197
2198 SDValue Ops[2] = { Lo, Hi };
2199 return DAG.getMergeValues(Ops, dl);
2200 }
2201}
2202
2203/// If the types match, convert the generic copysign to the NVPTXISD version,
2204/// otherwise bail ensuring that mismatched cases are properly expaned.
2205SDValue NVPTXTargetLowering::LowerFCOPYSIGN(SDValue Op,
2206 SelectionDAG &DAG) const {
2207 EVT VT = Op.getValueType();
2208 SDLoc DL(Op);
2209
2210 SDValue In1 = Op.getOperand(0);
2211 SDValue In2 = Op.getOperand(1);
2212 EVT SrcVT = In2.getValueType();
2213
2214 if (!SrcVT.bitsEq(VT))
2215 return SDValue();
2216
2217 return DAG.getNode(NVPTXISD::FCOPYSIGN, DL, VT, In1, In2);
2218}
2219
2220SDValue NVPTXTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
2221 EVT VT = Op.getValueType();
2222
2223 if (VT == MVT::f32)
2224 return LowerFROUND32(Op, DAG);
2225
2226 if (VT == MVT::f64)
2227 return LowerFROUND64(Op, DAG);
2228
2229 llvm_unreachable("unhandled type");
2230}
2231
2232// This is the the rounding method used in CUDA libdevice in C like code:
2233// float roundf(float A)
2234// {
2235// float RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f));
2236// RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2237// return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2238// }
2239SDValue NVPTXTargetLowering::LowerFROUND32(SDValue Op,
2240 SelectionDAG &DAG) const {
2241 SDLoc SL(Op);
2242 SDValue A = Op.getOperand(0);
2243 EVT VT = Op.getValueType();
2244
2245 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2246
2247 // RoundedA = (float) (int) ( A > 0 ? (A + 0.5f) : (A - 0.5f))
2248 SDValue Bitcast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, A);
2249 const unsigned SignBitMask = 0x80000000;
2250 SDValue Sign = DAG.getNode(ISD::AND, SL, MVT::i32, Bitcast,
2251 DAG.getConstant(SignBitMask, SL, MVT::i32));
2252 const unsigned PointFiveInBits = 0x3F000000;
2253 SDValue PointFiveWithSignRaw =
2254 DAG.getNode(ISD::OR, SL, MVT::i32, Sign,
2255 DAG.getConstant(PointFiveInBits, SL, MVT::i32));
2256 SDValue PointFiveWithSign =
2257 DAG.getNode(ISD::BITCAST, SL, VT, PointFiveWithSignRaw);
2258 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, A, PointFiveWithSign);
2259 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2260
2261 // RoundedA = abs(A) > 0x1.0p23 ? A : RoundedA;
2262 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2263 SDValue IsLarge =
2264 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 23.0), SL, VT),
2265 ISD::SETOGT);
2266 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2267
2268 // return abs(A) < 0.5 ? (float)(int)A : RoundedA;
2269 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2270 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2271 SDValue RoundedAForSmallA = DAG.getNode(ISD::FTRUNC, SL, VT, A);
2272 return DAG.getNode(ISD::SELECT, SL, VT, IsSmall, RoundedAForSmallA, RoundedA);
2273}
2274
2275// The implementation of round(double) is similar to that of round(float) in
2276// that they both separate the value range into three regions and use a method
2277// specific to the region to round the values. However, round(double) first
2278// calculates the round of the absolute value and then adds the sign back while
2279// round(float) directly rounds the value with sign.
2280SDValue NVPTXTargetLowering::LowerFROUND64(SDValue Op,
2281 SelectionDAG &DAG) const {
2282 SDLoc SL(Op);
2283 SDValue A = Op.getOperand(0);
2284 EVT VT = Op.getValueType();
2285
2286 SDValue AbsA = DAG.getNode(ISD::FABS, SL, VT, A);
2287
2288 // double RoundedA = (double) (int) (abs(A) + 0.5f);
2289 SDValue AdjustedA = DAG.getNode(ISD::FADD, SL, VT, AbsA,
2290 DAG.getConstantFP(0.5, SL, VT));
2291 SDValue RoundedA = DAG.getNode(ISD::FTRUNC, SL, VT, AdjustedA);
2292
2293 // RoundedA = abs(A) < 0.5 ? (double)0 : RoundedA;
2294 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2295 SDValue IsSmall =DAG.getSetCC(SL, SetCCVT, AbsA,
2296 DAG.getConstantFP(0.5, SL, VT), ISD::SETOLT);
2297 RoundedA = DAG.getNode(ISD::SELECT, SL, VT, IsSmall,
2298 DAG.getConstantFP(0, SL, VT),
2299 RoundedA);
2300
2301 // Add sign to rounded_A
2302 RoundedA = DAG.getNode(ISD::FCOPYSIGN, SL, VT, RoundedA, A);
2303 DAG.getNode(ISD::FTRUNC, SL, VT, A);
2304
2305 // RoundedA = abs(A) > 0x1.0p52 ? A : RoundedA;
2306 SDValue IsLarge =
2307 DAG.getSetCC(SL, SetCCVT, AbsA, DAG.getConstantFP(pow(2.0, 52.0), SL, VT),
2308 ISD::SETOGT);
2309 return DAG.getNode(ISD::SELECT, SL, VT, IsLarge, A, RoundedA);
2310}
2311
2313 EVT VT = N->getValueType(0);
2314 EVT NVT = MVT::f32;
2315 if (VT.isVector()) {
2316 NVT = EVT::getVectorVT(*DAG.getContext(), NVT, VT.getVectorElementCount());
2317 }
2318 SDLoc DL(N);
2319 SDValue Tmp0 = DAG.getFPExtendOrRound(N->getOperand(0), DL, NVT);
2320 SDValue Tmp1 = DAG.getFPExtendOrRound(N->getOperand(1), DL, NVT);
2321 SDValue Res = DAG.getNode(N->getOpcode(), DL, NVT, Tmp0, Tmp1, N->getFlags());
2322 return DAG.getFPExtendOrRound(Res, DL, VT);
2323}
2324
2325SDValue NVPTXTargetLowering::PromoteBinOpIfF32FTZ(SDValue Op,
2326 SelectionDAG &DAG) const {
2327 if (useF32FTZ(DAG.getMachineFunction())) {
2328 return PromoteBinOpToF32(Op.getNode(), DAG);
2329 }
2330 return Op;
2331}
2332
2333SDValue NVPTXTargetLowering::LowerINT_TO_FP(SDValue Op,
2334 SelectionDAG &DAG) const {
2335 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2336
2337 if (Op.getValueType() == MVT::bf16) {
2338 SDLoc Loc(Op);
2339 return DAG.getNode(
2340 ISD::FP_ROUND, Loc, MVT::bf16,
2341 DAG.getNode(Op.getOpcode(), Loc, MVT::f32, Op.getOperand(0)),
2342 DAG.getIntPtrConstant(0, Loc, /*isTarget=*/true));
2343 }
2344
2345 // Everything else is considered legal.
2346 return Op;
2347}
2348
2349SDValue NVPTXTargetLowering::LowerFP_TO_INT(SDValue Op,
2350 SelectionDAG &DAG) const {
2351 assert(STI.getSmVersion() < 90 || STI.getPTXVersion() < 78);
2352
2353 if (Op.getOperand(0).getValueType() == MVT::bf16) {
2354 SDLoc Loc(Op);
2355 return DAG.getNode(
2356 Op.getOpcode(), Loc, Op.getValueType(),
2357 DAG.getNode(ISD::FP_EXTEND, Loc, MVT::f32, Op.getOperand(0)));
2358 }
2359
2360 // Everything else is considered legal.
2361 return Op;
2362}
2363
2364SDValue NVPTXTargetLowering::LowerFP_ROUND(SDValue Op,
2365 SelectionDAG &DAG) const {
2366 EVT NarrowVT = Op.getValueType();
2367 SDValue Wide = Op.getOperand(0);
2368 EVT WideVT = Wide.getValueType();
2369 if (NarrowVT.getScalarType() == MVT::bf16) {
2370 const TargetLowering *TLI = STI.getTargetLowering();
2371 if (STI.getSmVersion() < 80 || STI.getPTXVersion() < 70) {
2372 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2373 }
2374 if (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78) {
2375 // This combination was the first to support f32 -> bf16.
2376 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 70) {
2377 if (WideVT.getScalarType() == MVT::f32) {
2378 return Op;
2379 }
2380 if (WideVT.getScalarType() == MVT::f64) {
2381 SDLoc Loc(Op);
2382 // Round-inexact-to-odd f64 to f32, then do the final rounding using
2383 // the hardware f32 -> bf16 instruction.
2385 WideVT.changeElementType(*DAG.getContext(), MVT::f32), Wide, Loc,
2386 DAG);
2387 return DAG.getFPExtendOrRound(rod, Loc, NarrowVT);
2388 }
2389 }
2390 return TLI->expandFP_ROUND(Op.getNode(), DAG);
2391 }
2392 }
2393
2394 // Everything else is considered legal.
2395 return Op;
2396}
2397
2398SDValue NVPTXTargetLowering::LowerFP_EXTEND(SDValue Op,
2399 SelectionDAG &DAG) const {
2400 SDValue Narrow = Op.getOperand(0);
2401 EVT NarrowVT = Narrow.getValueType();
2402 EVT WideVT = Op.getValueType();
2403 if (NarrowVT.getScalarType() == MVT::bf16) {
2404 if (WideVT.getScalarType() == MVT::f32 &&
2405 (STI.getSmVersion() < 80 || STI.getPTXVersion() < 71)) {
2406 SDLoc Loc(Op);
2407 return DAG.getNode(ISD::BF16_TO_FP, Loc, WideVT, Narrow);
2408 }
2409 if (WideVT.getScalarType() == MVT::f64 &&
2410 (STI.getSmVersion() < 90 || STI.getPTXVersion() < 78)) {
2411 EVT F32 = NarrowVT.changeElementType(*DAG.getContext(), MVT::f32);
2412 SDLoc Loc(Op);
2413 if (STI.getSmVersion() >= 80 && STI.getPTXVersion() >= 71) {
2414 Op = DAG.getNode(ISD::FP_EXTEND, Loc, F32, Narrow);
2415 } else {
2416 Op = DAG.getNode(ISD::BF16_TO_FP, Loc, F32, Narrow);
2417 }
2418 return DAG.getNode(ISD::FP_EXTEND, Loc, WideVT, Op);
2419 }
2420 }
2421
2422 // Everything else is considered legal.
2423 return Op;
2424}
2425
2427 SDLoc DL(Op);
2428 if (Op.getValueType() != MVT::v2i16)
2429 return Op;
2430 EVT EltVT = Op.getValueType().getVectorElementType();
2431 SmallVector<SDValue> VecElements;
2432 for (int I = 0, E = Op.getValueType().getVectorNumElements(); I < E; I++) {
2433 SmallVector<SDValue> ScalarArgs;
2434 llvm::transform(Op->ops(), std::back_inserter(ScalarArgs),
2435 [&](const SDUse &O) {
2436 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
2437 O.get(), DAG.getIntPtrConstant(I, DL));
2438 });
2439 VecElements.push_back(DAG.getNode(Op.getOpcode(), DL, EltVT, ScalarArgs));
2440 }
2441 SDValue V =
2442 DAG.getNode(ISD::BUILD_VECTOR, DL, Op.getValueType(), VecElements);
2443 return V;
2444}
2445
2447 bool hasOffset = false) {
2448 // skip lowering if the vector operand is already legalized
2449 if (!Op->getOperand(hasOffset ? 4 : 3).getValueType().isVector())
2450 return Op;
2451
2452 SDNode *N = Op.getNode();
2453 SDLoc DL(N);
2455
2456 // split the vector argument
2457 for (size_t I = 0; I < N->getNumOperands(); I++) {
2458 SDValue Val = N->getOperand(I);
2459 EVT ValVT = Val.getValueType();
2460 if (ValVT.isVector()) {
2461 EVT EltVT = ValVT.getVectorElementType();
2462 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2463 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2464 DAG.getIntPtrConstant(J, DL)));
2465 } else
2466 Ops.push_back(Val);
2467 }
2468
2470 SDValue Tcgen05StNode =
2471 DAG.getMemIntrinsicNode(ISD::INTRINSIC_VOID, DL, N->getVTList(), Ops,
2472 MemSD->getMemoryVT(), MemSD->getMemOperand());
2473
2474 return Tcgen05StNode;
2475}
2476
2478 SDLoc DL(Op);
2479 SDValue Src = Op.getOperand(0);
2480 EVT VT = Op.getValueType();
2481
2482 switch (VT.getSimpleVT().SimpleTy) {
2483 case MVT::i16: {
2484 SDValue Extended = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Src);
2485 SDValue Swapped =
2486 getPRMT(Extended, DAG.getConstant(0, DL, MVT::i32), 0x7701, DL, DAG);
2487 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Swapped);
2488 }
2489 case MVT::i32: {
2490 return getPRMT(Src, DAG.getConstant(0, DL, MVT::i32), 0x0123, DL, DAG);
2491 }
2492 case MVT::v2i16: {
2493 SDValue Converted = DAG.getBitcast(MVT::i32, Src);
2494 SDValue Swapped =
2495 getPRMT(Converted, DAG.getConstant(0, DL, MVT::i32), 0x2301, DL, DAG);
2496 return DAG.getNode(ISD::BITCAST, DL, MVT::v2i16, Swapped);
2497 }
2498 case MVT::i64: {
2499 SDValue UnpackSrc =
2500 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, Src);
2501 SDValue SwappedLow =
2502 getPRMT(UnpackSrc.getValue(0), DAG.getConstant(0, DL, MVT::i32), 0x0123,
2503 DL, DAG);
2504 SDValue SwappedHigh =
2505 getPRMT(UnpackSrc.getValue(1), DAG.getConstant(0, DL, MVT::i32), 0x0123,
2506 DL, DAG);
2507 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64,
2508 {SwappedHigh, SwappedLow});
2509 }
2510 default:
2511 llvm_unreachable("unsupported type for bswap");
2512 }
2513}
2514
2516 const Function &Fn = DAG.getMachineFunction().getFunction();
2517 SDNode *N = Op.getNode();
2518 SDLoc DL(N);
2519 Intrinsic::ID IntrinsicID = N->getConstantOperandVal(1);
2520 SDValue DestAddr = N->getOperand(2);
2521 SDValue Value = N->getOperand(3);
2522 SDValue MbarAddr = N->getOperand(4);
2523
2524 MVT ValueVT = Value.getSimpleValueType();
2525
2526 if (ValueVT == MVT::i32 || ValueVT == MVT::i64)
2527 return Op;
2528
2529 if (ValueVT == MVT::i128) {
2530 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Value);
2531 SDValue ValueLo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2532 DAG.getIntPtrConstant(0, DL));
2533 SDValue ValueHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2534 DAG.getIntPtrConstant(1, DL));
2535 SDValue Ops[] = {N->getOperand(0), DestAddr, ValueLo, ValueHi, MbarAddr};
2536 return DAG.getNode(NVPTXISD::ST_ASYNC_MBARRIER_B128, DL, MVT::Other, Ops);
2537 }
2538
2540 Fn,
2541 Twine("unsupported argument type ") + llvm::EVT(ValueVT).getEVTString() +
2542 " for " + llvm::Intrinsic::getName(IntrinsicID) + " intrinsic",
2543 DiagnosticLocation(DL.getDebugLoc())));
2544 return Op.getOperand(0); // Return only the chain
2545}
2546
2548 const Function &Fn = DAG.getMachineFunction().getFunction();
2549 SDNode *N = Op.getNode();
2550 SDLoc DL(N);
2551 Intrinsic::ID IntrinsicID = N->getConstantOperandVal(1);
2552 SDValue DestAddr = N->getOperand(2);
2553 SDValue Value = N->getOperand(3);
2554
2555 MVT ValueVT = Value.getSimpleValueType();
2556
2557 if (ValueVT == MVT::i16 || ValueVT == MVT::i32 || ValueVT == MVT::i64)
2558 return Op;
2559
2560 if (ValueVT == MVT::i8) {
2561 unsigned OpCode;
2562 switch (IntrinsicID) {
2563 case Intrinsic::nvvm_st_async_sys:
2564 OpCode = NVPTXISD::ST_ASYNC_SYS_B8;
2565 break;
2566 case Intrinsic::nvvm_st_async_gpu:
2567 OpCode = NVPTXISD::ST_ASYNC_GPU_B8;
2568 break;
2569 case Intrinsic::nvvm_st_async_mmio_sys:
2570 OpCode = NVPTXISD::ST_ASYNC_MMIO_SYS_B8;
2571 break;
2572 default:
2573 llvm_unreachable("unexpected intrinsic ID for st.async.release");
2574 }
2575
2576 Value = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i16, Value);
2577
2578 // The `.mmio` variant has no multimem form and therefore no `isMultimem`
2579 // operand.
2580 if (IntrinsicID == Intrinsic::nvvm_st_async_mmio_sys) {
2581 SDValue Ops[] = {N->getOperand(0), DestAddr, Value};
2582 return DAG.getNode(OpCode, DL, MVT::Other, Ops);
2583 }
2584
2585 SDValue IsMultimem =
2586 DAG.getTargetConstant(N->getConstantOperandVal(4), DL, MVT::i1);
2587 SDValue Ops[] = {N->getOperand(0), DestAddr, Value, IsMultimem};
2588 return DAG.getNode(OpCode, DL, MVT::Other, Ops);
2589 }
2590
2592 Fn,
2593 Twine("unsupported argument type ") + llvm::EVT(ValueVT).getEVTString() +
2594 " for " + llvm::Intrinsic::getName(IntrinsicID) + " intrinsic",
2595 DiagnosticLocation(DL.getDebugLoc())));
2596 return Op.getOperand(0); // Return only the chain
2597}
2598
2599static unsigned getTcgen05MMADisableOutputLane(unsigned IID) {
2600 switch (IID) {
2601 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2602 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG1;
2603 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2604 return NVPTXISD::TCGEN05_MMA_SHARED_DISABLE_OUTPUT_LANE_CG2;
2605 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2606 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2607 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2608 return NVPTXISD::TCGEN05_MMA_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2609 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2610 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2611 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2612 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2613 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2614 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2615 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2616 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2617 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2618 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2619 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2620 return NVPTXISD::TCGEN05_MMA_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2621 case Intrinsic::
2622 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2623 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2624 case Intrinsic::
2625 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2626 return NVPTXISD::TCGEN05_MMA_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2627 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2628 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG1;
2629 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2630 return NVPTXISD::TCGEN05_MMA_SP_SHARED_DISABLE_OUTPUT_LANE_CG2;
2631 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2632 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2633 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2634 return NVPTXISD::TCGEN05_MMA_SP_SHARED_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2635 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2636 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1;
2637 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2638 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2;
2639 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2640 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2641 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2642 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2643 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2644 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1;
2645 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2646 return NVPTXISD::TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2;
2647 case Intrinsic::
2648 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2649 return NVPTXISD::
2650 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG1_ASHIFT;
2651 case Intrinsic::
2652 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2653 return NVPTXISD::
2654 TCGEN05_MMA_SP_TENSOR_SCALE_D_DISABLE_OUTPUT_LANE_CG2_ASHIFT;
2655 };
2656 llvm_unreachable("unhandled tcgen05.mma.disable_output_lane intrinsic");
2657}
2658
2660 SDNode *N = Op.getNode();
2661 SDLoc DL(N);
2662 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
2663
2665 // split the vector argument
2666 for (size_t I = 0; I < N->getNumOperands(); I++) {
2667 if (I == 1)
2668 continue; // skip IID
2669 SDValue Val = N->getOperand(I);
2670 EVT ValVT = Val.getValueType();
2671 if (ValVT.isVector()) {
2672 EVT EltVT = ValVT.getVectorElementType();
2673 for (unsigned J = 0, NElts = ValVT.getVectorNumElements(); J < NElts; J++)
2674 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
2675 DAG.getIntPtrConstant(J, DL)));
2676 } else
2677 Ops.push_back(Val);
2678 }
2679
2681 SDValue Tcgen05MMANode = DAG.getMemIntrinsicNode(
2682 getTcgen05MMADisableOutputLane(IID), DL, N->getVTList(), Ops,
2683 MemSD->getMemoryVT(), MemSD->getMemOperand());
2684
2685 return Tcgen05MMANode;
2686}
2687
2688// Lower vector return type of tcgen05.ld intrinsics
2689static std::optional<std::pair<SDValue, SDValue>>
2690lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset = false) {
2691 SDLoc DL(N);
2692 EVT ResVT = N->getValueType(0);
2693 if (!ResVT.isVector())
2694 return {}; // already legalized.
2695
2696 const unsigned NumElts = ResVT.getVectorNumElements();
2697
2698 // Create the return type of the instructions
2699 SmallVector<EVT, 5> ListVTs;
2700 for (unsigned i = 0; i < NumElts; ++i)
2701 ListVTs.push_back(MVT::i32);
2702
2703 ListVTs.push_back(N->getValueType(1)); // Chain
2704
2705 SDVTList ResVTs = DAG.getVTList(ListVTs);
2706
2707 SmallVector<SDValue, 8> Ops{N->getOperand(0), N->getOperand(1),
2708 N->getOperand(2)};
2709
2710 if (HasOffset) {
2711 Ops.push_back(N->getOperand(3)); // offset
2712 Ops.push_back(N->getOperand(4)); // Pack flag
2713 } else
2714 Ops.push_back(N->getOperand(3)); // Pack flag
2715
2717 SDValue NewNode =
2719 MemSD->getMemoryVT(), MemSD->getMemOperand());
2720
2721 // split the vector result
2722 SmallVector<SDValue, 4> ScalarRes;
2723 for (unsigned i = 0; i < NumElts; ++i) {
2724 SDValue Res = NewNode.getValue(i);
2725 ScalarRes.push_back(Res);
2726 }
2727
2728 SDValue Chain = NewNode.getValue(NumElts);
2729 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
2730 return {{BuildVector, Chain}};
2731}
2732
2734 unsigned Val) {
2735 SDNode *N = Op.getNode();
2736 SDLoc DL(N);
2737
2738 const Function &Fn = DAG.getMachineFunction().getFunction();
2739
2740 unsigned AS = 0;
2741 if (auto *MemN = dyn_cast<MemIntrinsicSDNode>(N))
2742 AS = MemN->getAddressSpace();
2743 Type *PtrTy = PointerType::get(*DAG.getContext(), AS);
2745
2747 Fn,
2748 "Intrinsic " +
2749 Intrinsic::getName(N->getConstantOperandVal(1), {PtrTy}, M) +
2750 " with value " + Twine(Val) +
2751 " is not supported on the given target.",
2752 DL.getDebugLoc()));
2753 return Op.getOperand(0);
2754}
2755
2757 SDNode *N = Op.getNode();
2758 SDLoc DL(N);
2759
2760 // immediate argument representing elemtype
2761 unsigned Val = N->getConstantOperandVal(3);
2762
2764 Val))
2765 return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
2766
2767 return Op;
2768}
2769
2771 SDNode *N = Op.getNode();
2772 SDLoc DL(N);
2773
2774 // immediate argument representing swizzle mode
2775 unsigned Val = N->getConstantOperandVal(3);
2776
2778 Val))
2779 return reportInvalidTensormapReplaceUsage(Op, DAG, Val);
2780
2781 return Op;
2782}
2783
2785 SDNode *N = Op.getNode();
2786 SDValue Intrin = N->getOperand(1);
2787
2788 // Get the intrinsic ID
2789 unsigned IntrinNo = cast<ConstantSDNode>(Intrin.getNode())->getZExtValue();
2790 switch (IntrinNo) {
2791 default:
2792 break;
2793 case Intrinsic::nvvm_st_async:
2794 return lowerStAsyncWithMbarrier(Op, DAG);
2795 case Intrinsic::nvvm_st_async_sys:
2796 case Intrinsic::nvvm_st_async_gpu:
2797 case Intrinsic::nvvm_st_async_mmio_sys:
2798 return lowerStAsyncRelease(Op, DAG);
2799 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
2800 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
2801 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
2802 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
2803 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
2804 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
2805 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
2806 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
2807 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
2808 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
2809 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
2810 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
2811 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
2812 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
2813 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
2814 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
2815 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
2816 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
2817 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
2818 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
2819 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
2820 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
2821 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
2822 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
2823 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
2824 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
2825 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
2826 return lowerTcgen05St(Op, DAG);
2827 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2:
2828 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4:
2829 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8:
2830 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16:
2831 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32:
2832 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64:
2833 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128:
2834 return lowerTcgen05St(Op, DAG, /* hasOffset */ true);
2835 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
2836 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
2837 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
2838 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
2839 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
2840 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
2841 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
2842 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
2843 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
2844 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
2845 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
2846 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
2847 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
2848 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
2849 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
2850 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
2851 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
2852 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
2853 case Intrinsic::
2854 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
2855 case Intrinsic::
2856 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
2857 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
2858 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
2859 case Intrinsic::
2860 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift:
2861 case Intrinsic::
2862 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift:
2864 case Intrinsic::nvvm_tensormap_replace_elemtype:
2865 return lowerTensormapReplaceElemtype(Op, DAG);
2866 case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
2868 }
2869 return Op;
2870}
2871
2873 SelectionDAG &DAG) {
2874
2875 SDNode *N = Op.getNode();
2876 if (N->getOperand(1).getValueType() != MVT::i128) {
2877 // return, if the operand is already lowered
2878 return SDValue();
2879 }
2880
2881 unsigned IID =
2882 cast<ConstantSDNode>(N->getOperand(0).getNode())->getZExtValue();
2883 auto Opcode = [&]() {
2884 switch (IID) {
2885 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
2886 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_IS_CANCELED;
2887 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
2888 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_X;
2889 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
2890 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Y;
2891 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
2892 return NVPTXISD::CLUSTERLAUNCHCONTROL_QUERY_CANCEL_GET_FIRST_CTAID_Z;
2893 default:
2894 llvm_unreachable("unsupported/unhandled intrinsic");
2895 }
2896 }();
2897
2898 SDLoc DL(N);
2899 SDValue TryCancelResponse = N->getOperand(1);
2900 SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, TryCancelResponse);
2901 SDValue TryCancelResponse0 =
2902 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2903 DAG.getIntPtrConstant(0, DL));
2904 SDValue TryCancelResponse1 =
2905 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
2906 DAG.getIntPtrConstant(1, DL));
2907
2908 return DAG.getNode(Opcode, DL, N->getVTList(),
2909 {TryCancelResponse0, TryCancelResponse1});
2910}
2911
2913 SDNode *N = Op.getNode();
2914 SDLoc DL(N);
2915 SDValue F32Vec = N->getOperand(1);
2916 SDValue RBits = N->getOperand(2);
2917
2918 unsigned IntrinsicID = N->getConstantOperandVal(0);
2919
2920 // Extract the 4 float elements from the vector
2922 for (unsigned i = 0; i < 4; ++i)
2923 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, F32Vec,
2924 DAG.getIntPtrConstant(i, DL)));
2925
2927
2928 auto [OpCode, RetTy, CvtModeFlag] =
2929 [&]() -> std::tuple<unsigned, MVT::SimpleValueType, uint32_t> {
2930 switch (IntrinsicID) {
2931 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
2932 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8,
2933 CvtMode::RS | CvtMode::RELU_FLAG};
2934 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
2935 return {NVPTXISD::CVT_E4M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2936 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
2937 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8,
2938 CvtMode::RS | CvtMode::RELU_FLAG};
2939 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
2940 return {NVPTXISD::CVT_E5M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2941 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
2942 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8,
2943 CvtMode::RS | CvtMode::RELU_FLAG};
2944 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
2945 return {NVPTXISD::CVT_E2M3X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2946 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
2947 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8,
2948 CvtMode::RS | CvtMode::RELU_FLAG};
2949 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
2950 return {NVPTXISD::CVT_E3M2X4_F32X4_RS_SF, MVT::v4i8, CvtMode::RS};
2951 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
2952 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16,
2953 CvtMode::RS | CvtMode::RELU_FLAG};
2954 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
2955 return {NVPTXISD::CVT_E2M1X4_F32X4_RS_SF, MVT::i16, CvtMode::RS};
2956 default:
2957 llvm_unreachable("unsupported/unhandled intrinsic");
2958 }
2959 }();
2960
2961 Ops.push_back(RBits);
2962 Ops.push_back(DAG.getConstant(CvtModeFlag, DL, MVT::i32));
2963
2964 return DAG.getNode(OpCode, DL, RetTy, Ops);
2965}
2966
2968 const unsigned Mode = [&]() {
2969 switch (Op->getConstantOperandVal(0)) {
2970 case Intrinsic::nvvm_prmt:
2972 case Intrinsic::nvvm_prmt_b4e:
2974 case Intrinsic::nvvm_prmt_ecl:
2976 case Intrinsic::nvvm_prmt_ecr:
2978 case Intrinsic::nvvm_prmt_f4e:
2980 case Intrinsic::nvvm_prmt_rc16:
2982 case Intrinsic::nvvm_prmt_rc8:
2984 default:
2985 llvm_unreachable("unsupported/unhandled intrinsic");
2986 }
2987 }();
2988 SDLoc DL(Op);
2989 SDValue A = Op->getOperand(1);
2990 SDValue B = Op.getNumOperands() == 4 ? Op.getOperand(2)
2991 : DAG.getConstant(0, DL, MVT::i32);
2992 SDValue Selector = (Op->op_end() - 1)->get();
2993 return getPRMT(A, B, Selector, DL, DAG, Mode);
2994}
2995
// Builds the Intrinsic:: enumerator name of a tcgen05.ld.red intrinsic by
// pasting together its shape (e.g. 32x32b), vector count and lower-case
// element type (e.g. f32).
2996#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE) \
2997 Intrinsic::nvvm_tcgen05_ld_red_##SHAPE##_x##NUM##_##TYPE
2998
// Builds the matching NVPTXISD:: opcode name; note the upper-case "_X" and the
// upper-case element type (e.g. F32) expected by this spelling.
2999#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE) \
3000 NVPTXISD::TCGEN05_LD_RED_##SHAPE##_X##NUM##_##TYPE
3001
3002static unsigned getTcgen05LdRedID(Intrinsic::ID IID) {
3003 switch (IID) {
3004 case TCGEN05_LD_RED_INTR(32x32b, 2, f32):
3005 return TCGEN05_LD_RED_INST(32x32b, 2, F32);
3006 case TCGEN05_LD_RED_INTR(32x32b, 4, f32):
3007 return TCGEN05_LD_RED_INST(32x32b, 4, F32);
3008 case TCGEN05_LD_RED_INTR(32x32b, 8, f32):
3009 return TCGEN05_LD_RED_INST(32x32b, 8, F32);
3010 case TCGEN05_LD_RED_INTR(32x32b, 16, f32):
3011 return TCGEN05_LD_RED_INST(32x32b, 16, F32);
3012 case TCGEN05_LD_RED_INTR(32x32b, 32, f32):
3013 return TCGEN05_LD_RED_INST(32x32b, 32, F32);
3014 case TCGEN05_LD_RED_INTR(32x32b, 64, f32):
3015 return TCGEN05_LD_RED_INST(32x32b, 64, F32);
3016 case TCGEN05_LD_RED_INTR(32x32b, 128, f32):
3017 return TCGEN05_LD_RED_INST(32x32b, 128, F32);
3018 case TCGEN05_LD_RED_INTR(16x32bx2, 2, f32):
3019 return TCGEN05_LD_RED_INST(16x32bx2, 2, F32);
3020 case TCGEN05_LD_RED_INTR(16x32bx2, 4, f32):
3021 return TCGEN05_LD_RED_INST(16x32bx2, 4, F32);
3022 case TCGEN05_LD_RED_INTR(16x32bx2, 8, f32):
3023 return TCGEN05_LD_RED_INST(16x32bx2, 8, F32);
3024 case TCGEN05_LD_RED_INTR(16x32bx2, 16, f32):
3025 return TCGEN05_LD_RED_INST(16x32bx2, 16, F32);
3026 case TCGEN05_LD_RED_INTR(16x32bx2, 32, f32):
3027 return TCGEN05_LD_RED_INST(16x32bx2, 32, F32);
3028 case TCGEN05_LD_RED_INTR(16x32bx2, 64, f32):
3029 return TCGEN05_LD_RED_INST(16x32bx2, 64, F32);
3030 case TCGEN05_LD_RED_INTR(16x32bx2, 128, f32):
3031 return TCGEN05_LD_RED_INST(16x32bx2, 128, F32);
3032 case TCGEN05_LD_RED_INTR(32x32b, 2, i32):
3033 return TCGEN05_LD_RED_INST(32x32b, 2, I32);
3034 case TCGEN05_LD_RED_INTR(32x32b, 4, i32):
3035 return TCGEN05_LD_RED_INST(32x32b, 4, I32);
3036 case TCGEN05_LD_RED_INTR(32x32b, 8, i32):
3037 return TCGEN05_LD_RED_INST(32x32b, 8, I32);
3038 case TCGEN05_LD_RED_INTR(32x32b, 16, i32):
3039 return TCGEN05_LD_RED_INST(32x32b, 16, I32);
3040 case TCGEN05_LD_RED_INTR(32x32b, 32, i32):
3041 return TCGEN05_LD_RED_INST(32x32b, 32, I32);
3042 case TCGEN05_LD_RED_INTR(32x32b, 64, i32):
3043 return TCGEN05_LD_RED_INST(32x32b, 64, I32);
3044 case TCGEN05_LD_RED_INTR(32x32b, 128, i32):
3045 return TCGEN05_LD_RED_INST(32x32b, 128, I32);
3046 case TCGEN05_LD_RED_INTR(16x32bx2, 2, i32):
3047 return TCGEN05_LD_RED_INST(16x32bx2, 2, I32);
3048 case TCGEN05_LD_RED_INTR(16x32bx2, 4, i32):
3049 return TCGEN05_LD_RED_INST(16x32bx2, 4, I32);
3050 case TCGEN05_LD_RED_INTR(16x32bx2, 8, i32):
3051 return TCGEN05_LD_RED_INST(16x32bx2, 8, I32);
3052 case TCGEN05_LD_RED_INTR(16x32bx2, 16, i32):
3053 return TCGEN05_LD_RED_INST(16x32bx2, 16, I32);
3054 case TCGEN05_LD_RED_INTR(16x32bx2, 32, i32):
3055 return TCGEN05_LD_RED_INST(16x32bx2, 32, I32);
3056 case TCGEN05_LD_RED_INTR(16x32bx2, 64, i32):
3057 return TCGEN05_LD_RED_INST(16x32bx2, 64, I32);
3058 case TCGEN05_LD_RED_INTR(16x32bx2, 128, i32):
3059 return TCGEN05_LD_RED_INST(16x32bx2, 128, I32);
3060 default:
3061 llvm_unreachable("Invalid tcgen05.ld.red intrinsic ID");
3062 }
3063}
3064
3065// Lower vector return type of tcgen05.ld intrinsics
3066static std::optional<std::tuple<SDValue, SDValue, SDValue>>
3068 SDLoc DL(N);
3069 EVT ResVT = N->getValueType(0);
3070 if (!ResVT.isVector())
3071 return {}; // already legalized.
3072
3073 const unsigned NumElts = ResVT.getVectorNumElements();
3074
3075 // Create the return type of the instructions
3076 // +1 represents the reduction value
3077 SmallVector<EVT, 132> ListVTs{
3078 NumElts + 1,
3079 ResVT.getVectorElementType().isFloatingPoint() ? MVT::f32 : MVT::i32};
3080
3081 ListVTs.push_back(MVT::Other); // Chain
3082
3083 SDVTList ResVTs = DAG.getVTList(ListVTs);
3084
3085 // Prepare the Operands
3086 SmallVector<SDValue, 8> Ops{N->getOperand(0)}; // Chain
3087
3088 // skip IID at index 1
3089 for (unsigned i = 2; i < N->getNumOperands(); i++)
3090 Ops.push_back(N->getOperand(i));
3091
3092 unsigned IID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
3094 SDValue NewNode =
3095 DAG.getMemIntrinsicNode(getTcgen05LdRedID(IID), DL, ResVTs, Ops,
3096 MemSD->getMemoryVT(), MemSD->getMemOperand());
3097
3098 // Split vector result
3099 SmallVector<SDValue, 132> ScalarRes;
3100 for (unsigned i = 0; i < NumElts; ++i) {
3101 SDValue Res = NewNode.getValue(i);
3102 ScalarRes.push_back(Res);
3103 }
3104
3105 SDValue BuildVector = DAG.getNode(ISD::BUILD_VECTOR, DL, ResVT, ScalarRes);
3106 SDValue RedResult = NewNode.getValue(NumElts);
3107 SDValue Chain = NewNode.getValue(NumElts + 1);
3108 return {{BuildVector, RedResult, Chain}};
3109}
3110
3112 switch (Op->getConstantOperandVal(1)) {
3113 default:
3114 return Op;
3115
3116 // These tcgen05 intrinsics return a v2i32, which is legal, so we have to
3117 // lower them through LowerOperation() instead of ReplaceNodeResults().
3118 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
3119 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
3120 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
3121 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG))
3122 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3123 return SDValue();
3124
3125 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
3126 if (auto Res = lowerTcgen05Ld(Op.getNode(), DAG, /*HasOffset=*/true))
3127 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(Op));
3128 return SDValue();
3129
3130 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
3131 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
3132 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32:
3133 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32:
3134 if (auto Res = lowerTcgen05LdRed(Op.getNode(), DAG))
3135 return DAG.getMergeValues(
3136 {std::get<0>(*Res), std::get<1>(*Res), std::get<2>(*Res)}, SDLoc(Op));
3137 return SDValue();
3138 }
3139}
3140
3142 switch (Op->getConstantOperandVal(0)) {
3143 default:
3144 return Op;
3145 case Intrinsic::nvvm_prmt:
3146 case Intrinsic::nvvm_prmt_b4e:
3147 case Intrinsic::nvvm_prmt_ecl:
3148 case Intrinsic::nvvm_prmt_ecr:
3149 case Intrinsic::nvvm_prmt_f4e:
3150 case Intrinsic::nvvm_prmt_rc16:
3151 case Intrinsic::nvvm_prmt_rc8:
3152 return lowerPrmtIntrinsic(Op, DAG);
3153 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_is_canceled:
3154 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_x:
3155 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_y:
3156 case Intrinsic::nvvm_clusterlaunchcontrol_query_cancel_get_first_ctaid_z:
3158 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_satfinite:
3159 case Intrinsic::nvvm_f32x4_to_e4m3x4_rs_relu_satfinite:
3160 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_satfinite:
3161 case Intrinsic::nvvm_f32x4_to_e5m2x4_rs_relu_satfinite:
3162 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_satfinite:
3163 case Intrinsic::nvvm_f32x4_to_e2m3x4_rs_relu_satfinite:
3164 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_satfinite:
3165 case Intrinsic::nvvm_f32x4_to_e3m2x4_rs_relu_satfinite:
3166 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_satfinite:
3167 case Intrinsic::nvvm_f32x4_to_e2m1x4_rs_relu_satfinite:
3168 return lowerCvtRSIntrinsics(Op, DAG);
3169 }
3170}
3171
3172// In PTX 64-bit CTLZ and CTPOP are supported, but they return a 32-bit value.
3173// Lower these into a node returning the correct type which is zero-extended
3174// back to the correct size.
3176 SDValue V = Op->getOperand(0);
3177 assert(V.getValueType() == MVT::i64 &&
3178 "Unexpected CTLZ/CTPOP type to legalize");
3179
3180 SDLoc DL(Op);
3181 SDValue CT = DAG.getNode(Op->getOpcode(), DL, MVT::i32, V);
3182 return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, CT, SDNodeFlags::NonNeg);
3183}
3184
3186 unsigned Opcode, SelectionDAG &DAG) {
3187 assert(A.getValueType() == MVT::i64 && B.getValueType() == MVT::i64);
3188
3189 const auto *AmtConst = dyn_cast<ConstantSDNode>(ShiftAmount);
3190 if (!AmtConst)
3191 return SDValue();
3192 const auto Amt = AmtConst->getZExtValue() & 63;
3193
3194 SDValue UnpackA =
3195 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, A);
3196 SDValue UnpackB =
3197 DAG.getNode(NVPTXISD::UNPACK_VECTOR, DL, {MVT::i32, MVT::i32}, B);
3198
3199 // Arch is little-endian: 0 = low bits, 1 = high bits
3200 SDValue ALo = UnpackA.getValue(0);
3201 SDValue AHi = UnpackA.getValue(1);
3202 SDValue BLo = UnpackB.getValue(0);
3203 SDValue BHi = UnpackB.getValue(1);
3204
3205 // The bitfield consists of { AHi : ALo : BHi : BLo }
3206 //
3207 // * FSHL, Amt < 32 - The window will contain { AHi : ALo : BHi }
3208 // * FSHL, Amt >= 32 - The window will contain { ALo : BHi : BLo }
3209 // * FSHR, Amt < 32 - The window will contain { ALo : BHi : BLo }
3210 // * FSHR, Amt >= 32 - The window will contain { AHi : ALo : BHi }
3211 //
3212 // Note that Amt = 0 and Amt = 32 are special cases where 32-bit funnel shifts
3213 // are not needed at all. Amt = 0 is a no-op producing either A or B depending
3214 // on the direction. Amt = 32 can be implemented by a packing and unpacking
3215 // move to select and arrange the 32bit values. For simplicity, these cases
3216 // are not handled here explicitly and instead we rely on DAGCombiner to
3217 // remove the no-op funnel shifts we insert.
3218 auto [High, Mid, Low] = ((Opcode == ISD::FSHL) == (Amt < 32))
3219 ? std::make_tuple(AHi, ALo, BHi)
3220 : std::make_tuple(ALo, BHi, BLo);
3221
3222 SDValue NewAmt = DAG.getConstant(Amt & 31, DL, MVT::i32);
3223 SDValue RHi = DAG.getNode(Opcode, DL, MVT::i32, {High, Mid, NewAmt});
3224 SDValue RLo = DAG.getNode(Opcode, DL, MVT::i32, {Mid, Low, NewAmt});
3225
3226 return DAG.getNode(NVPTXISD::BUILD_VECTOR, DL, MVT::i64, {RLo, RHi});
3227}
3228
3230 return expandFSH64(Op->getOperand(0), Op->getOperand(1), Op->getOperand(2),
3231 SDLoc(Op), Op->getOpcode(), DAG);
3232}
3233
3235 unsigned Opcode = Op->getOpcode() == ISD::ROTL ? ISD::FSHL : ISD::FSHR;
3236 return expandFSH64(Op->getOperand(0), Op->getOperand(0), Op->getOperand(1),
3237 SDLoc(Op), Opcode, DAG);
3238}
3239
3241 // Lower (frem x, y) into (sub x, (mul (ftrunc (div x, y)) y)),
3242 // i.e. "poor man's fmod()". When y is infinite, x is returned. This matches
3243 // the semantics of LLVM's frem.
3244 SDLoc DL(Op);
3245 SDValue X = Op->getOperand(0);
3246 SDValue Y = Op->getOperand(1);
3247 EVT Ty = Op.getValueType();
3248 SDNodeFlags Flags = Op->getFlags();
3249
3250 SDValue Div = DAG.getNode(ISD::FDIV, DL, Ty, X, Y, Flags);
3251 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, Ty, Div, Flags);
3252 SDValue Mul = DAG.getNode(ISD::FMUL, DL, Ty, Trunc, Y,
3254 SDValue Sub = DAG.getNode(ISD::FSUB, DL, Ty, X, Mul,
3256
3257 if (Flags.hasNoInfs())
3258 return Sub;
3259
3260 // If Y is infinite, return X
3261 SDValue AbsY = DAG.getNode(ISD::FABS, DL, Ty, Y);
3262 SDValue Inf =
3263 DAG.getConstantFP(APFloat::getInf(Ty.getFltSemantics()), DL, Ty);
3264 SDValue IsInf = DAG.getSetCC(DL, MVT::i1, AbsY, Inf, ISD::SETEQ);
3265 return DAG.getSelect(DL, Ty, IsInf, X, Sub);
3266}
3267
3269 assert(Op.getValueType() == MVT::i1 && "Custom lowering enabled only for i1");
3270
3271 SDValue Cond = Op->getOperand(0);
3272 SDValue TrueVal = Op->getOperand(1);
3273 SDValue FalseVal = Op->getOperand(2);
3274 SDLoc DL(Op);
3275
3276 // If both operands are truncated, we push the select through the truncates.
3277 if (TrueVal.getOpcode() == ISD::TRUNCATE &&
3278 FalseVal.getOpcode() == ISD::TRUNCATE) {
3279 TrueVal = TrueVal.getOperand(0);
3280 FalseVal = FalseVal.getOperand(0);
3281
3282 EVT VT = TrueVal.getSimpleValueType().bitsLE(FalseVal.getSimpleValueType())
3283 ? TrueVal.getValueType()
3284 : FalseVal.getValueType();
3285 TrueVal = DAG.getAnyExtOrTrunc(TrueVal, DL, VT);
3286 FalseVal = DAG.getAnyExtOrTrunc(FalseVal, DL, VT);
3287 SDValue Select = DAG.getSelect(DL, VT, Cond, TrueVal, FalseVal);
3288 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Select);
3289 }
3290
3291 // Otherwise, expand the select into a series of logical operations. These
3292 // often can be folded into other operations either by us or ptxas.
3293 TrueVal = DAG.getFreeze(TrueVal);
3294 FalseVal = DAG.getFreeze(FalseVal);
3295 SDValue And1 = DAG.getNode(ISD::AND, DL, MVT::i1, Cond, TrueVal);
3296 SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
3297 SDValue And2 = DAG.getNode(ISD::AND, DL, MVT::i1, NotCond, FalseVal);
3298 SDValue Or = DAG.getNode(ISD::OR, DL, MVT::i1, And1, And2);
3299 return Or;
3300}
3301
3303 SDNode *N = Op.getNode();
3304
3305 SDValue Chain = N->getOperand(0);
3306 SDValue Val = N->getOperand(1);
3307 SDValue BasePtr = N->getOperand(2);
3308 SDValue Offset = N->getOperand(3);
3309 SDValue Mask = N->getOperand(4);
3310
3311 SDLoc DL(N);
3312 EVT ValVT = Val.getValueType();
3313 MemSDNode *MemSD = cast<MemSDNode>(N);
3314 assert(ValVT.isVector() && "Masked vector store must have vector type");
3315 assert(MemSD->getAlign() >= DAG.getEVTAlign(ValVT) &&
3316 "Unexpected alignment for masked store");
3317
3318 unsigned Opcode = 0;
3319 switch (ValVT.getSimpleVT().SimpleTy) {
3320 default:
3321 llvm_unreachable("Unexpected masked vector store type");
3322 case MVT::v4i64:
3323 case MVT::v4f64: {
3324 Opcode = NVPTXISD::StoreV4;
3325 break;
3326 }
3327 case MVT::v8i32:
3328 case MVT::v8f32: {
3329 Opcode = NVPTXISD::StoreV8;
3330 break;
3331 }
3332 }
3333
3335
3336 // Construct the new SDNode. First operand is the chain.
3337 Ops.push_back(Chain);
3338
3339 // The next N operands are the values to store. Encode the mask into the
3340 // values using the sentinel register 0 to represent a masked-off element.
3341 assert(Mask.getValueType().isVector() &&
3342 Mask.getValueType().getVectorElementType() == MVT::i1 &&
3343 "Mask must be a vector of i1");
3344 assert(Mask.getOpcode() == ISD::BUILD_VECTOR &&
3345 "Mask expected to be a BUILD_VECTOR");
3346 assert(Mask.getValueType().getVectorNumElements() ==
3347 ValVT.getVectorNumElements() &&
3348 "Mask size must be the same as the vector size");
3349 for (auto [I, Op] : enumerate(Mask->ops())) {
3350 // Mask elements must be constants.
3351 if (Op.getNode()->getAsZExtVal() == 0) {
3352 // Append a sentinel register 0 to the Ops vector to represent a masked
3353 // off element, this will be handled in tablegen
3355 ValVT.getVectorElementType()));
3356 } else {
3357 // Extract the element from the vector to store
3358 SDValue ExtVal =
3360 Val, DAG.getIntPtrConstant(I, DL));
3361 Ops.push_back(ExtVal);
3362 }
3363 }
3364
3365 // Next, the pointer operand.
3366 Ops.push_back(BasePtr);
3367
3368 // Finally, the offset operand. We expect this to always be undef, and it will
3369 // be ignored in lowering, but to mirror the handling of the other vector
3370 // store instructions we include it in the new SDNode.
3371 assert(Offset.getOpcode() == ISD::UNDEF &&
3372 "Offset operand expected to be undef");
3373 Ops.push_back(Offset);
3374
3375 SDValue NewSt =
3376 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3377 MemSD->getMemoryVT(), MemSD->getMemOperand());
3378
3379 return NewSt;
3380}
3381
3382SDValue
3384 switch (Op.getOpcode()) {
3385 case ISD::RETURNADDR:
3386 return SDValue();
3387 case ISD::FRAMEADDR:
3388 return SDValue();
3389 case ISD::ADDRSPACECAST:
3390 return LowerADDRSPACECAST(Op, DAG);
3392 return lowerIntrinsicWChain(Op, DAG);
3394 return lowerIntrinsicWOChain(Op, DAG);
3396 return lowerIntrinsicVoid(Op, DAG);
3397 case ISD::BUILD_VECTOR:
3398 return LowerBUILD_VECTOR(Op, DAG);
3399 case ISD::BITCAST:
3400 return LowerBITCAST(Op, DAG);
3402 return Op;
3404 return LowerEXTRACT_VECTOR_ELT(Op, DAG);
3406 return LowerINSERT_VECTOR_ELT(Op, DAG);
3408 return LowerVECTOR_SHUFFLE(Op, DAG);
3410 return LowerCONCAT_VECTORS(Op, DAG);
3415 return LowerVECREDUCE(Op, DAG);
3416 case ISD::STORE:
3417 return LowerSTORE(Op, DAG);
3418 case ISD::MSTORE: {
3419 assert(STI.has256BitVectorLoadStore(
3420 cast<MemSDNode>(Op.getNode())->getAddressSpace()) &&
3421 "Masked store vector not supported on subtarget.");
3422 return lowerMSTORE(Op, DAG);
3423 }
3424 case ISD::LOAD:
3425 return LowerLOAD(Op, DAG);
3426 case ISD::MLOAD:
3427 return LowerMLOAD(Op, DAG);
3428 case ISD::SHL_PARTS:
3429 return LowerShiftLeftParts(Op, DAG);
3430 case ISD::SRA_PARTS:
3431 case ISD::SRL_PARTS:
3432 return LowerShiftRightParts(Op, DAG);
3433 case ISD::SELECT:
3434 return lowerSELECT(Op, DAG);
3435 case ISD::FROUND:
3436 return LowerFROUND(Op, DAG);
3437 case ISD::FCOPYSIGN:
3438 return LowerFCOPYSIGN(Op, DAG);
3439 case ISD::SINT_TO_FP:
3440 case ISD::UINT_TO_FP:
3441 return LowerINT_TO_FP(Op, DAG);
3442 case ISD::FP_TO_SINT:
3443 case ISD::FP_TO_UINT:
3444 // fptosi/fptoui to i1 truncate toward zero, so the only defined results
3445 // are {0,-1} (signed) and {0,1} (unsigned); every other input results in
3446 // poison. Thus we can simply lower to `x <= -1.0` or `x >= 1.0`.
3447 if (Op.getValueType() == MVT::i1) {
3448 SDLoc DL(Op);
3449 SDValue X = Op.getOperand(0);
3450 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
3451 return DAG.getSetCC(
3452 DL, MVT::i1, X,
3453 DAG.getConstantFP(IsSigned ? -1.0 : 1.0, DL, X.getValueType()),
3454 IsSigned ? ISD::SETOLE : ISD::SETOGE);
3455 }
3456 return LowerFP_TO_INT(Op, DAG);
3457 case ISD::FP_ROUND:
3458 return LowerFP_ROUND(Op, DAG);
3459 case ISD::FP_EXTEND:
3460 return LowerFP_EXTEND(Op, DAG);
3461 case ISD::VAARG:
3462 return LowerVAARG(Op, DAG);
3463 case ISD::VASTART:
3464 return LowerVASTART(Op, DAG);
3465 case ISD::FSHL:
3466 case ISD::FSHR:
3467 return lowerFSH(Op, DAG);
3468 case ISD::ROTL:
3469 case ISD::ROTR:
3470 return lowerROT(Op, DAG);
3471 case ISD::ABS:
3473 case ISD::SMIN:
3474 case ISD::SMAX:
3475 case ISD::UMIN:
3476 case ISD::UMAX:
3477 case ISD::ADD:
3478 case ISD::SUB:
3479 case ISD::MUL:
3480 case ISD::SHL:
3481 case ISD::SREM:
3482 case ISD::UREM:
3483 return LowerVectorArith(Op, DAG);
3485 return LowerDYNAMIC_STACKALLOC(Op, DAG);
3486 case ISD::STACKRESTORE:
3487 return LowerSTACKRESTORE(Op, DAG);
3488 case ISD::STACKSAVE:
3489 return LowerSTACKSAVE(Op, DAG);
3490 case ISD::CopyToReg:
3491 return LowerCopyToReg_128(Op, DAG);
3492 case ISD::FADD:
3493 case ISD::FSUB:
3494 case ISD::FMUL:
3495 // Used only for bf16 on SM80, where we select fma for non-ftz operation
3496 return PromoteBinOpIfF32FTZ(Op, DAG);
3497 case ISD::CTPOP:
3498 case ISD::CTLZ:
3499 return lowerCTLZCTPOP(Op, DAG);
3500 case ISD::FREM:
3501 return lowerFREM(Op, DAG);
3502 case ISD::BSWAP:
3503 return lowerBSWAP(Op, DAG);
3504 default:
3505 llvm_unreachable("Custom lowering not defined for operation");
3506 }
3507}
3508
3509// This will prevent AsmPrinter from trying to print the jump tables itself.
3513
// Custom lowering for ADDRSPACECAST.
//
// A cast with the generic address space on either side is returned unchanged.
// A cast between two non-generic spaces is only given a lowering for the
// shared / shared-cluster pair, which is rewritten as two casts that go
// through the generic space; every other non-generic pair yields UNDEF of the
// result type.
3514SDValue NVPTXTargetLowering::LowerADDRSPACECAST(SDValue Op,
3515 SelectionDAG &DAG) const {
3517 unsigned SrcAS = N->getSrcAddressSpace();
3518 unsigned DestAS = N->getDestAddressSpace();
// Only a cast where neither side is the generic space needs rewriting.
3519 if (SrcAS != llvm::ADDRESS_SPACE_GENERIC &&
3520 DestAS != llvm::ADDRESS_SPACE_GENERIC) {
3521 // Shared and SharedCluster can be converted to each other through generic
3522 // space
3523 if ((SrcAS == llvm::ADDRESS_SPACE_SHARED &&
3526 DestAS == llvm::ADDRESS_SPACE_SHARED)) {
// Emit source -> generic first, then generic -> destination using the
// original result type.
3527 SDLoc DL(Op.getNode());
3528 const MVT GenerictVT =
3530 SDValue GenericConversion = DAG.getAddrSpaceCast(
3531 DL, GenerictVT, Op.getOperand(0), SrcAS, ADDRESS_SPACE_GENERIC);
3532 SDValue SharedClusterConversion =
3533 DAG.getAddrSpaceCast(DL, Op.getValueType(), GenericConversion,
3534 ADDRESS_SPACE_GENERIC, DestAS);
3535 return SharedClusterConversion;
3536 }
3537
// No lowering exists for any other pair of non-generic spaces.
3538 return DAG.getUNDEF(Op.getValueType());
3539 }
3540
// At least one side is generic: keep the node as it is.
3541 return Op;
3542}
3543
3544// This function is almost a copy of SelectionDAG::expandVAArg().
3545// The only diff is that this one produces loads from local address space.
//
// Node operands as used below: 0 is the incoming chain, 1 is the address of
// the va_list object, 2 is its SrcValue and 3 is a constant alignment
// request. The returned value is the load of the argument itself.
3546SDValue NVPTXTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
3547 const TargetLowering *TLI = STI.getTargetLowering();
3548 SDLoc DL(Op);
3549
3550 SDNode *Node = Op.getNode();
3551 const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
3552 EVT VT = Node->getValueType(0);
3553 auto *Ty = VT.getTypeForEVT(*DAG.getContext());
3554 SDValue Tmp1 = Node->getOperand(0);
3555 SDValue Tmp2 = Node->getOperand(1);
3556 const MaybeAlign MA(Node->getConstantOperandVal(3));
3557
// Read the current argument pointer out of the va_list object.
3558 SDValue VAListLoad = DAG.getLoad(TLI->getPointerTy(DAG.getDataLayout()), DL,
3559 Tmp1, Tmp2, MachinePointerInfo(V));
3560 SDValue VAList = VAListLoad;
3561
// An alignment above the minimum stack argument alignment requires rounding
// the pointer up: add (MA - 1), then mask with -MA.
3562 if (MA && *MA > TLI->getMinStackArgumentAlignment()) {
3563 VAList = DAG.getNode(
3564 ISD::ADD, DL, VAList.getValueType(), VAList,
3565 DAG.getConstant(MA->value() - 1, DL, VAList.getValueType()));
3566
3567 VAList = DAG.getNode(ISD::AND, DL, VAList.getValueType(), VAList,
3568 DAG.getSignedConstant(-(int64_t)MA->value(), DL,
3569 VAList.getValueType()));
3570 }
3571
3572 // Increment the pointer, VAList, to the next vaarg
3573 Tmp1 = DAG.getNode(ISD::ADD, DL, VAList.getValueType(), VAList,
3575 DL, VAList.getValueType()));
3576
3577 // Store the incremented VAList to the legalized pointer
// The store is chained on the va_list load (its result #1) and writes back to
// the same address the pointer was read from.
3578 Tmp1 = DAG.getStore(VAListLoad.getValue(1), DL, Tmp1, Tmp2,
3579 MachinePointerInfo(V));
3580
3581 const Value *SrcV = Constant::getNullValue(
3583
3584 // Load the actual argument out of the pointer VAList
// Tmp1 now holds the store's chain, so this load is ordered after the va_list
// update. The address is the (possibly realigned) pre-increment pointer.
3585 return DAG.getLoad(VT, DL, Tmp1, VAList, MachinePointerInfo(SrcV));
3586}
3587
3588SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
3589 const TargetLowering *TLI = STI.getTargetLowering();
3590 SDLoc DL(Op);
3591 EVT PtrVT = TLI->getPointerTy(DAG.getDataLayout());
3592
3593 // Store the address of unsized array <function>_vararg[] in the ap object.
3594 SDValue VAReg = getParamSymbol(DAG, /* vararg */ -1, PtrVT);
3595
3596 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
3597 return DAG.getStore(Op.getOperand(0), DL, VAReg, Op.getOperand(1),
3598 MachinePointerInfo(SV));
3599}
3600
3601static std::pair<MemSDNode *, uint32_t>
3603 const NVPTXSubtarget &STI) {
3604 SDValue Chain = N->getOperand(0);
3605 SDValue BasePtr = N->getOperand(1);
3606 SDValue Mask = N->getOperand(3);
3607 [[maybe_unused]] SDValue Passthru = N->getOperand(4);
3608
3609 SDLoc DL(N);
3610 EVT ResVT = N->getValueType(0);
3611 assert(ResVT.isVector() && "Masked vector load must have vector type");
3612 // While we only expect poison passthru vectors as an input to the backend,
3613 // when the legalization framework splits a poison vector in half, it creates
3614 // two undef vectors, so we can technically expect those too.
3615 assert((Passthru.getOpcode() == ISD::POISON ||
3616 Passthru.getOpcode() == ISD::UNDEF) &&
3617 "Passthru operand expected to be poison or undef");
3618
3619 // Extract the mask and convert it to a uint32_t representing the used bytes
3620 // of the entire vector load
3621 uint32_t UsedBytesMask = 0;
3622 uint32_t ElementSizeInBits = ResVT.getVectorElementType().getSizeInBits();
3623 assert(ElementSizeInBits % 8 == 0 && "Unexpected element size");
3624 uint32_t ElementSizeInBytes = ElementSizeInBits / 8;
3625 uint32_t ElementMask = (1u << ElementSizeInBytes) - 1u;
3626
3627 for (SDValue Op : reverse(Mask->ops())) {
3628 // We technically only want to do this shift for every
3629 // iteration *but* the first, but in the first iteration UsedBytesMask is 0,
3630 // so this shift is a no-op.
3631 UsedBytesMask <<= ElementSizeInBytes;
3632
3633 // Mask elements must be constants.
3634 if (Op->getAsZExtVal() != 0)
3635 UsedBytesMask |= ElementMask;
3636 }
3637
3638 assert(UsedBytesMask != 0 && UsedBytesMask != UINT32_MAX &&
3639 "Unexpected masked load with elements masked all on or all off");
3640
3641 // Create a new load sd node to be handled normally by ReplaceLoadVector.
3642 MemSDNode *NewLD = cast<MemSDNode>(
3643 DAG.getLoad(ResVT, DL, Chain, BasePtr, N->getMemOperand()).getNode());
3644
3645 // If our subtarget does not support the used bytes mask pragma, "drop" the
3646 // mask by setting it to UINT32_MAX
3647 if (!STI.hasUsedBytesMaskPragma())
3648 UsedBytesMask = UINT32_MAX;
3649
3650 return {NewLD, UsedBytesMask};
3651}
3652
3653/// replaceLoadVector - Convert vector loads into multi-output scalar loads.
///
/// Builds an NVPTXISD::LoadV2 / LoadV4 / LoadV8 memory-intrinsic node whose
/// results are the individual pieces of the vector followed by a chain, then
/// reassembles those pieces into a value of the original result type.
///
/// Returns std::nullopt when the load is extending (result type differs from
/// the memory type), when getVectorLoweringShape() yields no shape, when the
/// access is less aligned than the preferred alignment of the memory type, or
/// when the piece count is not 2, 4 or 8. Otherwise returns the pair
/// {loaded value, output chain}.
///
/// NOTE(review): the parameter list is not visible in this listing; the body
/// below uses LD, DAG and STI.
3654static std::optional<std::pair<SDValue, SDValue>>
3657 const EVT ResVT = LD->getValueType(0);
3658 const EVT MemVT = LD->getMemoryVT();
3659
3660 // If we're doing sign/zero extension as part of the load, avoid lowering to
3661 // a LoadV node. TODO: consider relaxing this restriction.
3662 if (ResVT != MemVT)
3663 return std::nullopt;
3664
3665 const auto NumEltsAndEltVT =
3666 getVectorLoweringShape(ResVT, STI, LD->getAddressSpace());
3667 if (!NumEltsAndEltVT)
3668 return std::nullopt;
3669 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3670
3671 Align Alignment = LD->getAlign();
3672 const auto &TD = DAG.getDataLayout();
3673 Align PrefAlign = TD.getPrefTypeAlign(MemVT.getTypeForEVT(*DAG.getContext()));
3674 if (Alignment < PrefAlign) {
3675 // This load is not sufficiently aligned, so bail out and let this vector
3676 // load be scalarized. Note that we may still be able to emit smaller
3677 // vector loads. For example, if we are loading a <4 x float> with an
3678 // alignment of 8, this check will fail but the legalizer will try again
3679 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3680 return std::nullopt;
3681 }
3682
3683 // If we have a masked load, convert it to a normal load now
 // (LD is rebound to the replacement node and UsedBytesMask receives the
 // mask; the callee expression is not visible in this listing).
3684 std::optional<uint32_t> UsedBytesMask = std::nullopt;
3685 if (LD->getOpcode() == ISD::MLOAD)
3686 std::tie(LD, UsedBytesMask) =
3688
3689 // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
3690 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
3691 // loaded type to i16 and propagate the "real" type as the memory type.
3692 const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT;
3693
 // Pick the target opcode from the piece count; any other count is not
 // handled here.
3694 unsigned Opcode;
3695 switch (NumElts) {
3696 default:
3697 return std::nullopt;
3698 case 2:
3699 Opcode = NVPTXISD::LoadV2;
3700 break;
3701 case 4:
3702 Opcode = NVPTXISD::LoadV4;
3703 break;
3704 case 8:
3705 Opcode = NVPTXISD::LoadV8;
3706 break;
3707 }
 // Result list: NumElts values of LoadEltVT followed by the chain
 // (MVT::Other).
3708 auto ListVTs = SmallVector<EVT, 9>(NumElts, LoadEltVT);
3709 ListVTs.push_back(MVT::Other);
3710 SDVTList LdResVTs = DAG.getVTList(ListVTs);
3711
3712 SDLoc DL(LD);
3713
3714 // Copy regular operands
3715 SmallVector<SDValue, 8> OtherOps(LD->ops());
3716
 // Append the used-bytes mask as an i32 constant; UINT32_MAX is used when no
 // mask was produced above.
3717 OtherOps.push_back(
3718 DAG.getConstant(UsedBytesMask.value_or(UINT32_MAX), DL, MVT::i32));
3719
3720 // The select routine does not have access to the LoadSDNode instance, so
3721 // pass along the extension information
3722 OtherOps.push_back(
3723 DAG.getIntPtrConstant(cast<LoadSDNode>(LD)->getExtensionType(), DL));
3724
3725 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps, MemVT,
3726 LD->getMemOperand());
3727
 // Flatten the node's value results into a list of scalars.
3728 SmallVector<SDValue> ScalarRes;
3729 if (EltVT.isVector()) {
3731 assert(NumElts * EltVT.getVectorNumElements() ==
3732 ResVT.getVectorNumElements());
3733 // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
3734 // into individual elements.
3735 for (const unsigned I : llvm::seq(NumElts)) {
3736 SDValue SubVector = NewLD.getValue(I);
3737 DAG.ExtractVectorElements(SubVector, ScalarRes);
3738 }
3739 } else {
3740 for (const unsigned I : llvm::seq(NumElts)) {
3741 SDValue Res = NewLD.getValue(I);
 // Undo the widening to i16 applied through LoadEltVT above.
3742 if (LoadEltVT != EltVT)
3743 Res = DAG.getNode(ISD::TRUNCATE, DL, EltVT, Res);
3744 ScalarRes.push_back(Res);
3745 }
3746 }
3747
 // The chain is the result that follows the NumElts loaded values.
3748 SDValue LoadChain = NewLD.getValue(NumElts);
3749
 // Rebuild a flat vector from the scalars, then bitcast it to the original
 // result type.
3750 const MVT BuildVecVT =
3751 MVT::getVectorVT(EltVT.getScalarType(), ScalarRes.size());
3752 SDValue BuildVec = DAG.getBuildVector(BuildVecVT, DL, ScalarRes);
3753 SDValue LoadValue = DAG.getBitcast(ResVT, BuildVec);
3754
3755 return {{LoadValue, LoadChain}};
3756}
3757
3760 const NVPTXSubtarget &STI) {
 // Forward to the optional-returning replaceLoadVector(). On success append
 // the loaded value and its chain to Results; on failure leave Results
 // untouched.
3761 if (auto Res = replaceLoadVector(N, DAG, STI))
3762 Results.append({Res->first, Res->second});
3763}
3764
3766 const NVPTXSubtarget &STI) {
 // Forward to replaceLoadVector(). On success return the loaded value and its
 // chain combined by getMergeValues(); otherwise return an empty SDValue.
3767 if (auto Res = replaceLoadVector(N, DAG, STI))
3768 return DAG.getMergeValues({Res->first, Res->second}, SDLoc(N));
3769 return SDValue();
3770}
3771
3772// v = ld i1* addr
3773// =>
3774// v1 = ld i8* addr (-> i16)
3775// v = trunc i16 to i1
//
// The i1 load is re-emitted as a zero-extending load with memory type i8 and
// result type i16, and that result is truncated back to i1. Both asserts
// below require a non-extending load whose value type is i1.
// NOTE(review): the function signature is not visible in this listing; the
// body uses LD and DAG.
3777 SDLoc dl(LD);
3778 assert(LD->getExtensionType() == ISD::NON_EXTLOAD);
3779 assert(LD->getValueType(0) == MVT::i1 && "Custom lowering for i1 load only");
3780 SDValue newLD = DAG.getExtLoad(ISD::ZEXTLOAD, dl, MVT::i16, LD->getChain(),
3781 LD->getBasePtr(), LD->getPointerInfo(),
3782 MVT::i8, LD->getAlign(),
3783 LD->getMemOperand()->getFlags());
3784 SDValue result = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, newLD);
3785 // The legalizer (the caller) is expecting two values from the legalized
3786 // load, so we build a MergeValues node for it. See ExpandUnalignedLoad()
3787 // in LegalizeDAG.cpp which also uses MergeValues.
 // NOTE(review): the chain merged here is the original load's input chain
 // (LD->getChain()), not the output chain of newLD - confirm this is
 // intended.
3788 return DAG.getMergeValues({result, LD->getChain()}, dl);
3789}
3790
/// Custom lowering for loads.
///
/// Loads whose value type is i1 are handled by lowerLOADi1(). Any-extending
/// loads (ISD::EXTLOAD) are re-emitted as zero-extending loads with the same
/// chain, base pointer, memory type and memory operand. Any other load
/// reaching this function is treated as a programming error
/// (llvm_unreachable).
3791SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
3792 LoadSDNode *LD = cast<LoadSDNode>(Op);
3793
3794 if (Op.getValueType() == MVT::i1)
3795 return lowerLOADi1(LD, DAG);
3796
3797 // To improve CodeGen we'll legalize any-extend loads to zext loads. This is
3798 // how they'll be lowered in ISel anyway, and by doing this a little earlier
3799 // we allow for more DAG combine opportunities.
3800 if (LD->getExtensionType() == ISD::EXTLOAD) {
 // Only integer-to-integer any-extension is expected here.
3801 assert(LD->getValueType(0).isInteger() && LD->getMemoryVT().isInteger() &&
3802 "Unexpected fpext-load");
3803 return DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(Op), Op.getValueType(),
3804 LD->getChain(), LD->getBasePtr(), LD->getMemoryVT(),
3805 LD->getMemOperand());
3806 }
3807
3808 llvm_unreachable("Unexpected custom lowering for load");
3809}
3810
/// Custom lowering for masked loads.
///
/// When the result type is a packed vector type (NVPTX::isPackedVectorTy),
/// emits an NVPTXISD::MLoad memory-intrinsic node. Its operands are those of
/// the converted load, followed by the used-bytes mask (i32 constant) and the
/// extension type. For every other result type an empty SDValue is returned.
3811SDValue NVPTXTargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
3812 // v2f16/v2bf16/v2i16/v4i8 are legal, so we can't rely on legalizer to handle
3813 // masked loads of these types and have to handle them here.
3814 // v2f32 also needs to be handled here if the subtarget has f32x2
3815 // instructions, making it legal.
3816 //
3817 // Note: misaligned masked loads should never reach this point
3818 // because the override of isLegalMaskedLoad in NVPTXTargetTransformInfo.cpp
3819 // will validate alignment. Therefore, we do not need to special case handle
3820 // them here.
3821 EVT VT = Op.getValueType();
3822 if (NVPTX::isPackedVectorTy(VT)) {
 // NOTE(review): the start of the statement that defines Result is not
 // visible in this listing. Result is unpacked below into the replacement
 // load node and its used-bytes mask.
3824 cast<MemSDNode>(Op.getNode()), DAG, STI);
3825 MemSDNode *LD = std::get<0>(Result);
3826 uint32_t UsedBytesMask = std::get<1>(Result);
3827
3828 SDLoc DL(LD);
3829
3830 // Copy regular operands
3831 SmallVector<SDValue, 8> OtherOps(LD->ops());
3832
 // Extra operand: the used-bytes mask as an i32 constant.
3833 OtherOps.push_back(DAG.getConstant(UsedBytesMask, DL, MVT::i32));
3834
3835 // We currently are not lowering extending loads, but pass the extension
3836 // type anyway as later handling expects it.
3837 OtherOps.push_back(
3838 DAG.getIntPtrConstant(cast<LoadSDNode>(LD)->getExtensionType(), DL));
 // The new node reuses the value-type list, memory type and memory operand
 // of LD.
3839 SDValue NewLD =
3840 DAG.getMemIntrinsicNode(NVPTXISD::MLoad, DL, LD->getVTList(), OtherOps,
3841 LD->getMemoryVT(), LD->getMemOperand());
3842 return NewLD;
3843 }
3844 return SDValue();
3845}
3846
3848 const NVPTXSubtarget &STI) {
 // Lowers a vector store to an NVPTXISD::StoreV2 / StoreV4 / StoreV8
 // memory-intrinsic node whose operands are the chain, the split pieces of
 // the stored value, and then the remaining operands of the original node.
 //
 // Returns an empty SDValue when the store is truncating (value type differs
 // from the memory type), when getVectorLoweringShape() yields no shape, when
 // the access is less aligned than the preferred alignment of the value type,
 // or when the piece count is not 2, 4 or 8.
3849 MemSDNode *N = cast<MemSDNode>(Op.getNode());
3850 SDValue Val = N->getOperand(1);
3851 SDLoc DL(N);
3852 const EVT ValVT = Val.getValueType();
3853 const EVT MemVT = N->getMemoryVT();
3854
3855 // If we're truncating as part of the store, avoid lowering to a StoreV node.
3856 // TODO: consider relaxing this restriction.
3857 if (ValVT != MemVT)
3858 return SDValue();
3859
3860 const auto NumEltsAndEltVT =
3861 getVectorLoweringShape(ValVT, STI, N->getAddressSpace());
3862 if (!NumEltsAndEltVT)
3863 return SDValue();
3864 const auto [NumElts, EltVT] = NumEltsAndEltVT.value();
3865
3866 const DataLayout &TD = DAG.getDataLayout();
3867
3868 Align Alignment = N->getAlign();
3869 Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
3870 if (Alignment < PrefAlign) {
3871 // This store is not sufficiently aligned, so bail out and let this vector
3872 // store be scalarized. Note that we may still be able to emit smaller
3873 // vector stores. For example, if we are storing a <4 x float> with an
3874 // alignment of 8, this check will fail but the legalizer will try again
3875 // with 2 x <2 x float>, which will succeed with an alignment of 8.
3876 return SDValue();
3877 }
3878
 // Pick the target opcode from the piece count; any other count is not
 // handled here.
3879 unsigned Opcode;
3880 switch (NumElts) {
3881 default:
3882 return SDValue();
3883 case 2:
3884 Opcode = NVPTXISD::StoreV2;
3885 break;
3886 case 4:
3887 Opcode = NVPTXISD::StoreV4;
3888 break;
3889 case 8:
3890 Opcode = NVPTXISD::StoreV8;
3891 break;
3892 }
3893
 // NOTE(review): the declaration of Ops is not visible in this listing.
3895
3896 // First is the chain
3897 Ops.push_back(N->getOperand(0));
3898
3899 // Then the split values
3900 if (EltVT.isVector()) {
3902 assert(NumElts * EltVT.getVectorNumElements() ==
3903 ValVT.getVectorNumElements());
3904 // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
3905 // stored as b32s
3906 const unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
3907 for (const unsigned I : llvm::seq(NumElts)) {
3908 SmallVector<SDValue, 4> SubVectorElts;
3909 DAG.ExtractVectorElements(Val, SubVectorElts, I * NumEltsPerSubVector,
3910 NumEltsPerSubVector);
3911 Ops.push_back(DAG.getBuildVector(EltVT, DL, SubVectorElts));
3912 }
3913 } else {
 // Reinterpret the value as NumElts x EltVT and extract each element.
3914 SDValue V = DAG.getBitcast(MVT::getVectorVT(EltVT, NumElts), Val);
3915 for (const unsigned I : llvm::seq(NumElts)) {
3916 SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, V,
3917 DAG.getIntPtrConstant(I, DL));
3918
3919 // Since StoreV2 is a target node, we cannot rely on DAG type
3920 // legalization. Therefore, we must ensure the type is legal. For i1 and
3921 // i8, we set the stored type to i16 and propagate the "real" type as the
3922 // memory type.
3923 if (EltVT.getSizeInBits() < 16)
3924 ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
3925 Ops.push_back(ExtVal);
3926 }
3927 }
3928
3929 // Then any remaining arguments
 // (everything after operand 1, the stored value).
3930 Ops.append(N->op_begin() + 2, N->op_end());
3931
 // The new node produces only a chain (MVT::Other) and keeps the original
 // memory type and memory operand.
3932 SDValue NewSt =
3933 DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
3934 N->getMemoryVT(), N->getMemOperand());
3935
3936 // return DCI.CombineTo(N, NewSt, true);
3937 return NewSt;
3938}
3939
/// Custom lowering for stores.
///
/// Stores whose memory type is i1 are handled by LowerSTOREi1(); every other
/// store is forwarded to lowerSTOREVector().
3940SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
3941 StoreSDNode *Store = cast<StoreSDNode>(Op);
3942 EVT VT = Store->getMemoryVT();
3943
3944 if (VT == MVT::i1)
3945 return LowerSTOREi1(Op, DAG);
3946
3947 // Lower store of any other vector type, including v2f32 as we want to break
3948 // it apart since this is not a widely-supported type.
3949 return lowerSTOREVector(Op, DAG, STI);
3950}
3951
3952// st i1 v, addr
3953// =>
3954// v1 = zext v to i16
3955// st.u8 i16, addr
//
// The i1 value is zero-extended to i16 and written with a truncating store
// whose memory type is i8. The new store reuses the original store's chain,
// base pointer, pointer info, alignment and memory-operand flags.
3956SDValue NVPTXTargetLowering::LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const {
3957 SDNode *Node = Op.getNode();
3958 SDLoc dl(Node);
3959 StoreSDNode *ST = cast<StoreSDNode>(Node);
 // Tmp1 = chain, Tmp2 = base pointer, Tmp3 = value being stored.
3960 SDValue Tmp1 = ST->getChain();
3961 SDValue Tmp2 = ST->getBasePtr();
3962 SDValue Tmp3 = ST->getValue();
3963 assert(Tmp3.getValueType() == MVT::i1 && "Custom lowering for i1 store only");
3964 Tmp3 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Tmp3);
3965 SDValue Result =
3966 DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), MVT::i8,
3967 ST->getAlign(), ST->getMemOperand()->getFlags());
3968 return Result;
3969}
3970
/// Custom lowering for a CopyToReg node that carries a 128-bit value.
///
/// Operand 2 is bitcast to v2i64 and its two i64 elements are extracted. A new
/// CopyToReg node is built with the original chain and destination register,
/// the two halves (element 0 first), and the original glue operand when the
/// node has four operands. The result types are those of the original node.
3971SDValue NVPTXTargetLowering::LowerCopyToReg_128(SDValue Op,
3972 SelectionDAG &DAG) const {
3973 // Change the CopyToReg to take in two 64-bit operands instead of a 128-bit
3974 // operand so that it can pass the legalization.
3975
3976 assert(Op.getOperand(1).getValueType() == MVT::i128 &&
3977 "Custom lowering for 128-bit CopyToReg only");
3978
3979 SDNode *Node = Op.getNode();
3980 SDLoc DL(Node);
3981
 // Split the value operand into element 0 (Lo) and element 1 (Hi) of a
 // v2i64 view.
3982 SDValue Cast = DAG.getBitcast(MVT::v2i64, Op->getOperand(2));
3983 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3984 DAG.getIntPtrConstant(0, DL));
3985 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, Cast,
3986 DAG.getIntPtrConstant(1, DL));
3987
 // NOTE(review): the declaration of NewOps is not visible in this listing;
 // confirm it is sized for the optional fifth (glue) entry.
3989 SmallVector<EVT, 3> ResultsType(Node->values());
3990
3991 NewOps[0] = Op->getOperand(0); // Chain
3992 NewOps[1] = Op->getOperand(1); // Dst Reg
3993 NewOps[2] = Lo; // Lower 64-bit
3994 NewOps[3] = Hi; // Higher 64-bit
3995 if (Op.getNumOperands() == 4)
3996 NewOps[4] = Op->getOperand(3); // Glue if exists
3997
3998 return DAG.getNode(ISD::CopyToReg, DL, ResultsType, NewOps);
3999}
4000
/// Returns the number of registers used for a value of type VT.
///
/// Reports a single register when VT is i128 and RegisterVT is also i128;
/// every other query is delegated to TargetLoweringBase::getNumRegisters().
4001unsigned NVPTXTargetLowering::getNumRegisters(
4002 LLVMContext &Context, EVT VT,
4003 std::optional<MVT> RegisterVT = std::nullopt) const {
4004 if (VT == MVT::i128 && RegisterVT == MVT::i128)
4005 return 1;
4006 return TargetLoweringBase::getNumRegisters(Context, VT, RegisterVT);
4007}
4008
/// Target hook for splitting a value into register parts.
///
/// When Val is an i128 and exactly one part is requested, Val is stored
/// unsplit in Parts[0] and true is returned. In every other case nothing is
/// written and false is returned. DAG, DL, PartVT and CC are not used by this
/// implementation.
4009bool NVPTXTargetLowering::splitValueIntoRegisterParts(
4010 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4011 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4012 if (Val.getValueType() == MVT::i128 && NumParts == 1) {
4013 Parts[0] = Val;
4014 return true;
4015 }
4016 return false;
4017}
4018
4019// This creates an external symbol for a function parameter.
4020// The name of the symbol is composed from its index and the function name.
4021// A negative index corresponds to the special parameter (unsized array) used
4022// for passing variable arguments.
//
// The name string is saved in the string pool of the target machine
// (nvTM->getStrPool()) and the symbol node is created with value type T.
// NOTE(review): the expression that builds the name is not visible in this
// listing.
4023SDValue NVPTXTargetLowering::getParamSymbol(SelectionDAG &DAG, int I,
4024 EVT T) const {
4025 StringRef SavedStr = nvTM->getStrPool().save(
4027 return DAG.getExternalSymbol(SavedStr.data(), T);
4028}
4029
/// Creates an external symbol named "param<I>" (the literal "param" followed
/// by the decimal index I) with value type T. The name string is saved in the
/// string pool of the target machine (nvTM->getStrPool()).
4030SDValue NVPTXTargetLowering::getCallParamSymbol(SelectionDAG &DAG, int I,
4031 EVT T) const {
4032 const StringRef SavedStr = nvTM->getStrPool().save("param" + Twine(I));
4033 return DAG.getExternalSymbol(SavedStr.data(), T);
4034}
4035
4037 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4038 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4039 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
 // Produces one SDValue in InVals for every entry of Ins.
 //
 // The function's arguments with non-empty types are visited in order. For
 // each one, the leading run of Ins entries whose OrigArgIndex equals the
 // argument number is consumed.
 //  * Dead arguments (no uses) get an UNDEF value per Ins entry.
 //  * byval arguments yield a single pointer value: the parameter symbol
 //    itself for kernels, otherwise a MoveParam of the symbol followed by an
 //    address-space cast.
 //  * All other arguments are loaded from the parameter symbol in chunks
 //    chosen by VectorizePTXValueVTs(); each element is extracted, adjusted
 //    with correctParamType(), and appended to InVals.
 //
 // Returns the incoming Chain.
4040 const DataLayout &DL = DAG.getDataLayout();
4041 LLVMContext &Ctx = *DAG.getContext();
4042 auto PtrVT = getPointerTy(DAG.getDataLayout());
4043
4044 const Function &F = DAG.getMachineFunction().getFunction();
4045 const bool IsKernel = isKernelFunction(F);
4046
4047 SDValue Root = DAG.getRoot();
4048 SmallVector<SDValue, 16> OutChains;
4049
4050 // argTypes.size() (or theArgs.size()) and Ins.size() need not match.
4051 // Ins.size() will be larger
4052 // * if there is an aggregate argument with multiple fields (each field
4053 // showing up separately in Ins)
4054 // * if there is a vector argument with more than typical vector-length
4055 // elements (generally if more than 4) where each vector element is
4056 // individually present in Ins.
4057 // So a different index should be used for indexing into Ins.
4058 // See similar issue in LowerCall.
4059
 // AllIns is the not-yet-consumed tail of Ins; it shrinks as each argument
 // claims its entries.
4060 auto AllIns = ArrayRef(Ins);
4061 const auto NonEmptyArgs = make_filter_range(
4062 F.args(), [](const Argument &A) { return !A.getType()->isEmptyTy(); });
 // ParamI counts only the non-empty arguments; ArgNo is the argument's index
 // in the IR function.
4063 for (const auto &[ParamI, Arg] : enumerate(NonEmptyArgs)) {
4064 const unsigned ArgNo = Arg.getArgNo();
4065 const auto ArgIns =
4066 AllIns.take_while([&](auto I) { return I.OrigArgIndex == ArgNo; });
4067 AllIns = AllIns.drop_front(ArgIns.size());
4068
4069 Type *Ty = Arg.getType();
4070 assert(!ArgIns.empty() &&
4071 "Non-empty argument produced no parameter values");
4072
4073 if (Arg.use_empty()) {
4074 // argument is dead
4075 for (const auto &In : ArgIns) {
4076 assert(!In.Used && "Arg.use_empty() is true but Arg is used?");
4077 InVals.push_back(DAG.getUNDEF(In.VT));
4078 }
4079 continue;
4080 }
4081
4082 SDValue ArgSymbol = getParamSymbol(DAG, ParamI, PtrVT);
4083
4084 // In the following cases, assign a node order of "i+1"
4085 // to newly created nodes. The SDNodes for params have to
4086 // appear in the same order as their order of appearance
4087 // in the original function. "i+1" holds that order.
 // (In the code below that order is Arg.getArgNo() + 1.)
4088 if (Arg.hasByValAttr()) {
4089 // Param has ByVal attribute
4090 // Return MoveParam(param symbol).
4091 // Ideally, the param symbol can be returned directly,
4092 // but when SDNode builder decides to use it in a CopyToReg(),
4093 // machine instruction fails because TargetExternalSymbol
4094 // (not lowered) is target dependent, and CopyToReg assumes
4095 // the source is lowered.
4096 assert(ArgIns.size() == 1 && "ByVal argument must be a pointer");
4097 const auto &ByvalIn = ArgIns[0];
4098 assert(getValueType(DL, Ty) == ByvalIn.VT &&
4099 "Ins type did not match function type");
4100 assert(ByvalIn.VT == PtrVT && "ByVal argument must be a pointer");
4101
4102 SDValue P;
4103 if (IsKernel) {
4104 assert(Arg.getType()->getPointerAddressSpace() ==
4106 "Kernel ByVal argument must be lowered to the param address "
4107 "space by NVPTXLowerArgs");
 // Kernels use the parameter symbol directly.
4108 P = ArgSymbol;
4109 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4110 } else {
4111 P = DAG.getNode(NVPTXISD::MoveParam, dl, ByvalIn.VT, ArgSymbol);
4112 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4113 P = DAG.getAddrSpaceCast(dl, ByvalIn.VT, P, ADDRESS_SPACE_LOCAL,
4115 }
4116 InVals.push_back(P);
4117 } else {
 // NOTE(review): the declarations of VTs and Offsets are not visible in
 // this listing.
4120 ComputePTXValueVTs(*this, DL, Ctx, CallConv, Ty, VTs, Offsets);
4121 assert(VTs.size() == ArgIns.size() && "Size mismatch");
4122 assert(VTs.size() == Offsets.size() && "Size mismatch");
4123
4124 const Align ArgAlign = getPTXParamAlign(
4125 &F, Ty, Arg.getArgNo() + AttributeList::FirstArgIndex, DL);
4126
 // I indexes the first piece of the current chunk; each chunk covers
 // NumElts consecutive pieces.
4127 unsigned I = 0;
4128 const auto VI = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
4129 for (const unsigned NumElts : VI) {
4130 // i1 is loaded/stored as i8
4131 const EVT LoadVT = VTs[I] == MVT::i1 ? MVT::i8 : VTs[I];
4132 const EVT VecVT = getVectorizedVT(LoadVT, NumElts, Ctx);
4133
4134 SDValue VecAddr = DAG.getObjectPtrOffset(
4135 dl, ArgSymbol, TypeSize::getFixed(Offsets[I]));
4136
 // Alignment of this chunk: what ArgAlign guarantees at offset
 // Offsets[I].
4137 const Align PartAlign = commonAlignment(ArgAlign, Offsets[I]);
4138 const unsigned AS = IsKernel ? NVPTX::AddressSpace::EntryParam
4140 SDValue P = DAG.getLoad(VecVT, dl, Root, VecAddr,
4141 MachinePointerInfo(AS), PartAlign,
4144 P.getNode()->setIROrder(Arg.getArgNo() + 1);
4145 for (const unsigned J : llvm::seq(NumElts)) {
4146 SDValue Elt = getExtractVectorizedValue(P, J, LoadVT, dl, DAG);
4147
4148 Elt = correctParamType(Elt, ArgIns[I + J].VT, ArgIns[I + J].Flags,
4149 DAG, dl);
4150 InVals.push_back(Elt);
4151 }
4152 I += NumElts;
4153 }
4154 }
4155 }
4156
 // If any chains were collected, make their token factor the new DAG root.
4157 if (!OutChains.empty())
4158 DAG.setRoot(DAG.getTokenFactor(dl, OutChains));
4159
4160 return Chain;
4161}
4162
// Lowers a function return.
//
// For a void return type, emits NVPTXISD::RET_GLUE on the incoming chain.
// Otherwise the return value pieces are grouped into chunks by
// VectorizePTXValueVTs() and each chunk is stored at its offset from the
// external symbol "func_retval0". The stores are chained one after another
// and the final chain feeds NVPTXISD::RET_GLUE.
//
// Integer return types whose alloc size is below 32 bits are stored as i32;
// otherwise i1 pieces are stored as i8.
// NOTE(review): parts of the signature are not visible in this listing; the
// body uses Chain, CallConv, Outs, OutVals, dl and DAG.
4163SDValue
4165 bool isVarArg,
4167 const SmallVectorImpl<SDValue> &OutVals,
4168 const SDLoc &dl, SelectionDAG &DAG) const {
4169 const Function &F = DAG.getMachineFunction().getFunction();
4170 Type *RetTy = F.getReturnType();
4171
4172 if (RetTy->isVoidTy()) {
 // NOTE(review): the condition checks that there are no return values,
 // while the message reads "Return value expected for void" - the wording
 // looks inverted; confirm.
4173 assert(OutVals.empty() && Outs.empty() && "Return value expected for void");
4174 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
4175 }
4176
4177 const DataLayout &DL = DAG.getDataLayout();
4178 LLVMContext &Ctx = *DAG.getContext();
4179
4180 const SDValue RetSymbol = DAG.getExternalSymbol("func_retval0", MVT::i32);
4181 const auto RetAlign =
4182 getPTXParamAlign(&F, RetTy, AttributeList::ReturnIndex, DL);
4183
4184 // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
4185 // 32-bits are sign extended or zero extended, depending on whether
4186 // they are signed or unsigned types.
4187 const bool ExtendIntegerRetVal =
4188 RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
4189
 // NOTE(review): the declarations of VTs and Offsets are not visible in this
 // listing.
4192 ComputePTXValueVTs(*this, DL, Ctx, CallConv, RetTy, VTs, Offsets);
4193 assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
4194
 // Returns piece I of the return value converted to the type it is stored
 // as.
4195 const auto GetRetVal = [&](unsigned I) -> SDValue {
4196 SDValue RetVal = OutVals[I];
4198 RetVal.getValueType() &&
4199 "OutVal type should always be legal");
4200
4201 const EVT VTI = promoteScalarIntegerPTX(VTs[I]);
4202 const EVT StoreVT =
4203 ExtendIntegerRetVal ? MVT::i32 : (VTI == MVT::i1 ? MVT::i8 : VTI);
4204 return correctParamType(RetVal, StoreVT, Outs[I].Flags, DAG, dl);
4205 };
4206
 // I indexes the first piece of the current chunk; each chunk covers NumElts
 // consecutive pieces.
4207 unsigned I = 0;
4208 const auto VI = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
4209 for (const unsigned NumElts : VI) {
 // No explicit alignment is passed for the extended-integer case.
4210 const MaybeAlign CurrentAlign = ExtendIntegerRetVal
4211 ? MaybeAlign(std::nullopt)
4212 : commonAlignment(RetAlign, Offsets[I]);
4213
4215 NumElts, dl, DAG, [&](unsigned K) { return GetRetVal(I + K); });
4216
4217 SDValue Ptr =
4218 DAG.getObjectPtrOffset(dl, RetSymbol, TypeSize::getFixed(Offsets[I]));
4219
 // Thread the chain through each store so the stores stay ordered.
4220 Chain = DAG.getStore(Chain, dl, Val, Ptr,
4222 CurrentAlign);
4223
4224 I += NumElts;
4225 }
4226
4227 return DAG.getNode(NVPTXISD::RET_GLUE, dl, MVT::Other, Chain);
4228}
4229
4231 SDValue Op, StringRef Constraint, std::vector<SDValue> &Ops,
4232 SelectionDAG &DAG) const {
 // Constraints longer than one character are not handled: return early
 // without touching Ops.
4233 if (Constraint.size() > 1)
4234 return;
 // NOTE(review): the statement that handles single-character constraints is
 // not visible in this listing.
4236}
4237
4238// llvm.ptx.memcpy.const and llvm.ptx.memmove.const need to be modeled as
4239// TgtMemIntrinsic
4240// because we need the information that is only available in the "Value" type
4241// of destination
4242// pointer. In particular, the address space information.
4245 MachineFunction &MF, unsigned Intrinsic) const {
4246 IntrinsicInfo Info;
4247 switch (Intrinsic) {
4248 default:
4249 return;
4250 case Intrinsic::nvvm_match_all_sync_i32p:
4251 case Intrinsic::nvvm_match_all_sync_i64p:
4252 Info.opc = ISD::INTRINSIC_W_CHAIN;
4253 // memVT is bogus. These intrinsics have IntrInaccessibleMemOnly attribute
4254 // in order to model data exchange with other threads, but perform no real
4255 // memory accesses.
4256 Info.memVT = MVT::i1;
4257
4258 // Our result depends on both our and other thread's arguments.
4260 Infos.push_back(Info);
4261 return;
4262 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col:
4263 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row:
4264 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_col_stride:
4265 case Intrinsic::nvvm_wmma_m16n16k16_load_a_f16_row_stride:
4266 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col:
4267 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row:
4268 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_col_stride:
4269 case Intrinsic::nvvm_wmma_m16n16k16_load_b_f16_row_stride:
4270 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col:
4271 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row:
4272 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_col_stride:
4273 case Intrinsic::nvvm_wmma_m32n8k16_load_a_f16_row_stride:
4274 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col:
4275 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row:
4276 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_col_stride:
4277 case Intrinsic::nvvm_wmma_m32n8k16_load_b_f16_row_stride:
4278 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col:
4279 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row:
4280 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_col_stride:
4281 case Intrinsic::nvvm_wmma_m8n32k16_load_a_f16_row_stride:
4282 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col:
4283 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row:
4284 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_col_stride:
4285 case Intrinsic::nvvm_wmma_m8n32k16_load_b_f16_row_stride: {
4286 Info.opc = ISD::INTRINSIC_W_CHAIN;
4287 Info.memVT = MVT::v8f16;
4288 Info.ptrVal = I.getArgOperand(0);
4289 Info.offset = 0;
4290 Info.flags = MachineMemOperand::MOLoad;
4291 Info.align = Align(16);
4292 Infos.push_back(Info);
4293 return;
4294 }
4295 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col:
4296 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_col_stride:
4297 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col_stride:
4298 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_col:
4299 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row:
4300 case Intrinsic::nvvm_wmma_m16n16k16_load_a_s8_row_stride:
4301 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row_stride:
4302 case Intrinsic::nvvm_wmma_m16n16k16_load_a_u8_row:
4303 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col:
4304 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_col_stride:
4305 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row:
4306 case Intrinsic::nvvm_wmma_m8n32k16_load_a_bf16_row_stride:
4307 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col:
4308 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_col_stride:
4309 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col_stride:
4310 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_col:
4311 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row:
4312 case Intrinsic::nvvm_wmma_m16n16k16_load_b_s8_row_stride:
4313 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row_stride:
4314 case Intrinsic::nvvm_wmma_m16n16k16_load_b_u8_row:
4315 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col:
4316 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_col_stride:
4317 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row:
4318 case Intrinsic::nvvm_wmma_m32n8k16_load_b_bf16_row_stride: {
4319 Info.opc = ISD::INTRINSIC_W_CHAIN;
4320 Info.memVT = MVT::v2i32;
4321 Info.ptrVal = I.getArgOperand(0);
4322 Info.offset = 0;
4323 Info.flags = MachineMemOperand::MOLoad;
4324 Info.align = Align(8);
4325 Infos.push_back(Info);
4326 return;
4327 }
4328
4329 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col:
4330 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_col_stride:
4331 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col_stride:
4332 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_col:
4333 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row:
4334 case Intrinsic::nvvm_wmma_m32n8k16_load_a_s8_row_stride:
4335 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row_stride:
4336 case Intrinsic::nvvm_wmma_m32n8k16_load_a_u8_row:
4337 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col:
4338 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_col_stride:
4339 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row:
4340 case Intrinsic::nvvm_wmma_m16n16k16_load_a_bf16_row_stride:
4341 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col:
4342 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_col_stride:
4343 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row:
4344 case Intrinsic::nvvm_wmma_m16n16k8_load_a_tf32_row_stride:
4345
4346 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col:
4347 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_col_stride:
4348 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col_stride:
4349 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_col:
4350 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row:
4351 case Intrinsic::nvvm_wmma_m8n32k16_load_b_s8_row_stride:
4352 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row_stride:
4353 case Intrinsic::nvvm_wmma_m8n32k16_load_b_u8_row:
4354 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col:
4355 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_col_stride:
4356 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row:
4357 case Intrinsic::nvvm_wmma_m16n16k16_load_b_bf16_row_stride:
4358 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col:
4359 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_col_stride:
4360 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row:
4361 case Intrinsic::nvvm_wmma_m16n16k8_load_b_tf32_row_stride:
4362 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_b16:
4363 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x4_trans_b16:
4364 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8:
4365 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b4x16_p64:
4366 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x2_trans_b8x16_b6x16_p32:
4367 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b4x16_p64:
4368 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x4_b8x16_b6x16_p32: {
4369 Info.opc = ISD::INTRINSIC_W_CHAIN;
4370 Info.memVT = MVT::v4i32;
4371 Info.ptrVal = I.getArgOperand(0);
4372 Info.offset = 0;
4373 Info.flags = MachineMemOperand::MOLoad;
4374 Info.align = Align(16);
4375 Infos.push_back(Info);
4376 return;
4377 }
4378
4379 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col:
4380 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_col_stride:
4381 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col_stride:
4382 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_col:
4383 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row:
4384 case Intrinsic::nvvm_wmma_m32n8k16_load_b_s8_row_stride:
4385 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row_stride:
4386 case Intrinsic::nvvm_wmma_m32n8k16_load_b_u8_row:
4387
4388 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col:
4389 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_col_stride:
4390 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col_stride:
4391 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_col:
4392 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row:
4393 case Intrinsic::nvvm_wmma_m8n32k16_load_a_s8_row_stride:
4394 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row_stride:
4395 case Intrinsic::nvvm_wmma_m8n32k16_load_a_u8_row:
4396 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row:
4397 case Intrinsic::nvvm_wmma_m8n8k128_load_a_b1_row_stride:
4398 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col:
4399 case Intrinsic::nvvm_wmma_m8n8k128_load_b_b1_col_stride:
4400 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row:
4401 case Intrinsic::nvvm_wmma_m8n8k32_load_a_s4_row_stride:
4402 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row_stride:
4403 case Intrinsic::nvvm_wmma_m8n8k32_load_a_u4_row:
4404 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col:
4405 case Intrinsic::nvvm_wmma_m8n8k32_load_b_s4_col_stride:
4406 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col_stride:
4407 case Intrinsic::nvvm_wmma_m8n8k32_load_b_u4_col:
4408 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_b16:
4409 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x1_trans_b16:
4410 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b4x16_p64:
4411 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x1_b8x16_b6x16_p32: {
4412 Info.opc = ISD::INTRINSIC_W_CHAIN;
4413 Info.memVT = MVT::i32;
4414 Info.ptrVal = I.getArgOperand(0);
4415 Info.offset = 0;
4416 Info.flags = MachineMemOperand::MOLoad;
4417 Info.align = Align(4);
4418 Infos.push_back(Info);
4419 return;
4420 }
4421
4422 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col:
4423 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row:
4424 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_col_stride:
4425 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f16_row_stride:
4426 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col:
4427 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row:
4428 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_col_stride:
4429 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f16_row_stride:
4430 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col:
4431 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row:
4432 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_col_stride:
4433 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f16_row_stride: {
4434 Info.opc = ISD::INTRINSIC_W_CHAIN;
4435 Info.memVT = MVT::v4f16;
4436 Info.ptrVal = I.getArgOperand(0);
4437 Info.offset = 0;
4438 Info.flags = MachineMemOperand::MOLoad;
4439 Info.align = Align(16);
4440 Infos.push_back(Info);
4441 return;
4442 }
4443
4444 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col:
4445 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row:
4446 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_col_stride:
4447 case Intrinsic::nvvm_wmma_m16n16k16_load_c_f32_row_stride:
4448 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col:
4449 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row:
4450 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_col_stride:
4451 case Intrinsic::nvvm_wmma_m32n8k16_load_c_f32_row_stride:
4452 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col:
4453 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row:
4454 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_col_stride:
4455 case Intrinsic::nvvm_wmma_m8n32k16_load_c_f32_row_stride:
4456 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col:
4457 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row:
4458 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_col_stride:
4459 case Intrinsic::nvvm_wmma_m16n16k8_load_c_f32_row_stride: {
4460 Info.opc = ISD::INTRINSIC_W_CHAIN;
4461 Info.memVT = MVT::v8f32;
4462 Info.ptrVal = I.getArgOperand(0);
4463 Info.offset = 0;
4464 Info.flags = MachineMemOperand::MOLoad;
4465 Info.align = Align(16);
4466 Infos.push_back(Info);
4467 return;
4468 }
4469
4470 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col:
4471 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_col_stride:
4472 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row:
4473 case Intrinsic::nvvm_wmma_m32n8k16_load_a_bf16_row_stride:
4474
4475 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col:
4476 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_col_stride:
4477 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row:
4478 case Intrinsic::nvvm_wmma_m8n32k16_load_b_bf16_row_stride:
4479
4480 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col:
4481 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_col_stride:
4482 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row:
4483 case Intrinsic::nvvm_wmma_m16n16k16_load_c_s32_row_stride:
4484 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col:
4485 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_col_stride:
4486 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row:
4487 case Intrinsic::nvvm_wmma_m32n8k16_load_c_s32_row_stride:
4488 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col:
4489 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_col_stride:
4490 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row:
4491 case Intrinsic::nvvm_wmma_m8n32k16_load_c_s32_row_stride: {
4492 Info.opc = ISD::INTRINSIC_W_CHAIN;
4493 Info.memVT = MVT::v8i32;
4494 Info.ptrVal = I.getArgOperand(0);
4495 Info.offset = 0;
4496 Info.flags = MachineMemOperand::MOLoad;
4497 Info.align = Align(16);
4498 Infos.push_back(Info);
4499 return;
4500 }
4501
4502 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col:
4503 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_col_stride:
4504 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row:
4505 case Intrinsic::nvvm_wmma_m8n8k128_load_c_s32_row_stride:
4506 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col:
4507 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_col_stride:
4508 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row:
4509 case Intrinsic::nvvm_wmma_m8n8k32_load_c_s32_row_stride:
4510 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_b16:
4511 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n8_x2_trans_b16:
4512 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8:
4513 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b4x16_p64:
4514 case Intrinsic::nvvm_ldmatrix_sync_aligned_m16n16_x1_trans_b8x16_b6x16_p32:
4515 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b4x16_p64:
4516 case Intrinsic::nvvm_ldmatrix_sync_aligned_m8n16_x2_b8x16_b6x16_p32: {
4517 Info.opc = ISD::INTRINSIC_W_CHAIN;
4518 Info.memVT = MVT::v2i32;
4519 Info.ptrVal = I.getArgOperand(0);
4520 Info.offset = 0;
4521 Info.flags = MachineMemOperand::MOLoad;
4522 Info.align = Align(8);
4523 Infos.push_back(Info);
4524 return;
4525 }
4526
4527 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col:
4528 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_col_stride:
4529 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row:
4530 case Intrinsic::nvvm_wmma_m8n8k4_load_a_f64_row_stride:
4531
4532 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col:
4533 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_col_stride:
4534 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row:
4535 case Intrinsic::nvvm_wmma_m8n8k4_load_b_f64_row_stride: {
4536 Info.opc = ISD::INTRINSIC_W_CHAIN;
4537 Info.memVT = MVT::f64;
4538 Info.ptrVal = I.getArgOperand(0);
4539 Info.offset = 0;
4540 Info.flags = MachineMemOperand::MOLoad;
4541 Info.align = Align(8);
4542 Infos.push_back(Info);
4543 return;
4544 }
4545
4546 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col:
4547 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_col_stride:
4548 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row:
4549 case Intrinsic::nvvm_wmma_m8n8k4_load_c_f64_row_stride: {
4550 Info.opc = ISD::INTRINSIC_W_CHAIN;
4551 Info.memVT = MVT::v2f64;
4552 Info.ptrVal = I.getArgOperand(0);
4553 Info.offset = 0;
4554 Info.flags = MachineMemOperand::MOLoad;
4555 Info.align = Align(16);
4556 Infos.push_back(Info);
4557 return;
4558 }
4559
4560 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col:
4561 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row:
4562 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_col_stride:
4563 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f16_row_stride:
4564 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col:
4565 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row:
4566 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_col_stride:
4567 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f16_row_stride:
4568 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col:
4569 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row:
4570 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_col_stride:
4571 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f16_row_stride: {
4572 Info.opc = ISD::INTRINSIC_VOID;
4573 Info.memVT = MVT::v4f16;
4574 Info.ptrVal = I.getArgOperand(0);
4575 Info.offset = 0;
4576 Info.flags = MachineMemOperand::MOStore;
4577 Info.align = Align(16);
4578 Infos.push_back(Info);
4579 return;
4580 }
4581
4582 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col:
4583 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row:
4584 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_col_stride:
4585 case Intrinsic::nvvm_wmma_m16n16k16_store_d_f32_row_stride:
4586 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col:
4587 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row:
4588 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_col_stride:
4589 case Intrinsic::nvvm_wmma_m32n8k16_store_d_f32_row_stride:
4590 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col:
4591 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row:
4592 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_col_stride:
4593 case Intrinsic::nvvm_wmma_m8n32k16_store_d_f32_row_stride:
4594 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col:
4595 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row:
4596 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_col_stride:
4597 case Intrinsic::nvvm_wmma_m16n16k8_store_d_f32_row_stride: {
4598 Info.opc = ISD::INTRINSIC_VOID;
4599 Info.memVT = MVT::v8f32;
4600 Info.ptrVal = I.getArgOperand(0);
4601 Info.offset = 0;
4602 Info.flags = MachineMemOperand::MOStore;
4603 Info.align = Align(16);
4604 Infos.push_back(Info);
4605 return;
4606 }
4607
4608 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col:
4609 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_col_stride:
4610 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row:
4611 case Intrinsic::nvvm_wmma_m16n16k16_store_d_s32_row_stride:
4612 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col:
4613 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_col_stride:
4614 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row:
4615 case Intrinsic::nvvm_wmma_m32n8k16_store_d_s32_row_stride:
4616 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col:
4617 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_col_stride:
4618 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row:
4619 case Intrinsic::nvvm_wmma_m8n32k16_store_d_s32_row_stride: {
4620 Info.opc = ISD::INTRINSIC_VOID;
4621 Info.memVT = MVT::v8i32;
4622 Info.ptrVal = I.getArgOperand(0);
4623 Info.offset = 0;
4624 Info.flags = MachineMemOperand::MOStore;
4625 Info.align = Align(16);
4626 Infos.push_back(Info);
4627 return;
4628 }
4629
4630 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col:
4631 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_col_stride:
4632 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row:
4633 case Intrinsic::nvvm_wmma_m8n8k128_store_d_s32_row_stride:
4634 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col:
4635 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_col_stride:
4636 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row:
4637 case Intrinsic::nvvm_wmma_m8n8k32_store_d_s32_row_stride:
4638 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_b16:
4639 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x2_trans_b16:
4640 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x2_trans_b8: {
4641 Info.opc = ISD::INTRINSIC_VOID;
4642 Info.memVT = MVT::v2i32;
4643 Info.ptrVal = I.getArgOperand(0);
4644 Info.offset = 0;
4645 Info.flags = MachineMemOperand::MOStore;
4646 Info.align = Align(8);
4647 Infos.push_back(Info);
4648 return;
4649 }
4650
4651 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col:
4652 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_col_stride:
4653 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row:
4654 case Intrinsic::nvvm_wmma_m8n8k4_store_d_f64_row_stride: {
4655 Info.opc = ISD::INTRINSIC_VOID;
4656 Info.memVT = MVT::v2f64;
4657 Info.ptrVal = I.getArgOperand(0);
4658 Info.offset = 0;
4659 Info.flags = MachineMemOperand::MOStore;
4660 Info.align = Align(16);
4661 Infos.push_back(Info);
4662 return;
4663 }
4664
4665 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_b16:
4666 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x1_trans_b16:
4667 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x1_trans_b8: {
4668 Info.opc = ISD::INTRINSIC_VOID;
4669 Info.memVT = MVT::i32;
4670 Info.ptrVal = I.getArgOperand(0);
4671 Info.offset = 0;
4672 Info.flags = MachineMemOperand::MOStore;
4673 Info.align = Align(4);
4674 Infos.push_back(Info);
4675 return;
4676 }
4677
4678 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_b16:
4679 case Intrinsic::nvvm_stmatrix_sync_aligned_m8n8_x4_trans_b16:
4680 case Intrinsic::nvvm_stmatrix_sync_aligned_m16n8_x4_trans_b8: {
4681 Info.opc = ISD::INTRINSIC_VOID;
4682 Info.memVT = MVT::v4i32;
4683 Info.ptrVal = I.getArgOperand(0);
4684 Info.offset = 0;
4685 Info.flags = MachineMemOperand::MOStore;
4686 Info.align = Align(16);
4687 Infos.push_back(Info);
4688 return;
4689 }
4690
4691 case Intrinsic::nvvm_prefetch_tensormap: {
4692 auto &DL = I.getDataLayout();
4693 Info.opc = ISD::INTRINSIC_VOID;
4694 Info.memVT = getPointerTy(DL);
4695 Info.ptrVal = I.getArgOperand(0);
4696 Info.offset = 0;
4697 Info.flags =
4699 Info.align.reset();
4700 Infos.push_back(Info);
4701 return;
4702 }
4703
4704 case Intrinsic::nvvm_tensormap_replace_global_address:
4705 case Intrinsic::nvvm_tensormap_replace_global_stride: {
4706 Info.opc = ISD::INTRINSIC_VOID;
4707 Info.memVT = MVT::i64;
4708 Info.ptrVal = I.getArgOperand(0);
4709 Info.offset = 0;
4710 Info.flags = MachineMemOperand::MOStore;
4711 Info.align.reset();
4712 Infos.push_back(Info);
4713 return;
4714 }
4715
4716 case Intrinsic::nvvm_tensormap_replace_rank:
4717 case Intrinsic::nvvm_tensormap_replace_box_dim:
4718 case Intrinsic::nvvm_tensormap_replace_global_dim:
4719 case Intrinsic::nvvm_tensormap_replace_element_stride:
4720 case Intrinsic::nvvm_tensormap_replace_elemtype:
4721 case Intrinsic::nvvm_tensormap_replace_interleave_layout:
4722 case Intrinsic::nvvm_tensormap_replace_swizzle_mode:
4723 case Intrinsic::nvvm_tensormap_replace_swizzle_atomicity:
4724 case Intrinsic::nvvm_tensormap_replace_fill_mode: {
4725 Info.opc = ISD::INTRINSIC_VOID;
4726 Info.memVT = MVT::i32;
4727 Info.ptrVal = I.getArgOperand(0);
4728 Info.offset = 0;
4729 Info.flags = MachineMemOperand::MOStore;
4730 Info.align.reset();
4731 Infos.push_back(Info);
4732 return;
4733 }
4734
4735 case Intrinsic::nvvm_ldu_global_i:
4736 case Intrinsic::nvvm_ldu_global_f:
4737 case Intrinsic::nvvm_ldu_global_p: {
4738 Info.opc = ISD::INTRINSIC_W_CHAIN;
4739 Info.memVT = getValueType(I.getDataLayout(), I.getType());
4740 Info.ptrVal = I.getArgOperand(0);
4741 Info.offset = 0;
4742 Info.flags = MachineMemOperand::MOLoad;
4743 Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
4744
4745 Infos.push_back(Info);
4746 return;
4747 }
4748 case Intrinsic::nvvm_tex_1d_v4f32_s32:
4749 case Intrinsic::nvvm_tex_1d_v4f32_f32:
4750 case Intrinsic::nvvm_tex_1d_level_v4f32_f32:
4751 case Intrinsic::nvvm_tex_1d_grad_v4f32_f32:
4752 case Intrinsic::nvvm_tex_1d_array_v4f32_s32:
4753 case Intrinsic::nvvm_tex_1d_array_v4f32_f32:
4754 case Intrinsic::nvvm_tex_1d_array_level_v4f32_f32:
4755 case Intrinsic::nvvm_tex_1d_array_grad_v4f32_f32:
4756 case Intrinsic::nvvm_tex_2d_v4f32_s32:
4757 case Intrinsic::nvvm_tex_2d_v4f32_f32:
4758 case Intrinsic::nvvm_tex_2d_level_v4f32_f32:
4759 case Intrinsic::nvvm_tex_2d_grad_v4f32_f32:
4760 case Intrinsic::nvvm_tex_2d_array_v4f32_s32:
4761 case Intrinsic::nvvm_tex_2d_array_v4f32_f32:
4762 case Intrinsic::nvvm_tex_2d_array_level_v4f32_f32:
4763 case Intrinsic::nvvm_tex_2d_array_grad_v4f32_f32:
4764 case Intrinsic::nvvm_tex_3d_v4f32_s32:
4765 case Intrinsic::nvvm_tex_3d_v4f32_f32:
4766 case Intrinsic::nvvm_tex_3d_level_v4f32_f32:
4767 case Intrinsic::nvvm_tex_3d_grad_v4f32_f32:
4768 case Intrinsic::nvvm_tex_cube_v4f32_f32:
4769 case Intrinsic::nvvm_tex_cube_level_v4f32_f32:
4770 case Intrinsic::nvvm_tex_cube_array_v4f32_f32:
4771 case Intrinsic::nvvm_tex_cube_array_level_v4f32_f32:
4772 case Intrinsic::nvvm_tld4_r_2d_v4f32_f32:
4773 case Intrinsic::nvvm_tld4_g_2d_v4f32_f32:
4774 case Intrinsic::nvvm_tld4_b_2d_v4f32_f32:
4775 case Intrinsic::nvvm_tld4_a_2d_v4f32_f32:
4776 case Intrinsic::nvvm_tex_unified_1d_v4f32_s32:
4777 case Intrinsic::nvvm_tex_unified_1d_v4f32_f32:
4778 case Intrinsic::nvvm_tex_unified_1d_level_v4f32_f32:
4779 case Intrinsic::nvvm_tex_unified_1d_grad_v4f32_f32:
4780 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_s32:
4781 case Intrinsic::nvvm_tex_unified_1d_array_v4f32_f32:
4782 case Intrinsic::nvvm_tex_unified_1d_array_level_v4f32_f32:
4783 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4f32_f32:
4784 case Intrinsic::nvvm_tex_unified_2d_v4f32_s32:
4785 case Intrinsic::nvvm_tex_unified_2d_v4f32_f32:
4786 case Intrinsic::nvvm_tex_unified_2d_level_v4f32_f32:
4787 case Intrinsic::nvvm_tex_unified_2d_grad_v4f32_f32:
4788 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_s32:
4789 case Intrinsic::nvvm_tex_unified_2d_array_v4f32_f32:
4790 case Intrinsic::nvvm_tex_unified_2d_array_level_v4f32_f32:
4791 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4f32_f32:
4792 case Intrinsic::nvvm_tex_unified_3d_v4f32_s32:
4793 case Intrinsic::nvvm_tex_unified_3d_v4f32_f32:
4794 case Intrinsic::nvvm_tex_unified_3d_level_v4f32_f32:
4795 case Intrinsic::nvvm_tex_unified_3d_grad_v4f32_f32:
4796 case Intrinsic::nvvm_tex_unified_cube_v4f32_f32:
4797 case Intrinsic::nvvm_tex_unified_cube_level_v4f32_f32:
4798 case Intrinsic::nvvm_tex_unified_cube_array_v4f32_f32:
4799 case Intrinsic::nvvm_tex_unified_cube_array_level_v4f32_f32:
4800 case Intrinsic::nvvm_tex_unified_cube_grad_v4f32_f32:
4801 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4f32_f32:
4802 case Intrinsic::nvvm_tld4_unified_r_2d_v4f32_f32:
4803 case Intrinsic::nvvm_tld4_unified_g_2d_v4f32_f32:
4804 case Intrinsic::nvvm_tld4_unified_b_2d_v4f32_f32:
4805 case Intrinsic::nvvm_tld4_unified_a_2d_v4f32_f32:
4806 Info.opc = ISD::INTRINSIC_W_CHAIN;
4807 Info.memVT = MVT::v4f32;
4808 Info.ptrVal = nullptr;
4809 Info.offset = 0;
4810 Info.flags = MachineMemOperand::MOLoad;
4811 Info.align = Align(16);
4812 Infos.push_back(Info);
4813 return;
4814
4815 case Intrinsic::nvvm_tex_1d_v4s32_s32:
4816 case Intrinsic::nvvm_tex_1d_v4s32_f32:
4817 case Intrinsic::nvvm_tex_1d_level_v4s32_f32:
4818 case Intrinsic::nvvm_tex_1d_grad_v4s32_f32:
4819 case Intrinsic::nvvm_tex_1d_array_v4s32_s32:
4820 case Intrinsic::nvvm_tex_1d_array_v4s32_f32:
4821 case Intrinsic::nvvm_tex_1d_array_level_v4s32_f32:
4822 case Intrinsic::nvvm_tex_1d_array_grad_v4s32_f32:
4823 case Intrinsic::nvvm_tex_2d_v4s32_s32:
4824 case Intrinsic::nvvm_tex_2d_v4s32_f32:
4825 case Intrinsic::nvvm_tex_2d_level_v4s32_f32:
4826 case Intrinsic::nvvm_tex_2d_grad_v4s32_f32:
4827 case Intrinsic::nvvm_tex_2d_array_v4s32_s32:
4828 case Intrinsic::nvvm_tex_2d_array_v4s32_f32:
4829 case Intrinsic::nvvm_tex_2d_array_level_v4s32_f32:
4830 case Intrinsic::nvvm_tex_2d_array_grad_v4s32_f32:
4831 case Intrinsic::nvvm_tex_3d_v4s32_s32:
4832 case Intrinsic::nvvm_tex_3d_v4s32_f32:
4833 case Intrinsic::nvvm_tex_3d_level_v4s32_f32:
4834 case Intrinsic::nvvm_tex_3d_grad_v4s32_f32:
4835 case Intrinsic::nvvm_tex_cube_v4s32_f32:
4836 case Intrinsic::nvvm_tex_cube_level_v4s32_f32:
4837 case Intrinsic::nvvm_tex_cube_array_v4s32_f32:
4838 case Intrinsic::nvvm_tex_cube_array_level_v4s32_f32:
4839 case Intrinsic::nvvm_tex_cube_v4u32_f32:
4840 case Intrinsic::nvvm_tex_cube_level_v4u32_f32:
4841 case Intrinsic::nvvm_tex_cube_array_v4u32_f32:
4842 case Intrinsic::nvvm_tex_cube_array_level_v4u32_f32:
4843 case Intrinsic::nvvm_tex_1d_v4u32_s32:
4844 case Intrinsic::nvvm_tex_1d_v4u32_f32:
4845 case Intrinsic::nvvm_tex_1d_level_v4u32_f32:
4846 case Intrinsic::nvvm_tex_1d_grad_v4u32_f32:
4847 case Intrinsic::nvvm_tex_1d_array_v4u32_s32:
4848 case Intrinsic::nvvm_tex_1d_array_v4u32_f32:
4849 case Intrinsic::nvvm_tex_1d_array_level_v4u32_f32:
4850 case Intrinsic::nvvm_tex_1d_array_grad_v4u32_f32:
4851 case Intrinsic::nvvm_tex_2d_v4u32_s32:
4852 case Intrinsic::nvvm_tex_2d_v4u32_f32:
4853 case Intrinsic::nvvm_tex_2d_level_v4u32_f32:
4854 case Intrinsic::nvvm_tex_2d_grad_v4u32_f32:
4855 case Intrinsic::nvvm_tex_2d_array_v4u32_s32:
4856 case Intrinsic::nvvm_tex_2d_array_v4u32_f32:
4857 case Intrinsic::nvvm_tex_2d_array_level_v4u32_f32:
4858 case Intrinsic::nvvm_tex_2d_array_grad_v4u32_f32:
4859 case Intrinsic::nvvm_tex_3d_v4u32_s32:
4860 case Intrinsic::nvvm_tex_3d_v4u32_f32:
4861 case Intrinsic::nvvm_tex_3d_level_v4u32_f32:
4862 case Intrinsic::nvvm_tex_3d_grad_v4u32_f32:
4863 case Intrinsic::nvvm_tld4_r_2d_v4s32_f32:
4864 case Intrinsic::nvvm_tld4_g_2d_v4s32_f32:
4865 case Intrinsic::nvvm_tld4_b_2d_v4s32_f32:
4866 case Intrinsic::nvvm_tld4_a_2d_v4s32_f32:
4867 case Intrinsic::nvvm_tld4_r_2d_v4u32_f32:
4868 case Intrinsic::nvvm_tld4_g_2d_v4u32_f32:
4869 case Intrinsic::nvvm_tld4_b_2d_v4u32_f32:
4870 case Intrinsic::nvvm_tld4_a_2d_v4u32_f32:
4871 case Intrinsic::nvvm_tex_unified_1d_v4s32_s32:
4872 case Intrinsic::nvvm_tex_unified_1d_v4s32_f32:
4873 case Intrinsic::nvvm_tex_unified_1d_level_v4s32_f32:
4874 case Intrinsic::nvvm_tex_unified_1d_grad_v4s32_f32:
4875 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_s32:
4876 case Intrinsic::nvvm_tex_unified_1d_array_v4s32_f32:
4877 case Intrinsic::nvvm_tex_unified_1d_array_level_v4s32_f32:
4878 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4s32_f32:
4879 case Intrinsic::nvvm_tex_unified_2d_v4s32_s32:
4880 case Intrinsic::nvvm_tex_unified_2d_v4s32_f32:
4881 case Intrinsic::nvvm_tex_unified_2d_level_v4s32_f32:
4882 case Intrinsic::nvvm_tex_unified_2d_grad_v4s32_f32:
4883 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_s32:
4884 case Intrinsic::nvvm_tex_unified_2d_array_v4s32_f32:
4885 case Intrinsic::nvvm_tex_unified_2d_array_level_v4s32_f32:
4886 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4s32_f32:
4887 case Intrinsic::nvvm_tex_unified_3d_v4s32_s32:
4888 case Intrinsic::nvvm_tex_unified_3d_v4s32_f32:
4889 case Intrinsic::nvvm_tex_unified_3d_level_v4s32_f32:
4890 case Intrinsic::nvvm_tex_unified_3d_grad_v4s32_f32:
4891 case Intrinsic::nvvm_tex_unified_1d_v4u32_s32:
4892 case Intrinsic::nvvm_tex_unified_1d_v4u32_f32:
4893 case Intrinsic::nvvm_tex_unified_1d_level_v4u32_f32:
4894 case Intrinsic::nvvm_tex_unified_1d_grad_v4u32_f32:
4895 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_s32:
4896 case Intrinsic::nvvm_tex_unified_1d_array_v4u32_f32:
4897 case Intrinsic::nvvm_tex_unified_1d_array_level_v4u32_f32:
4898 case Intrinsic::nvvm_tex_unified_1d_array_grad_v4u32_f32:
4899 case Intrinsic::nvvm_tex_unified_2d_v4u32_s32:
4900 case Intrinsic::nvvm_tex_unified_2d_v4u32_f32:
4901 case Intrinsic::nvvm_tex_unified_2d_level_v4u32_f32:
4902 case Intrinsic::nvvm_tex_unified_2d_grad_v4u32_f32:
4903 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_s32:
4904 case Intrinsic::nvvm_tex_unified_2d_array_v4u32_f32:
4905 case Intrinsic::nvvm_tex_unified_2d_array_level_v4u32_f32:
4906 case Intrinsic::nvvm_tex_unified_2d_array_grad_v4u32_f32:
4907 case Intrinsic::nvvm_tex_unified_3d_v4u32_s32:
4908 case Intrinsic::nvvm_tex_unified_3d_v4u32_f32:
4909 case Intrinsic::nvvm_tex_unified_3d_level_v4u32_f32:
4910 case Intrinsic::nvvm_tex_unified_3d_grad_v4u32_f32:
4911 case Intrinsic::nvvm_tex_unified_cube_v4s32_f32:
4912 case Intrinsic::nvvm_tex_unified_cube_level_v4s32_f32:
4913 case Intrinsic::nvvm_tex_unified_cube_array_v4s32_f32:
4914 case Intrinsic::nvvm_tex_unified_cube_array_level_v4s32_f32:
4915 case Intrinsic::nvvm_tex_unified_cube_v4u32_f32:
4916 case Intrinsic::nvvm_tex_unified_cube_level_v4u32_f32:
4917 case Intrinsic::nvvm_tex_unified_cube_array_v4u32_f32:
4918 case Intrinsic::nvvm_tex_unified_cube_array_level_v4u32_f32:
4919 case Intrinsic::nvvm_tex_unified_cube_grad_v4s32_f32:
4920 case Intrinsic::nvvm_tex_unified_cube_grad_v4u32_f32:
4921 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4s32_f32:
4922 case Intrinsic::nvvm_tex_unified_cube_array_grad_v4u32_f32:
4923 case Intrinsic::nvvm_tld4_unified_r_2d_v4s32_f32:
4924 case Intrinsic::nvvm_tld4_unified_g_2d_v4s32_f32:
4925 case Intrinsic::nvvm_tld4_unified_b_2d_v4s32_f32:
4926 case Intrinsic::nvvm_tld4_unified_a_2d_v4s32_f32:
4927 case Intrinsic::nvvm_tld4_unified_r_2d_v4u32_f32:
4928 case Intrinsic::nvvm_tld4_unified_g_2d_v4u32_f32:
4929 case Intrinsic::nvvm_tld4_unified_b_2d_v4u32_f32:
4930 case Intrinsic::nvvm_tld4_unified_a_2d_v4u32_f32:
4931 Info.opc = ISD::INTRINSIC_W_CHAIN;
4932 Info.memVT = MVT::v4i32;
4933 Info.ptrVal = nullptr;
4934 Info.offset = 0;
4935 Info.flags = MachineMemOperand::MOLoad;
4936 Info.align = Align(16);
4937 Infos.push_back(Info);
4938 return;
4939
4940 case Intrinsic::nvvm_suld_1d_i8_clamp:
4941 case Intrinsic::nvvm_suld_1d_v2i8_clamp:
4942 case Intrinsic::nvvm_suld_1d_v4i8_clamp:
4943 case Intrinsic::nvvm_suld_1d_array_i8_clamp:
4944 case Intrinsic::nvvm_suld_1d_array_v2i8_clamp:
4945 case Intrinsic::nvvm_suld_1d_array_v4i8_clamp:
4946 case Intrinsic::nvvm_suld_2d_i8_clamp:
4947 case Intrinsic::nvvm_suld_2d_v2i8_clamp:
4948 case Intrinsic::nvvm_suld_2d_v4i8_clamp:
4949 case Intrinsic::nvvm_suld_2d_array_i8_clamp:
4950 case Intrinsic::nvvm_suld_2d_array_v2i8_clamp:
4951 case Intrinsic::nvvm_suld_2d_array_v4i8_clamp:
4952 case Intrinsic::nvvm_suld_3d_i8_clamp:
4953 case Intrinsic::nvvm_suld_3d_v2i8_clamp:
4954 case Intrinsic::nvvm_suld_3d_v4i8_clamp:
4955 case Intrinsic::nvvm_suld_1d_i8_trap:
4956 case Intrinsic::nvvm_suld_1d_v2i8_trap:
4957 case Intrinsic::nvvm_suld_1d_v4i8_trap:
4958 case Intrinsic::nvvm_suld_1d_array_i8_trap:
4959 case Intrinsic::nvvm_suld_1d_array_v2i8_trap:
4960 case Intrinsic::nvvm_suld_1d_array_v4i8_trap:
4961 case Intrinsic::nvvm_suld_2d_i8_trap:
4962 case Intrinsic::nvvm_suld_2d_v2i8_trap:
4963 case Intrinsic::nvvm_suld_2d_v4i8_trap:
4964 case Intrinsic::nvvm_suld_2d_array_i8_trap:
4965 case Intrinsic::nvvm_suld_2d_array_v2i8_trap:
4966 case Intrinsic::nvvm_suld_2d_array_v4i8_trap:
4967 case Intrinsic::nvvm_suld_3d_i8_trap:
4968 case Intrinsic::nvvm_suld_3d_v2i8_trap:
4969 case Intrinsic::nvvm_suld_3d_v4i8_trap:
4970 case Intrinsic::nvvm_suld_1d_i8_zero:
4971 case Intrinsic::nvvm_suld_1d_v2i8_zero:
4972 case Intrinsic::nvvm_suld_1d_v4i8_zero:
4973 case Intrinsic::nvvm_suld_1d_array_i8_zero:
4974 case Intrinsic::nvvm_suld_1d_array_v2i8_zero:
4975 case Intrinsic::nvvm_suld_1d_array_v4i8_zero:
4976 case Intrinsic::nvvm_suld_2d_i8_zero:
4977 case Intrinsic::nvvm_suld_2d_v2i8_zero:
4978 case Intrinsic::nvvm_suld_2d_v4i8_zero:
4979 case Intrinsic::nvvm_suld_2d_array_i8_zero:
4980 case Intrinsic::nvvm_suld_2d_array_v2i8_zero:
4981 case Intrinsic::nvvm_suld_2d_array_v4i8_zero:
4982 case Intrinsic::nvvm_suld_3d_i8_zero:
4983 case Intrinsic::nvvm_suld_3d_v2i8_zero:
4984 case Intrinsic::nvvm_suld_3d_v4i8_zero:
4985 Info.opc = ISD::INTRINSIC_W_CHAIN;
4986 Info.memVT = MVT::i8;
4987 Info.ptrVal = nullptr;
4988 Info.offset = 0;
4989 Info.flags = MachineMemOperand::MOLoad;
4990 Info.align = Align(16);
4991 Infos.push_back(Info);
4992 return;
4993
4994 case Intrinsic::nvvm_suld_1d_i16_clamp:
4995 case Intrinsic::nvvm_suld_1d_v2i16_clamp:
4996 case Intrinsic::nvvm_suld_1d_v4i16_clamp:
4997 case Intrinsic::nvvm_suld_1d_array_i16_clamp:
4998 case Intrinsic::nvvm_suld_1d_array_v2i16_clamp:
4999 case Intrinsic::nvvm_suld_1d_array_v4i16_clamp:
5000 case Intrinsic::nvvm_suld_2d_i16_clamp:
5001 case Intrinsic::nvvm_suld_2d_v2i16_clamp:
5002 case Intrinsic::nvvm_suld_2d_v4i16_clamp:
5003 case Intrinsic::nvvm_suld_2d_array_i16_clamp:
5004 case Intrinsic::nvvm_suld_2d_array_v2i16_clamp:
5005 case Intrinsic::nvvm_suld_2d_array_v4i16_clamp:
5006 case Intrinsic::nvvm_suld_3d_i16_clamp:
5007 case Intrinsic::nvvm_suld_3d_v2i16_clamp:
5008 case Intrinsic::nvvm_suld_3d_v4i16_clamp:
5009 case Intrinsic::nvvm_suld_1d_i16_trap:
5010 case Intrinsic::nvvm_suld_1d_v2i16_trap:
5011 case Intrinsic::nvvm_suld_1d_v4i16_trap:
5012 case Intrinsic::nvvm_suld_1d_array_i16_trap:
5013 case Intrinsic::nvvm_suld_1d_array_v2i16_trap:
5014 case Intrinsic::nvvm_suld_1d_array_v4i16_trap:
5015 case Intrinsic::nvvm_suld_2d_i16_trap:
5016 case Intrinsic::nvvm_suld_2d_v2i16_trap:
5017 case Intrinsic::nvvm_suld_2d_v4i16_trap:
5018 case Intrinsic::nvvm_suld_2d_array_i16_trap:
5019 case Intrinsic::nvvm_suld_2d_array_v2i16_trap:
5020 case Intrinsic::nvvm_suld_2d_array_v4i16_trap:
5021 case Intrinsic::nvvm_suld_3d_i16_trap:
5022 case Intrinsic::nvvm_suld_3d_v2i16_trap:
5023 case Intrinsic::nvvm_suld_3d_v4i16_trap:
5024 case Intrinsic::nvvm_suld_1d_i16_zero:
5025 case Intrinsic::nvvm_suld_1d_v2i16_zero:
5026 case Intrinsic::nvvm_suld_1d_v4i16_zero:
5027 case Intrinsic::nvvm_suld_1d_array_i16_zero:
5028 case Intrinsic::nvvm_suld_1d_array_v2i16_zero:
5029 case Intrinsic::nvvm_suld_1d_array_v4i16_zero:
5030 case Intrinsic::nvvm_suld_2d_i16_zero:
5031 case Intrinsic::nvvm_suld_2d_v2i16_zero:
5032 case Intrinsic::nvvm_suld_2d_v4i16_zero:
5033 case Intrinsic::nvvm_suld_2d_array_i16_zero:
5034 case Intrinsic::nvvm_suld_2d_array_v2i16_zero:
5035 case Intrinsic::nvvm_suld_2d_array_v4i16_zero:
5036 case Intrinsic::nvvm_suld_3d_i16_zero:
5037 case Intrinsic::nvvm_suld_3d_v2i16_zero:
5038 case Intrinsic::nvvm_suld_3d_v4i16_zero:
5039 Info.opc = ISD::INTRINSIC_W_CHAIN;
5040 Info.memVT = MVT::i16;
5041 Info.ptrVal = nullptr;
5042 Info.offset = 0;
5043 Info.flags = MachineMemOperand::MOLoad;
5044 Info.align = Align(16);
5045 Infos.push_back(Info);
5046 return;
5047
5048 case Intrinsic::nvvm_suld_1d_i32_clamp:
5049 case Intrinsic::nvvm_suld_1d_v2i32_clamp:
5050 case Intrinsic::nvvm_suld_1d_v4i32_clamp:
5051 case Intrinsic::nvvm_suld_1d_array_i32_clamp:
5052 case Intrinsic::nvvm_suld_1d_array_v2i32_clamp:
5053 case Intrinsic::nvvm_suld_1d_array_v4i32_clamp:
5054 case Intrinsic::nvvm_suld_2d_i32_clamp:
5055 case Intrinsic::nvvm_suld_2d_v2i32_clamp:
5056 case Intrinsic::nvvm_suld_2d_v4i32_clamp:
5057 case Intrinsic::nvvm_suld_2d_array_i32_clamp:
5058 case Intrinsic::nvvm_suld_2d_array_v2i32_clamp:
5059 case Intrinsic::nvvm_suld_2d_array_v4i32_clamp:
5060 case Intrinsic::nvvm_suld_3d_i32_clamp:
5061 case Intrinsic::nvvm_suld_3d_v2i32_clamp:
5062 case Intrinsic::nvvm_suld_3d_v4i32_clamp:
5063 case Intrinsic::nvvm_suld_1d_i32_trap:
5064 case Intrinsic::nvvm_suld_1d_v2i32_trap:
5065 case Intrinsic::nvvm_suld_1d_v4i32_trap:
5066 case Intrinsic::nvvm_suld_1d_array_i32_trap:
5067 case Intrinsic::nvvm_suld_1d_array_v2i32_trap:
5068 case Intrinsic::nvvm_suld_1d_array_v4i32_trap:
5069 case Intrinsic::nvvm_suld_2d_i32_trap:
5070 case Intrinsic::nvvm_suld_2d_v2i32_trap:
5071 case Intrinsic::nvvm_suld_2d_v4i32_trap:
5072 case Intrinsic::nvvm_suld_2d_array_i32_trap:
5073 case Intrinsic::nvvm_suld_2d_array_v2i32_trap:
5074 case Intrinsic::nvvm_suld_2d_array_v4i32_trap:
5075 case Intrinsic::nvvm_suld_3d_i32_trap:
5076 case Intrinsic::nvvm_suld_3d_v2i32_trap:
5077 case Intrinsic::nvvm_suld_3d_v4i32_trap:
5078 case Intrinsic::nvvm_suld_1d_i32_zero:
5079 case Intrinsic::nvvm_suld_1d_v2i32_zero:
5080 case Intrinsic::nvvm_suld_1d_v4i32_zero:
5081 case Intrinsic::nvvm_suld_1d_array_i32_zero:
5082 case Intrinsic::nvvm_suld_1d_array_v2i32_zero:
5083 case Intrinsic::nvvm_suld_1d_array_v4i32_zero:
5084 case Intrinsic::nvvm_suld_2d_i32_zero:
5085 case Intrinsic::nvvm_suld_2d_v2i32_zero:
5086 case Intrinsic::nvvm_suld_2d_v4i32_zero:
5087 case Intrinsic::nvvm_suld_2d_array_i32_zero:
5088 case Intrinsic::nvvm_suld_2d_array_v2i32_zero:
5089 case Intrinsic::nvvm_suld_2d_array_v4i32_zero:
5090 case Intrinsic::nvvm_suld_3d_i32_zero:
5091 case Intrinsic::nvvm_suld_3d_v2i32_zero:
5092 case Intrinsic::nvvm_suld_3d_v4i32_zero:
5093 Info.opc = ISD::INTRINSIC_W_CHAIN;
5094 Info.memVT = MVT::i32;
5095 Info.ptrVal = nullptr;
5096 Info.offset = 0;
5097 Info.flags = MachineMemOperand::MOLoad;
5098 Info.align = Align(16);
5099 Infos.push_back(Info);
5100 return;
5101
5102 case Intrinsic::nvvm_suld_1d_i64_clamp:
5103 case Intrinsic::nvvm_suld_1d_v2i64_clamp:
5104 case Intrinsic::nvvm_suld_1d_array_i64_clamp:
5105 case Intrinsic::nvvm_suld_1d_array_v2i64_clamp:
5106 case Intrinsic::nvvm_suld_2d_i64_clamp:
5107 case Intrinsic::nvvm_suld_2d_v2i64_clamp:
5108 case Intrinsic::nvvm_suld_2d_array_i64_clamp:
5109 case Intrinsic::nvvm_suld_2d_array_v2i64_clamp:
5110 case Intrinsic::nvvm_suld_3d_i64_clamp:
5111 case Intrinsic::nvvm_suld_3d_v2i64_clamp:
5112 case Intrinsic::nvvm_suld_1d_i64_trap:
5113 case Intrinsic::nvvm_suld_1d_v2i64_trap:
5114 case Intrinsic::nvvm_suld_1d_array_i64_trap:
5115 case Intrinsic::nvvm_suld_1d_array_v2i64_trap:
5116 case Intrinsic::nvvm_suld_2d_i64_trap:
5117 case Intrinsic::nvvm_suld_2d_v2i64_trap:
5118 case Intrinsic::nvvm_suld_2d_array_i64_trap:
5119 case Intrinsic::nvvm_suld_2d_array_v2i64_trap:
5120 case Intrinsic::nvvm_suld_3d_i64_trap:
5121 case Intrinsic::nvvm_suld_3d_v2i64_trap:
5122 case Intrinsic::nvvm_suld_1d_i64_zero:
5123 case Intrinsic::nvvm_suld_1d_v2i64_zero:
5124 case Intrinsic::nvvm_suld_1d_array_i64_zero:
5125 case Intrinsic::nvvm_suld_1d_array_v2i64_zero:
5126 case Intrinsic::nvvm_suld_2d_i64_zero:
5127 case Intrinsic::nvvm_suld_2d_v2i64_zero:
5128 case Intrinsic::nvvm_suld_2d_array_i64_zero:
5129 case Intrinsic::nvvm_suld_2d_array_v2i64_zero:
5130 case Intrinsic::nvvm_suld_3d_i64_zero:
5131 case Intrinsic::nvvm_suld_3d_v2i64_zero:
5132 Info.opc = ISD::INTRINSIC_W_CHAIN;
5133 Info.memVT = MVT::i64;
5134 Info.ptrVal = nullptr;
5135 Info.offset = 0;
5136 Info.flags = MachineMemOperand::MOLoad;
5137 Info.align = Align(16);
5138 Infos.push_back(Info);
5139 return;
5140
5141 case Intrinsic::nvvm_tcgen05_ld_16x64b_x1:
5142 case Intrinsic::nvvm_tcgen05_ld_32x32b_x1:
5143 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x1: {
5144 Info.opc = ISD::INTRINSIC_W_CHAIN;
5145 Info.memVT = MVT::v1i32;
5146 Info.ptrVal = I.getArgOperand(0);
5147 Info.offset = 0;
5148 Info.flags = MachineMemOperand::MOLoad;
5149 Info.align.reset();
5150 Infos.push_back(Info);
5151 return;
5152 }
5153
5154 case Intrinsic::nvvm_tcgen05_ld_16x64b_x2:
5155 case Intrinsic::nvvm_tcgen05_ld_16x128b_x1:
5156 case Intrinsic::nvvm_tcgen05_ld_32x32b_x2:
5157 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x2:
5158 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_i32:
5159 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_i32: {
5160 Info.opc = ISD::INTRINSIC_W_CHAIN;
5161 Info.memVT = MVT::v2i32;
5162 Info.ptrVal = I.getArgOperand(0);
5163 Info.offset = 0;
5164 Info.flags = MachineMemOperand::MOLoad;
5165 Info.align.reset();
5166 Infos.push_back(Info);
5167 return;
5168 }
5169
5170 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x2_f32:
5171 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x2_f32: {
5172 Info.opc = ISD::INTRINSIC_W_CHAIN;
5173 Info.memVT = MVT::v2f32;
5174 Info.ptrVal = I.getArgOperand(0);
5175 Info.offset = 0;
5176 Info.flags = MachineMemOperand::MOLoad;
5177 Info.align.reset();
5178 Infos.push_back(Info);
5179 return;
5180 }
5181
5182 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
5183 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
5184 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
5185 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
5186 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
5187 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
5188 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32: {
5189 Info.opc = ISD::INTRINSIC_W_CHAIN;
5190 Info.memVT = MVT::v4i32;
5191 Info.ptrVal = I.getArgOperand(0);
5192 Info.offset = 0;
5193 Info.flags = MachineMemOperand::MOLoad;
5194 Info.align.reset();
5195 Infos.push_back(Info);
5196 return;
5197 }
5198
5199 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
5200 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32: {
5201 Info.opc = ISD::INTRINSIC_W_CHAIN;
5202 Info.memVT = MVT::v4f32;
5203 Info.ptrVal = I.getArgOperand(0);
5204 Info.offset = 0;
5205 Info.flags = MachineMemOperand::MOLoad;
5206 Info.align.reset();
5207 Infos.push_back(Info);
5208 return;
5209 }
5210
5211 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
5212 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
5213 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
5214 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
5215 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
5216 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
5217 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32: {
5218 Info.opc = ISD::INTRINSIC_W_CHAIN;
5219 Info.memVT = MVT::v8i32;
5220 Info.ptrVal = I.getArgOperand(0);
5221 Info.offset = 0;
5222 Info.flags = MachineMemOperand::MOLoad;
5223 Info.align.reset();
5224 Infos.push_back(Info);
5225 return;
5226 }
5227
5228 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
5229 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32: {
5230 Info.opc = ISD::INTRINSIC_W_CHAIN;
5231 Info.memVT = MVT::v8f32;
5232 Info.ptrVal = I.getArgOperand(0);
5233 Info.offset = 0;
5234 Info.flags = MachineMemOperand::MOLoad;
5235 Info.align.reset();
5236 Infos.push_back(Info);
5237 return;
5238 }
5239
5240 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
5241 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
5242 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
5243 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
5244 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
5245 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
5246 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32: {
5247 Info.opc = ISD::INTRINSIC_W_CHAIN;
5248 Info.memVT = MVT::v16i32;
5249 Info.ptrVal = I.getArgOperand(0);
5250 Info.offset = 0;
5251 Info.flags = MachineMemOperand::MOLoad;
5252 Info.align.reset();
5253 Infos.push_back(Info);
5254 return;
5255 }
5256
5257 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
5258 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32: {
5259 Info.opc = ISD::INTRINSIC_W_CHAIN;
5260 Info.memVT = MVT::v16f32;
5261 Info.ptrVal = I.getArgOperand(0);
5262 Info.offset = 0;
5263 Info.flags = MachineMemOperand::MOLoad;
5264 Info.align.reset();
5265 Infos.push_back(Info);
5266 return;
5267 }
5268
5269 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
5270 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
5271 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
5272 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
5273 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
5274 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
5275 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32: {
5276 Info.opc = ISD::INTRINSIC_W_CHAIN;
5277 Info.memVT = MVT::v32i32;
5278 Info.ptrVal = I.getArgOperand(0);
5279 Info.offset = 0;
5280 Info.flags = MachineMemOperand::MOLoad;
5281 Info.align.reset();
5282 Infos.push_back(Info);
5283 return;
5284 }
5285
5286 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
5287 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32: {
5288 Info.opc = ISD::INTRINSIC_W_CHAIN;
5289 Info.memVT = MVT::v32f32;
5290 Info.ptrVal = I.getArgOperand(0);
5291 Info.offset = 0;
5292 Info.flags = MachineMemOperand::MOLoad;
5293 Info.align.reset();
5294 Infos.push_back(Info);
5295 return;
5296 }
5297
5298 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
5299 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
5300 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
5301 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
5302 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
5303 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
5304 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32: {
5305 Info.opc = ISD::INTRINSIC_W_CHAIN;
5306 Info.memVT = MVT::v64i32;
5307 Info.ptrVal = I.getArgOperand(0);
5308 Info.offset = 0;
5309 Info.flags = MachineMemOperand::MOLoad;
5310 Info.align.reset();
5311 Infos.push_back(Info);
5312 return;
5313 }
5314
5315 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
5316 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32: {
5317 Info.opc = ISD::INTRINSIC_W_CHAIN;
5318 Info.memVT = MVT::v64f32;
5319 Info.ptrVal = I.getArgOperand(0);
5320 Info.offset = 0;
5321 Info.flags = MachineMemOperand::MOLoad;
5322 Info.align.reset();
5323 Infos.push_back(Info);
5324 return;
5325 }
5326
5327 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
5328 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
5329 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
5330 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
5331 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
5332 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
5333 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32: {
5334 Info.opc = ISD::INTRINSIC_W_CHAIN;
5335 Info.memVT = MVT::v128i32;
5336 Info.ptrVal = I.getArgOperand(0);
5337 Info.offset = 0;
5338 Info.flags = MachineMemOperand::MOLoad;
5339 Info.align.reset();
5340 Infos.push_back(Info);
5341 return;
5342 }
5343
5344 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
5345 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32: {
5346 Info.opc = ISD::INTRINSIC_W_CHAIN;
5347 Info.memVT = MVT::v128f32;
5348 Info.ptrVal = I.getArgOperand(0);
5349 Info.offset = 0;
5350 Info.flags = MachineMemOperand::MOLoad;
5351 Info.align.reset();
5352 Infos.push_back(Info);
5353 return;
5354 }
5355
5356 case Intrinsic::nvvm_tcgen05_st_16x64b_x1:
5357 case Intrinsic::nvvm_tcgen05_st_32x32b_x1:
5358 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x1: {
5359 Info.opc = ISD::INTRINSIC_VOID;
5360 Info.memVT = MVT::i32;
5361 Info.ptrVal = I.getArgOperand(0);
5362 Info.offset = 0;
5363 Info.flags = MachineMemOperand::MOStore;
5364 Info.align.reset();
5365 Infos.push_back(Info);
5366 return;
5367 }
5368
5369 case Intrinsic::nvvm_tcgen05_st_16x64b_x2:
5370 case Intrinsic::nvvm_tcgen05_st_16x128b_x1:
5371 case Intrinsic::nvvm_tcgen05_st_32x32b_x2:
5372 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x2: {
5373 Info.opc = ISD::INTRINSIC_VOID;
5374 Info.memVT = MVT::v2i32;
5375 Info.ptrVal = I.getArgOperand(0);
5376 Info.offset = 0;
5377 Info.flags = MachineMemOperand::MOStore;
5378 Info.align.reset();
5379 Infos.push_back(Info);
5380 return;
5381 }
5382
5383 case Intrinsic::nvvm_tcgen05_st_16x64b_x4:
5384 case Intrinsic::nvvm_tcgen05_st_16x128b_x2:
5385 case Intrinsic::nvvm_tcgen05_st_16x256b_x1:
5386 case Intrinsic::nvvm_tcgen05_st_32x32b_x4:
5387 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x4: {
5388 Info.opc = ISD::INTRINSIC_VOID;
5389 Info.memVT = MVT::v4i32;
5390 Info.ptrVal = I.getArgOperand(0);
5391 Info.offset = 0;
5392 Info.flags = MachineMemOperand::MOStore;
5393 Info.align.reset();
5394 Infos.push_back(Info);
5395 return;
5396 }
5397
5398 case Intrinsic::nvvm_tcgen05_st_16x64b_x8:
5399 case Intrinsic::nvvm_tcgen05_st_16x128b_x4:
5400 case Intrinsic::nvvm_tcgen05_st_16x256b_x2:
5401 case Intrinsic::nvvm_tcgen05_st_32x32b_x8:
5402 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x8: {
5403 Info.opc = ISD::INTRINSIC_VOID;
5404 Info.memVT = MVT::v8i32;
5405 Info.ptrVal = I.getArgOperand(0);
5406 Info.offset = 0;
5407 Info.flags = MachineMemOperand::MOStore;
5408 Info.align.reset();
5409 Infos.push_back(Info);
5410 return;
5411 }
5412
5413 case Intrinsic::nvvm_tcgen05_st_16x64b_x16:
5414 case Intrinsic::nvvm_tcgen05_st_16x128b_x8:
5415 case Intrinsic::nvvm_tcgen05_st_16x256b_x4:
5416 case Intrinsic::nvvm_tcgen05_st_32x32b_x16:
5417 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x16: {
5418 Info.opc = ISD::INTRINSIC_VOID;
5419 Info.memVT = MVT::v16i32;
5420 Info.ptrVal = I.getArgOperand(0);
5421 Info.offset = 0;
5422 Info.flags = MachineMemOperand::MOStore;
5423 Info.align.reset();
5424 Infos.push_back(Info);
5425 return;
5426 }
5427
5428 case Intrinsic::nvvm_tcgen05_st_16x64b_x32:
5429 case Intrinsic::nvvm_tcgen05_st_16x128b_x16:
5430 case Intrinsic::nvvm_tcgen05_st_16x256b_x8:
5431 case Intrinsic::nvvm_tcgen05_st_32x32b_x32:
5432 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x32: {
5433 Info.opc = ISD::INTRINSIC_VOID;
5434 Info.memVT = MVT::v32i32;
5435 Info.ptrVal = I.getArgOperand(0);
5436 Info.offset = 0;
5437 Info.flags = MachineMemOperand::MOStore;
5438 Info.align.reset();
5439 Infos.push_back(Info);
5440 return;
5441 }
5442
5443 case Intrinsic::nvvm_tcgen05_st_16x64b_x64:
5444 case Intrinsic::nvvm_tcgen05_st_16x128b_x32:
5445 case Intrinsic::nvvm_tcgen05_st_16x256b_x16:
5446 case Intrinsic::nvvm_tcgen05_st_32x32b_x64:
5447 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x64: {
5448 Info.opc = ISD::INTRINSIC_VOID;
5449 Info.memVT = MVT::v64i32;
5450 Info.ptrVal = I.getArgOperand(0);
5451 Info.offset = 0;
5452 Info.flags = MachineMemOperand::MOStore;
5453 Info.align.reset();
5454 Infos.push_back(Info);
5455 return;
5456 }
5457
5458 case Intrinsic::nvvm_tcgen05_st_16x64b_x128:
5459 case Intrinsic::nvvm_tcgen05_st_16x128b_x64:
5460 case Intrinsic::nvvm_tcgen05_st_16x256b_x32:
5461 case Intrinsic::nvvm_tcgen05_st_32x32b_x128:
5462 case Intrinsic::nvvm_tcgen05_st_16x32bx2_x128: {
5463 Info.opc = ISD::INTRINSIC_VOID;
5464 Info.memVT = MVT::v128i32;
5465 Info.ptrVal = I.getArgOperand(0);
5466 Info.offset = 0;
5467 Info.flags = MachineMemOperand::MOStore;
5468 Info.align.reset();
5469 Infos.push_back(Info);
5470 return;
5471 }
5472 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg1:
5473 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg1:
5474 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg1:
5475 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg1:
5476 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1:
5477 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1:
5478 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg1_ashift:
5479 case Intrinsic::
5480 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg1_ashift:
5481 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1:
5482 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1:
5483 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg1_ashift:
5484 case Intrinsic::
5485 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg1_ashift: {
5486 // We are reading and writing back to TMem
5487 Info.opc = ISD::INTRINSIC_VOID;
5488 Info.memVT = MVT::v4i32;
5489 Info.ptrVal = I.getArgOperand(0);
5490 Info.offset = 0;
5492 Info.align = Align(16);
5493 Infos.push_back(Info);
5494 return;
5495 }
5496
5497 case Intrinsic::nvvm_tcgen05_mma_shared_disable_output_lane_cg2:
5498 case Intrinsic::nvvm_tcgen05_mma_shared_scale_d_disable_output_lane_cg2:
5499 case Intrinsic::nvvm_tcgen05_mma_sp_shared_disable_output_lane_cg2:
5500 case Intrinsic::nvvm_tcgen05_mma_sp_shared_scale_d_disable_output_lane_cg2:
5501 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2:
5502 case Intrinsic::nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2:
5503 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2:
5504 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2:
5505 case Intrinsic::nvvm_tcgen05_mma_tensor_disable_output_lane_cg2_ashift:
5506 case Intrinsic::
5507 nvvm_tcgen05_mma_tensor_scale_d_disable_output_lane_cg2_ashift:
5508 case Intrinsic::nvvm_tcgen05_mma_sp_tensor_disable_output_lane_cg2_ashift:
5509 case Intrinsic::
5510 nvvm_tcgen05_mma_sp_tensor_scale_d_disable_output_lane_cg2_ashift: {
5511 // We are reading and writing back to TMem
5512 Info.opc = ISD::INTRINSIC_VOID;
5513 Info.memVT = MVT::v8i32;
5514 Info.ptrVal = I.getArgOperand(0);
5515 Info.offset = 0;
5517 Info.align = Align(16);
5518 Infos.push_back(Info);
5519 return;
5520 }
5521 }
5522}
5523
5524// Helper for getting a function parameter name. The name is composed from
5525// the function's symbol name followed by "_param_<Idx>". A negative index
5526// corresponds to the special parameter (unsized array) used for passing
// variable arguments and yields "<symbol>_vararg" instead.
5528 int Idx) const {
5529 std::string ParamName;
 // ParamStr writes into ParamName, which is what gets returned.
5530 raw_string_ostream ParamStr(ParamName);
5531
 // Prefix: name of the symbol the target machine reports for F.
5532 ParamStr << getTargetMachine().getSymbol(F)->getName();
5533 if (Idx < 0)
5534 ParamStr << "_vararg";
5535 else
5536 ParamStr << "_param_" << Idx;
5537
5538 return ParamName;
5539}
5540
5541/// isLegalAddressingMode - Return true if the addressing mode represented
5542/// by AM is legal for this target, for a load/store of the specified type.
5543/// Used to guide target specific optimizations, like loop strength reduction
5544/// (LoopStrengthReduce.cpp) and memory optimization for address mode
5545/// (CodeGenPrepare.cpp)
///
/// The decision below is based solely on the fields of \p AM (BaseGV,
/// BaseOffs, HasBaseReg and Scale); \p Ty, \p AS and \p I are not consulted.
5547 const AddrMode &AM, Type *Ty,
5548 unsigned AS, Instruction *I) const {
5549 // AddrMode - This represents an addressing mode of:
5550 // BaseGV + BaseOffs + BaseReg + Scale*ScaleReg
5551 //
5552 // The legal address modes are
5553 // - [avar]
5554 // - [areg]
5555 // - [areg+immoff]
5556 // - [immAddr]
5557
5558 // immoff must fit in a signed 32-bit int
5559 if (!APInt(64, AM.BaseOffs).isSignedIntN(32))
5560 return false;
5561
 // [avar]: a global base may not be combined with an offset, a base register
 // or a scaled register.
5562 if (AM.BaseGV)
5563 return !AM.BaseOffs && !AM.HasBaseReg && !AM.Scale;
5564
5565 switch (AM.Scale) {
5566 case 0: // "r", "r+i" or "i" is allowed
5567 break;
5568 case 1:
5569 if (AM.HasBaseReg) // "r+r+i" or "r+r" is not allowed.
5570 return false;
5571 // Otherwise we have r+i.
5572 break;
5573 default:
5574 // No scale > 1 is allowed
5575 return false;
5576 }
5577 return true;
5578}
5579
5580//===----------------------------------------------------------------------===//
5581// NVPTX Inline Assembly Support
5582//===----------------------------------------------------------------------===//
5583
5584/// getConstraintType - Given a constraint letter, return the type of
5585/// constraint it is for this target.
///
/// Every single-character constraint listed in the switch below is reported
/// as C_RegisterClass. Any other constraint (including all multi-character
/// ones) is forwarded to TargetLowering::getConstraintType.
5588 if (Constraint.size() == 1) {
5589 switch (Constraint[0]) {
5590 default:
 // Unknown letter: fall out of the switch to the generic handling below.
5591 break;
5592 case 'b':
5593 case 'r':
5594 case 'h':
5595 case 'c':
5596 case 'l':
5597 case 'f':
5598 case 'd':
5599 case 'q':
5600 case '0':
5601 case 'N':
5602 return C_RegisterClass;
5603 }
5604 }
5605 return TargetLowering::getConstraintType(Constraint);
5606}
5607
/// Map a single-letter inline-asm constraint to an NVPTX register class.
///
/// The returned pair always has 0 as its first member for the constraints
/// handled here; the second member is selected as follows:
///   'b'           -> B1RegClass
///   'c', 'h'      -> B16RegClass
///   'r', 'f'      -> B32RegClass
///   'l', 'N', 'd' -> B64RegClass
///   'q'           -> B128RegClass (fatal error when the SM version is < 70)
/// Anything else is forwarded to
/// TargetLowering::getRegForInlineAsmConstraint. \p VT is only used by that
/// fallback.
5608std::pair<unsigned, const TargetRegisterClass *>
5610 StringRef Constraint,
5611 MVT VT) const {
5612 if (Constraint.size() == 1) {
5613 switch (Constraint[0]) {
5614 case 'b':
5615 return std::make_pair(0U, &NVPTX::B1RegClass);
5616 case 'c':
5617 case 'h':
5618 return std::make_pair(0U, &NVPTX::B16RegClass);
5619 case 'r':
5620 case 'f':
5621 return std::make_pair(0U, &NVPTX::B32RegClass);
5622 case 'l':
5623 case 'N':
5624 case 'd':
5625 return std::make_pair(0U, &NVPTX::B64RegClass);
5626 case 'q': {
 // 128-bit operands are rejected outright on subtargets older than sm_70.
5627 if (STI.getSmVersion() < 70)
5628 report_fatal_error("Inline asm with 128 bit operands is only "
5629 "supported for sm_70 and higher!");
5630 return std::make_pair(0U, &NVPTX::B128RegClass);
5631 }
5632 }
5633 }
5634 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
5635}
5636
5637//===----------------------------------------------------------------------===//
5638// NVPTX DAG Combining
5639//===----------------------------------------------------------------------===//
5640
5642 CodeGenOptLevel OptLevel) const {
 // Decides whether FMA contraction is permitted. Precedence, highest first:
 // the FMAContractLevelOpt command-line option when it was given explicitly,
 // then the optimization level, then the target-options check further down.
5643 // Always honor command-line argument
5644 if (FMAContractLevelOpt.getNumOccurrences() > 0)
5645 return FMAContractLevelOpt > 0;
5646
5647 // Do not contract if we're not optimizing the code.
5648 if (OptLevel == CodeGenOptLevel::None)
5649 return false;
5650
5651 // Honor TargetOptions flags that explicitly say fusion is okay.
5653 return true;
5654
 // No explicit permission was found: do not contract.
5655 return false;
5656}
5657
5658static bool isConstZero(const SDValue &Operand) {
5659 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
5660 return Const && Const->getZExtValue() == 0;
5661}
5662
5663/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
5664/// operands N0 and N1. This is a helper for PerformADDCombine that is
5665/// called with the default operands, and if that fails, with commuted
5666/// operands.
///
/// Returns the replacement value, or an empty SDValue when no combine
/// applies. Only N0 is inspected as the candidate (select ..., mul) operand;
/// N1 is treated as the plain addend.
5667static SDValue
5670 EVT VT = N0.getValueType();
5671
5672 // Since integer multiply-add costs the same as integer multiply
5673 // but is more costly than integer add, do the fusion only when
5674 // the mul is only used in the add.
5675 // TODO: this may not be true for later architectures, consider relaxing this
5676 if (!N0.getNode()->hasOneUse())
5677 return SDValue();
5678
5679 // fold (add (select cond, 0, (mul a, b)), c)
5680 // -> (select cond, c, (add (mul a, b), c))
5681 //
5682 if (N0.getOpcode() == ISD::SELECT) {
 // Find which arm of the select (operand 1 or operand 2) is the constant 0.
5683 unsigned ZeroOpNum;
5684 if (isConstZero(N0->getOperand(1)))
5685 ZeroOpNum = 1;
5686 else if (isConstZero(N0->getOperand(2)))
5687 ZeroOpNum = 2;
5688 else
5689 return SDValue();
5690
 // The other arm must be a MUL that has no other users.
5691 SDValue M = N0->getOperand((ZeroOpNum == 1) ? 2 : 1);
5692 if (M->getOpcode() != ISD::MUL || !M.getNode()->hasOneUse())
5693 return SDValue();
5694
5695 SDLoc DL(N);
5696 SDValue Mul =
5697 DCI.DAG.getNode(ISD::MUL, DL, VT, M->getOperand(0), M->getOperand(1));
5698 SDValue MAD = DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, N1);
 // Rebuild the select with the add pushed into both arms: the arm that was 0
 // becomes N1 and the arm that was the mul becomes (mul + N1).
5699 return DCI.DAG.getSelect(SDLoc(N), VT, N0->getOperand(0),
5700 ((ZeroOpNum == 1) ? N1 : MAD),
5701 ((ZeroOpNum == 1) ? MAD : N1));
5702 }
5703
5704 return SDValue();
5705}
5706
/// Try to fold (fadd (fmul a, b), c) into (fma a, b, c), where \p N0 is the
/// candidate FMUL operand of the FADD \p N and \p N1 is the other operand.
///
/// The fold requires either allowFMA() to hold or both \p N and \p N0 to
/// carry the allow-contract flag, and is further gated by use-count and
/// IR-order heuristics meant to limit register pressure. Returns an empty
/// SDValue when the fold is not performed.
5707SDValue NVPTXTargetLowering::performFADDCombineWithOperands(
5709 CodeGenOptLevel OptLevel) const {
5710 EVT VT = N0.getValueType();
5711 if (N0.getOpcode() == ISD::FMUL) {
5712 if (!(allowFMA(DCI.DAG.getMachineFunction(), OptLevel) ||
5713 (N->getFlags().hasAllowContract() &&
5714 N0->getFlags().hasAllowContract())))
5715 return SDValue();
5716
5717 // For floating point:
5718 // Do the fusion only when the mul has less than 5 uses and all
5719 // are add.
5720 // The heuristic is that if a use is not an add, then that use
5721 // cannot be fused into fma, therefore mul is still needed anyway.
5722 // If there are more than 4 uses, even if they are all add, fusing
5723 // them will increase register pressure.
5724 //
5725 int numUses = 0;
5726 int nonAddCount = 0;
5727 for (const SDNode *User : N0.getNode()->users()) {
5728 numUses++;
5729 if (User->getOpcode() != ISD::FADD)
5730 ++nonAddCount;
5731 if (numUses >= 5)
5732 return SDValue();
5733 }
5734 if (nonAddCount) {
5735 int orderNo = N->getIROrder();
5736 int orderNo2 = N0.getNode()->getIROrder();
5737 // Simple heuristic for potential register pressure: the difference in
5738 // IR order is used as a measure of the distance between def and use;
5739 // the longer the distance, the more likely it is to cause register
5740 // pressure.
 // NOTE(review): the check below gives up when the distance is *below* 500,
 // which looks like the opposite of what the comment above describes --
 // confirm the intended direction.
5741 if (orderNo - orderNo2 < 500)
5742 return SDValue();
5743
5744 // Now, check if at least one of the FMUL's operands is live beyond the
5745 // node N, which guarantees that the FMA will not increase register
5746 // pressure at node N.
5747 bool opIsLive = false;
5748 const SDNode *left = N0.getOperand(0).getNode();
5749 const SDNode *right = N0.getOperand(1).getNode();
5750
 // A constant operand is counted as live.
5751 if (isa<ConstantSDNode>(left) || isa<ConstantSDNode>(right))
5752 opIsLive = true;
5753
 // Otherwise an operand counts as live if any of its users has a larger IR
 // order than N.
5754 if (!opIsLive)
5755 for (const SDNode *User : left->users()) {
5756 int orderNo3 = User->getIROrder();
5757 if (orderNo3 > orderNo) {
5758 opIsLive = true;
5759 break;
5760 }
5761 }
5762
5763 if (!opIsLive)
5764 for (const SDNode *User : right->users()) {
5765 int orderNo3 = User->getIROrder();
5766 if (orderNo3 > orderNo) {
5767 opIsLive = true;
5768 break;
5769 }
5770 }
5771
5772 if (!opIsLive)
5773 return SDValue();
5774 }
5775
5776 return DCI.DAG.getNode(ISD::FMA, SDLoc(N), VT, N0.getOperand(0),
5777 N0.getOperand(1), N1);
5778 }
5779
5780 return SDValue();
5781}
5782
5783/// Fold unpacking movs into a load by increasing the number of return values.
5784///
5785/// ex:
5786/// L: v2f16,ch = load <p>
5787/// a: f16 = extractelt L:0, 0
5788/// b: f16 = extractelt L:0, 1
5789/// use(a, b)
5790///
5791/// ...is turned into...
5792///
5793/// L: f16,f16,ch = LoadV2 <p>
5794/// use(L:0, L:1)
///
/// Returns a MERGE_VALUES node that reproduces the original outputs of N
/// (each packed value rebuilt with a BUILD_VECTOR, followed by the remaining
/// chain/glue values), or an empty SDValue when the fold does not apply.
5795static SDValue
5797 // Don't run this optimization before the legalizer
5798 if (!DCI.isAfterLegalizeDAG())
5799 return SDValue();
5800
5801 EVT ElementVT = N->getValueType(0);
5802 // Avoid non-packed types and v4i8
5803 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5804 return SDValue();
5805
5806 // Check whether all outputs are either used by an extractelt or are
5807 // glue/chain nodes
5808 if (!all_of(N->uses(), [&](SDUse &U) {
5809 // Skip glue, chain nodes
5810 if (U.getValueType() == MVT::Glue || U.getValueType() == MVT::Other)
5811 return true;
5812 if (U.getUser()->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
5813 if (N->getOpcode() != ISD::LOAD)
5814 return true;
5815 // Since this is an ISD::LOAD, check all extractelts are used. If
5816 // any are not used, we don't want to defeat another optimization that
5817 // will narrow the load.
5818 //
5819 // For example:
5820 //
5821 // L: v2f16,ch = load <p>
5822 // e0: f16 = extractelt L:0, 0
5823 // e1: f16 = extractelt L:0, 1 <-- unused
5824 // store e0
5825 //
5826 // Can be optimized by DAGCombiner to:
5827 //
5828 // L: f16,ch = load <p>
5829 // store L:0
5830 return !U.getUser()->use_empty();
5831 }
5832
5833 // Otherwise, this use prevents us from splitting a value.
5834 return false;
5835 }))
5836 return SDValue();
5837
5838 auto *LD = cast<MemSDNode>(N);
5839 SDLoc DL(LD);
5840
5841 // the new opcode after we double the number of operands
5842 unsigned Opcode;
 // Operands of the new node start out as a copy of the old node's operands.
5843 SmallVector<SDValue> Operands(LD->ops());
5844 unsigned OldNumOutputs; // non-glue, non-chain outputs
5845 switch (LD->getOpcode()) {
5846 case ISD::LOAD:
5847 OldNumOutputs = 1;
5848 // Any packed type is legal, so the legalizer will not have lowered
5849 // ISD::LOAD -> NVPTXISD::Load (unless it's under-aligned). We have to do it
5850 // here.
5851 Opcode = NVPTXISD::LoadV2;
5852 // append a "full" used bytes mask operand right before the extension type
5853 // operand, signifying that all bytes are used.
5854 Operands.push_back(DCI.DAG.getConstant(UINT32_MAX, DL, MVT::i32));
5855 Operands.push_back(DCI.DAG.getIntPtrConstant(
5856 cast<LoadSDNode>(LD)->getExtensionType(), DL));
5857 break;
5858 case NVPTXISD::LoadV2:
5859 OldNumOutputs = 2;
5860 Opcode = NVPTXISD::LoadV4;
5861 break;
5862 case NVPTXISD::LoadV4:
5863 // V8 is only supported for f32/i32. Don't forget, we're not changing the
5864 // load size here. This is already a 256-bit load.
5865 if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
5866 return SDValue();
5867 OldNumOutputs = 4;
5868 Opcode = NVPTXISD::LoadV8;
5869 break;
5870 case NVPTXISD::LoadV8:
5871 // PTX doesn't support the next doubling of outputs
5872 return SDValue();
5873 }
5874
5875 // the non-glue, non-chain outputs in the new load
5876 const unsigned NewNumOutputs = OldNumOutputs * 2;
 // Each new output has the scalar element type of the packed type.
5877 SmallVector<EVT> NewVTs(NewNumOutputs, ElementVT.getVectorElementType());
5878 // add remaining chain and glue values
5879 NewVTs.append(LD->value_begin() + OldNumOutputs, LD->value_end());
5880
5881 // Create the new load
5882 SDValue NewLoad = DCI.DAG.getMemIntrinsicNode(
5883 Opcode, DL, DCI.DAG.getVTList(NewVTs), Operands, LD->getMemoryVT(),
5884 LD->getMemOperand());
5885
5886 // Now we use a combination of BUILD_VECTORs and a MERGE_VALUES node to keep
5887 // the outputs the same. These nodes will be optimized away in later
5888 // DAGCombiner iterations.
 // Old output I is rebuilt from new outputs 2*I and 2*I+1.
5890 for (unsigned I : seq(OldNumOutputs))
5891 Results.push_back(DCI.DAG.getBuildVector(
5892 ElementVT, DL, {NewLoad.getValue(I * 2), NewLoad.getValue(I * 2 + 1)}));
5893 // Add remaining chain and glue nodes
5894 for (unsigned I : seq(NewLoad->getNumValues() - NewNumOutputs))
5895 Results.push_back(NewLoad.getValue(NewNumOutputs + I));
5896
5897 return DCI.DAG.getMergeValues(Results, DL);
5898}
5899
5900/// Fold packing movs into a store.
5901///
5902/// ex:
5903/// v1: v2f16 = BUILD_VECTOR a:f16, b:f16
5904/// v2: v2f16 = BUILD_VECTOR c:f16, d:f16
5905/// StoreV2 v1, v2
5906///
5907/// ...is turned into...
5908///
5909/// StoreV4 a, b, c, d
///
/// \p Front is the number of leading operands of N and \p Back the number of
/// trailing operands of N that are copied to the new node unchanged. The
/// operands in between are the stored values; every one of them must be a
/// single-use BUILD_VECTOR for the fold to happen. Returns the replacement
/// store node, or an empty SDValue when the fold does not apply.
5912 unsigned Front, unsigned Back) {
5913 // We want to run this as late as possible since other optimizations may
5914 // eliminate the BUILD_VECTORs.
5915 if (!DCI.isAfterLegalizeDAG())
5916 return SDValue();
5917
5918 // Get the type of the operands being stored.
5919 EVT ElementVT = N->getOperand(Front).getValueType();
5920
5921 // Avoid non-packed types and v4i8
5922 if (!NVPTX::isPackedVectorTy(ElementVT) || ElementVT == MVT::v4i8)
5923 return SDValue();
5924
5925 auto *ST = cast<MemSDNode>(N);
5926
5927 // The new opcode after we double the number of operands.
5928 unsigned Opcode;
5929 switch (N->getOpcode()) {
5930 case ISD::STORE:
5931 // Any packed type is legal, so the legalizer will not have lowered
5932 // ISD::STORE -> NVPTXISD::Store (unless it's under-aligned). We have to do
5933 // it here.
5934 Opcode = NVPTXISD::StoreV2;
5935 break;
5936 case NVPTXISD::StoreV2:
5937 Opcode = NVPTXISD::StoreV4;
5938 break;
5939 case NVPTXISD::StoreV4:
5940 // V8 is only supported for f32/i32. Don't forget, we're not changing the
5941 // store size here. This is already a 256-bit store.
5942 if (ElementVT != MVT::v2f32 && ElementVT != MVT::v2i32)
5943 return SDValue();
5944 Opcode = NVPTXISD::StoreV8;
5945 break;
5946 case NVPTXISD::StoreV8:
5947 // PTX doesn't support the next doubling of operands
5948 return SDValue();
5949 default:
5950 llvm_unreachable("Unhandled store opcode");
5951 }
5952
5953 // Scan the operands and if they're all BUILD_VECTORs, we'll have gathered
5954 // their elements.
5955 SmallVector<SDValue, 4> Operands(N->ops().take_front(Front));
5956 for (SDValue BV : N->ops().drop_front(Front).drop_back(Back)) {
5957 if (BV.getOpcode() != ISD::BUILD_VECTOR)
5958 return SDValue();
5959
5960 // If the operand has multiple uses, this optimization can increase register
5961 // pressure.
5962 if (!BV.hasOneUse())
5963 return SDValue();
5964
5965 // DAGCombiner visits nodes bottom-up. Check the BUILD_VECTOR operands for
5966 // any signs they may be folded by some other pattern or rule.
5967 for (SDValue Op : BV->ops()) {
5968 // Peek through bitcasts
5969 if (Op.getOpcode() == ISD::BITCAST)
5970 Op = Op.getOperand(0);
5971
5972 // This may be folded into a PRMT.
5973 if (Op.getValueType() == MVT::i16 && Op.getOpcode() == ISD::TRUNCATE &&
5974 Op->getOperand(0).getValueType() == MVT::i32)
5975 return SDValue();
5976
5977 // This may be folded into cvt.bf16x2
5978 if (Op.getOpcode() == ISD::FP_ROUND)
5979 return SDValue();
5980 }
 // Replace the packed value by its two scalar elements.
5981 Operands.append({BV.getOperand(0), BV.getOperand(1)});
5982 }
 // Carry over the trailing Back operands unchanged.
5983 Operands.append(N->op_end() - Back, N->op_end());
5984
5985 // Now we replace the store
5986 return DCI.DAG.getMemIntrinsicNode(Opcode, SDLoc(N), N->getVTList(), Operands,
5987 ST->getMemoryVT(), ST->getMemOperand());
5988}
5989
5991 const NVPTXSubtarget &STI) {
 // Store combine entry point: before legalization, an ISD::STORE of a
 // non-simple value type is lowered via lowerSTOREVector; in every other
 // case the packing-mov fold is attempted.
5992
5993 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::STORE) {
5994 // Here is our chance to custom lower a store with a non-simple type.
5995 // Unfortunately, we can't do this in the legalizer because there is no
5996 // way to setOperationAction for a non-simple type.
5998 if (!ST->getValue().getValueType().isSimple())
5999 return lowerSTOREVector(SDValue(ST, 0), DCI.DAG, STI);
6000 }
6001
 // Front = 1, Back = 2: the first operand and the last two operands of the
 // store are passed through; the operands in between are the stored values.
6002 return combinePackingMovIntoStore(N, DCI, 1, 2);
6003}
6004
6006 const NVPTXSubtarget &STI) {
 // Load combine entry point: before legalization, an ISD::LOAD whose result
 // type is non-simple is lowered via lowerLoadVector; in every other case the
 // unpacking-mov fold is attempted.
6007 if (DCI.isBeforeLegalize() && N->getOpcode() == ISD::LOAD) {
6008 // Here is our chance to custom lower a load with a non-simple type.
6009 // Unfortunately, we can't do this in the legalizer because there is no
6010 // way to setOperationAction for a non-simple type.
6011 if (!N->getValueType(0).isSimple())
6012 return lowerLoadVector(N, DCI.DAG, STI);
6013 }
6014
6015 return combineUnpackingMovIntoLoad(N, DCI);
6016}
6017
6018/// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
6019///
/// Does nothing at CodeGenOptLevel::None and for any type other than scalar
/// i32. Otherwise PerformADDCombineWithOperands is tried with the operands in
/// their original order and, if that yields nothing, with them swapped.
6022 CodeGenOptLevel OptLevel) {
6023 if (OptLevel == CodeGenOptLevel::None)
6024 return SDValue();
6025
6026 SDValue N0 = N->getOperand(0);
6027 SDValue N1 = N->getOperand(1);
6028
6029 // Skip non-integer, non-scalar case
6030 EVT VT = N0.getValueType();
6031 if (VT.isVector() || VT != MVT::i32)
6032 return SDValue();
6033
6034 // First try with the default operand order.
6035 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI))
6036 return Result;
6037
6038 // If that didn't work, try again with the operands commuted.
6039 return PerformADDCombineWithOperands(N, N1, N0, DCI);
6040}
6041
6042/// Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent
6043/// register pairs (non-coalescable).
6044static bool isNonCoalescableBuildVector(const SDValue &BV) {
6045 if (BV.getOpcode() != ISD::BUILD_VECTOR || BV.getValueType() != MVT::v2f32)
6046 return false;
6047
6048 SDValue Elt0 = BV.getOperand(0);
6049 SDValue Elt1 = BV.getOperand(1);
6050
6051 bool IsExt0 = Elt0.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
6052 bool IsExt1 = Elt1.getOpcode() == ISD::EXTRACT_VECTOR_ELT;
6053
6054 // If neither element is an EXTRACT_VECTOR_ELT they are free-standing
6055 // scalars and the register allocator can still place them side-by-side.
6056 if (!IsExt0 && !IsExt1)
6057 return false;
6058
6059 // If exactly one element is an EXTRACT_VECTOR_ELT, the other is a scalar
6060 // that cannot generally occupy the adjacent register slot.
6061 if (IsExt0 != IsExt1)
6062 return true;
6063
6064 // At this point both sources are extracting from vectors. If they are from
6065 // different vectors, then the BUILD_VECTOR is non-coalescable.
6066 SDValue Src0 = Elt0.getOperand(0);
6067 SDValue Src1 = Elt1.getOperand(0);
6068 if (Src0 != Src1)
6069 return true;
6070
6071 auto *Idx0 = dyn_cast<ConstantSDNode>(Elt0.getOperand(1));
6072 auto *Idx1 = dyn_cast<ConstantSDNode>(Elt1.getOperand(1));
6073 // If both indices are dynamic they will be lowered to
6074 // loads and the vector will be spilled to local memory. The register
6075 // allocator can easily place the results in adjacent registers.
6076 if (!Idx0 && !Idx1)
6077 return false;
6078
6079 // If one index is dynamic and the other is constant, the value from the
6080 // constant load will result in an additional register to pair with the result
6081 // from the dynamic load. We consider this non-coalescable.
6082 if ((Idx0 && !Idx1) || (!Idx0 && Idx1))
6083 return true;
6084
6085 // Both are constant, adjacent pairs are coalescable
6086 return std::abs(Idx0->getSExtValue() - Idx1->getSExtValue()) != 1;
6087}
6088
6089/// Return true if FMUL v2f32 node \p N may be scalarized to fold each lane's
6090/// product into a scalar FMA.
6091bool NVPTXTargetLowering::mayFoldFMULIntoFMA(SDNode *N, MachineFunction &MF,
6092 CodeGenOptLevel OptLevel) const {
6093 if (N->getOpcode() != ISD::FMUL || N->getValueType(0) != MVT::v2f32)
6094 return false;
6095 const bool GlobalFMA = allowFMA(MF, OptLevel);
6096 if (!N->getFlags().hasAllowContract() && !GlobalFMA)
6097 return false;
6098
6099 const SDNode *FirstFAdd = nullptr;
6100 unsigned NumScalarFAdd = 0;
6101
6102 // Both lanes must feed unique FADDs
6103 for (SDNode *EE : N->users()) {
6104 if (NumScalarFAdd == 2)
6105 return false;
6106
6107 if (EE->getOpcode() != ISD::EXTRACT_VECTOR_ELT || !EE->hasOneUse() ||
6108 !isa<ConstantSDNode>(EE->getOperand(1)))
6109 return false;
6110
6111 const SDNode *const FAdd = *EE->users().begin();
6112 if (FAdd->getOpcode() != ISD::FADD ||
6113 (!GlobalFMA && !FAdd->getFlags().hasAllowContract()))
6114 return false;
6115
6116 if (!FirstFAdd)
6117 FirstFAdd = FAdd;
6118 else if (FAdd == FirstFAdd)
6119 return false;
6120
6121 NumScalarFAdd++;
6122 }
6123
6124 return NumScalarFAdd == 2;
6125}
6126
6127/// Scalarize a v2f32 arithmetic node (FADD, FMUL, FSUB, FMA) when at least
6128/// one operand is a BUILD_VECTOR that repacks values from non-adjacent register
6129/// pairs. Without this combine the BUILD_VECTOR forces allocation of a
6130/// temporary 64-bit register, increasing register pressure.
6131///
6132/// Example - before:
6133/// t0: v2f32,v2f32,ch = LoadV2 ...
6134/// t1: f32 = extract_vector_elt t0, 0
6135/// t2: f32 = extract_vector_elt t0:1, 0
6136/// t3: v2f32 = BUILD_VECTOR t1, t2 ;; non-coalescable repack
6137/// t4: v2f32 = fma t_a, t3, t_c
6138///
6139/// After:
6140/// t0: v2f32,v2f32,ch = LoadV2 ...
6141/// t1: f32 = extract_vector_elt t0, 0
6142/// t2: f32 = extract_vector_elt t0:1, 0
6143/// a0: f32 = extract_vector_elt t_a, 0
6144/// a1: f32 = extract_vector_elt t_a, 1
6145/// c0: f32 = extract_vector_elt t_c, 0
6146/// c1: f32 = extract_vector_elt t_c, 1
6147/// r0: f32 = fma a0, t1, c0
6148/// r1: f32 = fma a1, t2, c1
6149/// t4: v2f32 = BUILD_VECTOR r0, r1
6150///
6151/// Also scalarizes an FMUL when all output lanes feed into scalar FADDs
6152/// to enable scalar FMA combining.
///
/// Returns the rebuilt v2f32 BUILD_VECTOR of the two scalar results, or an
/// empty SDValue when N is not v2f32 or neither trigger condition holds.
6153SDValue NVPTXTargetLowering::performScalarizeV2F32Op(
6155 CodeGenOptLevel OptLevel) const {
6156 EVT VT = N->getValueType(0);
6157 if (VT != MVT::v2f32)
6158 return SDValue();
6159
 // Proceed only if some operand is a non-coalescable BUILD_VECTOR or N is an
 // FMUL that mayFoldFMULIntoFMA accepts.
6160 if (none_of(N->ops(), isNonCoalescableBuildVector) &&
6161 !mayFoldFMULIntoFMA(N, DCI.DAG.getMachineFunction(), OptLevel))
6162 return SDValue();
6163
6164 SelectionDAG &DAG = DCI.DAG;
6165 SDLoc DL(N);
6166 EVT EltVT = VT.getVectorElementType();
6167 unsigned Opc = N->getOpcode();
6168
6169 // For each operand, get the scalar element at the given index: if the operand
6170 // is a BUILD_VECTOR, grab the element directly; otherwise, emit an
6171 // EXTRACT_VECTOR_ELT.
6172 auto GetElement = [&](SDValue Op, unsigned Index) -> SDValue {
6173 if (Op.getOpcode() == ISD::BUILD_VECTOR)
6174 return Op.getOperand(Index);
6175 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
6176 DAG.getVectorIdxConstant(Index, DL));
6177 };
6178
6179 // Build scalar operand lists for element 0 and element 1.
6180 SmallVector<SDValue, 3> Ops0, Ops1;
6181 for (const SDValue &Op : N->ops()) {
6182 Ops0.push_back(GetElement(Op, 0));
6183 Ops1.push_back(GetElement(Op, 1));
6184 }
6185
 // Emit the same opcode per lane, carrying over N's node flags.
6186 SDValue Res0 = DAG.getNode(Opc, DL, EltVT, Ops0, N->getFlags());
6187 SDValue Res1 = DAG.getNode(Opc, DL, EltVT, Ops1, N->getFlags());
6188
6189 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Res0, Res1);
6190}
6191
6192/// Target-specific dag combine xforms for ISD::FADD.
///
/// Tries the v2f32 scalarization first; otherwise, for scalar f32/f64 adds,
/// tries performFADDCombineWithOperands() with both operand orders.
/// \returns the replacement value, or an empty SDValue if nothing applied.
6193SDValue
6194NVPTXTargetLowering::performFADDCombine(SDNode *N,
// NOTE(review): listing line 6195 is absent here; it presumably declares the
// `DCI` parameter used below -- confirm against the full file.
6196 CodeGenOptLevel OptLevel) const {
6197 if (SDValue Result = performScalarizeV2F32Op(N, DCI, OptLevel))
6198 return Result;
6199
6200 SDValue N0 = N->getOperand(0);
6201 SDValue N1 = N->getOperand(1);
6202
// Everything below only applies to scalar f32 and f64 additions.
6203 EVT VT = N0.getValueType();
6204 if (VT.isVector() || !(VT == MVT::f32 || VT == MVT::f64))
6205 return SDValue();
6206
6207 // First try with the default operand order.
6208 if (SDValue Result = performFADDCombineWithOperands(N, N0, N1, DCI, OptLevel))
6209 return Result;
6210
6211 // If that didn't work, try again with the operands commuted.
6212 return performFADDCombineWithOperands(N, N1, N0, DCI, OptLevel);
6213}
6214
6215/// Get 3-input version of a 2-input min/max opcode
6216static unsigned getMinMax3Opcode(unsigned MinMax2Opcode) {
6217 switch (MinMax2Opcode) {
6218 case ISD::FMAXNUM:
6219 case ISD::FMAXIMUMNUM:
6220 return NVPTXISD::FMAXNUM3;
6221 case ISD::FMINNUM:
6222 case ISD::FMINIMUMNUM:
6223 return NVPTXISD::FMINNUM3;
6224 case ISD::FMAXIMUM:
6225 return NVPTXISD::FMAXIMUM3;
6226 case ISD::FMINIMUM:
6227 return NVPTXISD::FMINIMUM3;
6228 default:
6229 llvm_unreachable("Invalid 2-input min/max opcode");
6230 }
6231}
6232
6233/// PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into
6234/// (fmaxnum3 a, b, c). Also covers other llvm min/max intrinsics.
///
/// The inner node must have the same opcode as the outer one and exactly one
/// use. \returns the 3-input node, or an empty SDValue if nothing matched.
// NOTE(review): listing lines 6235-6236 (the start of the signature, presumably
// declaring `N` and `DCI`) are absent here -- confirm against the full file.
6237 unsigned PTXVersion, unsigned SmVersion) {
6238
6239 // 3-input min/max requires PTX 8.8+ and SM_100+, and only supports f32s
6240 EVT VT = N->getValueType(0);
6241 if (VT != MVT::f32 || PTXVersion < 88 || SmVersion < 100)
6242 return SDValue();
6243
6244 SDValue Op0 = N->getOperand(0);
6245 SDValue Op1 = N->getOperand(1);
6246 unsigned MinMaxOp2 = N->getOpcode();
6247 unsigned MinMaxOp3 = getMinMax3Opcode(MinMaxOp2);
6248
// The new node reuses the outer node's flags in both arms below.
6249 if (Op0.getOpcode() == MinMaxOp2 && Op0.hasOneUse()) {
6250 // (maxnum (maxnum a, b), c) -> (maxnum3 a, b, c)
6251 SDValue A = Op0.getOperand(0);
6252 SDValue B = Op0.getOperand(1);
6253 SDValue C = Op1;
6254 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
6255 } else if (Op1.getOpcode() == MinMaxOp2 && Op1.hasOneUse()) {
6256 // (maxnum a, (maxnum b, c)) -> (maxnum3 a, b, c)
6257 SDValue A = Op0;
6258 SDValue B = Op1.getOperand(0);
6259 SDValue C = Op1.getOperand(1);
6260 return DCI.DAG.getNode(MinMaxOp3, SDLoc(N), VT, A, B, C, N->getFlags());
6261 }
6262 return SDValue();
6263}
6264
// Rewrites an SREM/UREM as Num - (Num / Den) * Den when the numerator already
// has a user that is the matching division with the same two operands.
// Returns an empty SDValue when no such division exists.
// NOTE(review): listing lines 6265-6266 (the start of the signature) are absent
// here; the SREM/UREM dispatch in PerformDAGCombine suggests this is
// PerformREMCombine(N, DCI, OptLevel) -- confirm against the full file.
6267 CodeGenOptLevel OptLevel) {
6268 assert(N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM);
6269
6270 // Don't do anything at less than -O2.
6271 if (OptLevel < CodeGenOptLevel::Default)
6272 return SDValue();
6273
6274 SelectionDAG &DAG = DCI.DAG;
6275 SDLoc DL(N);
6276 EVT VT = N->getValueType(0);
6277 bool IsSigned = N->getOpcode() == ISD::SREM;
// Signed remainder pairs with SDIV, unsigned remainder with UDIV.
6278 unsigned DivOpc = IsSigned ? ISD::SDIV : ISD::UDIV;
6279
6280 const SDValue &Num = N->getOperand(0);
6281 const SDValue &Den = N->getOperand(1);
6282
// Scan the numerator's users for a division with identical operands.
6283 for (const SDNode *U : Num->users()) {
6284 if (U->getOpcode() == DivOpc && U->getOperand(0) == Num &&
6285 U->getOperand(1) == Den) {
6286 // Num % Den -> Num - (Num / Den) * Den
6287 return DAG.getNode(ISD::SUB, DL, VT, Num,
6288 DAG.getNode(ISD::MUL, DL, VT,
6289 DAG.getNode(DivOpc, DL, VT, Num, Den),
6290 Den));
6291 }
6292 }
6293 return SDValue();
6294}
6295
6296// sext (mul.iN nsw x, y) => mul.wide.sN x, y
6297// zext (mul.iN nuw x, y) => mul.wide.uN x, y
6298// sext (shl.iN nsw x, const) => mul.wide.sN x, (1 << const)
6299// zext (shl.iN nuw x, const) => mul.wide.uN x, (1 << const)
//
// Returns the MUL_WIDE node, or an empty SDValue when the pattern, types, or
// wrap flags do not match.
// NOTE(review): listing lines 6300-6301 (the start of the signature) are absent
// here; the SIGN_EXTEND/ZERO_EXTEND dispatch in PerformDAGCombine suggests this
// is combineSZExtToMulWide(N, DCI, OptLevel) -- confirm against the full file.
6302 CodeGenOptLevel OptLevel) {
6303 assert(N->getOpcode() == ISD::SIGN_EXTEND ||
6304 N->getOpcode() == ISD::ZERO_EXTEND);
6305
6306 if (OptLevel == CodeGenOptLevel::None)
6307 return SDValue();
6308
// The extended value must have no other users.
6309 SDValue Op = N->getOperand(0);
6310 if (!Op.hasOneUse())
6311 return SDValue();
6312
// Only the i16 -> i32 and i32 -> i64 widenings are handled.
6313 EVT ToVT = N->getValueType(0);
6314 EVT FromVT = Op.getValueType();
6315 if (!((ToVT == MVT::i32 && FromVT == MVT::i16) ||
6316 (ToVT == MVT::i64 && FromVT == MVT::i32)))
6317 return SDValue();
6318
// The wrap flag on the operand must match the extension kind: sext needs nsw,
// zext needs nuw.
6319 bool IsSigned = N->getOpcode() == ISD::SIGN_EXTEND;
6320 if ((IsSigned && !Op->getFlags().hasNoSignedWrap()) ||
6321 (!IsSigned && !Op->getFlags().hasNoUnsignedWrap()))
6322 return SDValue();
6323
6324 SDLoc DL(N);
6325 SDValue LHS = Op.getOperand(0);
6326 SDValue RHS = Op.getOperand(1);
6327 unsigned MulWideOpcode =
6328 IsSigned ? NVPTXISD::MUL_WIDE_SIGNED : NVPTXISD::MUL_WIDE_UNSIGNED;
6329 if (Op.getOpcode() == ISD::MUL) {
6330 return DCI.DAG.getNode(MulWideOpcode, DL, ToVT, LHS, RHS);
6331 } else if (Op.getOpcode() == ISD::SHL && isa<ConstantSDNode>(RHS)) {
// Turn the constant shift into the equivalent multiplier 1 << ShiftAmt,
// computed at the narrow (FromVT) width.
6332 const auto ShiftAmt = Op.getConstantOperandVal(1);
6333 const auto MulVal = APInt(FromVT.getSizeInBits(), 1) << ShiftAmt;
6334
6335 // Note that the sext (shl nsw ...) case doesn't work if 1 << const
6336 // overflows to a negative value! The only valid input values in this
6337 // case are 0 and -1 (all other values yield poison because of the nsw),
6338 // and mul.wide.sN would give us the wrong sign for -1. We could use
6339 // mul.wide.uN, but since this is a weird case anyway, we might as well not
6340 // apply this transformation at all.
6341 if (IsSigned && MulVal.isNegative())
6342 return SDValue();
6343
6344 RHS = DCI.DAG.getConstant(MulVal, DL, FromVT);
6345 return DCI.DAG.getNode(MulWideOpcode, DL, ToVT, LHS, RHS);
6346 }
6347
6348 return SDValue();
6349}
6350
6356
6357/// IsMulWideOperandDemotable - Checks if the provided DAG node is an operand
6358/// that can be demoted to \p OptSize bits without loss of information. The
6359/// signedness of the operand, if determinable, is placed in \p S.
///
/// Only SIGN_EXTEND, SIGN_EXTEND_INREG and ZERO_EXTEND nodes are recognised.
/// \p S is reset to Unknown on entry and stays Unknown when false is returned.
// NOTE(review): listing line 6360 (the start of the signature, presumably
// declaring `Op`) and lines 6351-6355 (presumably the OperandSignedness
// enumerators Signed/Unsigned/Unknown) are absent here -- confirm.
6361 unsigned OptSize,
6362 OperandSignedness &S) {
6363 S = Unknown;
6364
6365 if (Op.getOpcode() == ISD::SIGN_EXTEND ||
6366 Op.getOpcode() == ISD::SIGN_EXTEND_INREG) {
// NOTE(review): for SIGN_EXTEND_INREG this inspects the type of operand 0, not
// the in-register type carried by operand 1 -- confirm this is intended.
6367 EVT OrigVT = Op.getOperand(0).getValueType();
6368 if (OrigVT.getFixedSizeInBits() <= OptSize) {
6369 S = Signed;
6370 return true;
6371 }
6372 } else if (Op.getOpcode() == ISD::ZERO_EXTEND) {
6373 EVT OrigVT = Op.getOperand(0).getValueType();
6374 if (OrigVT.getFixedSizeInBits() <= OptSize) {
6375 S = Unsigned;
6376 return true;
6377 }
6378 }
6379
6380 return false;
6381}
6382
6383/// AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can
6384/// be demoted to \p OptSize bits without loss of information. If the operands
6385/// contain a constant, it should appear as the RHS operand. The signedness of
6386/// the operands is placed in \p IsSigned.
///
/// \p IsSigned is written only once the LHS has been accepted; callers should
/// not read it when false is returned before that point.
// NOTE(review): listing line 6387 (the start of the signature, presumably
// declaring `LHS` and `RHS`) is absent here -- confirm against the full file.
6388 unsigned OptSize,
6389 bool &IsSigned) {
6390 OperandSignedness LHSSign;
6391
6392 // The LHS operand must be a demotable op
6393 if (!IsMulWideOperandDemotable(LHS, OptSize, LHSSign))
6394 return false;
6395
6396 // We should have been able to determine the signedness from the LHS
6397 if (LHSSign == Unknown)
6398 return false;
6399
6400 IsSigned = (LHSSign == Signed);
6401
6402 // The RHS can be a demotable op or a constant
// NOTE(review): listing line 6403 is absent here; it presumably opens an `if`
// that binds `CI` to RHS as a ConstantSDNode -- confirm against the full file.
6404 const APInt &Val = CI->getAPIntValue();
// A constant must fit in OptSize bits under the LHS's signedness.
6405 if (LHSSign == Unsigned) {
6406 return Val.isIntN(OptSize);
6407 } else {
6408 return Val.isSignedIntN(OptSize);
6409 }
6410 } else {
6411 OperandSignedness RHSSign;
6412 if (!IsMulWideOperandDemotable(RHS, OptSize, RHSSign))
6413 return false;
6414
// Both operands must have been extended the same way.
6415 return LHSSign == RHSSign;
6416 }
6417}
6418
6419/// TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply
6420/// of M/2 bits that produces an M-bit result (i.e. mul.wide). This transform
6421/// works on both multiply DAG nodes and SHL DAG nodes with a constant shift
6422/// amount.
///
/// Only i32 and i64 results are handled. \returns the MUL_WIDE node, or an
/// empty SDValue if the operands cannot be demoted.
// NOTE(review): listing lines 6423-6424 (the signature, presumably declaring
// `N` and `DCI`) are absent here -- confirm against the full file.
6425 EVT MulType = N->getValueType(0);
6426 if (MulType != MVT::i32 && MulType != MVT::i64) {
6427 return SDValue();
6428 }
6429
6430 SDLoc DL(N);
// The narrow operand width is half of the result width.
6431 unsigned OptSize = MulType.getSizeInBits() >> 1;
6432 SDValue LHS = N->getOperand(0);
6433 SDValue RHS = N->getOperand(1);
6434
6435 // Canonicalize the multiply so the constant (if any) is on the right
6436 if (N->getOpcode() == ISD::MUL) {
6437 if (isa<ConstantSDNode>(LHS)) {
6438 std::swap(LHS, RHS);
6439 }
6440 }
6441
6442 // If we have a SHL, determine the actual multiply amount
6443 if (N->getOpcode() == ISD::SHL) {
// NOTE(review): listing line 6444 is absent here; it presumably binds `ShlRHS`
// to RHS as a ConstantSDNode (null if RHS is not constant) -- confirm.
6445 if (!ShlRHS) {
6446 return SDValue();
6447 }
6448
// Only in-range shift amounts can be expressed as the multiplier 1 << amt.
6449 APInt ShiftAmt = ShlRHS->getAPIntValue();
6450 unsigned BitWidth = MulType.getSizeInBits();
6451 if (ShiftAmt.sge(0) && ShiftAmt.slt(BitWidth)) {
6452 APInt MulVal = APInt(BitWidth, 1) << ShiftAmt;
6453 RHS = DCI.DAG.getConstant(MulVal, DL, MulType);
6454 } else {
6455 return SDValue();
6456 }
6457 }
6458
6459 bool Signed;
6460 // Verify that our operands are demotable
6461 if (!AreMulWideOperandsDemotable(LHS, RHS, OptSize, Signed)) {
6462 return SDValue();
6463 }
6464
6465 EVT DemotedVT;
6466 if (MulType == MVT::i32) {
6467 DemotedVT = MVT::i16;
6468 } else {
6469 DemotedVT = MVT::i32;
6470 }
6471
6472 // Truncate the operands to the correct size. Note that these are just for
6473 // type consistency and will (likely) be eliminated in later phases.
6474 SDValue TruncLHS =
6475 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, LHS);
6476 SDValue TruncRHS =
6477 DCI.DAG.getNode(ISD::TRUNCATE, DL, DemotedVT, RHS);
6478
// Pick the signed or unsigned wide multiply to match the operands.
6479 unsigned Opc;
6480 if (Signed) {
6481 Opc = NVPTXISD::MUL_WIDE_SIGNED;
6482 } else {
6483 Opc = NVPTXISD::MUL_WIDE_UNSIGNED;
6484 }
6485
6486 return DCI.DAG.getNode(Opc, DL, MulType, TruncLHS, TruncRHS);
6487}
6488
6489static bool isConstOne(const SDValue &Operand) {
6490 const auto *Const = dyn_cast<ConstantSDNode>(Operand);
6491 return Const && Const->getZExtValue() == 1;
6492}
6493
// Matches (add y, 1) or (add 1, y) and returns y; returns an empty SDValue if
// `Add` is not an ISD::ADD or neither operand is the constant one.
// NOTE(review): listing line 6494 (the signature, presumably declaring `Add`)
// is absent here, so the function's name is not visible -- confirm.
6495 if (Add->getOpcode() != ISD::ADD)
6496 return SDValue();
6497
6498 if (isConstOne(Add->getOperand(0)))
6499 return Add->getOperand(1);
6500
6501 if (isConstOne(Add->getOperand(1)))
6502 return Add->getOperand(0);
6503
6504 return SDValue();
6505}
6506
// NOTE(review): listing lines 6507-6508 (the signature) and 6510 (the `if` that
// binds `Y`) are absent here. The call sites in PerformMULCombineWithOperands
// suggest this is combineMADConstOne(X, Add, VT, DL, DCI) implementing
// (mul x, (add y, 1)) -> (add (mul x, y), x) -- confirm against the full file.
6509
// Build (add (mul X, Y), X).
6511 SDValue Mul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
6512 return DCI.DAG.getNode(ISD::ADD, DL, VT, Mul, X);
6513 }
6514
// No match: tell the caller nothing was combined.
6515 return SDValue();
6516}
6517
// Pushes a multiply by X into a SELECT that has the constant one on one arm:
// the arm that was 1 becomes X and the other arm Y becomes (mul X, Y).
// Returns an empty SDValue if `Select` is not such a SELECT.
// NOTE(review): listing lines 6518 and 6520 (parts of the signature) are absent
// here; the call sites in PerformMULCombineWithOperands suggest this is
// combineMulSelectConstOne(X, Select, VT, DL, DCI) -- confirm.
6519 SDLoc DL,
6521 if (Select->getOpcode() != ISD::SELECT)
6522 return SDValue();
6523
6524 SDValue Cond = Select->getOperand(0);
6525
// Find which arm (operand 1 or operand 2) holds the constant one.
6526 unsigned ConstOpNo;
6527 if (isConstOne(Select->getOperand(1)))
6528 ConstOpNo = 1;
6529 else if (isConstOne(Select->getOperand(2)))
6530 ConstOpNo = 2;
6531 else
6532 return SDValue();
6533
// Y is the arm that is not the constant one.
6534 SDValue Y = Select->getOperand((ConstOpNo == 1) ? 2 : 1);
6535
6536 // Do not combine if the resulting sequence is not obviously profitable.
// NOTE(review): listing line 6537 (the profitability condition guarding the
// return below) is absent here -- confirm against the full file.
6538 return SDValue();
6539
6540 SDValue NewMul = DCI.DAG.getNode(ISD::MUL, DL, VT, X, Y);
6541
// Rebuild the select with the arms kept in their original positions.
6542 return DCI.DAG.getNode(ISD::SELECT, DL, VT, Cond,
6543 (ConstOpNo == 1) ? X : NewMul,
6544 (ConstOpNo == 1) ? NewMul : X);
6545}
6546
// Tries the add-with-one and select-with-one multiply rewrites on a scalar
// i16/i32/i64 multiply, each in both operand orders. Returns the first
// replacement found, or an empty SDValue.
6547static SDValue
// NOTE(review): listing lines 6548-6549 (function name and parameters) are
// absent here; the call in PerformMULCombine suggests this is
// PerformMULCombineWithOperands(N, N0, N1, DCI) -- confirm.
6550
6551 EVT VT = N0.getValueType();
6552 if (VT.isVector())
6553 return SDValue();
6554
6555 if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
6556 return SDValue();
6557
6558 SDLoc DL(N);
6559
6560 // (mul x, (add y, 1)) -> (add (mul x, y), x)
6561 if (SDValue Res = combineMADConstOne(N0, N1, VT, DL, DCI))
6562 return Res;
6563 if (SDValue Res = combineMADConstOne(N1, N0, VT, DL, DCI))
6564 return Res;
6565
6566 // (mul x, (select y, 1)) -> (select (mul x, y), x)
6567 if (SDValue Res = combineMulSelectConstOne(N0, N1, VT, DL, DCI))
6568 return Res;
6569 if (SDValue Res = combineMulSelectConstOne(N1, N0, VT, DL, DCI))
6570 return Res;
6571
6572 return SDValue();
6573}
6574
6575/// PerformMULCombine - Runs PTX-specific DAG combine patterns on MUL nodes.
///
/// Does nothing at CodeGenOptLevel::None. Otherwise tries the mul.wide
/// rewrite first and falls back to PerformMULCombineWithOperands().
// NOTE(review): listing lines 6576-6577 (the start of the signature, presumably
// declaring `N` and `DCI`) are absent here -- confirm against the full file.
6578 CodeGenOptLevel OptLevel) {
6579 if (OptLevel == CodeGenOptLevel::None)
6580 return SDValue();
6581
6582 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6583 return Ret;
6584
6585 SDValue N0 = N->getOperand(0);
6586 SDValue N1 = N->getOperand(1);
6587 return PerformMULCombineWithOperands(N, N0, N1, DCI);
6588}
6589
6590/// PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
///
/// The only pattern tried is the mul.wide rewrite, and only when optimizing.
// NOTE(review): listing lines 6591-6592 (the start of the signature, presumably
// declaring `N` and `DCI`) are absent here -- confirm against the full file.
6593 CodeGenOptLevel OptLevel) {
6594 if (OptLevel > CodeGenOptLevel::None) {
6595 // Try mul.wide combining at OptLevel > 0
6596 if (SDValue Ret = TryMULWIDECombine(N, DCI))
6597 return Ret;
6598 }
6599
6600 return SDValue();
6601}
6602
// Rewrites a v2i1 SETCC on v2f16/v2bf16 operands into a single two-result
// packed compare node, then rebuilds the v2i1 value from the two i1 results.
// Returns an empty SDValue for any other type combination.
// NOTE(review): listing lines 6603-6604 (the start of the signature) are absent
// here; the SETCC dispatch in PerformDAGCombine suggests this is
// PerformSETCCCombine(N, DCI, SmVersion) -- confirm against the full file.
6605 unsigned int SmVersion) {
6606 EVT CCType = N->getValueType(0);
6607 SDValue A = N->getOperand(0);
6608 SDValue B = N->getOperand(1);
6609
6610 EVT AType = A.getValueType();
6611 if (!(CCType == MVT::v2i1 && (AType == MVT::v2f16 || AType == MVT::v2bf16)))
6612 return SDValue();
6613
// The bf16 form is skipped when SmVersion is below 90.
6614 if (A.getValueType() == MVT::v2bf16 && SmVersion < 90)
6615 return SDValue();
6616
6617 SDLoc DL(N);
6618 // setp.f16x2 returns two scalar predicates, which we need to
6619 // convert back to v2i1. The returned result will be scalarized by
6620 // the legalizer, but the comparison will remain a single vector
6621 // instruction.
6622 SDValue CCNode = DCI.DAG.getNode(
6623 A.getValueType() == MVT::v2f16 ? NVPTXISD::SETP_F16X2
// NOTE(review): listing line 6624 (the opcode used for the v2bf16 case) is
// absent here -- confirm against the full file.
6625 DL, DCI.DAG.getVTList(MVT::i1, MVT::i1), {A, B, N->getOperand(2)});
6626 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
6627 CCNode.getValue(1));
6628}
6629
// Rewrites an element extract at a non-zero constant index from a 16/32/64-bit
// vector as: bitcast to integer, shift right by Index * EltBits, truncate.
// Returns an empty SDValue for the cases excluded below.
// NOTE(review): listing lines 6630-6631 (the signature) are absent here; the
// PerformEXTRACTCombine(N, DCI) call in PerformDAGCombine suggests the name and
// parameters -- confirm against the full file.
6632 SDValue Vector = peekThroughFreeze(N->getOperand(0));
6633 SDLoc DL(N);
6634 EVT VectorVT = Vector.getValueType();
6635 if (Vector->getOpcode() == ISD::LOAD && VectorVT.isSimple() &&
6636 IsPTXVectorType(VectorVT.getSimpleVT()))
6637 return SDValue(); // Native vector loads already combine nicely w/
6638 // extract_vector_elt.
6639 // Don't mess with singletons or packed types (v2*32, v2*16, v4i8 and v8i8),
6640 // we already handle them OK.
6641 if (VectorVT.getVectorNumElements() == 1 ||
6642 NVPTX::isPackedVectorTy(VectorVT) || VectorVT == MVT::v8i8)
6643 return SDValue();
6644
6645 // Don't mess with undef values as sra may be simplified to 0, not undef.
6646 if (Vector->isUndef() || ISD::allOperandsUndef(Vector.getNode()))
6647 return SDValue();
6648
6649 uint64_t VectorBits = VectorVT.getSizeInBits();
6650 // We only handle the types we can extract in-register.
6651 if (!(VectorBits == 16 || VectorBits == 32 || VectorBits == 64))
6652 return SDValue();
6653
6654 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(N->getOperand(1));
6655 // Index == 0 is handled by generic DAG combiner.
6656 if (!Index || Index->getZExtValue() == 0)
6657 return SDValue();
6658
// IVT is an integer as wide as the whole vector; EltIVT is the integer type
// with the same width as one element.
6659 MVT IVT = MVT::getIntegerVT(VectorBits);
6660 EVT EltVT = VectorVT.getVectorElementType();
6661 EVT EltIVT = EltVT.changeTypeToInteger();
6662 uint64_t EltBits = EltVT.getScalarSizeInBits();
6663
// Shift the requested element down to bit 0, then truncate to element width.
6664 SDValue Result = DCI.DAG.getNode(
6665 ISD::TRUNCATE, DL, EltIVT,
6666 DCI.DAG.getNode(
6667 ISD::SRA, DL, IVT, DCI.DAG.getNode(ISD::BITCAST, DL, IVT, Vector),
6668 DCI.DAG.getConstant(Index->getZExtValue() * EltBits, DL, IVT)));
6669
6670 // If element has non-integer type, bitcast it back to the expected type.
6671 if (EltVT != EltIVT)
6672 Result = DCI.DAG.getNode(ISD::BITCAST, DL, EltVT, Result);
6673 // Past legalizer, we may need to extend i8 -> i16 to match the register type.
6674 if (EltVT != N->getValueType(0))
6675 Result = DCI.DAG.getNode(ISD::ANY_EXTEND, DL, N->getValueType(0), Result);
6676
6677 return Result;
6678}
6679
6680/// Transform patterns like:
6681/// (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt))
6682/// (select (ult shift_amt, BitWidth), (srl/shl x, shift_amt), 0)
6683/// Into:
6684/// (NVPTXISD::SRL_CLAMP x, shift_amt) or (NVPTXISD::SHL_CLAMP x, shift_amt)
6685///
6686/// These patterns arise from code like `s >= 32 ? 0 : x >> s`. In LLVM,
6687/// over-shifting a value results in poison, but PTX shr/shl instructions clamp
6688/// the shift amount to BitWidth, making the guard redundant.
6689///
6690/// Note: We only handle SRL and SHL, not SRA, because arithmetic right shifts
6691/// can produce 0 or -1 when shift >= BitWidth.
6692/// Note: We don't handle uge or ule. These don't appear because of
6693/// canonicalization.
///
/// Only runs after DAG legalization. \returns the clamp-shift node, or an
/// empty SDValue if the pattern does not match.
// NOTE(review): listing lines 6694-6695 (the signature) are absent here; the
// PerformSELECTShiftCombine(N, DCI) call in PerformDAGCombine suggests the name
// and parameters -- confirm against the full file.
6696 if (!DCI.isAfterLegalizeDAG())
6697 return SDValue();
6698
6699 using namespace SDPatternMatch;
6700 unsigned BitWidth = N->getValueType(0).getSizeInBits();
6701 SDValue ShiftAmt, ShiftOp;
6702
6703 // Match logical shifts where the shift amount in the guard matches the shift
6704 // amount in the operation.
6705 auto LogicalShift =
6706 m_AllOf(m_Value(ShiftOp),
6707 m_AnyOf(m_Srl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt))),
6708 m_Shl(m_Value(), m_TruncOrSelf(m_Deferred(ShiftAmt)))));
6709
6710 // shift_amt > BitWidth-1 ? 0 : shift_op
6711 bool MatchedUGT =
6712 sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),
// NOTE(review): listing line 6713 (the constant the shift amount is compared
// against, presumably BitWidth - 1) is absent here -- confirm.
6714 m_SpecificCondCode(ISD::SETUGT)),
6715 m_Zero(), LogicalShift));
6716 // shift_amt < BitWidth ? shift_op : 0
6717 bool MatchedULT =
6718 !MatchedUGT &&
6719 sd_match(N, m_Select(m_SetCC(m_Value(ShiftAmt),
// NOTE(review): listing line 6720 (the constant the shift amount is compared
// against, presumably BitWidth) is absent here -- confirm.
6721 m_SpecificCondCode(ISD::SETULT)),
6722 LogicalShift, m_Zero()));
6723
6724 if (!MatchedUGT && !MatchedULT)
6725 return SDValue();
6726
6727 // In LLVM IR, the shift amount and the value-to-be-shifted are the same
6728 // type, whereas in PTX the shift amount is always i32. Therefore when
6729 // shifting types larger than i32, we can only do this transformation if we
6730 // know that the upper bits of the shift amount are known zero.
6731 SDValue ClampAmt = ShiftOp.getOperand(1);
6732 unsigned ClampAmtBits = ClampAmt.getValueSizeInBits();
6733 if (ShiftAmt.getValueSizeInBits() > ClampAmtBits &&
6734 DCI.DAG.computeKnownBits(ShiftAmt).countMaxActiveBits() > ClampAmtBits)
6735 return SDValue();
6736
6737 // Return a clamp shift operation, which has the same semantics as PTX shift.
6738 unsigned ClampOpc = ShiftOp.getOpcode() == ISD::SRL ? NVPTXISD::SRL_CLAMP
6739 : NVPTXISD::SHL_CLAMP;
6740 return DCI.DAG.getNode(ClampOpc, SDLoc(N), ShiftOp.getValueType(),
6741 ShiftOp.getOperand(0), ClampAmt);
6742}
6743
// Splits a v4i8 VSELECT into four scalar i32 SELECTs and rebuilds the v4i8
// result. Returns an empty SDValue for any other vector type.
// NOTE(review): listing lines 6744-6745 (the signature) are absent here; the
// PerformVSELECTCombine(N, DCI) call in PerformDAGCombine suggests the name and
// parameters -- confirm against the full file.
6746 SDValue VA = N->getOperand(1);
6747 EVT VectorVT = VA.getValueType();
6748 if (VectorVT != MVT::v4i8)
6749 return SDValue();
6750
6751 // We need to split vselect into individual per-element operations Because we
6752 // use BFE/BFI instruction for byte extraction/insertion, we do end up with
6753 // 32-bit values, so we may as well do comparison as i32 to avoid conversions
6754 // to/from i16 normally used for i8 values.
// NOTE(review): listing line 6755 (presumably the declaration of the result
// element vector `E`) is absent here -- confirm against the full file.
6756 SDLoc DL(N);
6757 SDValue VCond = N->getOperand(0);
6758 SDValue VB = N->getOperand(2);
6759 for (int I = 0; I < 4; ++I) {
// Per lane: extract the i1 condition and both i8 inputs, widen the inputs to
// i32, select, then narrow the chosen value back to i8.
6760 SDValue C = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i1, VCond,
6761 DCI.DAG.getConstant(I, DL, MVT::i32));
6762 SDValue EA = DCI.DAG.getAnyExtOrTrunc(
6763 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VA,
6764 DCI.DAG.getConstant(I, DL, MVT::i32)),
6765 DL, MVT::i32);
6766 SDValue EB = DCI.DAG.getAnyExtOrTrunc(
6767 DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i8, VB,
6768 DCI.DAG.getConstant(I, DL, MVT::i32)),
6769 DL, MVT::i32);
6770 E.push_back(DCI.DAG.getAnyExtOrTrunc(
6771 DCI.DAG.getNode(ISD::SELECT, DL, MVT::i32, C, EA, EB), DL, MVT::i8));
6772 }
6773 return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
6774}
6775
// Replaces a two-element, 32-bit packed BUILD_VECTOR whose operands are i16
// truncations of i32 values (optionally shifted right by 16) with a single
// PRMT that selects the wanted bytes from the two i32 sources.
// Only runs after DAG legalization; returns an empty SDValue otherwise.
6776static SDValue
// NOTE(review): listing line 6777 (function name and parameters) is absent
// here; the PerformBUILD_VECTORCombine(N, DCI) call in PerformDAGCombine
// suggests the name and parameters -- confirm against the full file.
6778 auto VT = N->getValueType(0);
6779 if (!DCI.isAfterLegalizeDAG() ||
6780 // only process v2*16 types
6781 !(NVPTX::isPackedVectorTy(VT) && VT.is32BitVector() &&
6782 VT.getVectorNumElements() == 2))
6783 return SDValue();
6784
6785 auto Op0 = N->getOperand(0);
6786 auto Op1 = N->getOperand(1);
6787
6788 // Start out by assuming we want to take the lower 2 bytes of each i32
6789 // operand.
6790 uint64_t Op0Bytes = 0x10;
6791 uint64_t Op1Bytes = 0x54;
6792
// Pair each operand with its selector so the loop can update both in place.
6793 std::pair<SDValue *, uint64_t *> OpData[2] = {{&Op0, &Op0Bytes},
6794 {&Op1, &Op1Bytes}};
6795
6796 // Check that each operand is an i16, truncated from an i32 operand. We'll
6797 // select individual bytes from those original operands. Optionally, fold in a
6798 // shift right of that original operand.
6799 for (auto &[Op, OpBytes] : OpData) {
6800 // Eat up any bitcast
6801 if (Op->getOpcode() == ISD::BITCAST)
6802 *Op = Op->getOperand(0);
6803
6804 if (!(Op->getValueType() == MVT::i16 && Op->getOpcode() == ISD::TRUNCATE &&
6805 Op->getOperand(0).getValueType() == MVT::i32))
6806 return SDValue();
6807
6808 // If the truncate has multiple uses, this optimization can increase
6809 // register pressure
6810 if (!Op->hasOneUse())
6811 return SDValue();
6812
// Step past the truncate to the original i32 value.
6813 *Op = Op->getOperand(0);
6814
6815 // Optionally, fold in a shift-right of the original operand and let permute
6816 // pick the two higher bytes of the original value directly.
6817 if (Op->getOpcode() == ISD::SRL && isa<ConstantSDNode>(Op->getOperand(1))) {
6818 if (cast<ConstantSDNode>(Op->getOperand(1))->getZExtValue() == 16) {
6819 // Shift the PRMT byte selector to pick upper bytes from each respective
6820 // value, instead of the lower ones: 0x10 -> 0x32, 0x54 -> 0x76
6821 assert((*OpBytes == 0x10 || *OpBytes == 0x54) &&
6822 "PRMT selector values out of range");
6823 *OpBytes += 0x22;
6824 *Op = Op->getOperand(0);
6825 }
6826 }
6827 }
6828
6829 SDLoc DL(N);
6830 auto &DAG = DCI.DAG;
6831
// Op1's selector occupies the upper byte of the combined selector, Op0's the
// lower byte.
6832 auto PRMT =
6833 getPRMT(DAG.getBitcast(MVT::i32, Op0), DAG.getBitcast(MVT::i32, Op1),
6834 (Op1Bytes << 8) | Op0Bytes, DL, DAG);
6835 return DAG.getBitcast(VT, PRMT);
6836}
6837
// Folds an address-space cast of an address-space cast back to the original
// value when the outer destination equals the inner source address space.
// Returns an empty SDValue otherwise.
// NOTE(review): listing lines 6838-6839 (the signature) are absent here; the
// combineADDRSPACECAST(N, DCI) call in PerformDAGCombine suggests the name and
// parameters -- confirm against the full file.
6840 auto *ASCN1 = cast<AddrSpaceCastSDNode>(N);
6841
6842 if (auto *ASCN2 = dyn_cast<AddrSpaceCastSDNode>(ASCN1->getOperand(0))) {
// The inner cast's destination must be the outer cast's source.
6843 assert(ASCN2->getDestAddressSpace() == ASCN1->getSrcAddressSpace());
6844
6845 // Fold asc[B -> A](asc[A -> B](x)) -> x
6846 if (ASCN1->getDestAddressSpace() == ASCN2->getSrcAddressSpace())
6847 return ASCN2->getOperand(0);
6848 }
6849
6850 return SDValue();
6851}
6852
6853// Given a constant selector value and a prmt mode, return the selector value
6854// normalized to the generic prmt mode. See the PTX ISA documentation for more
6855// details:
6856// https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-prmt
//
// For the non-generic modes only the low two bits of Selector are used; the
// result packs four 4-bit byte selectors, lowest result byte first.
6857static APInt getPRMTSelector(const APInt &Selector, unsigned Mode) {
6858 assert(Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6859
// NOTE(review): listing line 6860 (the condition guarding this early return,
// presumably a test for the generic mode) is absent here -- confirm.
6861 return Selector;
6862
6863 const unsigned V = Selector.trunc(2).getZExtValue();
6864
// Pack four nibbles: S0 selects result byte 0, ..., S3 selects result byte 3.
6865 const auto GetSelector = [](unsigned S0, unsigned S1, unsigned S2,
6866 unsigned S3) {
6867 return APInt(32, S0 | (S1 << 4) | (S2 << 8) | (S3 << 12));
6868 };
6869
// NOTE(review): the `case` labels (listing lines 6871, 6873, 6875, 6877, 6879
// and 6881) are absent here, so which mode each return belongs to cannot be
// read from this listing -- confirm against the full file and the PTX ISA.
6870 switch (Mode) {
6872 return GetSelector(V, V + 1, V + 2, V + 3);
6874 return GetSelector(V, (V - 1) & 7, (V - 2) & 7, (V - 3) & 7);
6876 return GetSelector(V, V, V, V);
6878 return GetSelector(V, std::max(V, 1U), std::max(V, 2U), 3U);
6880 return GetSelector(0, std::min(V, 1U), std::min(V, 2U), V);
6882 unsigned V1 = (V & 1) << 1;
6883 return GetSelector(V1, V1 + 1, V1, V1 + 1);
6884 }
6885 default:
6886 llvm_unreachable("Invalid PRMT mode");
6887 }
6888}
6889
6890static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode) {
6891 assert(A.getBitWidth() == 32 && B.getBitWidth() == 32 &&
6892 Selector.getBitWidth() == 32 && "PRMT must have i32 operands");
6893 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
6894 APInt BitField = B.concat(A);
6895 APInt SelectorVal = getPRMTSelector(Selector, Mode);
6896 APInt Result(32, 0);
6897 for (unsigned I : llvm::seq(4U)) {
6898 APInt Sel = SelectorVal.extractBits(4, I * 4);
6899 unsigned Idx = Sel.getLoBits(3).getZExtValue();
6900 unsigned Sign = Sel.getHiBits(1).getZExtValue();
6901 APInt Byte = BitField.extractBits(8, Idx * 8);
6902 if (Sign)
6903 Byte = Byte.ashr(8);
6904 Result.insertBits(Byte, I * 8);
6905 }
6906 return Result;
6907}
6908
// Constant-folds a PRMT node whose operands 0, 1 and 2 are all constants;
// operand 3 is read as the mode. Does nothing at CodeGenOptLevel::None and
// returns an empty SDValue when any of the three operands is not constant.
// NOTE(review): listing line 6909 (the start of the signature) is absent here;
// the combinePRMT(N, DCI, OptLevel) call in PerformDAGCombine suggests the name
// and parameters -- confirm against the full file.
6910 CodeGenOptLevel OptLevel) {
6911 if (OptLevel == CodeGenOptLevel::None)
6912 return SDValue();
6913
6914 // Constant fold PRMT
6915 if (isa<ConstantSDNode>(N->getOperand(0)) &&
6916 isa<ConstantSDNode>(N->getOperand(1)) &&
6917 isa<ConstantSDNode>(N->getOperand(2)))
6918 return DCI.DAG.getConstant(computePRMT(N->getConstantOperandAPInt(0),
6919 N->getConstantOperandAPInt(1),
6920 N->getConstantOperandAPInt(2),
6921 N->getConstantOperandVal(3)),
6922 SDLoc(N), N->getValueType(0));
6923 return SDValue();
6924}
6925
6926// During call lowering we wrap the return values in a ProxyReg node which
6927// depend on the chain value produced by the completed call. This ensures that
6928// the full call is emitted in cases where libcalls are used to legalize
6929// operations. To improve the functioning of other DAG combines we pull all
6930// operations we can through one of these nodes, ensuring that the ProxyReg
6931// directly wraps a load. That is:
6932//
6933// (ProxyReg (zext (load retval0))) => (zext (ProxyReg (load retval0)))
6934//
// Recurses through the operand tree rooted at R. Returns the rebuilt value with
// ProxyReg nodes pushed down onto the loads, or an empty SDValue as soon as an
// unsupported node is met.
// NOTE(review): listing lines 6935-6936 (the signature) are absent here; the
// recursive calls suggest sinkProxyReg(R, Chain, DCI) -- confirm.
6937 switch (R.getOpcode()) {
// Unary conversions: sink through the single operand and re-apply the op.
6938 case ISD::TRUNCATE:
6939 case ISD::ANY_EXTEND:
6940 case ISD::SIGN_EXTEND:
6941 case ISD::ZERO_EXTEND:
6942 case ISD::BITCAST: {
6943 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6944 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), V);
6945 return SDValue();
6946 }
// Binary ops: both operands must be sinkable.
6947 case ISD::SHL:
6948 case ISD::SRL:
6949 case ISD::SRA:
6950 case ISD::OR: {
6951 if (SDValue A = sinkProxyReg(R.getOperand(0), Chain, DCI))
6952 if (SDValue B = sinkProxyReg(R.getOperand(1), Chain, DCI))
6953 return DCI.DAG.getNode(R.getOpcode(), SDLoc(R), R.getValueType(), A, B);
6954 return SDValue();
6955 }
// Constants are returned unchanged, without a ProxyReg wrapper.
6956 case ISD::Constant:
6957 return R;
// Loads are the leaves: wrap them directly in a ProxyReg on Chain.
6958 case ISD::LOAD:
6959 case NVPTXISD::LoadV2:
6960 case NVPTXISD::LoadV4: {
6961 return DCI.DAG.getNode(NVPTXISD::ProxyReg, SDLoc(R), R.getValueType(),
6962 {Chain, R});
6963 }
6964 case ISD::BUILD_VECTOR: {
6965 if (DCI.isBeforeLegalize())
6966 return SDValue();
6967
// NOTE(review): listing line 6968 (presumably the declaration of the `Ops`
// vector) is absent here -- confirm against the full file.
6969 for (auto &Op : R->ops()) {
6970 SDValue V = sinkProxyReg(Op, Chain, DCI);
6971 if (!V)
6972 return SDValue();
6973 Ops.push_back(V);
6974 }
6975 return DCI.DAG.getNode(ISD::BUILD_VECTOR, SDLoc(R), R.getValueType(), Ops);
6976 }
// NOTE(review): listing line 6977 (the `case` label opening this arm) and line
// 6982 (the start of the return statement below) are absent here. The retained
// operands -- the sunk operand 0 plus the original operand 1 -- are consistent
// with an element extract, but confirm against the full file.
6978 if (DCI.isBeforeLegalize())
6979 return SDValue();
6980
6981 if (SDValue V = sinkProxyReg(R.getOperand(0), Chain, DCI))
6983 R.getValueType(), V, R.getOperand(1));
6984 return SDValue();
6985 }
6986 default:
6987 return SDValue();
6988 }
6989}
6990
6991static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID) {
6992 switch (AddIntrinsicID) {
6993 default:
6994 break;
6995 case Intrinsic::nvvm_add_rn_sat_f16:
6996 case Intrinsic::nvvm_add_rn_sat_v2f16:
6997 return NVPTXISD::SUB_RN_SAT;
6998 case Intrinsic::nvvm_add_rn_ftz_sat_f16:
6999 case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
7000 return NVPTXISD::SUB_RN_FTZ_SAT;
7001 }
7002 llvm_unreachable("Invalid F16 add intrinsic");
7003}
7004
// Rewrites a saturating f16 add intrinsic with one negated input as the
// matching subtract node: add(-a, b) -> sub(b, a), add(a, -b) -> sub(a, b).
// Returns an empty SDValue when neither input is an FNEG.
// NOTE(review): listing line 7005 (the start of the signature) is absent here;
// the call in combineIntrinsicWOChain suggests
// combineF16AddWithNeg(N, DAG, AddIntrinsicID) -- confirm.
7006 Intrinsic::ID AddIntrinsicID) {
// Operand 0 is skipped; the caller reads it as the intrinsic ID.
7007 SDValue Op1 = N->getOperand(1);
7008 SDValue Op2 = N->getOperand(2);
7009
7010 SDValue SubOp1, SubOp2;
7011
// When both inputs are FNEGs, only the first one is folded.
7012 if (Op1.getOpcode() == ISD::FNEG) {
7013 SubOp1 = Op2;
7014 SubOp2 = Op1.getOperand(0);
7015 } else if (Op2.getOpcode() == ISD::FNEG) {
7016 SubOp1 = Op1;
7017 SubOp2 = Op2.getOperand(0);
7018 } else {
7019 return SDValue();
7020 }
7021
7022 SDLoc DL(N);
7023 return DAG.getNode(getF16SubOpc(AddIntrinsicID), DL, N->getValueType(0),
7024 SubOp1, SubOp2);
7025}
7026
// Dispatches combines for chain-less intrinsic nodes, keyed on the intrinsic
// ID in operand 0. Only the four saturating f16/v2f16 add intrinsics are
// handled; every other ID returns an empty SDValue.
// NOTE(review): listing lines 7027-7028 (the start of the signature) are absent
// here; the call in PerformDAGCombine suggests
// combineIntrinsicWOChain(N, DCI, STI) -- confirm against the full file.
7029 const NVPTXSubtarget &STI) {
7030 unsigned IID = N->getConstantOperandVal(0);
7031
7032 switch (IID) {
7033 default:
7034 break;
7035 case Intrinsic::nvvm_add_rn_sat_f16:
7036 case Intrinsic::nvvm_add_rn_ftz_sat_f16:
7037 case Intrinsic::nvvm_add_rn_sat_v2f16:
7038 case Intrinsic::nvvm_add_rn_ftz_sat_v2f16:
7039 return combineF16AddWithNeg(N, DCI.DAG, IID);
7040 }
7041 return SDValue();
7042}
7043
// Combine for ProxyReg nodes: when the wrapped value (operand 1) is not a plain
// ISD::LOAD, try to push the ProxyReg down through it with sinkProxyReg().
// Returns an empty SDValue when nothing could be sunk.
// NOTE(review): listing lines 7044-7045 (the signature) are absent here; the
// combineProxyReg(N, DCI) call in PerformDAGCombine suggests the name and
// parameters -- confirm against the full file.
7046
7047 SDValue Chain = N->getOperand(0);
7048 SDValue Reg = N->getOperand(1);
7049
7050 // If the ProxyReg is not wrapping a load, try to pull the operations through
7051 // the ProxyReg.
7052 if (Reg.getOpcode() != ISD::LOAD) {
7053 if (SDValue V = sinkProxyReg(Reg, Chain, DCI))
7054 return V;
7055 }
7056
7057 return SDValue();
7058}
7059
// Target DAG-combine entry point: dispatches on the node's opcode to the
// NVPTX-specific combine for that opcode. Returns the replacement value, or an
// empty SDValue when no combine applies.
7060SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
7061 DAGCombinerInfo &DCI) const {
// NOTE(review): listing line 7062 is absent here; it presumably defines the
// `OptLevel` value passed to the combines below -- confirm.
7063 switch (N->getOpcode()) {
7064 default:
7065 break;
7066 case ISD::ADD:
7067 return PerformADDCombine(N, DCI, OptLevel);
7068 case ISD::ADDRSPACECAST:
7069 return combineADDRSPACECAST(N, DCI);
7070 case ISD::SIGN_EXTEND:
7071 case ISD::ZERO_EXTEND:
7072 return combineSZExtToMulWide(N, DCI, OptLevel);
7073 case ISD::BUILD_VECTOR:
7074 return PerformBUILD_VECTORCombine(N, DCI);
// NOTE(review): listing line 7075 (the `case` label for the return below) is
// absent here -- confirm against the full file.
7076 return PerformEXTRACTCombine(N, DCI);
7077 case ISD::FADD:
7078 return performFADDCombine(N, DCI, OptLevel);
7079 case ISD::FMA:
7080 case ISD::FMUL:
7081 case ISD::FSUB:
7082 return performScalarizeV2F32Op(N, DCI, OptLevel);
7083 case ISD::FMAXNUM:
7084 case ISD::FMINNUM:
7085 case ISD::FMAXIMUM:
7086 case ISD::FMINIMUM:
7087 case ISD::FMAXIMUMNUM:
7088 case ISD::FMINIMUMNUM:
7089 return PerformFMinMaxCombine(N, DCI, STI.getPTXVersion(),
7090 STI.getSmVersion());
7091 case ISD::LOAD:
7092 case NVPTXISD::LoadV2:
7093 case NVPTXISD::LoadV4:
7094 return combineLOAD(N, DCI, STI);
7095 case ISD::MUL:
7096 return PerformMULCombine(N, DCI, OptLevel);
7097 case NVPTXISD::PRMT:
7098 return combinePRMT(N, DCI, OptLevel);
7099 case NVPTXISD::ProxyReg:
7100 return combineProxyReg(N, DCI);
7101 case ISD::SETCC:
7102 return PerformSETCCCombine(N, DCI, STI.getSmVersion());
7103 case ISD::SHL:
7104 return PerformSHLCombine(N, DCI, OptLevel);
7105 case ISD::SREM:
7106 case ISD::UREM:
7107 return PerformREMCombine(N, DCI, OptLevel);
7108 case ISD::STORE:
7109 case NVPTXISD::StoreV2:
7110 case NVPTXISD::StoreV4:
7111 return combineSTORE(N, DCI, STI);
7112 case ISD::SELECT:
7113 return PerformSELECTShiftCombine(N, DCI);
7114 case ISD::VSELECT:
7115 return PerformVSELECTCombine(N, DCI);
// NOTE(review): listing line 7116 (the `case` label for the return below) is
// absent here -- confirm against the full file.
7117 return combineIntrinsicWOChain(N, DCI, STI);
7118 }
7119 return SDValue();
7120}
7121
// Custom result replacement for a bitcast that produces v2i8: the source is
// reinterpreted as i16 and split into its low and high bytes. Nothing is
// pushed to Results for any other result type.
// NOTE(review): listing lines 7122-7123 (the signature, presumably declaring
// `Node`, `DAG` and `Results`) are absent here -- confirm against the file.
7124 // Handle bitcasting to v2i8 without hitting the default promotion
7125 // strategy which goes through stack memory.
7126 SDValue Op(Node, 0);
7127 EVT ToVT = Op->getValueType(0);
7128 if (ToVT != MVT::v2i8) {
7129 return;
7130 }
7131
7132 // Bitcast to i16 and unpack elements into a vector
7133 SDLoc DL(Node);
7134 SDValue AsInt = DAG.getBitcast(MVT::i16, Op->getOperand(0));
// Element 0 is the low byte; element 1 is the high byte (logical shift by 8).
7135 SDValue Vec0 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, AsInt);
7136 SDValue Const8 = DAG.getConstant(8, DL, MVT::i16);
7137 SDValue Vec1 =
7138 DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
7139 DAG.getNode(ISD::SRL, DL, MVT::i16, {AsInt, Const8}));
7140 Results.push_back(
7141 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2i8, {Vec0, Vec1}));
7142}
7143
// Custom result legalization for a chained intrinsic node. Operand 0 is the
// chain and operand 1 holds the intrinsic ID; intrinsics not listed in the
// switch below leave Results untouched.
// NOTE(review): `MemSD` and the scalar-path operand list are used below but
// their definitions are not visible in this listing; `MemSD` is presumably N
// viewed as a memory-intrinsic node. Confirm against the full file.
7146 SDValue Chain = N->getOperand(0);
7147 SDValue Intrin = N->getOperand(1);
7148 SDLoc DL(N);
7149
7150 // Get the intrinsic ID
7151 unsigned IntrinNo = Intrin.getNode()->getAsZExtVal();
7152 switch (IntrinNo) {
7153 default:
7154 return;
7155 case Intrinsic::nvvm_ldu_global_i:
7156 case Intrinsic::nvvm_ldu_global_f:
7157 case Intrinsic::nvvm_ldu_global_p: {
7158 EVT ResVT = N->getValueType(0);
7159
7160 if (ResVT.isVector()) {
7161 // Vector LDG/LDU
7162
7163 unsigned NumElts = ResVT.getVectorNumElements();
7164 EVT EltVT = ResVT.getVectorElementType();
7165
7166 // Since LDU/LDG are target nodes, we cannot rely on DAG type
7167 // legalization.
7168 // Therefore, we must ensure the type is legal. For i1 and i8, we set the
7169 // loaded type to i16 and propagate the "real" type as the memory type.
7170 bool NeedTrunc = false;
7171 if (EltVT.getSizeInBits() < 16) {
7172 EltVT = MVT::i16;
7173 NeedTrunc = true;
7174 }
7175
7176 unsigned Opcode = 0;
7177 SDVTList LdResVTs;
7178
// Only 2- and 4-element vectors are rewritten; any other element count
// returns without producing results. The value list is one EltVT per
// element followed by the chain.
7179 switch (NumElts) {
7180 default:
7181 return;
7182 case 2:
7183 Opcode = NVPTXISD::LDUV2;
7184 LdResVTs = DAG.getVTList(EltVT, EltVT, MVT::Other);
7185 break;
7186 case 4: {
7187 Opcode = NVPTXISD::LDUV4;
7188 EVT ListVTs[] = { EltVT, EltVT, EltVT, EltVT, MVT::Other };
7189 LdResVTs = DAG.getVTList(ListVTs);
7190 break;
7191 }
7192 }
7193
7194 SmallVector<SDValue, 8> OtherOps;
7195
7196 // Copy regular operands
7197
7198 OtherOps.push_back(Chain); // Chain
7199 // Skip operand 1 (intrinsic ID)
7200 // Others
7201 OtherOps.append(N->op_begin() + 2, N->op_end());
7202
7204
// The new node keeps the original memory type and memory operand.
7205 SDValue NewLD = DAG.getMemIntrinsicNode(Opcode, DL, LdResVTs, OtherOps,
7206 MemSD->getMemoryVT(),
7207 MemSD->getMemOperand());
7208
7209 SmallVector<SDValue, 4> ScalarRes;
7210
// Gather the per-element results, truncating back to the original element
// type when the load was widened to i16 above.
7211 for (unsigned i = 0; i < NumElts; ++i) {
7212 SDValue Res = NewLD.getValue(i);
7213 if (NeedTrunc)
7214 Res =
7215 DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
7216 ScalarRes.push_back(Res);
7217 }
7218
// The chain is the result that follows the NumElts element values.
7219 SDValue LoadChain = NewLD.getValue(NumElts);
7220
7221 SDValue BuildVec =
7222 DAG.getBuildVector(ResVT, DL, ScalarRes);
7223
7224 Results.push_back(BuildVec);
7225 Results.push_back(LoadChain);
7226 } else {
7227 // i8 LDG/LDU
7228 assert(ResVT.isSimple() && ResVT.getSimpleVT().SimpleTy == MVT::i8 &&
7229 "Custom handling of non-i8 ldu/ldg?");
7230
7231 // Just copy all operands as-is
7233
7234 // Force output to i16
7235 SDVTList LdResVTs = DAG.getVTList(MVT::i16, MVT::Other);
7236
7238
7239 // We make sure the memory type is i8, which will be used during isel
7240 // to select the proper instruction.
7241 SDValue NewLD =
7243 MVT::i8, MemSD->getMemOperand());
7244
// Replacement values: the i16 load truncated to i8, then the new chain.
7245 Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i8,
7246 NewLD.getValue(0)));
7247 Results.push_back(NewLD.getValue(1));
7248 }
7249 return;
7250 }
7251
// tcgen05.ld variants lowered without the offset flag. When lowerTcgen05Ld
// yields no value, nothing is pushed onto Results.
7252 case Intrinsic::nvvm_tcgen05_ld_16x64b_x4:
7253 case Intrinsic::nvvm_tcgen05_ld_16x64b_x8:
7254 case Intrinsic::nvvm_tcgen05_ld_16x64b_x16:
7255 case Intrinsic::nvvm_tcgen05_ld_16x64b_x32:
7256 case Intrinsic::nvvm_tcgen05_ld_16x64b_x64:
7257 case Intrinsic::nvvm_tcgen05_ld_16x64b_x128:
7258 case Intrinsic::nvvm_tcgen05_ld_32x32b_x4:
7259 case Intrinsic::nvvm_tcgen05_ld_32x32b_x8:
7260 case Intrinsic::nvvm_tcgen05_ld_32x32b_x16:
7261 case Intrinsic::nvvm_tcgen05_ld_32x32b_x32:
7262 case Intrinsic::nvvm_tcgen05_ld_32x32b_x64:
7263 case Intrinsic::nvvm_tcgen05_ld_32x32b_x128:
7264 case Intrinsic::nvvm_tcgen05_ld_16x128b_x2:
7265 case Intrinsic::nvvm_tcgen05_ld_16x128b_x4:
7266 case Intrinsic::nvvm_tcgen05_ld_16x128b_x8:
7267 case Intrinsic::nvvm_tcgen05_ld_16x128b_x16:
7268 case Intrinsic::nvvm_tcgen05_ld_16x128b_x32:
7269 case Intrinsic::nvvm_tcgen05_ld_16x128b_x64:
7270 case Intrinsic::nvvm_tcgen05_ld_16x256b_x1:
7271 case Intrinsic::nvvm_tcgen05_ld_16x256b_x2:
7272 case Intrinsic::nvvm_tcgen05_ld_16x256b_x4:
7273 case Intrinsic::nvvm_tcgen05_ld_16x256b_x8:
7274 case Intrinsic::nvvm_tcgen05_ld_16x256b_x16:
7275 case Intrinsic::nvvm_tcgen05_ld_16x256b_x32:
7276 if (auto Res = lowerTcgen05Ld(N, DAG)) {
7277 Results.push_back(Res->first);
7278 Results.push_back(Res->second);
7279 }
7280 return;
7281
// The 16x32bx2 variants go through the same helper with HasOffset set.
7282 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x4:
7283 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x8:
7284 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x16:
7285 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x32:
7286 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x64:
7287 case Intrinsic::nvvm_tcgen05_ld_16x32bx2_x128:
7288 if (auto Res = lowerTcgen05Ld(N, DAG, /*HasOffset=*/true)) {
7289 Results.push_back(Res->first);
7290 Results.push_back(Res->second);
7291 }
7292 return;
7293
// The ld.red variants produce three replacement values, pushed in tuple
// order.
7294 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_i32:
7295 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x8_f32:
7296 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_i32:
7297 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x64_f32:
7298 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_i32:
7299 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x4_f32:
7300 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_i32:
7301 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x32_f32:
7302 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_i32:
7303 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x16_f32:
7304 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_i32:
7305 case Intrinsic::nvvm_tcgen05_ld_red_32x32b_x128_f32:
7306 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_i32:
7307 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x8_f32:
7308 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_i32:
7309 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x64_f32:
7310 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_i32:
7311 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x4_f32:
7312 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_i32:
7313 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x32_f32:
7314 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_i32:
7315 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x16_f32:
7316 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_i32:
7317 case Intrinsic::nvvm_tcgen05_ld_red_16x32bx2_x128_f32:
7318 if (auto Res = lowerTcgen05LdRed(N, DAG)) {
7319 Results.push_back(std::get<0>(*Res));
7320 Results.push_back(std::get<1>(*Res));
7321 Results.push_back(std::get<2>(*Res));
7322 }
7323 return;
7324 }
7325}
7326
7329 // Change the CopyFromReg to output 2 64-bit results instead of a 128-bit
7330 // result so that it can pass the legalization
// Operands of N are (chain, register, glue). Three values are pushed onto
// Results: the recombined i128 followed by results 2 and 3 of the new node.
7331 SDLoc DL(N);
7332 SDValue Chain = N->getOperand(0);
7333 SDValue Reg = N->getOperand(1);
7334 SDValue Glue = N->getOperand(2);
7335
7336 assert(Reg.getValueType() == MVT::i128 &&
7337 "Custom lowering for CopyFromReg with 128-bit reg only");
// The new node's value list is two i64 halves followed by the original
// node's result types 1 and 2.
7338 SmallVector<EVT, 4> ResultsType = {MVT::i64, MVT::i64, N->getValueType(1),
7339 N->getValueType(2)};
7340 SmallVector<SDValue, 3> NewOps = {Chain, Reg, Glue};
7341
7342 SDValue NewValue = DAG.getNode(ISD::CopyFromReg, DL, ResultsType, NewOps);
// Reassemble the i128 from the two halves; result 0 is passed as the first
// BUILD_PAIR operand.
7343 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i128,
7344 {NewValue.getValue(0), NewValue.getValue(1)});
7345
7346 Results.push_back(Pair);
7347 Results.push_back(NewValue.getValue(2));
7348 Results.push_back(NewValue.getValue(3));
7349}
7350
7352 const TargetLowering &TLI,
// Rebuilds a ProxyReg node on the register type TLI reports for the value
// type of operand 1, then converts the result back to N's original result
// type. Exactly one value is pushed onto Results.
7354 SDValue Chain = N->getOperand(0);
7355 SDValue Reg = N->getOperand(1);
7356
7357 MVT VT = TLI.getRegisterType(*DAG.getContext(), Reg.getValueType());
7358
// Any-extend or truncate the input to the register type.
7359 SDValue NewReg = DAG.getAnyExtOrTrunc(Reg, SDLoc(N), VT);
7360 SDValue NewProxy =
7361 DAG.getNode(NVPTXISD::ProxyReg, SDLoc(N), VT, {Chain, NewReg});
// Convert the proxy's value back to the type the original node produced.
7362 SDValue Res = DAG.getAnyExtOrTrunc(NewProxy, SDLoc(N), N->getValueType(0));
7363
7364 Results.push_back(Res);
7365}
7366
7368 const NVPTXSubtarget &STI,
// Custom result legalization for a 128-bit atomic node. Two values are pushed
// onto Results: an i128 value and a chain.
// NOTE(review): the definitions of `AN` and `Ops`, the start of the diagnostic
// call and the tail of the opcode selection are not visible in this listing;
// `AN` is presumably N viewed as an atomic node. Confirm against the full
// file.
7370 assert(N->getValueType(0) == MVT::i128 &&
7371 "Custom lowering for atomic128 only supports i128");
7372
7374 SDLoc dl(N);
7375
// Subtarget lacks 128-bit atomic support: after the diagnostic, substitute an
// undef value and forward the incoming chain.
7376 if (!STI.hasAtomSwap128()) {
7379 "Support for b128 atomics introduced in PTX ISA version 8.3 and "
7380 "requires target sm_90.",
7381 dl.getDebugLoc()));
7382
7383 Results.push_back(DAG.getUNDEF(MVT::i128));
7384 Results.push_back(AN->getOperand(0)); // Chain
7385 return;
7386 }
7387
7389 Ops.push_back(AN->getOperand(0)); // Chain
7390 Ops.push_back(AN->getOperand(1)); // Ptr
// Every operand after the pointer is split into two i64 halves, low first.
7391 for (const auto &Op : AN->ops().drop_front(2)) {
7392 // Low part
7393 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
7394 DAG.getIntPtrConstant(0, dl)));
7395 // High part
7396 Ops.push_back(DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i64, Op,
7397 DAG.getIntPtrConstant(1, dl)));
7398 }
7399 unsigned Opcode = N->getOpcode() == ISD::ATOMIC_SWAP
// The new node yields (i64, i64, chain) while keeping i128 as its memory type.
7402 SDVTList Tys = DAG.getVTList(MVT::i64, MVT::i64, MVT::Other);
7403 SDValue Result = DAG.getMemIntrinsicNode(Opcode, dl, Tys, Ops, MVT::i128,
7404 AN->getMemOperand());
// Recombine the halves into the i128 result, then forward the new chain.
7405 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i128,
7406 {Result.getValue(0), Result.getValue(1)}));
7407 Results.push_back(Result.getValue(2));
7408}
7409
// Dispatches custom result legalization by opcode to the matching helper. An
// opcode with no case below is a fatal error.
// NOTE(review): the parameter list and some case labels/calls are not visible
// in this listing.
7410void NVPTXTargetLowering::ReplaceNodeResults(
7412 switch (N->getOpcode()) {
7413 default:
7414 report_fatal_error("Unhandled custom legalization");
7415 case ISD::BITCAST:
7416 ReplaceBITCAST(N, DAG, Results);
7417 return;
7418 case ISD::LOAD:
7419 case ISD::MLOAD:
7420 replaceLoadVector(N, DAG, Results, STI);
7421 return;
7424 return;
7425 case ISD::CopyFromReg:
7427 return;
7428 case NVPTXISD::ProxyReg:
7429 replaceProxyReg(N, DAG, *this, Results);
7430 return;
7432 case ISD::ATOMIC_SWAP:
7433 replaceAtomicSwap128(N, DAG, STI, Results);
7434 return;
7435 }
7436}
7437
// Decides how an atomicrmw instruction (AI) is handled, based on its value
// type, operation, address space and subtarget features.
// NOTE(review): the signature, the operation case labels and every return
// statement of this function are not visible in this listing; the comments
// here describe only the visible conditions.
7440 Type *Ty = AI->getValOperand()->getType();
7441
7442 // Try to lower LLVM atomicrmw fadd to PTX atom.add. This is complicated
7443 // by the weird FTZ behavior PTX atom.add has:
7444 // - atom.add.f32 on global memory flushes denormals
7445 // - atom.add.f32 on shared memory does not flush denormals
7446 // - atom.add.f16 and atom.add.bf16 never flush denormals
7447 //
7448 // We lower to atom.add only if the function's FTZ behavior matches that of
7449 // atom.add; otherwise, we lower to a CAS loop. But we always allow
7450 // atom.add.bf16; even though it never flushes denormals, we never flush
7451 // bf16 denormals when doing regular arithmetic, even when FTZ is enabled.
7452 if (AI->isFloatingPointOperation() &&
7454 const bool FTZ =
7457
7458 // AllowFTZAtomics forces atom.add regardless of the FTZ mismatch.
7459 if (Ty->isFloatTy()) {
// One address-space group enables the native form when FTZ is on, the other
// when it is off (case labels not visible here).
7461 switch (AI->getPointerAddressSpace()) {
7463 UseNative |= FTZ;
7464 break;
7467 UseNative |= !FTZ;
7468 break;
7469 }
7470 if (UseNative)
7472 }
7473
// f16 needs sm_70 and PTX 6.3 or newer, plus either no FTZ or the override.
7474 if (Ty->isHalfTy() && (!FTZ || AllowFTZAtomics) &&
7475 STI.getSmVersion() >= 70 && STI.getPTXVersion() >= 63)
7477
// bf16 needs sm_90 and PTX 7.8 or newer; FTZ is not consulted.
7478 if (Ty->isBFloatTy() && STI.getSmVersion() >= 90 &&
7479 STI.getPTXVersion() >= 78)
7481
7482 if (Ty->isDoubleTy() && STI.hasAtomAddF64())
7484 }
7485
7486 // PTX's only atomic fp op is `add`; all other ops expand to a CAS loop.
7487 if (AI->isFloatingPointOperation())
7489
7490 assert(Ty->isIntegerTy() && "Ty should be integer at this point");
7491 const unsigned BitWidth = cast<IntegerType>(Ty)->getBitWidth();
7492
// Integer operations: the decision depends on the operation group and on the
// bit width, with 64-bit support gated on subtarget features.
7493 switch (AI->getOperation()) {
7494 default:
7497 if (BitWidth == 128)
7499 [[fallthrough]];
7503 switch (BitWidth) {
7504 case 8:
7505 case 16:
7507 case 32:
7509 case 64:
7510 if (STI.hasAtomBitwise64())
7513 case 128:
7515 default:
7516 llvm_unreachable("unsupported width encountered");
7517 }
7524 switch (BitWidth) {
7525 case 8:
7526 case 16:
7528 case 32:
7530 case 64:
7531 if (STI.hasAtomMinMax64())
7534 case 128:
7536 default:
7537 llvm_unreachable("unsupported width encountered");
7538 }
7541 switch (BitWidth) {
7542 case 32:
7544 case 8:
7545 case 16:
7546 case 64:
7547 case 128:
7549 default:
7550 llvm_unreachable("unsupported width encountered");
7551 }
7552 }
7553
7555}
7556
7558 const Instruction *I) const {
7559 // This function returns true iff the operation is emulated using a CAS-loop,
7560 // or if it has the memory order seq_cst (which is not natively supported in
7561 // the PTX `atom` instruction).
7562 //
7563 // atomicrmw and cmpxchg instructions not efficiently supported by PTX
7564 // are lowered to CAS emulation loops that preserve their memory order,
7565 // syncscope, and volatile semantics. For PTX, it is more efficient to use
7566 // atom.cas.relaxed.sco instructions within the loop, and fences before and
7567 // after the loop to restore order.
7568 //
7569 // Atomic instructions efficiently supported by PTX are lowered to
7570 // `atom.<op>.<sem>.<scope>` instruction with their corresponding memory order
7571 // and scope. Since PTX does not support seq_cst, we emulate it by lowering to
7572 // a fence.sc followed by an atom according to the PTX atomics ABI
7573 // https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/atomic-abi.html
// cmpxchg: true when the compared integer is narrower than the subtarget's
// minimum cmpxchg width, or when the merged ordering is seq_cst.
7574 if (auto *CI = dyn_cast<AtomicCmpXchgInst>(I))
7575 return (cast<IntegerType>(CI->getCompareOperand()->getType())
7576 ->getBitWidth() < STI.getMinCmpXchgSizeInBits()) ||
7577 CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent;
// atomicrmw: a seq_cst ordering is one of the accepted conditions.
// NOTE(review): the first half of this condition is not visible in this
// listing.
7578 if (auto *RI = dyn_cast<AtomicRMWInst>(I))
7580 RI->getOrdering() == AtomicOrdering::SequentiallyConsistent;
// Any other instruction kind never gets fences from this hook.
7581 return false;
7582}
7583
7585 const Instruction *I) const {
7586 // If the operation is emulated by a CAS-loop, we lower the instruction to
7587 // atom.<op>.relaxed, since AtomicExpandPass will insert fences for enforcing
7588 // the correct memory ordering around the CAS loop.
7589 //
7590 // When the operation is not emulated, but the memory order is seq_cst,
7591 // we must lower to "fence.sc.<scope>; atom.<op>.acquire.<scope>;" to conform
7592 // to the PTX atomics ABI.
7593 // https://docs.nvidia.com/cuda/ptx-writers-guide-to-interoperability/atomic-abi.html
7594 // For such cases, emitLeadingFence() will separately insert the leading
7595 // "fence.sc.<scope>;". Here, we only set the memory order to acquire.
7596 //
7597 // Otherwise, the operation is not emulated, and the memory order is not
7598 // seq_cst. In this case, the LLVM memory order is natively supported by the
7599 // PTX `atom` instruction, and we just lower to the corresponding
7600 // `atom.<op>.relaxed|acquire|release|acq_rel`. For such cases, this function
7601 // will NOT be called.
7602 // Prerequisite: shouldInsertFencesForAtomic() should have returned `true` for
7603 // I before its memory order was modified.
// NOTE(review): the return statements and the end of the atomicrmw condition
// are not visible in this listing.
// First case: a seq_cst cmpxchg whose compared integer is at least the
// subtarget's minimum cmpxchg width.
7604 if (auto *CI = dyn_cast<AtomicCmpXchgInst>(I);
7605 CI && CI->getMergedOrdering() == AtomicOrdering::SequentiallyConsistent &&
7606 cast<IntegerType>(CI->getCompareOperand()->getType())->getBitWidth() >=
7607 STI.getMinCmpXchgSizeInBits())
// Second case: a seq_cst atomicrmw (remaining condition not visible).
7609 else if (auto *RI = dyn_cast<AtomicRMWInst>(I);
7610 RI && RI->getOrdering() == AtomicOrdering::SequentiallyConsistent &&
7613
7615}
7616
7618 Instruction *Inst,
7619 AtomicOrdering Ord) const {
7620 // prerequisite: shouldInsertFencesForAtomic() should have returned `true` for
7621 // `Inst` before its memory order was modified. We cannot enforce this with an
7622 // assert, because AtomicExpandPass will have modified the memory order
7623 // between the initial call to shouldInsertFencesForAtomic() and the call to
7624 // this function.
// Anything other than cmpxchg/atomicrmw uses the generic base-class behavior.
7625 if (!isa<AtomicCmpXchgInst>(Inst) && !isa<AtomicRMWInst>(Inst))
7626 return TargetLoweringBase::emitLeadingFence(Builder, Inst, Ord);
7627
7628 // Specialize for cmpxchg and atomicrmw
7629 auto SSID = getAtomicSyncScopeID(Inst);
7630 assert(SSID.has_value() && "Expected an atomic operation");
7631
// A leading fence is only created for release-or-stronger orderings, in the
// instruction's own sync scope; the fence ordering depends on whether Ord is
// seq_cst.
// NOTE(review): the two ordering alternatives are not visible in this
// listing.
7632 if (isReleaseOrStronger(Ord))
7633 return Builder.CreateFence(Ord == AtomicOrdering::SequentiallyConsistent
7636 SSID.value());
7637
// Weaker orderings need no leading fence.
7638 return nullptr;
7639}
7640
7642 Instruction *Inst,
7643 AtomicOrdering Ord) const {
7644 // prerequisite: shouldInsertFencesForAtomic() should have returned `true` for
7645 // `Inst` before its memory order was modified. See `emitLeadingFence` for why
7646 // this cannot be enforced with an assert. Specialize for cmpxchg and
7647 // atomicrmw
7648 auto *CI = dyn_cast<AtomicCmpXchgInst>(Inst);
7649 auto *RI = dyn_cast<AtomicRMWInst>(Inst);
// Anything other than cmpxchg/atomicrmw uses the generic base-class behavior.
7650 if (!CI && !RI)
7651 return TargetLoweringBase::emitTrailingFence(Builder, Inst, Ord);
7652
7653 auto SSID = getAtomicSyncScopeID(Inst);
7654 assert(SSID.has_value() && "Expected an atomic operation");
7655
// A cmpxchg counts as emulated when its compared integer is narrower than the
// subtarget's minimum cmpxchg width.
// NOTE(review): the atomicrmw alternative of this expression is not visible
// in this listing.
7656 bool IsEmulated =
7657 CI ? cast<IntegerType>(CI->getCompareOperand()->getType())
7658 ->getBitWidth() < STI.getMinCmpXchgSizeInBits()
7660
// Only emulated operations with an acquire-or-stronger ordering get a
// trailing acquire fence, in the instruction's own sync scope.
7661 if (isAcquireOrStronger(Ord) && IsEmulated)
7662 return Builder.CreateFence(AtomicOrdering::Acquire, SSID.value());
7663
7664 return nullptr;
7665}
7666
7667// Rather than default to SINT when both UINT and SINT are custom, we only
7668// change the opcode when UINT is not legal and SINT is. UINT is preferred when
7669// both are custom since unsigned CVT instructions can lead to slightly better
7670// SASS code with fewer instructions.
//
// Returns Op unchanged when it is already legal for ToVT, or when no
// substitution in the switch below applies.
// NOTE(review): the first signature line and parts of the switch are not
// visible in this listing.
7672 EVT ToVT) const {
7673 if (isOperationLegal(Op, ToVT))
7674 return Op;
7675 switch (Op) {
7676 case ISD::FP_TO_UINT:
7678 return ISD::FP_TO_SINT;
7679 break;
7683 break;
// The vector-predicated unsigned form is swapped for the signed form only
// when the signed form is legal for ToVT.
7684 case ISD::VP_FP_TO_UINT:
7685 if (isOperationLegal(ISD::VP_FP_TO_SINT, ToVT))
7686 return ISD::VP_FP_TO_SINT;
7687 break;
7688 default:
7689 break;
7690 }
7691 return Op;
7692}
7693
7694// Pin NVPTXTargetObjectFile's vtables to this file.
7696
7701
7703 const SelectionDAG &DAG, unsigned Depth) {
// Computes the known bits of a PRMT node's 32-bit result from the known bits
// of its two data inputs. Operands 0 and 1 are the data inputs, operand 2 is
// the selector and operand 3 is the constant mode.
7704 SDValue A = Op.getOperand(0);
7705 SDValue B = Op.getOperand(1);
7706 ConstantSDNode *Selector = dyn_cast<ConstantSDNode>(Op.getOperand(2));
7707 unsigned Mode = Op.getConstantOperandVal(3);
7708
// Without a constant selector nothing can be derived; Known is left as is.
7709 if (!Selector)
7710 return;
7711
7712 KnownBits AKnown = DAG.computeKnownBits(A, Depth);
7713 KnownBits BKnown = DAG.computeKnownBits(B, Depth);
7714
7715 // {b, a} = {{b7, b6, b5, b4}, {b3, b2, b1, b0}}
7716 assert(AKnown.getBitWidth() == 32 && BKnown.getBitWidth() == 32 &&
7717 "PRMT must have i32 operands");
7718 assert(Known.getBitWidth() == 32 && "PRMT must have i32 result");
// 64-bit field with B in the high half and A in the low half.
7719 KnownBits BitField = BKnown.concat(AKnown);
7720
// Normalize the selector according to Mode, then process one 4-bit selector
// field per result byte.
7721 APInt SelectorVal = getPRMTSelector(Selector->getAPIntValue(), Mode);
7722 for (unsigned I : llvm::seq(4)) {
7723 APInt Sel = SelectorVal.extractBits(4, I * 4);
// Low 3 bits: which of the 8 source bytes feeds result byte I.
7724 unsigned Idx = Sel.getLoBits(3).getZExtValue();
// Top bit: replace the byte with copies of its own sign bit.
7725 unsigned Sign = Sel.getHiBits(1).getZExtValue();
7726 KnownBits Byte = BitField.extractBits(8, Idx * 8);
7727 if (Sign)
// An arithmetic shift right by 7 models the sign replication.
7728 Byte = KnownBits::ashr(Byte, KnownBits::makeConstant(APInt(8, 7)));
7729 Known.insertBits(Byte, I * 8);
7730 }
7731}
7732
// Marks the high bits of a scalar, non-sign-extending vector-load result as
// known zero. Known is left untouched for sign-extending loads and for
// vector-typed results.
// NOTE(review): the definition of `LD` is not visible in this listing;
// presumably it is Op's node viewed as a memory node. Confirm against the
// full file.
7733static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known) {
7735
7736 // We can't do anything without knowing the sign bit.
// The extension kind is read from the node's last operand.
7737 auto ExtType = LD->getConstantOperandVal(LD->getNumOperands() - 1);
7738 if (ExtType == ISD::SEXTLOAD)
7739 return;
7740
7741 // ExtLoading to vector types is weird and may not work well with known bits.
7742 auto DestVT = LD->getValueType(0);
7743 if (DestVT.isVector())
7744 return;
7745
7746 assert(Known.getBitWidth() == DestVT.getSizeInBits());
// Every bit above the loaded element width is zero.
7747 auto ElementBitWidth = NVPTXDAGToDAGISel::getFromTypeWidthForLoad(LD);
7748 Known.Zero.setHighBits(Known.getBitWidth() - ElementBitWidth);
7749}
7750
7752 const SDValue Op, KnownBits &Known, const APInt &DemandedElts,
7753 const SelectionDAG &DAG, unsigned Depth) const {
// Target hook for known-bits analysis of NVPTX-specific nodes. Known starts
// fully unknown and stays so for any opcode not handled below.
// NOTE(review): the call made for the LoadV cases is not visible in this
// listing.
7754 Known.resetAll();
7755
7756 switch (Op.getOpcode()) {
7757 case NVPTXISD::PRMT:
7758 computeKnownBitsForPRMT(Op, Known, DAG, Depth);
7759 break;
7760 case NVPTXISD::LoadV2:
7761 case NVPTXISD::LoadV4:
7762 case NVPTXISD::LoadV8:
7764 break;
7765 default:
7766 break;
7767 }
7768}
7769
7770static std::pair<APInt, APInt> getPRMTDemandedBits(const APInt &SelectorVal,
7771 const APInt &DemandedBits) {
7772 APInt DemandedLHS = APInt(32, 0);
7773 APInt DemandedRHS = APInt(32, 0);
7774
7775 for (unsigned I : llvm::seq(4)) {
7776 if (DemandedBits.extractBits(8, I * 8).isZero())
7777 continue;
7778
7779 APInt Sel = SelectorVal.extractBits(4, I * 4);
7780 unsigned Idx = Sel.getLoBits(3).getZExtValue();
7781 unsigned Sign = Sel.getHiBits(1).getZExtValue();
7782
7783 APInt &Src = Idx < 4 ? DemandedLHS : DemandedRHS;
7784 unsigned ByteStart = (Idx % 4) * 8;
7785 if (Sign)
7786 Src.setBit(ByteStart + 7);
7787 else
7788 Src.setBits(ByteStart, ByteStart + 8);
7789 }
7790
7791 return {DemandedLHS, DemandedRHS};
7792}
7793
7794// Replace undef with 0 as this is easier for other optimizations such as
7795// known bits.
// A null Op yields a null SDValue; any other non-undef Op is returned
// unchanged.
7797 if (!Op)
7798 return SDValue();
7799 if (Op.isUndef())
// The replacement constant is an i32 zero with an empty source location.
7800 return DAG.getConstant(0, SDLoc(), MVT::i32);
7801 return Op;
7802}
7803
7805 const APInt &DemandedBits,
7806 SelectionDAG &DAG,
7807 const TargetLowering &TLI,
7808 unsigned Depth) {
// Tries to simplify a PRMT node given which result bits are demanded.
// Returns the replacement value, or a null SDValue when the selector is not a
// constant or no simplification is found.
7809 assert(PRMT.getOpcode() == NVPTXISD::PRMT);
7810 SDValue Op0 = PRMT.getOperand(0);
7811 SDValue Op1 = PRMT.getOperand(1);
7812 auto *SelectorConst = dyn_cast<ConstantSDNode>(PRMT.getOperand(2));
7813 if (!SelectorConst)
7814 return SDValue();
7815
// Normalize the selector according to the mode held in operand 3.
7816 unsigned Mode = PRMT.getConstantOperandVal(3);
7817 const APInt Selector = getPRMTSelector(SelectorConst->getAPIntValue(), Mode);
7818
7819 // Try to simplify the PRMT to one of the inputs if the used bytes are all
7820 // from the same input in the correct order.
// Only the selector fields for bytes below the undemanded leading bytes are
// compared against the identity selectors 0x3210 and 0x7654.
7821 const unsigned LeadingBytes = DemandedBits.countLeadingZeros() / 8;
7822 const unsigned SelBits = (4 - LeadingBytes) * 4;
7823 if (Selector.getLoBits(SelBits) == APInt(32, 0x3210).getLoBits(SelBits))
7824 return Op0;
7825 if (Selector.getLoBits(SelBits) == APInt(32, 0x7654).getLoBits(SelBits))
7826 return Op1;
7827
7828 auto [DemandedLHS, DemandedRHS] = getPRMTDemandedBits(Selector, DemandedBits);
7829
7830 // Attempt to avoid multi-use ops if we don't need anything from them.
7831 SDValue DemandedOp0 =
7832 TLI.SimplifyMultipleUseDemandedBits(Op0, DemandedLHS, DAG, Depth + 1);
7833 SDValue DemandedOp1 =
7834 TLI.SimplifyMultipleUseDemandedBits(Op1, DemandedRHS, DAG, Depth + 1);
7835
7836 DemandedOp0 = canonicalizePRMTInput(DemandedOp0, DAG);
7837 DemandedOp1 = canonicalizePRMTInput(DemandedOp1, DAG);
// Rebuild the PRMT only if at least one input was actually simplified; an
// input with no simplification keeps its original value.
7838 if ((DemandedOp0 && DemandedOp0 != Op0) ||
7839 (DemandedOp1 && DemandedOp1 != Op1)) {
7840 Op0 = DemandedOp0 ? DemandedOp0 : Op0;
7841 Op1 = DemandedOp1 ? DemandedOp1 : Op1;
// The normalized selector is passed here, with no mode argument.
7842 return getPRMT(Op0, Op1, Selector.getZExtValue(), SDLoc(PRMT), DAG);
7843 }
7844
7845 return SDValue();
7846}
7847
7849 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
7850 KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const {
// Target hook for demanded-bits simplification of NVPTX-specific nodes.
// Returns true after replacing Op through TLO; otherwise fills Known through
// computeKnownBitsForTargetNode and returns false.
// NOTE(review): the start of the PRMT condition is not visible in this
// listing.
7851 Known.resetAll();
7852
7853 switch (Op.getOpcode()) {
7854 case NVPTXISD::PRMT:
7856 *this, Depth)) {
7857 TLO.CombineTo(Op, Result);
7858 return true;
7859 }
7860 break;
7861 default:
7862 break;
7863 }
7864
7865 computeKnownBitsForTargetNode(Op, Known, DemandedElts, TLO.DAG, Depth);
7866 return false;
7867}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
constexpr LLT S1
constexpr LLT F32
static cl::list< std::string > UseNative("amdgpu-use-native", cl::desc("Comma separated list of functions to replace with native, or all"), cl::CommaSeparated, cl::ValueOptional, cl::Hidden)
AMDGPU Register Bank Select
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define clEnumValN(ENUMVAL, FLAGNAME, DESC)
This file contains the declarations for the subclasses of Constant, which represent the different fla...
This file contains the declarations of entities that describe floating point environment and related ...
static bool IsIndirectCall(const MachineInstr *MI)
Module.h This file contains the declarations for the Module class.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
#define T
NVPTX address space definition.
static SDValue reportInvalidTensormapReplaceUsage(SDValue Op, SelectionDAG &DAG, unsigned Val)
static SDValue combineADDRSPACECAST(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< bool > sched4reg("nvptx-sched4reg", cl::desc("NVPTX Specific: schedule for register pressue"), cl::init(false))
static SDValue lowerTcgen05St(SDValue Op, SelectionDAG &DAG, bool hasOffset=false)
static SDValue PerformEXTRACTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static cl::opt< NVPTX::DivPrecisionLevel > UsePrecDivF32("nvptx-prec-divf32", cl::Hidden, cl::desc("NVPTX Specific: Override the precision of the lowering for f32 fdiv"), cl::values(clEnumValN(NVPTX::DivPrecisionLevel::Approx, "0", "Use div.approx"), clEnumValN(NVPTX::DivPrecisionLevel::Full, "1", "Use div.full"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754, "2", "Use IEEE Compliant F32 div.rnd if available (default)"), clEnumValN(NVPTX::DivPrecisionLevel::IEEE754_NoFTZ, "3", "Use IEEE Compliant F32 div.rnd if available, no FTZ")), cl::init(NVPTX::DivPrecisionLevel::IEEE754))
static bool isConstOne(const SDValue &Operand)
static cl::opt< unsigned > FMAContractLevelOpt("nvptx-fma-level", cl::Hidden, cl::desc("NVPTX Specific: FMA contraction (0: don't do it" " 1: do it 2: do it aggressively"), cl::init(2))
static bool IsPTXVectorType(MVT VT)
static SDValue PerformSELECTShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Transform patterns like: (select (ugt shift_amt, BitWidth-1), 0, (srl/shl x, shift_amt)) (select (ult...
static SDValue lowerLOADi1(LoadSDNode *LD, SelectionDAG &DAG)
static SDValue lowerIntrinsicVoid(SDValue Op, SelectionDAG &DAG)
static MachinePointerInfo refinePtrAS(SDValue &Ptr, SelectionDAG &DAG, const DataLayout &DL, const TargetLowering &TL)
static SDValue lowerROT(SDValue Op, SelectionDAG &DAG)
static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL, LLVMContext &Ctx, CallingConv::ID CallConv, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< uint64_t > &Offsets, uint64_t StartingOffset=0)
ComputePTXValueVTs - For the given Type Ty, returns the set of primitive legal-ish MVTs that compose ...
static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static void replaceAtomicSwap128(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI, SmallVectorImpl< SDValue > &Results)
static unsigned getMinMax3Opcode(unsigned MinMax2Opcode)
Get 3-input version of a 2-input min/max opcode.
static SDValue lowerStAsyncWithMbarrier(SDValue Op, SelectionDAG &DAG)
static SDValue lowerSTOREVector(SDValue Op, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue lowerLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static void replaceProxyReg(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI, SmallVectorImpl< SDValue > &Results)
static SDValue lowerStAsyncRelease(SDValue Op, SelectionDAG &DAG)
static void ReplaceCopyFromReg_128(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
#define TCGEN05_LD_RED_INST(SHAPE, NUM, TYPE)
static SDValue lowerCTLZCTPOP(SDValue Op, SelectionDAG &DAG)
static SDValue combineMADConstOne(SDValue X, SDValue Add, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static unsigned getTcgen05LdRedID(Intrinsic::ID IID)
static SDValue combinePRMT(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue combinePackingMovIntoStore(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned Front, unsigned Back)
Fold packing movs into a store.
static void ReplaceINTRINSIC_W_CHAIN(SDNode *N, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results)
static SDValue getBuildVectorizedValue(unsigned N, const SDLoc &dl, SelectionDAG &DAG, T GetElement)
static SDValue getExtractVectorizedValue(SDValue V, unsigned I, EVT VT, const SDLoc &dl, SelectionDAG &DAG)
static SDValue combineSZExtToMulWide(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static unsigned canMergeParamLoadStoresStartingAt(unsigned Idx, uint32_t AccessSize, const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment)
static EVT getVectorizedVT(EVT VT, unsigned N, LLVMContext &C)
static SDValue lowerIntrinsicWOChain(SDValue Op, SelectionDAG &DAG)
static SDValue PerformFMinMaxCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned PTXVersion, unsigned SmVersion)
PerformFMinMaxCombine - Combine (fmaxnum (fmaxnum a, b), c) into (fmaxnum3 a, b, c).
static std::optional< unsigned > getScalar3OpcodeForReduction(unsigned ReductionOpcode)
Get 3-input scalar reduction opcode.
static SDValue lowerIntrinsicWChain(SDValue Op, SelectionDAG &DAG)
static bool isNonCoalescableBuildVector(const SDValue &BV)
Check if a v2f32 BUILD_VECTOR provably packs values from non-adjacent register pairs (non-coalescable...
static bool isConstZero(const SDValue &Operand)
static unsigned getF16SubOpc(Intrinsic::ID AddIntrinsicID)
static SDValue LowerVectorArith(SDValue Op, SelectionDAG &DAG)
static SDValue LowerTcgen05MMADisableOutputLane(SDValue Op, SelectionDAG &DAG)
static bool IsMulWideOperandDemotable(SDValue Op, unsigned OptSize, OperandSignedness &S)
IsMulWideOperandDemotable - Checks if the provided DAG node is an operand that can be demoted to OptS...
static unsigned getTcgen05MMADisableOutputLane(unsigned IID)
static std::pair< APInt, APInt > getPRMTDemandedBits(const APInt &SelectorVal, const APInt &DemandedBits)
static APInt computePRMT(APInt A, APInt B, APInt Selector, unsigned Mode)
static ISD::NodeType getScalarOpcodeForReduction(unsigned ReductionOpcode)
static SDValue PerformREMCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
static SDValue lowerBSWAP(SDValue Op, SelectionDAG &DAG)
static SDValue lowerMSTORE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformMULCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI)
static void computeKnownBitsForPRMT(const SDValue Op, KnownBits &Known, const SelectionDAG &DAG, unsigned Depth)
static SDValue combineUnpackingMovIntoLoad(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Fold unpacking movs into a load by increasing the number of return values.
#define TCGEN05_LD_RED_INTR(SHAPE, NUM, TYPE)
static SDValue lowerTensormapReplaceElemtype(SDValue Op, SelectionDAG &DAG)
static SDValue LowerClusterLaunchControlQueryCancel(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > lowerTcgen05Ld(SDNode *N, SelectionDAG &DAG, bool HasOffset=false)
static SDValue lowerCvtRSIntrinsics(SDValue Op, SelectionDAG &DAG)
static std::optional< std::pair< SDValue, SDValue > > replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
replaceLoadVector - Convert vector loads into multi-output scalar loads.
static SDValue expandFSH64(SDValue A, SDValue B, SDValue ShiftAmount, SDLoc DL, unsigned Opcode, SelectionDAG &DAG)
static bool AreMulWideOperandsDemotable(SDValue LHS, SDValue RHS, unsigned OptSize, bool &IsSigned)
AreMulWideOperandsDemotable - Checks if the given LHS and RHS operands can be demoted to OptSize bits...
static std::pair< MemSDNode *, uint32_t > convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI)
static SDValue TryMULWIDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
TryMULWIDECombine - Attempt to replace a multiply of M bits with a multiply of M/2 bits that produces...
static SDValue lowerPrmtIntrinsic(SDValue Op, SelectionDAG &DAG)
static SDValue combineMulSelectConstOne(SDValue X, SDValue Select, EVT VT, SDLoc DL, TargetLowering::DAGCombinerInfo &DCI)
static SDValue buildTreeReduction(const SmallVector< SDValue > &Elements, EVT EltTy, ArrayRef< std::pair< unsigned, unsigned > > Ops, const SDLoc &DL, const SDNodeFlags Flags, SelectionDAG &DAG)
Reduces the elements using the scalar operations provided.
static SDValue combineProxyReg(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SmallVector< unsigned, 16 > VectorizePTXValueVTs(const SmallVectorImpl< EVT > &ValueVTs, const SmallVectorImpl< T > &Offsets, Align ParamAlignment, bool IsVAArg=false)
static SDValue getPRMT(SDValue A, SDValue B, SDValue Selector, SDLoc DL, SelectionDAG &DAG, unsigned Mode=NVPTX::PTXPrmtMode::NONE)
static SDValue matchMADConstOnePattern(SDValue Add)
static SDValue correctParamType(SDValue V, EVT ExpectedVT, ISD::ArgFlagsTy Flags, SelectionDAG &DAG, SDLoc dl)
static ISD::NodeType getExtOpcode(const ISD::ArgFlagsTy &Flags)
static cl::opt< bool > UsePrecSqrtF32("nvptx-prec-sqrtf32", cl::Hidden, cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."), cl::init(true))
static cl::opt< bool > AllowFTZAtomics("nvptx-allow-ftz-atomics", cl::Hidden, cl::desc("NVPTX Specific: Lower atomicrmw fadd to atom.add even when its " "FTZ behavior does not match the function's denormal mode."), cl::init(false))
static void computeKnownBitsForLoadV(const SDValue Op, KnownBits &Known)
static APInt getPRMTSelector(const APInt &Selector, unsigned Mode)
static EVT promoteScalarIntegerPTX(const EVT VT)
promoteScalarIntegerPTX - Used to make sure the arguments/returns are suitable for passing and promote ...
static std::optional< std::tuple< SDValue, SDValue, SDValue > > lowerTcgen05LdRed(SDNode *N, SelectionDAG &DAG)
static SDValue simplifyDemandedBitsForPRMT(SDValue PRMT, const APInt &DemandedBits, SelectionDAG &DAG, const TargetLowering &TLI, unsigned Depth)
static SDValue lowerFREM(SDValue Op, SelectionDAG &DAG)
static SDValue canonicalizePRMTInput(SDValue Op, SelectionDAG &DAG)
static SDValue sinkProxyReg(SDValue R, SDValue Chain, TargetLowering::DAGCombinerInfo &DCI)
static SDValue lowerFSH(SDValue Op, SelectionDAG &DAG)
static SDValue lowerTensormapReplaceSwizzleMode(SDValue Op, SelectionDAG &DAG)
static SDValue combineIntrinsicWOChain(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PromoteBinOpToF32(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSETCCCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, unsigned int SmVersion)
static std::optional< std::pair< unsigned int, MVT > > getVectorLoweringShape(EVT VectorEVT, const NVPTXSubtarget &STI, unsigned AddressSpace)
static SDValue combineF16AddWithNeg(SDNode *N, SelectionDAG &DAG, Intrinsic::ID AddIntrinsicID)
static cl::opt< bool > UseApproxLog2F32("nvptx-approx-log2f32", cl::desc("NVPTX Specific: whether to use lg2.approx for log2"), cl::init(false))
Whereas CUDA's implementation (see libdevice) uses ex2.approx for exp2(), it does NOT use lg2....
static SDValue lowerSELECT(SDValue Op, SelectionDAG &DAG)
static SDValue combineLOAD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue combineSTORE(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const NVPTXSubtarget &STI)
static SDValue PerformSHLCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, CodeGenOptLevel OptLevel)
PerformSHLCombine - Runs PTX-specific DAG combine patterns on SHL nodes.
MachineInstr unsigned OpIdx
uint64_t High
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
This file contains some templates that are useful if you are working with the STL at all.
This file defines the SmallVector class.
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This file describes how to lower LLVM code to machine code.
Value * RHS
Value * LHS
BinaryOperator * Mul
static const fltSemantics & IEEEsingle()
Definition APFloat.h:297
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1163
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt getLoBits(unsigned numBits) const
Compute an APInt containing numBits lowbits from this APInt.
Definition APInt.cpp:645
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
LLVM_ABI APInt getHiBits(unsigned numBits) const
Compute an APInt containing numBits highbits from this APInt.
Definition APInt.cpp:640
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
void setBit(unsigned BitPosition)
Set the bit whose position is given as "BitPosition" to 1.
Definition APInt.h:1353
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool isSignedIntN(unsigned N) const
Check if this APInt has an N-bits signed integer value.
Definition APInt.h:436
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1137
LLVM_ABI APInt extractBits(unsigned numBits, unsigned bitPosition) const
Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
Definition APInt.cpp:483
bool isIntN(unsigned N) const
Check if this APInt has an N-bits unsigned integer value.
Definition APInt.h:433
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
ArrayRef< T > slice(size_t N, size_t M) const
slice(n, m) - Chop off the first N elements of the array, and keep M elements in the array.
Definition ArrayRef.h:185
an instruction that atomically reads a memory location, combines it with another value,...
@ Add
*p = old + v
@ FAdd
*p = old + v
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ UMax
*p = old >unsigned v ? old : v
@ UDecWrap
Decrement one until a minimum value or zero.
bool isFloatingPointOperation() const
BinOp getOperation() const
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
This is an SDNode representing atomic operations.
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
FunctionType * getFunctionType() const
const APInt & getAPIntValue() const
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
Diagnostic information for unsupported feature in backend.
void addFnAttr(Attribute::AttrKind Kind)
Add function attributes to this function.
Definition Function.cpp:633
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:799
Module * getParent()
Get the module that this global value is contained inside of...
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
MCSection * getDataSection() const
static constexpr unsigned NoRegister
Definition MCRegister.h:60
Instances of this class represent a uniqued identifier for a section in the current translation unit.
Definition MCSection.h:573
StringRef getName() const
getName - Get the symbol name.
Definition MCSymbol.h:188
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
static auto fp_fixedlen_vector_valuetypes()
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getAlign() const
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static unsigned getFromTypeWidthForLoad(const MemSDNode *Mem)
bool hasTensormapReplaceSwizzleModeSupport(unsigned value) const
bool hasUsedBytesMaskPragma() const
bool hasTensormapReplaceElemtypeSupport(unsigned value) const
bool hasAtomSwap128() const
bool hasF32x2Instructions() const
bool has256BitVectorLoadStore(unsigned AS) const
AtomicOrdering atomicOperationOrderAfterFenceSplit(const Instruction *I) const override
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
const NVPTXTargetMachine * nvTM
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
NVPTXTargetLowering(const NVPTXTargetMachine &TM, const NVPTXSubtarget &STI)
unsigned getPreferredFPToIntOpcode(unsigned Op, EVT FromVT, EVT ToVT) const override
bool useF32FTZ(const MachineFunction &MF) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &ExtraSteps, bool &UseOneConst, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &dl, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &dl, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
SDValue LowerSTACKRESTORE(SDValue Op, SelectionDAG &DAG) const
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
std::string getParamName(const Function *F, int Idx) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
NVPTX::DivPrecisionLevel getDivF32Level(const MachineFunction &MF, const SDNode &N) const
bool shouldInsertFencesForAtomic(const Instruction *) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override
Return the ValueType of the result of SETCC operations.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
bool allowFMA(MachineFunction &MF, CodeGenOptLevel OptLevel) const
bool usePrecSqrtF32(const SDNode *N=nullptr) const
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
MCSection * SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const override
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
const APInt & getAsAPIntVal() const
Helper method returns the APInt value of a ConstantSDNode.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
unsigned getIROrder() const
Return the node ordering.
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
uint64_t getScalarValueSizeInBits() const
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
SectionKind - This is a simple POD value that classifies the properties of a section.
Definition SectionKind.h:22
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getSymbolFunctionGlobalAddress(SDValue Op, Function **TargetFunction=nullptr)
Return a GlobalAddress of the function from the current module with name matching the given ExternalS...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI Align getEVTAlign(EVT MemoryVT) const
Compute the default alignment value for the given type.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
LLVM_ABI SDNode * MorphNodeTo(SDNode *N, unsigned Opc, SDVTList VTs, ArrayRef< SDValue > Ops)
This mutates the specified node to have the specified return type, opcode, and operands.
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI SDValue getFPExtendOrRound(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of float type, to the float type VT, by either extending or rounding (by tr...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
MachineFunction & getMachineFunction() const
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
ArrayRef< int > getMask() const
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:56
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
constexpr const char * data() const
Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:138
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void setOperationPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
Convenience method to set an operation to Promote and specify the type in a single call.
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
virtual unsigned getNumRegisters(LLVMContext &Context, EVT VT, std::optional< MVT > RegisterVT=std::nullopt) const
Return the number of registers that this ValueType will eventually require.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setCondCodeAction(ArrayRef< ISD::CondCode > CCs, MVT VT, LegalizeAction Action)
Indicate that the specified condition code is or isn't supported on the target and indicate what to d...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
Align getMinStackArgumentAlignment() const
Return the minimum stack alignment of an argument.
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
std::vector< ArgListEntry > ArgListTy
virtual Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
virtual Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const
Inserts in the IR a target-specific intrinsic specifying a fence.
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
MVT getRegisterType(MVT VT) const
Return the type of registers that this ValueType will eventually require.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
LegalizeAction getOperationAction(unsigned Op, EVT VT) const
Return how this operation should be treated: either it is legal, needs to be promoted to a larger siz...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g. {edx}), return the register number and the register class for the register.
TargetLowering(const TargetLowering &)=delete
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
SDValue expandFP_ROUND(SDNode *Node, SelectionDAG &DAG) const
Expand round(fp) to fp conversion.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
TargetOptions Options
MCSymbol * getSymbol(const GlobalValue *GV) const
FPOpFusion::FPOpFusionMode AllowFPOpFusion
AllowFPOpFusion - This flag is set by the -fp-contract=xxx option.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetFrameLowering * getFrameLowering() const
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
A raw_ostream that writes to an std::string.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
LLVM_ABI APInt pow(const APInt &X, int64_t N)
Compute X^N for N>=0.
Definition APInt.cpp:3186
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ POISON
POISON - A poison node.
Definition ISDOpcodes.h:236
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:787
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:861
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ VECREDUCE_FMAXIMUM
FMINIMUM/FMAXIMUM nodes propagate NaNs and signed zeroes using the llvm.minimum and llvm....
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:796
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:819
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SSHLSAT
RESULT = [US]SHLSAT(LHS, RHS) - Perform saturation left shift.
Definition ISDOpcodes.h:386
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:896
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:813
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ VECREDUCE_FMINIMUM
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:841
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ ABS_MIN_POISON
ABS with a poison result for INT_MIN.
Definition ISDOpcodes.h:751
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI bool allOperandsUndef(const SDNode *N)
Return true if the node has at least one operand and all operands of the specified node are ISD::UNDE...
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI StringRef getName(ID id)
Return the LLVM name for an intrinsic, such as "llvm.ppc.altivec.lvx".
@ Bitcast
Perform the operation on a different, but equivalently sized type.
@ ATOMIC_CMP_SWAP_B128
These nodes are used to lower atomic instructions with i128 type.
@ DeviceParam
Definition NVPTX.h:217
@ EntryParam
Definition NVPTX.h:211
bool isPackedVectorTy(EVT VT)
DivPrecisionLevel
Definition NVPTX.h:280
match_combine_or< CastInst_match< OpTy, TruncInst >, OpTy > m_TruncOrSelf(const OpTy &Op)
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
match_deferred< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ValuesClass values(OptsTy... Options)
Helper to build a ValuesClass by forwarding a variable number of arguments as an initializer list to ...
initializer< Ty > init(const Ty &Val)
@ User
could "use" a pointer
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
This is an optimization pass for GlobalISel generic memory operations.
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:573
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
SDValue peekThroughFreeze(SDValue V)
Return the non-frozen source operand of V if it exists.
RelativeUniformCounterPtr Values
Definition InstrProf.h:91
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
bool isReleaseOrStronger(AtomicOrdering AO)
OutputIt transform(R &&Range, OutputIt d_first, UnaryFunction F)
Wrapper function around std::transform to apply a function to a range and store the result elsewhere.
Definition STLExtras.h:2026
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
std::optional< SyncScope::ID > getAtomicSyncScopeID(const Instruction *I)
A helper function that returns an atomic operation's sync scope; returns std::nullopt if it is not an...
unsigned promoteScalarArgumentSize(unsigned size)
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool shouldPassAsArray(Type *Ty)
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
iterator_range< filter_iterator< detail::IterOfRange< RangeT >, PredicateT > > make_filter_range(RangeT &&Range, PredicateT Pred)
Convenience function that takes a range of elements and a predicate, and return a new filter_iterator...
Definition STLExtras.h:551
CodeGenOptLevel
Code generation optimization level.
Definition CodeGen.h:82
@ Default
-O2, -Os, -Oz
Definition CodeGen.h:85
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
DWARFExpression::Operation Op
Align getPTXParamAlign(const Function *F, Type *Ty, unsigned AttrIdx, const DataLayout &DL)
Get the alignment for a function parameter or return value.
ArrayRef(const T &OneElt) -> ArrayRef< T >
bool isAcquireOrStronger(AtomicOrdering AO)
Align getDeviceByValParamAlign(const Function *F, Type *ArgTy, Align InitialAlign, const DataLayout &DL)
constexpr unsigned BitWidth
bool isKernelFunction(const Function &F)
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
DenormalModeKind Output
Denormal flushing mode for floating point instruction results in the default floating point environme...
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:418
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:323
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
ElementCount getVectorElementCount() const
Definition ValueTypes.h:373
bool is32BitVector() const
Return true if this is a 32-bit vector type.
Definition ValueTypes.h:220
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:279
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
static LLVM_ABI KnownBits ashr(const KnownBits &LHS, const KnownBits &RHS, bool ShAmtNonZero=false, bool Exact=false)
Compute known bits for ashr(LHS, RHS).
KnownBits concat(const KnownBits &Lo) const
Concatenate the bits from Lo onto the bottom of *this.
Definition KnownBits.h:247
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
void insertBits(const KnownBits &SubBits, unsigned BitPosition)
Insert the bits from a smaller known bits starting at bitPosition.
Definition KnownBits.h:233
This class contains a discriminated union of information about pointers in memory operands,...
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasAllowContract() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs
Type * RetTy
Same as OrigRetTy, or partially legalized for soft float libcalls.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...