SIISelLowering.cpp
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
19#include "AMDGPUTargetMachine.h"
20#include "GCNSubtarget.h"
23#include "SIRegisterInfo.h"
24#include "llvm/ADT/APInt.h"
26#include "llvm/ADT/Statistic.h"
41#include "llvm/IR/IRBuilder.h"
43#include "llvm/IR/IntrinsicsAMDGPU.h"
44#include "llvm/IR/IntrinsicsR600.h"
45#include "llvm/IR/MDBuilder.h"
48#include "llvm/Support/ModRef.h"
50#include <optional>
51
52using namespace llvm;
53using namespace llvm::SDPatternMatch;
54
55#define DEBUG_TYPE "si-lower"
56
57STATISTIC(NumTailCalls, "Number of tail calls");
58
59static cl::opt<bool>
60 DisableLoopAlignment("amdgpu-disable-loop-alignment",
61 cl::desc("Do not align and prefetch loops"),
62 cl::init(false));
63
65 "amdgpu-use-divergent-register-indexing", cl::Hidden,
66 cl::desc("Use indirect register addressing for divergent indexes"),
67 cl::init(false));
68
73
78
79static unsigned findFirstFreeSGPR(CCState &CCInfo) {
80 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
81 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
82 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
83 return AMDGPU::SGPR0 + Reg;
84 }
85 }
86 llvm_unreachable("Cannot allocate sgpr");
87}
88
90 const GCNSubtarget &STI)
91 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
92 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
93 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
94
95 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
96
97 const SIRegisterInfo *TRI = STI.getRegisterInfo();
98 const TargetRegisterClass *V32RegClass =
99 TRI->getDefaultVectorSuperClassForBitWidth(32);
100 addRegisterClass(MVT::f32, V32RegClass);
101
102 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
103
104 const TargetRegisterClass *V64RegClass =
105 TRI->getDefaultVectorSuperClassForBitWidth(64);
106
107 addRegisterClass(MVT::f64, V64RegClass);
108 addRegisterClass(MVT::v2f32, V64RegClass);
109 addRegisterClass(MVT::Untyped, V64RegClass);
110
111 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
112 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
113
114 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
115 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
116
117 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
118 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
119
120 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
121 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
122
123 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
124 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
125
126 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
127 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
128
129 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
130 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
131
132 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
133 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
134
135 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
136 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
137
138 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
139 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
140
141 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
142 addRegisterClass(MVT::v10f32,
143 TRI->getDefaultVectorSuperClassForBitWidth(320));
144
145 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
146 addRegisterClass(MVT::v11f32,
147 TRI->getDefaultVectorSuperClassForBitWidth(352));
148
149 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
150 addRegisterClass(MVT::v12f32,
151 TRI->getDefaultVectorSuperClassForBitWidth(384));
152
153 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
154 addRegisterClass(MVT::v16f32,
155 TRI->getDefaultVectorSuperClassForBitWidth(512));
156
157 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
158 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
159
160 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
161 addRegisterClass(MVT::v16f64,
162 TRI->getDefaultVectorSuperClassForBitWidth(1024));
163
164 if (Subtarget->has16BitInsts()) {
165 if (Subtarget->useRealTrue16Insts()) {
166 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
167 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
169 } else {
170 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
171 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
173 }
174
175 // Unless there are also VOP3P operations, no operations are really legal.
176 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
177 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
180 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
183 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
186 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
189 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
191 }
192
193 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
194 addRegisterClass(MVT::v32f32,
195 TRI->getDefaultVectorSuperClassForBitWidth(1024));
196
197 computeRegisterProperties(Subtarget->getRegisterInfo());
198
201
202 // The boolean content concept here is too inflexible. Compares only ever
203 // really produce a 1-bit result. Any copy/extend from these will turn into a
204 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
205 // it's what most targets use.
208
209 // We need to custom lower vector stores from local memory
211 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
212 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
213 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
214 MVT::i1, MVT::v32i32},
215 Custom);
216
218 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
219 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
220 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
221 MVT::i1, MVT::v32i32},
222 Custom);
223
224 if (isTypeLegal(MVT::bf16)) {
225 for (unsigned Opc :
234 ISD::SETCC}) {
235 setOperationAction(Opc, MVT::bf16, Promote);
236 }
237
239
241 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
242
246
247 // We only need to custom lower because we can't specify an action for bf16
248 // sources.
251 }
252
253 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
254 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
255 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
256 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
257 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
258 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
259 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
260 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
261 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
262 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
263 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
264 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
265 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
266 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
267 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
268 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
269
270 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
271 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
272 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
273 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
274 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
275 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
276 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
277
278 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
279 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
280
284 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
285
286 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
287
289 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
290
292 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
293 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
294
296 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
297 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
298 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
299 Expand);
301 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
302 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
303 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
304 Expand);
305
307 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
308 MVT::v3i16, MVT::v4i16, MVT::Other},
309 Custom);
310
313 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
314
316
318
320 Expand);
321
323
324#if 0
326#endif
327
328 // We only support LOAD/STORE and vector manipulation ops for vectors
329 // with > 4 elements.
330 for (MVT VT :
331 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
332 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
333 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
334 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
335 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
336 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
337 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
338 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
339 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
340 switch (Op) {
341 case ISD::LOAD:
342 case ISD::STORE:
344 case ISD::BITCAST:
345 case ISD::UNDEF:
349 case ISD::IS_FPCLASS:
350 break;
355 break;
356 default:
358 break;
359 }
360 }
361 }
362
364
365 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
366 // is expanded to avoid having two separate loops in case the index is a VGPR.
367
368 // Most operations are naturally 32-bit vector operations. We only support
369 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
370 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
372 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
373
375 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
376
378 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
379
381 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
382 }
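
  // For example, a BUILD_VECTOR of v2i64 is promoted to the equivalent v4i32
  // build, with each 64-bit element handled as two 32-bit halves; the same
  // applies to the extract/insert/scalar_to_vector cases promoted above.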
383
384 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
386 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
387
389 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
390
392 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
393
395 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
396 }
397
398 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
400 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
401
403 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
404
406 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
407
409 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
410 }
411
412 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
414 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
415
417 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
418
420 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
421
423 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
424 }
425
426 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
428 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
429
431 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
432
434 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
435
437 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
438 }
439
441 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
442 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
443 Custom);
444
445 if (Subtarget->hasPkMovB32()) {
446 // TODO: 16-bit element vectors should be legal with even aligned elements.
447 // TODO: Can be legal with wider source types than the result with
448 // subregister extracts.
449 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
450 }
451
453 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
454 // instead lower to cndmask in SITargetLowering::LowerSELECT().
456 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
457 // alignbit.
458 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
459
460 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
461 Custom);
462
463 // Avoid stack access for these.
464 // TODO: Generalize to more vector types.
466 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
467 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
468 Custom);
469
470 // Deal with vec3 vector operations when widened to vec4.
472 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
473
474 // Deal with vec5/6/7 vector operations when widened to vec8.
476 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
477 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
478 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
479 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
480 Custom);
481
482 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
483 // and output demarshalling
484 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
485
486 // We can't return success/failure, only the old value,
487 // let LLVM add the comparison
489 Expand);
490
491 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
492
493 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
494
495 // FIXME: This should be narrowed to i32, but that only happens if i64 is
496 // illegal.
497 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
498 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
499
500 // This is s_memtime on SI and s_memrealtime on VI.
502
503 if (Subtarget->hasSMemRealTime() ||
504 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
507
508 if (Subtarget->has16BitInsts()) {
511 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
514 } else {
516 }
517
518 if (Subtarget->hasMadMacF32Insts())
520
524
525 // We only really have 32-bit BFE instructions (and 16-bit on VI).
526 //
527 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
528 // effort to match them now. We want this to be false for i64 cases when the
529 // extraction isn't restricted to the upper or lower half. Ideally we would
530 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
531 // span the midpoint are probably relatively rare, so don't worry about them
532 // for now.
534
535 // Clamp modifier on add/sub
536 if (Subtarget->hasIntClamp())
538
539 if (Subtarget->hasAddNoCarryInsts())
540 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
541 Legal);
542
545 {MVT::f32, MVT::f64}, Custom);
546
547 // These are really only legal for ieee_mode functions. We should be avoiding
548 // them for functions that don't have ieee_mode enabled, so just say they are
549 // legal.
551 {MVT::f32, MVT::f64}, Legal);
552
553 if (Subtarget->haveRoundOpsF64())
555 Legal);
556 else
558 MVT::f64, Custom);
559
561 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
562 Legal);
563 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
564
567
568 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
569 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
570
572 Custom);
574 Custom);
576 Custom);
577
578 // Custom lower these because we can't specify a rule based on an illegal
579 // source bf16.
582
583 if (Subtarget->has16BitInsts()) {
586 MVT::i16, Legal);
587
588 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
589
591 MVT::i16, Expand);
592
596 ISD::CTPOP},
597 MVT::i16, Promote);
598
600
601 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
602
604 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
606 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
607
611
613
614 // F16 - Constant Actions.
617
618 // F16 - Load/Store Actions.
620 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
622 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
623
624 // BF16 - Load/Store Actions.
626 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
628 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
629
630 // F16 - VOP1 Actions.
633 MVT::f16, Custom);
634
635 // BF16 - VOP1 Actions.
636 if (Subtarget->hasBF16TransInsts())
638
641 MVT::f16, Promote);
644 MVT::bf16, Promote);
645
646 // F16 - VOP2 Actions.
647 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
648 Expand);
652
653 // F16 - VOP3 Actions.
655 if (STI.hasMadF16())
657
658 for (MVT VT :
659 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
660 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
661 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
662 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
663 switch (Op) {
664 case ISD::LOAD:
665 case ISD::STORE:
667 case ISD::BITCAST:
668 case ISD::UNDEF:
673 case ISD::IS_FPCLASS:
674 break;
677 case ISD::FSIN:
678 case ISD::FCOS:
680 break;
681 default:
683 break;
684 }
685 }
686 }
687
688 // v_perm_b32 can handle either of these.
689 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
691
692 // XXX - Do these do anything? Vector constants turn into build_vector.
693 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
694
695 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
696 Legal);
697
699 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
701 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
702
704 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
706 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
707
709 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2i16, MVT::i32);
711 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2f16, MVT::i32);
712
714 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2i16, MVT::i32);
716 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2f16, MVT::i32);
717
718 setOperationAction(ISD::AND, MVT::v2i16, Promote);
719 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
720 setOperationAction(ISD::OR, MVT::v2i16, Promote);
721 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
722 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
723 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
724
726 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
728 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
729 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
730 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
731
733 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v4i16, MVT::i64);
735 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v4f16, MVT::i64);
736
738 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v4i16, MVT::i64);
740 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v4f16, MVT::i64);
741
743 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
745 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
747 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
748
750 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
752 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
753 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
754 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
755
757 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
759 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
760
762 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
764 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
766 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
767
768 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
769 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
770 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
771 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
772 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
773 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
774
776 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
778 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
779 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
780 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
781
782 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
783 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
784 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
785 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
786 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
787 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
788
790 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
792 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
793 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
794 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
795
797 MVT::v2i32, Expand);
799
801 MVT::v4i32, Expand);
802
804 MVT::v8i32, Expand);
805
806 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
807 Subtarget->hasVOP3PInsts() ? Legal : Custom);
808
809 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
810 // This isn't really legal, but this avoids the legalizer unrolling it (and
811 // allows matching fneg (fabs x) patterns)
812 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
813
814 // Can do this in one BFI plus a constant materialize.
816 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
817 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
818 MVT::v32f16, MVT::v32bf16},
819 Custom);
820
823 MVT::f16, Custom);
825
828 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
829 Custom);
830
832 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
833 Expand);
834
835 for (MVT Vec16 :
836 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
837 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
840 Vec16, Custom);
842 }
843 }
844
845 if (Subtarget->hasVOP3PInsts()) {
849 MVT::v2i16, Legal);
850
854 MVT::v2f16, Legal);
855
857 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
858
860 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
861 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
862 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
863 Custom);
864
865 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
866 // Split vector operations.
871 VT, Custom);
872
873 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
874 // Split vector operations.
877 VT, Custom);
878
881 {MVT::v2f16, MVT::v4f16}, Custom);
882
883 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
884 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
885 Custom);
886
887 if (Subtarget->hasBF16PackedInsts()) {
890 MVT::v2bf16, Legal);
891
892 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
893 // Split vector operations.
896 VT, Custom);
897 }
898
899 if (Subtarget->hasPackedFP32Ops()) {
901 MVT::v2f32, Legal);
903 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
904 Custom);
905 }
906 }
907
909
910 if (Subtarget->has16BitInsts()) {
912 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
914 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
915 } else {
916 // Legalization hack.
917 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
918
920 }
921
923 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
924 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
925 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
926 MVT::v32f16, MVT::v32bf16},
927 Custom);
928
930
931 if (Subtarget->hasVMulU64Inst())
933 else if (Subtarget->hasScalarSMulU64())
935
936 if (Subtarget->hasMad64_32())
938
939 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
941
942 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
944 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
945 } else {
946 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
947 if (Subtarget->hasMinimum3Maximum3F32())
949
950 if (Subtarget->hasMinimum3Maximum3PKF16()) {
952
953 // If only the vector form is available, we need to widen to a vector.
954 if (!Subtarget->hasMinimum3Maximum3F16())
956 }
957 }
958
959 if (Subtarget->hasVOP3PInsts()) {
960 // We want to break these into v2f16 pieces, not scalarize.
962 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
963 Custom);
964 }
965
966 if (Subtarget->hasIntMinMax64())
968 Legal);
969
971 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
972 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
973 MVT::i8},
974 Custom);
975
977 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
978 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
979 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
980 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
981 Custom);
982
984 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
985 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
986 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
987 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
988 Custom);
989
995
996 // TODO: Could move this to custom lowering, could benefit from combines on
997 // extract of relevant bits.
999
1001
1002 if (Subtarget->hasBF16ConversionInsts()) {
1003 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
1005 }
1006
1007 if (Subtarget->hasBF16TransInsts()) {
1009 }
1010
1011 if (Subtarget->hasCvtPkF16F32Inst()) {
1013 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1014 Custom);
1015 }
1016
1020 ISD::SUB,
1022 ISD::MUL,
1023 ISD::FADD,
1024 ISD::FSUB,
1025 ISD::FDIV,
1026 ISD::FMUL,
1035 ISD::FMA,
1036 ISD::SMIN,
1037 ISD::SMAX,
1038 ISD::UMIN,
1039 ISD::UMAX,
1040 ISD::SETCC,
1042 ISD::SMIN,
1043 ISD::SMAX,
1044 ISD::UMIN,
1045 ISD::UMAX,
1046 ISD::AND,
1047 ISD::OR,
1048 ISD::XOR,
1049 ISD::SHL,
1050 ISD::SRL,
1051 ISD::SRA,
1052 ISD::FSHR,
1063
1064 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1066
1067 // All memory operations. Some folding on the pointer operand is done to help
1068 // match the constant offsets in the addressing modes.
1070 ISD::STORE,
1095
1096 // FIXME: In other contexts we pretend this is a per-function property.
1098
1100}
1101
1102const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1103
1105 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1106 return RCRegs;
1107}
1108
1109//===----------------------------------------------------------------------===//
1110// TargetLowering queries
1111//===----------------------------------------------------------------------===//
1112
1113// v_mad_mix* support a conversion from f16 to f32.
1114//
1115// There is only one special case, when denormals are enabled, that we don't
1116// currently handle where this would still be OK to use.
1117bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1118 EVT DestVT, EVT SrcVT) const {
1119 return DestVT.getScalarType() == MVT::f32 &&
1120 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1121 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1122 SrcVT.getScalarType() == MVT::f16) ||
1123 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1124 SrcVT.getScalarType() == MVT::bf16)) &&
1125 // TODO: This probably only requires no input flushing?
1127}
1128
1130 LLT DestTy, LLT SrcTy) const {
1131 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1132 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1133 DestTy.getScalarSizeInBits() == 32 &&
1134 SrcTy.getScalarSizeInBits() == 16 &&
1135 // TODO: This probably only requires no input flushing?
1136 denormalModeIsFlushAllF32(*MI.getMF());
1137}
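
// For example, with mad-mix/fma-mix available and f32 denormals flushed, a
// pattern like (fma (fpext f16:$a), (fpext f16:$b), f32:$c) can keep the
// fpext folded into the mixed-precision instruction (e.g. v_fma_mix_f32)
// instead of emitting separate conversions.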
1138
1140 // SI has some legal vector types, but no legal vector operations. Say no
1141 // shuffles are legal in order to prefer scalarizing some vector operations.
1142 return false;
1143}
1144
1146 CallingConv::ID CC,
1147 EVT VT) const {
1149 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1150
1151 if (VT.isVector()) {
1152 EVT ScalarVT = VT.getScalarType();
1153 unsigned Size = ScalarVT.getSizeInBits();
1154 if (Size == 16) {
1155 return Subtarget->has16BitInsts()
1156 ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
1157 : MVT::i32;
1158 }
1159
1160 if (Size < 16)
1161 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1162 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1163 }
1164
1165 if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
1166 return MVT::i32;
1167
1168 if (VT.getSizeInBits() > 32)
1169 return MVT::i32;
1170
1171 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1172}
1173
1175 CallingConv::ID CC,
1176 EVT VT) const {
1178 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1179
1180 if (VT.isVector()) {
1181 unsigned NumElts = VT.getVectorNumElements();
1182 EVT ScalarVT = VT.getScalarType();
1183 unsigned Size = ScalarVT.getSizeInBits();
1184
1185 // FIXME: Should probably promote 8-bit vectors to i16.
1186 if (Size == 16)
1187 return (NumElts + 1) / 2;
1188
1189 if (Size <= 32)
1190 return NumElts;
1191
1192 if (Size > 32)
1193 return NumElts * ((Size + 31) / 32);
1194 } else if (VT.getSizeInBits() > 32)
1195 return (VT.getSizeInBits() + 31) / 32;
1196
1197 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1198}
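
// For example, for a v3f16 argument in a non-kernel calling convention: with
// 16-bit instructions the register type is v2f16 and (3 + 1) / 2 = 2 registers
// are used; without them, each piece is widened to i32 instead.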
1199
1201 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1202 unsigned &NumIntermediates, MVT &RegisterVT) const {
1203 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1204 unsigned NumElts = VT.getVectorNumElements();
1205 EVT ScalarVT = VT.getScalarType();
1206 unsigned Size = ScalarVT.getSizeInBits();
1207 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1208 // support, but unless we can properly handle 3-vectors, it will still be
1209 // inconsistent.
1210 if (Size == 16) {
1211 MVT SimpleIntermediateVT =
1213 IntermediateVT = SimpleIntermediateVT;
1214 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1215 NumIntermediates = (NumElts + 1) / 2;
1216 return (NumElts + 1) / 2;
1217 }
1218
1219 if (Size == 32) {
1220 RegisterVT = ScalarVT.getSimpleVT();
1221 IntermediateVT = RegisterVT;
1222 NumIntermediates = NumElts;
1223 return NumIntermediates;
1224 }
1225
1226 if (Size < 16 && Subtarget->has16BitInsts()) {
1227 // FIXME: Should probably form v2i16 pieces
1228 RegisterVT = MVT::i16;
1229 IntermediateVT = ScalarVT;
1230 NumIntermediates = NumElts;
1231 return NumIntermediates;
1232 }
1233
1234 if (Size != 16 && Size <= 32) {
1235 RegisterVT = MVT::i32;
1236 IntermediateVT = ScalarVT;
1237 NumIntermediates = NumElts;
1238 return NumIntermediates;
1239 }
1240
1241 if (Size > 32) {
1242 RegisterVT = MVT::i32;
1243 IntermediateVT = RegisterVT;
1244 NumIntermediates = NumElts * ((Size + 31) / 32);
1245 return NumIntermediates;
1246 }
1247 }
1248
1250 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1251}
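
// For example, breaking down a v5f16 argument with 16-bit instructions:
// IntermediateVT and RegisterVT are the paired 2-element type (v2f16) and
// NumIntermediates is (5 + 1) / 2 = 3; a v5f32 argument instead yields 5 f32
// intermediates.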
1252
1254 const DataLayout &DL, Type *Ty,
1255 unsigned MaxNumLanes) {
1256 assert(MaxNumLanes != 0);
1257
1258 LLVMContext &Ctx = Ty->getContext();
1259 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1260 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1261 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1262 NumElts);
1263 }
1264
1265 return TLI.getValueType(DL, Ty);
1266}
1267
1268// Peek through TFE struct returns to only use the data size.
1270 const DataLayout &DL, Type *Ty,
1271 unsigned MaxNumLanes) {
1272 auto *ST = dyn_cast<StructType>(Ty);
1273 if (!ST)
1274 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1275
1276 // TFE intrinsics return an aggregate type.
1277 assert(ST->getNumContainedTypes() == 2 &&
1278 ST->getContainedType(1)->isIntegerTy(32));
1279 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1280}
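
// For example, a TFE/LWE load intrinsic returning { <4 x float>, i32 } only
// contributes the <4 x float> data member to the memory VT; the trailing i32
// status word is not counted.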
1281
1282/// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1283/// in-memory representation. This return value is a custom type because there
1284/// is no MVT::i160 and adding one breaks integer promotion logic. While this
1285/// could cause issues during codegen, these address space 7 pointers will be
1286/// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1287/// in order to allow pre-codegen passes that query TargetTransformInfo, often
1288/// for cost modeling, to work. (This also sets us up decently for doing the
1289/// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
1291 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1292 return MVT::amdgpuBufferFatPointer;
1294 DL.getPointerSizeInBits(AS) == 192)
1295 return MVT::amdgpuBufferStridedPointer;
1297}
1298/// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1299/// v8i32 when padding is added.
1300/// The in-memory representation of a p9 is {p8, i32, i32}, which is
1301/// also v8i32 with padding.
1303 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1304 DL.getPointerSizeInBits(AS) == 160) ||
1306 DL.getPointerSizeInBits(AS) == 192))
1307 return MVT::v8i32;
1309}
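
// For example, a load of a ptr addrspace(7) value is typed as
// MVT::amdgpuBufferFatPointer in the DAG but uses v8i32 (the padded {p8, i32}
// representation) as its in-memory type.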
1310
1311static unsigned getIntrMemWidth(unsigned IntrID) {
1312 switch (IntrID) {
1313 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1314 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1315 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1316 return 8;
1317 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1318 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1319 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1320 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1321 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1322 case Intrinsic::amdgcn_flat_load_monitor_b32:
1323 case Intrinsic::amdgcn_global_load_monitor_b32:
1324 return 32;
1325 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1326 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1327 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1328 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1329 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1330 case Intrinsic::amdgcn_flat_load_monitor_b64:
1331 case Intrinsic::amdgcn_global_load_monitor_b64:
1332 return 64;
1333 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1334 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1335 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1336 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1337 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1338 case Intrinsic::amdgcn_flat_load_monitor_b128:
1339 case Intrinsic::amdgcn_global_load_monitor_b128:
1340 return 128;
1341 default:
1342 llvm_unreachable("Unknown width");
1343 }
1344}
1345
1347 unsigned ArgIdx) {
1348 Value *OrderingArg = CI.getArgOperand(ArgIdx);
1349 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1350 switch (AtomicOrderingCABI(Ord)) {
1353 break;
1356 break;
1359 break;
1360 default:
1362 }
1363}
1364
1365static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1366 MDNode *ScopeMD = cast<MDNode>(
1367 cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata());
1368 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1369 return CI.getContext().getOrInsertSyncScopeID(Scope);
1370}
1371
1373 const CallBase &CI,
1374 MachineFunction &MF,
1375 unsigned IntrID) const {
1377 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1379 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1381 Flags |= getTargetMMOFlags(CI);
1382
1383 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1385 AttributeSet Attr =
1387 MemoryEffects ME = Attr.getMemoryEffects();
1388 if (ME.doesNotAccessMemory())
1389 return;
1390
1391 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1392 if (!IsSPrefetch) {
1393 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1394 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1396 }
1398
1399 IntrinsicInfo Info;
1400 // TODO: Should images get their own address space?
1402
1403 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1404 if (RsrcIntr->IsImage) {
1405 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1407 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1408 Info.align.reset();
1409 }
1410
1411 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1412 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1413 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1414 // We conservatively set the memory operand of a buffer intrinsic to the
1415 // base resource pointer, so that we can access alias information about
1416 // those pointers. Cases like "this points at the same value
1417 // but with a different offset" are handled in
1418 // areMemAccessesTriviallyDisjoint.
1419 Info.ptrVal = RsrcArg;
1420 }
1421
1422 if (ME.onlyReadsMemory()) {
1423 if (RsrcIntr->IsImage) {
1424 unsigned MaxNumLanes = 4;
1425
1426 if (!BaseOpcode->Gather4) {
1427 // If this isn't a gather, we may have excess loaded elements in the
1428 // IR type. Check the dmask for the real number of elements loaded.
1429 unsigned DMask =
1430 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
1431 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1432 }
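        // For example, a dmask of 0b0101 loads only 2 components even if the
        // IR return type is <4 x float>.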
1433
1434 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1435 CI.getType(), MaxNumLanes);
1436 } else {
1437 Info.memVT =
1439 std::numeric_limits<unsigned>::max());
1440 }
1441
1442 // FIXME: What does alignment mean for an image?
1443 Info.opc = ISD::INTRINSIC_W_CHAIN;
1444 Info.flags = Flags | MachineMemOperand::MOLoad;
1445 } else if (ME.onlyWritesMemory()) {
1446 Info.opc = ISD::INTRINSIC_VOID;
1447
1448 Type *DataTy = CI.getArgOperand(0)->getType();
1449 if (RsrcIntr->IsImage) {
1450 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1451 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1452 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1453 DMaskLanes);
1454 } else
1455 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1456
1457 Info.flags = Flags | MachineMemOperand::MOStore;
1458 } else {
1459 // Atomic, NoReturn Sampler or prefetch
1460 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1462
1463 switch (IntrID) {
1464 default:
1465 Info.flags = Flags | MachineMemOperand::MOLoad;
1466 if (!IsSPrefetch)
1467 Info.flags |= MachineMemOperand::MOStore;
1468
1469 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1470 // Fake memory access type for no return sampler intrinsics
1471 Info.memVT = MVT::i32;
1472 } else {
1473 // XXX - Should this be volatile without known ordering?
1474 Info.flags |= MachineMemOperand::MOVolatile;
1475 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1476 }
1477 break;
1478 case Intrinsic::amdgcn_raw_buffer_load_lds:
1479 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1480 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1481 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1482 case Intrinsic::amdgcn_struct_buffer_load_lds:
1483 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1484 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1485 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
1486 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1487
1488 // Entry 0: Load from buffer.
1489 // Don't set an offset, since the pointer value always represents the
1490 // base of the buffer.
1491 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1492 Info.flags = Flags | MachineMemOperand::MOLoad;
1493 Infos.push_back(Info);
1494
1495 // Entry 1: Store to LDS.
1496 // The instruction offset is applied, and an additional per-lane offset is
1497 // added, which we simulate using a larger memory type.
1498 Info.memVT = EVT::getIntegerVT(
1499 CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
1500 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
1501 Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
1502 ->getZExtValue();
1503 Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
1504 Info.flags = Flags | MachineMemOperand::MOStore;
1505 Infos.push_back(Info);
1506 return;
1507 }
1508 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1509 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1510 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1511 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1512 Info.memVT =
1514 std::numeric_limits<unsigned>::max());
1515 Info.flags = Flags | MachineMemOperand::MOLoad;
1516 Infos.push_back(Info);
1517 return;
1518 }
1519 }
1520 }
1521 Infos.push_back(Info);
1522 return;
1523 }
1524
1525 IntrinsicInfo Info;
1526 switch (IntrID) {
1527 case Intrinsic::amdgcn_ds_ordered_add:
1528 case Intrinsic::amdgcn_ds_ordered_swap: {
1529 Info.opc = ISD::INTRINSIC_W_CHAIN;
1530 Info.memVT = MVT::getVT(CI.getType());
1531 Info.ptrVal = CI.getOperand(0);
1532 Info.align.reset();
1534
1535 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1536 if (!Vol->isZero())
1537 Info.flags |= MachineMemOperand::MOVolatile;
1538
1539 Infos.push_back(Info);
1540 return;
1541 }
1542 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1543 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1544 Info.opc = ISD::INTRINSIC_W_CHAIN;
1545 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
1546 Info.ptrVal = nullptr;
1547 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1549 Infos.push_back(Info);
1550 return;
1551 }
1552 case Intrinsic::amdgcn_ds_append:
1553 case Intrinsic::amdgcn_ds_consume: {
1554 Info.opc = ISD::INTRINSIC_W_CHAIN;
1555 Info.memVT = MVT::getVT(CI.getType());
1556 Info.ptrVal = CI.getOperand(0);
1557 Info.align.reset();
1559
1560 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1561 if (!Vol->isZero())
1562 Info.flags |= MachineMemOperand::MOVolatile;
1563
1564 Infos.push_back(Info);
1565 return;
1566 }
1567 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1568 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
1569 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
1572 Info.memVT = MVT::getVT(CI.getType());
1573 Info.ptrVal = CI.getOperand(0);
1574 Info.memVT = MVT::i64;
1575 Info.size = 8;
1576 Info.align.reset();
1578 Info.order = AtomicOrdering::Monotonic;
1579 Infos.push_back(Info);
1580 return;
1581 }
1582 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1583 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1584 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1585 Info.opc = ISD::INTRINSIC_W_CHAIN;
1586 Info.memVT =
1587 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1588 ? CI.getType()
1590 ->getElementType(0)); // XXX: what is correct VT?
1591
1592 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1593 Info.align.reset();
1594 Info.flags = Flags | MachineMemOperand::MOLoad |
1596 Infos.push_back(Info);
1597 return;
1598 }
1599 case Intrinsic::amdgcn_global_atomic_fmin_num:
1600 case Intrinsic::amdgcn_global_atomic_fmax_num:
1601 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1602 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1603 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1604 Info.opc = ISD::INTRINSIC_W_CHAIN;
1605 Info.memVT = MVT::getVT(CI.getType());
1606 Info.ptrVal = CI.getOperand(0);
1607 Info.align.reset();
1608 Info.flags =
1611 Infos.push_back(Info);
1612 return;
1613 }
1614 case Intrinsic::amdgcn_cluster_load_b32:
1615 case Intrinsic::amdgcn_cluster_load_b64:
1616 case Intrinsic::amdgcn_cluster_load_b128:
1617 case Intrinsic::amdgcn_ds_load_tr6_b96:
1618 case Intrinsic::amdgcn_ds_load_tr4_b64:
1619 case Intrinsic::amdgcn_ds_load_tr8_b64:
1620 case Intrinsic::amdgcn_ds_load_tr16_b128:
1621 case Intrinsic::amdgcn_global_load_tr6_b96:
1622 case Intrinsic::amdgcn_global_load_tr4_b64:
1623 case Intrinsic::amdgcn_global_load_tr_b64:
1624 case Intrinsic::amdgcn_global_load_tr_b128:
1625 case Intrinsic::amdgcn_ds_read_tr4_b64:
1626 case Intrinsic::amdgcn_ds_read_tr6_b96:
1627 case Intrinsic::amdgcn_ds_read_tr8_b64:
1628 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1629 Info.opc = ISD::INTRINSIC_W_CHAIN;
1630 Info.memVT = MVT::getVT(CI.getType());
1631 Info.ptrVal = CI.getOperand(0);
1632 Info.align.reset();
1633 Info.flags = Flags | MachineMemOperand::MOLoad;
1634 Infos.push_back(Info);
1635 return;
1636 }
1637 case Intrinsic::amdgcn_flat_load_monitor_b32:
1638 case Intrinsic::amdgcn_flat_load_monitor_b64:
1639 case Intrinsic::amdgcn_flat_load_monitor_b128:
1640 case Intrinsic::amdgcn_global_load_monitor_b32:
1641 case Intrinsic::amdgcn_global_load_monitor_b64:
1642 case Intrinsic::amdgcn_global_load_monitor_b128: {
1643 Info.opc = ISD::INTRINSIC_W_CHAIN;
1644 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1645 Info.ptrVal = CI.getOperand(0);
1646 Info.align.reset();
1647 Info.flags = MachineMemOperand::MOLoad;
1648 Info.order = parseAtomicOrderingCABIArg(CI, 1);
1649 Info.ssid = parseSyncscopeMDArg(CI, 2);
1650 Infos.push_back(Info);
1651 return;
1652 }
1653 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1654 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1655 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1656 Info.opc = ISD::INTRINSIC_W_CHAIN;
1657 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1658 Info.ptrVal = CI.getOperand(0);
1659 Info.align.reset();
1661 Info.order = parseAtomicOrderingCABIArg(CI, 1);
1662 Info.ssid = parseSyncscopeMDArg(CI, 2);
1663 Infos.push_back(Info);
1664 return;
1665 }
1666 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1667 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1668 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1669 Info.opc = ISD::INTRINSIC_VOID;
1670 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1671 Info.ptrVal = CI.getArgOperand(0);
1672 Info.align.reset();
1674 Info.order = parseAtomicOrderingCABIArg(CI, 2);
1675 Info.ssid = parseSyncscopeMDArg(CI, 3);
1676 Infos.push_back(Info);
1677 return;
1678 }
1679 case Intrinsic::amdgcn_ds_gws_init:
1680 case Intrinsic::amdgcn_ds_gws_barrier:
1681 case Intrinsic::amdgcn_ds_gws_sema_v:
1682 case Intrinsic::amdgcn_ds_gws_sema_br:
1683 case Intrinsic::amdgcn_ds_gws_sema_p:
1684 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1685 Info.opc = ISD::INTRINSIC_VOID;
1686
1687 const GCNTargetMachine &TM =
1688 static_cast<const GCNTargetMachine &>(getTargetMachine());
1689
1691 Info.ptrVal = MFI->getGWSPSV(TM);
1692
1693 // This is an abstract access, but we need to specify a type and size.
1694 Info.memVT = MVT::i32;
1695 Info.size = 4;
1696 Info.align = Align(4);
1697
1698 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1699 Info.flags = Flags | MachineMemOperand::MOLoad;
1700 else
1701 Info.flags = Flags | MachineMemOperand::MOStore;
1702 Infos.push_back(Info);
1703 return;
1704 }
1705 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1706 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1707 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1708 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1709 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1710 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1711 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1712 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1713 // Entry 0: Load from source (global/flat).
1714 Info.opc = ISD::INTRINSIC_VOID;
1715 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1716 Info.ptrVal = CI.getArgOperand(0); // Global pointer
1717 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1718 Info.flags = Flags | MachineMemOperand::MOLoad;
1719 Infos.push_back(Info);
1720
1721 // Entry 1: Store to LDS (same offset).
1722 Info.flags = Flags | MachineMemOperand::MOStore;
1723 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1724 Infos.push_back(Info);
1725 return;
1726 }
1727 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1728 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1729 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1730 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1731 // Entry 0: Load from LDS.
1732 Info.opc = ISD::INTRINSIC_VOID;
1733 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1734 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1735 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1736 Info.flags = Flags | MachineMemOperand::MOLoad;
1737 Infos.push_back(Info);
1738
1739 // Entry 1: Store to global (same offset).
1740 Info.flags = Flags | MachineMemOperand::MOStore;
1741 Info.ptrVal = CI.getArgOperand(0); // Global pointer
1742 Infos.push_back(Info);
1743 return;
1744 }
1745 case Intrinsic::amdgcn_load_to_lds:
1746 case Intrinsic::amdgcn_load_async_to_lds:
1747 case Intrinsic::amdgcn_global_load_lds:
1748 case Intrinsic::amdgcn_global_load_async_lds: {
1749 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1750 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1751 bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
1752 if (IsVolatile)
1754
1755 // Entry 0: Load from source (global/flat).
1756 Info.opc = ISD::INTRINSIC_VOID;
1757 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1758 Info.ptrVal = CI.getArgOperand(0); // Source pointer
1759 Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
1760 Info.flags = Flags | MachineMemOperand::MOLoad;
1761 Infos.push_back(Info);
1762
1763 // Entry 1: Store to LDS.
1764 // Same offset from the instruction, but an additional per-lane offset is
1765 // added. Represent that using a wider memory type.
1766 Info.memVT = EVT::getIntegerVT(CI.getContext(),
1767 Width * 8 * Subtarget->getWavefrontSize());
1768 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
1769 Info.flags = Flags | MachineMemOperand::MOStore;
1770 Infos.push_back(Info);
1771 return;
1772 }
1773 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1774 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1775 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1776 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1777 Info.opc = ISD::INTRINSIC_W_CHAIN;
1778
1779 const GCNTargetMachine &TM =
1780 static_cast<const GCNTargetMachine &>(getTargetMachine());
1781
1783 Info.ptrVal = MFI->getGWSPSV(TM);
1784
1785 // This is an abstract access, but we need to specify a type and size.
1786 Info.memVT = MVT::i32;
1787 Info.size = 4;
1788 Info.align = Align(4);
1789
1791 Infos.push_back(Info);
1792 return;
1793 }
1794 case Intrinsic::amdgcn_s_prefetch_data:
1795 case Intrinsic::amdgcn_flat_prefetch:
1796 case Intrinsic::amdgcn_global_prefetch: {
1797 Info.opc = ISD::INTRINSIC_VOID;
1798 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1799 Info.ptrVal = CI.getArgOperand(0);
1800 Info.flags = Flags | MachineMemOperand::MOLoad;
1801 Infos.push_back(Info);
1802 return;
1803 }
1804 default:
1805 return;
1806 }
1807}
1808
1810 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1812 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1813 // The DAG's ValueType loses the addrspaces.
1814 // Add them as 2 extra Constant operands "from" and "to".
1815 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1816 unsigned DstAS = I.getType()->getPointerAddressSpace();
1817 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1818 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1819 break;
1820 }
1821 default:
1822 break;
1823 }
1824}
1825
1828 Type *&AccessTy) const {
1829 Value *Ptr = nullptr;
1830 switch (II->getIntrinsicID()) {
1831 case Intrinsic::amdgcn_cluster_load_b128:
1832 case Intrinsic::amdgcn_cluster_load_b64:
1833 case Intrinsic::amdgcn_cluster_load_b32:
1834 case Intrinsic::amdgcn_ds_append:
1835 case Intrinsic::amdgcn_ds_consume:
1836 case Intrinsic::amdgcn_ds_load_tr8_b64:
1837 case Intrinsic::amdgcn_ds_load_tr16_b128:
1838 case Intrinsic::amdgcn_ds_load_tr4_b64:
1839 case Intrinsic::amdgcn_ds_load_tr6_b96:
1840 case Intrinsic::amdgcn_ds_read_tr4_b64:
1841 case Intrinsic::amdgcn_ds_read_tr6_b96:
1842 case Intrinsic::amdgcn_ds_read_tr8_b64:
1843 case Intrinsic::amdgcn_ds_read_tr16_b64:
1844 case Intrinsic::amdgcn_ds_ordered_add:
1845 case Intrinsic::amdgcn_ds_ordered_swap:
1846 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1847 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1848 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1849 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1850 case Intrinsic::amdgcn_global_atomic_fmax_num:
1851 case Intrinsic::amdgcn_global_atomic_fmin_num:
1852 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1853 case Intrinsic::amdgcn_global_load_tr_b64:
1854 case Intrinsic::amdgcn_global_load_tr_b128:
1855 case Intrinsic::amdgcn_global_load_tr4_b64:
1856 case Intrinsic::amdgcn_global_load_tr6_b96:
1857 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1858 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1859 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1860 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1861 Ptr = II->getArgOperand(0);
1862 break;
1863 case Intrinsic::amdgcn_load_to_lds:
1864 case Intrinsic::amdgcn_load_async_to_lds:
1865 case Intrinsic::amdgcn_global_load_lds:
1866 case Intrinsic::amdgcn_global_load_async_lds:
1867 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1868 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1869 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1870 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1871 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1872 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1873 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1874 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1875 Ptr = II->getArgOperand(1);
1876 break;
1877 default:
1878 return false;
1879 }
1880 AccessTy = II->getType();
1881 Ops.push_back(Ptr);
1882 return true;
1883}
1884
1886 unsigned AddrSpace) const {
1887 if (!Subtarget->hasFlatInstOffsets()) {
1888 // Flat instructions do not have offsets, and only have the register
1889 // address.
1890 return AM.BaseOffs == 0 && AM.Scale == 0;
1891 }
1892
1893 decltype(SIInstrFlags::FLAT) FlatVariant =
1897
1898 return AM.Scale == 0 &&
1899 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1900 AM.BaseOffs, AddrSpace, FlatVariant));
1901}
1902
1904 if (Subtarget->hasFlatGlobalInsts())
1906
1907 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1908 // Assume we will use FLAT for all global memory accesses
1909 // on VI.
1910 // FIXME: This assumption is currently wrong. On VI we still use
1911 // MUBUF instructions for the r + i addressing mode. As currently
1912 // implemented, the MUBUF instructions only work on buffers < 4GB.
1913 // It may be possible to support > 4GB buffers with MUBUF instructions,
1914 // by setting the stride value in the resource descriptor which would
1915 // increase the size limit to (stride * 4GB). However, this is risky,
1916 // because it has never been validated.
1918 }
1919
1920 return isLegalMUBUFAddressingMode(AM);
1921}
1922
1923bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1924 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1925 // additionally can do r + r + i with addr64. 32-bit has more addressing
1926 // mode options. Depending on the resource constant, it can also do
1927 // (i64 r0) + (i32 r1) * (i14 i).
1928 //
1929 // Private arrays end up using a scratch buffer most of the time, so also
1930 // assume those use MUBUF instructions. Scratch loads / stores are currently
1931 // implemented as mubuf instructions with offen bit set, so they are slightly
1932 // different from the normal addr64.
1933 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1934 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1935 return false;
1936
1937 // FIXME: Since we can split immediate into soffset and immediate offset,
1938 // would it make sense to allow any immediate?
1939
1940 switch (AM.Scale) {
1941 case 0: // r + i or just i, depending on HasBaseReg.
1942 return true;
1943 case 1:
1944 return true; // We have r + r or r + i.
1945 case 2:
1946 if (AM.HasBaseReg) {
1947 // Reject 2 * r + r.
1948 return false;
1949 }
1950
1951 // Allow 2 * r as r + r
1952 // Or 2 * r + i is allowed as r + r + i.
1953 return true;
1954 default: // Don't allow n * r
1955 return false;
1956 }
1957}
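// Illustrative examples of the cases above: {BaseOffs=16, Scale=0} is r + i
// (or just i) and is accepted; {Scale=1} is r + r or r + i and is accepted;
// {Scale=2, HasBaseReg=true} would be 2 * r + r and is rejected, while
// {Scale=2, HasBaseReg=false} is accepted because 2 * r folds into r + r.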
1958
1959bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
1960 const AddrMode &AM, Type *Ty,
1961 unsigned AS,
1962 Instruction *I) const {
1963 // No global is ever allowed as a base.
1964 if (AM.BaseGV)
1965 return false;
1966
1967 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
1968 return isLegalGlobalAddressingMode(AM);
1969
1970 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
1971 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
1972 AS == AMDGPUAS::BUFFER_FAT_POINTER || AS == AMDGPUAS::BUFFER_RESOURCE ||
1973 AS == AMDGPUAS::BUFFER_STRIDED_POINTER) {
1974 // If the offset isn't a multiple of 4, it probably isn't going to be
1975 // correctly aligned.
1976 // FIXME: Can we get the real alignment here?
1977 if (AM.BaseOffs % 4 != 0)
1978 return isLegalMUBUFAddressingMode(AM);
1979
1980 if (!Subtarget->hasScalarSubwordLoads()) {
1981 // There are no SMRD extloads, so if we have to do a small type access we
1982 // will use a MUBUF load.
1983 // FIXME?: We also need to do this if unaligned, but we don't know the
1984 // alignment here.
1985 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
1986 return isLegalGlobalAddressingMode(AM);
1987 }
1988
1989 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
1990 // SMRD instructions have an 8-bit, dword offset on SI.
1991 if (!isUInt<8>(AM.BaseOffs / 4))
1992 return false;
1993 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
1994 // On CI+, this can also be a 32-bit literal constant offset. If it fits
1995 // in 8-bits, it can use a smaller encoding.
1996 if (!isUInt<32>(AM.BaseOffs / 4))
1997 return false;
1998 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
1999 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
2000 if (!isUInt<20>(AM.BaseOffs))
2001 return false;
2002 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
2003 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
2004 // for S_BUFFER_* instructions).
2005 if (!isInt<21>(AM.BaseOffs))
2006 return false;
2007 } else {
2008 // On GFX12, all offsets are signed 24-bit in bytes.
2009 if (!isInt<24>(AM.BaseOffs))
2010 return false;
2011 }
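// In summary, the SMRD immediate offset checked above is: an unsigned 8-bit
// dword offset on SI, an unsigned 32-bit dword offset on CI, an unsigned
// 20-bit byte offset on VI, a signed 21-bit byte offset on GFX9-GFX11, and a
// signed 24-bit byte offset on GFX12.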
2012
2013 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
2014 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
2015 AM.BaseOffs < 0) {
2016 // Scalar (non-buffer) loads can only use a negative offset if
2017 // soffset+offset is non-negative. Since the compiler can only prove that
2018 // in a few special cases, it is safer to claim that negative offsets are
2019 // not supported.
2020 return false;
2021 }
2022
2023 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2024 return true;
2025
2026 if (AM.Scale == 1 && AM.HasBaseReg)
2027 return true;
2028
2029 return false;
2030 }
2031
2032 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
2033 return Subtarget->hasFlatScratchEnabled()
2034 ? isLegalFlatAddressingMode(AM, AMDGPUAS::PRIVATE_ADDRESS)
2035 : isLegalMUBUFAddressingMode(AM);
2036
2037 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
2038 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
2039 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
2040 // field.
2041 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
2042 // an 8-bit dword offset but we don't know the alignment here.
2043 if (!isUInt<16>(AM.BaseOffs))
2044 return false;
2045
2046 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2047 return true;
2048
2049 if (AM.Scale == 1 && AM.HasBaseReg)
2050 return true;
2051
2052 return false;
2053 }
2054
2055 if (AS == AMDGPUAS::FLAT_ADDRESS || AS == AMDGPUAS::UNKNOWN_ADDRESS_SPACE) {
2056 // For an unknown address space, this usually means that this is for some
2057 // reason being used for pure arithmetic, and not based on some addressing
2058 // computation. We don't have instructions that compute pointers with any
2059 // addressing modes, so treat them as having no offset like flat
2060 // instructions.
2061 return isLegalFlatAddressingMode(AM, AMDGPUAS::FLAT_ADDRESS);
2062 }
2063
2064 // Assume a user alias of global for unknown address spaces.
2065 return isLegalGlobalAddressingMode(AM);
2066}
2067
2068bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
2069 const MachineFunction &MF) const {
2070 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS)
2071 return (MemVT.getSizeInBits() <= 4 * 32);
2072 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2073 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2074 return (MemVT.getSizeInBits() <= MaxPrivateBits);
2075 }
2076 if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS)
2077 return (MemVT.getSizeInBits() <= 2 * 32);
2078 return true;
2079}
2080
2081bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
2082 unsigned Size, unsigned AddrSpace, Align Alignment,
2083 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
2084 if (IsFast)
2085 *IsFast = 0;
2086
2087 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
2088 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2089 // Check if alignment requirements for ds_read/write instructions are
2090 // disabled.
2091 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2092 return false;
2093
2094 Align RequiredAlignment(
2095 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
2096 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
2097 Alignment < RequiredAlignment)
2098 return false;
2099
2100 // Either the alignment requirements are "enabled", or there is an
2101 // unaligned LDS access related hardware bug even though alignment
2102 // requirements are "disabled". In either case, we need to check for proper
2103 // alignment requirements.
2104 //
2105 switch (Size) {
2106 case 64:
2107 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2108 // address is negative, then the instruction is incorrectly treated as
2109 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2110 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2111 // load later in the SILoadStoreOptimizer.
2112 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2113 return false;
2114
2115 // 8-byte accesses via ds_read/write_b64 require 8-byte alignment, but we
2116 // can do a 4-byte aligned, 8-byte access in a single operation using
2117 // ds_read2/write2_b32 with adjacent offsets.
2118 RequiredAlignment = Align(4);
2119
2120 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2121 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2122 // ds_write2_b32 depending on the alignment. In either case with either
2123 // alignment there is no faster way of doing this.
2124
2125 // The numbers returned here and below are not additive, it is a 'speed
2126 // rank'. They are just meant to be compared to decide if a certain way
2127 // of lowering an operation is faster than another. For that purpose a
2128 // naturally aligned operation gets its bitsize to indicate that "it
2129 // operates with a speed comparable to an N-bit wide load". With the full
2130 // alignment ds128 is slower than ds96 for example. If underaligned it
2131 // is comparable to the speed of a single dword access, which would then
2132 // mean 32 < 128 and it is faster to issue a wide load regardless.
2133 // 1 is simply "slow, don't do it". I.e. when comparing an aligned load to
2134 // a wider load which will not be aligned anymore, the latter is slower.
2135 if (IsFast)
2136 *IsFast = (Alignment >= RequiredAlignment) ? 64
2137 : (Alignment < Align(4)) ? 32
2138 : 1;
2139 return true;
2140 }
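// For example, a 64-bit LDS access that is at least 4-byte aligned reports a
// speed rank of 64 (ds_read_b64 or an equivalent ds_read2_b32 pair), while
// one aligned below 4 bytes reports 32, i.e. single-dword speed.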
2141
2142 break;
2143 case 96:
2144 if (!Subtarget->hasDS96AndDS128())
2145 return false;
2146
2147 // 12-byte accesses via ds_read/write_b96 require 16-byte alignment on
2148 // gfx8 and older.
2149
2150 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2151 // Naturally aligned access is fastest. However, also report it is Fast
2152 // if memory is aligned to less than a dword. A narrow load or store will
2153 // be as slow as a single ds_read_b96/ds_write_b96, but there will
2154 // be more of them, so overall we will pay less penalty issuing a single
2155 // instruction.
2156
2157 // See comment on the values above.
2158 if (IsFast)
2159 *IsFast = (Alignment >= RequiredAlignment) ? 96
2160 : (Alignment < Align(4)) ? 32
2161 : 1;
2162 return true;
2163 }
2164
2165 break;
2166 case 128:
2167 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2168 return false;
2169
2170 // 16-byte accesses via ds_read/write_b128 require 16-byte alignment on
2171 // gfx8 and older, but we can do an 8-byte aligned, 16-byte access in a
2172 // single operation using ds_read2/write2_b64.
2173 RequiredAlignment = Align(8);
2174
2175 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2176 // Naturally aligned access is fastest. However, also report it is Fast
2177 // if memory is aligned to less than a dword. A narrow load or store will
2178 // be as slow as a single ds_read_b128/ds_write_b128, but there
2179 // will be more of them, so overall we will pay less penalty issuing a
2180 // single instruction.
2181
2182 // See comment on the values above.
2183 if (IsFast)
2184 *IsFast = (Alignment >= RequiredAlignment) ? 128
2185 : (Alignment < Align(4)) ? 32
2186 : 1;
2187 return true;
2188 }
2189
2190 break;
2191 default:
2192 if (Size > 32)
2193 return false;
2194
2195 break;
2196 }
2197
2198 // See comment on the values above.
2199 // Note that we have a single-dword or sub-dword here, so if underaligned
2200 // it is a slowest possible access, hence returned value is 0.
2201 if (IsFast)
2202 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2203
2204 return Alignment >= RequiredAlignment ||
2205 Subtarget->hasUnalignedDSAccessEnabled();
2206 }
2207
2208 // FIXME: We have to be conservative here and assume that flat operations
2209 // will access scratch. If we had access to the IR function, then we
2210 // could determine if any private memory was used in the function.
2211 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2212 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2213 bool AlignedBy4 = Alignment >= Align(4);
2214 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2215 if (IsFast)
2216 *IsFast = AlignedBy4 ? Size : 1;
2217 return true;
2218 }
2219
2220 if (IsFast)
2221 *IsFast = AlignedBy4;
2222
2223 return AlignedBy4;
2224 }
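// E.g. with unaligned scratch access enabled, a 4-byte aligned flat or
// private access reports a speed rank equal to its size and an under-aligned
// one reports 1; without it, under-aligned accesses are rejected outright.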
2225
2226 // So long as they are correct, wide global memory operations perform better
2227 // than multiple smaller memory ops -- even when misaligned
2228 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2229 if (IsFast)
2230 *IsFast = Size;
2231
2232 return Alignment >= Align(4) ||
2233 Subtarget->hasUnalignedBufferAccessEnabled();
2234 }
2235
2236 // Ensure robust out-of-bounds guarantees for buffer accesses are met if
2237 // RelaxedBufferOOBMode is disabled. Normally hardware will ensure proper
2238 // out-of-bounds behavior, but in the edge case where an access starts
2239 // out-of-bounds and then enters in-bounds, the entire access would be treated
2240 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2241 // natural alignment of buffer accesses.
2242 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2243 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2244 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2245 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2246 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2247 return false;
2248 }
2249
2250 // Smaller than dword value must be aligned.
2251 if (Size < 32)
2252 return false;
2253
2254 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2255 // byte-address are ignored, thus forcing Dword alignment.
2256 // This applies to private, global, and constant memory.
2257 if (IsFast)
2258 *IsFast = 1;
2259
2260 return Size >= 32 && Alignment >= Align(4);
2261}
2262
2263bool SITargetLowering::allowsMisalignedMemoryAccesses(
2264 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2265 unsigned *IsFast) const {
2266 return allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AddrSpace,
2267 Alignment, Flags, IsFast);
2268}
2269
2270EVT SITargetLowering::getOptimalMemOpType(
2271 LLVMContext &Context, const MemOp &Op,
2272 const AttributeList &FuncAttributes) const {
2273 // FIXME: Should account for address space here.
2274
2275 // The default fallback uses the private pointer size as a guess for a type to
2276 // use. Make sure we switch these to 64-bit accesses.
2277
2278 if (Op.size() >= 16 &&
2279 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2280 return MVT::v4i32;
2281
2282 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2283 return MVT::v2i32;
2284
2285 // Use the default.
2286 return MVT::Other;
2287}
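// For example, a 32-byte memcpy whose destination is at least 4-byte aligned
// is widened to v4i32 accesses here, and an 8-byte copy with the same
// alignment uses v2i32, instead of the default private-pointer-sized type.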
2288
2289bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
2290 const MemSDNode *MemNode = cast<MemSDNode>(N);
2291 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2292}
2293
2298
2299bool SITargetLowering::isFreeAddrSpaceCast(unsigned SrcAS,
2300 unsigned DestAS) const {
2301 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2302 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2303 Subtarget->hasGloballyAddressableScratch()) {
2304 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2305 return false;
2306 }
2307
2308 // Flat -> private/local is a simple truncate.
2309 // Flat -> global is no-op
2310 return true;
2311 }
2312
2313 const GCNTargetMachine &TM =
2314 static_cast<const GCNTargetMachine &>(getTargetMachine());
2315 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2316}
2317
2325
2326bool SITargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
2327 Type *Ty) const {
2328 // FIXME: Could be smarter if called for vector constants.
2329 return true;
2330}
2331
2332bool SITargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
2333 unsigned Index) const {
2334 if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
2335 return false;
2336
2337 // TODO: Add more cases that are cheap.
2338 return Index == 0;
2339}
2340
2341bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2342 // TODO: This should be more aggressive, particular for 16-bit element
2343 // vectors. However there are some mixed improvements and regressions.
2344 EVT EltTy = VT.getVectorElementType();
2345 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2346 return EltTy.getSizeInBits() % MinAlign == 0;
2347}
2348
2349bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
2350 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2351 switch (Op) {
2352 case ISD::LOAD:
2353 case ISD::STORE:
2354 return true;
2355 default:
2356 return false;
2357 }
2358 }
2359
2360 // SimplifySetCC uses this function to determine whether or not it should
2361 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2362 if (VT == MVT::i1 && Op == ISD::SETCC)
2363 return false;
2364
2365 return TargetLowering::isTypeDesirableForOp(Op, VT);
2366}
2367
2370 // This isn't really a constant pool but close enough.
2373 return PtrInfo;
2374}
2375
2376SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2377 const SDLoc &SL,
2378 SDValue Chain,
2379 uint64_t Offset) const {
2380 const DataLayout &DL = DAG.getDataLayout();
2381 MachineFunction &MF = DAG.getMachineFunction();
2382 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
2383 MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
2384
2385 auto [InputPtrReg, RC, ArgTy] =
2386 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2387
2388 // We may not have the kernarg segment argument if we have no kernel
2389 // arguments.
2390 if (!InputPtrReg)
2391 return DAG.getConstant(Offset, SL, PtrVT);
2392
2394 SDValue BasePtr = DAG.getCopyFromReg(
2395 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2396
2397 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2398}
2399
2400SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2401 const SDLoc &SL) const {
2402 uint64_t Offset = getImplicitParameterOffset(DAG.getMachineFunction(),
2403 FIRST_IMPLICIT);
2404 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2405}
2406
2407SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2408 const SDLoc &SL) const {
2409
2410 const Function &F = DAG.getMachineFunction().getFunction();
2411 std::optional<uint32_t> KnownSize =
2412 AMDGPUMachineFunction::getLDSKernelIdMetadata(F);
2413 if (KnownSize.has_value())
2414 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2415 return SDValue();
2416}
2417
2418SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2419 const SDLoc &SL, SDValue Val,
2420 bool Signed,
2421 const ISD::InputArg *Arg) const {
2422 // First, if it is a widened vector, narrow it.
2423 if (VT.isVector() &&
2424 VT.getVectorNumElements() != MemVT.getVectorNumElements()) {
2425 EVT NarrowedVT =
2426 EVT::getVectorVT(*DAG.getContext(), MemVT.getVectorElementType(),
2427 VT.getVectorNumElements());
2428 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2429 DAG.getConstant(0, SL, MVT::i32));
2430 }
2431
2432 // Then convert the vector elements or scalar value.
2433 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2434 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2435 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2436 }
2437
2438 if (MemVT.isFloatingPoint()) {
2439 if (VT.isFloatingPoint()) {
2440 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2441 } else {
2442 assert(!MemVT.isVector());
2443 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
2444 SDValue Cast = DAG.getBitcast(IntVT, Val);
2445 Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
2446 }
2447 } else if (Signed)
2448 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2449 else
2450 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2451
2452 return Val;
2453}
2454
2455SDValue SITargetLowering::lowerKernargMemParameter(
2456 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2457 uint64_t Offset, Align Alignment, bool Signed,
2458 const ISD::InputArg *Arg) const {
2459
2460 MachinePointerInfo PtrInfo =
2462
2463 // Try to avoid using an extload by loading earlier than the argument address,
2464 // and extracting the relevant bits. The load should hopefully be merged with
2465 // the previous argument.
2466 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2467 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2468 int64_t AlignDownOffset = alignDown(Offset, 4);
2469 int64_t OffsetDiff = Offset - AlignDownOffset;
2470
2471 EVT IntVT = MemVT.changeTypeToInteger();
2472
2473 // TODO: If we passed in the base kernel offset we could have a better
2474 // alignment than 4, but we don't really need it.
2475 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2476 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2477 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2478 MachineMemOperand::MODereferenceable |
2479 MachineMemOperand::MOInvariant);
2480
2481 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2482 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2483
2484 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2485 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2486 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2487
2488 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2489 }
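// Worked example: an i16 argument at byte offset 2 gets AlignDownOffset = 0
// and OffsetDiff = 2, so the dword at offset 0 is loaded and shifted right by
// 16 bits before being truncated to the 16-bit argument value.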
2490
2491 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2492 SDValue Load = DAG.getLoad(
2493 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2494 MachineMemOperand::MODereferenceable | MachineMemOperand::MOInvariant);
2495
2496 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2497 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2498}
2499
2500/// Coerce an argument which was passed in a different ABI type to the original
2501/// expected value type.
2502SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2503 SDValue Val,
2504 CCValAssign &VA,
2505 const SDLoc &SL) const {
2506 EVT ValVT = VA.getValVT();
2507
2508 // If this is an 8 or 16-bit value, it is really passed promoted
2509 // to 32 bits. Insert an assert[sz]ext to capture this, then
2510 // truncate to the right size.
2511 switch (VA.getLocInfo()) {
2512 case CCValAssign::Full:
2513 return Val;
2514 case CCValAssign::BCvt:
2515 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2516 case CCValAssign::SExt:
2517 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2518 DAG.getValueType(ValVT));
2519 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2520 case CCValAssign::ZExt:
2521 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2522 DAG.getValueType(ValVT));
2523 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2524 case CCValAssign::AExt:
2525 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2526 default:
2527 llvm_unreachable("Unknown loc info!");
2528 }
2529}
2530
2531SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2532 CCValAssign &VA, const SDLoc &SL,
2533 SDValue Chain,
2534 const ISD::InputArg &Arg) const {
2535 MachineFunction &MF = DAG.getMachineFunction();
2536 MachineFrameInfo &MFI = MF.getFrameInfo();
2537
2538 if (Arg.Flags.isByVal()) {
2539 unsigned Size = Arg.Flags.getByValSize();
2540 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2541 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2542 }
2543
2544 unsigned ArgOffset = VA.getLocMemOffset();
2545 unsigned ArgSize = VA.getValVT().getStoreSize();
2546
2547 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2548
2549 // Create load nodes to retrieve arguments from the stack.
2550 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2551
2552 // For NON_EXTLOAD, generic code in getLoad asserts that ValVT == MemVT.
2553 ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
2554 MVT MemVT = VA.getValVT();
2555
2556 switch (VA.getLocInfo()) {
2557 default:
2558 break;
2559 case CCValAssign::BCvt:
2560 MemVT = VA.getLocVT();
2561 break;
2562 case CCValAssign::SExt:
2563 ExtType = ISD::SEXTLOAD;
2564 break;
2565 case CCValAssign::ZExt:
2566 ExtType = ISD::ZEXTLOAD;
2567 break;
2568 case CCValAssign::AExt:
2569 ExtType = ISD::EXTLOAD;
2570 break;
2571 }
2572
2573 SDValue ArgValue = DAG.getExtLoad(
2574 ExtType, SL, VA.getLocVT(), Chain, FIN,
2575 MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), MemVT);
2576
2577 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2578 if (ConvertedVal == ArgValue)
2579 return ConvertedVal;
2580
2581 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2582}
2583
2584SDValue SITargetLowering::lowerWorkGroupId(
2585 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2586 AMDGPUFunctionArgInfo::PreloadedValue WorkGroupIdPV,
2587 AMDGPUFunctionArgInfo::PreloadedValue ClusterMaxIdPV,
2588 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2589 if (!Subtarget->hasClusters())
2590 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2591
2592 // Clusters are supported. Return the global position in the grid. If clusters
2593 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2594
2595 // WorkGroupIdXYZ = ClusterId == 0 ?
2596 // ClusterIdXYZ :
2597 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
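// For example, with 4 workgroups per cluster in X (ClusterMaxIdX = 3), cluster
// id 2 and cluster-local workgroup id 1 yield a global workgroup id of
// 2 * (3 + 1) + 1 = 9.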
2598 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2599 SDLoc SL(ClusterIdXYZ);
2600 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2601 SDValue One = DAG.getConstant(1, SL, VT);
2602 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2603 SDValue ClusterWorkGroupIdXYZ =
2604 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2605 SDValue GlobalIdXYZ =
2606 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2607 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2608
2609 switch (MFI.getClusterDims().getKind()) {
2612 return GlobalIdXYZ;
2614 return ClusterIdXYZ;
2616 using namespace AMDGPU::Hwreg;
2617 SDValue ClusterIdField =
2618 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2619 SDNode *GetReg =
2620 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2621 SDValue ClusterId(GetReg, 0);
2622 SDValue Zero = DAG.getConstant(0, SL, VT);
2623 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2624 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2625 }
2626 }
2627
2628 llvm_unreachable("nothing should reach here");
2629}
2630
2631SDValue SITargetLowering::getPreloadedValue(
2632 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2634 const ArgDescriptor *Reg = nullptr;
2635 const TargetRegisterClass *RC;
2636 LLT Ty;
2637
2639 const ArgDescriptor WorkGroupIDX =
2640 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2641 // If GridZ is not programmed in an entry function then the hardware will set
2642 // it to all zeros, so there is no need to mask the GridY value in the low
2643 // order bits.
2644 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2645 AMDGPU::TTMP7,
2646 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2647 const ArgDescriptor WorkGroupIDZ =
2648 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2649 const ArgDescriptor ClusterWorkGroupIDX =
2650 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2651 const ArgDescriptor ClusterWorkGroupIDY =
2652 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2653 const ArgDescriptor ClusterWorkGroupIDZ =
2654 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2655 const ArgDescriptor ClusterWorkGroupMaxIDX =
2656 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2657 const ArgDescriptor ClusterWorkGroupMaxIDY =
2658 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2659 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2660 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2661 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2662 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
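// In this layout TTMP9 holds the X workgroup id, TTMP7 packs Y in its low 16
// bits and Z in its high 16 bits, and TTMP6 packs the cluster-local workgroup
// ids and their maxima as consecutive 4-bit fields, with the max flat id in
// bits [27:24].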
2663
2664 auto LoadConstant = [&](unsigned N) {
2665 return DAG.getConstant(N, SDLoc(), VT);
2666 };
2667
2668 if (Subtarget->hasArchitectedSGPRs() &&
2670 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2671 bool HasFixedDims = ClusterDims.isFixedDims();
2672
2673 switch (PVID) {
2674 case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
2675 Reg = &WorkGroupIDX;
2676 RC = &AMDGPU::SReg_32RegClass;
2677 Ty = LLT::scalar(32);
2678 break;
2679 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
2680 Reg = &WorkGroupIDY;
2681 RC = &AMDGPU::SReg_32RegClass;
2682 Ty = LLT::scalar(32);
2683 break;
2684 case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
2685 Reg = &WorkGroupIDZ;
2686 RC = &AMDGPU::SReg_32RegClass;
2687 Ty = LLT::scalar(32);
2688 break;
2690 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2691 return LoadConstant(0);
2692 Reg = &ClusterWorkGroupIDX;
2693 RC = &AMDGPU::SReg_32RegClass;
2694 Ty = LLT::scalar(32);
2695 break;
2697 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2698 return LoadConstant(0);
2699 Reg = &ClusterWorkGroupIDY;
2700 RC = &AMDGPU::SReg_32RegClass;
2701 Ty = LLT::scalar(32);
2702 break;
2704 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2705 return LoadConstant(0);
2706 Reg = &ClusterWorkGroupIDZ;
2707 RC = &AMDGPU::SReg_32RegClass;
2708 Ty = LLT::scalar(32);
2709 break;
2711 if (HasFixedDims)
2712 return LoadConstant(ClusterDims.getDims()[0] - 1);
2713 Reg = &ClusterWorkGroupMaxIDX;
2714 RC = &AMDGPU::SReg_32RegClass;
2715 Ty = LLT::scalar(32);
2716 break;
2718 if (HasFixedDims)
2719 return LoadConstant(ClusterDims.getDims()[1] - 1);
2720 Reg = &ClusterWorkGroupMaxIDY;
2721 RC = &AMDGPU::SReg_32RegClass;
2722 Ty = LLT::scalar(32);
2723 break;
2725 if (HasFixedDims)
2726 return LoadConstant(ClusterDims.getDims()[2] - 1);
2727 Reg = &ClusterWorkGroupMaxIDZ;
2728 RC = &AMDGPU::SReg_32RegClass;
2729 Ty = LLT::scalar(32);
2730 break;
2732 Reg = &ClusterWorkGroupMaxFlatID;
2733 RC = &AMDGPU::SReg_32RegClass;
2734 Ty = LLT::scalar(32);
2735 break;
2736 default:
2737 break;
2738 }
2739 }
2740
2741 if (!Reg)
2742 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2743 if (!Reg) {
2744 if (PVID == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) {
2745 // It's possible for a kernarg intrinsic call to appear in a kernel with
2746 // no allocated segment, in which case we do not add the user sgpr
2747 // argument, so just return null.
2748 return DAG.getConstant(0, SDLoc(), VT);
2749 }
2750
2751 // It's undefined behavior if a function marked with the amdgpu-no-*
2752 // attributes uses the corresponding intrinsic.
2753 return DAG.getPOISON(VT);
2754 }
2755
2756 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2757}
2758
2759static void processPSInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
2760 CallingConv::ID CallConv,
2761 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2762 FunctionType *FType,
2763 SIMachineFunctionInfo *Info) {
2764 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2765 const ISD::InputArg *Arg = &Ins[I];
2766
2767 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2768 "vector type argument should have been split");
2769
2770 // First check if it's a PS input addr.
2771 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2772 PSInputNum <= 15) {
2773 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2774
2775 // Inconveniently only the first part of the split is marked as isSplit,
2776 // so skip to the end. We only want to increment PSInputNum once for the
2777 // entire split argument.
2778 if (Arg->Flags.isSplit()) {
2779 while (!Arg->Flags.isSplitEnd()) {
2780 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2781 "unexpected vector split in ps argument type");
2782 if (!SkipArg)
2783 Splits.push_back(*Arg);
2784 Arg = &Ins[++I];
2785 }
2786 }
2787
2788 if (SkipArg) {
2789 // We can safely skip PS inputs.
2790 Skipped.set(Arg->getOrigArgIndex());
2791 ++PSInputNum;
2792 continue;
2793 }
2794
2795 Info->markPSInputAllocated(PSInputNum);
2796 if (Arg->Used)
2797 Info->markPSInputEnabled(PSInputNum);
2798
2799 ++PSInputNum;
2800 }
2801
2802 Splits.push_back(*Arg);
2803 }
2804}
2805
2806// Allocate special inputs passed in VGPRs.
2807void SITargetLowering::allocateSpecialEntryInputVGPRs(
2808 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2809 SIMachineFunctionInfo &Info) const {
2810 const LLT S32 = LLT::scalar(32);
2811 MachineRegisterInfo &MRI = MF.getRegInfo();
2812
2813 if (Info.hasWorkItemIDX()) {
2814 Register Reg = AMDGPU::VGPR0;
2815 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2816
2817 CCInfo.AllocateReg(Reg);
2818 unsigned Mask =
2819 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2820 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2821 }
2822
2823 if (Info.hasWorkItemIDY()) {
2824 assert(Info.hasWorkItemIDX());
2825 if (Subtarget->hasPackedTID()) {
2826 Info.setWorkItemIDY(
2827 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2828 } else {
2829 unsigned Reg = AMDGPU::VGPR1;
2830 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2831
2832 CCInfo.AllocateReg(Reg);
2833 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2834 }
2835 }
2836
2837 if (Info.hasWorkItemIDZ()) {
2838 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2839 if (Subtarget->hasPackedTID()) {
2840 Info.setWorkItemIDZ(
2841 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2842 } else {
2843 unsigned Reg = AMDGPU::VGPR2;
2844 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2845
2846 CCInfo.AllocateReg(Reg);
2847 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2848 }
2849 }
2850}
2851
2852// Try to allocate a VGPR at the end of the argument list, or if no argument
2853// VGPRs are left, allocate a stack slot.
2854// If \p Mask is given it indicates the bitfield position in the register.
2855// If \p Arg is given, use it with the new \p Mask instead of allocating a new one.
2856static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2857 ArgDescriptor Arg = ArgDescriptor()) {
2858 if (Arg.isSet())
2859 return ArgDescriptor::createArg(Arg, Mask);
2860
2861 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2862 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2863 if (RegIdx == ArgVGPRs.size()) {
2864 // Spill to stack required.
2865 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2866
2867 return ArgDescriptor::createStack(Offset, Mask);
2868 }
2869
2870 unsigned Reg = ArgVGPRs[RegIdx];
2871 Reg = CCInfo.AllocateReg(Reg);
2872 assert(Reg != AMDGPU::NoRegister);
2873
2874 MachineFunction &MF = CCInfo.getMachineFunction();
2875 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2876 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2877 return ArgDescriptor::createRegister(Reg, Mask);
2878}
2879
2880static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo,
2881 const TargetRegisterClass *RC,
2882 unsigned NumArgRegs) {
2883 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2884 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2885 if (RegIdx == ArgSGPRs.size())
2886 report_fatal_error("ran out of SGPRs for arguments");
2887
2888 unsigned Reg = ArgSGPRs[RegIdx];
2889 Reg = CCInfo.AllocateReg(Reg);
2890 assert(Reg != AMDGPU::NoRegister);
2891
2892 MachineFunction &MF = CCInfo.getMachineFunction();
2893 MF.addLiveIn(Reg, RC);
2894 return ArgDescriptor::createRegister(Reg);
2895}
2896
2897// If this has a fixed position, we still should allocate the register in the
2898// CCInfo state. Technically we could get away with this for values passed
2899// outside of the normal argument range.
2900static void allocateFixedSGPRInputImpl(CCState &CCInfo,
2901 const TargetRegisterClass *RC,
2902 MCRegister Reg) {
2903 Reg = CCInfo.AllocateReg(Reg);
2904 assert(Reg != AMDGPU::NoRegister);
2905 MachineFunction &MF = CCInfo.getMachineFunction();
2906 MF.addLiveIn(Reg, RC);
2907}
2908
2909static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2910 if (Arg) {
2911 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2912 Arg.getRegister());
2913 } else
2914 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2915}
2916
2917static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2918 if (Arg) {
2919 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2920 Arg.getRegister());
2921 } else
2922 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2923}
2924
2925/// Allocate implicit function VGPR arguments at the end of allocated user
2926/// arguments.
2927void SITargetLowering::allocateSpecialInputVGPRs(
2928 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2929 SIMachineFunctionInfo &Info) const {
2930 const unsigned Mask = 0x3ff;
2931 ArgDescriptor Arg;
2932
2933 if (Info.hasWorkItemIDX()) {
2934 Arg = allocateVGPR32Input(CCInfo, Mask);
2935 Info.setWorkItemIDX(Arg);
2936 }
2937
2938 if (Info.hasWorkItemIDY()) {
2939 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2940 Info.setWorkItemIDY(Arg);
2941 }
2942
2943 if (Info.hasWorkItemIDZ())
2944 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2945}
2946
2947/// Allocate implicit function VGPR arguments in fixed registers.
2948void SITargetLowering::allocateSpecialInputVGPRsFixed(
2949 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2950 SIMachineFunctionInfo &Info) const {
2951 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
2952 if (!Reg)
2953 report_fatal_error("failed to allocate VGPR for implicit arguments");
2954
2955 const unsigned Mask = 0x3ff;
2956 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2957 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
2958 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
2959}
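// With this fixed layout all three workitem ids share VGPR31: X occupies bits
// [9:0], Y bits [19:10] and Z bits [29:20], matching the masks above.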
2960
2961void SITargetLowering::allocateSpecialInputSGPRs(
2962 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2963 SIMachineFunctionInfo &Info) const {
2964 auto &ArgInfo = Info.getArgInfo();
2965 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
2966
2967 // TODO: Unify handling with private memory pointers.
2968 if (UserSGPRInfo.hasDispatchPtr())
2969 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
2970
2971 if (UserSGPRInfo.hasQueuePtr())
2972 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
2973
2974 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
2975 // constant offset from the kernarg segment.
2976 if (Info.hasImplicitArgPtr())
2977 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
2978
2979 if (UserSGPRInfo.hasDispatchID())
2980 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
2981
2982 // flat_scratch_init is not applicable for non-kernel functions.
2983
2984 if (Info.hasWorkGroupIDX())
2985 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
2986
2987 if (Info.hasWorkGroupIDY())
2988 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
2989
2990 if (Info.hasWorkGroupIDZ())
2991 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
2992
2993 if (Info.hasLDSKernelId())
2994 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
2995}
2996
2997// Allocate special inputs passed in user SGPRs.
2998void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo,
2999 MachineFunction &MF,
3000 const SIRegisterInfo &TRI,
3001 SIMachineFunctionInfo &Info) const {
3002 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
3003 if (UserSGPRInfo.hasImplicitBufferPtr()) {
3004 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
3005 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
3006 CCInfo.AllocateReg(ImplicitBufferPtrReg);
3007 }
3008
3009 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
3010 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
3011 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
3012 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
3013 CCInfo.AllocateReg(PrivateSegmentBufferReg);
3014 }
3015
3016 if (UserSGPRInfo.hasDispatchPtr()) {
3017 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
3018 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
3019 CCInfo.AllocateReg(DispatchPtrReg);
3020 }
3021
3022 if (UserSGPRInfo.hasQueuePtr()) {
3023 Register QueuePtrReg = Info.addQueuePtr(TRI);
3024 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3025 CCInfo.AllocateReg(QueuePtrReg);
3026 }
3027
3028 if (UserSGPRInfo.hasKernargSegmentPtr()) {
3029 MachineRegisterInfo &MRI = MF.getRegInfo();
3030 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
3031 CCInfo.AllocateReg(InputPtrReg);
3032
3033 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
3034 MRI.setType(VReg, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
3035 }
3036
3037 if (UserSGPRInfo.hasDispatchID()) {
3038 Register DispatchIDReg = Info.addDispatchID(TRI);
3039 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3040 CCInfo.AllocateReg(DispatchIDReg);
3041 }
3042
3043 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
3044 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
3045 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3046 CCInfo.AllocateReg(FlatScratchInitReg);
3047 }
3048
3049 if (UserSGPRInfo.hasPrivateSegmentSize()) {
3050 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
3051 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3052 CCInfo.AllocateReg(PrivateSegmentSizeReg);
3053 }
3054
3055 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
3056 // these from the dispatch pointer.
3057}
3058
3059// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
3060// sequential starting from the first argument.
3061void SITargetLowering::allocatePreloadKernArgSGPRs(
3062 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
3063 const SmallVectorImpl<ISD::InputArg> &Ins, MachineFunction &MF,
3064 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
3065 Function &F = MF.getFunction();
3066 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3067 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
3068 bool InPreloadSequence = true;
3069 unsigned InIdx = 0;
3070 bool AlignedForImplictArgs = false;
3071 unsigned ImplicitArgOffset = 0;
3072 for (auto &Arg : F.args()) {
3073 if (!InPreloadSequence || !Arg.hasInRegAttr())
3074 break;
3075
3076 unsigned ArgIdx = Arg.getArgNo();
3077 // Don't preload non-original args or parts not in the current preload
3078 // sequence.
3079 if (InIdx < Ins.size() &&
3080 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3081 break;
3082
3083 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
3084 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3085 InIdx++) {
3086 assert(ArgLocs[ArgIdx].isMemLoc());
3087 auto &ArgLoc = ArgLocs[InIdx];
3088 const Align KernelArgBaseAlign = Align(16);
3089 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3090 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
3091 unsigned NumAllocSGPRs =
3092 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3093
3094 // Fix alignment for hidden arguments.
3095 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
3096 if (!AlignedForImplictArgs) {
3097 ImplicitArgOffset =
3098 alignTo(LastExplicitArgOffset,
3099 Subtarget->getAlignmentForImplicitArgPtr()) -
3100 LastExplicitArgOffset;
3101 AlignedForImplictArgs = true;
3102 }
3103 ArgOffset += ImplicitArgOffset;
3104 }
3105
3106 // Arg is preloaded into the previous SGPR.
3107 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3108 assert(InIdx >= 1 && "No previous SGPR");
3109 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3110 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3111 continue;
3112 }
3113
3114 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3115 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3116 // Check for free user SGPRs for preloading.
3117 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3118 InPreloadSequence = false;
3119 break;
3120 }
3121
3122 // Preload this argument.
3123 const TargetRegisterClass *RC =
3124 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3125 SmallVectorImpl<MCRegister> *PreloadRegs =
3126 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3127
3128 if (PreloadRegs->size() > 1)
3129 RC = &AMDGPU::SGPR_32RegClass;
3130 for (auto &Reg : *PreloadRegs) {
3131 assert(Reg);
3132 MF.addLiveIn(Reg, RC);
3133 CCInfo.AllocateReg(Reg);
3134 }
3135
3136 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3137 }
3138 }
3139}
3140
3141void SITargetLowering::allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF,
3142 const SIRegisterInfo &TRI,
3143 SIMachineFunctionInfo &Info) const {
3144 // Always allocate this last since it is a synthetic preload.
3145 if (Info.hasLDSKernelId()) {
3146 Register Reg = Info.addLDSKernelId();
3147 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3148 CCInfo.AllocateReg(Reg);
3149 }
3150}
3151
3152// Allocate special input registers that are initialized per-wave.
3155 CallingConv::ID CallConv,
3156 bool IsShader) const {
3157 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3158 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3159 // Note: user SGPRs are handled by the front-end for graphics shaders
3160 // Pad up the used user SGPRs with dead inputs.
3161
3162 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3163 // before enabling architected SGPRs for workgroup IDs.
3164 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3165
3166 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3167 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3168 // rely on it to reach 16 since if we end up having no stack usage, it will
3169 // not really be added.
3170 unsigned NumRequiredSystemSGPRs =
3171 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3172 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
3173 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3174 Register Reg = Info.addReservedUserSGPR();
3175 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3176 CCInfo.AllocateReg(Reg);
3177 }
3178 }
3179
3180 if (!HasArchitectedSGPRs) {
3181 if (Info.hasWorkGroupIDX()) {
3182 Register Reg = Info.addWorkGroupIDX();
3183 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3184 CCInfo.AllocateReg(Reg);
3185 }
3186
3187 if (Info.hasWorkGroupIDY()) {
3188 Register Reg = Info.addWorkGroupIDY();
3189 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3190 CCInfo.AllocateReg(Reg);
3191 }
3192
3193 if (Info.hasWorkGroupIDZ()) {
3194 Register Reg = Info.addWorkGroupIDZ();
3195 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3196 CCInfo.AllocateReg(Reg);
3197 }
3198 }
3199
3200 if (Info.hasWorkGroupInfo()) {
3201 Register Reg = Info.addWorkGroupInfo();
3202 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3203 CCInfo.AllocateReg(Reg);
3204 }
3205
3206 if (Info.hasPrivateSegmentWaveByteOffset()) {
3207 // Scratch wave offset passed in system SGPR.
3208 unsigned PrivateSegmentWaveByteOffsetReg;
3209
3210 if (IsShader) {
3211 PrivateSegmentWaveByteOffsetReg =
3212 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3213
3214 // This is true if the scratch wave byte offset doesn't have a fixed
3215 // location.
3216 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3217 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3218 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3219 }
3220 } else
3221 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3222
3223 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3224 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3225 }
3226
3227 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3228 Info.getNumPreloadedSGPRs() >= 16);
3229}
3230
3231static void reservePrivateMemoryRegs(const TargetMachine &TM,
3232 MachineFunction &MF,
3233 const SIRegisterInfo &TRI,
3234 SIMachineFunctionInfo &Info) {
3235 // Now that we've figured out where the scratch register inputs are, see if
3236 // we should reserve the arguments and use them directly.
3237 MachineFrameInfo &MFI = MF.getFrameInfo();
3238 bool HasStackObjects = MFI.hasStackObjects();
3239 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3240
3241 // Record that we know we have non-spill stack objects so we don't need to
3242 // check all stack objects later.
3243 if (HasStackObjects)
3244 Info.setHasNonSpillStackObjects(true);
3245
3246 // Everything live out of a block is spilled with fast regalloc, so it's
3247 // almost certain that spilling will be required.
3248 if (TM.getOptLevel() == CodeGenOptLevel::None)
3249 HasStackObjects = true;
3250
3251 // For now assume stack access is needed in any callee functions, so we need
3252 // the scratch registers to pass in.
3253 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3254
3255 if (!ST.hasFlatScratchEnabled()) {
3256 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3257 // If we have stack objects, we unquestionably need the private buffer
3258 // resource. For the Code Object V2 ABI, this will be the first 4 user
3259 // SGPR inputs. We can reserve those and use them directly.
3260
3261 Register PrivateSegmentBufferReg =
3262 Info.getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
3263 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3264 } else {
3265 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3266 // We tentatively reserve the last registers (skipping the last registers
3267 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3268 // we'll replace these with the ones immediately after those which were
3269 // really allocated. In the prologue copies will be inserted from the
3270 // argument to these reserved registers.
3271
3272 // Without HSA, relocations are used for the scratch pointer and the
3273 // buffer resource setup is always inserted in the prologue. Scratch wave
3274 // offset is still in an input SGPR.
3275 Info.setScratchRSrcReg(ReservedBufferReg);
3276 }
3277 }
3278
3279 MachineRegisterInfo &MRI = MF.getRegInfo();
3280
3281 // For entry functions we have to set up the stack pointer if we use it,
3282 // whereas non-entry functions get this "for free". This means there is no
3283 // intrinsic advantage to using S32 over S34 in cases where we do not have
3284 // calls but do need a frame pointer (i.e. if we are requested to have one
3285 // because frame pointer elimination is disabled). To keep things simple we
3286 // only ever use S32 as the call ABI stack pointer, and so using it does not
3287 // imply we need a separate frame pointer.
3288 //
3289 // Try to use s32 as the SP, but move it if it would interfere with input
3290 // arguments. This won't work with calls though.
3291 //
3292 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3293 // registers.
3294 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3295 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3296 } else {
3298
3299 if (MFI.hasCalls())
3300 report_fatal_error("call in graphics shader with too many input SGPRs");
3301
3302 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3303 if (!MRI.isLiveIn(Reg)) {
3304 Info.setStackPtrOffsetReg(Reg);
3305 break;
3306 }
3307 }
3308
3309 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3310 report_fatal_error("failed to find register for SP");
3311 }
3312
3313 // hasFP should be accurate for entry functions even before the frame is
3314 // finalized, because it does not rely on the known stack size, only
3315 // properties like whether variable sized objects are present.
3316 if (ST.getFrameLowering()->hasFP(MF)) {
3317 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3318 }
3319}
3320
3323 return !Info->isEntryFunction();
3324}
3325
3327
3328void SITargetLowering::insertCopiesSplitCSR(
3329 MachineBasicBlock *Entry,
3330 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3331 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3332
3333 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3334 if (!IStart)
3335 return;
3336
3337 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3338 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3339 MachineBasicBlock::iterator MBBI = Entry->begin();
3340 for (const MCPhysReg *I = IStart; *I; ++I) {
3341 const TargetRegisterClass *RC = nullptr;
3342 if (AMDGPU::SReg_64RegClass.contains(*I))
3343 RC = &AMDGPU::SGPR_64RegClass;
3344 else if (AMDGPU::SReg_32RegClass.contains(*I))
3345 RC = &AMDGPU::SGPR_32RegClass;
3346 else
3347 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3348
3349 Register NewVR = MRI->createVirtualRegister(RC);
3350 // Create copy from CSR to a virtual register.
3351 Entry->addLiveIn(*I);
3352 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3353 .addReg(*I);
3354
3355 // Insert the copy-back instructions right before the terminator.
3356 for (auto *Exit : Exits)
3357 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3358 TII->get(TargetOpcode::COPY), *I)
3359 .addReg(NewVR);
3360 }
3361}
3362
3363SDValue SITargetLowering::LowerFormalArguments(
3364 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3365 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3366 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3367 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3368
3369 MachineFunction &MF = DAG.getMachineFunction();
3370 const Function &Fn = MF.getFunction();
3371 FunctionType *FType = MF.getFunction().getFunctionType();
3372 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
3373 bool IsError = false;
3374
3375 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3377 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3378 IsError = true;
3379 }
3380
3381 SmallVector<ISD::InputArg, 16> Splits;
3382 SmallVector<CCValAssign, 16> ArgLocs;
3383 BitVector Skipped(Ins.size());
3384 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3385 *DAG.getContext());
3386
3387 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3388 bool IsKernel = AMDGPU::isKernel(CallConv);
3389 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3390
3391 if (IsGraphics) {
3392 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3393 assert(!UserSGPRInfo.hasDispatchPtr() &&
3394 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3395 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3396 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3397 (void)UserSGPRInfo;
3398 if (!Subtarget->hasFlatScratchEnabled())
3399 assert(!UserSGPRInfo.hasFlatScratchInit());
3400 if ((CallConv != CallingConv::AMDGPU_CS &&
3401 CallConv != CallingConv::AMDGPU_Gfx &&
3402 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3403 !Subtarget->hasArchitectedSGPRs())
3404 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3405 !Info->hasWorkGroupIDZ());
3406 }
3407
3408 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3409
3410 if (CallConv == CallingConv::AMDGPU_PS) {
3411 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3412
3413 // At least one interpolation mode must be enabled or else the GPU will
3414 // hang.
3415 //
3416 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3417 // set PSInputAddr, the user wants to enable some bits after the compilation
3418 // based on run-time states. Since we can't know what the final PSInputEna
3419 // will look like, we shouldn't do anything here and the user should take
3420 // responsibility for the correct programming.
3421 //
3422 // Otherwise, the following restrictions apply:
3423 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3424 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3425 // enabled too.
3426 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3427 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3428 CCInfo.AllocateReg(AMDGPU::VGPR0);
3429 CCInfo.AllocateReg(AMDGPU::VGPR1);
3430 Info->markPSInputAllocated(0);
3431 Info->markPSInputEnabled(0);
3432 }
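// For example, a pixel shader with no PERSP_* or LINEAR_* inputs enabled gets
// input 0 (PERSP_SAMPLE) force-enabled here and VGPR0/VGPR1 reserved, so the
// hardware sees at least one enabled interpolant.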
3433 if (Subtarget->isAmdPalOS()) {
3434 // For isAmdPalOS, the user does not enable some bits after compilation
3435 // based on run-time states; the register values being generated here are
3436 // the final ones set in hardware. Therefore we need to apply the
3437 // workaround to PSInputAddr and PSInputEnable together. (The case where
3438 // a bit is set in PSInputAddr but not PSInputEnable is where the
3439 // frontend set up an input arg for a particular interpolation mode, but
3440 // nothing uses that input arg. Really we should have an earlier pass
3441 // that removes such an arg.)
3442 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3443 if ((PsInputBits & 0x7F) == 0 ||
3444 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3445 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3446 }
3447 } else if (IsKernel) {
3448 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3449 } else {
3450 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3451 Ins.end());
3452 }
3453
3454 if (IsKernel)
3455 analyzeFormalArgumentsCompute(CCInfo, Ins);
3456
3457 if (IsEntryFunc) {
3458 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3459 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3460 if (IsKernel && Subtarget->hasKernargPreload())
3461 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3462
3463 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3464 } else if (!IsGraphics) {
3465 // For the fixed ABI, pass workitem IDs in the last argument register.
3466 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3467
3468 // FIXME: Sink this into allocateSpecialInputSGPRs
3469 if (!Subtarget->hasFlatScratchEnabled())
3470 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3471
3472 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3473 }
3474
3475 if (!IsKernel) {
3476 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3477 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3478
3479 // This assumes the registers are allocated by CCInfo in ascending order
3480 // with no gaps.
3481 Info->setNumWaveDispatchSGPRs(
3482 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3483 Info->setNumWaveDispatchVGPRs(
3484 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3485 } else if (Info->getNumKernargPreloadedSGPRs()) {
3486 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3487 }
3488
3489 SmallVector<SDValue, 16> Chains;
3490
3491 if (IsWholeWaveFunc) {
3492 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3493 {MVT::i1, MVT::Other}, Chain);
3494 InVals.push_back(Setup.getValue(0));
3495 Chains.push_back(Setup.getValue(1));
3496 }
3497
3498 // FIXME: This is the minimum kernel argument alignment. We should improve
3499 // this to the maximum alignment of the arguments.
3500 //
3501 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3502 // kern arg offset.
3503 const Align KernelArgBaseAlign = Align(16);
3504
3505 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3506 ++i) {
3507 const ISD::InputArg &Arg = Ins[i];
3508 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3509 InVals.push_back(DAG.getPOISON(Arg.VT));
3510 continue;
3511 }
3512
3513 CCValAssign &VA = ArgLocs[ArgIdx++];
3514 MVT VT = VA.getLocVT();
3515
3516 if (IsEntryFunc && VA.isMemLoc()) {
3517 VT = Ins[i].VT;
3518 EVT MemVT = VA.getLocVT();
3519
3520 const uint64_t Offset = VA.getLocMemOffset();
3521 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3522
3523 if (Arg.Flags.isByRef()) {
3524 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3525
3526 const GCNTargetMachine &TM =
3527 static_cast<const GCNTargetMachine &>(getTargetMachine());
3528 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3529 Arg.Flags.getPointerAddrSpace())) {
3530 Ptr = DAG.getAddrSpaceCast(DL, Arg.VT, Ptr, AMDGPUAS::CONSTANT_ADDRESS,
3531 Arg.Flags.getPointerAddrSpace());
3532 }
3533
3534 InVals.push_back(Ptr);
3535 continue;
3536 }
3537
3538 SDValue NewArg;
3539 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3540 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3541 // In this case the argument is packed into the previous preload SGPR.
3542 int64_t AlignDownOffset = alignDown(Offset, 4);
3543 int64_t OffsetDiff = Offset - AlignDownOffset;
3544 EVT IntVT = MemVT.changeTypeToInteger();
3545
3546 const SIMachineFunctionInfo *Info =
3549 Register Reg =
3550 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3551
3552 assert(Reg);
3553 Register VReg = MRI.getLiveInVirtReg(Reg);
3554 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3555
3556 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3557 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3558
3559 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3560 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3561 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3562 Ins[i].Flags.isSExt(), &Ins[i]);
3563
3564 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3565 } else {
3566 const SIMachineFunctionInfo *Info =
3569 const SmallVectorImpl<MCRegister> &PreloadRegs =
3570 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3571
3572 SDValue Copy;
3573 if (PreloadRegs.size() == 1) {
3574 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3575 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3576 NewArg = DAG.getCopyFromReg(
3577 Chain, DL, VReg,
3579 TRI->getRegSizeInBits(*RC)));
3580
3581 } else {
3582 // If the kernarg alignment does not match the alignment of the SGPR
3583 // tuple RC that can accommodate this argument, it will be built up
3584 // via copies from the individual SGPRs that the argument was
3585 // preloaded to.
3587 for (auto Reg : PreloadRegs) {
3588 Register VReg = MRI.getLiveInVirtReg(Reg);
3589 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3590 Elts.push_back(Copy);
3591 }
3592 NewArg =
3593 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3594 PreloadRegs.size()),
3595 DL, Elts);
3596 }
3597
3598 // If the argument was preloaded to multiple consecutive 32-bit
3599 // registers because of misalignment between addressable SGPR tuples
3600 // and the argument size, we can still assume, because of kernarg
3601 // segment alignment restrictions, that NewArg's size is the same as
3602 // MemVT and just do a bitcast. If MemVT is less than 32 bits, we add a
3603 // truncate, since we cannot preload to less than a single SGPR and
3604 // MemVT may be smaller.
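          // Rough example (hypothetical layout): an i64 argument preloaded into
          // two consecutive SGPRs that do not form an addressable 64-bit tuple
          // is rebuilt as a v2i32 build_vector and bitcast to i64; no truncate
          // is needed there because MemVT is not smaller than the vector.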
3605 EVT MemVTInt =
3607 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3608 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3609
3610 NewArg = DAG.getBitcast(MemVT, NewArg);
3611 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3612 Ins[i].Flags.isSExt(), &Ins[i]);
3613 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3614 }
3615 } else {
3616 // Hidden arguments that are in the kernel signature must be preloaded
3617 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3618 // the argument list and is not preloaded.
3619 if (Arg.isOrigArg()) {
3620 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3621 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3623 *OrigArg->getParent(),
3624 "hidden argument in kernel signature was not preloaded",
3625 DL.getDebugLoc()));
3626 }
3627 }
3628
3629 NewArg =
3630 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3631 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3632 }
3633 Chains.push_back(NewArg.getValue(1));
3634
3635 auto *ParamTy =
3636 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3637 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3638 ParamTy &&
3639 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3640 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3641 // On SI, local pointers are just offsets into LDS, so they are always
3642 // less than 16 bits. On CI and newer they could potentially be
3643 // real pointers, so we can't guarantee their size.
3644 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3645 DAG.getValueType(MVT::i16));
3646 }
3647
3648 InVals.push_back(NewArg);
3649 continue;
3650 }
3651 if (!IsEntryFunc && VA.isMemLoc()) {
3652 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3653 InVals.push_back(Val);
3654 if (!Arg.Flags.isByVal())
3655 Chains.push_back(Val.getValue(1));
3656 continue;
3657 }
3658
3659 assert(VA.isRegLoc() && "Parameter must be in a register!");
3660
3661 Register Reg = VA.getLocReg();
3662 const TargetRegisterClass *RC = nullptr;
3663 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3664 RC = &AMDGPU::VGPR_32RegClass;
3665 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3666 RC = &AMDGPU::SGPR_32RegClass;
3667 else
3668 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3669
3670 Reg = MF.addLiveIn(Reg, RC);
3671 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
3672 if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3673 // FIXME: Need to forward the chains created by `CopyFromReg`s, make sure
3674 // they will read physical regs before any side effect instructions.
3675 SDValue ReadFirstLane =
3676 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3678 ReadFirstLane, Val);
3679 }
3680
3681 if (Arg.Flags.isSRet()) {
3682 // The return object should be reasonably addressable.
3683
3684 // FIXME: This helps when the return is a real sret. If it is an
3685 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3686 // extra copy is inserted in SelectionDAGBuilder which obscures this.
3687 unsigned NumBits =
3689 Val = DAG.getNode(
3690 ISD::AssertZext, DL, VT, Val,
3691 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3692 }
3693
3694 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3695 InVals.push_back(Val);
3696 }
3697
3698 // Start adding system SGPRs.
3699 if (IsEntryFunc)
3700 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3701
3702 unsigned StackArgSize = CCInfo.getStackSize();
3703 Info->setBytesInStackArgArea(StackArgSize);
3704
3705 return Chains.empty() ? Chain
3706 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3707}
3708
3709// TODO: If return values can't fit in registers, we should return as many as
3710// possible in registers before passing the rest on the stack.
3712 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3713 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3714 const Type *RetTy) const {
3715 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3716 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3717 // for shaders. Vector types should be explicitly handled by CC.
3718 if (AMDGPU::isEntryFunctionCC(CallConv))
3719 return true;
3720
3722 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3723 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3724 return false;
3725
3726 // We must use the stack if return would require unavailable registers.
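  // For example (sketch): if this function is limited to 32 VGPRs but the
  // return value assignment reached VGPR32 or above, those registers exist in
  // the class yet are unavailable here, so the return is demoted to sret.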
3727 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3728 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3729 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3730 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3731 return false;
3732
3733 return true;
3734}
3735
3736SDValue
3738 bool isVarArg,
3740 const SmallVectorImpl<SDValue> &OutVals,
3741 const SDLoc &DL, SelectionDAG &DAG) const {
3745
3746 if (AMDGPU::isKernel(CallConv)) {
3747 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3748 OutVals, DL, DAG);
3749 }
3750
3751 bool IsShader = AMDGPU::isShader(CallConv);
3752
3753 Info->setIfReturnsVoid(Outs.empty());
3754 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3755
3756 // CCValAssign - represent the assignment of the return value to a location.
3758
3759 // CCState - Info about the registers and stack slots.
3760 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3761 *DAG.getContext());
3762
3763 // Analyze outgoing return values.
3764 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3765
3766 SDValue Glue;
3768 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3769
3770 SDValue ReadFirstLane =
3771 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3772 // Copy the result values into the output registers.
3773 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3774 ++I, ++RealRVLocIdx) {
3775 CCValAssign &VA = RVLocs[I];
3776 assert(VA.isRegLoc() && "Can only return in registers!");
3777 // TODO: Partially return in registers if return values don't fit.
3778 SDValue Arg = OutVals[RealRVLocIdx];
3779
3780 // Copied from other backends.
3781 switch (VA.getLocInfo()) {
3782 case CCValAssign::Full:
3783 break;
3784 case CCValAssign::BCvt:
3785 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3786 break;
3787 case CCValAssign::SExt:
3788 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3789 break;
3790 case CCValAssign::ZExt:
3791 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3792 break;
3793 case CCValAssign::AExt:
3794 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3795 break;
3796 default:
3797 llvm_unreachable("Unknown loc info!");
3798 }
3799 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3801 ReadFirstLane, Arg);
3802 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3803 Glue = Chain.getValue(1);
3804 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3805 }
3806
3807 // FIXME: Does sret work properly?
3808 if (!Info->isEntryFunction()) {
3809 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3810 const MCPhysReg *I =
3811 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3812 if (I) {
3813 for (; *I; ++I) {
3814 if (AMDGPU::SReg_64RegClass.contains(*I))
3815 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3816 else if (AMDGPU::SReg_32RegClass.contains(*I))
3817 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3818 else
3819 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3820 }
3821 }
3822 }
3823
3824 // Update chain and glue.
3825 RetOps[0] = Chain;
3826 if (Glue.getNode())
3827 RetOps.push_back(Glue);
3828
3829 unsigned Opc = AMDGPUISD::ENDPGM;
3830 if (!IsWaveEnd)
3831 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3832 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3833 : AMDGPUISD::RET_GLUE;
3834 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3835}
3836
3838 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3839 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3840 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3841 SDValue ThisVal) const {
3842 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3843
3844 // Assign locations to each value returned by this call.
3846 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3847 *DAG.getContext());
3848 CCInfo.AnalyzeCallResult(Ins, RetCC);
3849
3850 // Copy all of the result registers out of their specified physreg.
3851 for (CCValAssign VA : RVLocs) {
3852 SDValue Val;
3853
3854 if (VA.isRegLoc()) {
3855 Val =
3856 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3857 Chain = Val.getValue(1);
3858 InGlue = Val.getValue(2);
3859 } else if (VA.isMemLoc()) {
3860 report_fatal_error("TODO: return values in memory");
3861 } else
3862 llvm_unreachable("unknown argument location type");
3863
3864 switch (VA.getLocInfo()) {
3865 case CCValAssign::Full:
3866 break;
3867 case CCValAssign::BCvt:
3868 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3869 break;
3870 case CCValAssign::ZExt:
3871 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3872 DAG.getValueType(VA.getValVT()));
3873 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3874 break;
3875 case CCValAssign::SExt:
3876 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3877 DAG.getValueType(VA.getValVT()));
3878 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3879 break;
3880 case CCValAssign::AExt:
3881 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3882 break;
3883 default:
3884 llvm_unreachable("Unknown loc info!");
3885 }
3886
3887 InVals.push_back(Val);
3888 }
3889
3890 return Chain;
3891}
3892
3893// Add code to pass special inputs that are required depending on the features
3894// used, separate from the explicit user arguments present in the IR.
3896 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3897 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3898 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3899 // If we don't have a call site, this was a call inserted by
3900 // legalization. These can never use special inputs.
3901 if (!CLI.CB)
3902 return;
3903
3904 SelectionDAG &DAG = CLI.DAG;
3905 const SDLoc &DL = CLI.DL;
3906 const Function &F = DAG.getMachineFunction().getFunction();
3907
3908 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3909 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3910
3911 const AMDGPUFunctionArgInfo &CalleeArgInfo =
3913
3914 // TODO: Unify with private memory register handling. This is complicated by
3915 // the fact that at least in kernels, the input argument is not necessarily
3916 // in the same location as the input.
3917 // clang-format off
3918 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3919 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3920 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3921 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3922 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3923 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3924 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3925 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3926 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3927 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3928 };
3929 // clang-format on
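  // For example, a call site carrying both "amdgpu-no-workgroup-id-x" and
  // "amdgpu-no-cluster-id-x" lets us skip forwarding WORKGROUP_ID_X entirely;
  // entries whose second attribute is empty only require the first one.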
3930
3931 for (auto [InputID, Attrs] : ImplicitAttrs) {
3932 // If the callee does not use the attribute value, skip copying the value.
3933 if (all_of(Attrs, [&](StringRef Attr) {
3934 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3935 }))
3936 continue;
3937
3938 const auto [OutgoingArg, ArgRC, ArgTy] =
3939 CalleeArgInfo.getPreloadedValue(InputID);
3940 if (!OutgoingArg)
3941 continue;
3942
3943 const auto [IncomingArg, IncomingArgRC, Ty] =
3944 CallerArgInfo.getPreloadedValue(InputID);
3945 assert(IncomingArgRC == ArgRC);
3946
3947 // All special arguments are ints for now.
3948 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3949 SDValue InputReg;
3950
3951 if (IncomingArg) {
3952 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
3953 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
3954 // The implicit arg ptr is special because it doesn't have a corresponding
3955 // input for kernels, and is computed from the kernarg segment pointer.
3956 InputReg = getImplicitArgPtr(DAG, DL);
3957 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
3958 std::optional<uint32_t> Id =
3960 if (Id.has_value()) {
3961 InputReg = DAG.getConstant(*Id, DL, ArgVT);
3962 } else {
3963 InputReg = DAG.getPOISON(ArgVT);
3964 }
3965 } else {
3966 // We may have proven the input wasn't needed, although the ABI still
3967 // requires it. We just need to allocate the register appropriately.
3968 InputReg = DAG.getPOISON(ArgVT);
3969 }
3970
3971 if (OutgoingArg->isRegister()) {
3972 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
3973 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
3974 report_fatal_error("failed to allocate implicit input argument");
3975 } else {
3976 unsigned SpecialArgOffset =
3977 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
3978 SDValue ArgStore =
3979 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
3980 MemOpChains.push_back(ArgStore);
3981 }
3982 }
3983
3984 // Pack the workitem IDs into a single register, or pass them as-is if they
3985 // are already packed.
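  // The packed layout assumed below (matching the 10- and 20-bit shifts) is,
  // roughly:
  //   bits [9:0]   workitem ID X
  //   bits [19:10] workitem ID Y
  //   bits [29:20] workitem ID Z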
3986
3987 auto [OutgoingArg, ArgRC, Ty] =
3989 if (!OutgoingArg)
3990 std::tie(OutgoingArg, ArgRC, Ty) =
3992 if (!OutgoingArg)
3993 std::tie(OutgoingArg, ArgRC, Ty) =
3995 if (!OutgoingArg)
3996 return;
3997
3998 const ArgDescriptor *IncomingArgX = std::get<0>(
4000 const ArgDescriptor *IncomingArgY = std::get<0>(
4002 const ArgDescriptor *IncomingArgZ = std::get<0>(
4004
4005 SDValue InputReg;
4006 SDLoc SL;
4007
4008 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
4009 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
4010 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
4011
4012 // If the incoming IDs are not packed, we need to pack them.
4013 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
4014 NeedWorkItemIDX) {
4015 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
4016 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
4017 } else {
4018 InputReg = DAG.getConstant(0, DL, MVT::i32);
4019 }
4020 }
4021
4022 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
4023 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
4024 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
4025 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
4026 DAG.getShiftAmountConstant(10, MVT::i32, SL));
4027 InputReg = InputReg.getNode()
4028 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
4029 : Y;
4030 }
4031
4032 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
4033 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
4034 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
4035 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
4036 DAG.getShiftAmountConstant(20, MVT::i32, SL));
4037 InputReg = InputReg.getNode()
4038 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
4039 : Z;
4040 }
4041
4042 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4043 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4044 // We're in a situation where the outgoing function requires the workitem
4045 // ID, but the calling function does not have it (e.g. a graphics function
4046 // calling a C calling convention function). This is illegal, but we need
4047 // to produce something.
4048 InputReg = DAG.getPOISON(MVT::i32);
4049 } else {
4050 // The workitem IDs are already packed, so any of the present incoming
4051 // arguments will carry all of the required fields.
4052 ArgDescriptor IncomingArg =
4053 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
4054 : IncomingArgY ? *IncomingArgY
4055 : *IncomingArgZ,
4056 ~0u);
4057 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
4058 }
4059 }
4060
4061 if (OutgoingArg->isRegister()) {
4062 if (InputReg)
4063 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4064
4065 CCInfo.AllocateReg(OutgoingArg->getRegister());
4066 } else {
4067 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
4068 if (InputReg) {
4069 SDValue ArgStore =
4070 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
4071 MemOpChains.push_back(ArgStore);
4072 }
4073 }
4074}
4075
4077 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
4079 const SmallVectorImpl<SDValue> &OutVals,
4080 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4081 if (AMDGPU::isChainCC(CalleeCC))
4082 return true;
4083
4084 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
4085 return false;
4086
4087 // For a divergent call target, we need to do a waterfall loop over the
4088 // possible callees which precludes us from using a simple jump.
4089 if (Callee->isDivergent())
4090 return false;
4091
4093 const Function &CallerF = MF.getFunction();
4094 CallingConv::ID CallerCC = CallerF.getCallingConv();
4096 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4097
4098 // Kernels aren't callable, and don't have a live-in return address, so it
4099 // doesn't make sense to do a tail call with entry functions.
4100 if (!CallerPreserved)
4101 return false;
4102
4103 bool CCMatch = CallerCC == CalleeCC;
4104
4106 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4107 return true;
4108 return false;
4109 }
4110
4111 // TODO: Can we handle var args?
4112 if (IsVarArg)
4113 return false;
4114
4115 for (const Argument &Arg : CallerF.args()) {
4116 if (Arg.hasByValAttr())
4117 return false;
4118 }
4119
4120 LLVMContext &Ctx = *DAG.getContext();
4121
4122 // Check that the call results are passed in the same way.
4123 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4124 CCAssignFnForCall(CalleeCC, IsVarArg),
4125 CCAssignFnForCall(CallerCC, IsVarArg)))
4126 return false;
4127
4128 // The callee has to preserve all registers the caller needs to preserve.
4129 if (!CCMatch) {
4130 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4131 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4132 return false;
4133 }
4134
4135 // Nothing more to check if the callee is taking no arguments.
4136 if (Outs.empty())
4137 return true;
4138
4140 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4141
4142 // FIXME: We are not allocating special input registers, so we will be
4143 // deciding based on incorrect register assignments.
4144 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4145
4146 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4147 // If the stack arguments for this call do not fit into our own save area,
4148 // then the call cannot be made a tail call.
4149 // TODO: Is this really necessary?
4150 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4151 return false;
4152
4153 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4154 // FIXME: What about inreg arguments that end up passed in memory?
4155 if (!CCVA.isRegLoc())
4156 continue;
4157
4158 // If we are passing an argument in an SGPR, and the value is divergent,
4159 // this call requires a waterfall loop.
4160 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4161 LLVM_DEBUG(
4162 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4163 << printReg(CCVA.getLocReg(), TRI) << '\n');
4164 return false;
4165 }
4166 }
4167
4168 const MachineRegisterInfo &MRI = MF.getRegInfo();
4169 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4170}
4171
4173 if (!CI->isTailCall())
4174 return false;
4175
4176 const Function *ParentFn = CI->getFunction();
4178 return false;
4179 return true;
4180}
4181
4182namespace {
4183// Chain calls have special arguments that we need to handle. These are
4184// appended at the end of the argument list, after the SGPR and VGPR
4185// arguments (indices 0 and 1 respectively).
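//
// As a rough illustration (positions only, values are hypothetical), a
// dynamic-VGPR chain call's argument list looks like:
//   [0] SGPR args, [1] VGPR args, [2] EXEC mask, [3] flags (bit 0 set),
//   [4] number of VGPRs, [5] fallback EXEC, [6] fallback callee
// With flags == 0, everything from index 4 onwards must be absent.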
4186enum ChainCallArgIdx {
4187 Exec = 2,
4188 Flags,
4189 NumVGPRs,
4190 FallbackExec,
4191 FallbackCallee
4192};
4193} // anonymous namespace
4194
4195// The wave scratch offset register is used as the global base pointer.
4197 SmallVectorImpl<SDValue> &InVals) const {
4198 CallingConv::ID CallConv = CLI.CallConv;
4199 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4200
4201 SelectionDAG &DAG = CLI.DAG;
4202
4203 const SDLoc &DL = CLI.DL;
4204 SDValue Chain = CLI.Chain;
4205 SDValue Callee = CLI.Callee;
4206
4207 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4208 bool UsesDynamicVGPRs = false;
4209 if (IsChainCallConv) {
4210 // The last arguments should be the value that we need to put in EXEC,
4211 // followed by the flags and any other arguments with special meanings.
4212 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4213 // we don't treat them like the "real" arguments.
4214 auto RequestedExecIt =
4215 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4216 return Arg.OrigArgIndex == 2;
4217 });
4218 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4219
4220 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4221 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4222 CLI.OutVals.end());
4223 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4224
4225 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4226 "Haven't popped all the special args");
4227
4228 TargetLowering::ArgListEntry RequestedExecArg =
4229 CLI.Args[ChainCallArgIdx::Exec];
4230 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4231 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4232
4233 // Convert constants into TargetConstants, so they become immediate operands
4234 // instead of being selected into S_MOV.
4235 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4236 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4237 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4238 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4239 } else
4240 ChainCallSpecialArgs.push_back(Arg.Node);
4241 };
4242
4243 PushNodeOrTargetConstant(RequestedExecArg);
4244
4245 // Process any other special arguments depending on the value of the flags.
4246 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4247
4248 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4249 if (FlagsValue.isZero()) {
4250 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4251 return lowerUnhandledCall(CLI, InVals,
4252 "no additional args allowed if flags == 0");
4253 } else if (FlagsValue.isOneBitSet(0)) {
4254 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4255 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4256 }
4257
4258 if (!Subtarget->isWave32()) {
4259 return lowerUnhandledCall(
4260 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4261 }
4262
4263 UsesDynamicVGPRs = true;
4264 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4265 CLI.Args.end(), PushNodeOrTargetConstant);
4266 }
4267 }
4268
4270 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4272 bool &IsTailCall = CLI.IsTailCall;
4273 bool IsVarArg = CLI.IsVarArg;
4274 bool IsSibCall = false;
4276
4277 if (Callee.isUndef() || isNullConstant(Callee)) {
4278 if (!CLI.IsTailCall) {
4279 for (ISD::InputArg &Arg : CLI.Ins)
4280 InVals.push_back(DAG.getPOISON(Arg.VT));
4281 }
4282
4283 return Chain;
4284 }
4285
4286 if (IsVarArg) {
4287 return lowerUnhandledCall(CLI, InVals,
4288 "unsupported call to variadic function ");
4289 }
4290
4291 if (!CLI.CB)
4292 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4293
4294 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4295 return lowerUnhandledCall(CLI, InVals,
4296 "unsupported required tail call to function ");
4297 }
4298
4299 if (IsTailCall) {
4300 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4301 Outs, OutVals, Ins, DAG);
4302 if (!IsTailCall &&
4303 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4304 report_fatal_error("failed to perform tail call elimination on a call "
4305 "site marked musttail or on llvm.amdgcn.cs.chain");
4306 }
4307
4308 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4309
4310 // A sibling call is one where we're under the usual C ABI and not planning
4311 // to change that but can still do a tail call:
4312 if (!TailCallOpt && IsTailCall)
4313 IsSibCall = true;
4314
4315 if (IsTailCall)
4316 ++NumTailCalls;
4317 }
4318
4321 SmallVector<SDValue, 8> MemOpChains;
4322
4323 // Analyze operands of the call, assigning locations to each operand.
4325 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4326 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4327
4328 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4330 // With a fixed ABI, allocate fixed registers before user arguments.
4331 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4332 }
4333
4334 // Mark the scratch resource descriptor as allocated so the CC analysis
4335 // does not assign user arguments to these registers, matching the callee.
4336 if (!Subtarget->hasFlatScratchEnabled())
4337 CCInfo.AllocateReg(Info->getScratchRSrcReg());
4338
4339 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4340
4341 // Get a count of how many bytes are to be pushed on the stack.
4342 unsigned NumBytes = CCInfo.getStackSize();
4343
4344 if (IsSibCall) {
4345 // Since we're not changing the ABI to make this a tail call, the memory
4346 // operands are already available in the caller's incoming argument space.
4347 NumBytes = 0;
4348 }
4349
4350 // FPDiff is the byte offset of the call's argument area from the callee's.
4351 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4352 // by this amount for a tail call. In a sibling call it must be 0 because the
4353 // caller will deallocate the entire stack and the callee still expects its
4354 // arguments to begin at SP+0. Completely unused for non-tail calls.
4355 int32_t FPDiff = 0;
4356 MachineFrameInfo &MFI = MF.getFrameInfo();
4357 auto *TRI = Subtarget->getRegisterInfo();
4358
4359 // Adjust the stack pointer for the new arguments...
4360 // These operations are automatically eliminated by the prolog/epilog pass
4361 if (!IsSibCall)
4362 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4363
4364 if (!IsSibCall || IsChainCallConv) {
4365 if (!Subtarget->hasFlatScratchEnabled()) {
4366 SmallVector<SDValue, 4> CopyFromChains;
4367
4368 // In the HSA case, this should be an identity copy.
4369 SDValue ScratchRSrcReg =
4370 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4371 RegsToPass.emplace_back(IsChainCallConv
4372 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4373 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4374 ScratchRSrcReg);
4375 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4376 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4377 }
4378 }
4379
4380 const unsigned NumSpecialInputs = RegsToPass.size();
4381
4382 MVT PtrVT = MVT::i32;
4383
4384 // Walk the register/memloc assignments, inserting copies/loads.
4385 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4386 CCValAssign &VA = ArgLocs[i];
4387 SDValue Arg = OutVals[i];
4388
4389 // Promote the value if needed.
4390 switch (VA.getLocInfo()) {
4391 case CCValAssign::Full:
4392 break;
4393 case CCValAssign::BCvt:
4394 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4395 break;
4396 case CCValAssign::ZExt:
4397 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4398 break;
4399 case CCValAssign::SExt:
4400 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4401 break;
4402 case CCValAssign::AExt:
4403 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4404 break;
4405 case CCValAssign::FPExt:
4406 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4407 break;
4408 default:
4409 llvm_unreachable("Unknown loc info!");
4410 }
4411
4412 if (VA.isRegLoc()) {
4413 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4414 } else {
4415 assert(VA.isMemLoc());
4416
4417 SDValue DstAddr;
4418 MachinePointerInfo DstInfo;
4419
4420 unsigned LocMemOffset = VA.getLocMemOffset();
4421 int32_t Offset = LocMemOffset;
4422
4423 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4424 MaybeAlign Alignment;
4425
4426 if (IsTailCall) {
4427 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4428 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4429 : VA.getValVT().getStoreSize();
4430
4431 // FIXME: We can have better than the minimum byval required alignment.
4432 Alignment =
4433 Flags.isByVal()
4434 ? Flags.getNonZeroByValAlign()
4435 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4436
4437 Offset = Offset + FPDiff;
4438 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4439
4440 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4441 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4442
4443 // Make sure any stack arguments overlapping with where we're storing
4444 // are loaded before this eventual operation. Otherwise they'll be
4445 // clobbered.
4446
4447 // FIXME: Why is this really necessary? This seems to just result in a
4448 // lot of code to copy the stack and write them back to the same
4449 // locations, which are supposed to be immutable?
4450 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4451 } else {
4452 // Stores to the argument stack area are relative to the stack pointer.
4453 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4454 MVT::i32);
4455 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4456 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4457 Alignment =
4458 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4459 }
4460
4461 if (Outs[i].Flags.isByVal()) {
4462 SDValue SizeNode =
4463 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4464 SDValue Cpy =
4465 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4466 Outs[i].Flags.getNonZeroByValAlign(),
4467 /*isVol = */ false, /*AlwaysInline = */ true,
4468 /*CI=*/nullptr, std::nullopt, DstInfo,
4470
4471 MemOpChains.push_back(Cpy);
4472 } else {
4473 SDValue Store =
4474 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4475 MemOpChains.push_back(Store);
4476 }
4477 }
4478 }
4479
4480 if (!MemOpChains.empty())
4481 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4482
4483 SDValue ReadFirstLaneID =
4484 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4485
4486 SDValue TokenGlue;
4487 if (CLI.ConvergenceControlToken) {
4488 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4490 }
4491
4492 // Build a sequence of copy-to-reg nodes chained together with token chain
4493 // and flag operands which copy the outgoing args into the appropriate regs.
4494 SDValue InGlue;
4495
4496 unsigned ArgIdx = 0;
4497 for (auto [Reg, Val] : RegsToPass) {
4498 if (ArgIdx++ >= NumSpecialInputs &&
4499 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4500 // For chain calls, the inreg arguments are required to be
4501 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4502 // they are uniform.
4503 //
4504 // For other calls, if an inreg argument is known to be uniform,
4505 // speculatively insert a readfirstlane in case it is in a VGPR.
4506 //
4507 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4508 // value, so let that continue to produce invalid code.
4509
4510 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4511 if (TokenGlue)
4512 ReadfirstlaneArgs.push_back(TokenGlue);
4514 ReadfirstlaneArgs);
4515 }
4516
4517 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4518 InGlue = Chain.getValue(1);
4519 }
4520
4521 // We don't usually want to end the call-sequence here because we would tidy
4522 // the frame up *after* the call, however in the ABI-changing tail-call case
4523 // we've carefully laid out the parameters so that when sp is reset they'll be
4524 // in the correct location.
4525 if (IsTailCall && !IsSibCall) {
4526 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4527 InGlue = Chain.getValue(1);
4528 }
4529
4530 std::vector<SDValue> Ops({Chain});
4531
4532 // Add a redundant copy of the callee global which will not be legalized, as
4533 // we need direct access to the callee later.
4535 const GlobalValue *GV = GSD->getGlobal();
4536 Ops.push_back(Callee);
4537 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4538 } else {
4539 if (IsTailCall) {
4540 // isEligibleForTailCallOptimization considered whether the call target is
4541 // divergent, but we may still end up with a uniform value in a VGPR.
4542 // Insert a readfirstlane just in case.
4543 SDValue ReadFirstLaneID =
4544 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4545
4546 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4547 if (TokenGlue)
4548 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4549 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4550 ReadfirstlaneArgs);
4551 }
4552
4553 Ops.push_back(Callee);
4554 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4555 }
4556
4557 if (IsTailCall) {
4558 // Each tail call may have to adjust the stack by a different amount, so
4559 // this information must travel along with the operation for eventual
4560 // consumption by emitEpilogue.
4561 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4562 }
4563
4564 if (IsChainCallConv)
4565 llvm::append_range(Ops, ChainCallSpecialArgs);
4566
4567 // Add argument registers to the end of the list so that they are known live
4568 // into the call.
4569 for (auto &[Reg, Val] : RegsToPass)
4570 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4571
4572 // Add a register mask operand representing the call-preserved registers.
4573 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4574 assert(Mask && "Missing call preserved mask for calling convention");
4575 Ops.push_back(DAG.getRegisterMask(Mask));
4576
4577 if (SDValue Token = CLI.ConvergenceControlToken) {
4579 GlueOps.push_back(Token);
4580 if (InGlue)
4581 GlueOps.push_back(InGlue);
4582
4583 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4584 MVT::Glue, GlueOps),
4585 0);
4586 }
4587
4588 if (InGlue)
4589 Ops.push_back(InGlue);
4590
4591 // If we're doing a tail call, use a TC_RETURN here rather than an
4592 // actual call instruction.
4593 if (IsTailCall) {
4594 MFI.setHasTailCall();
4595 unsigned OPC = AMDGPUISD::TC_RETURN;
4596 switch (CallConv) {
4598 OPC = AMDGPUISD::TC_RETURN_GFX;
4599 break;
4602 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4603 : AMDGPUISD::TC_RETURN_CHAIN;
4604 break;
4605 }
4606
4607 // If the caller is a whole wave function, we need to use a special opcode
4608 // so we can patch up EXEC.
4609 if (Info->isWholeWaveFunction())
4610 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4611
4612 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4613 }
4614
4615 // Returns a chain and a flag for retval copy to use.
4616 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4617 Chain = Call.getValue(0);
4618 InGlue = Call.getValue(1);
4619
4620 uint64_t CalleePopBytes = NumBytes;
4621 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4622 if (!Ins.empty())
4623 InGlue = Chain.getValue(1);
4624
4625 // Handle result values, copying them out of physregs into vregs that we
4626 // return.
4627 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4628 InVals, /*IsThisReturn=*/false, SDValue());
4629}
4630
4631// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4632// except for:
4633// 1. Stack growth direction (default: downwards, AMDGPU: upwards), and
4634// 2. Scaled size, where scaled size = wave-reduction(alloca-size) * wave-size.
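// For instance (illustrative numbers): on a wave64 target, a per-lane alloca of
// 16 bytes advances the stack pointer by 16 << 6 = 1024 bytes for the whole
// wave; a divergent size is first reduced to the wave-wide maximum.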
4636 SelectionDAG &DAG) const {
4637 const MachineFunction &MF = DAG.getMachineFunction();
4639
4640 SDLoc dl(Op);
4641 EVT VT = Op.getValueType();
4642 SDValue Chain = Op.getOperand(0);
4643 Register SPReg = Info->getStackPtrOffsetReg();
4644
4645 // Chain the dynamic stack allocation so that it doesn't modify the stack
4646 // pointer when other instructions are using the stack.
4647 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4648
4649 SDValue Size = Op.getOperand(1);
4650 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4651 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4652
4653 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4655 "Stack grows upwards for AMDGPU");
4656
4657 Chain = BaseAddr.getValue(1);
4658 Align StackAlign = TFL->getStackAlign();
4659 if (Alignment > StackAlign) {
4660 uint64_t ScaledAlignment = Alignment.value()
4661 << Subtarget->getWavefrontSizeLog2();
4662 uint64_t StackAlignMask = ScaledAlignment - 1;
4663 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4664 DAG.getConstant(StackAlignMask, dl, VT));
4665 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4666 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4667 }
4668
4669 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4670 SDValue NewSP;
4672 // For constant sized alloca, scale alloca size by wave-size
4673 SDValue ScaledSize = DAG.getNode(
4674 ISD::SHL, dl, VT, Size,
4675 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4676 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4677 } else {
4678 // For dynamic sized alloca, perform wave-wide reduction to get max of
4679 // alloca size(divergent) and then scale it by wave-size
4680 SDValue WaveReduction =
4681 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4682 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4683 Size, DAG.getTargetConstant(0, dl, MVT::i32));
4684 SDValue ScaledSize = DAG.getNode(
4685 ISD::SHL, dl, VT, Size,
4686 DAG.getConstant(Subtarget->getWavefrontSizeLog2(), dl, MVT::i32));
4687 NewSP =
4688 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4689 SDValue ReadFirstLaneID =
4690 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4691 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4692 NewSP);
4693 }
4694
4695 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4696 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4697
4698 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4699}
4700
4702 if (Op.getValueType() != MVT::i32)
4703 return Op; // Defer to cannot select error.
4704
4706 SDLoc SL(Op);
4707
4708 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4709
4710 // Convert from wave uniform to swizzled vector address. This should protect
4711 // from any edge cases where the stacksave result isn't directly used with
4712 // stackrestore.
4713 SDValue VectorAddress =
4714 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4715 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4716}
4717
4719 SelectionDAG &DAG) const {
4720 SDLoc SL(Op);
4721 assert(Op.getValueType() == MVT::i32);
4722
4723 uint32_t BothRoundHwReg =
4725 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4726
4727 SDValue IntrinID =
4728 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4729 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4730 Op.getOperand(0), IntrinID, GetRoundBothImm);
4731
4732 // There are two rounding modes, one for f32 and one for f64/f16. We only
4733 // report in the standard value range if both are the same.
4734 //
4735 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4736 // ties away from zero is not supported, and the other values are rotated by
4737 // 1.
4738 //
4739 // If the two rounding modes are not the same, report a target defined value.
4740
4741 // Mode register rounding mode fields:
4742 //
4743 // [1:0] Single-precision round mode.
4744 // [3:2] Double/Half-precision round mode.
4745 //
4746 // 0 = nearest even; 1 = +infinity; 2 = -infinity; 3 = toward zero.
4747 //
4748 //                Hardware   Spec
4749 // Toward-0          3        0
4750 // Nearest Even      0        1
4751 // +Inf              1        2
4752 // -Inf              2        3
4753 // NearestAway0     N/A       4
4754 //
4755 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4756 // table we can index by the raw hardware mode.
4757 //
4758 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
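  //
  // For example (sketch): if both fields are "nearest even", the raw mode is
  // 0b0000, so the low nibble of the table is selected, which holds the
  // standard FLT_ROUNDS value 1 (round to nearest).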
4759
4760 SDValue BitTable =
4762
4763 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4764 SDValue RoundModeTimesNumBits =
4765 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4766
4767 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4768 // knew only one mode was demanded.
4769 SDValue TableValue =
4770 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4771 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4772
4773 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4774 SDValue TableEntry =
4775 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4776
4777 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4778 // if it's an extended value.
4779 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4780 SDValue IsStandardValue =
4781 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4782 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4783 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4784 TableEntry, EnumOffset);
4785
4786 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4787}
4788
4790 SelectionDAG &DAG) const {
4791 SDLoc SL(Op);
4792
4793 SDValue NewMode = Op.getOperand(1);
4794 assert(NewMode.getValueType() == MVT::i32);
4795
4796 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4797 // hardware MODE.fp_round values.
4798 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4799 uint32_t ClampedVal = std::min(
4800 static_cast<uint32_t>(ConstMode->getZExtValue()),
4802 NewMode = DAG.getConstant(
4803 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4804 } else {
4805 // If we know the input can only be one of the supported standard modes in
4806 // the range 0-3, we can use a simplified mapping to hardware values.
4807 KnownBits KB = DAG.computeKnownBits(NewMode);
4808 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4809 // The supported standard values are 0-3. The extended values start at 8. We
4810 // need to offset by 4 if the value is in the extended range.
4811
4812 if (UseReducedTable) {
4813 // Truncate to the low 32-bits.
4814 SDValue BitTable = DAG.getConstant(
4815 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4816
4817 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4818 SDValue RoundModeTimesNumBits =
4819 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4820
4821 NewMode =
4822 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4823
4824 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4825 // the table extracted bits into inline immediates.
4826 } else {
4827 // table_index = umin(value, value - 4)
4828 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
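      //
      // For example (sketch): FLT_ROUNDS value 2 (+inf) gives
      // umin(2, 2 - 4) = 2 because of unsigned wrap, so nibble 2 of the table
      // is used; an extended value such as 8 gives umin(8, 4) = 4, folding the
      // gap between the standard and extended ranges.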
4829 SDValue BitTable =
4831
4832 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4833 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4834 SDValue IndexVal =
4835 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4836
4837 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4838 SDValue RoundModeTimesNumBits =
4839 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4840
4841 SDValue TableValue =
4842 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4843 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4844
4845 // No need to mask out the high bits since the setreg will ignore them
4846 // anyway.
4847 NewMode = TruncTable;
4848 }
4849
4850 // Insert a readfirstlane in case the value is in a VGPR. We could do this
4851 // earlier and keep more operations scalar, but that interferes with
4852 // combining the source.
4853 SDValue ReadFirstLaneID =
4854 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4855 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4856 ReadFirstLaneID, NewMode);
4857 }
4858
4859 // N.B. The setreg will be later folded into s_round_mode on supported
4860 // targets.
4861 SDValue IntrinID =
4862 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4863 uint32_t BothRoundHwReg =
4865 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4866
4867 SDValue SetReg =
4868 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4869 IntrinID, RoundBothImm, NewMode);
4870
4871 return SetReg;
4872}
4873
4875 if (Op->isDivergent() &&
4876 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4877 // Cannot do I$ prefetch with divergent pointer.
4878 return SDValue();
4879
4880 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4884 break;
4886 if (Subtarget->hasSafeSmemPrefetch())
4887 break;
4888 [[fallthrough]];
4889 default:
4890 return SDValue();
4891 }
4892
4893 // I$ prefetch
4894 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4895 return SDValue();
4896
4897 return Op;
4898}
4899
4900// Work around DAG legality rules only based on the result type.
4902 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
4903 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4904 EVT SrcVT = Src.getValueType();
4905
4906 if (SrcVT.getScalarType() != MVT::bf16)
4907 return Op;
4908
4909 SDLoc SL(Op);
4910 SDValue BitCast =
4911 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4912
4913 EVT DstVT = Op.getValueType();
4914 if (IsStrict)
4915 llvm_unreachable("Need STRICT_BF16_TO_FP");
4916
4917 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4918}
4919
4921 SDLoc SL(Op);
4922 if (Op.getValueType() != MVT::i64)
4923 return Op;
4924
4925 uint32_t ModeHwReg =
4927 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4928 uint32_t TrapHwReg =
4930 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4931
4932 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4933 SDValue IntrinID =
4934 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4935 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4936 Op.getOperand(0), IntrinID, ModeHwRegImm);
4937 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4938 Op.getOperand(0), IntrinID, TrapHwRegImm);
4939 SDValue TokenReg =
4940 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
4941 GetTrapReg.getValue(1));
4942
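  // Pack the two registers into one i64. On this little-endian target,
  // element 0 of the v2i32 (the MODE register) lands in the low 32 bits and
  // element 1 (the trap status) in the high 32 bits after the bitcast.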
4943 SDValue CvtPtr =
4944 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
4945 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
4946
4947 return DAG.getMergeValues({Result, TokenReg}, SL);
4948}
4949
4951 SDLoc SL(Op);
4952 if (Op.getOperand(1).getValueType() != MVT::i64)
4953 return Op;
4954
4955 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
4956 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4957 DAG.getConstant(0, SL, MVT::i32));
4958 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
4959 DAG.getConstant(1, SL, MVT::i32));
4960
4961 SDValue ReadFirstLaneID =
4962 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4963 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4964 ReadFirstLaneID, NewModeReg);
4965 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4966 ReadFirstLaneID, NewTrapReg);
4967
4968 unsigned ModeHwReg =
4970 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4971 unsigned TrapHwReg =
4973 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4974
4975 SDValue IntrinID =
4976 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4977 SDValue SetModeReg =
4978 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4979 IntrinID, ModeHwRegImm, NewModeReg);
4980 SDValue SetTrapReg =
4981 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
4982 IntrinID, TrapHwRegImm, NewTrapReg);
4983 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
4984}
4985
4987 const MachineFunction &MF) const {
4988 const Function &Fn = MF.getFunction();
4989
4991 .Case("m0", AMDGPU::M0)
4992 .Case("exec", AMDGPU::EXEC)
4993 .Case("exec_lo", AMDGPU::EXEC_LO)
4994 .Case("exec_hi", AMDGPU::EXEC_HI)
4995 .Case("flat_scratch", AMDGPU::FLAT_SCR)
4996 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
4997 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
4998 .Default(Register());
4999 if (!Reg)
5000 return Reg;
5001
5002 if (!Subtarget->hasFlatScrRegister() &&
5003 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
5004 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
5005 "\" for subtarget."));
5006 }
5007
5008 switch (Reg) {
5009 case AMDGPU::M0:
5010 case AMDGPU::EXEC_LO:
5011 case AMDGPU::EXEC_HI:
5012 case AMDGPU::FLAT_SCR_LO:
5013 case AMDGPU::FLAT_SCR_HI:
5014 if (VT.getSizeInBits() == 32)
5015 return Reg;
5016 break;
5017 case AMDGPU::EXEC:
5018 case AMDGPU::FLAT_SCR:
5019 if (VT.getSizeInBits() == 64)
5020 return Reg;
5021 break;
5022 default:
5023 llvm_unreachable("missing register type checking");
5024 }
5025
5027 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
5028}
5029
5030// If kill is not the last instruction, split the block so kill is always a
5031// proper terminator.
5034 MachineBasicBlock *BB) const {
5035 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
5037 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
5038 return SplitBB;
5039}
5040
5041// Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
5042// \p MI will be the only instruction in the loop body block. Otherwise, it will
5043// be the first instruction in the remainder block.
5044//
5045/// \returns { LoopBody, Remainder }
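//
// Resulting CFG, roughly: MBB -> LoopBB; LoopBB -> {LoopBB, RemainderBB};
// RemainderBB inherits MBB's original successors.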
5046static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5048 MachineFunction *MF = MBB.getParent();
5050
5051 // To insert the loop we need to split the block. Move everything after this
5052 // point to a new block, and insert a new empty block between the two.
5054 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
5056 ++MBBI;
5057
5058 MF->insert(MBBI, LoopBB);
5059 MF->insert(MBBI, RemainderBB);
5060
5061 LoopBB->addSuccessor(LoopBB);
5062 LoopBB->addSuccessor(RemainderBB);
5063
5064 // Move the rest of the block into a new block.
5065 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
5066
5067 if (InstInLoop) {
5068 auto Next = std::next(I);
5069
5070 // Move instruction to loop body.
5071 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
5072
5073 // Move the rest of the block.
5074 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
5075 } else {
5076 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
5077 }
5078
5079 MBB.addSuccessor(LoopBB);
5080
5081 return std::pair(LoopBB, RemainderBB);
5082}
5083
5084/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
5086 MachineBasicBlock *MBB = MI.getParent();
5088 auto I = MI.getIterator();
5089 auto E = std::next(I);
5090
5091 // clang-format off
5092 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
5093 .addImm(0);
5094 // clang-format on
5095
5096 MIBundleBuilder Bundler(*MBB, I, E);
5097 finalizeBundle(*MBB, Bundler.begin());
5098}
5099
5102 MachineBasicBlock *BB) const {
5103 const DebugLoc &DL = MI.getDebugLoc();
5104
5106
5108
5109 // Apparently kill flags are only valid if the def is in the same block?
5110 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5111 Src->setIsKill(false);
5112
5113 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5114
5115 MachineBasicBlock::iterator I = LoopBB->end();
5116
5117 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5119
5120 // Clear TRAP_STS.MEM_VIOL
5121 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5122 .addImm(0)
5123 .addImm(EncodedReg);
5124
5126
5127 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5128
5129 // Load and check TRAP_STS.MEM_VIOL
5130 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5131 .addImm(EncodedReg);
5132
5133 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5134 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5135 .addReg(Reg, RegState::Kill)
5136 .addImm(0);
5137 // clang-format off
5138 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5139 .addMBB(LoopBB);
5140 // clang-format on
5141
5142 return RemainderBB;
5143}
5144
5145// Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p IdxReg in the
5146// wavefront. If the value is uniform and just happens to be in a VGPR, this
5147// will only do one iteration. In the worst case, this will loop 64 times.
5148//
5149// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
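//
// Schematically, the emitted loop is (register names are placeholders):
//   loop:
//     %cur  = v_readfirstlane_b32 %idx
//     %cond = v_cmp_eq_u32 %cur, %idx
//     %old  = s_and_saveexec %cond   ; exec &= %cond, %old = previous exec
//     ... move %cur (plus Offset) into M0 or SGPRIdxReg ...
//     exec  = exec ^ %old            ; leave only the unhandled lanes active
//     s_cbranch_execnz loop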
5152 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5153 const DebugLoc &DL, const MachineOperand &Idx,
5154 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5155 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5156 Register &SGPRIdxReg) {
5157
5158 MachineFunction *MF = OrigBB.getParent();
5159 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5160 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5163
5164 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5165 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5166 Register NewExec = MRI.createVirtualRegister(BoolRC);
5167 Register CurrentIdxReg =
5168 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5169 Register CondReg = MRI.createVirtualRegister(BoolRC);
5170
5171 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5172 .addReg(InitReg)
5173 .addMBB(&OrigBB)
5174 .addReg(ResultReg)
5175 .addMBB(&LoopBB);
5176
5177 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5178 .addReg(InitSaveExecReg)
5179 .addMBB(&OrigBB)
5180 .addReg(NewExec)
5181 .addMBB(&LoopBB);
5182
5183 // Read the next variant <- also loop target.
5184 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5185 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5186
5187 // Compare the just-read scalar index value against each lane's Idx value.
5188 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5189 .addReg(CurrentIdxReg)
5190 .addReg(Idx.getReg(), {}, Idx.getSubReg());
5191
5192 // Update EXEC, saving the original EXEC value.
5193 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5194 .addReg(CondReg, RegState::Kill);
5195
5196 MRI.setSimpleHint(NewExec, CondReg);
5197
5198 if (UseGPRIdxMode) {
5199 if (Offset == 0) {
5200 SGPRIdxReg = CurrentIdxReg;
5201 } else {
5202 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5203 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5204 .addReg(CurrentIdxReg, RegState::Kill)
5205 .addImm(Offset);
5206 }
5207 } else {
5208 // Move the index (plus any offset) into M0.
5209 if (Offset == 0) {
5210 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5211 .addReg(CurrentIdxReg, RegState::Kill);
5212 } else {
5213 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5214 .addReg(CurrentIdxReg, RegState::Kill)
5215 .addImm(Offset);
5216 }
5217 }
5218
5219 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5220 MachineInstr *InsertPt =
5221 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5222 .addReg(LMC.ExecReg)
5223 .addReg(NewExec);
5224
5225 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5226 // s_cbranch_scc0?
5227
5228 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5229 // clang-format off
5230 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5231 .addMBB(&LoopBB);
5232 // clang-format on
5233
5234 return InsertPt->getIterator();
5235}
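// A rough sketch of the waterfall loop body built above (names illustrative):
//   loop:
//     %cur  = V_READFIRSTLANE_B32 %idx       ; pick one lane's index value
//     %cond = V_CMP_EQ_U32 %cur, %idx        ; all lanes holding that index
//     %old  = S_AND_SAVEEXEC %cond           ; restrict EXEC to those lanes
//     M0 (or the returned SGPR index) = %cur [+ offset]
//     <the indirect access is inserted here by the caller>
//     EXEC = EXEC ^ %old                     ; only unhandled lanes remain
//     S_CBRANCH_EXECNZ loop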
5236
5237// This has slightly sub-optimal register allocation when the source vector is
5238// killed by the read. The register allocator does not understand that the kill
5239// is per-workitem, so the source is kept alive for the whole loop and we end up
5240// not reusing a subregister from it, using one more VGPR than necessary. This
5241// extra VGPR was saved when this was expanded after register allocation.
5244 unsigned InitResultReg, unsigned PhiReg, int Offset,
5245 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5246 MachineFunction *MF = MBB.getParent();
5247 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5248 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5249 MachineRegisterInfo &MRI = MF->getRegInfo();
5250 const DebugLoc &DL = MI.getDebugLoc();
5252
5253 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5254 Register DstReg = MI.getOperand(0).getReg();
5255 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5256 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5258
5259 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5260
5261 // Save the EXEC mask
5262 // clang-format off
5263 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5264 .addReg(LMC.ExecReg);
5265 // clang-format on
5266
5267 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5268
5269 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5270
5271 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5272 InitResultReg, DstReg, PhiReg, TmpExec,
5273 Offset, UseGPRIdxMode, SGPRIdxReg);
5274
5275 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5277 ++MBBI;
5278 MF->insert(MBBI, LandingPad);
5279 LoopBB->removeSuccessor(RemainderBB);
5280 LandingPad->addSuccessor(RemainderBB);
5281 LoopBB->addSuccessor(LandingPad);
5282 MachineBasicBlock::iterator First = LandingPad->begin();
5283 // clang-format off
5284 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5285 .addReg(SaveExec);
5286 // clang-format on
5287
5288 return InsPt;
5289}
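// The resulting CFG is roughly entry -> loop -> landing pad -> remainder,
// where the landing pad restores the EXEC mask saved in the entry block.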
5290
5291// Returns the subregister index and the remaining dynamic offset.
5292static std::pair<unsigned, int>
5294 const TargetRegisterClass *SuperRC, unsigned VecReg,
5295 int Offset) {
5296 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5297
5298 // Skip out of bounds offsets, or else we would end up using an undefined
5299 // register.
5300 if (Offset >= NumElts || Offset < 0)
5301 return std::pair(AMDGPU::sub0, Offset);
5302
5303 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5304}
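// For example, with a 128-bit (4 x 32-bit) vector register: Offset == 2 is in
// range and yields {sub2, 0}, folding the offset into the subregister, while
// Offset == 7 is out of range and yields {sub0, 7}, leaving the offset to the
// dynamic index.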
5305
5308 int Offset) {
5309 MachineBasicBlock *MBB = MI.getParent();
5310 const DebugLoc &DL = MI.getDebugLoc();
5312
5313 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5314
5315 assert(Idx->getReg() != AMDGPU::NoRegister);
5316
5317 if (Offset == 0) {
5318 // clang-format off
5319 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5320 .add(*Idx);
5321 // clang-format on
5322 } else {
5323 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5324 .add(*Idx)
5325 .addImm(Offset);
5326 }
5327}
5328
5331 int Offset) {
5332 MachineBasicBlock *MBB = MI.getParent();
5333 const DebugLoc &DL = MI.getDebugLoc();
5335
5336 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5337
5338 if (Offset == 0)
5339 return Idx->getReg();
5340
5341 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5342 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5343 .add(*Idx)
5344 .addImm(Offset);
5345 return Tmp;
5346}
5347
5350 const GCNSubtarget &ST) {
5351 const SIInstrInfo *TII = ST.getInstrInfo();
5352 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5353 MachineFunction *MF = MBB.getParent();
5354 MachineRegisterInfo &MRI = MF->getRegInfo();
5355
5356 Register Dst = MI.getOperand(0).getReg();
5357 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5358 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5359 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5360
5361 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5362 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5363
5364 unsigned SubReg;
5365 std::tie(SubReg, Offset) =
5366 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5367
5368 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5369
5370 // Check for a SGPR index.
5371 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5373 const DebugLoc &DL = MI.getDebugLoc();
5374
5375 if (UseGPRIdxMode) {
5376 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5377 // to avoid interfering with other uses, so probably requires a new
5378 // optimization pass.
5379 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5380
5381 const MCInstrDesc &GPRIDXDesc =
5382 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5383 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5384 .addReg(SrcReg)
5385 .addReg(Idx)
5386 .addImm(SubReg);
5387 } else {
5389
5390 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5391 .addReg(SrcReg, {}, SubReg)
5392 .addReg(SrcReg, RegState::Implicit);
5393 }
5394
5395 MI.eraseFromParent();
5396
5397 return &MBB;
5398 }
5399
5400 // Control flow needs to be inserted if indexing with a VGPR.
5401 const DebugLoc &DL = MI.getDebugLoc();
5403
5404 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5405 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5406
5407 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5408
5409 Register SGPRIdxReg;
5410 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5411 UseGPRIdxMode, SGPRIdxReg);
5412
5413 MachineBasicBlock *LoopBB = InsPt->getParent();
5414
5415 if (UseGPRIdxMode) {
5416 const MCInstrDesc &GPRIDXDesc =
5417 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5418
5419 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5420 .addReg(SrcReg)
5421 .addReg(SGPRIdxReg)
5422 .addImm(SubReg);
5423 } else {
5424 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5425 .addReg(SrcReg, {}, SubReg)
5426 .addReg(SrcReg, RegState::Implicit);
5427 }
5428
5429 MI.eraseFromParent();
5430
5431 return LoopBB;
5432}
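// In short: a uniform (SGPR) index becomes a single V_MOVRELS_B32 or GPR-idx
// pseudo, while a divergent (VGPR) index wraps the same read in the waterfall
// loop built by loadM0FromVGPR above.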
5433
5436 const GCNSubtarget &ST) {
5437 const SIInstrInfo *TII = ST.getInstrInfo();
5438 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5439 MachineFunction *MF = MBB.getParent();
5440 MachineRegisterInfo &MRI = MF->getRegInfo();
5441
5442 Register Dst = MI.getOperand(0).getReg();
5443 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5444 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5445 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5446 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5447 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5448 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5449
5450 // This can be an immediate, but will be folded later.
5451 assert(Val->getReg());
5452
5453 unsigned SubReg;
5454 std::tie(SubReg, Offset) =
5455 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5456 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5457
5458 if (Idx->getReg() == AMDGPU::NoRegister) {
5460 const DebugLoc &DL = MI.getDebugLoc();
5461
5462 assert(Offset == 0);
5463
5464 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5465 .add(*SrcVec)
5466 .add(*Val)
5467 .addImm(SubReg);
5468
5469 MI.eraseFromParent();
5470 return &MBB;
5471 }
5472
5473 // Check for a SGPR index.
5474 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5476 const DebugLoc &DL = MI.getDebugLoc();
5477
5478 if (UseGPRIdxMode) {
5479 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5480
5481 const MCInstrDesc &GPRIDXDesc =
5482 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5483 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5484 .addReg(SrcVec->getReg())
5485 .add(*Val)
5486 .addReg(Idx)
5487 .addImm(SubReg);
5488 } else {
5490
5491 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5492 TRI.getRegSizeInBits(*VecRC), 32, false);
5493 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5494 .addReg(SrcVec->getReg())
5495 .add(*Val)
5496 .addImm(SubReg);
5497 }
5498 MI.eraseFromParent();
5499 return &MBB;
5500 }
5501
5502 // Control flow needs to be inserted if indexing with a VGPR.
5503 if (Val->isReg())
5504 MRI.clearKillFlags(Val->getReg());
5505
5506 const DebugLoc &DL = MI.getDebugLoc();
5507
5508 Register PhiReg = MRI.createVirtualRegister(VecRC);
5509
5510 Register SGPRIdxReg;
5511 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5512 UseGPRIdxMode, SGPRIdxReg);
5513 MachineBasicBlock *LoopBB = InsPt->getParent();
5514
5515 if (UseGPRIdxMode) {
5516 const MCInstrDesc &GPRIDXDesc =
5517 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5518
5519 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5520 .addReg(PhiReg)
5521 .add(*Val)
5522 .addReg(SGPRIdxReg)
5523 .addImm(SubReg);
5524 } else {
5525 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5526 TRI.getRegSizeInBits(*VecRC), 32, false);
5527 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5528 .addReg(PhiReg)
5529 .add(*Val)
5530 .addImm(SubReg);
5531 }
5532
5533 MI.eraseFromParent();
5534 return LoopBB;
5535}
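// As with the extract case above: a uniform index becomes a single indexed
// write (a MovRel or GPR-idx-mode pseudo), while a divergent index reuses the
// waterfall loop, carrying the partially updated vector in PhiReg.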
5536
5538 MachineBasicBlock *BB) {
5539 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5540 // For GFX12, we emit s_add_u64 and s_sub_u64.
5541 MachineFunction *MF = BB->getParent();
5542 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5543 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5545 const DebugLoc &DL = MI.getDebugLoc();
5546 MachineOperand &Dest = MI.getOperand(0);
5547 MachineOperand &Src0 = MI.getOperand(1);
5548 MachineOperand &Src1 = MI.getOperand(2);
5549 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5550 if (ST.hasScalarAddSub64()) {
5551 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5552 // clang-format off
5553 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5554 .add(Src0)
5555 .add(Src1);
5556 // clang-format on
5557 } else {
5558 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5559 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5560
5561 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5562 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5563
5564 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5565 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5566 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5567 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5568
5569 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5570 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5571 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5572 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5573
5574 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5575 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5576 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5577 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5578 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5579 .addReg(DestSub0)
5580 .addImm(AMDGPU::sub0)
5581 .addReg(DestSub1)
5582 .addImm(AMDGPU::sub1);
5583 }
5584 MI.eraseFromParent();
5585 return BB;
5586}
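// For targets without s_add_u64/s_sub_u64, a 64-bit scalar add expands to,
// roughly:
//   s_add_u32  dst.sub0, src0.sub0, src1.sub0
//   s_addc_u32 dst.sub1, src0.sub1, src1.sub1
// followed by a REG_SEQUENCE that reassembles the 64-bit result (s_sub_u32 /
// s_subb_u32 for the subtract case).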
5587
5589 MachineFunction *MF = BB->getParent();
5590 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5591 const SIInstrInfo *TII = ST.getInstrInfo();
5592 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5593 MachineRegisterInfo &MRI = MF->getRegInfo();
5594 const DebugLoc &DL = MI.getDebugLoc();
5595 Register Dst = MI.getOperand(0).getReg();
5596 const MachineOperand &Src0 = MI.getOperand(1);
5597 const MachineOperand &Src1 = MI.getOperand(2);
5598 Register SrcCond = MI.getOperand(3).getReg();
5599
5600 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5601 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5602 const TargetRegisterClass *CondRC = TRI->getWaveMaskRegClass();
5603 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5604
5605 int Src0Idx =
5606 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
5607 int Src1Idx =
5608 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
5609 const TargetRegisterClass *Src0RC =
5610 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src0Idx));
5611 const TargetRegisterClass *Src1RC =
5612 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src1Idx));
5613
5614 const TargetRegisterClass *Src0SubRC =
5615 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5616 const TargetRegisterClass *Src1SubRC =
5617 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5618
5619 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5620 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5621 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5622 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5623
5624 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5625 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5626 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5627 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5628
5629 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5630 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5631 .addImm(0)
5632 .add(Src0Sub0)
5633 .addImm(0)
5634 .add(Src1Sub0)
5635 .addReg(SrcCondCopy);
5636
5637 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5638 .addImm(0)
5639 .add(Src0Sub1)
5640 .addImm(0)
5641 .add(Src1Sub1)
5642 .addReg(SrcCondCopy);
5643
5644 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5645 .addReg(DstLo)
5646 .addImm(AMDGPU::sub0)
5647 .addReg(DstHi)
5648 .addImm(AMDGPU::sub1);
5649 MI.eraseFromParent();
5650}
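// That is, a 64-bit select is expanded into two V_CNDMASK_B32_e64 operations
// on the sub0/sub1 halves, sharing one copy of the condition, plus a
// REG_SEQUENCE to rebuild the 64-bit result.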
5651
5653 switch (Opc) {
5654 case AMDGPU::S_MIN_U32:
5655 return std::numeric_limits<uint32_t>::max();
5656 case AMDGPU::S_MIN_I32:
5657 return std::numeric_limits<int32_t>::max();
5658 case AMDGPU::S_MAX_U32:
5659 return std::numeric_limits<uint32_t>::min();
5660 case AMDGPU::S_MAX_I32:
5661 return std::numeric_limits<int32_t>::min();
5662 case AMDGPU::V_ADD_F32_e64: // -0.0
5663 return 0x80000000;
5664 case AMDGPU::V_SUB_F32_e64: // +0.0
5665 return 0x0;
5666 case AMDGPU::S_ADD_I32:
5667 case AMDGPU::S_SUB_I32:
5668 case AMDGPU::S_OR_B32:
5669 case AMDGPU::S_XOR_B32:
5670 return std::numeric_limits<uint32_t>::min();
5671 case AMDGPU::S_AND_B32:
5672 return std::numeric_limits<uint32_t>::max();
5673 case AMDGPU::V_MIN_F32_e64:
5674 case AMDGPU::V_MAX_F32_e64:
5675 return 0x7fc00000; // qNAN
5676 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5677 return std::numeric_limits<uint64_t>::max();
5678 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5679 return std::numeric_limits<int64_t>::max();
5680 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5681 return std::numeric_limits<uint64_t>::min();
5682 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5683 return std::numeric_limits<int64_t>::min();
5684 case AMDGPU::V_MIN_F64_e64:
5685 case AMDGPU::V_MAX_F64_e64:
5686 case AMDGPU::V_MIN_NUM_F64_e64:
5687 case AMDGPU::V_MAX_NUM_F64_e64:
5688 return 0x7FF8000000000000; // qNAN
5689 case AMDGPU::S_ADD_U64_PSEUDO:
5690 case AMDGPU::S_SUB_U64_PSEUDO:
5691 case AMDGPU::S_OR_B64:
5692 case AMDGPU::S_XOR_B64:
5693 return std::numeric_limits<uint64_t>::min();
5694 case AMDGPU::S_AND_B64:
5695 return std::numeric_limits<uint64_t>::max();
5696 case AMDGPU::V_ADD_F64_e64:
5697 case AMDGPU::V_ADD_F64_pseudo_e64:
5698 return 0x8000000000000000; // -0.0
5699 default:
5700 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5701 }
5702}
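// Note: -0.0 is the neutral element of fadd (x + -0.0 == x for every x,
// including +0.0), and a quiet NaN is neutral for the min/max reductions
// because minNum/maxNum-style operations return the non-NaN operand.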
5703
5704static bool is32bitWaveReduceOperation(unsigned Opc) {
5705 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5706 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5707 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5708 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5709 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5710 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5711 Opc == AMDGPU::V_SUB_F32_e64;
5712}
5713
5715 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5716 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5717 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5718 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5719 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5720}
5721
5722static std::tuple<unsigned, unsigned>
5724 unsigned DPPOpc;
5725 switch (Opc) {
5726 case AMDGPU::S_MIN_U32:
5727 DPPOpc = AMDGPU::V_MIN_U32_dpp;
5728 break;
5729 case AMDGPU::S_MIN_I32:
5730 DPPOpc = AMDGPU::V_MIN_I32_dpp;
5731 break;
5732 case AMDGPU::S_MAX_U32:
5733 DPPOpc = AMDGPU::V_MAX_U32_dpp;
5734 break;
5735 case AMDGPU::S_MAX_I32:
5736 DPPOpc = AMDGPU::V_MAX_I32_dpp;
5737 break;
5738 case AMDGPU::S_ADD_I32:
5739 case AMDGPU::S_SUB_I32:
5740 DPPOpc = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
5741 : AMDGPU::V_ADD_CO_U32_dpp;
5742 break;
5743 case AMDGPU::S_AND_B32:
5744 DPPOpc = AMDGPU::V_AND_B32_dpp;
5745 break;
5746 case AMDGPU::S_OR_B32:
5747 DPPOpc = AMDGPU::V_OR_B32_dpp;
5748 break;
5749 case AMDGPU::S_XOR_B32:
5750 DPPOpc = AMDGPU::V_XOR_B32_dpp;
5751 break;
5752 case AMDGPU::V_ADD_F32_e64:
5753 case AMDGPU::V_SUB_F32_e64:
5754 DPPOpc = AMDGPU::V_ADD_F32_dpp;
5755 break;
5756 case AMDGPU::V_MIN_F32_e64:
5757 DPPOpc = AMDGPU::V_MIN_F32_dpp;
5758 break;
5759 case AMDGPU::V_MAX_F32_e64:
5760 DPPOpc = AMDGPU::V_MAX_F32_dpp;
5761 break;
5762 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5763 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5764 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5765 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5766 case AMDGPU::S_ADD_U64_PSEUDO:
5767 case AMDGPU::S_SUB_U64_PSEUDO:
5768 case AMDGPU::S_AND_B64:
5769 case AMDGPU::S_OR_B64:
5770 case AMDGPU::S_XOR_B64:
5771 case AMDGPU::V_MIN_NUM_F64_e64:
5772 case AMDGPU::V_MIN_F64_e64:
5773 case AMDGPU::V_MAX_NUM_F64_e64:
5774 case AMDGPU::V_MAX_F64_e64:
5775 case AMDGPU::V_ADD_F64_pseudo_e64:
5776 case AMDGPU::V_ADD_F64_e64:
5777 DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
5778 break;
5779 default:
5780 llvm_unreachable("unhandled lane op");
5781 }
5782 unsigned ClampOpc = Opc;
5783 if (!ST.getInstrInfo()->isVALU(Opc)) {
5784 if (Opc == AMDGPU::S_SUB_I32)
5785 ClampOpc = AMDGPU::S_ADD_I32;
5786 if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
5787 ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
5788 else if (Opc == AMDGPU::S_AND_B64)
5789 ClampOpc = AMDGPU::V_AND_B32_e64;
5790 else if (Opc == AMDGPU::S_OR_B64)
5791 ClampOpc = AMDGPU::V_OR_B32_e64;
5792 else if (Opc == AMDGPU::S_XOR_B64)
5793 ClampOpc = AMDGPU::V_XOR_B32_e64;
5794 else
5795 ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
5796 }
5797 return {DPPOpc, ClampOpc};
5798}
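// DPPOpc performs one step of the DPP scan. ClampOpc is the ordinary VALU
// opcode used when a step's combine cannot be folded into the DPP op itself,
// e.g. for the 64-bit types (where the DPP op is just a move) and for the
// ds_swizzle / ds_permute broadcast fallbacks in lowerWaveReduce.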
5799
5800static std::pair<Register, Register>
5802 const TargetRegisterClass *SrcRC, const GCNSubtarget &ST,
5803 MachineRegisterInfo &MRI) {
5804 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5805 const SIInstrInfo *TII = ST.getInstrInfo();
5806 const TargetRegisterClass *SrcSubRC =
5807 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5808 Register Op1L =
5809 TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub0, SrcSubRC);
5810 Register Op1H =
5811 TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub1, SrcSubRC);
5812 return {Op1L, Op1H};
5813}
5814
5817 const GCNSubtarget &ST,
5818 unsigned Opc) {
5820 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5821 const DebugLoc &DL = MI.getDebugLoc();
5822 const SIInstrInfo *TII = ST.getInstrInfo();
5823
5824 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5825 Register SrcReg = MI.getOperand(1).getReg();
5826 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5827 Register DstReg = MI.getOperand(0).getReg();
5828 unsigned Strategy = static_cast<unsigned>(MI.getOperand(2).getImm());
5829 enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };
5830 MachineBasicBlock *RetBB = nullptr;
5831 unsigned MIOpc = MI.getOpcode();
5832 auto BuildRegSequence = [&](MachineBasicBlock &BB,
5834 Register Src0, Register Src1) {
5835 auto RegSequence =
5836 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dst)
5837 .addReg(Src0)
5838 .addImm(AMDGPU::sub0)
5839 .addReg(Src1)
5840 .addImm(AMDGPU::sub1);
5841 return RegSequence;
5842 };
5843 if (isSGPR) {
5844 switch (Opc) {
5845 case AMDGPU::S_MIN_U32:
5846 case AMDGPU::S_MIN_I32:
5847 case AMDGPU::V_MIN_F32_e64:
5848 case AMDGPU::S_MAX_U32:
5849 case AMDGPU::S_MAX_I32:
5850 case AMDGPU::V_MAX_F32_e64:
5851 case AMDGPU::S_AND_B32:
5852 case AMDGPU::S_OR_B32: {
5853 // Idempotent operations.
5854 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5855 RetBB = &BB;
5856 break;
5857 }
5858 case AMDGPU::V_CMP_LT_U64_e64: // umin
5859 case AMDGPU::V_CMP_LT_I64_e64: // min
5860 case AMDGPU::V_CMP_GT_U64_e64: // umax
5861 case AMDGPU::V_CMP_GT_I64_e64: // max
5862 case AMDGPU::V_MIN_F64_e64:
5863 case AMDGPU::V_MIN_NUM_F64_e64:
5864 case AMDGPU::V_MAX_F64_e64:
5865 case AMDGPU::V_MAX_NUM_F64_e64:
5866 case AMDGPU::S_AND_B64:
5867 case AMDGPU::S_OR_B64: {
5868 // Idempotent operations.
5869 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5870 RetBB = &BB;
5871 break;
5872 }
5873 case AMDGPU::S_XOR_B32:
5874 case AMDGPU::S_XOR_B64:
5875 case AMDGPU::S_ADD_I32:
5876 case AMDGPU::S_ADD_U64_PSEUDO:
5877 case AMDGPU::V_ADD_F32_e64:
5878 case AMDGPU::V_ADD_F64_e64:
5879 case AMDGPU::V_ADD_F64_pseudo_e64:
5880 case AMDGPU::S_SUB_I32:
5881 case AMDGPU::S_SUB_U64_PSEUDO:
5882 case AMDGPU::V_SUB_F32_e64: {
5883 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5884 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5885 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5886 Register NumActiveLanes =
5887 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5888
5889 bool IsWave32 = ST.isWave32();
5890 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5891 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5892 unsigned BitCountOpc =
5893 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5894
5895 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5896
5897 auto NewAccumulator =
5898 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5899 .addReg(ExecMask);
5900
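// For a uniform input the reduction collapses to simple arithmetic on the
// active-lane count: add/sub become Src * popcount(EXEC) (negated for sub)
// and xor becomes Src * (popcount(EXEC) & 1).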
5901 switch (Opc) {
5902 case AMDGPU::S_XOR_B32:
5903 case AMDGPU::S_XOR_B64: {
5904 // Performing an XOR operation on a uniform value
5905 // depends on the parity of the number of active lanes.
5906 // For even parity, the result will be 0, for odd
5907 // parity the result will be the same as the input value.
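// E.g. with five active lanes x^x^x^x^x == x; with four, the result is 0.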
5908 Register ParityRegister =
5909 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5910
5911 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5912 .addReg(NewAccumulator->getOperand(0).getReg())
5913 .addImm(1)
5914 .setOperandDead(3); // Dead scc
5915 if (Opc == AMDGPU::S_XOR_B32) {
5916 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5917 .addReg(SrcReg)
5918 .addReg(ParityRegister);
5919 } else {
5920 Register DestSub0 =
5921 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5922 Register DestSub1 =
5923 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5924 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
5925 MRI.getRegClass(SrcReg), ST, MRI);
5926 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5927 .addReg(Op1L)
5928 .addReg(ParityRegister);
5929 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5930 .addReg(Op1H)
5931 .addReg(ParityRegister);
5932 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
5933 }
5934 break;
5935 }
5936 case AMDGPU::S_SUB_I32: {
5937 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
5938
5939 // Take the negation of the source operand.
5940 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
5941 .addImm(0)
5942 .addReg(SrcReg);
5943 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5944 .addReg(NegatedVal)
5945 .addReg(NewAccumulator->getOperand(0).getReg());
5946 break;
5947 }
5948 case AMDGPU::S_ADD_I32: {
5949 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5950 .addReg(SrcReg)
5951 .addReg(NewAccumulator->getOperand(0).getReg());
5952 break;
5953 }
5954 case AMDGPU::S_ADD_U64_PSEUDO:
5955 case AMDGPU::S_SUB_U64_PSEUDO: {
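// The 64-bit result is Src * N, where N is popcount(EXEC) (negated and
// sign-extended for the sub case), assembled from 32-bit pieces:
//   lo = Op1L * N
//   hi = mulhi_u32(Op1L, N) + Op1H * N   [+ Op1L * hi(N) for sub]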
5956 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5957 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5958 Register Op1H_Op0L_Reg =
5959 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5960 Register Op1L_Op0H_Reg =
5961 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5962 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5963 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5964 Register NegatedValLo =
5965 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5966 Register NegatedValHi =
5967 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5968 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
5969 MRI.getRegClass(SrcReg), ST, MRI);
5970 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
5971 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
5972 .addImm(0)
5973 .addReg(NewAccumulator->getOperand(0).getReg())
5974 .setOperandDead(3); // Dead scc
5975 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
5976 .addReg(NegatedValLo)
5977 .addImm(31)
5978 .setOperandDead(3); // Dead scc
5979 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
5980 .addReg(Op1L)
5981 .addReg(NegatedValHi);
5982 }
5983 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
5984 ? NegatedValLo
5985 : NewAccumulator->getOperand(0).getReg();
5986 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5987 .addReg(Op1L)
5988 .addReg(LowOpcode);
5989 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
5990 .addReg(Op1L)
5991 .addReg(LowOpcode);
5992 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
5993 .addReg(Op1H)
5994 .addReg(LowOpcode);
5995
5996 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
5997 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
5998 .addReg(CarryReg)
5999 .addReg(Op1H_Op0L_Reg)
6000 .setOperandDead(3); // Dead scc
6001
6002 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6003 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
6004 .addReg(HiVal)
6005 .addReg(Op1L_Op0H_Reg)
6006 .setOperandDead(3); // Dead scc
6007 }
6008 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
6009 break;
6010 }
6011 case AMDGPU::V_ADD_F32_e64:
6012 case AMDGPU::V_ADD_F64_e64:
6013 case AMDGPU::V_ADD_F64_pseudo_e64:
6014 case AMDGPU::V_SUB_F32_e64: {
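// A uniform fadd/fsub reduction is Src * float(popcount(EXEC)); the sub
// variants negate Src via the src0 modifier on the multiply.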
6015 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
6016 const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0);
6017 Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
6018 Register DstVreg = MRI.createVirtualRegister(VregRC);
6019 // Get number of active lanes as a float val.
6020 BuildMI(BB, MI, DL,
6021 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
6022 : AMDGPU::V_CVT_F64_I32_e64),
6023 ActiveLanesVreg)
6024 .addReg(NewAccumulator->getOperand(0).getReg())
6025 .addImm(0) // clamp
6026 .addImm(0); // output-modifier
6027
6028 // Take the negation of the input for SUB reductions.
6029 unsigned srcMod = (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6030 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
6033 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
6034 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
6035 ? AMDGPU::V_MUL_F64_pseudo_e64
6036 : AMDGPU::V_MUL_F64_e64;
6037 auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc),
6038 DstVreg)
6039 .addImm(srcMod) // src0 modifier
6040 .addReg(SrcReg)
6041 .addImm(SISrcMods::NONE) // src1 modifier
6042 .addReg(ActiveLanesVreg)
6043 .addImm(SISrcMods::NONE) // clamp
6044 .addImm(SISrcMods::NONE); // output-mod
6045 if (is32BitOpc) {
6046 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6047 .addReg(DstVreg);
6048 } else {
6049 Register LaneValueLoReg =
6050 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6051 Register LaneValueHiReg =
6052 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6053 auto [Op1L, Op1H] =
6054 ExtractSubRegs(MI, DestVregInst->getOperand(0), VregRC, ST, MRI);
6055 // The lane value input needs to be in an SGPR.
6056 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6057 LaneValueLoReg)
6058 .addReg(Op1L);
6059 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6060 LaneValueHiReg)
6061 .addReg(Op1H);
6062 NewAccumulator =
6063 BuildRegSequence(BB, MI, DstReg, LaneValueLoReg, LaneValueHiReg);
6064 }
6065 }
6066 }
6067 RetBB = &BB;
6068 }
6069 }
6070 } else {
6072 Register SrcReg = MI.getOperand(1).getReg();
6073 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
6075 bool NeedsMovDPP = !is32BitOpc;
6076 // Create virtual registers required for lowering.
6077 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
6078 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
6079 const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg);
6080 bool IsWave32 = ST.isWave32();
6081 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6082 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6083 if (Strategy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
6084 !ST.hasDPP()) { // If the target doesn't support DPP operations,
6085 // default to the iterative strategy.
6086
6087 // To reduce the VGPR using the iterative approach, we need to iterate over
6088 // all the active lanes. The lowering consists of a ComputeLoop that iterates
6089 // over only the active lanes. We use a copy of the EXEC register as the
6090 // induction variable, and every iteration clears the just-processed lane's
6091 // bit with bitset0 so that the next iteration picks the next active lane.
6092
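// Roughly:
//   acc = identity; bits = EXEC;
//   do {
//     lane = s_ff1(bits);
//     acc  = op(acc, readlane(src, lane));
//     bits = bitset0(bits, lane);
//   } while (bits != 0);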
6093 // Create the control flow for the loop: split MI's basic block into the
6094 // loop body (ComputeLoop) and the continuation block (ComputeEnd).
6095 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
6096
6097 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
6098 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
6099 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
6100 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
6101 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
6102 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6103 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
6104
6105 // Create the initial values of the induction variable (a copy of EXEC) and
6106 // the accumulator, and branch to the newly created ComputeLoop block.
6107 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
6108 uint64_t IdentityValue =
6109 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6110 ? 0x0 // +0.0 for double sub reduction
6112 BuildMI(BB, I, DL,
6113 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6114 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6115 IdentityValReg)
6116 .addImm(IdentityValue);
6117 // clang-format off
6118 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
6119 .addMBB(ComputeLoop);
6120 // clang-format on
6121
6122 // Start constructing ComputeLoop
6123 I = ComputeLoop->begin();
6124 auto Accumulator =
6125 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
6126 .addReg(IdentityValReg)
6127 .addMBB(&BB);
6128 auto ActiveBits =
6129 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
6130 .addReg(LoopIterator)
6131 .addMBB(&BB);
6132
6133 I = ComputeLoop->end();
6134 MachineInstr *NewAccumulator;
6135 // Perform the computations
6136 unsigned SFFOpc =
6137 IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
6138 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
6139 .addReg(ActiveBitsReg);
6140 if (is32BitOpc) {
6141 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6142 LaneValueReg)
6143 .addReg(SrcReg)
6144 .addReg(FF1Reg);
6145 if (isFPOp) {
6146 Register LaneValVreg =
6147 MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
6148 Register DstVreg = MRI.createVirtualRegister(MRI.getRegClass(SrcReg));
6149 // Move the lane value into a VGPR to avoid the constant bus restriction.
6150 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_MOV_B32_e32),
6151 LaneValVreg)
6152 .addReg(LaneValueReg);
6153 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6154 .addImm(0) // src0 modifier
6155 .addReg(Accumulator->getOperand(0).getReg())
6156 .addImm(0) // src1 modifier
6157 .addReg(LaneValVreg)
6158 .addImm(0) // clamp
6159 .addImm(0); // omod
6160 NewAccumulator =
6161 BuildMI(*ComputeLoop, I, DL,
6162 TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6163 .addReg(DstVreg);
6164 } else {
6165 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6166 .addReg(Accumulator->getOperand(0).getReg())
6167 .addReg(LaneValueReg);
6168 }
6169 } else {
6170 Register LaneValueLoReg =
6171 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6172 Register LaneValueHiReg =
6173 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6174 Register LaneValReg =
6175 MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6176 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
6177 MRI.getRegClass(SrcReg), ST, MRI);
6178 // The lane value input needs to be in an SGPR.
6179 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6180 LaneValueLoReg)
6181 .addReg(Op1L)
6182 .addReg(FF1Reg);
6183 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6184 LaneValueHiReg)
6185 .addReg(Op1H)
6186 .addReg(FF1Reg);
6187 auto LaneValue = BuildRegSequence(*ComputeLoop, I, LaneValReg,
6188 LaneValueLoReg, LaneValueHiReg);
6189 switch (Opc) {
6190 case AMDGPU::S_OR_B64:
6191 case AMDGPU::S_AND_B64:
6192 case AMDGPU::S_XOR_B64: {
6193 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6194 .addReg(Accumulator->getOperand(0).getReg())
6195 .addReg(LaneValue->getOperand(0).getReg())
6196 .setOperandDead(3); // Dead scc
6197 break;
6198 }
6199 case AMDGPU::V_CMP_GT_I64_e64:
6200 case AMDGPU::V_CMP_GT_U64_e64:
6201 case AMDGPU::V_CMP_LT_I64_e64:
6202 case AMDGPU::V_CMP_LT_U64_e64: {
6203 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6204 Register ComparisonResultReg =
6205 MRI.createVirtualRegister(WaveMaskRegClass);
6206 int SrcIdx =
6207 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6208 const TargetRegisterClass *VregClass =
6209 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6210 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
6211 auto [SrcReg0Sub0, SrcReg0Sub1] = ExtractSubRegs(
6212 MI, Accumulator->getOperand(0), VregClass, ST, MRI);
6213 BuildRegSequence(*ComputeLoop, I, AccumulatorVReg, SrcReg0Sub0,
6214 SrcReg0Sub1);
6215 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
6216 .addReg(LaneValue->getOperand(0).getReg())
6217 .addReg(AccumulatorVReg);
6218
6219 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6220 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
6221 .addReg(LaneMaskReg)
6222 .addReg(ActiveBitsReg);
6223
6224 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6225 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6226 .addReg(LaneValue->getOperand(0).getReg())
6227 .addReg(Accumulator->getOperand(0).getReg());
6228 break;
6229 }
6230 case AMDGPU::V_MIN_F64_e64:
6231 case AMDGPU::V_MIN_NUM_F64_e64:
6232 case AMDGPU::V_MAX_F64_e64:
6233 case AMDGPU::V_MAX_NUM_F64_e64:
6234 case AMDGPU::V_ADD_F64_e64:
6235 case AMDGPU::V_ADD_F64_pseudo_e64: {
6236 int SrcIdx =
6237 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6238 const TargetRegisterClass *VregRC =
6239 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6240 Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
6241 Register DstVreg = MRI.createVirtualRegister(VregRC);
6242 Register LaneValLo =
6243 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6244 Register LaneValHi =
6245 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6246 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
6247 .addReg(Accumulator->getOperand(0).getReg());
6248 unsigned Modifier =
6249 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6252 auto DstVregInst =
6253 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6254 .addImm(Modifier) // src0 modifiers
6255 .addReg(LaneValue->getOperand(0).getReg())
6256 .addImm(SISrcMods::NONE) // src1 modifiers
6257 .addReg(AccumulatorVReg)
6258 .addImm(SISrcMods::NONE) // clamp
6259 .addImm(SISrcMods::NONE); // omod
6260 auto ReadLaneLo =
6261 BuildMI(*ComputeLoop, I, DL,
6262 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
6263 auto ReadLaneHi =
6264 BuildMI(*ComputeLoop, I, DL,
6265 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
6266 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6267 auto [Op1L, Op1H] = ExtractSubRegs(*Iters, DstVregInst->getOperand(0),
6268 VregRC, ST, MRI);
6269 ReadLaneLo.addReg(Op1L);
6270 ReadLaneHi.addReg(Op1H);
6271 NewAccumulator =
6272 BuildRegSequence(*ComputeLoop, I, DstReg, LaneValLo, LaneValHi);
6273 break;
6274 }
6275 case AMDGPU::S_ADD_U64_PSEUDO:
6276 case AMDGPU::S_SUB_U64_PSEUDO: {
6277 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6278 .addReg(Accumulator->getOperand(0).getReg())
6279 .addReg(LaneValue->getOperand(0).getReg());
6280 ComputeLoop =
6281 expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
6282 break;
6283 }
6284 }
6285 }
6286 // Clear the just-processed lane's bit in the active-lane mask to advance to
6287 // the next active lane.
6287 unsigned BITSETOpc =
6288 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6289 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
6290 .addReg(FF1Reg)
6291 .addReg(ActiveBitsReg);
6292
6293 // Add phi nodes
6294 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
6295 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6296
6297 // Create the loop back-edge branch.
6298 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6299 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
6300 .addReg(NewActiveBitsReg)
6301 .addImm(0);
6302 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6303 .addMBB(ComputeLoop);
6304
6305 RetBB = ComputeEnd;
6306 } else {
6307 assert(ST.hasDPP() && "Subtarget does not support DPP operations");
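// DPP strategy: set the inactive lanes to the identity value, then combine
// each lane with its neighbours via row_shr:1,2,4,8 followed by row
// broadcasts (or ds_swizzle / ds_permute fallbacks), so the last lane ends up
// holding the reduction of all active lanes, which is finally read back into
// an SGPR.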
6308 MachineBasicBlock *CurrBB = &BB;
6309 Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
6310 Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
6311 Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
6312 Register DPPRowShr1 = MRI.createVirtualRegister(SrcRegClass);
6313 Register DPPRowShr2 = MRI.createVirtualRegister(SrcRegClass);
6314 Register DPPRowShr4 = MRI.createVirtualRegister(SrcRegClass);
6315 Register DPPRowShr8 = MRI.createVirtualRegister(SrcRegClass);
6316 Register RowBcast15 = MRI.createVirtualRegister(SrcRegClass);
6317 Register ReducedValSGPR = MRI.createVirtualRegister(DstRegClass);
6318 Register NegatedReducedVal = MRI.createVirtualRegister(DstRegClass);
6319 Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
6320 Register UndefExec = MRI.createVirtualRegister(WaveMaskRegClass);
6321 Register FinalDPPResult;
6322 MachineInstr *SrcWithIdentityInstr;
6323 MachineInstr *LastBcastInstr;
6324 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
6325
6327 BuildMI(*CurrBB, MI, DL,
6328 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6329 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6330 IdentitySGPR)
6331 .addImm(IdentityValue);
6332 auto IdentityCopyInstr =
6333 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
6334 .addReg(IdentitySGPR);
6335 auto DPPClampOpcPair = getDPPOpcForWaveReduction(Opc, ST);
6336 unsigned DPPOpc = std::get<0>(DPPClampOpcPair);
6337 unsigned ClampOpc = std::get<1>(DPPClampOpcPair);
6338 auto BuildSetInactiveInstr = [&](Register Dst, Register Src0,
6339 Register Src1) {
6340 return BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SET_INACTIVE_B32),
6341 Dst)
6342 .addImm(0) // src0 modifiers
6343 .addReg(Src0) // src0
6344 .addImm(0) // src1 modifiers
6345 .addReg(Src1) // identity value for inactive lanes
6346 .addReg(UndefExec); // bool i1
6347 };
6348 auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
6349 unsigned DPPCtrl) {
6350 auto DPPInstr =
6351 BuildMI(*CurrBB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
6352 if (isFPOp && !NeedsMovDPP)
6353 DPPInstr.addImm(SISrcMods::NONE); // src0 modifier
6354 DPPInstr.addReg(Src); // src0
6355 if (isFPOp && !NeedsMovDPP)
6356 DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
6357 if (!NeedsMovDPP)
6358 DPPInstr.addReg(Src); // src1
6359 if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
6360 DPPInstr.addImm(0); // clamp
6361 DPPInstr
6362 .addImm(DPPCtrl) // dpp-ctrl
6363 .addImm(0xf) // row-mask
6364 .addImm(0xf) // bank-mask
6365 .addImm(0); // bound-control
6366 };
6367 auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1,
6368 bool isAddSub = false,
6369 bool needsCarryIn = false,
6370 Register CarryIn = Register()) {
6371 unsigned InstrOpc = ClampOpc;
6372 Register CarryOutReg = MRI.createVirtualRegister(WaveMaskRegClass);
6373 if (needsCarryIn)
6374 InstrOpc = AMDGPU::V_ADDC_U32_e64;
6375 auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(InstrOpc), Dst);
6376 if (isFPOp)
6377 ClampInstr.addImm(SISrcMods::NONE); // src0 mod
6378 if (isAddSub) {
6379 if (needsCarryIn)
6380 ClampInstr.addReg(CarryOutReg,
6382 RegState::Dead); // killed carry-out reg
6383 else
6384 ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
6385 }
6386 ClampInstr.addReg(Src0); // src0
6387 if (isFPOp)
6388 ClampInstr.addImm(SISrcMods::NONE); // src1 mod
6389 ClampInstr.addReg(Src1); // src1
6390 if (needsCarryIn)
6391 ClampInstr.addReg(CarryIn, RegState::Kill); // carry-in reg
6392 if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
6393 ClampInstr.addImm(0); // clamp
6394 if (isFPOp)
6395 ClampInstr.addImm(0); // omod
6396 LastBcastInstr = ClampInstr;
6397 return CarryOutReg;
6398 };
6399 auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
6400 bool isAddSubOpc =
6401 Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
6402 bool isBitWiseOpc = Opc == AMDGPU::S_AND_B64 ||
6403 Opc == AMDGPU::S_OR_B64 || Opc == AMDGPU::S_XOR_B64;
6404 Register ReturnReg = MRI.createVirtualRegister(SrcRegClass);
6405 if (isAddSubOpc || isBitWiseOpc) {
6406 Register ResLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6407 Register ResHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6408 MachineOperand Src0Operand =
6409 MachineOperand::CreateReg(Src0, /*isDef=*/false);
6410 MachineOperand Src1Operand =
6411 MachineOperand::CreateReg(Src1, /*isDef=*/false);
6412 auto [Src0Lo, Src0Hi] =
6413 ExtractSubRegs(MI, Src0Operand, SrcRegClass, ST, MRI);
6414 auto [Src1Lo, Src1Hi] =
6415 ExtractSubRegs(MI, Src1Operand, SrcRegClass, ST, MRI);
6416 Register CarryReg = BuildClampInstr(
6417 ResLo, Src0Lo, Src1Lo, isAddSubOpc, /*needsCarryIn*/ false);
6418 BuildClampInstr(ResHi, Src0Hi, Src1Hi, isAddSubOpc,
6419 /*needsCarryIn*/ isAddSubOpc, CarryReg);
6420 BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
6421 } else {
6422 if (isFPOp) {
6423 BuildMI(*CurrBB, MI, DL, TII->get(Opc), ReturnReg)
6424 .addImm(SISrcMods::NONE) // src0 modifiers
6425 .addReg(Src0)
6426 .addImm(SISrcMods::NONE) // src1 modifiers
6427 .addReg(Src1)
6428 .addImm(SISrcMods::NONE) // clamp
6429 .addImm(SISrcMods::NONE); // omod
6430 } else {
6431 Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6432 BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
6433 .addReg(Src0) // src0
6434 .addReg(Src1); // src1
6435 LastBcastInstr =
6436 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
6437 ReturnReg)
6438 .addReg(Src1) // src0
6439 .addReg(Src0) // src1
6440 .addReg(CmpMaskReg); // src2
6441 expand64BitV_CNDMASK(*LastBcastInstr, CurrBB);
6442 }
6443 }
6444 return ReturnReg;
6445 };
6446
6447 // Set inactive lanes to the identity value.
6448 if (is32BitOpc) {
6449 SrcWithIdentityInstr =
6450 BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
6451 } else {
6452 Register SrcWithIdentitylo =
6453 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6454 Register SrcWithIdentityhi =
6455 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6456 auto [Reg0Sub0, Reg0Sub1] = ExtractSubRegs(
6457 MI, IdentityCopyInstr->getOperand(0), SrcRegClass, ST, MRI);
6458 auto [SrcReg0Sub0, SrcReg0Sub1] =
6459 ExtractSubRegs(MI, MI.getOperand(1), SrcRegClass, ST, MRI);
6460 MachineInstr *SetInactiveLoInstr =
6461 BuildSetInactiveInstr(SrcWithIdentitylo, SrcReg0Sub0, Reg0Sub0);
6462 MachineInstr *SetInactiveHiInstr =
6463 BuildSetInactiveInstr(SrcWithIdentityhi, SrcReg0Sub1, Reg0Sub1);
6464 SrcWithIdentityInstr =
6465 BuildRegSequence(*CurrBB, MI, SrcWithIdentity,
6466 SetInactiveLoInstr->getOperand(0).getReg(),
6467 SetInactiveHiInstr->getOperand(0).getReg());
6468 }
6469 // DPP reduction
6470 Register SrcWithIdentityReg =
6471 SrcWithIdentityInstr->getOperand(0).getReg();
6472 BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
6474 if (NeedsMovDPP)
6475 DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
6476
6477 BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
6479 if (NeedsMovDPP)
6480 DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
6481
6482 BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
6484 if (NeedsMovDPP)
6485 DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
6486
6487 BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
6489 if (NeedsMovDPP)
6490 DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
6491
6492 if (ST.hasDPPBroadcasts()) {
6493 BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15);
6494 if (NeedsMovDPP)
6495 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
6496 } else {
6497 // ds_swizzle magic constant: 0x1E0
6498 // BIT_MODE (bit-mask mode) : bit 15 = 0
6499 // XOR mask : bits [14:10] = 0
6500 // OR mask : bits [9:5] = 15
6501 // AND mask : bits [4:0] = 0
6502 if (is32BitOpc) {
6503 Register SwizzledValue =
6504 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6505 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6506 SwizzledValue)
6507 .addReg(DPPRowShr8) // addr
6508 .addImm(0x1E0) // swizzle offset (i16)
6509 .addImm(0x0); // gds (i1)
6510 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
6511 } else {
6512 Register SwizzledValuelo =
6513 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6514 Register SwizzledValuehi =
6515 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6516 Register SwizzledValue64 = MRI.createVirtualRegister(SrcRegClass);
6517 MachineOperand DPPRowShr8Op =
6518 MachineOperand::CreateReg(DPPRowShr8, /*isDef=*/false);
6519 auto [Op1L, Op1H] =
6520 ExtractSubRegs(MI, DPPRowShr8Op, SrcRegClass, ST, MRI);
6521 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6522 SwizzledValuelo)
6523 .addReg(Op1L) // addr
6524 .addImm(0x1E0) // swizzle offset (i16)
6525 .addImm(0x0); // gds (i1)
6526 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6527 SwizzledValuehi)
6528 .addReg(Op1H) // addr
6529 .addImm(0x1E0) // swizzle offset (i16)
6530 .addImm(0x0); // gds (i1)
6531 BuildRegSequence(*CurrBB, MI, SwizzledValue64, SwizzledValuelo,
6532 SwizzledValuehi);
6533 if (NeedsMovDPP)
6534 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
6535 else
6536 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
6537 }
6538 }
6539 FinalDPPResult = RowBcast15;
6540 if (!IsWave32) {
6541 if (ST.hasDPPBroadcasts()) {
6542 BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31);
6543 if (NeedsMovDPP)
6544 RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
6545 } else {
6546 Register ShiftedThreadID =
6547 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6548 Register PermuteByteOffset =
6549 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6550 Register PermutedValue = MRI.createVirtualRegister(SrcRegClass);
6551 Register Lane32Offset =
6552 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6553 Register WordSizeConst =
6554 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6555 Register ThreadIDRegLo =
6556 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6557 Register ThreadIDReg =
6558 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6559 // Get the thread ID.
6560 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
6561 ThreadIDRegLo)
6562 .addImm(-1)
6563 .addImm(0);
6564 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
6565 ThreadIDReg)
6566 .addImm(-1)
6567 .addReg(ThreadIDRegLo);
6568 // Shift each lane index over by 32 positions, so the value in lane 31 ends
6569 // up in lane 63.
6570 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
6571 .addImm(0x20);
6572 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64),
6573 ShiftedThreadID)
6574 .addReg(ThreadIDReg)
6575 .addReg(Lane32Offset)
6576 .addImm(0); // clamp
6577 // Multiply by the dword size (4 bytes) to form the ds_permute byte offset.
6578 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
6579 .addImm(0x4);
6580 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
6581 PermuteByteOffset)
6582 .addReg(WordSizeConst)
6583 .addReg(ShiftedThreadID);
6584 // Permute the lanes
6585 if (is32BitOpc) {
6586 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6587 PermutedValue)
6588 .addReg(PermuteByteOffset) // addr
6589 .addReg(RowBcast15) // data
6590 .addImm(0); // offset
6591 } else {
6592 Register PermutedValuelo =
6593 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6594 Register PermutedValuehi =
6595 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6596 MachineOperand RowBcast15Op =
6597 MachineOperand::CreateReg(RowBcast15, /*isDef=*/false);
6598 auto [RowBcast15Lo, RowBcast15Hi] =
6599 ExtractSubRegs(MI, RowBcast15Op, SrcRegClass, ST, MRI);
6600 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6601 PermutedValuelo)
6602 .addReg(PermuteByteOffset) // addr
6603 .addReg(RowBcast15Lo) // data
6604 .addImm(0x0); // offset
6605 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6606 PermutedValuehi)
6607 .addReg(PermuteByteOffset) // addr
6608 .addReg(RowBcast15Hi) // data
6609 .addImm(0x0); // offset
6610 BuildRegSequence(*CurrBB, MI, PermutedValue, PermutedValuelo,
6611 PermutedValuehi);
6612 }
6613 if (NeedsMovDPP)
6614 RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
6615 else
6616 BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
6617 }
6618 FinalDPPResult = RowBcast31;
6619 }
6620 if (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6621 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) {
6622 Register NegatedValVGPR = MRI.createVirtualRegister(SrcRegClass);
6623 // Opc for f32 reduction is V_SUB_F32.
6624 // For f64, there is no equivalent V_SUB_F64 opcode, so use
6625 // V_ADD_F64/V_ADD_F64_pseudo, and negate the second operand.
6626 BuildMI(*CurrBB, MI, DL, TII->get(Opc),
6627 NegatedValVGPR)
6628 .addImm(SISrcMods::NONE) // src0 mods
6629 .addReg(IdentityVGPR) // src0
6630 .addImm(is32BitOpc ? SISrcMods::NONE : SISrcMods::NEG) // src1 mods
6631 .addReg(IsWave32 ? RowBcast15 : RowBcast31) // src1
6632 .addImm(SISrcMods::NONE) // clamp
6633 .addImm(SISrcMods::NONE); // omod
6634 FinalDPPResult = NegatedValVGPR;
6635 }
6636 // The final reduced value is in the last lane.
6637 if (is32BitOpc) {
6638 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6639 ReducedValSGPR)
6640 .addReg(FinalDPPResult)
6641 .addImm(ST.getWavefrontSize() - 1);
6642 } else {
6643 Register LaneValueLoReg =
6644 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6645 Register LaneValueHiReg =
6646 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6647 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
6648 MachineOperand FinalDPPResultOperand =
6649 MachineOperand::CreateReg(FinalDPPResult, /*isDef=*/false);
6650 auto [Op1L, Op1H] =
6651 ExtractSubRegs(MI, FinalDPPResultOperand, SrcRC, ST, MRI);
6652 // The lane value input needs to be in an SGPR.
6653 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6654 LaneValueLoReg)
6655 .addReg(Op1L)
6656 .addImm(ST.getWavefrontSize() - 1);
6657 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6658 LaneValueHiReg)
6659 .addReg(Op1H)
6660 .addImm(ST.getWavefrontSize() - 1);
6661 BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
6662 LaneValueHiReg);
6663 }
6664 if (Opc == AMDGPU::S_SUB_I32) {
6665 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
6666 .addImm(0)
6667 .addReg(ReducedValSGPR);
6668 } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6669 auto NegatedValInstr =
6670 BuildMI(*CurrBB, MI, DL, TII->get(Opc), NegatedReducedVal)
6671 .addImm(0)
6672 .addReg(ReducedValSGPR);
6673 CurrBB = expand64BitScalarArithmetic(*NegatedValInstr, CurrBB);
6674 }
6675 // Mark the final result as a whole-wave-mode calculation.
6676 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
6677 .addReg(Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
6678 ? NegatedReducedVal
6679 : ReducedValSGPR);
6680 RetBB = CurrBB;
6681 }
6682 }
6683 MI.eraseFromParent();
6684 return RetBB;
6685}
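// This lowers the llvm.amdgcn.wave.reduce.* pseudos; at the IR level the
// corresponding intrinsic call looks roughly like
//   %r = call i32 @llvm.amdgcn.wave.reduce.umin.i32(i32 %v, i32 0)
// where the trailing immediate selects the strategy (0 = default,
// 1 = iterative, 2 = DPP), matching the enum above.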
6686
6689 MachineBasicBlock *BB) const {
6690 MachineFunction *MF = BB->getParent();
6692 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6694 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6695 MachineRegisterInfo &MRI = MF->getRegInfo();
6696 const DebugLoc &DL = MI.getDebugLoc();
6697
6698 switch (MI.getOpcode()) {
6699 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6700 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6701 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6702 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6703 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6704 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6705 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6706 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6707 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6708 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6709 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6710 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6711 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6712 ? AMDGPU::V_MIN_NUM_F64_e64
6713 : AMDGPU::V_MIN_F64_e64);
6714 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6715 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6716 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6717 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6718 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6719 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6720 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6721 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6722 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6723 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6724 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6725 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6726 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6727 ? AMDGPU::V_MAX_NUM_F64_e64
6728 : AMDGPU::V_MAX_F64_e64);
6729 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6730 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6731 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6732 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6733 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6734 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6735 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6736 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6737 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6738 ? AMDGPU::V_ADD_F64_pseudo_e64
6739 : AMDGPU::V_ADD_F64_e64);
6740 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6741 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6742 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6743 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6744 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6745 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6746 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6747 // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6748 // fadd + neg, by setting the NEG bit in the instruction.
6749 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6750 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6751 ? AMDGPU::V_ADD_F64_pseudo_e64
6752 : AMDGPU::V_ADD_F64_e64);
6753 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6754 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6755 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6756 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6757 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6758 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6759 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6760 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6761 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6762 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6763 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6764 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6765 case AMDGPU::S_UADDO_PSEUDO:
6766 case AMDGPU::S_USUBO_PSEUDO: {
6767 MachineOperand &Dest0 = MI.getOperand(0);
6768 MachineOperand &Dest1 = MI.getOperand(1);
6769 MachineOperand &Src0 = MI.getOperand(2);
6770 MachineOperand &Src1 = MI.getOperand(3);
6771
6772 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6773 ? AMDGPU::S_ADD_U32
6774 : AMDGPU::S_SUB_U32;
6775 // clang-format off
6776 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6777 .add(Src0)
6778 .add(Src1);
6779 // clang-format on
6780
6781 unsigned SelOpc =
6782 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6783 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6784
6785 MI.eraseFromParent();
6786 return BB;
6787 }
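// Rough shape of the expansion above (placeholder register names, wave64
// shown): S_UADDO_PSEUDO %sum, %ovf, %a, %b becomes approximately
//   %sum:sreg_32 = S_ADD_U32 %a, %b     ; sets SCC on unsigned carry-out
//   %ovf:sreg_64 = S_CSELECT_B64 -1, 0  ; materialize SCC as a lane mask
// with S_SUB_U32 used for the subtract pseudo and S_CSELECT_B32 on wave32.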
6788 case AMDGPU::S_ADD_U64_PSEUDO:
6789 case AMDGPU::S_SUB_U64_PSEUDO: {
6790 return expand64BitScalarArithmetic(MI, BB);
6791 }
6792 case AMDGPU::V_ADD_U64_PSEUDO:
6793 case AMDGPU::V_SUB_U64_PSEUDO: {
6794 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6795
6796 MachineOperand &Dest = MI.getOperand(0);
6797 MachineOperand &Src0 = MI.getOperand(1);
6798 MachineOperand &Src1 = MI.getOperand(2);
6799
6800 if (ST.hasAddSubU64Insts()) {
6801 auto I = BuildMI(*BB, MI, DL,
6802 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6803 : AMDGPU::V_SUB_U64_e64),
6804 Dest.getReg())
6805 .add(Src0)
6806 .add(Src1)
6807 .addImm(0); // clamp
6808 TII->legalizeOperands(*I);
6809 MI.eraseFromParent();
6810 return BB;
6811 }
6812
6813 if (IsAdd && ST.hasLshlAddU64Inst()) {
6814 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6815 Dest.getReg())
6816 .add(Src0)
6817 .addImm(0)
6818 .add(Src1);
6819 TII->legalizeOperands(*Add);
6820 MI.eraseFromParent();
6821 return BB;
6822 }
6823
6824 const auto *CarryRC = TRI->getWaveMaskRegClass();
6825
6826 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6827 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6828
6829 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6830 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6831
6832 const TargetRegisterClass *Src0RC = Src0.isReg()
6833 ? MRI.getRegClass(Src0.getReg())
6834 : &AMDGPU::VReg_64RegClass;
6835 const TargetRegisterClass *Src1RC = Src1.isReg()
6836 ? MRI.getRegClass(Src1.getReg())
6837 : &AMDGPU::VReg_64RegClass;
6838
6839 const TargetRegisterClass *Src0SubRC =
6840 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6841 const TargetRegisterClass *Src1SubRC =
6842 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6843
6844 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6845 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6846 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6847 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6848
6849 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6850 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6851 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6852 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6853
6854 unsigned LoOpc =
6855 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6856 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6857 .addReg(CarryReg, RegState::Define)
6858 .add(SrcReg0Sub0)
6859 .add(SrcReg1Sub0)
6860 .addImm(0); // clamp bit
6861
6862 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6863 MachineInstr *HiHalf =
6864 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6865 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6866 .add(SrcReg0Sub1)
6867 .add(SrcReg1Sub1)
6868 .addReg(CarryReg, RegState::Kill)
6869 .addImm(0); // clamp bit
6870
6871 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6872 .addReg(DestSub0)
6873 .addImm(AMDGPU::sub0)
6874 .addReg(DestSub1)
6875 .addImm(AMDGPU::sub1);
6876 TII->legalizeOperands(*LoHalf);
6877 TII->legalizeOperands(*HiHalf);
6878 MI.eraseFromParent();
6879 return BB;
6880 }
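// Rough shape of the carry-chain fallback above (placeholder registers):
//   %lo:vgpr_32, %carry = V_ADD_CO_U32_e64 %a.sub0, %b.sub0, 0
//   %hi:vgpr_32, %dead  = V_ADDC_U32_e64  %a.sub1, %b.sub1, %carry, 0
//   %dst:vreg_64        = REG_SEQUENCE %lo, %subreg.sub0, %hi, %subreg.sub1
// The subtract pseudo uses V_SUB_CO_U32_e64 / V_SUBB_U32_e64 instead.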
6881 case AMDGPU::S_ADD_CO_PSEUDO:
6882 case AMDGPU::S_SUB_CO_PSEUDO: {
6883 // This pseudo can only be selected from a uniform add/subcarry node,
6884 // so all of its VGPR operands are therefore assumed to be splat
6885 // (wave-uniform) values.
6887 MachineOperand &Dest = MI.getOperand(0);
6888 MachineOperand &CarryDest = MI.getOperand(1);
6889 MachineOperand &Src0 = MI.getOperand(2);
6890 MachineOperand &Src1 = MI.getOperand(3);
6891 MachineOperand &Src2 = MI.getOperand(4);
6892 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6893 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6894 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6895 .addReg(Src0.getReg());
6896 Src0.setReg(RegOp0);
6897 }
6898 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6899 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6900 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6901 .addReg(Src1.getReg());
6902 Src1.setReg(RegOp1);
6903 }
6904 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6905 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6906 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6907 .addReg(Src2.getReg());
6908 Src2.setReg(RegOp2);
6909 }
6910
6911 if (ST.isWave64()) {
6912 if (ST.hasScalarCompareEq64()) {
6913 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6914 .addReg(Src2.getReg())
6915 .addImm(0);
6916 } else {
6917 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6918 const TargetRegisterClass *SubRC =
6919 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6920 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6921 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6922 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6923 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
6924 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6925
6926 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
6927 .add(Src2Sub0)
6928 .add(Src2Sub1);
6929
6930 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6931 .addReg(Src2_32, RegState::Kill)
6932 .addImm(0);
6933 }
6934 } else {
6935 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
6936 .addReg(Src2.getReg())
6937 .addImm(0);
6938 }
6939
6940 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
6941 ? AMDGPU::S_ADDC_U32
6942 : AMDGPU::S_SUBB_U32;
6943
6944 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
6945
6946 unsigned SelOpc =
6947 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6948
6949 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
6950 .addImm(-1)
6951 .addImm(0);
6952
6953 MI.eraseFromParent();
6954 return BB;
6955 }
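// Rough shape of the expansion above (placeholder registers): any VGPR
// inputs are first copied to SGPRs with V_READFIRSTLANE_B32, then
//   S_CMP_LG_U32 %carry_in, 0           ; set SCC from the carry-in mask
//   %dst       = S_ADDC_U32 %a, %b      ; consumes and redefines SCC
//   %carry_out = S_CSELECT_B64 -1, 0    ; S_CSELECT_B32 on wave32
// The wave64 carry-in compare may need the S_OR_B32 split shown above when
// 64-bit scalar compares are unavailable.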
6956 case AMDGPU::SI_INIT_M0: {
6957 MachineOperand &M0Init = MI.getOperand(0);
6958 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6959 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
6960 AMDGPU::M0)
6961 .add(M0Init);
6962 MI.eraseFromParent();
6963 return BB;
6964 }
6965 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
6966 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
6967 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
6968 TII->get(AMDGPU::S_CMP_EQ_U32))
6969 .addImm(0)
6970 .addImm(0);
6971 return BB;
6972 }
6973 case AMDGPU::GET_GROUPSTATICSIZE: {
6974 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
6975 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
6976 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
6977 .add(MI.getOperand(0))
6978 .addImm(MFI->getLDSSize());
6979 MI.eraseFromParent();
6980 return BB;
6981 }
6982 case AMDGPU::GET_SHADERCYCLESHILO: {
6983 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
6984 // The algorithm is:
6985 //
6986 // hi1 = getreg(SHADER_CYCLES_HI)
6987 // lo1 = getreg(SHADER_CYCLES_LO)
6988 // hi2 = getreg(SHADER_CYCLES_HI)
6989 //
6990 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
6991 // Otherwise there was overflow and the result is hi2:0. In both cases the
6992 // result should represent the actual time at some point during the sequence
6993 // of three getregs.
6994 using namespace AMDGPU::Hwreg;
6995 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6996 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
6997 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
6998 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6999 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
7000 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
7001 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7002 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
7003 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
7004 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
7005 .addReg(RegHi1)
7006 .addReg(RegHi2);
7007 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7008 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
7009 .addReg(RegLo1)
7010 .addImm(0);
7011 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
7012 .add(MI.getOperand(0))
7013 .addReg(RegLo)
7014 .addImm(AMDGPU::sub0)
7015 .addReg(RegHi2)
7016 .addImm(AMDGPU::sub1);
7017 MI.eraseFromParent();
7018 return BB;
7019 }
7020 case AMDGPU::SI_INDIRECT_SRC_V1:
7021 case AMDGPU::SI_INDIRECT_SRC_V2:
7022 case AMDGPU::SI_INDIRECT_SRC_V3:
7023 case AMDGPU::SI_INDIRECT_SRC_V4:
7024 case AMDGPU::SI_INDIRECT_SRC_V5:
7025 case AMDGPU::SI_INDIRECT_SRC_V6:
7026 case AMDGPU::SI_INDIRECT_SRC_V7:
7027 case AMDGPU::SI_INDIRECT_SRC_V8:
7028 case AMDGPU::SI_INDIRECT_SRC_V9:
7029 case AMDGPU::SI_INDIRECT_SRC_V10:
7030 case AMDGPU::SI_INDIRECT_SRC_V11:
7031 case AMDGPU::SI_INDIRECT_SRC_V12:
7032 case AMDGPU::SI_INDIRECT_SRC_V16:
7033 case AMDGPU::SI_INDIRECT_SRC_V32:
7034 return emitIndirectSrc(MI, *BB, *getSubtarget());
7035 case AMDGPU::SI_INDIRECT_DST_V1:
7036 case AMDGPU::SI_INDIRECT_DST_V2:
7037 case AMDGPU::SI_INDIRECT_DST_V3:
7038 case AMDGPU::SI_INDIRECT_DST_V4:
7039 case AMDGPU::SI_INDIRECT_DST_V5:
7040 case AMDGPU::SI_INDIRECT_DST_V6:
7041 case AMDGPU::SI_INDIRECT_DST_V7:
7042 case AMDGPU::SI_INDIRECT_DST_V8:
7043 case AMDGPU::SI_INDIRECT_DST_V9:
7044 case AMDGPU::SI_INDIRECT_DST_V10:
7045 case AMDGPU::SI_INDIRECT_DST_V11:
7046 case AMDGPU::SI_INDIRECT_DST_V12:
7047 case AMDGPU::SI_INDIRECT_DST_V16:
7048 case AMDGPU::SI_INDIRECT_DST_V32:
7049 return emitIndirectDst(MI, *BB, *getSubtarget());
7050 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7051 case AMDGPU::SI_KILL_I1_PSEUDO:
7052 return splitKillBlock(MI, BB);
7053 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
7055 return BB;
7056 }
7057 case AMDGPU::SI_BR_UNDEF: {
7058 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
7059 .add(MI.getOperand(0));
7060 Br->getOperand(1).setIsUndef(); // read undef SCC
7061 MI.eraseFromParent();
7062 return BB;
7063 }
7064 case AMDGPU::ADJCALLSTACKUP:
7065 case AMDGPU::ADJCALLSTACKDOWN: {
7066 const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
7067 MachineInstrBuilder MIB(*MF, &MI);
7068 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
7069 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
7070 return BB;
7071 }
7072 case AMDGPU::SI_CALL_ISEL: {
7073 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
7074
7075 MachineInstrBuilder MIB;
7076 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
7077
7078 for (const MachineOperand &MO : MI.operands())
7079 MIB.add(MO);
7080
7081 MIB.cloneMemRefs(MI);
7082 MI.eraseFromParent();
7083 return BB;
7084 }
7085 case AMDGPU::V_ADD_CO_U32_e32:
7086 case AMDGPU::V_SUB_CO_U32_e32:
7087 case AMDGPU::V_SUBREV_CO_U32_e32: {
7088 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
7089 unsigned Opc = MI.getOpcode();
7090
7091 bool NeedClampOperand = false;
7092 if (TII->pseudoToMCOpcode(Opc) == -1) {
7093 Opc = AMDGPU::getVOPe64(Opc);
7094 NeedClampOperand = true;
7095 }
7096
7097 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
7098 if (TII->isVOP3(*I)) {
7099 I.addReg(TRI->getVCC(), RegState::Define);
7100 }
7101 I.add(MI.getOperand(1)).add(MI.getOperand(2));
7102 if (NeedClampOperand)
7103 I.addImm(0); // clamp bit for e64 encoding
7104
7105 TII->legalizeOperands(*I);
7106
7107 MI.eraseFromParent();
7108 return BB;
7109 }
7110 case AMDGPU::V_ADDC_U32_e32:
7111 case AMDGPU::V_SUBB_U32_e32:
7112 case AMDGPU::V_SUBBREV_U32_e32:
7113 // These instructions have an implicit use of vcc which counts towards the
7114 // constant bus limit.
7115 TII->legalizeOperands(MI);
7116 return BB;
7117 case AMDGPU::DS_GWS_INIT:
7118 case AMDGPU::DS_GWS_SEMA_BR:
7119 case AMDGPU::DS_GWS_BARRIER:
7120 case AMDGPU::DS_GWS_SEMA_V:
7121 case AMDGPU::DS_GWS_SEMA_P:
7122 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
7123 // An s_waitcnt 0 is required to be the instruction immediately following.
7124 if (getSubtarget()->hasGWSAutoReplay()) {
7126 return BB;
7127 }
7128
7129 return emitGWSMemViolTestLoop(MI, BB);
7130 case AMDGPU::S_SETREG_B32: {
7131 // Try to optimize cases that only set the denormal mode or rounding mode.
7132 //
7133 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
7134 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
7135 // instead.
7136 //
7137 // FIXME: This could be done with predicates on the immediate, but tablegen
7138 // doesn't allow a no-side-effect instruction in the output of a
7139 // side-effecting pattern.
7140 auto [ID, Offset, Width] =
7141 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
7142 if (ID != AMDGPU::Hwreg::ID_MODE)
7143 return BB;
7144
7145 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
7146 const unsigned SetMask = WidthMask << Offset;
7147
7148 if (getSubtarget()->hasDenormModeInst()) {
7149 unsigned SetDenormOp = 0;
7150 unsigned SetRoundOp = 0;
7151
7152 // The dedicated instructions can only set the whole denorm or round mode
7153 // at once, not a subset of bits in either.
7154 if (SetMask ==
7155 (AMDGPU::Hwreg::FP_ROUND_MASK | AMDGPU::Hwreg::FP_DENORM_MASK)) {
7156 // If this fully sets both the round and denorm mode, emit the two
7157 // dedicated instructions for these.
7158 SetRoundOp = AMDGPU::S_ROUND_MODE;
7159 SetDenormOp = AMDGPU::S_DENORM_MODE;
7160 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
7161 SetRoundOp = AMDGPU::S_ROUND_MODE;
7162 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
7163 SetDenormOp = AMDGPU::S_DENORM_MODE;
7164 }
7165
7166 if (SetRoundOp || SetDenormOp) {
7167 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
7168 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
7169 unsigned ImmVal = Def->getOperand(1).getImm();
7170 if (SetRoundOp) {
7171 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
7172 .addImm(ImmVal & 0xf);
7173
7174 // If we also have the denorm mode, get just the denorm mode bits.
7175 ImmVal >>= 4;
7176 }
7177
7178 if (SetDenormOp) {
7179 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
7180 .addImm(ImmVal & 0xf);
7181 }
7182
7183 MI.eraseFromParent();
7184 return BB;
7185 }
7186 }
7187 }
7188
7189 // If only FP bits are touched, use the no-side-effects pseudo.
7190 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
7191 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
7192 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
7193
7194 return BB;
7195 }
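// Example of the rewrite above (assuming the mode operand is a materialized
// constant): an s_setreg_b32 that fully writes the 4-bit FP round field can
// be emitted as S_ROUND_MODE <imm & 0xf>, and one that writes both the round
// and denorm fields becomes S_ROUND_MODE followed by S_DENORM_MODE, avoiding
// a read-modify-write of the MODE register.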
7196 case AMDGPU::S_INVERSE_BALLOT_U32:
7197 case AMDGPU::S_INVERSE_BALLOT_U64:
7198 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
7199 // necessary. After that they are equivalent to a COPY.
7200 MI.setDesc(TII->get(AMDGPU::COPY));
7201 return BB;
7202 case AMDGPU::ENDPGM_TRAP: {
7203 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
7204 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
7205 MI.addOperand(MachineOperand::CreateImm(0));
7206 return BB;
7207 }
7208
7209 // We need a block split to make the real endpgm a terminator. We also don't
7210 // want to break phis in successor blocks, so we can't just delete to the
7211 // end of the block.
7212
7213 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
7214 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
7215 MF->push_back(TrapBB);
7216 // clang-format off
7217 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
7218 .addImm(0);
7219 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
7220 .addMBB(TrapBB);
7221 // clang-format on
7222
7223 BB->addSuccessor(TrapBB);
7224 MI.eraseFromParent();
7225 return SplitBB;
7226 }
7227 case AMDGPU::SIMULATED_TRAP: {
7228 assert(Subtarget->hasPrivEnabledTrap2NopBug());
7229 MachineBasicBlock *SplitBB =
7230 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
7231 MI.eraseFromParent();
7232 return SplitBB;
7233 }
7234 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
7235 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
7237
7238 // During ISel, it's difficult to propagate the original EXEC mask to use as
7239 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
7240 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
7241 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
7242 Register OriginalExec = Setup->getOperand(0).getReg();
7243 MF->getRegInfo().clearKillFlags(OriginalExec);
7244 MI.getOperand(0).setReg(OriginalExec);
7245 return BB;
7246 }
7247 default:
7248 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
7249 if (!MI.mayStore())
7251 return BB;
7252 }
7253 return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
7254 }
7255}
7256
7258 // This currently forces unfolding various combinations of fsub into fma with
7259 // free fneg'd operands. As long as we have fast FMA (controlled by
7260 // isFMAFasterThanFMulAndFAdd), we should perform these.
7261
7262 // When fma is quarter rate, for f64 where add / sub are at best half rate,
7263 // most of these combines appear to be cycle neutral but save on instruction
7264 // count / code size.
7265 return true;
7266}
7267
7269
7270EVT SITargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
7271 EVT VT) const {
7272 if (!VT.isVector()) {
7273 return MVT::i1;
7274 }
7275 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
7276}
7277
7279 // TODO: Should i16 be used always if legal? For now it would force VALU
7280 // shifts.
7281 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
7282}
7283
7285 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
7286 ? Ty.changeElementSize(16)
7287 : Ty.changeElementSize(32);
7288}
7289
7290// Answering this is somewhat tricky and depends on the specific device, since
7291// different devices have different rates for fma and for f64 operations.
7292//
7293// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
7294// regardless of which device (although the number of cycles differs between
7295// devices), so it is always profitable for f64.
7296//
7297// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
7298// only on full rate devices. Normally, we should prefer selecting v_mad_f32
7299// which we can always do even without fused FP ops since it returns the same
7300// result as the separate operations and since it is always full
7301// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
7302// however does not support denormals, so we do report fma as faster if we have
7303// a fast fma device and require denormals.
7304//
7305bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
7306 EVT VT) const {
7307 VT = VT.getScalarType();
7308
7309 switch (VT.getSimpleVT().SimpleTy) {
7310 case MVT::f32: {
7311 // If mad is not available this depends only on if f32 fma is full rate.
7312 if (!Subtarget->hasMadMacF32Insts())
7313 return Subtarget->hasFastFMAF32();
7314
7315 // Otherwise f32 mad is always full rate and returns the same result as
7316 // the separate operations, so it should be preferred over fma.
7317 // However, it does not support denormals.
7318 if (!denormalModeIsFlushAllF32(MF))
7319 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
7320
7321 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
7322 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
7323 }
7324 case MVT::f64:
7325 return true;
7326 case MVT::f16:
7327 case MVT::bf16:
7328 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
7329 default:
7330 break;
7331 }
7332
7333 return false;
7334}
7335
7336bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
7337 LLT Ty) const {
7338 switch (Ty.getScalarSizeInBits()) {
7339 case 16:
7340 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
7341 case 32:
7342 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
7343 case 64:
7344 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
7345 default:
7346 break;
7347 }
7348
7349 return false;
7350}
7351
7352bool SITargetLowering::isFMADLegal(const MachineInstr &MI, LLT Ty) const {
7353 if (!Ty.isScalar())
7354 return false;
7355
7356 if (Ty.getScalarSizeInBits() == 16)
7357 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
7358 if (Ty.getScalarSizeInBits() == 32)
7359 return Subtarget->hasMadMacF32Insts() &&
7360 denormalModeIsFlushAllF32(*MI.getMF());
7361
7362 return false;
7363}
7364
7365bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
7366 const SDNode *N) const {
7367 // TODO: Check future ftz flag
7368 // v_mad_f32/v_mac_f32 do not support denormals.
7369 EVT VT = N->getValueType(0);
7370 if (VT == MVT::f32)
7371 return Subtarget->hasMadMacF32Insts() &&
7373 if (VT == MVT::f16) {
7374 return Subtarget->hasMadF16() &&
7376 }
7377
7378 return false;
7379}
7380
7381//===----------------------------------------------------------------------===//
7382// Custom DAG Lowering Operations
7383//===----------------------------------------------------------------------===//
7384
7385// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7386// wider vector type is legal.
7387SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
7388 SelectionDAG &DAG) const {
7389 unsigned Opc = Op.getOpcode();
7390 EVT VT = Op.getValueType();
7391 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7392 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7393 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7394 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7395 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7396 VT == MVT::v32bf16);
7397
7398 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7399
7400 SDLoc SL(Op);
7401 SDValue OpLo = DAG.getNode(Opc, SL, Lo.getValueType(), Lo, Op->getFlags());
7402 SDValue OpHi = DAG.getNode(Opc, SL, Hi.getValueType(), Hi, Op->getFlags());
7403
7404 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7405}
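// Illustrative example: (fneg v4f16 %x) is handled here roughly as
//   %lo = fneg (extract_subvector %x, 0)    ; v2f16
//   %hi = fneg (extract_subvector %x, 2)    ; v2f16
//   concat_vectors %lo, %hi                 ; v4f16
// instead of being scalarized into four f16 operations.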
7406
7407// Enable lowering of ROTR for vxi32 types. This is a workaround for a
7408// regression whereby extra unnecessary instructions were added to codegen
7409// for rotr operations, caused by legalizing v2i32 or. This resulted in extra
7410// instructions to extract the result from the vector.
7411SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
7412 [[maybe_unused]] EVT VT = Op.getValueType();
7413
7414 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
7415 VT == MVT::v16i32) &&
7416 "Unexpected ValueType.");
7417
7418 return DAG.UnrollVectorOp(Op.getNode());
7419}
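// Illustrative example: (rotr v2i32 %x, %y) is unrolled here into two scalar
// rotates that are rebuilt into a vector, roughly
//   %e0 = rotr (extractelt %x, 0), (extractelt %y, 0)
//   %e1 = rotr (extractelt %x, 1), (extractelt %y, 1)
//   build_vector %e0, %e1
// and each scalar rotr typically selects to a single v_alignbit_b32.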
7420
7421// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7422// wider vector type is legal.
7423SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
7424 SelectionDAG &DAG) const {
7425 unsigned Opc = Op.getOpcode();
7426 EVT VT = Op.getValueType();
7427 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16 ||
7428 VT == MVT::v4f32 || VT == MVT::v8i16 || VT == MVT::v8f16 ||
7429 VT == MVT::v8bf16 || VT == MVT::v16i16 || VT == MVT::v16f16 ||
7430 VT == MVT::v16bf16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7431 VT == MVT::v32f32 || VT == MVT::v32i16 || VT == MVT::v32f16 ||
7432 VT == MVT::v32bf16);
7433
7434 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
7435 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7436
7437 SDLoc SL(Op);
7438
7439 SDValue OpLo =
7440 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
7441 SDValue OpHi =
7442 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
7443
7444 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7445}
7446
7447SDValue SITargetLowering::splitTernaryVectorOp(SDValue Op,
7448 SelectionDAG &DAG) const {
7449 unsigned Opc = Op.getOpcode();
7450 EVT VT = Op.getValueType();
7451 assert(VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v8i16 ||
7452 VT == MVT::v8f16 || VT == MVT::v4f32 || VT == MVT::v16i16 ||
7453 VT == MVT::v16f16 || VT == MVT::v8f32 || VT == MVT::v16f32 ||
7454 VT == MVT::v32f32 || VT == MVT::v32f16 || VT == MVT::v32i16 ||
7455 VT == MVT::v4bf16 || VT == MVT::v8bf16 || VT == MVT::v16bf16 ||
7456 VT == MVT::v32bf16);
7457
7458 SDValue Op0 = Op.getOperand(0);
7459 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7460 ? DAG.SplitVectorOperand(Op.getNode(), 0)
7461 : std::pair(Op0, Op0);
7462
7463 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7464 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
7465
7466 SDLoc SL(Op);
7467 auto ResVT = DAG.GetSplitDestVTs(VT);
7468
7469 SDValue OpLo =
7470 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
7471 SDValue OpHi =
7472 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
7473
7474 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7475}
7476
7477SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
7478 switch (Op.getOpcode()) {
7479 default:
7480 return AMDGPUTargetLowering::LowerOperation(Op, DAG);
7481 case ISD::BRCOND:
7482 return LowerBRCOND(Op, DAG);
7483 case ISD::RETURNADDR:
7484 return LowerRETURNADDR(Op, DAG);
7485 case ISD::SPONENTRY:
7486 return LowerSPONENTRY(Op, DAG);
7487 case ISD::LOAD: {
7488 SDValue Result = LowerLOAD(Op, DAG);
7489 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7490 "Load should return a value and a chain");
7491 return Result;
7492 }
7493 case ISD::FSQRT: {
7494 EVT VT = Op.getValueType();
7495 if (VT == MVT::f32)
7496 return lowerFSQRTF32(Op, DAG);
7497 if (VT == MVT::f64)
7498 return lowerFSQRTF64(Op, DAG);
7499 return SDValue();
7500 }
7501 case ISD::FSIN:
7502 case ISD::FCOS:
7503 return LowerTrig(Op, DAG);
7504 case ISD::SELECT:
7505 return LowerSELECT(Op, DAG);
7506 case ISD::FDIV:
7507 return LowerFDIV(Op, DAG);
7508 case ISD::FFREXP:
7509 return LowerFFREXP(Op, DAG);
7510 case ISD::ATOMIC_CMP_SWAP:
7511 return LowerATOMIC_CMP_SWAP(Op, DAG);
7512 case ISD::STORE:
7513 return LowerSTORE(Op, DAG);
7514 case ISD::GlobalAddress: {
7517 return LowerGlobalAddress(MFI, Op, DAG);
7518 }
7519 case ISD::ExternalSymbol:
7520 return LowerExternalSymbol(Op, DAG);
7521 case ISD::INTRINSIC_WO_CHAIN:
7522 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7523 case ISD::INTRINSIC_W_CHAIN:
7524 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7525 case ISD::INTRINSIC_VOID:
7526 return LowerINTRINSIC_VOID(Op, DAG);
7527 case ISD::ADDRSPACECAST:
7528 return lowerADDRSPACECAST(Op, DAG);
7529 case ISD::INSERT_SUBVECTOR:
7530 return lowerINSERT_SUBVECTOR(Op, DAG);
7531 case ISD::INSERT_VECTOR_ELT:
7532 return lowerINSERT_VECTOR_ELT(Op, DAG);
7533 case ISD::EXTRACT_VECTOR_ELT:
7534 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7535 case ISD::VECTOR_SHUFFLE:
7536 return lowerVECTOR_SHUFFLE(Op, DAG);
7537 case ISD::SCALAR_TO_VECTOR:
7538 return lowerSCALAR_TO_VECTOR(Op, DAG);
7539 case ISD::BUILD_VECTOR:
7540 return lowerBUILD_VECTOR(Op, DAG);
7541 case ISD::FP_ROUND:
7542 case ISD::STRICT_FP_ROUND:
7543 return lowerFP_ROUND(Op, DAG);
7544 case ISD::TRAP:
7545 return lowerTRAP(Op, DAG);
7546 case ISD::DEBUGTRAP:
7547 return lowerDEBUGTRAP(Op, DAG);
7548 case ISD::ABS:
7549 case ISD::FABS:
7550 case ISD::FNEG:
7551 case ISD::FCANONICALIZE:
7552 case ISD::BSWAP:
7553 return splitUnaryVectorOp(Op, DAG);
7554 case ISD::FMINNUM:
7555 case ISD::FMAXNUM:
7556 return lowerFMINNUM_FMAXNUM(Op, DAG);
7557 case ISD::FMINIMUMNUM:
7558 case ISD::FMAXIMUMNUM:
7559 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7560 case ISD::FMINIMUM:
7561 case ISD::FMAXIMUM:
7562 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7563 case ISD::FLDEXP:
7564 case ISD::STRICT_FLDEXP:
7565 return lowerFLDEXP(Op, DAG);
7566 case ISD::FMA:
7567 return splitTernaryVectorOp(Op, DAG);
7568 case ISD::FP_TO_SINT:
7569 case ISD::FP_TO_UINT:
7570 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
7571 Op.getValueType() == MVT::i16 &&
7572 Op.getOperand(0).getValueType() == MVT::f32) {
7573 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7574 return Op;
7575 }
7576 return LowerFP_TO_INT(Op, DAG);
7577 case ISD::SHL:
7578 case ISD::SRA:
7579 case ISD::SRL:
7580 case ISD::ADD:
7581 case ISD::SUB:
7582 case ISD::SMIN:
7583 case ISD::SMAX:
7584 case ISD::UMIN:
7585 case ISD::UMAX:
7586 case ISD::FADD:
7587 case ISD::FMUL:
7588 case ISD::FMINNUM_IEEE:
7589 case ISD::FMAXNUM_IEEE:
7590 case ISD::UADDSAT:
7591 case ISD::USUBSAT:
7592 case ISD::SADDSAT:
7593 case ISD::SSUBSAT:
7594 return splitBinaryVectorOp(Op, DAG);
7595 case ISD::FCOPYSIGN:
7596 return lowerFCOPYSIGN(Op, DAG);
7597 case ISD::MUL:
7598 return lowerMUL(Op, DAG);
7599 case ISD::SMULO:
7600 case ISD::UMULO:
7601 return lowerXMULO(Op, DAG);
7602 case ISD::SMUL_LOHI:
7603 case ISD::UMUL_LOHI:
7604 return lowerXMUL_LOHI(Op, DAG);
7605 case ISD::DYNAMIC_STACKALLOC:
7606 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7607 case ISD::STACKSAVE:
7608 return LowerSTACKSAVE(Op, DAG);
7609 case ISD::GET_ROUNDING:
7610 return lowerGET_ROUNDING(Op, DAG);
7611 case ISD::SET_ROUNDING:
7612 return lowerSET_ROUNDING(Op, DAG);
7613 case ISD::PREFETCH:
7614 return lowerPREFETCH(Op, DAG);
7615 case ISD::FP_EXTEND:
7616 case ISD::STRICT_FP_EXTEND:
7617 return lowerFP_EXTEND(Op, DAG);
7618 case ISD::GET_FPENV:
7619 return lowerGET_FPENV(Op, DAG);
7620 case ISD::SET_FPENV:
7621 return lowerSET_FPENV(Op, DAG);
7622 case ISD::ROTR:
7623 return lowerROTR(Op, DAG);
7624 case ISD::INLINEASM:
7625 return LowerINLINEASM(Op, DAG);
7626 }
7627 return SDValue();
7628}
7629
7630// Used for D16: Casts the result of an instruction into the right vector,
7631// packs values if loads return unpacked values.
7632static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT,
7633 const SDLoc &DL, SelectionDAG &DAG,
7634 bool Unpacked) {
7635 if (!LoadVT.isVector())
7636 return Result;
7637
7638 // Cast back to the original packed type or to a larger type that is a
7639 // multiple of 32 bits for D16. Widening the return type is required for
7640 // legalization.
7641 EVT FittingLoadVT = LoadVT;
7642 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7643 FittingLoadVT =
7644 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7645 LoadVT.getVectorNumElements() + 1);
7646 }
7647
7648 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7649 // Truncate to v2i16/v4i16.
7650 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7651
7652 // Work around the legalizer not scalarizing truncate after vector op
7653 // legalization by not creating an intermediate vector trunc.
7654 SmallVector<SDValue, 4> Elts;
7655 DAG.ExtractVectorElements(Result, Elts);
7656 for (SDValue &Elt : Elts)
7657 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7658
7659 // Pad illegal v1i16/v3f16 to v4i16
7660 if ((LoadVT.getVectorNumElements() % 2) == 1)
7661 Elts.push_back(DAG.getPOISON(MVT::i16));
7662
7663 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
7664
7665 // Bitcast to original type (v2f16/v4f16).
7666 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7667 }
7668
7669 // Cast back to the original packed type.
7670 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7671}
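// Illustrative example (unpacked D16 memory): a v3f16 load produces a v3i32
// result; the code above truncates each element to i16, pads the odd element
// count with a poison element, builds a v4i16, and bitcasts it to the widened
// v4f16 type that legalization expects.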
7672
7673SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7674 SelectionDAG &DAG,
7675 ArrayRef<SDValue> Ops,
7676 bool IsIntrinsic) const {
7677 SDLoc DL(M);
7678
7679 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7680 EVT LoadVT = M->getValueType(0);
7681
7682 EVT EquivLoadVT = LoadVT;
7683 if (LoadVT.isVector()) {
7684 if (Unpacked) {
7685 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7686 LoadVT.getVectorNumElements());
7687 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7688 // Widen v3f16 to legal type
7689 EquivLoadVT =
7690 EVT::getVectorVT(*DAG.getContext(), LoadVT.getVectorElementType(),
7691 LoadVT.getVectorNumElements() + 1);
7692 }
7693 }
7694
7695 // Change from v4f16/v2f16 to EquivLoadVT.
7696 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7697
7698 SDValue Load = DAG.getMemIntrinsicNode(
7699 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7700 M->getMemoryVT(), M->getMemOperand());
7701
7702 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7703
7704 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7705}
7706
7707SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7708 SelectionDAG &DAG,
7709 ArrayRef<SDValue> Ops) const {
7710 SDLoc DL(M);
7711 EVT LoadVT = M->getValueType(0);
7712 EVT EltType = LoadVT.getScalarType();
7713 EVT IntVT = LoadVT.changeTypeToInteger();
7714
7715 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7716
7717 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7718 bool IsTFE = M->getNumValues() == 3;
7719
7720 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7721 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7722 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7723 : AMDGPUISD::BUFFER_LOAD;
7724
7725 if (IsD16) {
7726 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7727 }
7728
7729 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7730 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7731 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7732 IsTFE);
7733
7734 if (isTypeLegal(LoadVT)) {
7735 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7736 M->getMemOperand(), DAG);
7737 }
7738
7739 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7740 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7741 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7742 M->getMemOperand(), DAG);
7743 return DAG.getMergeValues(
7744 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7745 DL);
7746}
7747
7749 SelectionDAG &DAG) {
7750 EVT VT = N->getValueType(0);
7751 unsigned CondCode = N->getConstantOperandVal(3);
7752 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7753 return DAG.getPOISON(VT);
7754
7755 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7756
7757 SDValue LHS = N->getOperand(1);
7758 SDValue RHS = N->getOperand(2);
7759
7760 SDLoc DL(N);
7761
7762 EVT CmpVT = LHS.getValueType();
7763 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7765 unsigned PromoteOp = ICmpInst::isSigned(IcInput) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
7766 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7767 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7768 }
7769
7770 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7771
7772 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7773 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7774
7775 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7776 DAG.getCondCode(CCOpcode));
7777 if (VT.bitsEq(CCVT))
7778 return SetCC;
7779 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7780}
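// Example of the lowering above (wave64): a call such as
//   %m = call i64 @llvm.amdgcn.icmp.i64.i32(i32 %a, i32 %b, i32 32)
// (predicate 32 is ICmpInst::ICMP_EQ) becomes
//   (AMDGPUISD::SETCC %a, %b, seteq)
// yielding an i64 lane mask; i16 operands are first sign- or zero-extended to
// i32, and the mask is zero-extended or truncated if the result type differs
// from the wavefront size.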
7781
7783 SelectionDAG &DAG) {
7784 EVT VT = N->getValueType(0);
7785
7786 unsigned CondCode = N->getConstantOperandVal(3);
7787 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7788 return DAG.getPOISON(VT);
7789
7790 SDValue Src0 = N->getOperand(1);
7791 SDValue Src1 = N->getOperand(2);
7792 EVT CmpVT = Src0.getValueType();
7793 SDLoc SL(N);
7794
7795 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7796 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7797 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7798 }
7799
7800 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7801 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
7802 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7803 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7804 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7805 DAG.getCondCode(CCOpcode));
7806 if (VT.bitsEq(CCVT))
7807 return SetCC;
7808 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7809}
7810
7812 SelectionDAG &DAG) {
7813 EVT VT = N->getValueType(0);
7814 SDValue Src = N->getOperand(1);
7815 SDLoc SL(N);
7816
7817 if (Src.getOpcode() == ISD::SETCC) {
7818 SDValue Op0 = Src.getOperand(0);
7819 SDValue Op1 = Src.getOperand(1);
7820 // Need to expand bfloat to float for comparison (setcc).
7821 if (Op0.getValueType() == MVT::bf16) {
7822 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7823 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7824 }
7825 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7826 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7827 }
7828 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7829 // (ballot 0) -> 0
7830 if (Arg->isZero())
7831 return DAG.getConstant(0, SL, VT);
7832
7833 // (ballot 1) -> EXEC/EXEC_LO
7834 if (Arg->isOne()) {
7835 Register Exec;
7836 if (VT.getScalarSizeInBits() == 32)
7837 Exec = AMDGPU::EXEC_LO;
7838 else if (VT.getScalarSizeInBits() == 64)
7839 Exec = AMDGPU::EXEC;
7840 else
7841 return SDValue();
7842
7843 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7844 }
7845 }
7846
7847 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7848 // ISD::SETNE)
7849 return DAG.getNode(
7850 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7851 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7852}
7853
7855 SelectionDAG &DAG) {
7856 EVT VT = N->getValueType(0);
7857 unsigned ValSize = VT.getSizeInBits();
7858 unsigned IID = N->getConstantOperandVal(0);
7859 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7860 IID == Intrinsic::amdgcn_permlanex16;
7861 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7862 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7863 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
7864 IID == Intrinsic::amdgcn_permlane_up ||
7865 IID == Intrinsic::amdgcn_permlane_down ||
7866 IID == Intrinsic::amdgcn_permlane_xor;
7867 SDLoc SL(N);
7868 MVT IntVT = MVT::getIntegerVT(ValSize);
7869 const GCNSubtarget *ST = TLI.getSubtarget();
7870 unsigned SplitSize = 32;
7871 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7872 ST->hasDPALU_DPP() &&
7873 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7874 SplitSize = 64;
7875
7876 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7877 SDValue Src2, MVT ValT) -> SDValue {
7878 SmallVector<SDValue, 8> Operands;
7879 switch (IID) {
7880 case Intrinsic::amdgcn_permlane16:
7881 case Intrinsic::amdgcn_permlanex16:
7882 case Intrinsic::amdgcn_update_dpp:
7883 Operands.push_back(N->getOperand(6));
7884 Operands.push_back(N->getOperand(5));
7885 Operands.push_back(N->getOperand(4));
7886 [[fallthrough]];
7887 case Intrinsic::amdgcn_writelane:
7888 case Intrinsic::amdgcn_permlane_bcast:
7889 case Intrinsic::amdgcn_permlane_up:
7890 case Intrinsic::amdgcn_permlane_down:
7891 case Intrinsic::amdgcn_permlane_xor:
7892 Operands.push_back(Src2);
7893 [[fallthrough]];
7894 case Intrinsic::amdgcn_readlane:
7895 case Intrinsic::amdgcn_set_inactive:
7896 case Intrinsic::amdgcn_set_inactive_chain_arg:
7897 case Intrinsic::amdgcn_mov_dpp8:
7898 Operands.push_back(Src1);
7899 [[fallthrough]];
7900 case Intrinsic::amdgcn_readfirstlane:
7901 case Intrinsic::amdgcn_permlane64:
7902 Operands.push_back(Src0);
7903 break;
7904 default:
7905 llvm_unreachable("unhandled lane op");
7906 }
7907
7908 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7909 std::reverse(Operands.begin(), Operands.end());
7910
7911 if (SDNode *GL = N->getGluedNode()) {
7912 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7913 GL = GL->getOperand(0).getNode();
7914 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7915 SDValue(GL, 0)));
7916 }
7917
7918 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
7919 };
7920
7921 SDValue Src0 = N->getOperand(1);
7922 SDValue Src1, Src2;
7923 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
7924 IID == Intrinsic::amdgcn_mov_dpp8 ||
7925 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
7926 IsPermlaneShuffle) {
7927 Src1 = N->getOperand(2);
7928 if (IID == Intrinsic::amdgcn_writelane ||
7929 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16 ||
7930 IsPermlaneShuffle)
7931 Src2 = N->getOperand(3);
7932 }
7933
7934 if (ValSize == SplitSize) {
7935 // Already legal
7936 return SDValue();
7937 }
7938
7939 if (ValSize < 32) {
7940 bool IsFloat = VT.isFloatingPoint();
7941 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
7942 SL, MVT::i32);
7943
7944 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
7945 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
7946 SL, MVT::i32);
7947 }
7948
7949 if (IID == Intrinsic::amdgcn_writelane) {
7950 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
7951 SL, MVT::i32);
7952 }
7953
7954 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
7955 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
7956 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
7957 }
7958
7959 if (ValSize % SplitSize != 0)
7960 return SDValue();
7961
7962 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
7963 EVT VT = N->getValueType(0);
7964 unsigned NE = VT.getVectorNumElements();
7965 EVT EltVT = VT.getVectorElementType();
7966 SmallVector<SDValue, 8> Scalars;
7967 unsigned NumOperands = N->getNumOperands();
7968 SmallVector<SDValue, 4> Operands(NumOperands);
7969 SDNode *GL = N->getGluedNode();
7970
7971 // only handle convergencectrl_glue
7973
7974 for (unsigned i = 0; i != NE; ++i) {
7975 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
7976 ++j) {
7977 SDValue Operand = N->getOperand(j);
7978 EVT OperandVT = Operand.getValueType();
7979 if (OperandVT.isVector()) {
7980 // A vector operand; extract a single element.
7981 EVT OperandEltVT = OperandVT.getVectorElementType();
7982 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
7983 Operand, DAG.getVectorIdxConstant(i, SL));
7984 } else {
7985 // A scalar operand; just use it as is.
7986 Operands[j] = Operand;
7987 }
7988 }
7989
7990 if (GL)
7991 Operands[NumOperands - 1] =
7992 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
7993 SDValue(GL->getOperand(0).getNode(), 0));
7994
7995 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
7996 }
7997
7998 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
7999 return DAG.getBuildVector(VecVT, SL, Scalars);
8000 };
8001
8002 if (VT.isVector()) {
8003 switch (MVT::SimpleValueType EltTy =
8004 VT.getVectorElementType().getSimpleVT().SimpleTy) {
8005 case MVT::i32:
8006 case MVT::f32:
8007 if (SplitSize == 32) {
8008 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
8009 return unrollLaneOp(LaneOp.getNode());
8010 }
8011 [[fallthrough]];
8012 case MVT::i16:
8013 case MVT::f16:
8014 case MVT::bf16: {
8015 unsigned SubVecNumElt =
8016 SplitSize / VT.getVectorElementType().getSizeInBits();
8017 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
8018 SmallVector<SDValue, 4> Pieces;
8019 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
8020 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
8021 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
8022 DAG.getConstant(EltIdx, SL, MVT::i32));
8023
8024 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
8025 IsPermLane16) {
8026 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
8027 DAG.getConstant(EltIdx, SL, MVT::i32));
8028
8029 Pieces.push_back(
8030 createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT));
8031 } else if (IID == Intrinsic::amdgcn_writelane) {
8032 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
8033 DAG.getConstant(EltIdx, SL, MVT::i32));
8034 Pieces.push_back(
8035 createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
8036 } else {
8037 Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2, SubVecVT));
8038 }
8039
8040 EltIdx += SubVecNumElt;
8041 }
8042 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
8043 }
8044 default:
8045 // Handle all other cases by bitcasting to i32 vectors
8046 break;
8047 }
8048 }
8049
8050 MVT VecVT =
8051 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
8052 Src0 = DAG.getBitcast(VecVT, Src0);
8053
8054 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
8055 Src1 = DAG.getBitcast(VecVT, Src1);
8056
8057 if (IID == Intrinsic::amdgcn_writelane)
8058 Src2 = DAG.getBitcast(VecVT, Src2);
8059
8060 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
8061 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
8062 return DAG.getBitcast(VT, UnrolledLaneOp);
8063}
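// Illustrative example: a 64-bit readlane is split here into two 32-bit
// pieces, roughly
//   %v  = bitcast i64 %val to v2i32
//   %e0 = llvm.amdgcn.readlane(i32 (extractelt %v, 0), i32 %lane)
//   %e1 = llvm.amdgcn.readlane(i32 (extractelt %v, 1), i32 %lane)
//   bitcast (build_vector %e0, %e1) to i64
// while sub-32-bit values are any-extended to i32, processed, and truncated
// back to the original type.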
8064
8066 SelectionDAG &DAG) {
8067 EVT VT = N->getValueType(0);
8068
8069 if (VT.getSizeInBits() != 32)
8070 return SDValue();
8071
8072 SDLoc SL(N);
8073
8074 SDValue Value = N->getOperand(1);
8075 SDValue Index = N->getOperand(2);
8076
8077 // ds_bpermute requires index to be multiplied by 4
8078 SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
8079 SDValue ShiftedIndex =
8080 DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);
8081
8082 // Intrinsics will require i32 to operate on
8083 SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
8084
8085 auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
8086 SmallVector<SDValue> IntrinArgs) -> SDValue {
8087 SmallVector<SDValue> Operands(1);
8088 Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
8089 Operands.append(IntrinArgs);
8090 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
8091 };
8092
8093 // If we can bpermute across the whole wave, then just do that
8094 if (TLI.getSubtarget()->isWave32()) {
8095 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8096 {ShiftedIndex, ValueI32});
8097 return DAG.getBitcast(VT, BPermute);
8098 }
8099
8100 assert(TLI.getSubtarget()->isWave64());
8101
8102 // Otherwise, we need to make use of whole wave mode
8103 SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
8104
8105 // Set inactive lanes to poison
8106 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8107 {ValueI32, PoisonVal});
8108 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8109 {ShiftedIndex, PoisonVal});
8110
8111 SDValue Swapped =
8112 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
8113
8114 // Get permutation of each half, then we'll select which one to use
8115 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8116 {WWMIndex, WWMValue});
8117 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
8118 MVT::i32, {WWMIndex, Swapped});
8119 SDValue BPermOtherHalfWWM =
8120 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
8121
8122 // Select which side to take the permute from
8123 SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
8124 // We can get away with only using mbcnt_lo here since we're only
8125 // trying to detect which side of 32 each lane is on, and mbcnt_lo
8126 // returns 32 for lanes 32-63.
8127 SDValue ThreadID =
8128 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
8129 {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
8130
8131 SDValue SameOrOtherHalf =
8132 DAG.getNode(ISD::AND, SL, MVT::i32,
8133 DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
8134 DAG.getTargetConstant(32, SL, MVT::i32));
8135 SDValue UseSameHalf =
8136 DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
8137 DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
8138 SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
8139 BPermOtherHalfWWM);
8140 return DAG.getBitcast(VT, Result);
8141}
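// Summary of the wave64 emulation above (illustrative): each lane takes a
// ds_bpermute within its own 32-lane half and within the swapped half
// (obtained via permlane64 under whole-wave mode), then selects between the
// two results depending on whether bit 5 of (lane ^ index) is set, i.e.
// whether the requested source lane lives in the other half of the wave.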
8142
8143void SITargetLowering::ReplaceNodeResults(SDNode *N,
8144 SmallVectorImpl<SDValue> &Results,
8145 SelectionDAG &DAG) const {
8146 switch (N->getOpcode()) {
8147 case ISD::INSERT_VECTOR_ELT: {
8148 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
8149 Results.push_back(Res);
8150 return;
8151 }
8152 case ISD::EXTRACT_VECTOR_ELT: {
8153 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
8154 Results.push_back(Res);
8155 return;
8156 }
8157 case ISD::INTRINSIC_WO_CHAIN: {
8158 unsigned IID = N->getConstantOperandVal(0);
8159 switch (IID) {
8160 case Intrinsic::amdgcn_make_buffer_rsrc:
8161 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
8162 return;
8163 case Intrinsic::amdgcn_cvt_pkrtz: {
8164 SDValue Src0 = N->getOperand(1);
8165 SDValue Src1 = N->getOperand(2);
8166 SDLoc SL(N);
8167 SDValue Cvt =
8168 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
8169 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
8170 return;
8171 }
8172 case Intrinsic::amdgcn_cvt_pknorm_i16:
8173 case Intrinsic::amdgcn_cvt_pknorm_u16:
8174 case Intrinsic::amdgcn_cvt_pk_i16:
8175 case Intrinsic::amdgcn_cvt_pk_u16: {
8176 SDValue Src0 = N->getOperand(1);
8177 SDValue Src1 = N->getOperand(2);
8178 SDLoc SL(N);
8179 unsigned Opcode;
8180
8181 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
8182 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8183 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
8184 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8185 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
8186 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8187 else
8188 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8189
8190 EVT VT = N->getValueType(0);
8191 if (isTypeLegal(VT))
8192 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
8193 else {
8194 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
8195 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
8196 }
8197 return;
8198 }
8199 case Intrinsic::amdgcn_s_buffer_load: {
8200 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
8201 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
8202 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
8203 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
8204 // s_buffer_load_i8.
8205 if (!Subtarget->hasScalarSubwordLoads())
8206 return;
8207 SDValue Op = SDValue(N, 0);
8208 SDValue Rsrc = Op.getOperand(1);
8209 SDValue Offset = Op.getOperand(2);
8210 SDValue CachePolicy = Op.getOperand(3);
8211 EVT VT = Op.getValueType();
8212 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
8213 SDLoc DL(Op);
8215 const DataLayout &DataLayout = DAG.getDataLayout();
8216 Align Alignment =
8222 VT.getStoreSize(), Alignment);
8223 SDValue LoadVal;
8224 if (!Offset->isDivergent()) {
8225 SDValue Ops[] = {Rsrc, // source register
8226 Offset, CachePolicy};
8227 SDValue BufferLoad =
8228 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
8229 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8230 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8231 } else {
8232 SDValue Ops[] = {
8233 DAG.getEntryNode(), // Chain
8234 Rsrc, // rsrc
8235 DAG.getConstant(0, DL, MVT::i32), // vindex
8236 {}, // voffset
8237 {}, // soffset
8238 {}, // offset
8239 CachePolicy, // cachepolicy
8240 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8241 };
8242 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8243 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8244 }
8245 Results.push_back(LoadVal);
8246 return;
8247 }
8248 case Intrinsic::amdgcn_dead: {
8249 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
8250 Results.push_back(DAG.getPOISON(N->getValueType(I)));
8251 return;
8252 }
8253 }
8254 break;
8255 }
8256 case ISD::INTRINSIC_W_CHAIN: {
8257 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
8258 if (Res.getOpcode() == ISD::MERGE_VALUES) {
8259 // FIXME: Hacky
8260 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
8261 Results.push_back(Res.getOperand(I));
8262 }
8263 } else {
8264 Results.push_back(Res);
8265 Results.push_back(Res.getValue(1));
8266 }
8267 return;
8268 }
8269
8270 break;
8271 }
8272 case ISD::SELECT: {
8273 SDLoc SL(N);
8274 EVT VT = N->getValueType(0);
8275 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
8276 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
8277 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
8278
8279 EVT SelectVT = NewVT;
8280 if (NewVT.bitsLT(MVT::i32)) {
8281 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
8282 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
8283 SelectVT = MVT::i32;
8284 }
8285
8286 SDValue NewSelect =
8287 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
8288
8289 if (NewVT != SelectVT)
8290 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
8291 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
8292 return;
8293 }
8294 case ISD::FNEG: {
8295 if (N->getValueType(0) != MVT::v2f16)
8296 break;
8297
8298 SDLoc SL(N);
8299 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
8300
8301 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
8302 DAG.getConstant(0x80008000, SL, MVT::i32));
8303 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
8304 return;
8305 }
8306 case ISD::FABS: {
8307 if (N->getValueType(0) != MVT::v2f16)
8308 break;
8309
8310 SDLoc SL(N);
8311 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
8312
8313 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
8314 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
8315 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
8316 return;
8317 }
8318 case ISD::FSQRT: {
8319 if (N->getValueType(0) != MVT::f16)
8320 break;
8321 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
8322 break;
8323 }
8324 default:
8325 AMDGPUTargetLowering::ReplaceNodeResults(N, Results, DAG);
8326 break;
8327 }
8328}
8329
8330/// Helper function for LowerBRCOND
8331static SDNode *findUser(SDValue Value, unsigned Opcode) {
8332
8333 for (SDUse &U : Value->uses()) {
8334 if (U.get() != Value)
8335 continue;
8336
8337 if (U.getUser()->getOpcode() == Opcode)
8338 return U.getUser();
8339 }
8340 return nullptr;
8341}
8342
8343unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
8344 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
8345 switch (Intr->getConstantOperandVal(1)) {
8346 case Intrinsic::amdgcn_if:
8347 return AMDGPUISD::IF;
8348 case Intrinsic::amdgcn_else:
8349 return AMDGPUISD::ELSE;
8350 case Intrinsic::amdgcn_loop:
8351 return AMDGPUISD::LOOP;
8352 case Intrinsic::amdgcn_end_cf:
8353 llvm_unreachable("should not occur");
8354 default:
8355 return 0;
8356 }
8357 }
8358
8359 // break, if_break, else_break are all only used as inputs to loop, not
8360 // directly as branch conditions.
8361 return 0;
8362}
8363
8370
8372 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
8373 return false;
8374
8375 // FIXME: Either avoid relying on address space here or change the default
8376 // address space for functions to avoid the explicit check.
8377 return (GV->getValueType()->isFunctionTy() ||
8380}
8381
8383 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
8384}
8385
8387 if (!GV->hasExternalLinkage())
8388 return true;
8389
8390 // With object linking, external LDS declarations need relocations so the
8391 // linker can assign their offsets.
8393 if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
8394 if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8395 assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
8396 "when object linking is enabled");
8397 return false;
8398 }
8399 }
8400 }
8401
8402 const auto OS = getTargetMachine().getTargetTriple().getOS();
8403 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
8404}
8405
8406 /// This transforms the control flow intrinsics to get the branch destination as
8407 /// the last parameter, and also switches the branch target with BR if the need arises.
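/// For illustration (an assumed example, not taken from a test): a DAG of the form
///   brcond (setcc (int_amdgcn_if %cond), 1, setne), BB
/// is rebuilt as an AMDGPUISD::IF node whose operands are the intrinsic's
/// operands followed by BB as the branch destination. If the condition is not
/// negated, the destination is instead taken from the unconditional BR user,
/// and that BR is retargeted to BRCOND's original destination.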
8408SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
8409 SDLoc DL(BRCOND);
8410
8411 SDNode *Intr = BRCOND.getOperand(1).getNode();
8412 SDValue Target = BRCOND.getOperand(2);
8413 SDNode *BR = nullptr;
8414 SDNode *SetCC = nullptr;
8415
8416 switch (Intr->getOpcode()) {
8417 case ISD::SETCC: {
8418 // As long as we negate the condition everything is fine
8419 SetCC = Intr;
8420 Intr = SetCC->getOperand(0).getNode();
8421 break;
8422 }
8423 case ISD::XOR: {
8424 // Similar to SETCC, if we have (xor c, -1), we will be fine.
8425 SDValue LHS = Intr->getOperand(0);
8426 SDValue RHS = Intr->getOperand(1);
8427 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
8428 Intr = LHS.getNode();
8429 break;
8430 }
8431 [[fallthrough]];
8432 }
8433 default: {
8434 // Get the target from BR if we don't negate the condition
8435 BR = findUser(BRCOND, ISD::BR);
8436 assert(BR && "brcond missing unconditional branch user");
8437 Target = BR->getOperand(1);
8438 }
8439 }
8440
8441 unsigned CFNode = isCFIntrinsic(Intr);
8442 if (CFNode == 0) {
8443 // This is a uniform branch so we don't need to legalize.
8444 return BRCOND;
8445 }
8446
8447 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
8449
8450 assert(!SetCC ||
8451 (SetCC->getConstantOperandVal(1) == 1 &&
8452 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
8453 ISD::SETNE));
8454
8455 // operands of the new intrinsic call
8457 if (HaveChain)
8458 Ops.push_back(BRCOND.getOperand(0));
8459
8460 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
8461 Ops.push_back(Target);
8462
8463 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
8464
8465 // build the new intrinsic call
8466 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
8467
8468 if (!HaveChain) {
8469 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
8470
8472 }
8473
8474 if (BR) {
8475 // Give the branch instruction our target
8476 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
8477 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
8478 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
8479 }
8480
8481 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
8482
8483 // Copy the intrinsic results to registers
8484 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
8485 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
8486 if (!CopyToReg)
8487 continue;
8488
8489 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
8490 SDValue(Result, i - 1), SDValue());
8491
8492 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
8493 }
8494
8495 // Remove the old intrinsic from the chain
8496 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
8497 Intr->getOperand(0));
8498
8499 return Chain;
8500}
8501
8502SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8503 MVT VT = Op.getSimpleValueType();
8504 SDLoc DL(Op);
8505 // Checking the depth
8506 if (Op.getConstantOperandVal(0) != 0)
8507 return DAG.getConstant(0, DL, VT);
8508
8509 MachineFunction &MF = DAG.getMachineFunction();
8510 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8511 // Check for kernel and shader functions
8512 if (Info->isEntryFunction())
8513 return DAG.getConstant(0, DL, VT);
8514
8515 MachineFrameInfo &MFI = MF.getFrameInfo();
8516 // There is a call to @llvm.returnaddress in this function
8517 MFI.setReturnAddressIsTaken(true);
8518
8519 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8520 // Get the return address reg and mark it as an implicit live-in
8521 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
8522 getRegClassFor(VT, Op.getNode()->isDivergent()));
8523
8524 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
8525}
8526
8527SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8528 MachineFunction &MF = DAG.getMachineFunction();
8529 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8530
8531 // For functions that set up their own stack, select the GET_STACK_BASE
8532 // pseudo.
8533 if (MFI->isBottomOfStack())
8534 return Op;
8535
8536 // For everything else, create a dummy stack object.
8537 int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
8538 return DAG.getFrameIndex(FI, Op.getValueType());
8539}
8540
8541SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8542 const SDLoc &DL, EVT VT) const {
8543 return Op.getValueType().bitsLE(VT)
8544 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
8545 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
8546 DAG.getTargetConstant(0, DL, MVT::i32));
8547}
8548
8549SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8550 SelectionDAG &DAG) const {
8551 EVT DstVT = Op.getValueType();
8552 unsigned NumElts = DstVT.getVectorNumElements();
8553 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8554
8555 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
8556
8557 SDLoc DL(Op);
8558 unsigned Opc = Op.getOpcode();
8559 SDValue Flags = Op.getOperand(1);
8560 EVT HalfDstVT =
8561 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
8562 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
8563 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
8564
8565 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
8566}
8567
8568SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
8569 SDValue Src = Op.getOperand(0);
8570 EVT SrcVT = Src.getValueType();
8571 EVT DstVT = Op.getValueType();
8572
8573 if (DstVT.isVector() && DstVT.getScalarType() == MVT::f16) {
8574 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
8575 if (SrcVT.getScalarType() != MVT::f32)
8576 return SDValue();
8577 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
8578 }
8579
8580 if (SrcVT.getScalarType() != MVT::f64)
8581 return Op;
8582
8583 SDLoc DL(Op);
8584 if (DstVT == MVT::f16) {
8585 // TODO: Handle strictfp
8586 if (Op.getOpcode() != ISD::FP_ROUND)
8587 return Op;
8588
8589 if (!Subtarget->has16BitInsts()) {
8590 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
8591 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8592 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8593 }
8594 if (Op->getFlags().hasApproximateFuncs()) {
8595 SDValue Flags = Op.getOperand(1);
8596 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
8597 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
8598 }
8599 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
8600 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8601 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8602 }
8603
8604 assert(DstVT.getScalarType() == MVT::bf16 &&
8605 "custom lower FP_ROUND for f16 or bf16");
8606 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
8607
8608 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
8609 // hardware f32 -> bf16 instruction.
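// Rounding to odd in the first step preserves the sticky information in the
// f32 intermediate, so the second, correctly rounded f32 -> bf16 step cannot
// double-round.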
8610 EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
8611 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
8612 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
8613 DAG.getTargetConstant(0, DL, MVT::i32));
8614}
8615
8616SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8617 SelectionDAG &DAG) const {
8618 EVT VT = Op.getValueType();
8619 const MachineFunction &MF = DAG.getMachineFunction();
8620 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8621 bool IsIEEEMode = Info->getMode().IEEE;
8622
8623 // FIXME: Assert during selection that this is only selected for
8624 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8625 // mode functions, but this happens to be OK since it's only done in cases
8626 // where it is known that no sNaN can occur.
8627 if (IsIEEEMode)
8628 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
8629
8630 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8631 VT == MVT::v16bf16)
8632 return splitBinaryVectorOp(Op, DAG);
8633 return Op;
8634}
8635
8636SDValue
8637SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8638 SelectionDAG &DAG) const {
8639 EVT VT = Op.getValueType();
8640 const MachineFunction &MF = DAG.getMachineFunction();
8641 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8642 bool IsIEEEMode = Info->getMode().IEEE;
8643
8644 if (IsIEEEMode)
8645 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
8646
8647 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8648 VT == MVT::v16bf16)
8649 return splitBinaryVectorOp(Op, DAG);
8650 return Op;
8651}
8652
8653SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8654 SelectionDAG &DAG) const {
8655 EVT VT = Op.getValueType();
8656 if (VT.isVector())
8657 return splitBinaryVectorOp(Op, DAG);
8658
8659 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8660 !Subtarget->hasMinimum3Maximum3F16() &&
8661 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8662 "should not need to widen f16 minimum/maximum to v2f16");
8663
8664 // Widen f16 operation to v2f16
8665
8666 // fminimum f16:x, f16:y ->
8667 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8668 // (v2f16 (scalar_to_vector y))), 0
8669 SDLoc SL(Op);
8670 SDValue WideSrc0 =
8671 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
8672 SDValue WideSrc1 =
8673 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
8674
8675 SDValue Widened =
8676 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8677
8678 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
8679 DAG.getConstant(0, SL, MVT::i32));
8680}
8681
8682SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8683 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8684 EVT VT = Op.getValueType();
8685 assert(VT == MVT::f16);
8686
8687 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
8688 EVT ExpVT = Exp.getValueType();
8689 if (ExpVT == MVT::i16)
8690 return Op;
8691
8692 SDLoc DL(Op);
8693
8694 // Correct the exponent type for f16 to i16.
8695 // Clamp the range of the exponent to the instruction's range.
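// For example, an i32 exponent of 100000 clamps to 32767 and an exponent of
// -100000 clamps to -32768 before the truncate to i16.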
8696
8697 // TODO: This should be a generic narrowing legalization, and can easily be
8698 // done for GlobalISel as well.
8699
8700 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
8701 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
8702
8703 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
8704 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
8705
8706 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
8707
8708 if (IsStrict) {
8709 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
8710 {Op.getOperand(0), Op.getOperand(1), TruncExp});
8711 }
8712
8713 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
8714}
8715
8717 switch (Op->getOpcode()) {
8718 case ISD::SRA:
8719 case ISD::SMIN:
8720 case ISD::SMAX:
8721 return ISD::SIGN_EXTEND;
8722 case ISD::SRL:
8723 case ISD::UMIN:
8724 case ISD::UMAX:
8725 return ISD::ZERO_EXTEND;
8726 case ISD::ADD:
8727 case ISD::SUB:
8728 case ISD::AND:
8729 case ISD::OR:
8730 case ISD::XOR:
8731 case ISD::SHL:
8732 case ISD::SELECT:
8733 case ISD::MUL:
8734 // The operation's result won't be influenced by garbage high bits.
8735 // TODO: are all of those cases correct, and are there more?
8736 return ISD::ANY_EXTEND;
8737 case ISD::SETCC: {
8738 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8740 }
8741 default:
8742 llvm_unreachable("unexpected opcode!");
8743 }
8744}
8745
8746SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8747 DAGCombinerInfo &DCI) const {
8748 const unsigned Opc = Op.getOpcode();
8749 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8750 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8751 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8752 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8753 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX);
8754
8755 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8756 : Op->getOperand(0).getValueType();
8757 auto &DAG = DCI.DAG;
8758 auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8759
8760 if (DCI.isBeforeLegalizeOps() ||
8761 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8762 return SDValue();
8763
8764 SDLoc DL(Op);
8765 SDValue LHS;
8766 SDValue RHS;
8767 if (Opc == ISD::SELECT) {
8768 LHS = Op->getOperand(1);
8769 RHS = Op->getOperand(2);
8770 } else {
8771 LHS = Op->getOperand(0);
8772 RHS = Op->getOperand(1);
8773 }
8774
8775 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8776 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
8777
8778 // Special case: for shifts, the RHS always needs a zext.
8779 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8780 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
8781 else
8782 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
8783
8784 // setcc always returns i1 (or an i1 vector), so there is no need to truncate after.
8785 if (Opc == ISD::SETCC) {
8786 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8787 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8788 }
8789
8790 // For other ops, we extend the operation's return type as well so we need to
8791 // truncate back to the original type.
8792 SDValue NewVal;
8793 if (Opc == ISD::SELECT)
8794 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8795 else
8796 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8797
8798 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8799}
8800
8801SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8802 SDValue Mag = Op.getOperand(0);
8803 EVT MagVT = Mag.getValueType();
8804
8805 if (MagVT.getVectorNumElements() > 2)
8806 return splitBinaryVectorOp(Op, DAG);
8807
8808 SDValue Sign = Op.getOperand(1);
8809 EVT SignVT = Sign.getValueType();
8810
8811 if (MagVT == SignVT)
8812 return Op;
8813
8814 // fcopysign v2f16:mag, v2f32:sign ->
8815 // fcopysign v2f16:mag, bitcast (trunc (bitcast sign to v2i32) to v2i16)
8816
8817 SDLoc SL(Op);
8818 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8819 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignAsInt32);
8820
8821 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8822
8823 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8824}
8825
8826// Custom lowering for vector multiplications and s_mul_u64.
8827SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8828 EVT VT = Op.getValueType();
8829
8830 // Split vector operands.
8831 if (VT.isVector())
8832 return splitBinaryVectorOp(Op, DAG);
8833
8834 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8835
8836 // There are four ways to lower s_mul_u64:
8837 //
8838 // 1. If all the operands are uniform, then we lower it as it is.
8839 //
8840 // 2. If the operands are divergent, then we have to split s_mul_u64 into
8841 // 32-bit multiplications because there is no vector equivalent of s_mul_u64.
8842 //
8843 // 3. If the cost model decides that it is more efficient to use vector
8844 // registers, then we have to split s_mul_u64 into 32-bit multiplications.
8845 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp.
8846 //
8847 // 4. If the cost model decides to use vector registers and both of the
8848 // operands are zero-extended/sign-extended from 32 bits, then we split the
8849 // s_mul_u64 into two 32-bit multiplications. The problem is that it is not
8850 // possible to check if the operands are zero-extended or sign-extended in
8851 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8852 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8853 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8854 // If the cost model decides that we have to use vector registers, then
8855 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) splits s_mul_u64_u32/
8856 // s_mul_i64_i32_pseudo into two vector multiplications. If the cost model
8857 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8858 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8859 // SIInstrInfo.cpp.
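// Illustrative example (not from this file): for a uniform i64 multiply
// whose operands are both zero-extended from i32, computeKnownBits reports
// at least 32 leading zero bits on each operand, so the node is rewritten to
// S_MUL_U64_U32_PSEUDO below; the sign-extended analogue is rewritten to
// S_MUL_I64_I32_PSEUDO.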
8860
8861 if (Op->isDivergent())
8862 return SDValue();
8863
8864 SDValue Op0 = Op.getOperand(0);
8865 SDValue Op1 = Op.getOperand(1);
8866 // If both operands are zero-extended from 32 bits, then we replace s_mul_u64
8867 // with s_mul_u64_u32_pseudo. If both operands are sign-extended from
8868 // 32 bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8869 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8870 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8871 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8872 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8873 SDLoc SL(Op);
8874 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8875 return SDValue(
8876 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8877 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8878 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8879 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8880 return SDValue(
8881 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8882 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8883 return Op;
8884}
8885
8886SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8887 EVT VT = Op.getValueType();
8888 SDLoc SL(Op);
8889 SDValue LHS = Op.getOperand(0);
8890 SDValue RHS = Op.getOperand(1);
8891 bool isSigned = Op.getOpcode() == ISD::SMULO;
8892
8893 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
8894 const APInt &C = RHSC->getAPIntValue();
8895 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
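// Worked example: for i32 umulo(x, 8), S = 3, so
//   Result   = x << 3
//   Overflow = ((x << 3) >> 3) != x   (logical shift right)
// which is true exactly when any of the top 3 bits of x are set, i.e. when
// the product does not fit in 32 bits.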
8896 if (C.isPowerOf2()) {
8897 // smulo(x, signed_min) is same as umulo(x, signed_min).
8898 bool UseArithShift = isSigned && !C.isMinSignedValue();
8899 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
8900 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
8901 SDValue Overflow =
8902 DAG.getSetCC(SL, MVT::i1,
8903 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
8904 Result, ShiftAmt),
8905 LHS, ISD::SETNE);
8906 return DAG.getMergeValues({Result, Overflow}, SL);
8907 }
8908 }
8909
8910 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
8911 SDValue Top =
8912 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
8913
8914 SDValue Sign = isSigned
8915 ? DAG.getNode(ISD::SRA, SL, VT, Result,
8916 DAG.getConstant(VT.getScalarSizeInBits() - 1,
8917 SL, MVT::i32))
8918 : DAG.getConstant(0, SL, VT);
8919 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
8920
8921 return DAG.getMergeValues({Result, Overflow}, SL);
8922}
8923
8924SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
8925 if (Op->isDivergent()) {
8926 // Select to V_MAD_[IU]64_[IU]32.
8927 return Op;
8928 }
8929 if (Subtarget->hasSMulHi()) {
8930 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
8931 return SDValue();
8932 }
8933 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
8934 // calculate the high part, so we might as well do the whole thing with
8935 // V_MAD_[IU]64_[IU]32.
8936 return Op;
8937}
8938
8939SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
8940 if (!Subtarget->hasTrapHandler() ||
8941 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
8942 return lowerTrapEndpgm(Op, DAG);
8943
8944 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
8945 : lowerTrapHsaQueuePtr(Op, DAG);
8946}
8947
8948SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
8949 SDLoc SL(Op);
8950 SDValue Chain = Op.getOperand(0);
8951 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
8952}
8953
8954SDValue
8955SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
8956 const SDLoc &DL, Align Alignment,
8957 ImplicitParameter Param) const {
8958 MachineFunction &MF = DAG.getMachineFunction();
8959 uint64_t Offset = getImplicitParameterOffset(MF, Param);
8960 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
8961 MachinePointerInfo PtrInfo =
8963 return DAG.getLoad(
8964 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
8966}
8967
8968SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
8969 SelectionDAG &DAG) const {
8970 SDLoc SL(Op);
8971 SDValue Chain = Op.getOperand(0);
8972
8973 SDValue QueuePtr;
8974 // For code object version 5, QueuePtr is passed through implicit kernarg.
8975 const Module *M = DAG.getMachineFunction().getFunction().getParent();
8977 QueuePtr =
8978 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
8979 } else {
8980 MachineFunction &MF = DAG.getMachineFunction();
8981 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8982 Register UserSGPR = Info->getQueuePtrUserSGPR();
8983
8984 if (UserSGPR == AMDGPU::NoRegister) {
8985 // We probably are in a function incorrectly marked with
8986 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
8987 // trap, so just use a null pointer.
8988 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
8989 } else {
8990 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
8991 MVT::i64);
8992 }
8993 }
8994
8995 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
8996 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
8997
8998 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
8999 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
9000 ToReg.getValue(1)};
9001 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
9002}
9003
9004SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
9005 SDLoc SL(Op);
9006 SDValue Chain = Op.getOperand(0);
9007
9008 // We need to simulate the 's_trap 2' instruction on targets that run in
9009 // PRIV=1 (where it is treated as a nop).
9010 if (Subtarget->hasPrivEnabledTrap2NopBug())
9011 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
9012
9013 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
9014 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
9015 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
9016}
9017
9018SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
9019 SDLoc SL(Op);
9020 SDValue Chain = Op.getOperand(0);
9021 MachineFunction &MF = DAG.getMachineFunction();
9022
9023 if (!Subtarget->hasTrapHandler() ||
9024 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
9025 LLVMContext &Ctx = MF.getFunction().getContext();
9026 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
9027 "debugtrap handler not supported",
9028 Op.getDebugLoc(), DS_Warning));
9029 return Chain;
9030 }
9031
9032 uint64_t TrapID =
9033 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
9034 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
9035 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
9036}
9037
9038/// When a divergent value (in VGPR) is passed to an inline asm with an SGPR
9039/// constraint ('s'), we need to insert v_readfirstlane to move the value from
9040/// VGPR to SGPR. This is done by modifying the CopyToReg nodes in the glue
9041/// chain that feed into the INLINEASM node.
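/// A minimal illustrative case (assumed source, not from the tree):
///   asm volatile("; use %0" :: "s"(divergent_value));
/// Here the CopyToReg feeding the INLINEASM glue chain has its source wrapped
/// in llvm.amdgcn.readfirstlane so that a uniform value reaches the SGPR
/// input register.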
9042SDValue SITargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
9043 unsigned NumOps = Op.getNumOperands();
9044
9045 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
9046 SmallSet<Register, 8> SGPRInputRegs;
9047
9048 unsigned NumVals = 0;
9049 for (unsigned I = InlineAsm::Op_FirstOperand; I < NumOps - 1;
9050 I += 1 + NumVals) {
9051 const InlineAsm::Flag Flags(Op.getConstantOperandVal(I));
9052 NumVals = Flags.getNumOperandRegisters();
9053
9054 unsigned RCID;
9055 bool IsSGPRInput = Flags.getKind() == InlineAsm::Kind::RegUse &&
9056 NumVals > 0 && Flags.hasRegClassConstraint(RCID) &&
9057 TRI->isSGPRClass(TRI->getRegClass(RCID));
9058
9059 for (unsigned J = 0; J < NumVals; ++J) {
9060 SDValue Val = Op.getOperand(I + 1 + J);
9061 if (const RegisterSDNode *RegNode =
9063 Register Reg = RegNode->getReg();
9064 if (IsSGPRInput || (Reg.isPhysical() && TRI->isSGPRPhysReg(Reg)))
9065 SGPRInputRegs.insert(Reg);
9066 }
9067 }
9068 }
9069
9070 if (SGPRInputRegs.empty())
9071 return Op;
9072
9073 // Walk the glue chain and insert readfirstlane for divergent SGPR inputs.
9074 SDLoc DL(Op);
9075 SDNode *N = Op.getOperand(NumOps - 1).getNode();
9076
9077 while (N && N->getOpcode() == ISD::CopyToReg) {
9078 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
9079 SDValue SrcVal = N->getOperand(2);
9080
9081 // Insert readfirstlane if copying a divergent value to an SGPR input.
9082 if (SrcVal->isDivergent() && SGPRInputRegs.count(Reg)) {
9083 SDValue ReadFirstLaneID =
9084 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
9085 SDValue ReadFirstLane =
9087 ReadFirstLaneID, SrcVal);
9088
9089 SmallVector<SDValue, 4> Ops = {N->getOperand(0), N->getOperand(1),
9090 ReadFirstLane};
9091 if (N->getNumOperands() > 3)
9092 Ops.push_back(N->getOperand(3)); // Glue input
9093
9094 DAG.UpdateNodeOperands(N, Ops);
9095 }
9096
9097 // Follow glue chain to next CopyToReg.
9098 SDNode *Next = nullptr;
9099 for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) {
9100 if (N->getOperand(I).getValueType() == MVT::Glue) {
9101 Next = N->getOperand(I).getNode();
9102 break;
9103 }
9104 }
9105 N = Next;
9106 }
9107
9108 return Op;
9109}
9110
9111SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
9112 SelectionDAG &DAG) const {
9113 if (Subtarget->hasApertureRegs()) {
9114 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
9115 ? AMDGPU::SRC_SHARED_BASE
9116 : AMDGPU::SRC_PRIVATE_BASE;
9117 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
9118 !Subtarget->hasGloballyAddressableScratch()) &&
9119 "Cannot use src_private_base with globally addressable scratch!");
9120 // Note: this feature (register) is broken. When used as a 32-bit operand,
9121 // it returns a wrong value (all zeroes?). The real value is in the upper 32
9122 // bits.
9123 //
9124 // To work around the issue, emit a 64 bit copy from this register
9125 // then extract the high bits. Note that this shouldn't even result in a
9126 // shift being emitted and simply become a pair of registers (e.g.):
9127 // s_mov_b64 s[6:7], src_shared_base
9128 // v_mov_b32_e32 v1, s7
9129 SDValue Copy =
9130 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
9131 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
9132 }
9133
9134 // For code object version 5, private_base and shared_base are passed through
9135 // implicit kernargs.
9136 const Module *M = DAG.getMachineFunction().getFunction().getParent();
9140 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
9141 }
9142
9143 MachineFunction &MF = DAG.getMachineFunction();
9144 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9145 Register UserSGPR = Info->getQueuePtrUserSGPR();
9146 if (UserSGPR == AMDGPU::NoRegister) {
9147 // We probably are in a function incorrectly marked with
9148 // amdgpu-no-queue-ptr. This is undefined.
9149 return DAG.getPOISON(MVT::i32);
9150 }
9151
9152 SDValue QueuePtr =
9153 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
9154
9155 // Offset into amd_queue_t for group_segment_aperture_base_hi /
9156 // private_segment_aperture_base_hi.
9157 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
9158
9159 SDValue Ptr =
9160 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
9161
9162 // TODO: Use custom target PseudoSourceValue.
9163 // TODO: We should use the value from the IR intrinsic call, but it might not
9164 // be available, and how would we get it?
9165 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
9166 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
9167 commonAlignment(Align(64), StructOffset),
9170}
9171
9172/// Return true if the value is a known valid address, such that a null check is
9173/// not necessary.
9175 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
9177 return true;
9178
9179 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
9180 return ConstVal->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
9181
9182 // TODO: Search through arithmetic, handle arguments and loads
9183 // marked nonnull.
9184 return false;
9185}
9186
9187SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
9188 SelectionDAG &DAG) const {
9189 SDLoc SL(Op);
9190
9191 const AMDGPUTargetMachine &TM =
9192 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
9193
9194 unsigned DestAS, SrcAS;
9195 SDValue Src;
9196 bool IsNonNull = false;
9197 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
9198 SrcAS = ASC->getSrcAddressSpace();
9199 Src = ASC->getOperand(0);
9200 DestAS = ASC->getDestAddressSpace();
9201 } else {
9202 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
9203 Op.getConstantOperandVal(0) ==
9204 Intrinsic::amdgcn_addrspacecast_nonnull);
9205 Src = Op->getOperand(1);
9206 SrcAS = Op->getConstantOperandVal(2);
9207 DestAS = Op->getConstantOperandVal(3);
9208 IsNonNull = true;
9209 }
9210
9211 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
9212
9213 // flat -> local/private
9214 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
9215 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
9216 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
9217 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
9218
9219 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
9220 Subtarget->hasGloballyAddressableScratch()) {
9221 // flat -> private with globally addressable scratch: subtract
9222 // src_flat_scratch_base_lo.
9223 SDValue FlatScratchBaseLo(
9224 DAG.getMachineNode(
9225 AMDGPU::S_MOV_B32, SL, MVT::i32,
9226 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
9227 0);
9228 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
9229 }
9230
9231 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
9232 return Ptr;
9233
9234 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
9235 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
9236 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
9237
9238 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
9239 SegmentNullPtr);
9240 }
9241 }
9242
9243 // local/private -> flat
9244 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
9245 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
9246 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
9247 SDValue CvtPtr;
9248 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
9249 Subtarget->hasGloballyAddressableScratch()) {
9250 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
9251 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
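// The high word of the address is therefore built below by shifting the lane
// id left by 57 - 32 - log2(wavefront size): 20 bits for wave32 (52 - 32) and
// 19 bits for wave64 (51 - 32).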
9252 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
9253 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
9254 ThreadID = DAG.getNode(
9255 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
9256 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
9257 AllOnes, ThreadID);
9258 if (Subtarget->isWave64())
9259 ThreadID = DAG.getNode(
9260 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
9261 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
9262 AllOnes, ThreadID);
9263 SDValue ShAmt = DAG.getShiftAmountConstant(
9264 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
9265 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
9266 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
9267 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
9268 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
9269 // 64-bit hi:lo value.
9270 SDValue FlatScratchBase = {
9271 DAG.getMachineNode(
9272 AMDGPU::S_MOV_B64, SL, MVT::i64,
9273 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
9274 0};
9275 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
9276 } else {
9277 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
9278 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
9279 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
9280 }
9281
9282 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
9283 return CvtPtr;
9284
9285 unsigned NullVal = AMDGPU::getNullPointerValue(SrcAS);
9286 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
9287
9288 SDValue NonNull =
9289 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
9290
9291 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
9292 FlatNullPtr);
9293 }
9294 }
9295
9296 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9297 Op.getValueType() == MVT::i64) {
9298 const SIMachineFunctionInfo *Info =
9299 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
9300 if (Info->get32BitAddressHighBits() == 0)
9301 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
9302
9303 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
9304 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
9305 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
9306 }
9307
9308 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9309 Src.getValueType() == MVT::i64)
9310 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
9311
9312 // global <-> flat are no-ops and never emitted.
9313
9314 // Invalid casts are poison.
9315 return DAG.getPOISON(Op->getValueType(0));
9316}
9317
9318// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
9319// the small vector and inserting them into the big vector. That is better than
9320// the default expansion of doing it via a stack slot. Even though the use of
9321// the stack slot would be optimized away afterwards, the stack slot itself
9322// remains.
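// Illustrative example (assumed types): inserting a v2i16 subvector into a
// v8i16 vector at even index 4 takes the 16-bit path below and becomes a
// single 32-bit element insert into the bitcast v4i32 vector; other cases
// fall back to per-element extract/insert pairs.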
9323SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
9324 SelectionDAG &DAG) const {
9325 SDValue Vec = Op.getOperand(0);
9326 SDValue Ins = Op.getOperand(1);
9327 SDValue Idx = Op.getOperand(2);
9328 EVT VecVT = Vec.getValueType();
9329 EVT InsVT = Ins.getValueType();
9330 EVT EltVT = VecVT.getVectorElementType();
9331 unsigned InsNumElts = InsVT.getVectorNumElements();
9332 unsigned IdxVal = Idx->getAsZExtVal();
9333 SDLoc SL(Op);
9334
9335 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
9336 // Insert 32-bit registers at a time.
9337 assert(InsNumElts % 2 == 0 && "expect legal vector types");
9338
9339 unsigned VecNumElts = VecVT.getVectorNumElements();
9340 EVT NewVecVT =
9341 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
9342 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
9344 MVT::i32, InsNumElts / 2);
9345
9346 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
9347 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
9348
9349 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
9350 SDValue Elt;
9351 if (InsNumElts == 2) {
9352 Elt = Ins;
9353 } else {
9354 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
9355 DAG.getConstant(I, SL, MVT::i32));
9356 }
9357 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
9358 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
9359 }
9360
9361 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
9362 }
9363
9364 for (unsigned I = 0; I != InsNumElts; ++I) {
9365 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
9366 DAG.getConstant(I, SL, MVT::i32));
9367 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
9368 DAG.getConstant(IdxVal + I, SL, MVT::i32));
9369 }
9370 return Vec;
9371}
9372
9373SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
9374 SelectionDAG &DAG) const {
9375 SDValue Vec = Op.getOperand(0);
9376 SDValue InsVal = Op.getOperand(1);
9377 SDValue Idx = Op.getOperand(2);
9378 EVT VecVT = Vec.getValueType();
9379 EVT EltVT = VecVT.getVectorElementType();
9380 unsigned VecSize = VecVT.getSizeInBits();
9381 unsigned EltSize = EltVT.getSizeInBits();
9382 SDLoc SL(Op);
9383
9384 // Specially handle the case of v4i16 with static indexing.
9385 unsigned NumElts = VecVT.getVectorNumElements();
9386 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
9387 if (NumElts == 4 && EltSize == 16 && KIdx) {
9388 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
9389
9390 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
9391 DAG.getConstant(0, SL, MVT::i32));
9392 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
9393 DAG.getConstant(1, SL, MVT::i32));
9394
9395 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
9396 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
9397
9398 unsigned Idx = KIdx->getZExtValue();
9399 bool InsertLo = Idx < 2;
9400 SDValue InsHalf = DAG.getNode(
9401 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
9402 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
9403 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
9404
9405 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
9406
9407 SDValue Concat =
9408 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
9409 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
9410
9411 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
9412 }
9413
9414 // Static indexing does not lower to stack access, and hence there is no need
9415 // for special custom lowering to avoid stack access.
9416 if (isa<ConstantSDNode>(Idx))
9417 return SDValue();
9418
9419 // Avoid stack access for dynamic indexing by custom lowering to
9420 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
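// For example, a dynamic insert into v4i16 at runtime index 2 gives
// ScaledIdx = 32 and BFM = 0xffff << 32, so only bits [47:32] of the bitcast
// i64 vector are replaced by the corresponding bits of the splatted value.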
9421
9422 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
9423
9424 MVT IntVT = MVT::getIntegerVT(VecSize);
9425
9426 // Convert vector index to bit-index and get the required bit mask.
9427 assert(isPowerOf2_32(EltSize));
9428 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
9429 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
9430 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
9431 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
9432 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
9433
9434 // 1. Create a congruent vector with the target value in each element.
9435 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
9436 DAG.getSplatBuildVector(VecVT, SL, InsVal));
9437
9438 // 2. Mask off all other indices except the required index within (1).
9439 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
9440
9441 // 3. Mask off the required index within the target vector.
9442 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
9443 SDValue RHS =
9444 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
9445
9446 // 4. Get (2) and (3) ORed into the target vector.
9447 SDValue BFI =
9448 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
9449
9450 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
9451}
9452
9453SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
9454 SelectionDAG &DAG) const {
9455 SDLoc SL(Op);
9456
9457 EVT ResultVT = Op.getValueType();
9458 SDValue Vec = Op.getOperand(0);
9459 SDValue Idx = Op.getOperand(1);
9460 EVT VecVT = Vec.getValueType();
9461 unsigned VecSize = VecVT.getSizeInBits();
9462 EVT EltVT = VecVT.getVectorElementType();
9463
9464 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
9465
9466 // Make sure we do any optimizations that will make it easier to fold
9467 // source modifiers before obscuring it with bit operations.
9468
9469 // XXX - Why doesn't this get called when vector_shuffle is expanded?
9470 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
9471 return Combined;
9472
9473 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
9474 SDValue Lo, Hi;
9475 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
9476
9477 if (VecSize == 128) {
9478 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
9479 Lo = DAG.getBitcast(LoVT,
9480 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9481 DAG.getConstant(0, SL, MVT::i32)));
9482 Hi = DAG.getBitcast(HiVT,
9483 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9484 DAG.getConstant(1, SL, MVT::i32)));
9485 } else if (VecSize == 256) {
9486 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
9487 SDValue Parts[4];
9488 for (unsigned P = 0; P < 4; ++P) {
9489 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9490 DAG.getConstant(P, SL, MVT::i32));
9491 }
9492
9493 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
9494 Parts[0], Parts[1]));
9495 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
9496 Parts[2], Parts[3]));
9497 } else {
9498 assert(VecSize == 512);
9499
9500 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
9501 SDValue Parts[8];
9502 for (unsigned P = 0; P < 8; ++P) {
9503 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9504 DAG.getConstant(P, SL, MVT::i32));
9505 }
9506
9507 Lo = DAG.getBitcast(LoVT,
9508 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
9509 Parts[0], Parts[1], Parts[2], Parts[3]));
9510 Hi = DAG.getBitcast(HiVT,
9511 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
9512 Parts[4], Parts[5], Parts[6], Parts[7]));
9513 }
9514
9515 EVT IdxVT = Idx.getValueType();
9516 unsigned NElem = VecVT.getVectorNumElements();
9517 assert(isPowerOf2_32(NElem));
9518 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
9519 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
9520 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
9521 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
9522 }
9523
9524 assert(VecSize <= 64);
9525
9526 MVT IntVT = MVT::getIntegerVT(VecSize);
9527
9528 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
9529 SDValue VecBC = peekThroughBitcasts(Vec);
9530 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
9531 SDValue Src = VecBC.getOperand(0);
9532 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
9533 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
9534 }
9535
9536 unsigned EltSize = EltVT.getSizeInBits();
9537 assert(isPowerOf2_32(EltSize));
9538
9539 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
9540
9541 // Convert vector index to bit-index (* EltSize)
9542 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
9543
9544 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
9545 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
9546
9547 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
9548 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
9549 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
9550 }
9551
9552 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
9553}
9554
9555static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
9556 assert(Elt % 2 == 0);
9557 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9558}
9559
9560static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9561 assert(Elt % 2 == 0);
9562 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9563 !(Mask[Elt + 1] & 1);
9564}
9565
9566SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
9567 SelectionDAG &DAG) const {
9568 SDLoc SL(Op);
9569 EVT ResultVT = Op.getValueType();
9570 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
9571 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
9572 const int NewSrcNumElts = 2;
9573 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
9574 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
9575
9576 // Break up the shuffle into registers sized pieces.
9577 //
9578 // We're trying to form sub-shuffles that the register allocation pipeline
9579 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
9580 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
9581 // pair of copies into a consecutive register copy, so use the ordinary
9582 // extract_vector_elt lowering unless we can use the shuffle.
9583 //
9584 // TODO: This is a bit of a hack, and we should probably always use
9585 // extract_subvector for the largest possible subvector we can (or at least
9586 // use it for PackVT-aligned pieces). However, we have worse support for
9587 // combines on them and don't directly treat extract_subvector /
9588 // insert_subvector as legal. The DAG scheduler also ends up doing a worse
9589 // job with the extract_subvectors.
9590 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
9591
9592 // vector_shuffle <0,1,6,7> lhs, rhs
9593 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
9594 //
9595 // vector_shuffle <6,7,2,3> lhs, rhs
9596 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
9597 //
9598 // vector_shuffle <6,7,0,1> lhs, rhs
9599 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
9600
9601 // Avoid scalarizing when both halves are reading from consecutive elements.
9602
9603 // If we're treating 2 element shuffles as legal, also create odd-to-even
9604 // shuffles of neighboring pairs.
9605 //
9606 // vector_shuffle <3,2,7,6> lhs, rhs
9607 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
9608 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
9609
9611 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
9612 if (ShouldUseConsecutiveExtract &&
9614 const int Idx = SVN->getMaskElt(I);
9615 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9616 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9617 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
9618 SVN->getOperand(VecIdx),
9619 DAG.getConstant(EltIdx, SL, MVT::i32));
9620 Pieces.push_back(SubVec);
9621 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
9623 int Idx0 = SVN->getMaskElt(I);
9624 int Idx1 = SVN->getMaskElt(I + 1);
9625
9626 SDValue SrcOp0 = SVN->getOperand(0);
9627 SDValue SrcOp1 = SrcOp0;
9628 if (Idx0 >= SrcNumElts) {
9629 SrcOp0 = SVN->getOperand(1);
9630 Idx0 -= SrcNumElts;
9631 }
9632
9633 if (Idx1 >= SrcNumElts) {
9634 SrcOp1 = SVN->getOperand(1);
9635 Idx1 -= SrcNumElts;
9636 }
9637
9638 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9639 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9640
9641 // Extract nearest even aligned piece.
9642 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
9643 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
9644 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
9645 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
9646
9647 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9648 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9649
9650 SDValue Result0 = SubVec0;
9651 SDValue Result1 = SubVec0;
9652
9653 if (SubVec0 != SubVec1) {
9654 NewMaskIdx1 += NewSrcNumElts;
9655 Result1 = SubVec1;
9656 } else {
9657 Result1 = DAG.getPOISON(PackVT);
9658 }
9659
9660 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
9661 {NewMaskIdx0, NewMaskIdx1});
9662 Pieces.push_back(Shuf);
9663 } else {
9664 const int Idx0 = SVN->getMaskElt(I);
9665 const int Idx1 = SVN->getMaskElt(I + 1);
9666 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9667 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9668 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9669 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9670
9671 SDValue Vec0 = SVN->getOperand(VecIdx0);
9672 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
9673 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
9674
9675 SDValue Vec1 = SVN->getOperand(VecIdx1);
9676 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
9677 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
9678 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
9679 }
9680 }
9681
9682 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
9683}
9684
9685SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9686 SelectionDAG &DAG) const {
9687 SDValue SVal = Op.getOperand(0);
9688 EVT ResultVT = Op.getValueType();
9689 EVT SValVT = SVal.getValueType();
9690 SDValue UndefVal = DAG.getPOISON(SValVT);
9691 SDLoc SL(Op);
9692
9694 VElts.push_back(SVal);
9695 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9696 VElts.push_back(UndefVal);
9697
9698 return DAG.getBuildVector(ResultVT, SL, VElts);
9699}
9700
9701SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9702 SelectionDAG &DAG) const {
9703 SDLoc SL(Op);
9704 EVT VT = Op.getValueType();
9705
9706 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9707 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9708
9709 SDValue Lo = Op.getOperand(0);
9710 SDValue Hi = Op.getOperand(1);
9711
9712 // Avoid adding defined bits with the zero_extend.
9713 if (Hi.isUndef()) {
9714 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9715 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
9716 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
9717 }
9718
9719 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
9720 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
9721
9722 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
9723 DAG.getConstant(16, SL, MVT::i32));
9724 if (Lo.isUndef())
9725 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
9726
9727 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9728 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
9729
9730 SDValue Or =
9731 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
9732 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
9733 }
9734
9735 // Split into 2-element chunks.
9736 const unsigned NumParts = VT.getVectorNumElements() / 2;
9737 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
9738 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
9739
9741 for (unsigned P = 0; P < NumParts; ++P) {
9742 SDValue Vec = DAG.getBuildVector(
9743 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
9744 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
9745 }
9746
9747 SDValue Blend =
9748 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
9749 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
9750}
9751
9753 const GlobalAddressSDNode *GA) const {
9754 // OSes that use ELF REL relocations (instead of RELA) can only store a
9755 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9756 // which can create arbitrary 64-bit addends. (This is only a problem for
9757 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9758 // the high 32 bits of the addend.)
9759 //
9760 // This should be kept in sync with how HasRelocationAddend is initialized in
9761 // the constructor of ELFAMDGPUAsmBackend.
9762 if (!Subtarget->isAmdHsaOS())
9763 return false;
9764
9765 // We can fold offsets for anything that doesn't require a GOT relocation.
9766 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
9770}
9771
9772static SDValue
9774 const SDLoc &DL, int64_t Offset, EVT PtrVT,
9775 unsigned GAFlags = SIInstrInfo::MO_NONE) {
9776 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
9777 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9778 // lowered to the following code sequence:
9779 //
9780 // For constant address space:
9781 // s_getpc_b64 s[0:1]
9782 // s_add_u32 s0, s0, $symbol
9783 // s_addc_u32 s1, s1, 0
9784 //
9785 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9786 // a fixup or relocation is emitted to replace $symbol with a literal
9787 // constant, which is a pc-relative offset from the encoding of the $symbol
9788 // operand to the global variable.
9789 //
9790 // For global address space:
9791 // s_getpc_b64 s[0:1]
9792 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9793 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9794 //
9795 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9796 // fixups or relocations are emitted to replace $symbol@*@lo and
9797 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9798 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9799 // operand to the global variable.
9800 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9801 assert(GAFlags != SIInstrInfo::MO_NONE);
9802
9803 SDValue Ptr =
9804 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
9805 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
9806 }
9807
9808 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
9809 SDValue PtrHi;
9810 if (GAFlags == SIInstrInfo::MO_NONE)
9811 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
9812 else
9813 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
9814 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
9815}
9816
9817SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
9818 SDValue Op,
9819 SelectionDAG &DAG) const {
9820 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
9821 SDLoc DL(GSD);
9822 EVT PtrVT = Op.getValueType();
9823
9824 const GlobalValue *GV = GSD->getGlobal();
9830 GV->hasExternalLinkage()) {
9831 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
9832 // HIP uses an unsized array `extern __shared__ T s[]` or a similar
9833 // zero-sized type in other languages to declare dynamic shared
9834 // memory whose size is not known at compile time. Such arrays are
9835 // allocated by the runtime and placed directly after the statically
9836 // allocated ones, so they all share the same offset.
9837 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
9838 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9839 // Adjust alignment for that dynamic shared memory array.
9841 MFI->setDynLDSAlign(F, GVar);
9842 MFI->setUsesDynamicLDS(true);
9843 return SDValue(
9844 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
9845 }
9846 }
9848 }
9849
9851 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
9853 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
9854 }
9855
9856 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9857 if (Subtarget->has64BitLiterals()) {
9859 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9860 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9861 0);
9862 }
9863
9864 SDValue AddrLo = DAG.getTargetGlobalAddress(
9865 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9866 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9867
9868 SDValue AddrHi = DAG.getTargetGlobalAddress(
9869 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9870 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9871
9872 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9873 }
9874
9875 if (shouldEmitFixup(GV))
9876 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9877
9878 if (shouldEmitPCReloc(GV))
9879 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
9881
9882 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
9884 PointerType *PtrTy =
9886 const DataLayout &DataLayout = DAG.getDataLayout();
9887 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
9888 MachinePointerInfo PtrInfo =
9890
9891 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
9894}
9895
9896SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
9897 SelectionDAG &DAG) const {
9898 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
9899 const Function &Fn = DAG.getMachineFunction().getFunction();
9900 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
9901 Fn, "unsupported external symbol", Op.getDebugLoc()));
9902 return DAG.getPOISON(Op.getValueType());
9903}
9904
9906 const SDLoc &DL, SDValue V) const {
9907 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
9908 // the destination register.
9909 //
9910 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
9911 // so we will end up with redundant moves to m0.
9912 //
9913 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
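// Typical use (illustrative; see the ds_ordered_count lowering later in this
// file): callers thread the glue result into the node that reads m0, e.g.
//   SDValue Glue = copyToM0(DAG, Chain, DL, M0Val).getValue(1);
// where M0Val is just a placeholder name for the value being written to m0,
// so the m0 write is ordered immediately before the consuming instruction.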
9914
9915 // A Null SDValue creates a glue result.
9916 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
9917 V, Chain);
9918 return SDValue(M0, 0);
9919}
9920
9921SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
9922 MVT VT,
9923 unsigned Offset) const {
9924 SDLoc SL(Op);
9925 SDValue Param = lowerKernargMemParameter(
9926 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
9927 // The local size values will have the high 16 bits as zero.
9928 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
9929 DAG.getValueType(VT));
9930}
9931
9933 EVT VT) {
9936 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
9937 return DAG.getPOISON(VT);
9938}
9939
9941 EVT VT) {
9944 "intrinsic not supported on subtarget", DL.getDebugLoc()));
9945 return DAG.getPOISON(VT);
9946}
9947
9949 ArrayRef<SDValue> Elts) {
9950 assert(!Elts.empty());
9951 MVT Type;
9952 unsigned NumElts = Elts.size();
9953
9954 if (NumElts <= 12) {
9955 Type = MVT::getVectorVT(MVT::f32, NumElts);
9956 } else {
9957 assert(Elts.size() <= 16);
9958 Type = MVT::v16f32;
9959 NumElts = 16;
9960 }
9961
9962 SmallVector<SDValue, 16> VecElts(NumElts);
9963 for (unsigned i = 0; i < Elts.size(); ++i) {
9964 SDValue Elt = Elts[i];
9965 if (Elt.getValueType() != MVT::f32)
9966 Elt = DAG.getBitcast(MVT::f32, Elt);
9967 VecElts[i] = Elt;
9968 }
9969 for (unsigned i = Elts.size(); i < NumElts; ++i)
9970 VecElts[i] = DAG.getPOISON(MVT::f32);
9971
9972 if (NumElts == 1)
9973 return VecElts[0];
9974 return DAG.getBuildVector(Type, DL, VecElts);
9975}
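// Behavior sketch (illustrative): five 32-bit operands yield a v5f32
// build_vector (non-f32 elements are bitcast to f32 first), while thirteen
// operands are padded with poison up to v16f32, matching the
// NumElts <= 12 / NumElts <= 16 buckets handled above.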
9976
9977static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
9978 SDValue Src, int ExtraElts) {
9979 EVT SrcVT = Src.getValueType();
9980
9982
9983 if (SrcVT.isVector())
9984 DAG.ExtractVectorElements(Src, Elts);
9985 else
9986 Elts.push_back(Src);
9987
9988 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
9989 while (ExtraElts--)
9990 Elts.push_back(Undef);
9991
9992 return DAG.getBuildVector(CastVT, DL, Elts);
9993}
9994
9995// Reconstruct the required return value for an image load intrinsic.
9996// This is more complicated due to the optional use of TexFailCtrl, which
9997// means the required return type is an aggregate.
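// Worked example (illustrative): an image load with dmask 0b0111, a v3f32
// result, and TFE enabled is selected with NumVDataDwords = 4. Here
// DMaskPop = 3, so the first three dwords of the v4i32 machine result are
// extracted as a subvector and bitcast back to v3f32, the fourth dword
// becomes the TexFail value, and both are merged with the chain result.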
9999 ArrayRef<EVT> ResultTypes, bool IsTexFail,
10000 bool Unpacked, bool IsD16, int DMaskPop,
10001 int NumVDataDwords, bool IsAtomicPacked16Bit,
10002 const SDLoc &DL) {
10003 // Determine the required return type. This is the same regardless of the
10004 // IsTexFail flag.
10005 EVT ReqRetVT = ResultTypes[0];
10006 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
10007 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
10008 ? (ReqRetNumElts + 1) / 2
10009 : ReqRetNumElts;
10010
10011 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
10012
10013 MVT DataDwordVT =
10014 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
10015
10016 MVT MaskPopVT =
10017 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
10018
10019 SDValue Data(Result, 0);
10020 SDValue TexFail;
10021
10022 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
10023 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
10024 if (MaskPopVT.isVector()) {
10025 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
10026 SDValue(Result, 0), ZeroIdx);
10027 } else {
10028 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
10029 SDValue(Result, 0), ZeroIdx);
10030 }
10031 }
10032
10033 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
10034 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
10035 NumDataDwords - MaskPopDwords);
10036
10037 if (IsD16)
10038 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
10039
10040 EVT LegalReqRetVT = ReqRetVT;
10041 if (!ReqRetVT.isVector()) {
10042 if (!Data.getValueType().isInteger())
10043 Data = DAG.getNode(ISD::BITCAST, DL,
10044 Data.getValueType().changeTypeToInteger(), Data);
10045 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
10046 } else {
10047 // We need to widen the return vector to a legal type
10048 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
10049 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
10050 LegalReqRetVT =
10052 ReqRetVT.getVectorNumElements() + 1);
10053 }
10054 }
10055 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
10056
10057 if (IsTexFail) {
10058 TexFail =
10059 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
10060 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
10061
10062 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
10063 }
10064
10065 if (Result->getNumValues() == 1)
10066 return Data;
10067
10068 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
10069}
10070
10071static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
10072 SDValue *LWE, bool &IsTexFail) {
10073 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
10074
10075 uint64_t Value = TexFailCtrlConst->getZExtValue();
10076 if (Value) {
10077 IsTexFail = true;
10078 }
10079
10080 SDLoc DL(TexFailCtrlConst);
10081 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
10082 Value &= ~(uint64_t)0x1;
10083 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
10084 Value &= ~(uint64_t)0x2;
10085
10086 return Value == 0;
10087}
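// Example (illustrative): texfailctrl = 3 yields TFE = 1 and LWE = 1 and
// returns true; texfailctrl = 4 sets neither bit, leaves a nonzero residue,
// and returns false, which makes the caller give up on lowering the image
// intrinsic.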
10088
10090 MVT PackVectorVT,
10091 SmallVectorImpl<SDValue> &PackedAddrs,
10092 unsigned DimIdx, unsigned EndIdx,
10093 unsigned NumGradients) {
10094 SDLoc DL(Op);
10095 for (unsigned I = DimIdx; I < EndIdx; I++) {
10096 SDValue Addr = Op.getOperand(I);
10097
10098 // Gradients are packed with undef for each coordinate.
10099 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
10100 // 1D: undef,dx/dh; undef,dx/dv
10101 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
10102 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
10103 if (((I + 1) >= EndIdx) ||
10104 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
10105 I == DimIdx + NumGradients - 1))) {
10106 if (Addr.getValueType() != MVT::i16)
10107 Addr = DAG.getBitcast(MVT::i16, Addr);
10108 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
10109 } else {
10110 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
10111 I++;
10112 }
10113 Addr = DAG.getBitcast(MVT::f32, Addr);
10114 PackedAddrs.push_back(Addr);
10115 }
10116}
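// Worked example (illustrative): a 2D derivative with f16 gradients
// (NumGradients = 4) packs dx/dh,dy/dh and dx/dv,dy/dv into two v2f16
// dwords; for 3D (NumGradients = 6) the dz terms are the odd ones out and
// each gets its own dword with the high half left undefined, giving four
// dwords in total, per the layout comment inside the loop above.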
10117
10118SDValue SITargetLowering::lowerImage(SDValue Op,
10120 SelectionDAG &DAG, bool WithChain) const {
10121 SDLoc DL(Op);
10122 MachineFunction &MF = DAG.getMachineFunction();
10123 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
10124 unsigned IntrOpcode = Intr->BaseOpcode;
10125 // For image atomic: use no-return opcode if result is unused.
10126 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
10127 !Op.getNode()->hasAnyUseOfValue(0))
10128 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
10129 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
10131 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
10132 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
10133 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10134 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10135 bool IsGFX13 = AMDGPU::isGFX13(*Subtarget);
10136
10137 SmallVector<EVT, 3> ResultTypes(Op->values());
10138 SmallVector<EVT, 3> OrigResultTypes(Op->values());
10139 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
10140 ResultTypes.erase(&ResultTypes[0]);
10141
10142 bool IsD16 = false;
10143 bool IsG16 = false;
10144 bool IsA16 = false;
10145 SDValue VData;
10146 int NumVDataDwords = 0;
10147 bool AdjustRetType = false;
10148 bool IsAtomicPacked16Bit = false;
10149
10150 // Offset of intrinsic arguments
10151 const unsigned ArgOffset = WithChain ? 2 : 1;
10152
10153 unsigned DMask;
10154 unsigned DMaskLanes = 0;
10155
10156 if (BaseOpcode->Atomic) {
10157 VData = Op.getOperand(2);
10158
10159 IsAtomicPacked16Bit =
10160 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
10161 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
10162 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
10163 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
10164
10165 bool Is64Bit = VData.getValueSizeInBits() == 64;
10166 if (BaseOpcode->AtomicX2) {
10167 SDValue VData2 = Op.getOperand(3);
10168 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
10169 {VData, VData2});
10170 if (Is64Bit)
10171 VData = DAG.getBitcast(MVT::v4i32, VData);
10172
10173 if (!BaseOpcode->NoReturn)
10174 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
10175
10176 DMask = Is64Bit ? 0xf : 0x3;
10177 NumVDataDwords = Is64Bit ? 4 : 2;
10178 } else {
10179 DMask = Is64Bit ? 0x3 : 0x1;
10180 NumVDataDwords = Is64Bit ? 2 : 1;
10181 }
10182 } else {
10183 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
10184 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
10185
10186 if (BaseOpcode->Store) {
10187 VData = Op.getOperand(2);
10188
10189 MVT StoreVT = VData.getSimpleValueType();
10190 if (StoreVT.getScalarType() == MVT::f16) {
10191 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10192 return Op; // D16 is unsupported for this instruction
10193
10194 IsD16 = true;
10195 VData = handleD16VData(VData, DAG, true);
10196 }
10197
10198 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
10199 } else if (!BaseOpcode->NoReturn) {
10200 // Work out the number of dwords based on the dmask popcount, the
10201 // underlying type, and whether packing is supported.
10202 MVT LoadVT = ResultTypes[0].getSimpleVT();
10203 if (LoadVT.getScalarType() == MVT::f16) {
10204 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10205 return Op; // D16 is unsupported for this instruction
10206
10207 IsD16 = true;
10208 }
10209
10210 // Confirm that the return type is large enough for the dmask specified
10211 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
10212 (!LoadVT.isVector() && DMaskLanes > 1))
10213 return Op;
10214
10215 // The sq block of gfx8 and gfx9 does not estimate register use correctly
10216 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
10217 // instructions.
10218 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
10219 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
10220 NumVDataDwords = (DMaskLanes + 1) / 2;
10221 else
10222 NumVDataDwords = DMaskLanes;
10223
10224 AdjustRetType = true;
10225 }
10226 }
10227
10228 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
10230
10231 // Check for 16-bit addresses or derivatives and pack them if so.
10232 MVT VAddrVT =
10233 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
10234 MVT VAddrScalarVT = VAddrVT.getScalarType();
10235 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10236 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10237
10238 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
10239 VAddrScalarVT = VAddrVT.getScalarType();
10240 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10241 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10242
10243 // Push back extra arguments.
10244 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
10245 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
10246 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
10247 // Special handling of the bias operand when A16 is on. The bias is of
10248 // type half but occupies a full 32-bit dword.
10249 SDValue Bias = DAG.getBuildVector(
10250 MVT::v2f16, DL,
10251 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
10252 VAddrs.push_back(Bias);
10253 } else {
10254 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
10255 "Bias needs to be converted to 16 bit in A16 mode");
10256 VAddrs.push_back(Op.getOperand(ArgOffset + I));
10257 }
10258 }
10259
10260 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
10261 // 16-bit gradients are supported, but are tied to the A16 control,
10262 // so both gradients and addresses must be 16-bit.
10263 LLVM_DEBUG(
10264 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
10265 "require 16 bit args for both gradients and addresses");
10266 return Op;
10267 }
10268
10269 if (IsA16) {
10270 if (!ST->hasA16()) {
10271 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
10272 "support 16 bit addresses\n");
10273 return Op;
10274 }
10275 }
10276
10277 // We've dealt with incorrect input, so we know that if IsA16 or IsG16
10278 // is set then we have to compress/pack operands (either addresses,
10279 // gradients, or both).
10280 // In the case where A16 and gradients are tied (no G16 support), we
10281 // have already verified that both IsA16 and IsG16 are true.
10282 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
10283 // Activate g16
10284 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
10286 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
10287 }
10288
10289 // Add gradients (packed or unpacked)
10290 if (IsG16) {
10291 // Pack the gradients
10292 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
10293 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
10294 ArgOffset + Intr->GradientStart,
10295 ArgOffset + Intr->CoordStart, Intr->NumGradients);
10296 } else {
10297 for (unsigned I = ArgOffset + Intr->GradientStart;
10298 I < ArgOffset + Intr->CoordStart; I++)
10299 VAddrs.push_back(Op.getOperand(I));
10300 }
10301
10302 // Add addresses (packed or unpacked)
10303 if (IsA16) {
10304 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
10305 ArgOffset + Intr->CoordStart, VAddrEnd,
10306 0 /* No gradients */);
10307 } else {
10308 // Add uncompressed address
10309 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
10310 VAddrs.push_back(Op.getOperand(I));
10311 }
10312
10313 // If the register allocator cannot place the address registers contiguously
10314 // without introducing moves, then using the non-sequential address encoding
10315 // is always preferable, since it saves VALU instructions and is usually a
10316 // wash in terms of code size or even better.
10317 //
10318 // However, we currently have no way of hinting to the register allocator that
10319 // MIMG addresses should be placed contiguously when it is possible to do so,
10320 // so force non-NSA for the common 2-address case as a heuristic.
10321 //
10322 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
10323 // allocation when possible.
10324 //
10325 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
10326 // set of the remaining addresses.
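// Worked example (illustrative): with an NSA limit of 5 address registers
// and 7 address dwords on a target with partial NSA, UsePartialNSA is true,
// so the first 4 addresses stay as separate operands and the remaining 3
// are merged below into one contiguous vector register.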
10327 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
10328 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
10329 const bool UseNSA = ST->hasNSAEncoding() &&
10330 VAddrs.size() >= ST->getNSAThreshold(MF) &&
10331 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
10332 const bool UsePartialNSA =
10333 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
10334
10335 SDValue VAddr;
10336 if (UsePartialNSA) {
10337 VAddr = getBuildDwordsVector(DAG, DL,
10338 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
10339 } else if (!UseNSA) {
10340 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
10341 }
10342
10343 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
10344 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
10345 SDValue Unorm;
10346 if (!BaseOpcode->Sampler) {
10347 Unorm = True;
10348 } else {
10349 uint64_t UnormConst =
10350 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
10351
10352 Unorm = UnormConst ? True : False;
10353 }
10354
10355 SDValue TFE;
10356 SDValue LWE;
10357 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
10358 bool IsTexFail = false;
10359 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
10360 return Op;
10361
10362 if (IsTexFail) {
10363 if (!DMaskLanes) {
10364 // Expecting to get an error flag since TFC is on and dmask is 0.
10365 // Force dmask to be at least 1, otherwise the instruction will fail.
10366 DMask = 0x1;
10367 DMaskLanes = 1;
10368 NumVDataDwords = 1;
10369 }
10370 NumVDataDwords += 1;
10371 AdjustRetType = true;
10372 }
10373
10374 // Something earlier has tagged that the return type needs adjusting.
10375 // This happens if the instruction is a load or has TexFailCtrl flags set.
10376 if (AdjustRetType) {
10377 // NumVDataDwords reflects the true number of dwords required in the return
10378 // type
10379 if (DMaskLanes == 0 && !BaseOpcode->Store) {
10380 // This is a no-op load. This can be eliminated
10381 SDValue Undef = DAG.getPOISON(Op.getValueType());
10382 if (isa<MemSDNode>(Op))
10383 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
10384 return Undef;
10385 }
10386
10387 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
10388 MVT::i32, NumVDataDwords)
10389 : MVT::i32;
10390
10391 ResultTypes[0] = NewVT;
10392 if (ResultTypes.size() == 3) {
10393 // The original result was an aggregate type used for TexFailCtrl results.
10394 // The actual instruction returns a vector type, which has now been
10395 // created. Remove the aggregate result.
10396 ResultTypes.erase(&ResultTypes[1]);
10397 }
10398 }
10399
10400 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
10401 // Keep GLC only when the atomic's result is actually used.
10402 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
10404 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
10406 return Op;
10407
10409 if (BaseOpcode->Store || BaseOpcode->Atomic)
10410 Ops.push_back(VData); // vdata
10411 if (UsePartialNSA) {
10412 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
10413 Ops.push_back(VAddr);
10414 } else if (UseNSA)
10415 append_range(Ops, VAddrs);
10416 else
10417 Ops.push_back(VAddr);
10418 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
10419 EVT RsrcVT = Rsrc.getValueType();
10420 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10421 return Op;
10422 Ops.push_back(Rsrc);
10423 if (BaseOpcode->Sampler) {
10424 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
10425 if (Samp.getValueType() != MVT::v4i32)
10426 return Op;
10427 Ops.push_back(Samp);
10428 }
10429 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
10430 if (IsGFX10Plus)
10431 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
10432 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10433 Ops.push_back(Unorm);
10434 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
10435 Ops.push_back(IsA16 && // r128, a16 for gfx9
10436 ST->hasFeature(AMDGPU::FeatureR128A16)
10437 ? True
10438 : False);
10439 if (IsGFX10Plus)
10440 Ops.push_back(IsA16 ? True : False);
10441
10442 if (!Subtarget->hasGFX90AInsts())
10443 Ops.push_back(TFE); // tfe
10444 else if (TFE->getAsZExtVal()) {
10445 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10447 "TFE is not supported on this GPU", DL.getDebugLoc()));
10448 }
10449
10450 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10451 Ops.push_back(LWE); // lwe
10452 if (!IsGFX10Plus)
10453 Ops.push_back(DimInfo->DA ? True : False);
10454 if (BaseOpcode->HasD16)
10455 Ops.push_back(IsD16 ? True : False);
10456 if (isa<MemSDNode>(Op))
10457 Ops.push_back(Op.getOperand(0)); // chain
10458
10459 int NumVAddrDwords =
10460 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
10461 int Opcode = -1;
10462
10463 if (IsGFX13) {
10464 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx13,
10465 NumVDataDwords, NumVAddrDwords);
10466 } else if (IsGFX12Plus) {
10467 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
10468 NumVDataDwords, NumVAddrDwords);
10469 } else if (IsGFX11Plus) {
10470 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10471 UseNSA ? AMDGPU::MIMGEncGfx11NSA
10472 : AMDGPU::MIMGEncGfx11Default,
10473 NumVDataDwords, NumVAddrDwords);
10474 } else if (IsGFX10Plus) {
10475 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10476 UseNSA ? AMDGPU::MIMGEncGfx10NSA
10477 : AMDGPU::MIMGEncGfx10Default,
10478 NumVDataDwords, NumVAddrDwords);
10479 } else {
10480 if (Subtarget->hasGFX90AInsts()) {
10481 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
10482 NumVDataDwords, NumVAddrDwords);
10483 if (Opcode == -1) {
10484 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10486 "requested image instruction is not supported on this GPU",
10487 DL.getDebugLoc()));
10488
10489 unsigned Idx = 0;
10490 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
10491 for (EVT VT : OrigResultTypes) {
10492 if (VT == MVT::Other)
10493 RetValues[Idx++] = Op.getOperand(0); // Chain
10494 else
10495 RetValues[Idx++] = DAG.getPOISON(VT);
10496 }
10497
10498 return DAG.getMergeValues(RetValues, DL);
10499 }
10500 }
10501 if (Opcode == -1 &&
10502 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10503 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
10504 NumVDataDwords, NumVAddrDwords);
10505 if (Opcode == -1)
10506 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
10507 NumVDataDwords, NumVAddrDwords);
10508 }
10509 if (Opcode == -1)
10510 return Op;
10511
10512 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
10513 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
10514 MachineMemOperand *MemRef = MemOp->getMemOperand();
10515 DAG.setNodeMemRefs(NewNode, {MemRef});
10516 }
10517
10518 if (BaseOpcode->NoReturn) {
10519 if (BaseOpcode->Atomic)
10520 return DAG.getMergeValues(
10521 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
10522
10523 return SDValue(NewNode, 0);
10524 }
10525
10526 if (BaseOpcode->AtomicX2) {
10528 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
10529 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
10530 }
10531
10532 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
10533 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
10534 NumVDataDwords, IsAtomicPacked16Bit, DL);
10535}
10536
10537SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
10538 SDValue Offset, SDValue CachePolicy,
10539 SelectionDAG &DAG) const {
10540 MachineFunction &MF = DAG.getMachineFunction();
10541
10542 const DataLayout &DataLayout = DAG.getDataLayout();
10543 Align Alignment =
10544 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
10545
10546 MachineMemOperand *MMO = MF.getMachineMemOperand(
10547 MachinePointerInfo(),
10550 VT.getStoreSize(), Alignment);
10551
10552 if (!Offset->isDivergent()) {
10553 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
10554
10555 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
10556 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
10557 // loads. Later, the DAG combiner tries to combine s_buffer_load_u16 with sext
10558 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
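// For example (illustrative): an i16-typed s.buffer.load with a uniform
// offset becomes an SBUFFER_LOAD_USHORT node producing i32 and is truncated
// back to i16 below; a later sign-extending user then lets
// performSignExtendInRegCombine switch it to the signed s_buffer_load_i16.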
10559 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10560 SDValue BufferLoad =
10561 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
10562 DAG.getVTList(MVT::i32), Ops, VT, MMO);
10563 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
10564 }
10565
10566 // Widen vec3 load to vec4.
10567 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
10568 !Subtarget->hasScalarDwordx3Loads()) {
10569 EVT WidenedVT =
10571 auto WidenedOp = DAG.getMemIntrinsicNode(
10572 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
10573 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
10574 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
10575 DAG.getVectorIdxConstant(0, DL));
10576 return Subvector;
10577 }
10578
10579 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
10580 DAG.getVTList(VT), Ops, VT, MMO);
10581 }
10582
10583 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
10584 // assume that the buffer is unswizzled.
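// For example (illustrative): an s.buffer.load of v8f32 with a divergent
// offset is split below into two BUFFER_LOAD nodes of v4f32 at instruction
// offsets 0 and 16, with the base offset aligned to 32 bytes so both
// immediates fit, and the pieces are rejoined with CONCAT_VECTORS.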
10585 SDValue Ops[] = {
10586 DAG.getEntryNode(), // Chain
10587 Rsrc, // rsrc
10588 DAG.getConstant(0, DL, MVT::i32), // vindex
10589 {}, // voffset
10590 {}, // soffset
10591 {}, // offset
10592 CachePolicy, // cachepolicy
10593 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10594 };
10595 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10596 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
10597 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
10598 }
10599
10601 unsigned NumLoads = 1;
10602 MVT LoadVT = VT.getSimpleVT();
10603 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
10604 assert((LoadVT.getScalarType() == MVT::i32 ||
10605 LoadVT.getScalarType() == MVT::f32));
10606
10607 if (NumElts == 8 || NumElts == 16) {
10608 NumLoads = NumElts / 4;
10609 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
10610 }
10611
10612 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
10613
10614 // Use the alignment to ensure that the required offsets will fit into the
10615 // immediate offsets.
10616 setBufferOffsets(Offset, DAG, &Ops[3],
10617 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
10618
10619 uint64_t InstOffset = Ops[5]->getAsZExtVal();
10620 unsigned LoadSize = LoadVT.getStoreSize();
10621 for (unsigned i = 0; i < NumLoads; ++i) {
10622 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
10623 MachineMemOperand *LoadMMO = MF.getMachineMemOperand(MMO, 16 * i, LoadSize);
10624 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
10625 LoadVT, LoadMMO, DAG));
10626 }
10627
10628 if (NumElts == 8 || NumElts == 16)
10629 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
10630
10631 return Loads[0];
10632}
10633
10634SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10635 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10636 if (!Subtarget->hasArchitectedSGPRs())
10637 return {};
10638 SDLoc SL(Op);
10639 MVT VT = MVT::i32;
10640 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
10641 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10642 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
10643}
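// Equivalently (illustrative): the BFE_U32 above computes
//   (ttmp8 >> 25) & 0x1f
// i.e. a zero-extended read of the 5-bit wave-in-group ID field.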
10644
10645SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10646 AMDGPU::Hwreg::Id HwReg,
10647 unsigned LowBit,
10648 unsigned Width) const {
10649 SDLoc SL(Op);
10650 using namespace AMDGPU::Hwreg;
10651 return {DAG.getMachineNode(
10652 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10653 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
10654 SL, MVT::i32)),
10655 0};
10656}
10657
10658SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10659 unsigned Dim,
10660 const ArgDescriptor &Arg) const {
10661 SDLoc SL(Op);
10662 MachineFunction &MF = DAG.getMachineFunction();
10663 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
10664 if (MaxID == 0)
10665 return DAG.getConstant(0, SL, MVT::i32);
10666
10667 // It's undefined behavior if a function marked with the amdgpu-no-*
10668 // attributes uses the corresponding intrinsic.
10669 if (!Arg)
10670 return DAG.getPOISON(Op->getValueType(0));
10671
10672 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
10673 SDLoc(DAG.getEntryNode()), Arg);
10674
10675 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10676 // masking operations anyway.
10677 //
10678 // TODO: We could assert the top bit is 0 for the source copy.
10679 if (Arg.isMasked())
10680 return Val;
10681
10682 // Preserve the known bits after expansion to a copy.
10683 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
10684 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
10685 DAG.getValueType(SmallVT));
10686}
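// Worked example (illustrative): if the known work-group size bounds this
// dimension so that MaxID = 1023, then bit_width(1023) = 10 and the copy is
// wrapped in AssertZext with an i10 value type, telling later combines that
// bits 31..10 of the workitem ID are known to be zero.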
10687
10688SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10689 SelectionDAG &DAG) const {
10690 MachineFunction &MF = DAG.getMachineFunction();
10691 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10692
10693 EVT VT = Op.getValueType();
10694 SDLoc DL(Op);
10695 unsigned IntrinsicID = Op.getConstantOperandVal(0);
10696
10697 // TODO: Should this propagate fast-math-flags?
10698
10699 switch (IntrinsicID) {
10700 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10701 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
10702 return emitNonHSAIntrinsicError(DAG, DL, VT);
10703 return getPreloadedValue(DAG, *MFI, VT,
10705 }
10706 case Intrinsic::amdgcn_dispatch_ptr:
10707 case Intrinsic::amdgcn_queue_ptr: {
10708 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
10709 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10710 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10711 DL.getDebugLoc()));
10712 return DAG.getPOISON(VT);
10713 }
10714
10715 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10718 return getPreloadedValue(DAG, *MFI, VT, RegID);
10719 }
10720 case Intrinsic::amdgcn_implicitarg_ptr: {
10721 if (MFI->isEntryFunction())
10722 return getImplicitArgPtr(DAG, DL);
10723 return getPreloadedValue(DAG, *MFI, VT,
10725 }
10726 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10727 if (!AMDGPU::isKernel(MF.getFunction())) {
10728 // This only makes sense to call in a kernel, so just lower to null.
10729 return DAG.getConstant(0, DL, VT);
10730 }
10731
10732 return getPreloadedValue(DAG, *MFI, VT,
10734 }
10735 case Intrinsic::amdgcn_dispatch_id: {
10736 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
10737 }
10738 case Intrinsic::amdgcn_rcp:
10739 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
10740 case Intrinsic::amdgcn_rsq:
10741 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10742 case Intrinsic::amdgcn_rsq_legacy:
10743 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10744 return emitRemovedIntrinsicError(DAG, DL, VT);
10745 return SDValue();
10746 case Intrinsic::amdgcn_rcp_legacy:
10747 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10748 return emitRemovedIntrinsicError(DAG, DL, VT);
10749 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
10750 case Intrinsic::amdgcn_rsq_clamp: {
10751 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10752 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
10753
10754 Type *Type = VT.getTypeForEVT(*DAG.getContext());
10755 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
10756 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
10757
10758 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10759 SDValue Tmp =
10760 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
10761 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
10762 DAG.getConstantFP(Min, DL, VT));
10763 }
10764 case Intrinsic::r600_read_ngroups_x:
10765 if (Subtarget->isAmdHsaOS())
10766 return emitNonHSAIntrinsicError(DAG, DL, VT);
10767
10768 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10770 false);
10771 case Intrinsic::r600_read_ngroups_y:
10772 if (Subtarget->isAmdHsaOS())
10773 return emitNonHSAIntrinsicError(DAG, DL, VT);
10774
10775 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10777 false);
10778 case Intrinsic::r600_read_ngroups_z:
10779 if (Subtarget->isAmdHsaOS())
10780 return emitNonHSAIntrinsicError(DAG, DL, VT);
10781
10782 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10784 false);
10785 case Intrinsic::r600_read_local_size_x:
10786 if (Subtarget->isAmdHsaOS())
10787 return emitNonHSAIntrinsicError(DAG, DL, VT);
10788
10789 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10791 case Intrinsic::r600_read_local_size_y:
10792 if (Subtarget->isAmdHsaOS())
10793 return emitNonHSAIntrinsicError(DAG, DL, VT);
10794
10795 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10797 case Intrinsic::r600_read_local_size_z:
10798 if (Subtarget->isAmdHsaOS())
10799 return emitNonHSAIntrinsicError(DAG, DL, VT);
10800
10801 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10803 case Intrinsic::amdgcn_workgroup_id_x:
10804 return lowerWorkGroupId(DAG, *MFI, VT,
10808 case Intrinsic::amdgcn_workgroup_id_y:
10809 return lowerWorkGroupId(DAG, *MFI, VT,
10813 case Intrinsic::amdgcn_workgroup_id_z:
10814 return lowerWorkGroupId(DAG, *MFI, VT,
10818 case Intrinsic::amdgcn_cluster_id_x:
10819 return Subtarget->hasClusters()
10820 ? getPreloadedValue(DAG, *MFI, VT,
10822 : DAG.getPOISON(VT);
10823 case Intrinsic::amdgcn_cluster_id_y:
10824 return Subtarget->hasClusters()
10825 ? getPreloadedValue(DAG, *MFI, VT,
10827 : DAG.getPOISON(VT);
10828 case Intrinsic::amdgcn_cluster_id_z:
10829 return Subtarget->hasClusters()
10830 ? getPreloadedValue(DAG, *MFI, VT,
10832 : DAG.getPOISON(VT);
10833 case Intrinsic::amdgcn_cluster_workgroup_id_x:
10834 return Subtarget->hasClusters()
10835 ? getPreloadedValue(
10836 DAG, *MFI, VT,
10838 : DAG.getPOISON(VT);
10839 case Intrinsic::amdgcn_cluster_workgroup_id_y:
10840 return Subtarget->hasClusters()
10841 ? getPreloadedValue(
10842 DAG, *MFI, VT,
10844 : DAG.getPOISON(VT);
10845 case Intrinsic::amdgcn_cluster_workgroup_id_z:
10846 return Subtarget->hasClusters()
10847 ? getPreloadedValue(
10848 DAG, *MFI, VT,
10850 : DAG.getPOISON(VT);
10851 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
10852 return Subtarget->hasClusters()
10853 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
10854 : SDValue();
10855 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
10856 return Subtarget->hasClusters()
10857 ? getPreloadedValue(
10858 DAG, *MFI, VT,
10860 : DAG.getPOISON(VT);
10861 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
10862 return Subtarget->hasClusters()
10863 ? getPreloadedValue(
10864 DAG, *MFI, VT,
10866 : DAG.getPOISON(VT);
10867 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
10868 return Subtarget->hasClusters()
10869 ? getPreloadedValue(
10870 DAG, *MFI, VT,
10872 : DAG.getPOISON(VT);
10873 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
10874 return Subtarget->hasClusters()
10875 ? getPreloadedValue(
10876 DAG, *MFI, VT,
10878 : DAG.getPOISON(VT);
10879 case Intrinsic::amdgcn_wave_id:
10880 return lowerWaveID(DAG, Op);
10881 case Intrinsic::amdgcn_lds_kernel_id: {
10882 if (MFI->isEntryFunction())
10883 return getLDSKernelId(DAG, DL);
10884 return getPreloadedValue(DAG, *MFI, VT,
10886 }
10887 case Intrinsic::amdgcn_workitem_id_x:
10888 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
10889 case Intrinsic::amdgcn_workitem_id_y:
10890 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
10891 case Intrinsic::amdgcn_workitem_id_z:
10892 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
10893 case Intrinsic::amdgcn_wavefrontsize:
10894 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
10895 SDLoc(Op), MVT::i32);
10896 case Intrinsic::amdgcn_s_buffer_load: {
10897 unsigned CPol = Op.getConstantOperandVal(3);
10898 // s_buffer_load, because of how it's optimized, can't be volatile,
10899 // so reject ones with the volatile bit set.
10900 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
10903 return Op;
10904 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
10905 Op.getOperand(3), DAG);
10906 }
10907 case Intrinsic::amdgcn_fdiv_fast:
10908 return lowerFDIV_FAST(Op, DAG);
10909 case Intrinsic::amdgcn_sin:
10910 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
10911
10912 case Intrinsic::amdgcn_cos:
10913 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
10914
10915 case Intrinsic::amdgcn_mul_u24:
10916 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
10917 Op.getOperand(2));
10918 case Intrinsic::amdgcn_mul_i24:
10919 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
10920 Op.getOperand(2));
10921
10922 case Intrinsic::amdgcn_log_clamp: {
10923 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10924 return SDValue();
10925
10926 return emitRemovedIntrinsicError(DAG, DL, VT);
10927 }
10928 case Intrinsic::amdgcn_fract:
10929 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
10930
10931 case Intrinsic::amdgcn_class:
10932 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
10933 Op.getOperand(2));
10934 case Intrinsic::amdgcn_div_fmas:
10935 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
10936 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10937
10938 case Intrinsic::amdgcn_div_fixup:
10939 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
10940 Op.getOperand(2), Op.getOperand(3));
10941
10942 case Intrinsic::amdgcn_div_scale: {
10943 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
10944
10945 // Translate to the operands expected by the machine instruction. The
10946 // first parameter must be the same as the first instruction.
10947 SDValue Numerator = Op.getOperand(1);
10948 SDValue Denominator = Op.getOperand(2);
10949
10950 // Note this order is opposite of the machine instruction's operations,
10951 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
10952 // intrinsic has the numerator as the first operand to match a normal
10953 // division operation.
10954
10955 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
10956
10957 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
10958 Denominator, Numerator);
10959 }
10960 case Intrinsic::amdgcn_icmp: {
10961 // There is a Pat that handles this variant, so return it as-is.
10962 if (Op.getOperand(1).getValueType() == MVT::i1 &&
10963 Op.getConstantOperandVal(2) == 0 &&
10964 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
10965 return Op;
10966 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
10967 }
10968 case Intrinsic::amdgcn_fcmp: {
10969 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
10970 }
10971 case Intrinsic::amdgcn_ballot:
10972 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
10973 case Intrinsic::amdgcn_fmed3:
10974 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
10975 Op.getOperand(2), Op.getOperand(3));
10976 case Intrinsic::amdgcn_fdot2:
10977 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
10978 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
10979 case Intrinsic::amdgcn_fmul_legacy:
10980 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
10981 Op.getOperand(2));
10982 case Intrinsic::amdgcn_sbfe:
10983 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
10984 Op.getOperand(2), Op.getOperand(3));
10985 case Intrinsic::amdgcn_ubfe:
10986 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
10987 Op.getOperand(2), Op.getOperand(3));
10988 case Intrinsic::amdgcn_cvt_pkrtz:
10989 case Intrinsic::amdgcn_cvt_pknorm_i16:
10990 case Intrinsic::amdgcn_cvt_pknorm_u16:
10991 case Intrinsic::amdgcn_cvt_pk_i16:
10992 case Intrinsic::amdgcn_cvt_pk_u16: {
10993 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
10994 EVT VT = Op.getValueType();
10995 unsigned Opcode;
10996
10997 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
10998 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
10999 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
11000 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
11001 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
11002 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
11003 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
11004 Opcode = AMDGPUISD::CVT_PK_I16_I32;
11005 else
11006 Opcode = AMDGPUISD::CVT_PK_U16_U32;
11007
11008 if (isTypeLegal(VT))
11009 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
11010
11011 SDValue Node =
11012 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
11013 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
11014 }
11015 case Intrinsic::amdgcn_fmad_ftz:
11016 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
11017 Op.getOperand(2), Op.getOperand(3));
11018
11019 case Intrinsic::amdgcn_if_break:
11020 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
11021 Op->getOperand(1), Op->getOperand(2)),
11022 0);
11023
11024 case Intrinsic::amdgcn_groupstaticsize: {
11026 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
11027 return Op;
11028
11029 const Module *M = MF.getFunction().getParent();
11030 const GlobalValue *GV =
11031 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
11032 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
11034 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
11035 }
11036 case Intrinsic::amdgcn_is_shared:
11037 case Intrinsic::amdgcn_is_private: {
11038 SDLoc SL(Op);
11039 SDValue SrcVec =
11040 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11041 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
11042 DAG.getConstant(1, SL, MVT::i32));
11043
11044 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
11046 : AMDGPUAS::PRIVATE_ADDRESS;
11047 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
11048 Subtarget->hasGloballyAddressableScratch()) {
11049 SDValue FlatScratchBaseHi(
11050 DAG.getMachineNode(
11051 AMDGPU::S_MOV_B32, DL, MVT::i32,
11052 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
11053 0);
11054 // Test bits 63..58 against the aperture address.
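// Equivalently (illustrative): (SrcHi ^ FlatScratchBaseHi) u< 2^26 holds
// exactly when bits 31..26 of the two high dwords match, i.e. bits 63..58
// of the pointer equal the scratch base, giving a branch-free
// same-aperture test.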
11055 return DAG.getSetCC(
11056 SL, MVT::i1,
11057 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
11058 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
11059 }
11060
11061 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
11062 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
11063 }
11064 case Intrinsic::amdgcn_perm:
11065 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
11066 Op.getOperand(2), Op.getOperand(3));
11067 case Intrinsic::amdgcn_reloc_constant: {
11068 Module *M = MF.getFunction().getParent();
11069 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
11070 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
11071 auto *RelocSymbol = cast<GlobalVariable>(
11072 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
11073 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
11075 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
11076 }
11077 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
11078 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
11079 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
11080 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
11081 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
11082 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
11083 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
11084 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
11085 if (Op.getOperand(4).getValueType() == MVT::i32)
11086 return SDValue();
11087
11088 SDLoc SL(Op);
11089 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
11090 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11091 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11092 Op.getOperand(3), IndexKeyi32);
11093 }
11094 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
11095 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
11096 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
11097 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
11098 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
11099 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
11100 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
11101 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
11102 if (Op.getOperand(4).getValueType() == MVT::i64)
11103 return SDValue();
11104
11105 SDLoc SL(Op);
11106 auto IndexKeyi64 =
11107 Op.getOperand(4).getValueType() == MVT::v2i32
11108 ? DAG.getBitcast(MVT::i64, Op.getOperand(4))
11109 : DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
11110 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11111 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11112 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
11113 Op.getOperand(6)});
11114 }
11115 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
11116 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
11117 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
11118 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
11119 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
11120 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
11121 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
11122 ? MVT::i64
11123 : MVT::i32;
11124 if (Op.getOperand(6).getValueType() == IndexKeyTy)
11125 return SDValue();
11126
11127 SDLoc SL(Op);
11128 auto IndexKey =
11129 Op.getOperand(6).getValueType().isVector()
11130 ? DAG.getBitcast(IndexKeyTy, Op.getOperand(6))
11131 : DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
11133 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11134 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11135 IndexKey, Op.getOperand(7), Op.getOperand(8)};
11136 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
11137 Args.push_back(Op.getOperand(9));
11138 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
11139 }
11140 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
11141 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
11142 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
11143 if (Op.getOperand(6).getValueType() == MVT::i32)
11144 return SDValue();
11145
11146 SDLoc SL(Op);
11147 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
11148 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11149 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11150 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11151 IndexKeyi32, Op.getOperand(7)});
11152 }
11153 case Intrinsic::amdgcn_addrspacecast_nonnull:
11154 return lowerADDRSPACECAST(Op, DAG);
11155 case Intrinsic::amdgcn_readlane:
11156 case Intrinsic::amdgcn_readfirstlane:
11157 case Intrinsic::amdgcn_writelane:
11158 case Intrinsic::amdgcn_permlane16:
11159 case Intrinsic::amdgcn_permlanex16:
11160 case Intrinsic::amdgcn_permlane64:
11161 case Intrinsic::amdgcn_set_inactive:
11162 case Intrinsic::amdgcn_set_inactive_chain_arg:
11163 case Intrinsic::amdgcn_mov_dpp8:
11164 case Intrinsic::amdgcn_update_dpp:
11165 case Intrinsic::amdgcn_permlane_bcast:
11166 case Intrinsic::amdgcn_permlane_up:
11167 case Intrinsic::amdgcn_permlane_down:
11168 case Intrinsic::amdgcn_permlane_xor:
11169 return lowerLaneOp(*this, Op.getNode(), DAG);
11170 case Intrinsic::amdgcn_dead: {
11172 for (const EVT ValTy : Op.getNode()->values())
11173 Poisons.push_back(DAG.getPOISON(ValTy));
11174 return DAG.getMergeValues(Poisons, SDLoc(Op));
11175 }
11176 case Intrinsic::amdgcn_wave_shuffle:
11177 return lowerWaveShuffle(*this, Op.getNode(), DAG);
11178 default:
11179 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11181 return lowerImage(Op, ImageDimIntr, DAG, false);
11182
11183 return Op;
11184 }
11185}
11186
11187 // On targets that do not support a constant in the soffset field, turn a
11188 // zero offset into SGPR_NULL to avoid generating an extra s_mov of zero.
11190 const GCNSubtarget *Subtarget) {
11191 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
11192 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
11193 return SOffset;
11194}
11195
11196SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
11197 SelectionDAG &DAG,
11198 unsigned NewOpcode) const {
11199 SDLoc DL(Op);
11200
11201 SDValue VData = Op.getOperand(2);
11202 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11203 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11204 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11205 SDValue Ops[] = {
11206 Op.getOperand(0), // Chain
11207 VData, // vdata
11208 Rsrc, // rsrc
11209 DAG.getConstant(0, DL, MVT::i32), // vindex
11210 VOffset, // voffset
11211 SOffset, // soffset
11212 Offset, // offset
11213 Op.getOperand(6), // cachepolicy
11214 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11215 };
11216
11217 auto *M = cast<MemSDNode>(Op);
11218
11219 EVT MemVT = VData.getValueType();
11220 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
11221 M->getMemOperand());
11222}
11223
11224SDValue
11225SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
11226 unsigned NewOpcode) const {
11227 SDLoc DL(Op);
11228
11229 SDValue VData = Op.getOperand(2);
11230 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11231 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11232 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11233 SDValue Ops[] = {
11234 Op.getOperand(0), // Chain
11235 VData, // vdata
11236 Rsrc, // rsrc
11237 Op.getOperand(4), // vindex
11238 VOffset, // voffset
11239 SOffset, // soffset
11240 Offset, // offset
11241 Op.getOperand(7), // cachepolicy
11242 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11243 };
11244
11245 auto *M = cast<MemSDNode>(Op);
11246
11247 EVT MemVT = VData.getValueType();
11248 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
11249 M->getMemOperand());
11250}
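// Note (illustrative summary): the raw and struct atomic helpers above build
// the same operand list; the struct form takes its vindex from the intrinsic
// and sets idxen to 1 (shifting the offset, soffset, and cachepolicy operands
// down by one), while the raw form uses a constant-zero vindex with idxen 0.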
11251
11252SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
11253 SelectionDAG &DAG) const {
11254 unsigned IntrID = Op.getConstantOperandVal(1);
11255 SDLoc DL(Op);
11256
11257 switch (IntrID) {
11258 case Intrinsic::amdgcn_ds_ordered_add:
11259 case Intrinsic::amdgcn_ds_ordered_swap: {
11260 MemSDNode *M = cast<MemSDNode>(Op);
11261 SDValue Chain = M->getOperand(0);
11262 SDValue M0 = M->getOperand(2);
11263 SDValue Value = M->getOperand(3);
11264 unsigned IndexOperand = M->getConstantOperandVal(7);
11265 unsigned WaveRelease = M->getConstantOperandVal(8);
11266 unsigned WaveDone = M->getConstantOperandVal(9);
11267
11268 unsigned OrderedCountIndex = IndexOperand & 0x3f;
11269 IndexOperand &= ~0x3f;
11270 unsigned CountDw = 0;
11271
11272 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
11273 CountDw = (IndexOperand >> 24) & 0xf;
11274 IndexOperand &= ~(0xf << 24);
11275
11276 if (CountDw < 1 || CountDw > 4) {
11277 const Function &Fn = DAG.getMachineFunction().getFunction();
11278 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11279 Fn, "ds_ordered_count: dword count must be between 1 and 4",
11280 DL.getDebugLoc()));
11281 CountDw = 1;
11282 }
11283 }
11284
11285 if (IndexOperand) {
11286 const Function &Fn = DAG.getMachineFunction().getFunction();
11287 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11288 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
11289 }
11290
11291 if (WaveDone && !WaveRelease) {
11292 // TODO: Move this to IR verifier
11293 const Function &Fn = DAG.getMachineFunction().getFunction();
11294 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11295 Fn, "ds_ordered_count: wave_done requires wave_release",
11296 DL.getDebugLoc()));
11297 }
11298
11299 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
11300 unsigned ShaderType =
11302 unsigned Offset0 = OrderedCountIndex << 2;
11303 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
11304
11305 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
11306 Offset1 |= (CountDw - 1) << 6;
11307
11308 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
11309 Offset1 |= ShaderType << 2;
11310
11311 unsigned Offset = Offset0 | (Offset1 << 8);
11312
11313 SDValue Ops[] = {
11314 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
11315 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
11316 };
11317 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
11318 M->getVTList(), Ops, M->getMemoryVT(),
11319 M->getMemOperand());
11320 }
11321 case Intrinsic::amdgcn_raw_buffer_load:
11322 case Intrinsic::amdgcn_raw_ptr_buffer_load:
11323 case Intrinsic::amdgcn_raw_atomic_buffer_load:
11324 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
11325 case Intrinsic::amdgcn_raw_buffer_load_format:
11326 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
11327 const bool IsFormat =
11328 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
11329 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
11330
11331 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11332 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11333 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
11334 SDValue Ops[] = {
11335 Op.getOperand(0), // Chain
11336 Rsrc, // rsrc
11337 DAG.getConstant(0, DL, MVT::i32), // vindex
11338 VOffset, // voffset
11339 SOffset, // soffset
11340 Offset, // offset
11341 Op.getOperand(5), // cachepolicy, swizzled buffer
11342 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11343 };
11344
11345 auto *M = cast<MemSDNode>(Op);
11346 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
11347 }
11348 case Intrinsic::amdgcn_struct_buffer_load:
11349 case Intrinsic::amdgcn_struct_ptr_buffer_load:
11350 case Intrinsic::amdgcn_struct_buffer_load_format:
11351 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
11352 case Intrinsic::amdgcn_struct_atomic_buffer_load:
11353 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
11354 const bool IsFormat =
11355 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
11356 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
11357
11358 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11359 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11360 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11361 SDValue Ops[] = {
11362 Op.getOperand(0), // Chain
11363 Rsrc, // rsrc
11364 Op.getOperand(3), // vindex
11365 VOffset, // voffset
11366 SOffset, // soffset
11367 Offset, // offset
11368 Op.getOperand(6), // cachepolicy, swizzled buffer
11369 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11370 };
11371
11372 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
11373 }
11374 case Intrinsic::amdgcn_raw_tbuffer_load:
11375 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
11376 MemSDNode *M = cast<MemSDNode>(Op);
11377 EVT LoadVT = Op.getValueType();
11378 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11379 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11380 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
11381
11382 SDValue Ops[] = {
11383 Op.getOperand(0), // Chain
11384 Rsrc, // rsrc
11385 DAG.getConstant(0, DL, MVT::i32), // vindex
11386 VOffset, // voffset
11387 SOffset, // soffset
11388 Offset, // offset
11389 Op.getOperand(5), // format
11390 Op.getOperand(6), // cachepolicy, swizzled buffer
11391 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11392 };
11393
11394 if (LoadVT.getScalarType() == MVT::f16)
11395 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11396 Ops);
11397 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11398 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11399 DAG);
11400 }
11401 case Intrinsic::amdgcn_struct_tbuffer_load:
11402 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
11403 MemSDNode *M = cast<MemSDNode>(Op);
11404 EVT LoadVT = Op.getValueType();
11405 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11406 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11407 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11408
11409 SDValue Ops[] = {
11410 Op.getOperand(0), // Chain
11411 Rsrc, // rsrc
11412 Op.getOperand(3), // vindex
11413 VOffset, // voffset
11414 SOffset, // soffset
11415 Offset, // offset
11416 Op.getOperand(6), // format
11417 Op.getOperand(7), // cachepolicy, swizzled buffer
11418 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11419 };
11420
11421 if (LoadVT.getScalarType() == MVT::f16)
11422 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11423 Ops);
11424 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11425 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11426 DAG);
11427 }
11428 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
11429 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
11430 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
11431 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
11432 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
11433 return lowerStructBufferAtomicIntrin(Op, DAG,
11434 AMDGPUISD::BUFFER_ATOMIC_FADD);
11435 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
11436 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
11437 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
11438 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
11439 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
11440 return lowerStructBufferAtomicIntrin(Op, DAG,
11441 AMDGPUISD::BUFFER_ATOMIC_FMIN);
11442 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
11443 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
11444 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
11445 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
11446 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
11447 return lowerStructBufferAtomicIntrin(Op, DAG,
11448 AMDGPUISD::BUFFER_ATOMIC_FMAX);
11449 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
11450 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
11451 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
11452 case Intrinsic::amdgcn_raw_buffer_atomic_add:
11453 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
11454 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11455 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
11456 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
11457 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11458 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
11459 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
11460 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
11461 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
11462 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
11463 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
11464 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
11465 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
11466 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
11467 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
11468 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
11469 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
11470 case Intrinsic::amdgcn_raw_buffer_atomic_and:
11471 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
11472 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11473 case Intrinsic::amdgcn_raw_buffer_atomic_or:
11474 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
11475 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11476 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
11477 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
11478 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11479 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
11480 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
11481 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11482 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
11483 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
11484 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11485 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
11486 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
11487 return lowerStructBufferAtomicIntrin(Op, DAG,
11488 AMDGPUISD::BUFFER_ATOMIC_SWAP);
11489 case Intrinsic::amdgcn_struct_buffer_atomic_add:
11490 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
11491 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11492 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
11493 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
11494 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11495 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
11496 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
11497 return lowerStructBufferAtomicIntrin(Op, DAG,
11498 AMDGPUISD::BUFFER_ATOMIC_SMIN);
11499 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
11500 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
11501 return lowerStructBufferAtomicIntrin(Op, DAG,
11502 AMDGPUISD::BUFFER_ATOMIC_UMIN);
11503 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
11504 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
11505 return lowerStructBufferAtomicIntrin(Op, DAG,
11506 AMDGPUISD::BUFFER_ATOMIC_SMAX);
11507 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
11508 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
11509 return lowerStructBufferAtomicIntrin(Op, DAG,
11510 AMDGPUISD::BUFFER_ATOMIC_UMAX);
11511 case Intrinsic::amdgcn_struct_buffer_atomic_and:
11512 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
11513 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11514 case Intrinsic::amdgcn_struct_buffer_atomic_or:
11515 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
11516 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11517 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
11518 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
11519 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11520 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
11521 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
11522 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11523 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
11524 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
11525 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11526 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
11527 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
11528 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
11529 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
11530 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
11531 return lowerStructBufferAtomicIntrin(Op, DAG,
11532 AMDGPUISD::BUFFER_ATOMIC_CSUB);
11533 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
11534 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
11535 return lowerRawBufferAtomicIntrin(Op, DAG,
11536 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11537 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
11538 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
11539 return lowerStructBufferAtomicIntrin(Op, DAG,
11540 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11541 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
11542 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
11543 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
11544 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11545 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11546 SDValue Ops[] = {
11547 Op.getOperand(0), // Chain
11548 Op.getOperand(2), // src
11549 Op.getOperand(3), // cmp
11550 Rsrc, // rsrc
11551 DAG.getConstant(0, DL, MVT::i32), // vindex
11552 VOffset, // voffset
11553 SOffset, // soffset
11554 Offset, // offset
11555 Op.getOperand(7), // cachepolicy
11556 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11557 };
11558 EVT VT = Op.getValueType();
11559 auto *M = cast<MemSDNode>(Op);
11560
11561 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11562 Op->getVTList(), Ops, VT,
11563 M->getMemOperand());
11564 }
11565 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11566 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11567 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
11568 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
11569 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
11570 SDValue Ops[] = {
11571 Op.getOperand(0), // Chain
11572 Op.getOperand(2), // src
11573 Op.getOperand(3), // cmp
11574 Rsrc, // rsrc
11575 Op.getOperand(5), // vindex
11576 VOffset, // voffset
11577 SOffset, // soffset
11578 Offset, // offset
11579 Op.getOperand(8), // cachepolicy
11580 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11581 };
11582 EVT VT = Op.getValueType();
11583 auto *M = cast<MemSDNode>(Op);
11584
11585 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11586 Op->getVTList(), Ops, VT,
11587 M->getMemOperand());
11588 }
11589 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11590 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11591 MemSDNode *M = cast<MemSDNode>(Op);
11592 SDValue NodePtr = M->getOperand(2);
11593 SDValue RayExtent = M->getOperand(3);
11594 SDValue InstanceMask = M->getOperand(4);
11595 SDValue RayOrigin = M->getOperand(5);
11596 SDValue RayDir = M->getOperand(6);
11597 SDValue Offsets = M->getOperand(7);
11598 SDValue TDescr = M->getOperand(8);
11599
11600 assert(NodePtr.getValueType() == MVT::i64);
11601 assert(RayDir.getValueType() == MVT::v3f32);
11602
11603 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11604 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11605 return SDValue();
11606 }
11607
11608 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11609 const unsigned NumVDataDwords = 10;
11610 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11611 int Opcode = AMDGPU::getMIMGOpcode(
11612 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11613 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11614 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11615 assert(Opcode != -1);
11616
11617 SmallVector<SDValue, 7> Ops;
11618 Ops.push_back(NodePtr);
11619 Ops.push_back(DAG.getBuildVector(
11620 MVT::v2i32, DL,
11621 {DAG.getBitcast(MVT::i32, RayExtent),
11622 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11623 Ops.push_back(RayOrigin);
11624 Ops.push_back(RayDir);
11625 Ops.push_back(Offsets);
11626 Ops.push_back(TDescr);
11627 Ops.push_back(M->getChain());
11628
11629 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11630 MachineMemOperand *MemRef = M->getMemOperand();
11631 DAG.setNodeMemRefs(NewNode, {MemRef});
11632 return SDValue(NewNode, 0);
11633 }
11634 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11635 MemSDNode *M = cast<MemSDNode>(Op);
11636 SDValue NodePtr = M->getOperand(2);
11637 SDValue RayExtent = M->getOperand(3);
11638 SDValue RayOrigin = M->getOperand(4);
11639 SDValue RayDir = M->getOperand(5);
11640 SDValue RayInvDir = M->getOperand(6);
11641 SDValue TDescr = M->getOperand(7);
11642
11643 assert(NodePtr.getValueType() == MVT::i32 ||
11644 NodePtr.getValueType() == MVT::i64);
11645 assert(RayDir.getValueType() == MVT::v3f16 ||
11646 RayDir.getValueType() == MVT::v3f32);
11647
11648 if (!Subtarget->hasGFX10_AEncoding()) {
11649 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11650 return SDValue();
11651 }
11652
11653 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
11654 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
11655 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11656 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11657 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11658 const unsigned NumVDataDwords = 4;
11659 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11660 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11661 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11662 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11663 IsGFX12Plus;
11664 const unsigned BaseOpcodes[2][2] = {
11665 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11666 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11667 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11668 int Opcode;
11669 if (UseNSA) {
11670 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11671 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11672 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11673 : AMDGPU::MIMGEncGfx10NSA,
11674 NumVDataDwords, NumVAddrDwords);
11675 } else {
11676 assert(!IsGFX12Plus);
11677 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11678 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11679 : AMDGPU::MIMGEncGfx10Default,
11680 NumVDataDwords, NumVAddrDwords);
11681 }
11682 assert(Opcode != -1);
11683
11684 SmallVector<SDValue, 16> Ops;
11685
11686 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11687 SmallVector<SDValue, 3> Lanes;
11688 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
11689 if (Lanes[0].getValueSizeInBits() == 32) {
11690 for (unsigned I = 0; I < 3; ++I)
11691 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
11692 } else {
11693 if (IsAligned) {
11694 Ops.push_back(DAG.getBitcast(
11695 MVT::i32,
11696 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
11697 Ops.push_back(Lanes[2]);
11698 } else {
11699 SDValue Elt0 = Ops.pop_back_val();
11700 Ops.push_back(DAG.getBitcast(
11701 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
11702 Ops.push_back(DAG.getBitcast(
11703 MVT::i32,
11704 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
11705 }
11706 }
11707 };
11708
11709 if (UseNSA && IsGFX11Plus) {
11710 Ops.push_back(NodePtr);
11711 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11712 Ops.push_back(RayOrigin);
11713 if (IsA16) {
11714 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11715 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
11716 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
11717 for (unsigned I = 0; I < 3; ++I) {
11718 MergedLanes.push_back(DAG.getBitcast(
11719 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
11720 {DirLanes[I], InvDirLanes[I]})));
11721 }
11722 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
11723 } else {
11724 Ops.push_back(RayDir);
11725 Ops.push_back(RayInvDir);
11726 }
11727 } else {
11728 if (Is64)
11729 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
11730 2);
11731 else
11732 Ops.push_back(NodePtr);
11733
11734 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11735 packLanes(RayOrigin, true);
11736 packLanes(RayDir, true);
11737 packLanes(RayInvDir, false);
11738 }
11739
11740 if (!UseNSA) {
11741 // Build a single vector containing all the operands so far prepared.
11742 if (NumVAddrDwords > 12) {
11743 SDValue Undef = DAG.getPOISON(MVT::i32);
11744 Ops.append(16 - Ops.size(), Undef);
11745 }
11746 assert(Ops.size() >= 8 && Ops.size() <= 12);
11747 SDValue MergedOps =
11748 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
11749 Ops.clear();
11750 Ops.push_back(MergedOps);
11751 }
11752
11753 Ops.push_back(TDescr);
11754 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
11755 Ops.push_back(M->getChain());
11756
11757 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11758 MachineMemOperand *MemRef = M->getMemOperand();
11759 DAG.setNodeMemRefs(NewNode, {MemRef});
11760 return SDValue(NewNode, 0);
11761 }
11762 case Intrinsic::amdgcn_global_atomic_fmin_num:
11763 case Intrinsic::amdgcn_global_atomic_fmax_num:
11764 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11765 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11766 MemSDNode *M = cast<MemSDNode>(Op);
11767 SDValue Ops[] = {
11768 M->getOperand(0), // Chain
11769 M->getOperand(2), // Ptr
11770 M->getOperand(3) // Value
11771 };
11772 unsigned Opcode = 0;
11773 switch (IntrID) {
11774 case Intrinsic::amdgcn_global_atomic_fmin_num:
11775 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11776 Opcode = ISD::ATOMIC_LOAD_FMIN;
11777 break;
11778 }
11779 case Intrinsic::amdgcn_global_atomic_fmax_num:
11780 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11781 Opcode = ISD::ATOMIC_LOAD_FMAX;
11782 break;
11783 }
11784 default:
11785 llvm_unreachable("unhandled atomic opcode");
11786 }
11787 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11788 Ops, M->getMemOperand());
11789 }
11790 case Intrinsic::amdgcn_s_alloc_vgpr: {
11791 SDValue NumVGPRs = Op.getOperand(2);
11792 if (!NumVGPRs->isDivergent())
11793 return Op;
11794
11795 SDValue ReadFirstLaneID =
11796 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
11797 NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
11798 ReadFirstLaneID, NumVGPRs);
11799
11800 return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
11801 Op.getOperand(0), Op.getOperand(1), NumVGPRs);
11802 }
11803 case Intrinsic::amdgcn_s_get_barrier_state:
11804 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11805 SDValue Chain = Op->getOperand(0);
11806 SmallVector<SDValue, 2> Ops;
11807 unsigned Opc;
11808
11809 if (isa<ConstantSDNode>(Op->getOperand(2))) {
11810 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
11811 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11812 BarID = (BarID >> 4) & 0x3F;
11813 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11814 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11815 Ops.push_back(K);
11816 Ops.push_back(Chain);
11817 } else {
11818 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
11819 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
11820 SDValue M0Val;
11821 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
11822 DAG.getShiftAmountConstant(4, MVT::i32, DL));
11823 M0Val = SDValue(
11824 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
11825 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
11826 0);
11827 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
11828 } else
11829 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
11830 }
11831
11832 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
11833 return SDValue(NewMI, 0);
11834 }
11835 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
11836 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
11837 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
11838 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11839 SDValue Chain = Op->getOperand(0);
11840 SDValue Ptr = Op->getOperand(2);
11841 EVT VT = Op->getValueType(0);
11842 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
11843 Chain, Ptr, MII->getMemOperand());
11844 }
11845 case Intrinsic::amdgcn_flat_load_monitor_b32:
11846 case Intrinsic::amdgcn_flat_load_monitor_b64:
11847 case Intrinsic::amdgcn_flat_load_monitor_b128: {
11848 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11849 SDValue Chain = Op->getOperand(0);
11850 SDValue Ptr = Op->getOperand(2);
11851 return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL,
11852 Op->getVTList(), {Chain, Ptr},
11853 MII->getMemoryVT(), MII->getMemOperand());
11854 }
11855 case Intrinsic::amdgcn_global_load_monitor_b32:
11856 case Intrinsic::amdgcn_global_load_monitor_b64:
11857 case Intrinsic::amdgcn_global_load_monitor_b128: {
11858 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
11859 SDValue Chain = Op->getOperand(0);
11860 SDValue Ptr = Op->getOperand(2);
11861 return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL,
11862 Op->getVTList(), {Chain, Ptr},
11863 MII->getMemoryVT(), MII->getMemOperand());
11864 }
11865 default:
11866
11867 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11868 AMDGPU::getImageDimIntrinsicInfo(IntrID))
11869 return lowerImage(Op, ImageDimIntr, DAG, true);
11870
11871 return SDValue();
11872 }
11873}
11874
11875// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
11876// dwordx4 if on SI and handle TFE loads.
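// Illustrative sketch of the TFE handling below: a TFE load whose value type
// is v2f32 is emitted as a v3i32 memory intrinsic node; element 2 is peeled
// off as the status dword and elements 0..1 are bitcast back to v2f32 before
// being merged with the chain.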
11877SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
11878 SDVTList VTList,
11879 ArrayRef<SDValue> Ops, EVT MemVT,
11880 MachineMemOperand *MMO,
11881 SelectionDAG &DAG) const {
11882 LLVMContext &C = *DAG.getContext();
11883 MachineFunction &MF = DAG.getMachineFunction();
11884 EVT VT = VTList.VTs[0];
11885
11886 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
11887 bool IsTFE = VTList.NumVTs == 3;
11888 if (IsTFE) {
11889 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
11890 unsigned NumOpDWords = NumValueDWords + 1;
11891 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
11892 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
11893 MachineMemOperand *OpDWordsMMO =
11894 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
11895 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
11896 OpDWordsVT, OpDWordsMMO, DAG);
11897 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
11898 DAG.getVectorIdxConstant(NumValueDWords, DL));
11899 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
11900 SDValue ValueDWords =
11901 NumValueDWords == 1
11902 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
11903 : DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL,
11904 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
11905 ZeroIdx);
11906 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
11907 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
11908 }
11909
11910 if (!Subtarget->hasDwordx3LoadStores() &&
11911 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
11912 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
11913 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
11914 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
11915 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
11916 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
11917 WidenedMemVT, WidenedMMO);
11918 SDValue Value = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Op,
11919 DAG.getVectorIdxConstant(0, DL));
11920 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
11921 }
11922
11923 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
11924}
11925
11926SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
11927 bool ImageStore) const {
11928 EVT StoreVT = VData.getValueType();
11929
11930 // No change for f16 and legal vector D16 types.
11931 if (!StoreVT.isVector())
11932 return VData;
11933
11934 SDLoc DL(VData);
11935 unsigned NumElements = StoreVT.getVectorNumElements();
11936
11937 if (Subtarget->hasUnpackedD16VMem()) {
11938 // We need to unpack the packed data to store.
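// For example (a sketch, not covering every type): a v2f16 value <a, b> is
// stored as two dwords <zext(bitcast a), zext(bitcast b)> instead of one
// packed dword, which is what the zero-extend plus unroll below produces.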
11939 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11940 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11941
11942 EVT EquivStoreVT =
11943 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
11944 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
11945 return DAG.UnrollVectorOp(ZExt.getNode());
11946 }
11947
11948 // The sq block of gfx8.1 does not estimate register use correctly for d16
11949 // image store instructions. The data operand is computed as if it were not a
11950 // d16 image instruction.
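// Rough illustration of the workaround: a v4f16 data operand is repacked as
// four dwords, the i32 bitcasts of its two v2i16 halves followed by two
// poison dwords, so that the operand occupies the register count the SQ
// block expects for a non-d16 store.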
11951 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
11952 // Bitcast to i16
11953 EVT IntStoreVT = StoreVT.changeTypeToInteger();
11954 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11955
11956 // Decompose into scalars
11957 SmallVector<SDValue, 4> Elts;
11958 DAG.ExtractVectorElements(IntVData, Elts);
11959
11960 // Group pairs of i16 into v2i16 and bitcast to i32
11961 SmallVector<SDValue, 4> PackedElts;
11962 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
11963 SDValue Pair =
11964 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
11965 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11966 PackedElts.push_back(IntPair);
11967 }
11968 if ((NumElements % 2) == 1) {
11969 // Handle v3i16
11970 unsigned I = Elts.size() / 2;
11971 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
11972 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
11973 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
11974 PackedElts.push_back(IntPair);
11975 }
11976
11977 // Pad using POISON
11978 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
11979
11980 // Build final vector
11981 EVT VecVT =
11982 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
11983 return DAG.getBuildVector(VecVT, DL, PackedElts);
11984 }
11985
11986 if (NumElements == 3) {
11987 EVT IntStoreVT =
11988 EVT::getIntegerVT(*DAG.getContext(), StoreVT.getStoreSizeInBits());
11989 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
11990
11991 EVT WidenedStoreVT = EVT::getVectorVT(
11992 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
11993 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
11994 WidenedStoreVT.getStoreSizeInBits());
11995 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
11996 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
11997 }
11998
11999 assert(isTypeLegal(StoreVT));
12000 return VData;
12001}
12002
12003static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
12004 switch (Intr) {
12005 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12006 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12007 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12008 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
12009 case Intrinsic::amdgcn_load_async_to_lds:
12010 case Intrinsic::amdgcn_global_load_async_lds:
12011 return true;
12012 }
12013 return false;
12014}
12015
12016SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
12017 SelectionDAG &DAG) const {
12018 SDLoc DL(Op);
12019 SDValue Chain = Op.getOperand(0);
12020 unsigned IntrinsicID = Op.getConstantOperandVal(1);
12021
12022 switch (IntrinsicID) {
12023 case Intrinsic::amdgcn_exp_compr: {
12024 if (!Subtarget->hasCompressedExport()) {
12025 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
12027 "intrinsic not supported on subtarget", DL.getDebugLoc()));
12028 }
12029 SDValue Src0 = Op.getOperand(4);
12030 SDValue Src1 = Op.getOperand(5);
12031 // Hack around illegal type on SI by directly selecting it.
12032 if (isTypeLegal(Src0.getValueType()))
12033 return SDValue();
12034
12035 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
12036 SDValue Undef = DAG.getPOISON(MVT::f32);
12037 const SDValue Ops[] = {
12038 Op.getOperand(2), // tgt
12039 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
12040 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
12041 Undef, // src2
12042 Undef, // src3
12043 Op.getOperand(7), // vm
12044 DAG.getTargetConstant(1, DL, MVT::i1), // compr
12045 Op.getOperand(3), // en
12046 Op.getOperand(0) // Chain
12047 };
12048
12049 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
12050 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
12051 }
12052
12053 case Intrinsic::amdgcn_struct_tbuffer_store:
12054 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
12055 SDValue VData = Op.getOperand(2);
12056 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
12057 if (IsD16)
12058 VData = handleD16VData(VData, DAG);
12059 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12060 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
12061 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
12062 SDValue Ops[] = {
12063 Chain,
12064 VData, // vdata
12065 Rsrc, // rsrc
12066 Op.getOperand(4), // vindex
12067 VOffset, // voffset
12068 SOffset, // soffset
12069 Offset, // offset
12070 Op.getOperand(7), // format
12071 Op.getOperand(8), // cachepolicy, swizzled buffer
12072 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
12073 };
12074 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12075 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12076 MemSDNode *M = cast<MemSDNode>(Op);
12077 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12078 M->getMemoryVT(), M->getMemOperand());
12079 }
12080
12081 case Intrinsic::amdgcn_raw_tbuffer_store:
12082 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
12083 SDValue VData = Op.getOperand(2);
12084 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
12085 if (IsD16)
12086 VData = handleD16VData(VData, DAG);
12087 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12088 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
12089 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
12090 SDValue Ops[] = {
12091 Chain,
12092 VData, // vdata
12093 Rsrc, // rsrc
12094 DAG.getConstant(0, DL, MVT::i32), // vindex
12095 VOffset, // voffset
12096 SOffset, // soffset
12097 Offset, // offset
12098 Op.getOperand(6), // format
12099 Op.getOperand(7), // cachepolicy, swizzled buffer
12100 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
12101 };
12102 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12103 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12104 MemSDNode *M = cast<MemSDNode>(Op);
12105 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12106 M->getMemoryVT(), M->getMemOperand());
12107 }
12108
12109 case Intrinsic::amdgcn_raw_buffer_store:
12110 case Intrinsic::amdgcn_raw_ptr_buffer_store:
12111 case Intrinsic::amdgcn_raw_buffer_store_format:
12112 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
12113 const bool IsFormat =
12114 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
12115 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
12116
12117 SDValue VData = Op.getOperand(2);
12118 EVT VDataVT = VData.getValueType();
12119 EVT EltType = VDataVT.getScalarType();
12120 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
12121 if (IsD16) {
12122 VData = handleD16VData(VData, DAG);
12123 VDataVT = VData.getValueType();
12124 }
12125
12126 if (!isTypeLegal(VDataVT)) {
12127 VData =
12128 DAG.getNode(ISD::BITCAST, DL,
12129 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
12130 }
12131
12132 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12133 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
12134 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
12135 SDValue Ops[] = {
12136 Chain,
12137 VData,
12138 Rsrc,
12139 DAG.getConstant(0, DL, MVT::i32), // vindex
12140 VOffset, // voffset
12141 SOffset, // soffset
12142 Offset, // offset
12143 Op.getOperand(6), // cachepolicy, swizzled buffer
12144 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
12145 };
12146 unsigned Opc =
12147 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
12148 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12149 MemSDNode *M = cast<MemSDNode>(Op);
12150
12151 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12152 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12153 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
12154
12155 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12156 M->getMemoryVT(), M->getMemOperand());
12157 }
12158
12159 case Intrinsic::amdgcn_struct_buffer_store:
12160 case Intrinsic::amdgcn_struct_ptr_buffer_store:
12161 case Intrinsic::amdgcn_struct_buffer_store_format:
12162 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
12163 const bool IsFormat =
12164 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
12165 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
12166
12167 SDValue VData = Op.getOperand(2);
12168 EVT VDataVT = VData.getValueType();
12169 EVT EltType = VDataVT.getScalarType();
12170 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
12171
12172 if (IsD16) {
12173 VData = handleD16VData(VData, DAG);
12174 VDataVT = VData.getValueType();
12175 }
12176
12177 if (!isTypeLegal(VDataVT)) {
12178 VData =
12179 DAG.getNode(ISD::BITCAST, DL,
12180 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
12181 }
12182
12183 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12184 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
12185 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
12186 SDValue Ops[] = {
12187 Chain,
12188 VData,
12189 Rsrc,
12190 Op.getOperand(4), // vindex
12191 VOffset, // voffset
12192 SOffset, // soffset
12193 Offset, // offset
12194 Op.getOperand(7), // cachepolicy, swizzled buffer
12195 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
12196 };
12197 unsigned Opc =
12198 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
12199 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12200 MemSDNode *M = cast<MemSDNode>(Op);
12201
12202 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12203 EVT VDataType = VData.getValueType().getScalarType();
12204 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12205 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
12206
12207 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12208 M->getMemoryVT(), M->getMemOperand());
12209 }
12210 case Intrinsic::amdgcn_raw_buffer_load_lds:
12211 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12212 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
12213 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12214 case Intrinsic::amdgcn_struct_buffer_load_lds:
12215 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12216 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
12217 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
12218 if (!Subtarget->hasVMemToLDSLoad())
12219 return SDValue();
12220 unsigned Opc;
12221 bool HasVIndex =
12222 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
12223 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
12224 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
12225 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
12226 unsigned OpOffset = HasVIndex ? 1 : 0;
12227 SDValue VOffset = Op.getOperand(5 + OpOffset);
12228 bool HasVOffset = !isNullConstant(VOffset);
12229 unsigned Size = Op->getConstantOperandVal(4);
12230
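// Pick the addressing-mode variant from the operands that are actually
// present: BOTHEN = vindex + voffset, IDXEN = vindex only, OFFEN = voffset
// only, OFFSET = neither (immediate/soffset addressing only).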
12231 switch (Size) {
12232 default:
12233 return SDValue();
12234 case 1:
12235 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
12236 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
12237 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
12238 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
12239 break;
12240 case 2:
12241 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
12242 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
12243 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
12244 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
12245 break;
12246 case 4:
12247 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
12248 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
12249 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
12250 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
12251 break;
12252 case 12:
12253 if (!Subtarget->hasLDSLoadB96_B128())
12254 return SDValue();
12255 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
12256 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
12257 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
12258 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
12259 break;
12260 case 16:
12261 if (!Subtarget->hasLDSLoadB96_B128())
12262 return SDValue();
12263 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
12264 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
12265 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
12266 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
12267 break;
12268 }
12269
12270 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
12271
12271
12272 SmallVector<SDValue, 8> Ops;
12273
12274 if (HasVIndex && HasVOffset)
12275 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
12276 {Op.getOperand(5), // VIndex
12277 VOffset}));
12278 else if (HasVIndex)
12279 Ops.push_back(Op.getOperand(5));
12280 else if (HasVOffset)
12281 Ops.push_back(VOffset);
12282
12283 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
12284 Ops.push_back(Rsrc);
12285 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
12286 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
12287 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
12288 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
12289 Ops.push_back(DAG.getTargetConstant(
12290 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
12291 DL, MVT::i8)); // cpol
12292 Ops.push_back(DAG.getTargetConstant(
12293 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
12294 ? 1
12295 : 0,
12296 DL, MVT::i8)); // swz
12297 Ops.push_back(
12298 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
12299 Ops.push_back(M0Val.getValue(0)); // Chain
12300 Ops.push_back(M0Val.getValue(1)); // Glue
12301
12302 auto *M = cast<MemSDNode>(Op);
12303 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
12304 DAG.setNodeMemRefs(Load, M->memoperands());
12305
12306 return SDValue(Load, 0);
12307 }
12308 // Buffers are handled by LowerBufferFatPointers, and we're going to go
12309 // for "trust me" that the remaining cases are global pointers until
12310 // such time as we can put two mem operands on an intrinsic.
12311 case Intrinsic::amdgcn_load_to_lds:
12312 case Intrinsic::amdgcn_load_async_to_lds:
12313 case Intrinsic::amdgcn_global_load_lds:
12314 case Intrinsic::amdgcn_global_load_async_lds: {
12315 if (!Subtarget->hasVMemToLDSLoad())
12316 return SDValue();
12317
12318 unsigned Opc;
12319 unsigned Size = Op->getConstantOperandVal(4);
12320 switch (Size) {
12321 default:
12322 return SDValue();
12323 case 1:
12324 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
12325 break;
12326 case 2:
12327 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
12328 break;
12329 case 4:
12330 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
12331 break;
12332 case 12:
12333 if (!Subtarget->hasLDSLoadB96_B128())
12334 return SDValue();
12335 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
12336 break;
12337 case 16:
12338 if (!Subtarget->hasLDSLoadB96_B128())
12339 return SDValue();
12340 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
12341 break;
12342 }
12343
12344 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
12345
12345
12346 SmallVector<SDValue, 6> Ops;
12347
12348 SDValue Addr = Op.getOperand(2); // Global ptr
12349 SDValue VOffset;
12350 // Try to split SAddr and VOffset. Global and LDS pointers share the same
12351 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
12352 if (Addr->isDivergent() && Addr->isAnyAdd()) {
12353 SDValue LHS = Addr.getOperand(0);
12354 SDValue RHS = Addr.getOperand(1);
12355
12356 if (LHS->isDivergent())
12357 std::swap(LHS, RHS);
12358
12359 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
12360 RHS.getOperand(0).getValueType() == MVT::i32) {
12361 // add (i64 sgpr), (zero_extend (i32 vgpr))
12362 Addr = LHS;
12363 VOffset = RHS.getOperand(0);
12364 }
12365 }
12366
12367 Ops.push_back(Addr);
12368 if (!Addr->isDivergent()) {
12369 Opc = AMDGPU::getGlobalSaddrOp(Opc);
12370 if (!VOffset)
12371 VOffset =
12372 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
12373 DAG.getTargetConstant(0, DL, MVT::i32)),
12374 0);
12375 Ops.push_back(VOffset);
12376 }
12377
12378 Ops.push_back(Op.getOperand(5)); // Offset
12379
12380 unsigned Aux = Op.getConstantOperandVal(6);
12381 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
12382 MVT::i32)); // CPol
12383 Ops.push_back(
12384 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
12385
12386 Ops.push_back(M0Val.getValue(0)); // Chain
12387 Ops.push_back(M0Val.getValue(1)); // Glue
12388
12389 auto *M = cast<MemSDNode>(Op);
12390 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12391 DAG.setNodeMemRefs(Load, M->memoperands());
12392
12393 return SDValue(Load, 0);
12394 }
12395 case Intrinsic::amdgcn_end_cf:
12396 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
12397 Op->getOperand(2), Chain),
12398 0);
12399 case Intrinsic::amdgcn_s_barrier_signal_var: {
12400 // Member count of 0 means to re-use a previous member count,
12401 // which, if the named barrier is statically chosen, means we can use
12402 // the immarg form. Otherwise, fall through to constructing M0 as for
12403 // s_barrier_init.
12404 SDValue CntOp = Op->getOperand(3);
12405 auto *CntC = dyn_cast<ConstantSDNode>(CntOp);
12406 if (CntC && CntC->isZero()) {
12407 SDValue Chain = Op->getOperand(0);
12408 SDValue BarOp = Op->getOperand(2);
12409 SmallVector<SDValue, 2> Ops;
12410
12411 std::optional<uint64_t> BarVal;
12412 if (auto *C = dyn_cast<ConstantSDNode>(BarOp))
12413 BarVal = C->getZExtValue();
12414 else if (auto *GA = dyn_cast<GlobalAddressSDNode>(BarOp))
12416 *GA->getGlobal()))
12417 BarVal = *Addr + GA->getOffset();
12418
12419 if (BarVal) {
12420 unsigned BarID = (*BarVal >> 4) & 0x3F;
12421 Ops.push_back(DAG.getTargetConstant(BarID, DL, MVT::i32));
12422 Ops.push_back(Chain);
12423 auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
12424 Op->getVTList(), Ops);
12425 return SDValue(NewMI, 0);
12426 }
12427 }
12428 [[fallthrough]];
12429 }
12430 case Intrinsic::amdgcn_s_barrier_init: {
12431 // these two intrinsics have two operands: barrier pointer and member count
12432 SDValue Chain = Op->getOperand(0);
12433 SmallVector<SDValue, 2> Ops;
12434 SDValue BarOp = Op->getOperand(2);
12435 SDValue CntOp = Op->getOperand(3);
12436 SDValue M0Val;
12437 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
12438 ? AMDGPU::S_BARRIER_INIT_M0
12439 : AMDGPU::S_BARRIER_SIGNAL_M0;
12440 // extract the BarrierID from bits 4-9 of BarOp
12441 SDValue BarID;
12442 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12443 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12444 BarID =
12445 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
12446 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12447 0);
12448 // Member count should be put into M0[ShAmt:+6]
12449 // Barrier ID should be put into M0[5:0]
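// For example, with this layout a barrier ID of 5 and a member count of 12
// would be encoded as M0 = (12 << 16) | 5.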
12450 M0Val =
12451 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
12452 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12453 0);
12454 constexpr unsigned ShAmt = 16;
12455 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
12456 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
12457
12458 M0Val = SDValue(
12459 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
12460
12461 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12462
12463 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12464 return SDValue(NewMI, 0);
12465 }
12466 case Intrinsic::amdgcn_s_wakeup_barrier: {
12467 if (!Subtarget->hasSWakeupBarrier())
12468 return SDValue();
12469 [[fallthrough]];
12470 }
12471 case Intrinsic::amdgcn_s_barrier_join: {
12472 // These intrinsics take a single operand: the barrier pointer.
12473 SDValue Chain = Op->getOperand(0);
12474 SmallVector<SDValue, 2> Ops;
12475 SDValue BarOp = Op->getOperand(2);
12476 unsigned Opc;
12477
12478 if (isa<ConstantSDNode>(BarOp)) {
12479 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
12480 switch (IntrinsicID) {
12481 default:
12482 return SDValue();
12483 case Intrinsic::amdgcn_s_barrier_join:
12484 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
12485 break;
12486 case Intrinsic::amdgcn_s_wakeup_barrier:
12487 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
12488 break;
12489 }
12490 // extract the BarrierID from bits 4-9 of the immediate
12491 unsigned BarID = (BarVal >> 4) & 0x3F;
12492 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
12493 Ops.push_back(K);
12494 Ops.push_back(Chain);
12495 } else {
12496 switch (IntrinsicID) {
12497 default:
12498 return SDValue();
12499 case Intrinsic::amdgcn_s_barrier_join:
12500 Opc = AMDGPU::S_BARRIER_JOIN_M0;
12501 break;
12502 case Intrinsic::amdgcn_s_wakeup_barrier:
12503 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
12504 break;
12505 }
12506 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
12507 SDValue M0Val;
12508 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12509 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12510 M0Val =
12511 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
12512 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12513 0);
12514 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12515 }
12516
12517 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12518 return SDValue(NewMI, 0);
12519 }
12520 case Intrinsic::amdgcn_s_prefetch_data: {
12521 // For non-global address space preserve the chain and remove the call.
12522 if (!AMDGPU::isFlatGlobalAddrSpace(cast<MemSDNode>(Op)->getAddressSpace()))
12523 return Op.getOperand(0);
12524 return Op;
12525 }
12526 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
12527 SDValue Ops[] = {
12528 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
12529 Op.getOperand(3), // offset
12530 Op.getOperand(4), // length
12531 };
12532
12533 MemSDNode *M = cast<MemSDNode>(Op);
12534 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
12535 Op->getVTList(), Ops, M->getMemoryVT(),
12536 M->getMemOperand());
12537 }
12538 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
12539 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
12540 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
12541 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12542 SDValue Chain = Op->getOperand(0);
12543 SDValue Ptr = Op->getOperand(2);
12544 SDValue Val = Op->getOperand(3);
12545 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
12546 Ptr, MII->getMemOperand());
12547 }
12548 default: {
12549 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12550 AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
12551 return lowerImage(Op, ImageDimIntr, DAG, true);
12552
12553 return Op;
12554 }
12555 }
12556}
12557
12558// Return whether the operation has NoUnsignedWrap property.
12559static bool isNoUnsignedWrap(SDValue Addr) {
12560 return (Addr.getOpcode() == ISD::ADD &&
12561 Addr->getFlags().hasNoUnsignedWrap()) ||
12562 Addr->getOpcode() == ISD::OR;
12563}
12564
12566 EVT PtrVT) const {
12567 return PtrVT == MVT::i64;
12568}
12569
12571 EVT PtrVT) const {
12572 return true;
12573}
12574
12575// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
12576// offset (the offset that is included in bounds checking and swizzling, to be
12577// split between the instruction's voffset and immoffset fields) and soffset
12578// (the offset that is excluded from bounds checking and swizzling, to go in
12579// the instruction's soffset field). This function takes the first kind of
12580// offset and figures out how to split it between voffset and immoffset.
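// Worked example (assuming a 12-bit immediate field, i.e. MaxImm == 4095 as
// on pre-GFX12 MUBUF): a combined offset of 4100 splits into Overflow = 4096,
// which is added into the voffset operand, and ImmOffset = 4 for the
// instruction's immoffset field.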
12581std::pair<SDValue, SDValue>
12582SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
12583 SDLoc DL(Offset);
12584 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
12585 SDValue N0 = Offset;
12586 ConstantSDNode *C1 = nullptr;
12587
12588 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
12589 N0 = SDValue();
12590 else if (DAG.isBaseWithConstantOffset(N0)) {
12591 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12592 // being added, so we can only safely match a 32-bit addition with no
12593 // unsigned overflow.
12594 bool CheckNUW = Subtarget->hasGFX1250Insts();
12595 if (!CheckNUW || isNoUnsignedWrap(N0)) {
12596 C1 = cast<ConstantSDNode>(N0.getOperand(1));
12597 N0 = N0.getOperand(0);
12598 }
12599 }
12600
12601 if (C1) {
12602 unsigned ImmOffset = C1->getZExtValue();
12603 // If the immediate value is too big for the immoffset field, put only bits
12604 // that would normally fit in the immoffset field. The remaining value that
12605 // is copied/added for the voffset field is a large power of 2, and it
12606 // stands more chance of being CSEd with the copy/add for another similar
12607 // load/store.
12608 // However, do not do that rounding down if that is a negative
12609 // number, as it appears to be illegal to have a negative offset in the
12610 // vgpr, even if adding the immediate offset makes it positive.
12611 unsigned Overflow = ImmOffset & ~MaxImm;
12612 ImmOffset -= Overflow;
12613 if ((int32_t)Overflow < 0) {
12614 Overflow += ImmOffset;
12615 ImmOffset = 0;
12616 }
12617 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
12618 if (Overflow) {
12619 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
12620 if (!N0)
12621 N0 = OverflowVal;
12622 else {
12623 SDValue Ops[] = {N0, OverflowVal};
12624 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
12625 }
12626 }
12627 }
12628 if (!N0)
12629 N0 = DAG.getConstant(0, DL, MVT::i32);
12630 if (!C1)
12631 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
12632 return {N0, SDValue(C1, 0)};
12633}
12634
12635// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
12636// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
12637// pointed to by Offsets.
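// For instance (a sketch; the exact split is decided by splitMUBUFOffset and
// the requested alignment): a constant combined offset of 0x20010 might
// become voffset = 0, soffset = 0x20000 and instoffset = 0x10, while a
// non-constant combined offset simply becomes the voffset with soffset and
// instoffset of zero.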
12638void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
12639 SelectionDAG &DAG, SDValue *Offsets,
12640 Align Alignment) const {
12641 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12642 SDLoc DL(CombinedOffset);
12643 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
12644 uint32_t Imm = C->getZExtValue();
12645 uint32_t SOffset, ImmOffset;
12646 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12647 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
12648 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12649 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12650 return;
12651 }
12652 }
12653 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
12654 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12655 // being added, so we can only safely match a 32-bit addition with no
12656 // unsigned overflow.
12657 bool CheckNUW = Subtarget->hasGFX1250Insts();
12658 SDValue N0 = CombinedOffset.getOperand(0);
12659 SDValue N1 = CombinedOffset.getOperand(1);
12660 uint32_t SOffset, ImmOffset;
12661 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
12662 if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
12663 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
12664 Offsets[0] = N0;
12665 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12666 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12667 return;
12668 }
12669 }
12670
12671 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12672 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
12673 : DAG.getConstant(0, DL, MVT::i32);
12674
12675 Offsets[0] = CombinedOffset;
12676 Offsets[1] = SOffsetZero;
12677 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
12678}
12679
12680SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12681 SelectionDAG &DAG) const {
12682 if (!MaybePointer.getValueType().isScalarInteger())
12683 return MaybePointer;
12684
12685 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
12686 return Rsrc;
12687}
12688
12689// Wrap a global or flat pointer into a buffer intrinsic using the flags
12690// specified in the intrinsic.
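// Rough sketch of the descriptor assembled in the legacy (32-bit num_records)
// path below: word0 = base[31:0], word1 = (stride << 16) | base[47:32],
// word2 = num_records, word3 = flags. The 45-bit num_records form uses the
// wider layout described in the comments inside the function.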
12691SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
12692 SelectionDAG &DAG) const {
12693 SDLoc Loc(Op);
12694
12695 SDValue Pointer = Op->getOperand(1);
12696 SDValue Stride = Op->getOperand(2);
12697 SDValue NumRecords = Op->getOperand(3);
12698 SDValue Flags = Op->getOperand(4);
12699
12700 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
12701 SDValue Rsrc;
12702
12703 if (Subtarget->has45BitNumRecordsBufferResource()) {
12704 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
12705 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
12706 // num_records.
12707 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
12708 SDValue NumRecordsLHS =
12709 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
12710 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
12711 SDValue LowHalf =
12712 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
12713
12714 // Build the higher 64-bit value, which has the higher 38-bit num_records,
12715 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
12716 SDValue NumRecordsRHS =
12717 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
12718 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
12719 SDValue ShiftedStride =
12720 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12721 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
12722 SDValue ExtShiftedStrideVec =
12723 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
12724 SDValue ExtShiftedStride =
12725 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
12726 SDValue ShiftedFlags =
12727 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
12728 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
12729 SDValue ExtShiftedFlagsVec =
12730 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
12731 SDValue ExtShiftedFlags =
12732 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
12733 SDValue CombinedFields =
12734 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12735 SDValue HighHalf =
12736 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12737
12738 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
12739 } else {
12740 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
12741 auto [LowHalf, HighHalf] =
12742 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12743 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
12744 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
12745 SDValue ShiftedStride =
12746 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12747 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
12748 SDValue NewHighHalf =
12749 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
12750
12751 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
12752 NumRecords, Flags);
12753 }
12754
12755 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
12756 return RsrcPtr;
12757}
12758
12759// Handle 8 bit and 16 bit buffer loads
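// A sub-dword result is produced via the 32-bit unsigned-load nodes below:
// the i32 result is truncated to the memory width and, for f16/bf16, bitcast
// back to the requested floating-point type.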
12760SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
12761 EVT LoadVT, SDLoc DL,
12762 ArrayRef<SDValue> Ops,
12763 MachineMemOperand *MMO,
12764 bool IsTFE) const {
12765 EVT IntVT = LoadVT.changeTypeToInteger();
12766
12767 if (IsTFE) {
12768 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
12769 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12770 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12771 MachineFunction &MF = DAG.getMachineFunction();
12772 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
12773 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
12774 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
12775 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12776 DAG.getConstant(1, DL, MVT::i32));
12777 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12778 DAG.getConstant(0, DL, MVT::i32));
12779 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
12780 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
12781 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
12782 }
12783
12784 unsigned Opc = LoadVT.getScalarType() == MVT::i8
12785 ? AMDGPUISD::BUFFER_LOAD_UBYTE
12786 : AMDGPUISD::BUFFER_LOAD_USHORT;
12787
12788 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
12789 SDValue BufferLoad =
12790 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
12791 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
12792 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
12793
12794 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
12795}
12796
12797// Handle 8 bit and 16 bit buffer stores
12798SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
12799 EVT VDataType, SDLoc DL,
12800 SDValue Ops[],
12801 MemSDNode *M) const {
12802 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
12803 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
12804
12805 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
12806 Ops[1] = BufferStoreExt;
12807 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
12808 : AMDGPUISD::BUFFER_STORE_SHORT;
12809 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
12810 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
12811 M->getMemOperand());
12812}
12813
12814 static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType,
12815 SDValue Op, const SDLoc &SL, EVT VT) {
12816 if (VT.bitsLT(Op.getValueType()))
12817 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
12818
12819 switch (ExtType) {
12820 case ISD::SEXTLOAD:
12821 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
12822 case ISD::ZEXTLOAD:
12823 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
12824 case ISD::EXTLOAD:
12825 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
12826 case ISD::NON_EXTLOAD:
12827 return Op;
12828 }
12829
12830 llvm_unreachable("invalid ext type");
12831}
12832
12833// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
12834// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
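// For example (a sketch of the intended transform): a uniform, 4-byte aligned
// zextload of i16 from the constant address space is rewritten as a 32-bit
// load followed by a zero-extend-in-reg of the low 16 bits, which makes it
// selectable as an SMEM load.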
12835SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
12836 DAGCombinerInfo &DCI) const {
12837 SelectionDAG &DAG = DCI.DAG;
12838 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
12839 return SDValue();
12840
12841 // FIXME: Constant loads should all be marked invariant.
12842 unsigned AS = Ld->getAddressSpace();
12843 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
12844 AS != AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
12845 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
12846 return SDValue();
12847
12848 // Don't do this early, since it may interfere with adjacent load merging for
12849 // illegal types. We can avoid losing alignment information for exotic types
12850 // pre-legalize.
12851 EVT MemVT = Ld->getMemoryVT();
12852 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
12853 MemVT.getSizeInBits() >= 32)
12854 return SDValue();
12855
12856 SDLoc SL(Ld);
12857
12858 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
12859 "unexpected vector extload");
12860
12861 // TODO: Drop only high part of range.
12862 SDValue Ptr = Ld->getBasePtr();
12863 SDValue NewLoad = DAG.getLoad(
12864 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
12865 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
12866 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
12867 nullptr); // Drop ranges
12868
12869 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
12870 if (MemVT.isFloatingPoint()) {
12871 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD &&
12872 "unexpected fp extload");
12873 TruncVT = MemVT.changeTypeToInteger();
12874 }
12875
12876 SDValue Cvt = NewLoad;
12877 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
12878 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
12879 DAG.getValueType(TruncVT));
12880 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
12881 Ld->getExtensionType() == ISD::EXTLOAD) {
12882 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
12883 } else {
12884 assert(Ld->getExtensionType() == ISD::NON_EXTLOAD);
12885 }
12886
12887 EVT VT = Ld->getValueType(0);
12888 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
12889
12890 DCI.AddToWorklist(Cvt.getNode());
12891
12892 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
12893 // the appropriate extension from the 32-bit load.
12894 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
12895 DCI.AddToWorklist(Cvt.getNode());
12896
12897 // Handle conversion back to floating point if necessary.
12898 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
12899
12900 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
12901}
12902
12903static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO,
12904 const SIMachineFunctionInfo &Info) {
12905 // TODO: Should check if the address can definitely not access stack.
12906 if (Info.isEntryFunction())
12907 return Info.getUserSGPRInfo().hasFlatScratchInit();
12908 return true;
12909}
12910
12911SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
12912 SDLoc DL(Op);
12913 LoadSDNode *Load = cast<LoadSDNode>(Op);
12914 ISD::LoadExtType ExtType = Load->getExtensionType();
12915 EVT MemVT = Load->getMemoryVT();
12916 MachineMemOperand *MMO = Load->getMemOperand();
12917
12918 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
12919 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
12920 return SDValue();
12921
12922 // FIXME: Copied from PPC
12923 // First, load into 32 bits, then truncate to 1 bit.
12924
12925 SDValue Chain = Load->getChain();
12926 SDValue BasePtr = Load->getBasePtr();
12927
12928 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
12929
12930 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
12931 RealMemVT, MMO);
12932
12933 if (!MemVT.isVector()) {
12934 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
12935 NewLD.getValue(1)};
12936
12937 return DAG.getMergeValues(Ops, DL);
12938 }
12939
12940 SmallVector<SDValue, 4> Elts;
12941 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
12942 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
12943 DAG.getConstant(I, DL, MVT::i32));
12944
12945 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
12946 }
12947
12948 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
12949
12950 return DAG.getMergeValues(Ops, DL);
12951 }
12952
12953 if (!MemVT.isVector())
12954 return SDValue();
12955
12956 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
12957 "Custom lowering for non-i32 vectors hasn't been implemented.");
12958
12959 Align Alignment = Load->getAlign();
12960 unsigned AS = Load->getAddressSpace();
12961 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
12962 AS == AMDGPUAS::FLAT_ADDRESS &&
12963 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
12964 return SplitVectorLoad(Op, DAG);
12965 }
12966
12967 MachineFunction &MF = DAG.getMachineFunction();
12968 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
12969 // If there is a possibility that flat instructions access scratch memory,
12970 // then we need to use the same legalization rules we use for private.
12971 if (AS == AMDGPUAS::FLAT_ADDRESS &&
12972 !Subtarget->hasMultiDwordFlatScratchAddressing())
12973 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
12974 ? AMDGPUAS::PRIVATE_ADDRESS
12975 : AMDGPUAS::GLOBAL_ADDRESS;
12976
12977 unsigned NumElements = MemVT.getVectorNumElements();
12978
12979 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12980 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12981 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
12982 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
12983 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
12984 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
12985 Alignment >= Align(4) && NumElements < 32) {
12986 if (MemVT.isPow2VectorType() ||
12987 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
12988 return SDValue();
12989 return WidenOrSplitVectorLoad(Op, DAG);
12990 }
12991 // Non-uniform loads will be selected to MUBUF instructions, so they
12992 // have the same legalization requirements as global and private
12993 // loads.
12994 //
12995 }
12996 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
12997 AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
12998 AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
12999 if (NumElements > 4)
13000 return SplitVectorLoad(Op, DAG);
13001 // v3 loads not supported on SI.
13002 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13003 return WidenOrSplitVectorLoad(Op, DAG);
13004
13005 // v3 and v4 loads are supported for private and global memory.
13006 return SDValue();
13007 }
13008 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13009 // Depending on the setting of the private_element_size field in the
13010 // resource descriptor, we can only make private accesses up to a certain
13011 // size.
13012 switch (Subtarget->getMaxPrivateElementSize()) {
13013 case 4: {
13014 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
13015 return DAG.getMergeValues({Op0, Op1}, DL);
13016 }
13017 case 8:
13018 if (NumElements > 2)
13019 return SplitVectorLoad(Op, DAG);
13020 return SDValue();
13021 case 16:
13022 // Same as global/flat
13023 if (NumElements > 4)
13024 return SplitVectorLoad(Op, DAG);
13025 // v3 loads not supported on SI.
13026 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13027 return WidenOrSplitVectorLoad(Op, DAG);
13028
13029 return SDValue();
13030 default:
13031 llvm_unreachable("unsupported private_element_size");
13032 }
13033 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
13034 unsigned Fast = 0;
13035 auto Flags = Load->getMemOperand()->getFlags();
13036 if (allowsMisalignedMemoryAccessesImpl(MemVT.getSizeInBits(), AS,
13037 Load->getAlign(), Flags, &Fast) &&
13038 Fast > 1)
13039 return SDValue();
13040
13041 if (MemVT.isVector())
13042 return SplitVectorLoad(Op, DAG);
13043 }
13044
13045 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
13046 MemVT, *Load->getMemOperand())) {
13047 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
13048 return DAG.getMergeValues({Op0, Op1}, DL);
13049 }
13050
13051 return SDValue();
13052}
13053
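// 64-bit selects are lowered by freezing the condition, bitcasting both value
// operands to v2i32, selecting the low and high halves independently, and
// bitcasting the rebuilt vector back to the original type.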
13054SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
13055 EVT VT = Op.getValueType();
13056 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
13057 VT.getSizeInBits() == 512)
13058 return splitTernaryVectorOp(Op, DAG);
13059
13060 assert(VT.getSizeInBits() == 64);
13061
13062 SDLoc DL(Op);
13063 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
13064
13065 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
13066 SDValue One = DAG.getConstant(1, DL, MVT::i32);
13067
13068 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
13069 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
13070
13071 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
13072 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
13073
13074 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
13075
13076 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
13077 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
13078
13079 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
13080
13081 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
13082 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
13083}
13084
13085// Catch division cases where we can use shortcuts with rcp and rsq
13086// instructions.
13087SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
13088 SelectionDAG &DAG) const {
13089 SDLoc SL(Op);
13090 SDValue LHS = Op.getOperand(0);
13091 SDValue RHS = Op.getOperand(1);
13092 EVT VT = Op.getValueType();
13093 const SDNodeFlags Flags = Op->getFlags();
13094
13095 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
13096
13097 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
13098 // Without !fpmath accuracy information, we can't do more because we don't
13099 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
13100 // f16 is always accurate enough
13101 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
13102 return SDValue();
13103
13104 if (CLHS->isExactlyValue(1.0)) {
13105 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
13106 // the CI documentation has a worst case error of 1 ulp.
13107 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
13108 // use it as long as we aren't trying to use denormals.
13109 //
13110 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
13111
13112 // 1.0 / sqrt(x) -> rsq(x)
13113
13114 // XXX - Is afn sufficient to do this for f64? The maximum ULP
13115 // error seems really high at 2^29 ULP.
13116 // 1.0 / x -> rcp(x)
13117 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
13118 }
13119
13120 // Same as for 1.0, but expand the sign out of the constant.
13121 if (CLHS->isExactlyValue(-1.0)) {
13122 // -1.0 / x -> rcp (fneg x)
13123 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
13124 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
13125 }
13126 }
13127
13128 // For f16 and bf16 require afn or arcp.
13129 // For f32 require afn.
13130 if (!AllowInaccurateRcp &&
13131 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
13132 return SDValue();
13133
13134 // Turn into multiply by the reciprocal.
13135 // x / y -> x * (1.0 / y)
13136 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
13137 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
13138}
13139
13140SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
13141 SelectionDAG &DAG) const {
13142 SDLoc SL(Op);
13143 SDValue X = Op.getOperand(0);
13144 SDValue Y = Op.getOperand(1);
13145 EVT VT = Op.getValueType();
13146 const SDNodeFlags Flags = Op->getFlags();
13147
13148 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
13149 if (!AllowInaccurateDiv)
13150 return SDValue();
13151
13152 const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(X);
13153 bool IsNegRcp = CLHS && CLHS->isExactlyValue(-1.0);
13154
13155 // Pull out the negation so it folds for free into the source modifiers.
13156 if (IsNegRcp)
13157 X = DAG.getConstantFP(1.0, SL, VT);
13158
13159 SDValue NegY = IsNegRcp ? Y : DAG.getNode(ISD::FNEG, SL, VT, Y);
13160 SDValue One = DAG.getConstantFP(1.0, SL, VT);
13161
13162 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
13163 if (IsNegRcp)
13164 R = DAG.getNode(ISD::FNEG, SL, VT, R);
13165
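// Refine the reciprocal estimate with two Newton-Raphson style steps, each
// built from a pair of FMAs: compute the error term e = 1 - y*r, then fold it
// back in with r = r + r*e.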
13166 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
13167
13168 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
13169 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
13170 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
13171
13172 // Skip the last 2 correction terms for reciprocal.
13173 if (IsNegRcp || (CLHS && CLHS->isExactlyValue(1.0)))
13174 return R;
13175
13176 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
13177 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
13178 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
13179}
13180
13181static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13182 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
13183 SDNodeFlags Flags) {
13184 if (GlueChain->getNumValues() <= 1) {
13185 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
13186 }
13187
13188 assert(GlueChain->getNumValues() == 3);
13189
13190 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
13191 switch (Opcode) {
13192 default:
13193 llvm_unreachable("no chain equivalent for opcode");
13194 case ISD::FMUL:
13195 Opcode = AMDGPUISD::FMUL_W_CHAIN;
13196 break;
13197 }
13198
13199 return DAG.getNode(Opcode, SL, VTList,
13200 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
13201 Flags);
13202}
13203
13204static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13205 EVT VT, SDValue A, SDValue B, SDValue C,
13206 SDValue GlueChain, SDNodeFlags Flags) {
13207 if (GlueChain->getNumValues() <= 1) {
13208 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
13209 }
13210
13211 assert(GlueChain->getNumValues() == 3);
13212
13213 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
13214 switch (Opcode) {
13215 default:
13216 llvm_unreachable("no chain equivalent for opcode");
13217 case ISD::FMA:
13218 Opcode = AMDGPUISD::FMA_W_CHAIN;
13219 break;
13220 }
13221
13222 return DAG.getNode(Opcode, SL, VTList,
13223 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
13224 Flags);
13225}
13226
13227SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
13228 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13229 return FastLowered;
13230
13231 SDLoc SL(Op);
13232 EVT VT = Op.getValueType();
13233 SDValue LHS = Op.getOperand(0);
13234 SDValue RHS = Op.getOperand(1);
13235
13236 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
13237 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
13238
13239 if (VT == MVT::bf16) {
13240 SDValue ExtDiv =
13241 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
13242 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
13243 DAG.getTargetConstant(0, SL, MVT::i32));
13244 }
13245
13246 assert(VT == MVT::f16);
13247
13248 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
13249 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
13250 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
13251 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
13252 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13253 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
13254 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13255 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
13256 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
13257 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
13258 // q16.u = opx(V_CVT_F16_F32, q32.u);
13259 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
13260
13261 // We will use ISD::FMA on targets that don't support ISD::FMAD.
13262 unsigned FMADOpCode =
13263 isOperationLegal(ISD::FMAD, MVT::f32) ? ISD::FMAD : ISD::FMA;
13264 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
13265 SDValue Rcp =
13266 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
13267 SDValue Quot =
13268 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
13269 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13270 Op->getFlags());
13271 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
13272 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13273 Op->getFlags());
13274 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
13275 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
13276 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
13277 DAG.getConstant(0xff800000, SL, MVT::i32));
13278 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
13279 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
13280 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
13281 DAG.getTargetConstant(0, SL, MVT::i32));
13282 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
13283 Op->getFlags());
13284}
13285
13286// Faster 2.5 ULP division that does not support denormals.
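// If |rhs| is very large (above 2^96), rhs is pre-scaled by 2^-32 before the
// rcp so the reciprocal stays in range, and the quotient is multiplied by the
// same factor afterwards, leaving lhs/rhs unchanged.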
13287SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
13288 SDNodeFlags Flags = Op->getFlags();
13289 SDLoc SL(Op);
13290 SDValue LHS = Op.getOperand(1);
13291 SDValue RHS = Op.getOperand(2);
13292
13293 // TODO: The combiner should probably handle elimination of redundant fabs.
13295 ? RHS
13296 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
13297
13298 const APFloat K0Val(0x1p+96f);
13299 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
13300
13301 const APFloat K1Val(0x1p-32f);
13302 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
13303
13304 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
13305
13306 EVT SetCCVT =
13307 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
13308
13309 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
13310
13311 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
13312
13313 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
13314
13315 // rcp does not support denormals.
13316 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
13317
13318 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
13319
13320 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
13321}
13322
13323// Returns immediate value for setting the F32 denorm mode when using the
13324// S_DENORM_MODE instruction.
13325static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG,
13326 const SIMachineFunctionInfo *Info,
13327 const GCNSubtarget *ST) {
13328 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
13329 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
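 // The S_DENORM_MODE immediate packs the FP32 denorm setting in bits [1:0]
 // and the FP64/FP16 setting in bits [3:2]; only the FP32 field is changed
 // here, the FP64/FP16 field keeps its current value.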
13330 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
13331 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
13332}
13333
13334SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
13335 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13336 return FastLowered;
13337
13338 // The selection matcher assumes anything with a chain selects to a
13339 // mayRaiseFPException machine instruction. Since we're introducing a chain
13340 // here, we need to explicitly report nofpexcept for the regular fdiv
13341 // lowering.
13342 SDNodeFlags Flags = Op->getFlags();
13343 Flags.setNoFPExcept(true);
13344
13345 SDLoc SL(Op);
13346 SDValue LHS = Op.getOperand(0);
13347 SDValue RHS = Op.getOperand(1);
13348
13349 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
13350
13351 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
13352
13353 SDValue DenominatorScaled =
13354 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
13355 SDValue NumeratorScaled =
13356 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
13357
13358 // Denominator is scaled to not be denormal, so using rcp is ok.
13359 SDValue ApproxRcp =
13360 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
13361 SDValue NegDivScale0 =
13362 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
13363
13364 using namespace AMDGPU::Hwreg;
13365 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
13366 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
13367
13368 const MachineFunction &MF = DAG.getMachineFunction();
13369 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13370 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
13371
13372 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
13373 const bool HasDynamicDenormals =
13374 (DenormMode.Input == DenormalMode::Dynamic) ||
13375 (DenormMode.Output == DenormalMode::Dynamic);
13376
13377 SDValue SavedDenormMode;
13378
13379 if (!PreservesDenormals) {
13380 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
13381 // lowering. The chain dependence is insufficient, and we need glue. We do
13382 // not need the glue variants in a strictfp function.
13383
13384 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13385
13386 SDValue Glue = DAG.getEntryNode();
13387 if (HasDynamicDenormals) {
13388 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
13389 DAG.getVTList(MVT::i32, MVT::Glue),
13390 {BitField, Glue});
13391 SavedDenormMode = SDValue(GetReg, 0);
13392
13393 Glue = DAG.getMergeValues(
13394 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
13395 }
13396
13397 SDNode *EnableDenorm;
13398 if (Subtarget->hasDenormModeInst()) {
13399 const SDValue EnableDenormValue =
13400 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
13401
13402 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
13403 EnableDenormValue)
13404 .getNode();
13405 } else {
13406 const SDValue EnableDenormValue =
13407 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
13408 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
13409 {EnableDenormValue, BitField, Glue});
13410 }
13411
13412 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
13413 SDValue(EnableDenorm, 1)};
13414
13415 NegDivScale0 = DAG.getMergeValues(Ops, SL);
13416 }
13417
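 // Refinement sequence: Fma0 = 1 - d*r is the error of the initial rcp, Fma1
 // is the corrected reciprocal, Mul is the first quotient estimate, Fma2 and
 // Fma4 are the residuals n - d*q, and Fma3 is the corrected quotient that
 // div_fmas/div_fixup finalize below.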
13418 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
13419 ApproxRcp, One, NegDivScale0, Flags);
13420
13421 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
13422 ApproxRcp, Fma0, Flags);
13423
13424 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
13425 Fma1, Flags);
13426
13427 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
13428 NumeratorScaled, Mul, Flags);
13429
13430 SDValue Fma3 =
13431 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
13432
13433 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
13434 NumeratorScaled, Fma3, Flags);
13435
13436 if (!PreservesDenormals) {
13437 SDNode *DisableDenorm;
13438 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
13439 const SDValue DisableDenormValue = getSPDenormModeValue(
13440 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
13441
13442 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13443 DisableDenorm =
13444 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
13445 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
13446 .getNode();
13447 } else {
13448 assert(HasDynamicDenormals == (bool)SavedDenormMode);
13449 const SDValue DisableDenormValue =
13450 HasDynamicDenormals
13451 ? SavedDenormMode
13452 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
13453
13454 DisableDenorm = DAG.getMachineNode(
13455 AMDGPU::S_SETREG_B32, SL, MVT::Other,
13456 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
13457 }
13458
13459 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
13460 SDValue(DisableDenorm, 0), DAG.getRoot());
13461 DAG.setRoot(OutputChain);
13462 }
13463
13464 SDValue Scale = NumeratorScaled.getValue(1);
13465 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
13466 {Fma4, Fma1, Fma3, Scale}, Flags);
13467
13468 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
13469}
13470
13471SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
13472 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
13473 return FastLowered;
13474
13475 SDLoc SL(Op);
13476 SDValue X = Op.getOperand(0);
13477 SDValue Y = Op.getOperand(1);
13478
13479 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
13480
13481 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
13482
13483 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
13484
13485 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
13486
13487 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
13488
13489 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
13490
13491 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
13492
13493 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
13494
13495 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
13496
13497 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
13498 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
13499
13500 SDValue Fma4 =
13501 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
13502
13503 SDValue Scale;
13504
13505 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
13506 // Workaround a hardware bug on SI where the condition output from div_scale
13507 // is not usable.
13508
13509 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
13510
13511 // Figure out the scale to use for div_fmas.
13512 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
13513 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
13514 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
13515 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
13516
13517 SDValue NumHi =
13518 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
13519 SDValue DenHi =
13520 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
13521
13522 SDValue Scale0Hi =
13523 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
13524 SDValue Scale1Hi =
13525 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
13526
13527 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
13528 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
13529 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
13530 } else {
13531 Scale = DivScale1.getValue(1);
13532 }
13533
13534 SDValue Fmas =
13535 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
13536
13537 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
13538}
13539
13540SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
13541 EVT VT = Op.getValueType();
13542
13543 if (VT == MVT::f32)
13544 return LowerFDIV32(Op, DAG);
13545
13546 if (VT == MVT::f64)
13547 return LowerFDIV64(Op, DAG);
13548
13549 if (VT == MVT::f16 || VT == MVT::bf16)
13550 return LowerFDIV16(Op, DAG);
13551
13552 llvm_unreachable("Unexpected type for fdiv");
13553}
13554
13555SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
13556 SDLoc dl(Op);
13557 SDValue Val = Op.getOperand(0);
13558 EVT VT = Val.getValueType();
13559 EVT ResultExpVT = Op->getValueType(1);
13560 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
13561
13562 SDValue Mant = DAG.getNode(
13563 ISD::INTRINSIC_WO_CHAIN, dl, VT,
13564 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
13565
13566 SDValue Exp = DAG.getNode(
13567 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
13568 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
13569
13570 if (Subtarget->hasFractBug()) {
13571 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
13572 SDValue Inf =
13573 DAG.getConstantFP(APFloat::getInf(VT.getFltSemantics()), dl, VT);
13574
13575 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
13576 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
13577 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
13578 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
13579 }
13580
13581 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
13582 return DAG.getMergeValues({Mant, CastExp}, dl);
13583}
13584
13585SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
13586 SDLoc DL(Op);
13587 StoreSDNode *Store = cast<StoreSDNode>(Op);
13588 EVT VT = Store->getMemoryVT();
13589
13590 if (VT == MVT::i1) {
13591 return DAG.getTruncStore(
13592 Store->getChain(), DL,
13593 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
13594 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
13595 }
13596
13597 assert(VT.isVector() &&
13598 Store->getValue().getValueType().getScalarType() == MVT::i32);
13599
13600 unsigned AS = Store->getAddressSpace();
13601 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13602 AS == AMDGPUAS::FLAT_ADDRESS &&
13603 Store->getAlign().value() < VT.getStoreSize() &&
13604 VT.getSizeInBits() > 32) {
13605 return SplitVectorStore(Op, DAG);
13606 }
13607
13608 MachineFunction &MF = DAG.getMachineFunction();
13609 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
13610 // If there is a possibility that flat instructions access scratch memory,
13611 // then we need to use the same legalization rules we use for private.
13612 if (AS == AMDGPUAS::FLAT_ADDRESS &&
13613 !Subtarget->hasMultiDwordFlatScratchAddressing())
13614 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
13615 ? AMDGPUAS::PRIVATE_ADDRESS
13616 : AMDGPUAS::GLOBAL_ADDRESS;
13617
13618 unsigned NumElements = VT.getVectorNumElements();
13619 if (AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS) {
13620 if (NumElements > 4)
13621 return SplitVectorStore(Op, DAG);
13622 // v3 stores not supported on SI.
13623 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13624 return SplitVectorStore(Op, DAG);
13625
13626 if (!allowsMemoryAccessForAlignment(*DAG.getContext(), DAG.getDataLayout(),
13627 VT, *Store->getMemOperand()))
13628 return expandUnalignedStore(Store, DAG);
13629
13630 return SDValue();
13631 }
13632 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13633 switch (Subtarget->getMaxPrivateElementSize()) {
13634 case 4:
13635 return scalarizeVectorStore(Store, DAG);
13636 case 8:
13637 if (NumElements > 2)
13638 return SplitVectorStore(Op, DAG);
13639 return SDValue();
13640 case 16:
13641 if (NumElements > 4 ||
13642 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13643 return SplitVectorStore(Op, DAG);
13644 return SDValue();
13645 default:
13646 llvm_unreachable("unsupported private_element_size");
13647 }
13648 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
13649 unsigned Fast = 0;
13650 auto Flags = Store->getMemOperand()->getFlags();
13651 if (allowsMisalignedMemoryAccessesImpl(VT.getSizeInBits(), AS,
13652 Store->getAlign(), Flags, &Fast) &&
13653 Fast > 1)
13654 return SDValue();
13655
13656 if (VT.isVector())
13657 return SplitVectorStore(Op, DAG);
13658
13659 return expandUnalignedStore(Store, DAG);
13660 }
13661
13662 // Probably an invalid store. If so we'll end up emitting a selection error.
13663 return SDValue();
13664}
13665
13666// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13667SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13668 SDLoc SL(Op);
13669 assert(!Subtarget->has16BitInsts());
13670 SDNodeFlags Flags = Op->getFlags();
13671 SDValue Ext =
13672 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
13673
13674 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
13675 SDValue Sqrt =
13676 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
13677
13678 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
13679 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
13680}
13681
13682SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
13683 SDLoc DL(Op);
13684 SDNodeFlags Flags = Op->getFlags();
13685 MVT VT = Op.getValueType().getSimpleVT();
13686 const SDValue X = Op.getOperand(0);
13687
13688 if (allowApproxFunc(DAG, Flags)) {
13689 // Instruction is 1ulp but ignores denormals.
13690 return DAG.getNode(
13691 ISD::INTRINSIC_WO_CHAIN, DL, VT,
13692 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
13693 }
13694
13695 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
13696 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
13697
13698 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
13699
13700 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
13701
13702 SDValue SqrtX =
13703 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
13704
13705 SDValue SqrtS;
13706 if (needsDenormHandlingF32(DAG, X, Flags)) {
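 // The intrinsic result is then adjusted by at most one ulp: form the
 // neighbouring values one ulp below and above, and pick whichever candidate
 // leaves a residual x - s_adj*s of the right sign.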
13707 SDValue SqrtID =
13708 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
13709 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
13710
13711 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
13712 SDValue SqrtSNextDownInt =
13713 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13714 DAG.getAllOnesConstant(DL, MVT::i32));
13715 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
13716
13717 SDValue NegSqrtSNextDown =
13718 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
13719
13720 SDValue SqrtVP =
13721 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
13722
13723 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13724 DAG.getConstant(1, DL, MVT::i32));
13725 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
13726
13727 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
13728 SDValue SqrtVS =
13729 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
13730
13731 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
13732 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
13733
13734 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
13735 Flags);
13736
13737 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
13738 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
13739 Flags);
13740 } else {
13741 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
13742
13743 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
13744
13745 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
13746 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
13747 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
13748
13749 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
13750 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
13751 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
13752
13753 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
13754 SDValue SqrtD =
13755 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
13756 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
13757 }
13758
13759 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
13760
13761 SDValue ScaledDown =
13762 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
13763
13764 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
13765 SDValue IsZeroOrInf =
13766 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13767 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13768
13769 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
13770}
13771
13772SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
13773 // For double type, the SQRT and RSQ instructions don't have the required
13774 // precision, so we apply Goldschmidt's algorithm to improve the result:
13775 //
13776 // y0 = rsq(x)
13777 // g0 = x * y0
13778 // h0 = 0.5 * y0
13779 //
13780 // r0 = 0.5 - h0 * g0
13781 // g1 = g0 * r0 + g0
13782 // h1 = h0 * r0 + h0
13783 //
13784 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
13785 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
13786 // h2 = h1 * r1 + h1
13787 //
13788 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
13789 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
13790 //
13791 // sqrt(x) = g3
13792
13793 SDNodeFlags Flags = Op->getFlags();
13794
13795 SDLoc DL(Op);
13796
13797 SDValue X = Op.getOperand(0);
13798 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
13799
13800 SDValue SqrtX = X;
13801 SDValue Scaling;
13802 if (!Flags.hasApproximateFuncs()) {
13803 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
13804 Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
13805
13806 // Scale up input if it is too small.
13807 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
13808 SDValue ScaleUp =
13809 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
13810 SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
13811 }
13812
13813 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
13814
13815 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
13816
13817 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
13818 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
13819
13820 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
13821 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
13822
13823 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
13824
13825 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
13826
13827 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
13828 SDValue SqrtD0 =
13829 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
13830
13831 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
13832
13833 SDValue SqrtRet = SqrtS2;
13834 if (!Flags.hasApproximateFuncs()) {
13835 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
13836 SDValue SqrtD1 =
13837 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
13838
13839 SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
13840
13841 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
13842 SDValue ScaleDown = DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling,
13843 ScaleDownFactor, ZeroInt);
13844 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
13845 }
13846
13847 // TODO: Check for DAZ and expand to subnormals
13848
13849 SDValue IsZeroOrInf;
13850 if (Flags.hasNoInfs()) {
13851 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
13852 IsZeroOrInf = DAG.getSetCC(DL, MVT::i1, SqrtX, Zero, ISD::SETOEQ);
13853 } else {
13854 IsZeroOrInf =
13855 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13856 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13857 }
13858
13859 // If x is +INF, +0, or -0, use its original value
13860 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
13861 Flags);
13862}
13863
13864SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
13865 SDLoc DL(Op);
13866 EVT VT = Op.getValueType();
13867 SDValue Arg = Op.getOperand(0);
13868 SDValue TrigVal;
13869
13870 // Propagate fast-math flags so that the multiply we introduce can be folded
13871 // if Arg is already the result of a multiply by constant.
13872 auto Flags = Op->getFlags();
13873
13874 // AMDGPUISD nodes of vector type must be unrolled here since
13875 // they will not be expanded elsewhere.
13876 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
13877 if (!V.getValueType().isVector())
13878 return V;
13879
13880 return DAG.UnrollVectorOp(cast<SDNode>(V));
13881 };
13882
13883 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
13884
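 // The hardware sin/cos take their argument in units of revolutions, so the
 // input is scaled by 1/(2*pi) first; on subtargets with a reduced trig input
 // range, FRACT wraps the scaled value into [0, 1) before the trig op.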
13885 if (Subtarget->hasTrigReducedRange()) {
13886 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13887 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
13888 } else {
13889 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
13890 }
13891
13892 switch (Op.getOpcode()) {
13893 case ISD::FCOS:
13894 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
13895 break;
13896 case ISD::FSIN:
13897 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
13898 break;
13899 default:
13900 llvm_unreachable("Wrong trig opcode");
13901 }
13902
13903 return UnrollIfVec(TrigVal);
13904}
13905
13906SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
13907 SelectionDAG &DAG) const {
13908 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
13909 assert(AtomicNode->isCompareAndSwap());
13910 unsigned AS = AtomicNode->getAddressSpace();
13911
13912 // No custom lowering required for local address space
13913 if (!AMDGPU::isFlatGlobalAddrSpace(AS))
13914 return Op;
13915
13916 // Non-local address space requires custom lowering for atomic compare
13917 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
13918 SDLoc DL(Op);
13919 SDValue ChainIn = Op.getOperand(0);
13920 SDValue Addr = Op.getOperand(1);
13921 SDValue Old = Op.getOperand(2);
13922 SDValue New = Op.getOperand(3);
13923 EVT VT = Op.getValueType();
13924 MVT SimpleVT = VT.getSimpleVT();
13925 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
13926
13927 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
13928 SDValue Ops[] = {ChainIn, Addr, NewOld};
13929
13930 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
13931 Op->getVTList(), Ops, VT,
13932 AtomicNode->getMemOperand());
13933}
13934
13935//===----------------------------------------------------------------------===//
13936// Custom DAG optimizations
13937//===----------------------------------------------------------------------===//
13938
13939SDValue
13940SITargetLowering::performUCharToFloatCombine(SDNode *N,
13941 DAGCombinerInfo &DCI) const {
13942 EVT VT = N->getValueType(0);
13943 EVT ScalarVT = VT.getScalarType();
13944 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
13945 return SDValue();
13946
13947 SelectionDAG &DAG = DCI.DAG;
13948 SDLoc DL(N);
13949
13950 SDValue Src = N->getOperand(0);
13951 EVT SrcVT = Src.getValueType();
13952
13953 // TODO: We could try to match extracting the higher bytes, which would be
13954 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
13955 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
13956 // about in practice.
13957 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
13958 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
13959 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
13960 DCI.AddToWorklist(Cvt.getNode());
13961
13962 // For the f16 case, fold to a cast to f32 and then cast back to f16.
13963 if (ScalarVT != MVT::f32) {
13964 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
13965 DAG.getTargetConstant(0, DL, MVT::i32));
13966 }
13967 return Cvt;
13968 }
13969 }
13970
13971 return SDValue();
13972}
13973
13974SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
13975 DAGCombinerInfo &DCI) const {
13976 SDValue MagnitudeOp = N->getOperand(0);
13977 SDValue SignOp = N->getOperand(1);
13978
13979 // The generic combine for fcopysign + fp cast is too conservative with
13980 // vectors, and also gets confused by the splitting we will perform here, so
13981 // peek through FP casts.
13982 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
13983 SignOp.getOpcode() == ISD::FP_ROUND)
13984 SignOp = SignOp.getOperand(0);
13985
13986 SelectionDAG &DAG = DCI.DAG;
13987 SDLoc DL(N);
13988 EVT SignVT = SignOp.getValueType();
13989
13990 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
13991 // lower half with a copy.
13992 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
13993 EVT MagVT = MagnitudeOp.getValueType();
13994
13995 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
13996
13997 if (MagVT.getScalarType() == MVT::f64) {
13998 EVT F32VT = MagVT.isVector()
13999 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
14000 : MVT::v2f32;
14001
14002 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
14003
14004 SmallVector<SDValue, 8> NewElts;
14005 for (unsigned I = 0; I != NumElts; ++I) {
14006 SDValue MagLo =
14007 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
14008 DAG.getConstant(2 * I, DL, MVT::i32));
14009 SDValue MagHi =
14010 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
14011 DAG.getConstant(2 * I + 1, DL, MVT::i32));
14012
14013 SDValue SignOpElt =
14014 MagVT.isVector()
14015 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SignVT.getScalarType(),
14016 SignOp, DAG.getConstant(I, DL, MVT::i32))
14017 : SignOp;
14018
14019 SDValue HiOp =
14020 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
14021
14022 SDValue Vector =
14023 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
14024
14025 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
14026 NewElts.push_back(NewElt);
14027 }
14028
14029 if (NewElts.size() == 1)
14030 return NewElts[0];
14031
14032 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
14033 }
14034
14035 if (SignVT.getScalarType() != MVT::f64)
14036 return SDValue();
14037
14038 // Reduce width of sign operand, we only need the highest bit.
14039 //
14040 // fcopysign f64:x, f64:y ->
14041 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
14042 // TODO: In some cases it might make sense to go all the way to f16.
14043
14044 EVT F32VT = MagVT.isVector()
14045 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
14046 : MVT::v2f32;
14047
14048 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
14049
14050 SmallVector<SDValue, 8> F32Signs;
14051 for (unsigned I = 0; I != NumElts; ++I) {
14052 // Take sign from odd elements of cast vector
14053 SDValue SignAsF32 =
14054 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
14055 DAG.getConstant(2 * I + 1, DL, MVT::i32));
14056 F32Signs.push_back(SignAsF32);
14057 }
14058
14059 SDValue NewSign =
14060 NumElts == 1
14061 ? F32Signs.back()
14062 : DAG.getNode(ISD::BUILD_VECTOR, DL,
14063 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
14064 F32Signs);
14065
14066 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
14067 NewSign);
14068}
14069
14070// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
14071// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
14072// bits
14073
14074// This is a variant of
14075// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
14076//
14077// The normal DAG combiner will do this, but only if the add has one use, since
14078// otherwise it would increase the number of instructions.
14079//
14080// This prevents us from seeing a constant offset that can be folded into a
14081// memory instruction's addressing mode. If we know the resulting add offset of
14082// a pointer can be folded into an addressing offset, we can replace the pointer
14083// operand with the add of the new constant offset. This eliminates one of the uses,
14084// and may allow the remaining use to also be simplified.
14085//
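// For example, assuming the resulting offset is legal for the addressing mode:
//   (shl (add x, 8), 2) --> (add (shl x, 2), 32)
// so the 32 can later be folded into the memory instruction's offset field.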
14086SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
14087 EVT MemVT,
14088 DAGCombinerInfo &DCI) const {
14089 SDValue N0 = N->getOperand(0);
14090 SDValue N1 = N->getOperand(1);
14091
14092 // We only do this to handle cases where it's profitable when there are
14093 // multiple uses of the add, so defer to the standard combine.
14094 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
14095 return SDValue();
14096
14097 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
14098 if (!CN1)
14099 return SDValue();
14100
14101 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
14102 if (!CAdd)
14103 return SDValue();
14104
14105 SelectionDAG &DAG = DCI.DAG;
14106
14107 if (N0->getOpcode() == ISD::OR &&
14108 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
14109 return SDValue();
14110
14111 // If the resulting offset is too large, we can't fold it into the
14112 // addressing mode offset.
14113 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
14114 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
14115
14116 AddrMode AM;
14117 AM.HasBaseReg = true;
14118 AM.BaseOffs = Offset.getSExtValue();
14119 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
14120 return SDValue();
14121
14122 SDLoc SL(N);
14123 EVT VT = N->getValueType(0);
14124
14125 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
14126 SDValue COffset = DAG.getConstant(Offset, SL, VT);
14127
14128 SDNodeFlags Flags;
14129 Flags.setNoUnsignedWrap(
14130 N->getFlags().hasNoUnsignedWrap() &&
14131 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
14132
14133 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
14134 // be sure that the new left operand is a proper base pointer.
14135 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
14136}
14137
14138/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
14139/// by the chain and intrinsic ID. Theoretically we would also need to check the
14140/// specific intrinsic, but they all place the pointer operand first.
14141static unsigned getBasePtrIndex(const MemSDNode *N) {
14142 switch (N->getOpcode()) {
14143 case ISD::STORE:
14144 case ISD::INTRINSIC_W_CHAIN:
14145 case ISD::INTRINSIC_VOID:
14146 return 2;
14147 default:
14148 return 1;
14149 }
14150}
14151
14152SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
14153 DAGCombinerInfo &DCI) const {
14154 SelectionDAG &DAG = DCI.DAG;
14155
14156 unsigned PtrIdx = getBasePtrIndex(N);
14157 SDValue Ptr = N->getOperand(PtrIdx);
14158
14159 // TODO: We could also do this for multiplies.
14160 if (Ptr.getOpcode() == ISD::SHL) {
14161 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
14162 N->getMemoryVT(), DCI);
14163 if (NewPtr) {
14164 SmallVector<SDValue, 8> NewOps(N->ops());
14165
14166 NewOps[PtrIdx] = NewPtr;
14167 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
14168 }
14169 }
14170
14171 return SDValue();
14172}
14173
14174static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
14175 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
14176 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
14177 (Opc == ISD::XOR && Val == 0);
14178}
14179
14180// Break up a 64-bit bitwise operation with a constant into two 32-bit and/or/xor. This
14181// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
14182// integer combine opportunities since most 64-bit operations are decomposed
14183// this way. TODO: We won't want this for SALU, especially if it is an inline
14184// immediate.
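// For example, (and x:i64, 0x00000000ffffffff) is split into an AND of the low
// half with -1 and an AND of the high half with 0; both halves then fold to
// simple 32-bit values.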
14185SDValue SITargetLowering::splitBinaryBitConstantOp(
14186 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
14187 const ConstantSDNode *CRHS) const {
14188 uint64_t Val = CRHS->getZExtValue();
14189 uint32_t ValLo = Lo_32(Val);
14190 uint32_t ValHi = Hi_32(Val);
14191 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14192
14193 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
14194 bitOpWithConstantIsReducible(Opc, ValHi) ||
14195 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
14196 // We have 64-bit scalar and/or/xor, but do not have vector forms.
14197 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
14198 !CRHS->user_begin()->isDivergent())
14199 return SDValue();
14200
14201 // If we need to materialize a 64-bit immediate, it will be split up later
14202 // anyway. Avoid creating the harder to understand 64-bit immediate
14203 // materialization.
14204 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
14205 }
14206
14207 return SDValue();
14208}
14209
14210bool llvm::isBoolSGPR(SDValue V) {
14211 if (V.getValueType() != MVT::i1)
14212 return false;
14213 switch (V.getOpcode()) {
14214 default:
14215 break;
14216 case ISD::SETCC:
14217 case ISD::IS_FPCLASS:
14218 case AMDGPUISD::FP_CLASS:
14219 return true;
14220 case ISD::AND:
14221 case ISD::OR:
14222 case ISD::XOR:
14223 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
14224 case ISD::SADDO:
14225 case ISD::UADDO:
14226 case ISD::SSUBO:
14227 case ISD::USUBO:
14228 case ISD::SMULO:
14229 case ISD::UMULO:
14230 return V.getResNo() == 1;
14231 case ISD::INTRINSIC_WO_CHAIN: {
14232 unsigned IntrinsicID = V.getConstantOperandVal(0);
14233 switch (IntrinsicID) {
14234 case Intrinsic::amdgcn_is_shared:
14235 case Intrinsic::amdgcn_is_private:
14236 return true;
14237 default:
14238 return false;
14239 }
14240
14241 return false;
14242 }
14243 }
14244 return false;
14245}
14246
14247// If a constant has all zeroes or all ones within each byte return it.
14248// Otherwise return 0.
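// For example, 0x00ff00ff is returned unchanged, while 0x00000f00 selects only
// part of a byte and therefore yields 0.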
14249static uint32_t getConstantPermuteMask(uint32_t C) {
14250 // 0xff for any zero byte in the mask
14251 uint32_t ZeroByteMask = 0;
14252 if (!(C & 0x000000ff))
14253 ZeroByteMask |= 0x000000ff;
14254 if (!(C & 0x0000ff00))
14255 ZeroByteMask |= 0x0000ff00;
14256 if (!(C & 0x00ff0000))
14257 ZeroByteMask |= 0x00ff0000;
14258 if (!(C & 0xff000000))
14259 ZeroByteMask |= 0xff000000;
14260 uint32_t NonZeroByteMask = ~ZeroByteMask; // 0xff for any non-zero byte
14261 if ((NonZeroByteMask & C) != NonZeroByteMask)
14262 return 0; // Partial bytes selected.
14263 return C;
14264}
14265
14266// Check if a node selects whole bytes from its operand 0 starting at a byte
14267// boundary while masking the rest. Returns select mask as in the v_perm_b32
14268// or -1 if not succeeded.
14269// Note byte select encoding:
14270// value 0-3 selects corresponding source byte;
14271// value 0xc selects zero;
14272// value 0xff selects 0xff.
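// For example, (shl x, 16) yields the select 0x01000c0c: the two high result
// bytes come from bytes 1 and 0 of the source and the two low bytes are zero.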
14273static uint32_t getPermuteMask(SDValue V) {
14274 assert(V.getValueSizeInBits() == 32);
14275
14276 if (V.getNumOperands() != 2)
14277 return ~0;
14278
14279 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
14280 if (!N1)
14281 return ~0;
14282
14283 uint32_t C = N1->getZExtValue();
14284
14285 switch (V.getOpcode()) {
14286 default:
14287 break;
14288 case ISD::AND:
14289 if (uint32_t ConstMask = getConstantPermuteMask(C))
14290 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
14291 break;
14292
14293 case ISD::OR:
14294 if (uint32_t ConstMask = getConstantPermuteMask(C))
14295 return (0x03020100 & ~ConstMask) | ConstMask;
14296 break;
14297
14298 case ISD::SHL:
14299 if (C % 8)
14300 return ~0;
14301
14302 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
14303
14304 case ISD::SRL:
14305 if (C % 8)
14306 return ~0;
14307
14308 return uint32_t(0x0c0c0c0c03020100ull >> C);
14309 }
14310
14311 return ~0;
14312}
14313
14314SDValue SITargetLowering::performAndCombine(SDNode *N,
14315 DAGCombinerInfo &DCI) const {
14316 if (DCI.isBeforeLegalize())
14317 return SDValue();
14318
14319 SelectionDAG &DAG = DCI.DAG;
14320 EVT VT = N->getValueType(0);
14321 SDValue LHS = N->getOperand(0);
14322 SDValue RHS = N->getOperand(1);
14323
14324 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
14325 if (VT == MVT::i64 && CRHS) {
14326 if (SDValue Split =
14327 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
14328 return Split;
14329 }
14330
14331 if (CRHS && VT == MVT::i32) {
14332 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
14333 // nb = number of trailing zeroes in mask
14334 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
14335 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
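 // For example, masking (srl x, 4) with 0xff0 gives Bits = 8, NB = 4 and
 // Offset = 8, so it becomes (shl (bfe x, 8, 8), 4).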
14336 uint64_t Mask = CRHS->getZExtValue();
14337 unsigned Bits = llvm::popcount(Mask);
14338 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
14339 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
14340 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
14341 unsigned Shift = CShift->getZExtValue();
14342 unsigned NB = CRHS->getAPIntValue().countr_zero();
14343 unsigned Offset = NB + Shift;
14344 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
14345 SDLoc SL(N);
14346 SDValue BFE =
14347 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
14348 DAG.getConstant(Offset, SL, MVT::i32),
14349 DAG.getConstant(Bits, SL, MVT::i32));
14350 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
14351 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
14352 DAG.getValueType(NarrowVT));
14353 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
14354 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
14355 return Shl;
14356 }
14357 }
14358 }
14359
14360 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14361 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
14362 isa<ConstantSDNode>(LHS.getOperand(2))) {
14363 uint32_t Sel = getConstantPermuteMask(Mask);
14364 if (!Sel)
14365 return SDValue();
14366
14367 // Select 0xc for all zero bytes
14368 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
14369 SDLoc DL(N);
14370 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14371 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14372 }
14373 }
14374
14375 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
14376 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
14377 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
14378 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
14379 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
14380
14381 SDValue X = LHS.getOperand(0);
14382 SDValue Y = RHS.getOperand(0);
14383 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
14384 !isTypeLegal(X.getValueType()))
14385 return SDValue();
14386
14387 if (LCC == ISD::SETO) {
14388 if (X != LHS.getOperand(1))
14389 return SDValue();
14390
14391 if (RCC == ISD::SETUNE) {
14392 const ConstantFPSDNode *C1 =
14393 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
14394 if (!C1 || !C1->isInfinity() || C1->isNegative())
14395 return SDValue();
14396
14397 const uint32_t Mask = SIInstrFlags::N_NORMAL |
14398 SIInstrFlags::P_NORMAL | SIInstrFlags::N_ZERO |
14399 SIInstrFlags::P_ZERO | SIInstrFlags::N_SUBNORMAL |
14400 SIInstrFlags::P_SUBNORMAL;
14401
14402 static_assert(
14403 ((~(SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN |
14404 SIInstrFlags::N_INFINITY | SIInstrFlags::P_INFINITY)) &
14405 0x3ff) == Mask,
14406 "mask not equal");
14406 "mask not equal");
14407
14408 SDLoc DL(N);
14409 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
14410 DAG.getConstant(Mask, DL, MVT::i32));
14411 }
14412 }
14413 }
14414
14415 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
14416 std::swap(LHS, RHS);
14417
14418 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14419 RHS.hasOneUse()) {
14420 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
14421 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
14422 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
14423 // | n_nan)
14424 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14425 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
14426 (RHS.getOperand(0) == LHS.getOperand(0) &&
14427 LHS.getOperand(0) == LHS.getOperand(1))) {
14428 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
14429 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
14430 : Mask->getZExtValue() & OrdMask;
14431
14432 SDLoc DL(N);
14433 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
14434 DAG.getConstant(NewMask, DL, MVT::i32));
14435 }
14436 }
14437
14438 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
14439 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
14440 // and x, (sext cc from i1) => select cc, x, 0
14441 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
14442 std::swap(LHS, RHS);
14443 if (isBoolSGPR(RHS.getOperand(0)))
14444 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
14445 DAG.getConstant(0, SDLoc(N), MVT::i32));
14446 }
14447
14448 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14449 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14450 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14451 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14452 uint32_t LHSMask = getPermuteMask(LHS);
14453 uint32_t RHSMask = getPermuteMask(RHS);
14454 if (LHSMask != ~0u && RHSMask != ~0u) {
14455 // Canonicalize the expression in an attempt to have fewer unique masks
14456 // and therefore fewer registers used to hold the masks.
14457 if (LHSMask > RHSMask) {
14458 std::swap(LHSMask, RHSMask);
14459 std::swap(LHS, RHS);
14460 }
14461
14462 // Select 0xc for each lane used from the source operand: a zero byte has
14463 // 0xc in its mask, a 0xff byte has 0xff, and actual lanes use selectors 0-3.
14464 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14465 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14466
14467 // Check if we need to combine values from two sources within a byte.
14468 if (!(LHSUsedLanes & RHSUsedLanes) &&
14469 // If we select high and lower word keep it for SDWA.
14470 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14471 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14472 // Each byte in each mask is either a selector in the 0-3 range, or has
14473 // higher bits set: 0xff for a 0xff byte or 0x0c for a zero byte. If either
14474 // mask has 0x0c for a byte, the result byte must be 0x0c; otherwise the
14475 // mask that is not 0xff wins. ANDing both masks gives the correct result,
14476 // except that such bytes must be forced back to 0x0c.
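// For example, and (or x, 0x00ffffff), (or y, 0xff00ffff) has
// LHSMask == 0x03ffffff and RHSMask == 0xff02ffff; the used lanes do not
// overlap and the combined node is perm x, y, 0x0702ffff (byte 3 from x,
// byte 2 from y, bytes 0-1 forced to 0xff).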
14477 uint32_t Mask = LHSMask & RHSMask;
14478 for (unsigned I = 0; I < 32; I += 8) {
14479 uint32_t ByteSel = 0xff << I;
14480 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
14481 Mask &= (0x0c << I) & 0xffffffff;
14482 }
14483
14484 // Add 4 to each active LHS lane. It will not affect any existing 0xff
14485 // or 0x0c.
14486 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
14487 SDLoc DL(N);
14488
14489 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14490 RHS.getOperand(0),
14491 DAG.getConstant(Sel, DL, MVT::i32));
14492 }
14493 }
14494 }
14495
14496 return SDValue();
14497}
14498
14499// A key component of v_perm is a mapping between byte position of the src
14500// operands, and the byte position of the dest. To provide such, we need: 1. the
14501// node that provides x byte of the dest of the OR, and 2. the byte of the node
14502// used to provide that x byte. calculateByteProvider finds which node provides
14503// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
14504// and finds an ultimate src and byte position. For example, the supported
14505// LoadCombine pattern for vector loads is as follows
14506// t1
14507// or
14508// / \
14509// t2 t3
14510// zext shl
14511// | | \
14512// t4 t5 16
14513// or anyext
14514// / \ |
14515// t6 t7 t8
14516// srl shl or
14517// / | / \ / \
14518// t9 t10 t11 t12 t13 t14
14519// trunc* 8 trunc* 8 and and
14520// | | / | | \
14521// t15 t16 t17 t18 t19 t20
14522// trunc* 255 srl -256
14523// | / \
14524// t15 t15 16
14525//
14526// *In this example, the truncs are from i32->i16
14527//
14528// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
14529// respectively. calculateSrcByte would find (given node) -> ultimate src &
14530// byte position: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
14531// After finding the mapping, we can combine the tree into vperm t15, t16,
14532// 0x05000407
14533
14534// Find the source and byte position from a node.
14535// \p DestByte is the byte position of the dest of the or that the src
14536// ultimately provides. \p SrcIndex is the byte of the src that maps to this
14537// byte of the dest of the or. \p Depth tracks how many recursive iterations we
14538// performed.
14539static const std::optional<ByteProvider<SDValue>>
14540calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
14541 unsigned Depth = 0) {
14542 // We may need to recursively traverse a series of SRLs
14543 if (Depth >= 6)
14544 return std::nullopt;
14545
14546 if (Op.getValueSizeInBits() < 8)
14547 return std::nullopt;
14548
14549 if (Op.getValueType().isVector())
14550 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
14551
14552 switch (Op->getOpcode()) {
14553 case ISD::TRUNCATE: {
14554 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14555 }
14556
14557 case ISD::ANY_EXTEND:
14558 case ISD::SIGN_EXTEND:
14559 case ISD::ZERO_EXTEND:
14560 case ISD::SIGN_EXTEND_INREG: {
14561 SDValue NarrowOp = Op->getOperand(0);
14562 auto NarrowVT = NarrowOp.getValueType();
14563 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
14564 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14565 NarrowVT = VTSign->getVT();
14566 }
14567 if (!NarrowVT.isByteSized())
14568 return std::nullopt;
14569 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
14570
14571 if (SrcIndex >= NarrowByteWidth)
14572 return std::nullopt;
14573 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14574 }
14575
14576 case ISD::SRA:
14577 case ISD::SRL: {
14578 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14579 if (!ShiftOp)
14580 return std::nullopt;
14581
14582 uint64_t BitShift = ShiftOp->getZExtValue();
14583
14584 if (BitShift % 8 != 0)
14585 return std::nullopt;
14586
14587 SrcIndex += BitShift / 8;
14588
14589 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14590 }
14591
14592 default: {
14593 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
14594 }
14595 }
14596 llvm_unreachable("fully handled switch");
14597}
14598
14599// For a byte position in the result of an Or, traverse the tree and find the
14600// node (and the byte of the node) which ultimately provides this {Or,
14601// BytePosition}. \p Op is the operand we are currently examining. \p Index is
14602// the byte position of the Op that corresponds with the originally requested
14603// byte of the Or. \p Depth tracks how many recursive iterations we have
14604// performed. \p StartingIndex is the originally requested byte of the Or
14605static const std::optional<ByteProvider<SDValue>>
14606calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
14607 unsigned StartingIndex = 0) {
14608 // Finding Src tree of RHS of or typically requires at least 1 additional
14609 // depth
14610 if (Depth > 6)
14611 return std::nullopt;
14612
14613 unsigned BitWidth = Op.getScalarValueSizeInBits();
14614 if (BitWidth % 8 != 0)
14615 return std::nullopt;
14616 if (Index > BitWidth / 8 - 1)
14617 return std::nullopt;
14618
14619 bool IsVec = Op.getValueType().isVector();
14620 switch (Op.getOpcode()) {
14621 case ISD::OR: {
14622 if (IsVec)
14623 return std::nullopt;
14624
14625 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
14626 StartingIndex);
14627 if (!RHS)
14628 return std::nullopt;
14629 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14630 StartingIndex);
14631 if (!LHS)
14632 return std::nullopt;
14633 // A well formed Or will have two ByteProviders for each byte, one of which
14634 // is constant zero
14635 if (!LHS->isConstantZero() && !RHS->isConstantZero())
14636 return std::nullopt;
14637 if (!LHS || LHS->isConstantZero())
14638 return RHS;
14639 if (!RHS || RHS->isConstantZero())
14640 return LHS;
14641 return std::nullopt;
14642 }
14643
14644 case ISD::AND: {
14645 if (IsVec)
14646 return std::nullopt;
14647
14648 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14649 if (!BitMaskOp)
14650 return std::nullopt;
14651
14652 uint32_t BitMask = BitMaskOp->getZExtValue();
14653 // Bits we expect for our StartingIndex
14654 uint32_t IndexMask = 0xFF << (Index * 8);
14655
14656 if ((IndexMask & BitMask) != IndexMask) {
14657 // If the result of the and partially provides the byte, then it
14658 // is not well formatted
14659 if (IndexMask & BitMask)
14660 return std::nullopt;
14661 return ByteProvider<SDValue>::getConstantZero();
14662 }
14663
14664 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
14665 }
14666
14667 case ISD::FSHR: {
14668 if (IsVec)
14669 return std::nullopt;
14670
14671 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
14672 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14673 if (!ShiftOp || Op.getValueType().isVector())
14674 return std::nullopt;
14675
14676 uint64_t BitsProvided = Op.getValueSizeInBits();
14677 if (BitsProvided % 8 != 0)
14678 return std::nullopt;
14679
14680 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14681 if (BitShift % 8)
14682 return std::nullopt;
14683
14684 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14685 uint64_t ByteShift = BitShift / 8;
14686
14687 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14688 uint64_t BytesProvided = BitsProvided / 8;
14689 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14690 NewIndex %= BytesProvided;
14691 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
14692 }
14693
14694 case ISD::SRA:
14695 case ISD::SRL: {
14696 if (IsVec)
14697 return std::nullopt;
14698
14699 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14700 if (!ShiftOp)
14701 return std::nullopt;
14702
14703 uint64_t BitShift = ShiftOp->getZExtValue();
14704 if (BitShift % 8)
14705 return std::nullopt;
14706
14707 auto BitsProvided = Op.getScalarValueSizeInBits();
14708 if (BitsProvided % 8 != 0)
14709 return std::nullopt;
14710
14711 uint64_t BytesProvided = BitsProvided / 8;
14712 uint64_t ByteShift = BitShift / 8;
14713 // The dest of shift will have good [0 : (BytesProvided - ByteShift)] bytes.
14714 // If the byte we are trying to provide (as tracked by index) falls in this
14715 // range, then the SRL provides the byte. The byte of interest of the src of
14716 // the SRL is Index + ByteShift
14717 return BytesProvided - ByteShift > Index
14718 ? calculateSrcByte(Op->getOperand(0), StartingIndex,
14719 Index + ByteShift)
14720 : ByteProvider<SDValue>::getConstantZero();
14721 }
14722
14723 case ISD::SHL: {
14724 if (IsVec)
14725 return std::nullopt;
14726
14727 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14728 if (!ShiftOp)
14729 return std::nullopt;
14730
14731 uint64_t BitShift = ShiftOp->getZExtValue();
14732 if (BitShift % 8 != 0)
14733 return std::nullopt;
14734 uint64_t ByteShift = BitShift / 8;
14735
14736 // If we are shifting by an amount greater than (or equal to)
14737 // the index we are trying to provide, then it provides 0s. If not,
14738 // then the byte is not definitively 0, and the corresponding byte
14739 // of interest is Index - ByteShift of the src.
14740 return Index < ByteShift
14741 ? ByteProvider<SDValue>::getConstantZero()
14742 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
14743 Depth + 1, StartingIndex);
14744 }
14745 case ISD::ANY_EXTEND:
14746 case ISD::SIGN_EXTEND:
14747 case ISD::ZERO_EXTEND:
14748 case ISD::SIGN_EXTEND_INREG:
14749 case ISD::AssertZext:
14750 case ISD::AssertSext: {
14751 if (IsVec)
14752 return std::nullopt;
14753
14754 SDValue NarrowOp = Op->getOperand(0);
14755 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14756 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
14757 Op->getOpcode() == ISD::AssertZext ||
14758 Op->getOpcode() == ISD::AssertSext) {
14759 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14760 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14761 }
14762 if (NarrowBitWidth % 8 != 0)
14763 return std::nullopt;
14764 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14765
14766 if (Index >= NarrowByteWidth)
14767 return Op.getOpcode() == ISD::ZERO_EXTEND
14768 ? std::optional<ByteProvider<SDValue>>(
14769 ByteProvider<SDValue>::getConstantZero())
14770 : std::nullopt;
14771 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
14772 }
14773
14774 case ISD::TRUNCATE: {
14775 if (IsVec)
14776 return std::nullopt;
14777
14778 uint64_t NarrowByteWidth = BitWidth / 8;
14779
14780 if (NarrowByteWidth >= Index) {
14781 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14782 StartingIndex);
14783 }
14784
14785 return std::nullopt;
14786 }
14787
14788 case ISD::CopyFromReg: {
14789 if (BitWidth / 8 > Index)
14790 return calculateSrcByte(Op, StartingIndex, Index);
14791
14792 return std::nullopt;
14793 }
14794
14795 case ISD::LOAD: {
14796 auto *L = cast<LoadSDNode>(Op.getNode());
14797
14798 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
14799 if (NarrowBitWidth % 8 != 0)
14800 return std::nullopt;
14801 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14802
14803 // If the width of the load does not reach the byte we are trying to provide
14804 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
14805 // question
14806 if (Index >= NarrowByteWidth) {
14807 return L->getExtensionType() == ISD::ZEXTLOAD
14808 ? std::optional<ByteProvider<SDValue>>(
14809 ByteProvider<SDValue>::getConstantZero())
14810 : std::nullopt;
14811 }
14812
14813 if (NarrowByteWidth > Index) {
14814 return calculateSrcByte(Op, StartingIndex, Index);
14815 }
14816
14817 return std::nullopt;
14818 }
14819
14820 case ISD::BSWAP: {
14821 if (IsVec)
14822 return std::nullopt;
14823
14824 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
14825 Depth + 1, StartingIndex);
14826 }
14827
14828 case ISD::EXTRACT_VECTOR_ELT: {
14829 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14830 if (!IdxOp)
14831 return std::nullopt;
14832 auto VecIdx = IdxOp->getZExtValue();
14833 auto ScalarSize = Op.getScalarValueSizeInBits();
14834 if (ScalarSize < 32)
14835 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
14836 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
14837 StartingIndex, Index);
14838 }
14839
14840 case AMDGPUISD::PERM: {
14841 if (IsVec)
14842 return std::nullopt;
14843
14844 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14845 if (!PermMask)
14846 return std::nullopt;
14847
14848 auto IdxMask =
14849 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
14850 if (IdxMask > 0x07 && IdxMask != 0x0c)
14851 return std::nullopt;
14852
14853 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
14854 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
14855
14856 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
14857 : ByteProvider<SDValue>(
14858 ByteProvider<SDValue>::getConstantZero());
14859 }
14860
14861 default: {
14862 return std::nullopt;
14863 }
14864 }
14865
14866 llvm_unreachable("fully handled switch");
14867}
14868
14869// Returns true if \p Operand is a scalar extended from, or loaded as, 16 bits
14870static bool isExtendedFrom16Bits(SDValue &Operand) {
14871
14872 switch (Operand.getOpcode()) {
14873 case ISD::ANY_EXTEND:
14874 case ISD::SIGN_EXTEND:
14875 case ISD::ZERO_EXTEND: {
14876 auto OpVT = Operand.getOperand(0).getValueType();
14877 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
14878 }
14879 case ISD::LOAD: {
14880 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
14881 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
14882 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
14883 ExtType == ISD::EXTLOAD) {
14884 auto MemVT = L->getMemoryVT();
14885 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
14886 }
14887 return L->getMemoryVT().getSizeInBits() == 16;
14888 }
14889 default:
14890 return false;
14891 }
14892}
14893
14894// Returns true if the mask matches consecutive bytes, and the first byte
14895// begins at an even (16-bit aligned) byte offset from the 0th byte
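// For example, mask 0x0504 (bytes 4-5 of the concatenated sources) can be
// addressed as a single 16-bit word, while 0x0201 (bytes 1-2) cannot.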
14896static bool addresses16Bits(int Mask) {
14897 int Low8 = Mask & 0xff;
14898 int Hi8 = (Mask & 0xff00) >> 8;
14899
14900 assert(Low8 < 8 && Hi8 < 8);
14901 // Are the bytes contiguous in the order of increasing addresses.
14902 bool IsConsecutive = (Hi8 - Low8 == 1);
14903 // Is the first byte at a location that is aligned for 16 bit instructions.
14904 // A counter example is taking 2 consecutive bytes starting at the 8th bit.
14905 // In this case, we still need code to extract the 16 bit operand, so it
14906 // is better to use i8 v_perm
14907 bool Is16Aligned = !(Low8 % 2);
14908
14909 return IsConsecutive && Is16Aligned;
14910}
14911
14912// Do not lower into v_perm if the operands are actually 16 bit
14913// and the selected bits (based on PermMask) correspond with two
14914// easily addressable 16 bit operands.
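// For example, PermMask 0x05040100 takes the low 16 bits of OtherOp and the
// low 16 bits of Op as whole words, so a v_perm is not needed.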
14915static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op,
14916 SDValue &OtherOp) {
14917 int Low16 = PermMask & 0xffff;
14918 int Hi16 = (PermMask & 0xffff0000) >> 16;
14919
14920 auto TempOp = peekThroughBitcasts(Op);
14921 auto TempOtherOp = peekThroughBitcasts(OtherOp);
14922
14923 auto OpIs16Bit =
14924 TempOtherOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
14925 if (!OpIs16Bit)
14926 return true;
14927
14928 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
14929 isExtendedFrom16Bits(TempOtherOp);
14930 if (!OtherOpIs16Bit)
14931 return true;
14932
14933 // Do we cleanly address both 16-bit halves?
14934 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
14935}
14936
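// Extract the 32-bit dword at dword index \p DWordOffset from \p Src and
// return it as an i32.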
14937static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src,
14938 unsigned DWordOffset) {
14939 SDValue Ret;
14940
14941 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
14942 // ByteProvider must be at least 8 bits
14943 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
14944
14945 if (TypeSize <= 32)
14946 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
14947
14948 if (Src.getValueType().isVector()) {
14949 auto ScalarTySize = Src.getScalarValueSizeInBits();
14950 auto ScalarTy = Src.getValueType().getScalarType();
14951 if (ScalarTySize == 32) {
14952 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
14953 DAG.getConstant(DWordOffset, SL, MVT::i32));
14954 }
14955 if (ScalarTySize > 32) {
14956 Ret = DAG.getNode(
14957 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
14958 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
14959 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
14960 if (ShiftVal)
14961 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
14962 DAG.getConstant(ShiftVal, SL, MVT::i32));
14963 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14964 }
14965
14966 assert(ScalarTySize < 32);
14967 auto NumElements = TypeSize / ScalarTySize;
14968 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
14969 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
14970 auto NumElementsIn32 = 32 / ScalarTySize;
14971 auto NumAvailElements = DWordOffset < Trunc32Elements
14972 ? NumElementsIn32
14973 : NumElements - NormalizedTrunc;
14974
14975 SmallVector<SDValue, 4> VecSrcs;
14976 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
14977 NumAvailElements);
14978
14979 Ret = DAG.getBuildVector(
14980 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
14981 VecSrcs);
14982 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14983 }
14984
14985 /// Scalar Type
14986 auto ShiftVal = 32 * DWordOffset;
14987 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
14988 DAG.getConstant(ShiftVal, SL, MVT::i32));
14989 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
14990}
14991
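// Try to express the i32 value produced by \p N as a single v_perm of at most
// two 32-bit sources, using calculateByteProvider to find the origin of each
// of the four result bytes.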
14992static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
14993 SelectionDAG &DAG = DCI.DAG;
14994 [[maybe_unused]] EVT VT = N->getValueType(0);
14995 SmallVector<ByteProvider<SDValue>, 8> PermNodes;
14996
14997 // VT is known to be MVT::i32, so we need to provide 4 bytes.
14998 assert(VT == MVT::i32);
14999 for (int i = 0; i < 4; i++) {
15000 // Find the ByteProvider that provides the ith byte of the result of OR
15001 std::optional<ByteProvider<SDValue>> P =
15002 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
15003 // TODO support constantZero
15004 if (!P || P->isConstantZero())
15005 return SDValue();
15006
15007 PermNodes.push_back(*P);
15008 }
15009 if (PermNodes.size() != 4)
15010 return SDValue();
15011
15012 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
15013 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
15014 uint64_t PermMask = 0x00000000;
15015 for (size_t i = 0; i < PermNodes.size(); i++) {
15016 auto PermOp = PermNodes[i];
15017 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
15018 // by sizeof(Src2) = 4
15019 int SrcByteAdjust = 4;
15020
15021 // If the Src uses a byte from a different DWORD, then it corresponds
15022 // with a different source
15023 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
15024 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
15025 if (SecondSrc)
15026 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
15027 ((PermOp.SrcOffset / 4) != SecondSrc->second))
15028 return SDValue();
15029
15030 // Set the index of the second distinct Src node
15031 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
15032 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
15033 SrcByteAdjust = 0;
15034 }
15035 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
15037 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
15038 }
15039 SDLoc DL(N);
15040 SDValue Op = *PermNodes[FirstSrc.first].Src;
15041 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
15042 assert(Op.getValueSizeInBits() == 32);
15043
15044 // Check that we are not just extracting the bytes in order from an op
15045 if (!SecondSrc) {
15046 int Low16 = PermMask & 0xffff;
15047 int Hi16 = (PermMask & 0xffff0000) >> 16;
15048
15049 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
15050 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
15051
15052 // The perm op would really just produce Op. So combine into Op
15053 if (WellFormedLow && WellFormedHi)
15054 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
15055 }
15056
15057 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
15058
15059 if (SecondSrc) {
15060 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
15061 assert(OtherOp.getValueSizeInBits() == 32);
15062 }
15063
15064 // Check that we haven't just recreated the same FSHR node.
15065 if (N->getOpcode() == ISD::FSHR &&
15066 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
15067 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
15068 return SDValue();
15069
15070 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
15071
15072 assert(Op.getValueType().isByteSized() &&
15073 OtherOp.getValueType().isByteSized());
15074
15075 // If the ultimate src is less than 32 bits, then we will only be
15076 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
15077 // CalculateByteProvider would not have returned Op as source if we
15078 // used a byte that is outside its ValueType. Thus, we are free to
15079 // ANY_EXTEND as the extended bits are don't-cares.
15080 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
15081 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
15082
15083 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
15084 DAG.getConstant(PermMask, DL, MVT::i32));
15085 }
15086 return SDValue();
15087}
15088
15089SDValue SITargetLowering::performOrCombine(SDNode *N,
15090 DAGCombinerInfo &DCI) const {
15091 SelectionDAG &DAG = DCI.DAG;
15092 SDValue LHS = N->getOperand(0);
15093 SDValue RHS = N->getOperand(1);
15094
15095 EVT VT = N->getValueType(0);
15096 if (VT == MVT::i1) {
15097 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
15098 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
15099 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
15100 SDValue Src = LHS.getOperand(0);
15101 if (Src != RHS.getOperand(0))
15102 return SDValue();
15103
15104 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
15105 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
15106 if (!CLHS || !CRHS)
15107 return SDValue();
15108
15109 // Only 10 bits are used.
15110 static const uint32_t MaxMask = 0x3ff;
15111
15112 uint32_t NewMask =
15113 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
15114 SDLoc DL(N);
15115 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
15116 DAG.getConstant(NewMask, DL, MVT::i32));
15117 }
15118
15119 return SDValue();
15120 }
15121
15122 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
15123 if (isa<ConstantSDNode>(RHS) && LHS.hasOneUse() &&
15124 LHS.getOpcode() == AMDGPUISD::PERM &&
15125 isa<ConstantSDNode>(LHS.getOperand(2))) {
15126 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
15127 if (!Sel)
15128 return SDValue();
15129
15130 Sel |= LHS.getConstantOperandVal(2);
15131 SDLoc DL(N);
15132 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
15133 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
15134 }
15135
15136 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
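// For example, or (srl x, 8), (shl y, 24) has masks 0x0c030201 and 0x000c0c0c;
// after canonicalization this becomes perm y, x, 0x04030201, i.e. byte 3 of
// the result is byte 0 of y and bytes 2-0 are bytes 3-1 of x.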
15137 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15138 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
15139 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15140
15141 // If all the uses of an or need to extract the individual elements, do not
15142 // attempt to lower into v_perm
15143 auto usesCombinedOperand = [](SDNode *OrUse) {
15144 // If we have any non-vectorized use, then it is a candidate for v_perm
15145 if (OrUse->getOpcode() != ISD::BITCAST ||
15146 !OrUse->getValueType(0).isVector())
15147 return true;
15148
15149 // Otherwise check how the users of the vectorized bitcast use the value.
15150 for (auto *VUser : OrUse->users()) {
15151 if (!VUser->getValueType(0).isVector())
15152 return true;
15153
15154 // If the use of a vector is a store, then combining via a v_perm
15155 // is beneficial.
15156 // TODO -- whitelist more uses
15157 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
15158 if (VUser->getOpcode() == VectorwiseOp)
15159 return true;
15160 }
15161 return false;
15162 };
15163
15164 if (!any_of(N->users(), usesCombinedOperand))
15165 return SDValue();
15166
15167 uint32_t LHSMask = getPermuteMask(LHS);
15168 uint32_t RHSMask = getPermuteMask(RHS);
15169
15170 if (LHSMask != ~0u && RHSMask != ~0u) {
15171 // Canonicalize the expression in an attempt to have fewer unique masks
15172 // and therefore fewer registers used to hold the masks.
15173 if (LHSMask > RHSMask) {
15174 std::swap(LHSMask, RHSMask);
15175 std::swap(LHS, RHS);
15176 }
15177
15178 // Select 0xc for each lane used from the source operand: a zero byte has
15179 // 0xc in its mask, a 0xff byte has 0xff, and actual lanes use selectors 0-3.
15180 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15181 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15182
15183 // Check if we need to combine values from two sources within a byte.
15184 if (!(LHSUsedLanes & RHSUsedLanes) &&
15185 // If we select high and lower word keep it for SDWA.
15186 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
15187 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
15188 // Kill zero bytes selected by other mask. Zero value is 0xc.
15189 LHSMask &= ~RHSUsedLanes;
15190 RHSMask &= ~LHSUsedLanes;
15191 // Add 4 to each active LHS lane
15192 LHSMask |= LHSUsedLanes & 0x04040404;
15193 // Combine masks
15194 uint32_t Sel = LHSMask | RHSMask;
15195 SDLoc DL(N);
15196
15197 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
15198 RHS.getOperand(0),
15199 DAG.getConstant(Sel, DL, MVT::i32));
15200 }
15201 }
15202 if (LHSMask == ~0u || RHSMask == ~0u) {
15203 if (SDValue Perm = matchPERM(N, DCI))
15204 return Perm;
15205 }
15206 }
15207
15208 // Detect identity v2i32 OR and replace with identity source node.
15209 // Specifically an Or that has operands constructed from the same source node
15210 // via extract_vector_elt and build_vector. I.E.
15211 // v2i32 or(
15212 // v2i32 build_vector(
15213 // i32 extract_elt(%IdentitySrc, 0),
15214 // i32 0
15215 // ),
15216 // v2i32 build_vector(
15217 // i32 0,
15218 // i32 extract_elt(%IdentitySrc, 1)
15219 // ) )
15220 // =>
15221 // v2i32 %IdentitySrc
15222
15223 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
15224 RHS->getOpcode() == ISD::BUILD_VECTOR) {
15225
15226 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
15227 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
15228
15229 // Test for and normalise build vectors.
15230 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
15231
15232 // Get the extract_vector_element operands.
15233 SDValue LEVE = LHS->getOperand(0);
15234 SDValue REVE = RHS->getOperand(1);
15235
15236 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15237 REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
15238 // Check that different elements from the same vector are
15239 // extracted.
15240 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
15241 LEVE->getOperand(1) != REVE->getOperand(1)) {
15242 SDValue IdentitySrc = LEVE.getOperand(0);
15243 return IdentitySrc;
15244 }
15245 }
15246 }
15247 }
15248
15249 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
15250 return SDValue();
15251
15252 // TODO: This could be a generic combine with a predicate for extracting the
15253 // high half of an integer being free.
15254
15255 // (or i64:x, (zero_extend i32:y)) ->
15256 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
15257 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
15258 RHS.getOpcode() != ISD::ZERO_EXTEND)
15259 std::swap(LHS, RHS);
15260
15261 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
15262 SDValue ExtSrc = RHS.getOperand(0);
15263 EVT SrcVT = ExtSrc.getValueType();
15264 if (SrcVT == MVT::i32) {
15265 SDLoc SL(N);
15266 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
15267 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
15268
15269 DCI.AddToWorklist(LowOr.getNode());
15270 DCI.AddToWorklist(HiBits.getNode());
15271
15272 SDValue Vec =
15273 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
15274 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
15275 }
15276 }
15277
15278 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
15279 if (CRHS) {
15280 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
15281 N->getOperand(0), CRHS))
15282 return Split;
15283 }
15284
15285 return SDValue();
15286}
15287
15288SDValue SITargetLowering::performXorCombine(SDNode *N,
15289 DAGCombinerInfo &DCI) const {
15290 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
15291 return RV;
15292
15293 SDValue LHS = N->getOperand(0);
15294 SDValue RHS = N->getOperand(1);
15295
15296 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
15297 SelectionDAG &DAG = DCI.DAG;
15298
15299 EVT VT = N->getValueType(0);
15300 if (CRHS && VT == MVT::i64) {
15301 if (SDValue Split =
15302 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
15303 return Split;
15304 }
15305
15306 // v2i32 (xor (vselect cc, x, y), K) ->
15307 // (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
15308 // replaced with source modifiers when the select is lowered to CNDMASK.
15309 unsigned Opc = LHS.getOpcode();
15310 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
15311 (Opc == ISD::SELECT && VT == MVT::i64)) &&
15312 CRHS && CRHS->getAPIntValue().isSignMask()) {
15313 SDValue CC = LHS->getOperand(0);
15314 SDValue TRUE = LHS->getOperand(1);
15315 SDValue FALSE = LHS->getOperand(2);
15316 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
15317 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
15318 SDValue XSelect =
15319 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
15320 return XSelect;
15321 }
15322
15323 // Make sure to apply the 64-bit constant splitting fold before trying to fold
15324 // fneg-like xors into 64-bit select.
15325 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
15326 // This looks like an fneg, try to fold as a source modifier.
15327 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
15328 shouldFoldFNegIntoSrc(N, LHS)) {
15329 // xor (select c, a, b), 0x80000000 ->
15330 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
15331 SDLoc DL(N);
15332 SDValue CastLHS =
15333 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
15334 SDValue CastRHS =
15335 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
15336 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
15337 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
15338 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
15339 LHS->getOperand(0), FNegLHS, FNegRHS);
15340 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
15341 }
15342 }
15343
15344 return SDValue();
15345}
15346
15347SDValue
15348SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
15349 DAGCombinerInfo &DCI) const {
15350 if (!Subtarget->has16BitInsts() ||
15351 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
15352 return SDValue();
15353
15354 EVT VT = N->getValueType(0);
15355 if (VT != MVT::i32)
15356 return SDValue();
15357
15358 SDValue Src = N->getOperand(0);
15359 if (Src.getValueType() != MVT::i16)
15360 return SDValue();
15361
15362 if (!Src->hasOneUse())
15363 return SDValue();
15364
15365 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
15366 // possible we're missing out on some combine opportunities, but we'd need to
15367 // weigh the cost of extracting the byte from the upper dwords.
15368
15369 std::optional<ByteProvider<SDValue>> BP0 =
15370 calculateByteProvider(SDValue(N, 0), 0, 0, 0);
15371 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
15372 return SDValue();
15373 SDValue V0 = *BP0->Src;
15374
15375 std::optional<ByteProvider<SDValue>> BP1 =
15376 calculateByteProvider(SDValue(N, 0), 1, 0, 1);
15377 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
15378 return SDValue();
15379
15380 SDValue V1 = *BP1->Src;
15381
15382 if (V0 == V1)
15383 return SDValue();
15384
15385 SelectionDAG &DAG = DCI.DAG;
15386 SDLoc DL(N);
15387 uint32_t PermMask = 0x0c0c0c0c;
15388 if (V0) {
15389 V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32);
15390 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
15391 }
15392
15393 if (V1) {
15394 V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32);
15395 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
15396 }
15397
15398 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
15399 DAG.getConstant(PermMask, DL, MVT::i32));
15400}
15401
15402SDValue
15403SITargetLowering::performSignExtendInRegCombine(SDNode *N,
15404 DAGCombinerInfo &DCI) const {
15405 SDValue Src = N->getOperand(0);
15406 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
15407
15408 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
15409 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
15410 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
15411 VTSign->getVT() == MVT::i8) ||
15412 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
15413 VTSign->getVT() == MVT::i16))) {
15414 assert(Subtarget->hasScalarSubwordLoads() &&
15415 "s_buffer_load_{u8, i8} are supported "
15416 "in GFX12 (or newer) architectures.");
15417 EVT VT = Src.getValueType();
15418 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
15419 ? AMDGPUISD::SBUFFER_LOAD_BYTE
15420 : AMDGPUISD::SBUFFER_LOAD_SHORT;
15421 SDLoc DL(N);
15422 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
15423 SDValue Ops[] = {
15424 Src.getOperand(0), // source register
15425 Src.getOperand(1), // offset
15426 Src.getOperand(2) // cachePolicy
15427 };
15428 auto *M = cast<MemSDNode>(Src);
15429 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
15430 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15431 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
15432 return LoadVal;
15433 }
15434 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
15435 VTSign->getVT() == MVT::i8) ||
15436 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
15437 VTSign->getVT() == MVT::i16)) &&
15438 Src.hasOneUse()) {
15439 auto *M = cast<MemSDNode>(Src);
15440 SDValue Ops[] = {Src.getOperand(0), // Chain
15441 Src.getOperand(1), // rsrc
15442 Src.getOperand(2), // vindex
15443 Src.getOperand(3), // voffset
15444 Src.getOperand(4), // soffset
15445 Src.getOperand(5), // offset
15446 Src.getOperand(6), Src.getOperand(7)};
15447 // replace with BUFFER_LOAD_BYTE/SHORT
15448 SDVTList ResList =
15449 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
15450 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
15451 ? AMDGPUISD::BUFFER_LOAD_BYTE
15452 : AMDGPUISD::BUFFER_LOAD_SHORT;
15453 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
15454 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15455 return DCI.DAG.getMergeValues(
15456 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
15457 }
15458 return SDValue();
15459}
15460
15461SDValue SITargetLowering::performClassCombine(SDNode *N,
15462 DAGCombinerInfo &DCI) const {
15463 SelectionDAG &DAG = DCI.DAG;
15464 SDValue Mask = N->getOperand(1);
15465
15466 // fp_class x, 0 -> false
15467 if (isNullConstant(Mask))
15468 return DAG.getConstant(0, SDLoc(N), MVT::i1);
15469
15470 if (N->getOperand(0).isUndef())
15471 return DAG.getUNDEF(MVT::i1);
15472
15473 return SDValue();
15474}
15475
15476SDValue SITargetLowering::performRcpCombine(SDNode *N,
15477 DAGCombinerInfo &DCI) const {
15478 EVT VT = N->getValueType(0);
15479 SDValue N0 = N->getOperand(0);
15480
15481 if (N0.isUndef()) {
15482 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
15483 SDLoc(N), VT);
15484 }
15485
15486 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
15487 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
15488 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
15489 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
15490 N->getFlags());
15491 }
15492
15493 return AMDGPUTargetLowering::performRcpCombine(N, DCI);
15494}
15495
15496bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
15497 SDNodeFlags UserFlags,
15498 unsigned MaxDepth) const {
15499 unsigned Opcode = Op.getOpcode();
15500 if (Opcode == ISD::FCANONICALIZE)
15501 return true;
15502
15503 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15504 const auto &F = CFP->getValueAPF();
15505 if (F.isNaN() && F.isSignaling())
15506 return false;
15507 if (!F.isDenormal())
15508 return true;
15509
15510 DenormalMode Mode =
15511 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
15512 return Mode == DenormalMode::getIEEE();
15513 }
15514
15515 // If source is a result of another standard FP operation it is already in
15516 // canonical form.
15517 if (MaxDepth == 0)
15518 return false;
15519
15520 switch (Opcode) {
15521 // These will flush denorms if required.
15522 case ISD::FADD:
15523 case ISD::FSUB:
15524 case ISD::FMUL:
15525 case ISD::FCEIL:
15526 case ISD::FFLOOR:
15527 case ISD::FMA:
15528 case ISD::FMAD:
15529 case ISD::FSQRT:
15530 case ISD::FDIV:
15531 case ISD::FREM:
15532 case ISD::FP_ROUND:
15533 case ISD::FP_EXTEND:
15534 case ISD::FP16_TO_FP:
15535 case ISD::FP_TO_FP16:
15536 case ISD::BF16_TO_FP:
15537 case ISD::FP_TO_BF16:
15538 case ISD::FLDEXP:
15539 case AMDGPUISD::FMUL_LEGACY:
15540 case AMDGPUISD::FMAD_FTZ:
15541 case AMDGPUISD::RCP:
15542 case AMDGPUISD::RSQ:
15543 case AMDGPUISD::RSQ_CLAMP:
15544 case AMDGPUISD::RCP_LEGACY:
15545 case AMDGPUISD::RCP_IFLAG:
15546 case AMDGPUISD::LOG:
15547 case AMDGPUISD::EXP:
15548 case AMDGPUISD::DIV_SCALE:
15549 case AMDGPUISD::DIV_FMAS:
15550 case AMDGPUISD::DIV_FIXUP:
15551 case AMDGPUISD::FRACT:
15552 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15553 case AMDGPUISD::CVT_F32_UBYTE0:
15554 case AMDGPUISD::CVT_F32_UBYTE1:
15555 case AMDGPUISD::CVT_F32_UBYTE2:
15556 case AMDGPUISD::CVT_F32_UBYTE3:
15557 case AMDGPUISD::FP_TO_FP16:
15558 case AMDGPUISD::SIN_HW:
15559 case AMDGPUISD::COS_HW:
15560 return true;
15561
15562 // It can/will be lowered or combined as a bit operation.
15563 // Need to check their input recursively to handle.
15564 case ISD::FNEG:
15565 case ISD::FABS:
15566 case ISD::FCOPYSIGN:
15567 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15568
15569 case ISD::AND:
15570 if (Op.getValueType() == MVT::i32) {
15571 // Be careful as we only know it is a bitcast floating point type. It
15572 // could be f32, v2f16, we have no way of knowing. Luckily the constant
15573 // value that we optimize for, which comes up in fp32 to bf16 conversions,
15574 // is valid to optimize for all types.
15575 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
15576 if (RHS->getZExtValue() == 0xffff0000) {
15577 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15578 }
15579 }
15580 }
15581 break;
15582
15583 case ISD::FSIN:
15584 case ISD::FCOS:
15585 case ISD::FSINCOS:
15586 return Op.getValueType().getScalarType() != MVT::f16;
15587
15588 case ISD::FMINNUM:
15589 case ISD::FMAXNUM:
15590 case ISD::FMINNUM_IEEE:
15591 case ISD::FMAXNUM_IEEE:
15592 case ISD::FMINIMUM:
15593 case ISD::FMAXIMUM:
15594 case ISD::FMINIMUMNUM:
15595 case ISD::FMAXIMUMNUM:
15596 case AMDGPUISD::CLAMP:
15597 case AMDGPUISD::FMED3:
15598 case AMDGPUISD::FMAX3:
15599 case AMDGPUISD::FMIN3:
15600 case AMDGPUISD::FMAXIMUM3:
15601 case AMDGPUISD::FMINIMUM3: {
15602 // FIXME: Shouldn't treat the generic operations differently based on these.
15603 // However, we aren't really required to flush the result from
15604 // minnum/maxnum..
15605
15606 // snans will be quieted, so we only need to worry about denormals.
15607 if (Subtarget->supportsMinMaxDenormModes() ||
15608 // FIXME: denormalsEnabledForType is broken for dynamic
15609 denormalsEnabledForType(DAG, Op.getValueType()))
15610 return true;
15611
15612 // Flushing may be required.
15613 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
15614 // targets we need to check their inputs recursively.
15615
15616 // FIXME: Does this apply with clamp? It's implemented with max.
15617 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
15618 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
15619 return false;
15620 }
15621
15622 return true;
15623 }
15624 case ISD::SELECT: {
15625 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
15626 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
15627 }
15628 case ISD::BUILD_VECTOR: {
15629 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
15630 SDValue SrcOp = Op.getOperand(i);
15631 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
15632 return false;
15633 }
15634
15635 return true;
15636 }
15637 case ISD::EXTRACT_VECTOR_ELT:
15638 case ISD::EXTRACT_SUBVECTOR: {
15639 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15640 }
15641 case ISD::INSERT_VECTOR_ELT: {
15642 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
15643 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
15644 }
15645 case ISD::UNDEF:
15646 // Could be anything.
15647 return false;
15648
15649 case ISD::BITCAST:
15650 // TODO: This is incorrect as it loses track of the operand's type. We may
15651 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
15652 // same bits that are canonicalized in one type need not be in the other.
15653 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15654 case ISD::TRUNCATE: {
15655 // Hack around the mess we make when legalizing extract_vector_elt
15656 if (Op.getValueType() == MVT::i16) {
15657 SDValue TruncSrc = Op.getOperand(0);
15658 if (TruncSrc.getValueType() == MVT::i32 &&
15659 TruncSrc.getOpcode() == ISD::BITCAST &&
15660 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
15661 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
15662 }
15663 }
15664 return false;
15665 }
15666 case ISD::INTRINSIC_WO_CHAIN: {
15667 unsigned IntrinsicID = Op.getConstantOperandVal(0);
15668 // TODO: Handle more intrinsics
15669 switch (IntrinsicID) {
15670 case Intrinsic::amdgcn_cvt_pkrtz:
15671 case Intrinsic::amdgcn_cubeid:
15672 case Intrinsic::amdgcn_frexp_mant:
15673 case Intrinsic::amdgcn_fdot2:
15674 case Intrinsic::amdgcn_rcp:
15675 case Intrinsic::amdgcn_rsq:
15676 case Intrinsic::amdgcn_rsq_clamp:
15677 case Intrinsic::amdgcn_rcp_legacy:
15678 case Intrinsic::amdgcn_rsq_legacy:
15679 case Intrinsic::amdgcn_trig_preop:
15680 case Intrinsic::amdgcn_tanh:
15681 case Intrinsic::amdgcn_log:
15682 case Intrinsic::amdgcn_exp2:
15683 case Intrinsic::amdgcn_sqrt:
15684 return true;
15685 default:
15686 break;
15687 }
15688
15689 break;
15690 }
15691 default:
15692 break;
15693 }
15694
15695 // FIXME: denormalsEnabledForType is broken for dynamic
15696 return denormalsEnabledForType(DAG, Op.getValueType()) &&
15697 (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
15698}
15699
15700bool SITargetLowering::isCanonicalized(Register Reg, const MachineFunction &MF,
15701 unsigned MaxDepth) const {
15702 const MachineRegisterInfo &MRI = MF.getRegInfo();
15703 MachineInstr *MI = MRI.getVRegDef(Reg);
15704 unsigned Opcode = MI->getOpcode();
15705
15706 if (Opcode == AMDGPU::G_FCANONICALIZE)
15707 return true;
15708
15709 std::optional<FPValueAndVReg> FCR;
15710 // Constant splat (can be padded with undef) or scalar constant.
15711 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
15712 if (FCR->Value.isSignaling())
15713 return false;
15714 if (!FCR->Value.isDenormal())
15715 return true;
15716
15717 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
15718 return Mode == DenormalMode::getIEEE();
15719 }
15720
15721 if (MaxDepth == 0)
15722 return false;
15723
15724 switch (Opcode) {
15725 case AMDGPU::G_FADD:
15726 case AMDGPU::G_FSUB:
15727 case AMDGPU::G_FMUL:
15728 case AMDGPU::G_FCEIL:
15729 case AMDGPU::G_FFLOOR:
15730 case AMDGPU::G_FRINT:
15731 case AMDGPU::G_FNEARBYINT:
15732 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15733 case AMDGPU::G_INTRINSIC_TRUNC:
15734 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15735 case AMDGPU::G_FMA:
15736 case AMDGPU::G_FMAD:
15737 case AMDGPU::G_FSQRT:
15738 case AMDGPU::G_FDIV:
15739 case AMDGPU::G_FREM:
15740 case AMDGPU::G_FPOW:
15741 case AMDGPU::G_FPEXT:
15742 case AMDGPU::G_FLOG:
15743 case AMDGPU::G_FLOG2:
15744 case AMDGPU::G_FLOG10:
15745 case AMDGPU::G_FPTRUNC:
15746 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15747 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15748 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15749 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15750 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15751 return true;
15752 case AMDGPU::G_FNEG:
15753 case AMDGPU::G_FABS:
15754 case AMDGPU::G_FCOPYSIGN:
15755 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
15756 case AMDGPU::G_FMINNUM:
15757 case AMDGPU::G_FMAXNUM:
15758 case AMDGPU::G_FMINNUM_IEEE:
15759 case AMDGPU::G_FMAXNUM_IEEE:
15760 case AMDGPU::G_FMINIMUM:
15761 case AMDGPU::G_FMAXIMUM:
15762 case AMDGPU::G_FMINIMUMNUM:
15763 case AMDGPU::G_FMAXIMUMNUM: {
15764 if (Subtarget->supportsMinMaxDenormModes() ||
15765 // FIXME: denormalsEnabledForType is broken for dynamic
15766 denormalsEnabledForType(MRI.getType(Reg), MF))
15767 return true;
15768
15769 [[fallthrough]];
15770 }
15771 case AMDGPU::G_BUILD_VECTOR:
15772 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
15773 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
15774 return false;
15775 return true;
15776 case AMDGPU::G_INTRINSIC:
15777 case AMDGPU::G_INTRINSIC_CONVERGENT:
15778 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15779 case Intrinsic::amdgcn_fmul_legacy:
15780 case Intrinsic::amdgcn_fmad_ftz:
15781 case Intrinsic::amdgcn_sqrt:
15782 case Intrinsic::amdgcn_fmed3:
15783 case Intrinsic::amdgcn_sin:
15784 case Intrinsic::amdgcn_cos:
15785 case Intrinsic::amdgcn_log:
15786 case Intrinsic::amdgcn_exp2:
15787 case Intrinsic::amdgcn_log_clamp:
15788 case Intrinsic::amdgcn_rcp:
15789 case Intrinsic::amdgcn_rcp_legacy:
15790 case Intrinsic::amdgcn_rsq:
15791 case Intrinsic::amdgcn_rsq_clamp:
15792 case Intrinsic::amdgcn_rsq_legacy:
15793 case Intrinsic::amdgcn_div_scale:
15794 case Intrinsic::amdgcn_div_fmas:
15795 case Intrinsic::amdgcn_div_fixup:
15796 case Intrinsic::amdgcn_fract:
15797 case Intrinsic::amdgcn_cvt_pkrtz:
15798 case Intrinsic::amdgcn_cubeid:
15799 case Intrinsic::amdgcn_cubema:
15800 case Intrinsic::amdgcn_cubesc:
15801 case Intrinsic::amdgcn_cubetc:
15802 case Intrinsic::amdgcn_frexp_mant:
15803 case Intrinsic::amdgcn_fdot2:
15804 case Intrinsic::amdgcn_trig_preop:
15805 case Intrinsic::amdgcn_tanh:
15806 return true;
15807 default:
15808 break;
15809 }
15810
15811 [[fallthrough]];
15812 default:
15813 return false;
15814 }
15815
15816 llvm_unreachable("invalid operation");
15817}
15818
15819// Constant fold canonicalize.
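// For example, with the preserve-sign denormal mode a positive denormal folds
// to +0.0, and a signaling NaN folds to the canonical quiet NaN.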
15820SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
15821 const SDLoc &SL, EVT VT,
15822 const APFloat &C) const {
15823 // Flush denormals to 0 if not enabled.
15824 if (C.isDenormal()) {
15825 DenormalMode Mode =
15826 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
15827 if (Mode == DenormalMode::getPreserveSign()) {
15828 return DAG.getConstantFP(
15829 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
15830 }
15831
15832 if (Mode != DenormalMode::getIEEE())
15833 return SDValue();
15834 }
15835
15836 if (C.isNaN()) {
15837 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
15838 if (C.isSignaling()) {
15839 // Quiet a signaling NaN.
15840 // FIXME: Is this supposed to preserve payload bits?
15841 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15842 }
15843
15844 // Make sure it is the canonical NaN bitpattern.
15845 //
15846 // TODO: Can we use -1 as the canonical NaN value since it's an inline
15847 // immediate?
15848 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
15849 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
15850 }
15851
15852 // Already canonical.
15853 return DAG.getConstantFP(C, SL, VT);
15854}
15855
15856static bool vectorEltWillFoldAway(SDValue Op) {
15857 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
15858}
15859
15860SDValue
15861SITargetLowering::performFCanonicalizeCombine(SDNode *N,
15862 DAGCombinerInfo &DCI) const {
15863 SelectionDAG &DAG = DCI.DAG;
15864 SDValue N0 = N->getOperand(0);
15865 EVT VT = N->getValueType(0);
15866
15867 // fcanonicalize undef -> qnan
15868 if (N0.isUndef()) {
15869 APFloat QNaN = APFloat::getQNaN(VT.getFltSemantics());
15870 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
15871 }
15872
15873 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
15874 EVT VT = N->getValueType(0);
15875 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
15876 }
15877
15878 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
15879 // (fcanonicalize k)
15880 //
15881 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
15882
15883 // TODO: This could be better with wider vectors that will be split to v2f16,
15884 // and to consider uses since there aren't that many packed operations.
15885 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
15886 isTypeLegal(MVT::v2f16)) {
15887 SDLoc SL(N);
15888 SDValue NewElts[2];
15889 SDValue Lo = N0.getOperand(0);
15890 SDValue Hi = N0.getOperand(1);
15891 EVT EltVT = Lo.getValueType();
15892
15893 if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
15894 for (unsigned I = 0; I != 2; ++I) {
15895 SDValue Op = N0.getOperand(I);
15896 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15897 NewElts[I] =
15898 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
15899 } else if (Op.isUndef()) {
15900 // Handled below based on what the other operand is.
15901 NewElts[I] = Op;
15902 } else {
15903 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
15904 }
15905 }
15906
15907 // If one half is undef, and one is constant, prefer a splat vector rather
15908 // than the normal qNaN. If it's a register, prefer 0.0 since that's
15909 // cheaper to use and may be free with a packed operation.
15910 if (NewElts[0].isUndef()) {
15911 if (isa<ConstantFPSDNode>(NewElts[1]))
15912 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
15913 ? NewElts[1]
15914 : DAG.getConstantFP(0.0f, SL, EltVT);
15915 }
15916
15917 if (NewElts[1].isUndef()) {
15918 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
15919 ? NewElts[0]
15920 : DAG.getConstantFP(0.0f, SL, EltVT);
15921 }
15922
15923 return DAG.getBuildVector(VT, SL, NewElts);
15924 }
15925 }
15926
15927 return SDValue();
15928}
15929
15930static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
15931 switch (Opc) {
15932 case ISD::FMAXNUM:
15933 case ISD::FMAXNUM_IEEE:
15934 case ISD::FMAXIMUMNUM:
15935 return AMDGPUISD::FMAX3;
15936 case ISD::FMAXIMUM:
15937 return AMDGPUISD::FMAXIMUM3;
15938 case ISD::SMAX:
15939 return AMDGPUISD::SMAX3;
15940 case ISD::UMAX:
15941 return AMDGPUISD::UMAX3;
15942 case ISD::FMINNUM:
15943 case ISD::FMINNUM_IEEE:
15944 case ISD::FMINIMUMNUM:
15945 return AMDGPUISD::FMIN3;
15946 case ISD::FMINIMUM:
15947 return AMDGPUISD::FMINIMUM3;
15948 case ISD::SMIN:
15949 return AMDGPUISD::SMIN3;
15950 case ISD::UMIN:
15951 return AMDGPUISD::UMIN3;
15952 default:
15953 llvm_unreachable("Not a min/max opcode");
15954 }
15955}
15956
15957SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
15958 const SDLoc &SL, SDValue Src,
15959 SDValue MinVal,
15960 SDValue MaxVal,
15961 bool Signed) const {
15962
15963 // med3 comes from
15964 // min(max(x, K0), K1), K0 < K1
15965 // max(min(x, K0), K1), K1 < K0
15966 //
15967 // "MinVal" and "MaxVal" respectively refer to the rhs of the
15968 // min/max op.
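// For example, (smin (smax x, 0), 255) becomes (smed3 x, 0, 255), clamping x
// to [0, 255].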
15969 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
15970 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
15971
15972 if (!MinK || !MaxK)
15973 return SDValue();
15974
15975 if (Signed) {
15976 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
15977 return SDValue();
15978 } else {
15979 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
15980 return SDValue();
15981 }
15982
15983 EVT VT = MinK->getValueType(0);
15984 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
15985 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
15986 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
15987
15988 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
15989 // not available, but this is unlikely to be profitable as constants
15990 // will often need to be materialized & extended, especially on
15991 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
15992 return SDValue();
15993}
15994
15995static ConstantFPSDNode *getSplatConstantFP(SDValue Op) {
15996 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
15997 return C;
15998
15999 if (auto *BV = dyn_cast<BuildVectorSDNode>(Op)) {
16000 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
16001 return C;
16002 }
16003
16004 return nullptr;
16005}
16006
16007SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
16008 const SDLoc &SL, SDValue Op0,
16009 SDValue Op1,
16010 bool IsKnownNoNaNs) const {
16011 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
16012 if (!K1)
16013 return SDValue();
16014
16015 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
16016 if (!K0)
16017 return SDValue();
16018
16019 // Ordered >= (although NaN inputs should have folded away by now).
16020 if (K0->getValueAPF() > K1->getValueAPF())
16021 return SDValue();
16022
16023 // med3 with a nan input acts like
16024 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
16025 //
16026 // So the result depends on whether the IEEE mode bit is enabled or not with a
16027 // signaling nan input.
16028 // ieee=1
16029 // s0 snan: yields s2
16030 // s1 snan: yields s2
16031 // s2 snan: qnan
16032
16033 // s0 qnan: min(s1, s2)
16034 // s1 qnan: min(s0, s2)
16035 // s2 qnan: min(s0, s1)
16036
16037 // ieee=0
16038 // s0 snan: min(s1, s2)
16039 // s1 snan: min(s0, s2)
16040 // s2 snan: qnan
16041
16042 // s0 qnan: min(s1, s2)
16043 // s1 qnan: min(s0, s2)
16044 // s2 qnan: min(s0, s1)
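// In short: with DX10_CLAMP enabled, NaN inputs clamp to 0.0, so forming
// CLAMP below is safe even for signaling NaNs; the general fmed3 formation
// further down still has to prove the variable operand is not a signaling
// NaN (or that the expression is known NaN-free).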
16045 const MachineFunction &MF = DAG.getMachineFunction();
16046 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16047
16048 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
16049 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
16050 // can only form it from fmaxnum_ieee when IEEE=1.
16051 EVT VT = Op0.getValueType();
16052 if (Info->getMode().DX10Clamp) {
16053 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
16054 // hardware fmed3 behavior converting to a min.
16055 // FIXME: Should this be allowing -0.0?
16056 if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
16057 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
16058 }
16059
16060 // med3 for f16 is only available on gfx9+, and not available for v2f16.
16061 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
16062 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
16063 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
16064 // then give the other result, which is different from med3 with a NaN
16065 // input.
16066 SDValue Var = Op0.getOperand(0);
16067 if (!IsKnownNoNaNs && !DAG.isKnownNeverSNaN(Var))
16068 return SDValue();
16069
16070 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16071
16072 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
16073 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
16074 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
16075 SDValue(K0, 0), SDValue(K1, 0));
16076 }
16077 }
16078
16079 return SDValue();
16080}
16081
16082/// \return true if the subtarget supports minimum3 and maximum3 with the given
16083/// base min/max opcode \p Opc for type \p VT.
16084static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
16085 EVT VT) {
16086 switch (Opc) {
16087 case ISD::FMINNUM:
16088 case ISD::FMAXNUM:
16089 case ISD::FMINNUM_IEEE:
16090 case ISD::FMAXNUM_IEEE:
16091 case ISD::FMINIMUMNUM:
16092 case ISD::FMAXIMUMNUM:
16093 case AMDGPUISD::FMIN_LEGACY:
16094 case AMDGPUISD::FMAX_LEGACY:
16095 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
16096 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
16097 case ISD::FMINIMUM:
16098 case ISD::FMAXIMUM:
16099 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
16100 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
16101 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
16102 case ISD::SMAX:
16103 case ISD::SMIN:
16104 case ISD::UMAX:
16105 case ISD::UMIN:
16106 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
16107 default:
16108 return false;
16109 }
16110
16111 llvm_unreachable("not a min/max opcode");
16112}
16113
16114SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
16115 DAGCombinerInfo &DCI) const {
16116 SelectionDAG &DAG = DCI.DAG;
16117
16118 EVT VT = N->getValueType(0);
16119 unsigned Opc = N->getOpcode();
16120 SDValue Op0 = N->getOperand(0);
16121 SDValue Op1 = N->getOperand(1);
16122
16123 // Only do this if the inner op has one use since this will just increase
16124 // register pressure for no benefit.
16125
16126 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
16127 auto IsTreeWithCombinableChildren = [Opc](SDValue Op) {
16128 return (Op.getOperand(0).getOpcode() == Opc &&
16129 Op.getOperand(0).hasOneUse()) ||
16130 (Op.getOperand(1).getOpcode() == Opc &&
16131 Op.getOperand(1).hasOneUse());
16132 };
16133
16134 bool CanTreeCombineApply = Op0.getOpcode() == Opc && Op0.hasOneUse() &&
16135 Op1.getOpcode() == Opc && Op1.hasOneUse();
16136 bool HasCombinableTreeChild =
16137 CanTreeCombineApply && (IsTreeWithCombinableChildren(Op0) ||
16138 IsTreeWithCombinableChildren(Op1));
16139
16140 // Tree reduction: when both operands are the same min/max op, restructure
16141 // to keep a 2-op node on top so higher tree levels can still combine.
16142 //
16143 // max(max(a, b), max(c, d)) -> max(max3(a, b, c), d)
16144 // min(min(a, b), min(c, d)) -> min(min3(a, b, c), d)
16145 //
16146 // Defer when either inner op is a tree node with combinable children.
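// For example (illustrative only), max(max(a, b), max(c, d)) becomes
// max(max3(a, b, c), d); the surviving two-operand max on top can itself be
// folded into a max3 when the combine later runs on its user.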
16147 if (CanTreeCombineApply && !HasCombinableTreeChild) {
16148 SDLoc DL(N);
16149 SDValue Inner =
16150 DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, VT, Op0.getOperand(0),
16151 Op0.getOperand(1), Op1.getOperand(0));
16152 return DAG.getNode(Opc, DL, VT, Inner, Op1.getOperand(1));
16153 }
16154
16155 // max(max(a, b), c) -> max3(a, b, c)
16156 // min(min(a, b), c) -> min3(a, b, c)
16157 // Deferred when Op0 is a tree node with combinable children.
16158 if (Op0.getOpcode() == Opc && Op0.hasOneUse() && !HasCombinableTreeChild) {
16159 SDLoc DL(N);
16160 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
16161 Op0.getOperand(0), Op0.getOperand(1), Op1);
16162 }
16163
16164 // Try commuted.
16165 // max(a, max(b, c)) -> max3(a, b, c)
16166 // min(a, min(b, c)) -> min3(a, b, c)
16167 // Deferred when Op1 is a tree node with combinable children.
16168 if (Op1.getOpcode() == Opc && Op1.hasOneUse() && !HasCombinableTreeChild) {
16169 SDLoc DL(N);
16170 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
16171 Op0, Op1.getOperand(0), Op1.getOperand(1));
16172 }
16173 }
16174
16175 // umin(sffbh(x), bitwidth) -> sffbh(x) if x is known to be not 0 or -1.
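// sffbh (s_flbit_i32) returns -1 (all ones) only for inputs 0 and -1; for any
// other input the result is already smaller than the bit width, so once 0 and
// -1 are ruled out the unsigned clamp is redundant.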
16176 SDValue FfbhSrc;
16177 uint64_t Clamp = 0;
16178 if (Opc == ISD::UMIN &&
16179 sd_match(Op0,
16180 m_OneUse(m_Node(AMDGPUISD::FFBH_I32, m_Value(FfbhSrc)))) &&
16181 sd_match(Op1, m_ConstInt(Clamp))) {
16182 unsigned BitWidth = FfbhSrc.getValueType().getScalarSizeInBits();
16183 if (Clamp >= BitWidth) {
16184 KnownBits Known = DAG.computeKnownBits(FfbhSrc);
16185 if (Known.isNonZero() && !Known.isAllOnes())
16186 return Op0;
16187 }
16188 }
16189
16190 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
16191 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
16192 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
16193 if (SDValue Med3 = performIntMed3ImmCombine(
16194 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
16195 return Med3;
16196 }
16197 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
16198 if (SDValue Med3 = performIntMed3ImmCombine(
16199 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
16200 return Med3;
16201 }
16202
16203 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
16204 if (SDValue Med3 = performIntMed3ImmCombine(
16205 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
16206 return Med3;
16207 }
16208 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
16209 if (SDValue Med3 = performIntMed3ImmCombine(
16210 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
16211 return Med3;
16212 }
16213
16214 // if !is_snan(x):
16215 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16216 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16217 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16218 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16219 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
16220 (Opc == ISD::FMINNUM_IEEE && Op0.getOpcode() == ISD::FMAXNUM_IEEE) ||
16221 (Opc == ISD::FMINIMUMNUM && Op0.getOpcode() == ISD::FMAXIMUMNUM) ||
16222 (Opc == AMDGPUISD::FMIN_LEGACY &&
16223 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
16224 (VT == MVT::f32 || VT == MVT::f64 ||
16225 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
16226 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
16227 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
16228 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
16229 Op0.hasOneUse()) {
16230 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1,
16231 N->getFlags().hasNoNaNs()))
16232 return Res;
16233 }
16234
16235 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
16236 // for some types, but at a higher cost since it's implemented with a 3
16237 // operand form.
16238 const SDNodeFlags Flags = N->getFlags();
16239 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
16240 !Subtarget->hasIEEEMinimumMaximumInsts() &&
16242 unsigned NewOpc =
16243 Opc == ISD::FMINIMUM ? ISD::FMINNUM_IEEE : ISD::FMAXNUM_IEEE;
16244 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
16245 }
16246
16247 return SDValue();
16248}
16249
16250 static bool isClampZeroToOne(SDValue A, SDValue B) {
16251 if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
16252 if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
16253 // FIXME: Should this be allowing -0.0?
16254 return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
16255 (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
16256 }
16257 }
16258
16259 return false;
16260}
16261
16262// FIXME: Should only worry about snans for version with chain.
16263SDValue SITargetLowering::performFMed3Combine(SDNode *N,
16264 DAGCombinerInfo &DCI) const {
16265 EVT VT = N->getValueType(0);
16266 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
16267 // NaNs. With a NaN input, the order of the operands may change the result.
16268
16269 SelectionDAG &DAG = DCI.DAG;
16270 SDLoc SL(N);
16271
16272 SDValue Src0 = N->getOperand(0);
16273 SDValue Src1 = N->getOperand(1);
16274 SDValue Src2 = N->getOperand(2);
16275
16276 if (isClampZeroToOne(Src0, Src1)) {
16277 // const_a, const_b, x -> clamp is safe in all cases including signaling
16278 // nans.
16279 // FIXME: Should this be allowing -0.0?
16280 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
16281 }
16282
16283 const MachineFunction &MF = DAG.getMachineFunction();
16284 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16285
16286 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
16287 // handling no dx10-clamp?
16288 if (Info->getMode().DX10Clamp) {
16289 // If NaNs are clamped to 0, we are free to reorder the inputs.
16290
16291 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
16292 std::swap(Src0, Src1);
16293
16294 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
16295 std::swap(Src1, Src2);
16296
16297 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
16298 std::swap(Src0, Src1);
16299
16300 if (isClampZeroToOne(Src1, Src2))
16301 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
16302 }
16303
16304 return SDValue();
16305}
16306
16307SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
16308 DAGCombinerInfo &DCI) const {
16309 SDValue Src0 = N->getOperand(0);
16310 SDValue Src1 = N->getOperand(1);
16311 if (Src0.isUndef() && Src1.isUndef())
16312 return DCI.DAG.getUNDEF(N->getValueType(0));
16313 return SDValue();
16314}
16315
16316// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
16317// expanded into a set of cmp/select instructions.
16318 bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
16319 unsigned NumElem,
16320 bool IsDivergentIdx,
16321 const GCNSubtarget *Subtarget) {
16322 if (UseDivergentRegisterIndexing)
16323 return false;
16324
16325 unsigned VecSize = EltSize * NumElem;
16326
16327 // Sub-dword vectors of size 2 dword or less have better implementation.
16328 if (VecSize <= 64 && EltSize < 32)
16329 return false;
16330
16331 // Always expand the rest of sub-dword instructions, otherwise it will be
16332 // lowered via memory.
16333 if (EltSize < 32)
16334 return true;
16335
16336 // Always do this if var-idx is divergent, otherwise it will become a loop.
16337 if (IsDivergentIdx)
16338 return true;
16339
16340 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
16341 unsigned NumInsts = NumElem /* Number of compares */ +
16342 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
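// For example, a uniform-index extract from v4f32 costs 4 compares plus
// 4 cndmasks (8 instructions) and is expanded; v8f64 costs 8 + 16 = 24 and
// is left to movrel / index-mode lowering instead.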
16343
16344 // On some architectures (GFX9) movrel is not available and it's better
16345 // to expand.
16346 if (Subtarget->useVGPRIndexMode())
16347 return NumInsts <= 16;
16348
16349 // If movrel is available, use it instead of expanding for vector of 8
16350 // elements.
16351 if (Subtarget->hasMovrel())
16352 return NumInsts <= 15;
16353
16354 return true;
16355}
16356
16357 bool SITargetLowering::shouldExpandVectorDynExt(SDNode *N) const {
16358 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
16359 if (isa<ConstantSDNode>(Idx))
16360 return false;
16361
16362 SDValue Vec = N->getOperand(0);
16363 EVT VecVT = Vec.getValueType();
16364 EVT EltVT = VecVT.getVectorElementType();
16365 unsigned EltSize = EltVT.getSizeInBits();
16366 unsigned NumElem = VecVT.getVectorNumElements();
16367
16368 return SITargetLowering::shouldExpandVectorDynExt(
16369 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
16370}
16371
16372SDValue
16373SITargetLowering::performExtractVectorEltCombine(SDNode *N,
16374 DAGCombinerInfo &DCI) const {
16375 SDValue Vec = N->getOperand(0);
16376 SelectionDAG &DAG = DCI.DAG;
16377
16378 EVT VecVT = Vec.getValueType();
16379 EVT VecEltVT = VecVT.getVectorElementType();
16380 EVT ResVT = N->getValueType(0);
16381
16382 unsigned VecSize = VecVT.getSizeInBits();
16383 unsigned VecEltSize = VecEltVT.getSizeInBits();
16384
16385 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
16386 allUsesHaveSourceMods(N)) {
16387 SDLoc SL(N);
16388 SDValue Idx = N->getOperand(1);
16389 SDValue Elt =
16390 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
16391 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
16392 }
16393
16394 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
16395 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
16396 // There are optimisations to transform 64-bit shifts into 32-bit shifts
16397 // depending on the shift operand. See e.g. performSraCombine().
16398 // This combine ensures that the optimisation is compatible with v2i32
16399 // legalised AND.
16400 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
16401 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
16402
16403 auto *C = dyn_cast<ConstantSDNode>(Vec->getOperand(1)->getOperand(0));
16404 if (!C || C->getZExtValue() != 0x1f)
16405 return SDValue();
16406
16407 SDLoc SL(N);
16408 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
16409 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
16410 Vec->getOperand(0), N->getOperand(1));
16411 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
16412 DAG.ReplaceAllUsesWith(N, A.getNode());
16413 }
16414
16415 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
16416 // =>
16417 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
16418 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
16419 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
16420 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
16421 SDLoc SL(N);
16422 SDValue Idx = N->getOperand(1);
16423 unsigned Opc = Vec.getOpcode();
16424
16425 switch (Opc) {
16426 default:
16427 break;
16428 // TODO: Support other binary operations.
16429 case ISD::FADD:
16430 case ISD::FSUB:
16431 case ISD::FMUL:
16432 case ISD::ADD:
16433 case ISD::UMIN:
16434 case ISD::UMAX:
16435 case ISD::SMIN:
16436 case ISD::SMAX:
16437 case ISD::FMAXNUM:
16438 case ISD::FMINNUM:
16439 case ISD::FMAXNUM_IEEE:
16440 case ISD::FMINNUM_IEEE:
16441 case ISD::FMAXIMUM:
16442 case ISD::FMINIMUM: {
16443 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
16444 Vec.getOperand(0), Idx);
16445 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
16446 Vec.getOperand(1), Idx);
16447
16448 DCI.AddToWorklist(Elt0.getNode());
16449 DCI.AddToWorklist(Elt1.getNode());
16450 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
16451 }
16452 }
16453 }
16454
16455 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
16456 if (shouldExpandVectorDynExt(N)) {
16457 SDLoc SL(N);
16458 SDValue Idx = N->getOperand(1);
16459 SDValue V;
16460 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16461 SDValue IC = DAG.getVectorIdxConstant(I, SL);
16462 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
16463 if (I == 0)
16464 V = Elt;
16465 else
16466 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
16467 }
16468 return V;
16469 }
16470
16471 // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
16472 // =>
16473 // i32:Lo(k) if Idx == 0, or
16474 // i32:Hi(k) if Idx == 1
16475 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
16476 if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
16477 SDLoc SL(N);
16478 SDValue PeekThrough = Vec.getOperand(0);
16479 auto *KImm = dyn_cast<ConstantSDNode>(PeekThrough);
16480 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
16481 uint64_t KImmValue = KImm->getZExtValue();
16482 return DAG.getConstant(
16483 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
16484 }
16485 auto *KFPImm = dyn_cast<ConstantFPSDNode>(PeekThrough);
16486 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
16487 uint64_t KFPImmValue =
16488 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16489 return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
16490 0xffffffff,
16491 SL, MVT::i32);
16492 }
16493 }
16494
16495 if (!DCI.isBeforeLegalize())
16496 return SDValue();
16497
16498 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
16499 // elements. This exposes more load reduction opportunities by replacing
16500 // multiple small extract_vector_elements with a single 32-bit extract.
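// For example, extracting element 5 of a loaded v8i8 becomes an extract of
// i32 element 1 of the bitcast vector, followed by a right shift by 8 and a
// truncate back to the element type.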
16501 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
16502 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16503 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
16504
16505 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16506 unsigned EltIdx = BitIndex / 32;
16507 unsigned LeftoverBitIdx = BitIndex % 32;
16508 SDLoc SL(N);
16509
16510 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
16511 DCI.AddToWorklist(Cast.getNode());
16512
16513 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
16514 DAG.getConstant(EltIdx, SL, MVT::i32));
16515 DCI.AddToWorklist(Elt.getNode());
16516 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
16517 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
16518 DCI.AddToWorklist(Srl.getNode());
16519
16520 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
16521 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
16522 DCI.AddToWorklist(Trunc.getNode());
16523
16524 if (VecEltVT == ResVT) {
16525 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
16526 }
16527
16528 assert(ResVT.isScalarInteger());
16529 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
16530 }
16531
16532 return SDValue();
16533}
16534
16535SDValue
16536SITargetLowering::performInsertVectorEltCombine(SDNode *N,
16537 DAGCombinerInfo &DCI) const {
16538 SDValue Vec = N->getOperand(0);
16539 SDValue Idx = N->getOperand(2);
16540 EVT VecVT = Vec.getValueType();
16541 EVT EltVT = VecVT.getVectorElementType();
16542
16543 // INSERT_VECTOR_ELT (<n x e>, var-idx)
16544 // => BUILD_VECTOR n x select (e, const-idx)
16545 if (!shouldExpandVectorDynExt(N))
16546 return SDValue();
16547
16548 SelectionDAG &DAG = DCI.DAG;
16549 SDLoc SL(N);
16550 SDValue Ins = N->getOperand(1);
16551 EVT IdxVT = Idx.getValueType();
16552
16553 SmallVector<SDValue, 16> Ops;
16554 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16555 SDValue IC = DAG.getConstant(I, SL, IdxVT);
16556 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
16557 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
16558 Ops.push_back(V);
16559 }
16560
16561 return DAG.getBuildVector(VecVT, SL, Ops);
16562}
16563
16564/// Return the source of an fp_extend from f16 to f32, or a converted FP
16565/// constant.
16566 static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src) {
16567 if (Src.getOpcode() == ISD::FP_EXTEND &&
16568 Src.getOperand(0).getValueType() == MVT::f16) {
16569 return Src.getOperand(0);
16570 }
16571
16572 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
16573 APFloat Val = CFP->getValueAPF();
16574 bool LosesInfo = true;
16575 Val.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &LosesInfo);
16576 if (!LosesInfo)
16577 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
16578 }
16579
16580 return SDValue();
16581}
16582
16583SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
16584 DAGCombinerInfo &DCI) const {
16585 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16586 "combine only useful on gfx8");
16587
16588 SDValue TruncSrc = N->getOperand(0);
16589 EVT VT = N->getValueType(0);
16590 if (VT != MVT::f16)
16591 return SDValue();
16592
16593 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
16594 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
16595 return SDValue();
16596
16597 SelectionDAG &DAG = DCI.DAG;
16598 SDLoc SL(N);
16599
16600 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
16601 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
16602 // casting back.
16603
16604 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
16605 // fmin(fmax(a, b), fmax(fmin(a, b), c))
16606 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
16607 if (!A)
16608 return SDValue();
16609
16610 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
16611 if (!B)
16612 return SDValue();
16613
16614 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
16615 if (!C)
16616 return SDValue();
16617
16618 // This changes signaling nan behavior. If an input is a signaling nan, it
16619 // would have been quieted by the fpext originally. We don't care because
16620 // these are unconstrained ops. If we needed to insert quieting canonicalizes
16621 // we would be worse off than just doing the promotion.
16622 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
16623 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
16624 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
16625 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
16626}
16627
16628unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
16629 const SDNode *N0,
16630 const SDNode *N1) const {
16631 EVT VT = N0->getValueType(0);
16632
16633 // Only do this if we are not trying to support denormals. v_mad_f32 does not
16634 // support denormals ever.
16635 if (((VT == MVT::f32 &&
16636 denormalModeIsFlushAllF32(DAG.getMachineFunction())) ||
16637 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16638 denormalModeIsFlushAllF64F16(DAG.getMachineFunction()))) &&
16639 isOperationLegal(ISD::FMAD, VT))
16640 return ISD::FMAD;
16641
16642 const TargetOptions &Options = DAG.getTarget().Options;
16643 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
16644 (N0->getFlags().hasAllowContract() &&
16645 N1->getFlags().hasAllowContract())) &&
16646 isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(), VT)) {
16647 return ISD::FMA;
16648 }
16649
16650 return 0;
16651}
16652
16653// For a reassociatable opcode perform:
16654// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
16655SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
16656 SelectionDAG &DAG) const {
16657 EVT VT = N->getValueType(0);
16658 if (VT != MVT::i32 && VT != MVT::i64)
16659 return SDValue();
16660
16661 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
16662 return SDValue();
16663
16664 unsigned Opc = N->getOpcode();
16665 SDValue Op0 = N->getOperand(0);
16666 SDValue Op1 = N->getOperand(1);
16667
16668 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
16669 return SDValue();
16670
16671 if (Op0->isDivergent())
16672 std::swap(Op0, Op1);
16673
16674 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
16675 return SDValue();
16676
16677 SDValue Op2 = Op1.getOperand(1);
16678 Op1 = Op1.getOperand(0);
16679 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
16680 return SDValue();
16681
16682 if (Op1->isDivergent())
16683 std::swap(Op1, Op2);
16684
16685 SDLoc SL(N);
16686 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
16687 return DAG.getNode(Opc, SL, VT, Add1, Op2);
16688}
16689
16690static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16691 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16692 unsigned MadOpc = Signed ? AMDGPUISD::MAD_I64_I32 : AMDGPUISD::MAD_U64_U32;
16693 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
16694 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
16695 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
16696}
16697
16698// Fold
16699// y = lshr i64 x, 32
16700// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16701// with Const.hi == -1
16702// To
16703 // res = mad_u64_u32 y.lo, Const.lo, x.lo
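// Why this is valid: Const.hi == -1 means y * Const == y * Const.lo - (y << 32)
// (mod 2^64). Since y == x >> 32, the subtracted term cancels the high half of
// x, leaving y.lo * Const.lo + x.lo, which is exactly mad_u64_u32.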
16704 static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL,
16705 SDValue MulLHS, SDValue MulRHS,
16706 SDValue AddRHS) {
16707 if (MulRHS.getOpcode() == ISD::SRL)
16708 std::swap(MulLHS, MulRHS);
16709
16710 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16711 return SDValue();
16712
16713 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
16714 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16715 MulLHS.getOperand(0) != AddRHS)
16716 return SDValue();
16717
16718 ConstantSDNode *Const = dyn_cast<ConstantSDNode>(MulRHS);
16719 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
16720 return SDValue();
16721
16722 SDValue ConstMul =
16723 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
16724 return getMad64_32(DAG, SL, MVT::i64,
16725 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
16726 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
16727}
16728
16729// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
16730// multiplies, if any.
16731//
16732// Full 64-bit multiplies that feed into an addition are lowered here instead
16733// of using the generic expansion. The generic expansion ends up with
16734// a tree of ADD nodes that prevents us from using the "add" part of the
16735// MAD instruction. The expansion produced here results in a chain of ADDs
16736// instead of a tree.
16737SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
16738 DAGCombinerInfo &DCI) const {
16739 assert(N->isAnyAdd());
16740
16741 SelectionDAG &DAG = DCI.DAG;
16742 EVT VT = N->getValueType(0);
16743 SDLoc SL(N);
16744 SDValue LHS = N->getOperand(0);
16745 SDValue RHS = N->getOperand(1);
16746
16747 if (VT.isVector())
16748 return SDValue();
16749
16750 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
16751 // result in scalar registers for uniform values.
16752 if (!N->isDivergent() && Subtarget->hasSMulHi())
16753 return SDValue();
16754
16755 unsigned NumBits = VT.getScalarSizeInBits();
16756 if (NumBits <= 32 || NumBits > 64)
16757 return SDValue();
16758
16759 if (LHS.getOpcode() != ISD::MUL) {
16760 assert(RHS.getOpcode() == ISD::MUL);
16761 std::swap(LHS, RHS);
16762 }
16763
16764 // Avoid the fold if it would unduly increase the number of multiplies due to
16765 // multiple uses, except on hardware with full-rate multiply-add (which is
16766 // part of full-rate 64-bit ops).
16767 if (!Subtarget->hasFullRate64Ops()) {
16768 unsigned NumUsers = 0;
16769 for (SDNode *User : LHS->users()) {
16770 // There is a use that does not feed into addition, so the multiply can't
16771 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
16772 if (!User->isAnyAdd())
16773 return SDValue();
16774
16775 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
16776 // MUL + 3xADD + 3xADDC over 3xMAD.
16777 ++NumUsers;
16778 if (NumUsers >= 3)
16779 return SDValue();
16780 }
16781 }
16782
16783 SDValue MulLHS = LHS.getOperand(0);
16784 SDValue MulRHS = LHS.getOperand(1);
16785 SDValue AddRHS = RHS;
16786
16787 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
16788 return FoldedMAD;
16789
16790 // Always check whether operands are small unsigned values, since that
16791 // knowledge is useful in more cases. Check for small signed values only if
16792 // doing so can unlock a shorter code sequence.
16793 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
16794 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
16795
16796 bool MulSignedLo = false;
16797 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
16798 MulSignedLo =
16799 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
16800 }
16801
16802 // The operands and final result all have the same number of bits. If
16803 // operands need to be extended, they can be extended with garbage. The
16804 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
16805 // truncated away in the end.
16806 if (VT != MVT::i64) {
16807 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
16808 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
16809 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
16810 }
16811
16812 // The basic code generated is conceptually straightforward. Pseudo code:
16813 //
16814 // accum = mad_64_32 lhs.lo, rhs.lo, accum
16815 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
16816 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
16817 //
16818 // The second and third lines are optional, depending on whether the factors
16819 // are {sign,zero}-extended or not.
16820 //
16821 // The actual DAG is noisier than the pseudo code, but only due to
16822 // instructions that disassemble values into low and high parts, and
16823 // assemble the final result.
16824 SDValue One = DAG.getConstant(1, SL, MVT::i32);
16825
16826 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
16827 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
16828 SDValue Accum =
16829 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
16830
16831 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
16832 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
16833
16834 if (!MulLHSUnsigned32) {
16835 auto MulLHSHi =
16836 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
16837 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
16838 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16839 }
16840
16841 if (!MulRHSUnsigned32) {
16842 auto MulRHSHi =
16843 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
16844 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
16845 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
16846 }
16847
16848 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
16849 Accum = DAG.getBitcast(MVT::i64, Accum);
16850 }
16851
16852 if (VT != MVT::i64)
16853 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
16854 return Accum;
16855}
16856
16857SDValue
16858SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
16859 DAGCombinerInfo &DCI) const {
16860 SDValue RHS = N->getOperand(1);
16861 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
16862 if (!CRHS)
16863 return SDValue();
16864
16865 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
16866 // common.
16867 uint64_t Val = CRHS->getZExtValue();
16868 if (countr_zero(Val) >= 32) {
16869 SelectionDAG &DAG = DCI.DAG;
16870 SDLoc SL(N);
16871 SDValue LHS = N->getOperand(0);
16872
16873 // Avoid carry machinery if we know the low half of the add does not
16874 // contribute to the final result.
16875 //
16876 // add i64:x, K if computeTrailingZeros(K) >= 32
16877 // => build_pair (add x.hi, K.hi), x.lo
16878
16879 // Breaking the 64-bit add here with this strange constant is unlikely
16880 // to interfere with addressing mode patterns.
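// For example, add x, 0x123400000000 only needs a 32-bit add of 0x1234 into
// x.hi; x.lo is passed through unchanged.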
16881
16882 SDValue Hi = getHiHalf64(LHS, DAG);
16883 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
16884 unsigned Opcode = N->getOpcode();
16885 if (Opcode == ISD::PTRADD)
16886 Opcode = ISD::ADD;
16887 SDValue AddHi =
16888 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
16889
16890 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
16891 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
16892 }
16893
16894 return SDValue();
16895}
16896
16897// Collect the ultimate src of each of the mul node's operands, and confirm
16898 // each operand is 8 bits wide.
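// For example, for mul (zext i8 %a to i32), (zext i8 %b to i32), byte 0 of each
// operand is provided by the original i8 value and byte 1 is a known zero, so
// both operands qualify.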
16899static std::optional<ByteProvider<SDValue>>
16900handleMulOperand(const SDValue &MulOperand) {
16901 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
16902 if (!Byte0 || Byte0->isConstantZero()) {
16903 return std::nullopt;
16904 }
16905 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
16906 if (Byte1 && !Byte1->isConstantZero()) {
16907 return std::nullopt;
16908 }
16909 return Byte0;
16910}
16911
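// Merge two v_perm selector masks. A selector byte of 0x0c reads a constant
// zero, so for every byte position at least one of the two masks must select
// zero (checked by the asserts below); the merged mask keeps whichever
// selector is not the constant-zero one.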
16912static unsigned addPermMasks(unsigned First, unsigned Second) {
16913 unsigned FirstCs = First & 0x0c0c0c0c;
16914 unsigned SecondCs = Second & 0x0c0c0c0c;
16915 unsigned FirstNoCs = First & ~0x0c0c0c0c;
16916 unsigned SecondNoCs = Second & ~0x0c0c0c0c;
16917
16918 assert((FirstCs & 0xFF) | (SecondCs & 0xFF));
16919 assert((FirstCs & 0xFF00) | (SecondCs & 0xFF00));
16920 assert((FirstCs & 0xFF0000) | (SecondCs & 0xFF0000));
16921 assert((FirstCs & 0xFF000000) | (SecondCs & 0xFF000000));
16922
16923 return (FirstNoCs | SecondNoCs) | (FirstCs & SecondCs);
16924}
16925
16926 struct DotSrc {
16927 SDValue SrcOp;
16928 int64_t PermMask;
16929 int64_t DWordOffset;
16930 };
16931
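// Record the byte providers for step 'Step' of the dot chain in Src0s/Src1s.
// Sources are keyed by (SDValue, dword offset); a new byte is merged into an
// existing entry's perm mask via addPermMasks, otherwise a new entry is added.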
16932 static void placeSources(ByteProvider<SDValue> &Src0,
16933 ByteProvider<SDValue> &Src1,
16934 SmallVectorImpl<DotSrc> &Src0s,
16935 SmallVectorImpl<DotSrc> &Src1s, int Step) {
16936
16937 assert(Src0.Src.has_value() && Src1.Src.has_value());
16938 // Src0s and Src1s are empty, just place arbitrarily.
16939 if (Step == 0) {
16940 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
16941 Src0.SrcOffset / 4});
16942 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
16943 Src1.SrcOffset / 4});
16944 return;
16945 }
16946
16947 for (int BPI = 0; BPI < 2; BPI++) {
16948 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
16949 if (BPI == 1) {
16950 BPP = {Src1, Src0};
16951 }
16952 unsigned ZeroMask = 0x0c0c0c0c;
16953 unsigned FMask = 0xFF << (8 * (3 - Step));
16954
16955 unsigned FirstMask =
16956 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16957 unsigned SecondMask =
16958 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
16959 // Attempt to find a Src vector which contains our SDValue; if so, add our
16960 // perm mask to the existing one. If we are unable to find a match for the
16961 // first SDValue, attempt to find a match for the second.
16962 int FirstGroup = -1;
16963 for (int I = 0; I < 2; I++) {
16964 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
16965 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
16966 return IterElt.SrcOp == *BPP.first.Src &&
16967 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
16968 };
16969
16970 auto *Match = llvm::find_if(Srcs, MatchesFirst);
16971 if (Match != Srcs.end()) {
16972 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
16973 FirstGroup = I;
16974 break;
16975 }
16976 }
16977 if (FirstGroup != -1) {
16978 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
16979 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
16980 return IterElt.SrcOp == *BPP.second.Src &&
16981 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
16982 };
16983 auto *Match = llvm::find_if(Srcs, MatchesSecond);
16984 if (Match != Srcs.end()) {
16985 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
16986 } else
16987 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
16988 return;
16989 }
16990 }
16991
16992 // If we have made it here, then we could not find a match in Src0s or Src1s
16993 // for either Src0 or Src1, so just place them arbitrarily.
16994
16995 unsigned ZeroMask = 0x0c0c0c0c;
16996 unsigned FMask = 0xFF << (8 * (3 - Step));
16997
16998 Src0s.push_back(
16999 {*Src0.Src,
17000 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17001 Src0.SrcOffset / 4});
17002 Src1s.push_back(
17003 {*Src1.Src,
17004 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17005 Src1.SrcOffset / 4});
17006}
17007
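// Combine the collected sources into a single i32 dot operand: one source is
// handled with a single v_perm, and two or more sources are permuted pairwise
// and OR'd together.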
17008 static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL,
17009 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
17010 bool IsAny) {
17011
17012 // If we just have one source, just permute it accordingly.
17013 if (Srcs.size() == 1) {
17014 auto *Elt = Srcs.begin();
17015 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
17016
17017 // v_perm will produce the original value
17018 if (Elt->PermMask == 0x3020100)
17019 return EltOp;
17020
17021 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17022 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
17023 }
17024
17025 auto *FirstElt = Srcs.begin();
17026 auto *SecondElt = std::next(FirstElt);
17027
17028 SmallVector<SDValue, 2> Perms;
17029 
17030 // If we have multiple sources in the chain, combine them via perms (using
17031 // calculated perm mask) and Ors.
17032 while (true) {
17033 auto FirstMask = FirstElt->PermMask;
17034 auto SecondMask = SecondElt->PermMask;
17035
17036 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
17037 unsigned FirstPlusFour = FirstMask | 0x04040404;
17038 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produce 0x00 for any
17039 // original 0x0C.
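// For example, a selector byte of 0x02 becomes 0x06 (same byte, taken from the
// other PERM operand), while 0x0c maps back to 0x0c and keeps selecting zero.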
17040 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
17041
17042 auto PermMask = addPermMasks(FirstMask, SecondMask);
17043 auto FirstVal =
17044 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17045 auto SecondVal =
17046 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
17047
17048 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
17049 SecondVal,
17050 DAG.getConstant(PermMask, SL, MVT::i32)));
17051
17052 FirstElt = std::next(SecondElt);
17053 if (FirstElt == Srcs.end())
17054 break;
17055
17056 SecondElt = std::next(FirstElt);
17057 // If we only have a FirstElt, then just combine that into the cumulative
17058 // source node.
17059 if (SecondElt == Srcs.end()) {
17060 auto EltOp =
17061 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17062
17063 Perms.push_back(
17064 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17065 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
17066 break;
17067 }
17068 }
17069
17070 assert(Perms.size() == 1 || Perms.size() == 2);
17071 return Perms.size() == 2
17072 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
17073 : Perms[0];
17074}
17075
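// For chains shorter than 4, the selectors were built against the high byte
// positions; shift them down so the live lanes occupy the low bytes and force
// the now-unused high bytes to 0x0c (constant zero).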
17076static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
17077 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
17078 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
17079 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
17080 EntryMask += ZeroMask;
17081 }
17082}
17083
17084static bool isMul(const SDValue Op) {
17085 auto Opcode = Op.getOpcode();
17086
17087 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
17088 Opcode == AMDGPUISD::MUL_I24);
17089}
17090
17091static std::optional<bool>
17092 checkDot4MulSignedness(const SDValue &N, ByteProvider<SDValue> &Src0,
17093 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
17094 const SDValue &S1Op, const SelectionDAG &DAG) {
17095 // If both ops are i8s (pre legalize-dag), then the signedness semantics
17096 // of the dot4 is irrelevant.
17097 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
17098 return false;
17099
17100 auto Known0 = DAG.computeKnownBits(S0Op, 0);
17101 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
17102 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
17103 auto Known1 = DAG.computeKnownBits(S1Op, 0);
17104 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
17105 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
17106
17107 assert(!(S0IsUnsigned && S0IsSigned));
17108 assert(!(S1IsUnsigned && S1IsSigned));
17109
17110 // There are 9 possible permutations of
17111 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
17112
17113 // In two permutations, the sign bits are known to be the same for both Ops,
17114 // so simply return Signed / Unsigned corresponding to the MSB
17115
17116 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
17117 return S0IsSigned;
17118
17119 // In another two permutations, the sign bits are known to be opposite. In
17120 // this case return std::nullopt to indicate a bad match.
17121
17122 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
17123 return std::nullopt;
17124
17125 // In the remaining five permutations, we don't know the value of the sign
17126 // bit for at least one Op. Since we have a valid ByteProvider, we know that
17127 // the upper bits must be extension bits. Thus, the only way for the sign
17128 // bit to be unknown is if it was sign extended from an unknown value, or if
17129 // it was any extended. In either case, it is correct to use the signed
17130 // version of the signedness semantics of dot4.
17131
17132 // In two of these permutations, we know the sign bit is set for
17133 // one op, and the other is unknown. It is okay to use the signed version of
17134 // dot4.
17135 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
17136 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
17137 return true;
17138
17139 // In one such permutation, we don't know either of the sign bits. It is okay
17140 // to use the signed version of dot4.
17141 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
17142 return true;
17143
17144 // In two of these permutations, we know the sign bit is unset for
17145 // one op, and the other is unknown. Return std::nullopt to indicate a
17146 // bad match.
17147 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
17148 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
17149 return std::nullopt;
17150
17151 llvm_unreachable("Fully covered condition");
17152}
17153
17154SDValue SITargetLowering::performAddCombine(SDNode *N,
17155 DAGCombinerInfo &DCI) const {
17156 SelectionDAG &DAG = DCI.DAG;
17157 EVT VT = N->getValueType(0);
17158 SDLoc SL(N);
17159 SDValue LHS = N->getOperand(0);
17160 SDValue RHS = N->getOperand(1);
17161
17162 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
17163 if (Subtarget->hasMad64_32()) {
17164 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17165 return Folded;
17166 }
17167 }
17168
17169 if (SDValue V = reassociateScalarOps(N, DAG)) {
17170 return V;
17171 }
17172
17173 if (VT == MVT::i64) {
17174 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17175 return Folded;
17176 }
17177
17178 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
17179 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
17180 SDValue TempNode(N, 0);
17181 std::optional<bool> IsSigned;
17182 SmallVector<DotSrc, 4> Src0s;
17183 SmallVector<DotSrc, 4> Src1s;
17184 SmallVector<SDValue, 4> Src2s;
17185 
17186 // Match the v_dot4 tree, while collecting src nodes.
17187 int ChainLength = 0;
17188 for (int I = 0; I < 4; I++) {
17189 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
17190 if (MulIdx == -1)
17191 break;
17192 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
17193 if (!Src0)
17194 break;
17195 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
17196 if (!Src1)
17197 break;
17198
17199 auto IterIsSigned = checkDot4MulSignedness(
17200 TempNode->getOperand(MulIdx), *Src0, *Src1,
17201 TempNode->getOperand(MulIdx)->getOperand(0),
17202 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
17203 if (!IterIsSigned)
17204 break;
17205 if (!IsSigned)
17206 IsSigned = *IterIsSigned;
17207 if (*IterIsSigned != *IsSigned)
17208 break;
17209 placeSources(*Src0, *Src1, Src0s, Src1s, I);
17210 auto AddIdx = 1 - MulIdx;
17211 // Allow the special case where add (add (mul24, 0), mul24) has been
17212 // simplified to add (mul24, mul24).
17213 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
17214 Src2s.push_back(TempNode->getOperand(AddIdx));
17215 auto Src0 =
17216 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
17217 if (!Src0)
17218 break;
17219 auto Src1 =
17220 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
17221 if (!Src1)
17222 break;
17223 auto IterIsSigned = checkDot4MulSignedness(
17224 TempNode->getOperand(AddIdx), *Src0, *Src1,
17225 TempNode->getOperand(AddIdx)->getOperand(0),
17226 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
17227 if (!IterIsSigned)
17228 break;
17229 assert(IsSigned);
17230 if (*IterIsSigned != *IsSigned)
17231 break;
17232 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
17233 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
17234 ChainLength = I + 2;
17235 break;
17236 }
17237
17238 TempNode = TempNode->getOperand(AddIdx);
17239 Src2s.push_back(TempNode);
17240 ChainLength = I + 1;
17241 if (TempNode->getNumOperands() < 2)
17242 break;
17243 LHS = TempNode->getOperand(0);
17244 RHS = TempNode->getOperand(1);
17245 }
17246
17247 if (ChainLength < 2)
17248 return SDValue();
17249
17250 // Masks were constructed with the assumption that we would find a chain of
17251 // length 4. If not, we need to zero out the unused high bytes (via a perm
17252 // mask of 0x0c) so they do not affect the dot calculation.
17253 if (ChainLength < 4) {
17254 fixMasks(Src0s, ChainLength);
17255 fixMasks(Src1s, ChainLength);
17256 }
17257
17258 SDValue Src0, Src1;
17259
17260 // If we are just using a single source for both, and have permuted the
17261 // bytes consistently, we can just use the sources without permuting
17262 // (commutation).
17263 bool UseOriginalSrc = false;
17264 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
17265 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
17266 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
17267 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
17268 SmallVector<unsigned, 4> SrcBytes;
17269 auto Src0Mask = Src0s.begin()->PermMask;
17270 SrcBytes.push_back(Src0Mask & 0xFF000000);
17271 bool UniqueEntries = true;
17272 for (auto I = 1; I < 4; I++) {
17273 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
17274
17275 if (is_contained(SrcBytes, NextByte)) {
17276 UniqueEntries = false;
17277 break;
17278 }
17279 SrcBytes.push_back(NextByte);
17280 }
17281
17282 if (UniqueEntries) {
17283 UseOriginalSrc = true;
17284
17285 auto *FirstElt = Src0s.begin();
17286 auto FirstEltOp =
17287 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17288
17289 auto *SecondElt = Src1s.begin();
17290 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
17291 SecondElt->DWordOffset);
17292
17293 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
17294 MVT::getIntegerVT(32));
17295 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
17296 MVT::getIntegerVT(32));
17297 }
17298 }
17299
17300 if (!UseOriginalSrc) {
17301 Src0 = resolveSources(DAG, SL, Src0s, false, true);
17302 Src1 = resolveSources(DAG, SL, Src1s, false, true);
17303 }
17304
17305 assert(IsSigned);
17306 SDValue Src2 =
17307 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
17308
17309 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
17310 : Intrinsic::amdgcn_udot4,
17311 SL, MVT::i64);
17312
17313 assert(!VT.isVector());
17314 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
17315 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
17316
17317 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
17318 }
17319
17320 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
17321 return SDValue();
17322
17323 // add x, zext (setcc) => uaddo_carry x, 0, setcc
17324 // add x, sext (setcc) => usubo_carry x, 0, setcc
17325 unsigned Opc = LHS.getOpcode();
17326 if (Opc == ISD::ZERO_EXTEND || Opc == ISD::SIGN_EXTEND ||
17327 Opc == ISD::ANY_EXTEND || Opc == ISD::UADDO_CARRY)
17328 std::swap(RHS, LHS);
17329
17330 Opc = RHS.getOpcode();
17331 switch (Opc) {
17332 default:
17333 break;
17334 case ISD::ZERO_EXTEND:
17335 case ISD::SIGN_EXTEND:
17336 case ISD::ANY_EXTEND: {
17337 auto Cond = RHS.getOperand(0);
17338 // If this won't be a real VOPC output, we would still need to insert an
17339 // extra instruction anyway.
17340 if (!isBoolSGPR(Cond))
17341 break;
17342 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17343 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
17344 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::USUBO_CARRY : ISD::UADDO_CARRY;
17345 return DAG.getNode(Opc, SL, VTList, Args);
17346 }
17347 case ISD::UADDO_CARRY: {
17348 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
17349 if (!isNullConstant(RHS.getOperand(1)))
17350 break;
17351 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
17352 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
17353 }
17354 }
17355 return SDValue();
17356}
17357
17358SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
17359 DAGCombinerInfo &DCI) const {
17360 SelectionDAG &DAG = DCI.DAG;
17361 SDLoc DL(N);
17362 EVT VT = N->getValueType(0);
17363 SDValue N0 = N->getOperand(0);
17364 SDValue N1 = N->getOperand(1);
17365
17366 // The following folds transform PTRADDs into regular arithmetic in cases
17367 // where the PTRADD wouldn't be folded as an immediate offset into memory
17368 // instructions anyway. They are target-specific in that other targets might
17369 // prefer to not lose information about the pointer arithmetic.
17370
17371 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
17372 // Adapted from DAGCombiner::visitADDLikeCommutative.
17373 SDValue V, K;
17374 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
17375 SDNodeFlags ShlFlags = N1->getFlags();
17376 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
17377 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
17378 // preserved.
17379 SDNodeFlags NewShlFlags =
17380 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
17381 ? SDNodeFlags::NoSignedWrap
17382 : SDNodeFlags();
17383 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
17384 DCI.AddToWorklist(Inner.getNode());
17385 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
17386 }
17387
17388 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
17389 // performAddCombine.
17390 if (N1.getOpcode() == ISD::MUL) {
17391 if (Subtarget->hasMad64_32()) {
17392 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17393 return Folded;
17394 }
17395 }
17396
17397 // If the 32 low bits of the constant are all zero, there is nothing to fold
17398 // into an immediate offset, so it's better to eliminate the unnecessary
17399 // addition for the lower 32 bits than to preserve the PTRADD.
17400 // Analogous to a fold in performAddCombine.
17401 if (VT == MVT::i64) {
17402 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17403 return Folded;
17404 }
17405
17406 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
17407 return SDValue();
17408
17409 SDValue X = N0;
17410 SDValue Y = N1.getOperand(0);
17411 SDValue Z = N1.getOperand(1);
17412 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
17413 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
17414
17415 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
17416 Y->isDivergent() != Z->isDivergent()) {
17417 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
17418 // y are uniform and z isn't.
17419 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
17420 // z are uniform and y isn't.
17421 // The goal is to push uniform operands up in the computation, so that they
17422 // can be handled with scalar operations. We can't use reassociateScalarOps
17423 // for this since it requires two identical commutative operations to
17424 // reassociate.
17425 if (Y->isDivergent())
17426 std::swap(Y, Z);
17427 // If both additions in the original were NUW, reassociation preserves that.
17428 SDNodeFlags ReassocFlags =
17429 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
17430 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
17431 DCI.AddToWorklist(UniformInner.getNode());
17432 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
17433 }
17434
17435 return SDValue();
17436}
17437
17438static bool isCtlzOpc(unsigned Opc) {
17439 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
17440}
17441
17442SDValue SITargetLowering::performSubCombine(SDNode *N,
17443 DAGCombinerInfo &DCI) const {
17444 SelectionDAG &DAG = DCI.DAG;
17445 EVT VT = N->getValueType(0);
17446
17447 if (VT == MVT::i64) {
17448 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17449 return Folded;
17450 }
17451
17452 if (VT != MVT::i32)
17453 return SDValue();
17454
17455 SDLoc SL(N);
17456 SDValue LHS = N->getOperand(0);
17457 SDValue RHS = N->getOperand(1);
17458
17459 // sub x, zext (setcc) => usubo_carry x, 0, setcc
17460 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
17461 unsigned Opc = RHS.getOpcode();
17462 switch (Opc) {
17463 default:
17464 break;
17465 case ISD::ZERO_EXTEND:
17466 case ISD::SIGN_EXTEND:
17467 case ISD::ANY_EXTEND: {
17468 auto Cond = RHS.getOperand(0);
17469 // If this won't be a real VOPC output, we would still need to insert an
17470 // extra instruction anyway.
17471 if (!isBoolSGPR(Cond))
17472 break;
17473 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17474 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
17475 Opc = (Opc == ISD::SIGN_EXTEND) ? ISD::UADDO_CARRY : ISD::USUBO_CARRY;
17476 return DAG.getNode(Opc, SL, VTList, Args);
17477 }
17478 }
17479
17480 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
17481 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
17482 if (!isNullConstant(LHS.getOperand(1)))
17483 return SDValue();
17484 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
17485 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
17486 }
17487
17488 // sub (ctlz (xor x, (sra x, 31))), 1 -> ctls x.
17489 if (isOneConstant(RHS) && isCtlzOpc(LHS.getOpcode())) {
17490 SDValue CtlzSrc = LHS.getOperand(0);
17491 // Check for xor x, (sra x, 31) pattern.
17492 if (CtlzSrc.getOpcode() == ISD::XOR) {
17493 SDValue X = CtlzSrc.getOperand(0);
17494 SDValue SignExt = CtlzSrc.getOperand(1);
17495 // Try both ordering of XOR operands.
17496 if (SignExt.getOpcode() != ISD::SRA)
17497 std::swap(X, SignExt);
17498 if (SignExt.getOpcode() == ISD::SRA && SignExt.getOperand(0) == X) {
17499 ConstantSDNode *ShiftAmt =
17500 dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
17501 unsigned BitWidth = X.getValueType().getScalarSizeInBits();
17502 if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1)
17503 return DAG.getNode(ISD::CTLS, SL, VT, X);
17504 }
17505 }
17506 }
17507
17508 return SDValue();
17509}
17510
17511SDValue
17512SITargetLowering::performAddCarrySubCarryCombine(SDNode *N,
17513 DAGCombinerInfo &DCI) const {
17514
17515 if (N->getValueType(0) != MVT::i32)
17516 return SDValue();
17517
17518 if (!isNullConstant(N->getOperand(1)))
17519 return SDValue();
17520
17521 SelectionDAG &DAG = DCI.DAG;
17522 SDValue LHS = N->getOperand(0);
17523
17524 // uaddo_carry (add x, y), 0, cc => uaddo_carry x, y, cc
17525 // usubo_carry (sub x, y), 0, cc => usubo_carry x, y, cc
17526 unsigned LHSOpc = LHS.getOpcode();
17527 unsigned Opc = N->getOpcode();
17528 if ((LHSOpc == ISD::ADD && Opc == ISD::UADDO_CARRY) ||
17529 (LHSOpc == ISD::SUB && Opc == ISD::USUBO_CARRY)) {
17530 SDValue Args[] = {LHS.getOperand(0), LHS.getOperand(1), N->getOperand(2)};
17531 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), Args);
17532 }
17533 return SDValue();
17534}
17535
17536SDValue SITargetLowering::performFAddCombine(SDNode *N,
17537 DAGCombinerInfo &DCI) const {
17538 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17539 return SDValue();
17540
17541 SelectionDAG &DAG = DCI.DAG;
17542 EVT VT = N->getValueType(0);
17543
17544 SDLoc SL(N);
17545 SDValue LHS = N->getOperand(0);
17546 SDValue RHS = N->getOperand(1);
17547
17548 // These should really be instruction patterns, but writing patterns with
17549 // source modifiers is a pain.
17550
17551 // fadd (fadd (a, a), b) -> mad 2.0, a, b
17552 if (LHS.getOpcode() == ISD::FADD) {
17553 SDValue A = LHS.getOperand(0);
17554 if (A == LHS.getOperand(1)) {
17555 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17556 if (FusedOp != 0) {
17557 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17558 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
17559 }
17560 }
17561 }
17562
17563 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
17564 if (RHS.getOpcode() == ISD::FADD) {
17565 SDValue A = RHS.getOperand(0);
17566 if (A == RHS.getOperand(1)) {
17567 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17568 if (FusedOp != 0) {
17569 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17570 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
17571 }
17572 }
17573 }
17574
17575 return SDValue();
17576}
17577
17578SDValue SITargetLowering::performFSubCombine(SDNode *N,
17579 DAGCombinerInfo &DCI) const {
17580 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17581 return SDValue();
17582
17583 SelectionDAG &DAG = DCI.DAG;
17584 SDLoc SL(N);
17585 EVT VT = N->getValueType(0);
17586 assert(!VT.isVector());
17587
17588 // Try to get the fneg to fold into the source modifier. This undoes generic
17589 // DAG combines and folds them into the mad.
17590 //
17591 // Only do this if we are not trying to support denormals. v_mad_f32 does
17592 // not support denormals ever.
17593 SDValue LHS = N->getOperand(0);
17594 SDValue RHS = N->getOperand(1);
17595 if (LHS.getOpcode() == ISD::FADD) {
17596 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
17597 SDValue A = LHS.getOperand(0);
17598 if (A == LHS.getOperand(1)) {
17599 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17600 if (FusedOp != 0) {
17601 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17602 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
17603
17604 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
17605 }
17606 }
17607 }
17608
17609 if (RHS.getOpcode() == ISD::FADD) {
17610 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
17611
17612 SDValue A = RHS.getOperand(0);
17613 if (A == RHS.getOperand(1)) {
17614 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17615 if (FusedOp != 0) {
17616 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
17617 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
17618 }
17619 }
17620 }
17621
17622 return SDValue();
17623}
17624
17625SDValue SITargetLowering::performFDivCombine(SDNode *N,
17626 DAGCombinerInfo &DCI) const {
17627 SelectionDAG &DAG = DCI.DAG;
17628 SDLoc SL(N);
17629 EVT VT = N->getValueType(0);
17630
17631 // fsqrt legality correlates to rsq availability.
17632 if ((VT != MVT::f16 && VT != MVT::bf16) || !isOperationLegal(ISD::FSQRT, VT))
17633 return SDValue();
17634
17635 SDValue LHS = N->getOperand(0);
17636 SDValue RHS = N->getOperand(1);
17637
17638 SDNodeFlags Flags = N->getFlags();
17639 SDNodeFlags RHSFlags = RHS->getFlags();
17640 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
17641 !RHS->hasOneUse())
17642 return SDValue();
17643
17644 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
17645 bool IsNegative = false;
17646 if (CLHS->isExactlyValue(1.0) ||
17647 (IsNegative = CLHS->isExactlyValue(-1.0))) {
17648 // fdiv contract 1.0, (sqrt contract x) -> rsq for f16
17649 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq) for f16
17650 if (RHS.getOpcode() == ISD::FSQRT) {
17651 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
17652 SDValue Rsq =
17653 DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0), Flags);
17654 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
17655 }
17656 }
17657 }
17658
17659 return SDValue();
17660}
17661
17662SDValue SITargetLowering::performFMulCombine(SDNode *N,
17663 DAGCombinerInfo &DCI) const {
17664 SelectionDAG &DAG = DCI.DAG;
17665 EVT VT = N->getValueType(0);
17666 EVT ScalarVT = VT.getScalarType();
17667 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
17668
17669 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
17670 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17671 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
17672 return SDValue();
17673 }
17674
17675 SDValue LHS = N->getOperand(0);
17676 SDValue RHS = N->getOperand(1);
17677
17678 // It is cheaper to materialize i32 inline constants than f16 or f64
17679 // (or even non-inline f32) values; this is possible via ldexp, as shown
17680 // below:
17681 //
17682 // Given : A = 2^a & B = 2^b ; where a and b are integers.
17683 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
17684 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
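// Illustrative worked example (not part of the upstream source): with
// A = 8.0 = 2^3 and B = 0.25 = 2^-2,
//   fmul x, (select y, 8.0, 0.25)   -> ldexp(x, (select i32 y, 3, -2))
//   fmul x, (select y, -8.0, -0.25) -> ldexp(fneg(x), (select i32 y, 3, -2))
// so only the i32 exponents 3 and -2 need to be materialized, both of which
// are inline constants.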
17685 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17686 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
17687 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
17688 if (!TrueNode)
17689 return SDValue();
17690 const ConstantFPSDNode *FalseNode =
17691 isConstOrConstSplatFP(RHS.getOperand(2));
17692 if (!FalseNode)
17693 return SDValue();
17694
17695 if (TrueNode->isNegative() != FalseNode->isNegative())
17696 return SDValue();
17697
17698 // For f32, only non-inline constants should be transformed.
17699 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17700 if (ScalarVT == MVT::f32 &&
17701 TII->isInlineConstant(TrueNode->getValueAPF()) &&
17702 TII->isInlineConstant(FalseNode->getValueAPF()))
17703 return SDValue();
17704
17705 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
17706 if (TrueNodeExpVal == INT_MIN)
17707 return SDValue();
17708 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
17709 if (FalseNodeExpVal == INT_MIN)
17710 return SDValue();
17711
17712 SDLoc SL(N);
17713 SDValue SelectNode =
17714 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
17715 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
17716 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
17717
17718 LHS = TrueNode->isNegative()
17719 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
17720 : LHS;
17721
17722 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
17723 }
17724
17725 return SDValue();
17726}
17727
17728SDValue SITargetLowering::performFMACombine(SDNode *N,
17729 DAGCombinerInfo &DCI) const {
17730 SelectionDAG &DAG = DCI.DAG;
17731 EVT VT = N->getValueType(0);
17732 SDLoc SL(N);
17733
17734 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17735 return SDValue();
17736
17737 // FMA((F32)S0.x, (F32)S1.x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
17738 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
17739 SDValue Op1 = N->getOperand(0);
17740 SDValue Op2 = N->getOperand(1);
17741 SDValue FMA = N->getOperand(2);
17742
17743 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
17744 Op2.getOpcode() != ISD::FP_EXTEND)
17745 return SDValue();
17746
17747 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
17748 // regardless of the denorm mode setting. Therefore,
17749 // fp-contract is sufficient to allow generating fdot2.
17750 const TargetOptions &Options = DAG.getTarget().Options;
17751 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17752 (N->getFlags().hasAllowContract() &&
17753 FMA->getFlags().hasAllowContract())) {
17754 Op1 = Op1.getOperand(0);
17755 Op2 = Op2.getOperand(0);
17756 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17757 Op2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17758 return SDValue();
17759
17760 SDValue Vec1 = Op1.getOperand(0);
17761 SDValue Idx1 = Op1.getOperand(1);
17762 SDValue Vec2 = Op2.getOperand(0);
17763
17764 SDValue FMAOp1 = FMA.getOperand(0);
17765 SDValue FMAOp2 = FMA.getOperand(1);
17766 SDValue FMAAcc = FMA.getOperand(2);
17767
17768 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
17769 FMAOp2.getOpcode() != ISD::FP_EXTEND)
17770 return SDValue();
17771
17772 FMAOp1 = FMAOp1.getOperand(0);
17773 FMAOp2 = FMAOp2.getOperand(0);
17774 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17775 FMAOp2.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
17776 return SDValue();
17777
17778 SDValue Vec3 = FMAOp1.getOperand(0);
17779 SDValue Vec4 = FMAOp2.getOperand(0);
17780 SDValue Idx2 = FMAOp1.getOperand(1);
17781
17782 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
17783 // Idx1 and Idx2 cannot be the same.
17784 Idx1 == Idx2)
17785 return SDValue();
17786
17787 if (Vec1 == Vec2 || Vec3 == Vec4)
17788 return SDValue();
17789
17790 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
17791 return SDValue();
17792
17793 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
17794 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
17795 DAG.getTargetConstant(0, SL, MVT::i1));
17796 }
17797 }
17798 return SDValue();
17799}
17800
17801// Given a double-precision ordered or unordered comparison, return the
17802// condition code for an equivalent integral comparison of the operands' upper
17803// 32 bits, or `SETCC_INVALID` if not possible.
17804// For simplicity, no simplification occurs if the operands are not both known
17805// to have sign bit zero.
17806//
17807// EQ/NE:
17808// If LHS.lo32 == RHS.lo32:
17809// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
17810// If LHS.lo32 != RHS.lo32:
17811// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
17812// The reduction is not possible if operands may be +0 and -0.
17813// For ordered eq / unordered ne, at most one operand may be NaN.
17814// For unordered eq / ordered ne, neither operand can be NaN.
17815//
17816// LT/GE:
17817// If LHS.lo32 >= RHS.lo32 (unsigned):
17818// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
17819// If LHS.lo32 < RHS.lo32 (unsigned):
17820// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
17821// The reduction is only supported if both operands are nonnegative.
17822// For ordered lt / unordered ge, the RHS cannot be NaN.
17823// For unordered lt / ordered ge, neither operand can be NaN.
17824//
17825// LE/GT:
17826// If LHS.lo32 > RHS.lo32 (unsigned):
17827// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
17828// If LHS.lo32 <= RHS.lo32 (unsigned):
17829// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
17830// The reduction is only supported if both operands are nonnegative.
17831// For unordered le / ordered gt, the LHS cannot be NaN.
17832// For ordered le / unordered gt, neither operand can be NaN.
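// Illustrative example (not part of the upstream comment): if LHS and RHS are
// both known nonnegative f64 values, RHS cannot be NaN, and the low 32 bits of
// LHS are known, in the unsigned sense, to be >= the low 32 bits of RHS, then
// "setcc LHS, RHS, olt" reduces to an integer "setcc LHS.hi32, RHS.hi32, lt",
// because for nonnegative IEEE doubles the floating-point order matches the
// unsigned order of the underlying 64-bit bit patterns.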
17833 static ISD::CondCode tryReduceF64CompareToHiHalf(ISD::CondCode CC,
17834 const SDValue LHS,
17835 const SDValue RHS,
17836 const SelectionDAG &DAG) {
17837 EVT VT = LHS.getValueType();
17838 assert(VT == MVT::f64 && "Incorrect operand type!");
17839
17840 const KnownBits RHSBits = DAG.computeKnownBits(RHS);
17841 // Bail if RHS sign bit is not known to be zero.
17842 if (!RHSBits.Zero.isSignBitSet())
17843 return ISD::SETCC_INVALID;
17844
17845 const KnownBits RHSKnownLo32 = RHSBits.trunc(32);
17846 const KnownFPClass RHSFPClass =
17847 DAG.computeKnownFPClass(RHS);
17848 const bool RHSMaybeNaN = !RHSFPClass.isKnownNeverNaN();
17849
17850 const KnownBits LHSBits = DAG.computeKnownBits(LHS);
17851 const KnownBits LHSKnownLo32 = LHSBits.trunc(32);
17852 const KnownFPClass LHSFPClass =
17853 DAG.computeKnownFPClass(LHS);
17854 const bool LHSMaybeNaN = !LHSFPClass.isKnownNeverNaN();
17855
17856 // Bail if LHS sign bit is not known to be zero.
17857 if (!LHSBits.Zero.isSignBitSet())
17858 return ISD::SETCC_INVALID;
17859
17860 switch (CC) {
17861 default:
17862 break;
17863 case ISD::SETEQ:
17864 case ISD::SETOEQ:
17865 case ISD::SETUEQ:
17866 case ISD::SETONE:
17867 case ISD::SETUNE: {
17868 // OEQ should be false if either operand is NaN, so it suffices that at
17869 // least one operand is not NaN.
17870 if (CC == ISD::SETOEQ && LHSMaybeNaN && RHSMaybeNaN)
17871 break;
17872 // UEQ should be true if either operand is NaN, but this cannot be checked
17873 // on underlying bits.
17874 if (CC == ISD::SETUEQ && (LHSMaybeNaN || RHSMaybeNaN))
17875 break;
17876 // ONE should be false if either operand is NaN, but this cannot be
17877 // checked on underlying bits.
17878 if (CC == ISD::SETONE && (LHSMaybeNaN || RHSMaybeNaN))
17879 break;
17880 // UNE should be true if either operand is NaN, so it suffices that they
17881 // are not both NaN.
17882 if (CC == ISD::SETUNE && LHSMaybeNaN && RHSMaybeNaN)
17883 break;
17884
17885 const std::optional<bool> KnownEq =
17886 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
17887
17888 if (!KnownEq)
17889 break;
17890
17891 if (*KnownEq)
17892 return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
17893 ? ISD::SETEQ
17894 : ISD::SETNE;
17895
17896 return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
17897 ? ISD::SETFALSE
17898 : ISD::SETTRUE;
17899 }
17900 case ISD::SETLT:
17901 case ISD::SETOLT:
17902 case ISD::SETULT:
17903 case ISD::SETGE:
17904 case ISD::SETOGE:
17905 case ISD::SETUGE: {
17906 // OLT should be false if either operand is NaN.
17907 // Since NaNs have maximum exponent and nonzero mantissa, false positives
17908 // are only possible if the RHS is NaN. (No issue with RHS == +inf since
17909 // the inequality is strict)
17910 if (CC == ISD::SETOLT && RHSMaybeNaN)
17911 break;
17912 // ULT should be true if either operand is NaN, but this cannot be ensured
17913 // with a truncated comparison.
17914 if (CC == ISD::SETULT && (LHSMaybeNaN || RHSMaybeNaN))
17915 break;
17916 // OGE should be false if either operand is NaN, but this cannot be
17917 // ensured with a truncated comparison.
17918 if (CC == ISD::SETOGE && (LHSMaybeNaN || RHSMaybeNaN))
17919 break;
17920 // UGE should be true if either operand is NaN.
17921 // False negatives are only possible if the RHS is NaN.
17922 // (No issue with RHS == +inf since the inequality is inclusive)
17923 if (CC == ISD::SETUGE && RHSMaybeNaN)
17924 break;
17925
17926 const std::optional<bool> KnownUge =
17927 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
17928
17929 if (!KnownUge)
17930 break;
17931
17932 if (*KnownUge) {
17933 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
17934 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
17935 ? ISD::SETLT
17936 : ISD::SETGE;
17937 }
17938 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
17939 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
17940 ? ISD::SETLE
17941 : ISD::SETGT;
17942 }
17943 case ISD::SETLE:
17944 case ISD::SETOLE:
17945 case ISD::SETULE:
17946 case ISD::SETGT:
17947 case ISD::SETOGT:
17948 case ISD::SETUGT: {
17949 // OLE should be false if either operand is NaN, but this cannot be
17950 // ensured with a truncated comparison.
17951 if (CC == ISD::SETOLE && (LHSMaybeNaN || RHSMaybeNaN))
17952 break;
17953 // ULE should be true if either operand is NaN.
17954 // False negatives are only possible if the LHS is NaN.
17955 // (No issue with LHS == +inf since the inequality is inclusive)
17956 if (CC == ISD::SETULE && LHSMaybeNaN)
17957 break;
17958 // OGT should be false if either operand is NaN.
17959 // False positives are only possible if the LHS is NaN.
17960 // (No issue with LHS == +inf since the inequality is strict)
17961 if (CC == ISD::SETOGT && LHSMaybeNaN)
17962 break;
17963 // UGT should be true if either operand is NaN, but this cannot be ensured
17964 // with a truncated comparison.
17965 if (CC == ISD::SETUGT && (LHSMaybeNaN || RHSMaybeNaN))
17966 break;
17967
17968 const std::optional<bool> KnownUle =
17969 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
17970
17971 if (!KnownUle)
17972 break;
17973
17974 if (*KnownUle) {
17975 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
17976 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
17977 ? ISD::SETLE
17978 : ISD::SETGT;
17979 }
17980 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
17981 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
17982 ? ISD::SETLT
17983 : ISD::SETGE;
17984 }
17985 }
17986
17987 return ISD::SETCC_INVALID;
17988}
17989
17990SDValue SITargetLowering::performSetCCCombine(SDNode *N,
17991 DAGCombinerInfo &DCI) const {
17992 SelectionDAG &DAG = DCI.DAG;
17993 SDLoc SL(N);
17994
17995 SDValue LHS = N->getOperand(0);
17996 SDValue RHS = N->getOperand(1);
17997 EVT VT = LHS.getValueType();
17998 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
17999
18000 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
18001 if (!CRHS) {
18002 CRHS = dyn_cast<ConstantSDNode>(LHS);
18003 if (CRHS) {
18004 std::swap(LHS, RHS);
18005 CC = getSetCCSwappedOperands(CC);
18006 }
18007 }
18008
18009 if (CRHS) {
18010 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
18011 isBoolSGPR(LHS.getOperand(0))) {
18012 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
18013 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
18014 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
18015 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
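// Illustrative trace (not part of the upstream comment): sext i1 true is -1
// and sext i1 false is 0, so e.g. "setcc (sext i1 cc), -1, eq" is true exactly
// when cc is true and folds to cc, while "setcc (sext i1 cc), 0, eq" is the
// logical negation of cc and folds to xor(cc, -1).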
18016 if ((CRHS->isAllOnes() &&
18017 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
18018 (CRHS->isZero() &&
18019 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
18020 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
18021 DAG.getAllOnesConstant(SL, MVT::i1));
18022 if ((CRHS->isAllOnes() &&
18023 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
18024 (CRHS->isZero() &&
18025 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
18026 return LHS.getOperand(0);
18027 }
18028
18029 const APInt &CRHSVal = CRHS->getAPIntValue();
18030 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
18031 LHS.getOpcode() == ISD::SELECT &&
18032 isa<ConstantSDNode>(LHS.getOperand(1)) &&
18033 isa<ConstantSDNode>(LHS.getOperand(2)) &&
18034 isBoolSGPR(LHS.getOperand(0))) {
18035 // Given CT != FT:
18036 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
18037 // setcc (select cc, CT, CF), CF, ne => cc
18038 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
18039 // setcc (select cc, CT, CF), CT, eq => cc
18040 const APInt &CT = LHS.getConstantOperandAPInt(1);
18041 const APInt &CF = LHS.getConstantOperandAPInt(2);
18042
18043 if (CT != CF) {
18044 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
18045 (CT == CRHSVal && CC == ISD::SETNE))
18046 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
18047 if ((CF == CRHSVal && CC == ISD::SETNE) ||
18048 (CT == CRHSVal && CC == ISD::SETEQ))
18049 return LHS.getOperand(0);
18050 }
18051 }
18052 }
18053
18054 // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
18055 // following cases where information about the lower 32-bits of its operands
18056 // is known:
18057 //
18058 // If LHS.lo32 == RHS.lo32:
18059 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
18060 // If LHS.lo32 != RHS.lo32:
18061 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
18062 // If LHS.lo32 >= RHS.lo32 (unsigned):
18063 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
18064 // If LHS.lo32 > RHS.lo32 (unsigned):
18065 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
18066 // If LHS.lo32 <= RHS.lo32 (unsigned):
18067 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
18068 // If LHS.lo32 < RHS.lo32 (unsigned):
18069 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
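// Illustrative example (not part of the upstream comment): if the low 32 bits
// of LHS are known to be all ones and the low 32 bits of RHS are known to be
// zero, then KnownBits::uge proves LHS.lo32 >= RHS.lo32, so a 64-bit
// "setcc LHS, RHS, ult" can be answered by "setcc LHS.hi32, RHS.hi32, ult"
// alone.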
18070 if (VT == MVT::i64) {
18071 const KnownBits LHSKnownLo32 = DAG.computeKnownBits(LHS).trunc(32);
18072 const KnownBits RHSKnownLo32 = DAG.computeKnownBits(RHS).trunc(32);
18073
18074 // NewCC is valid iff we can truncate the setcc to only test the upper 32
18075 // bits
18076 ISD::CondCode NewCC = ISD::SETCC_INVALID;
18077
18078 switch (CC) {
18079 default:
18080 break;
18081 case ISD::SETEQ: {
18082 const std::optional<bool> KnownEq =
18083 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
18084 if (KnownEq)
18085 NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;
18086
18087 break;
18088 }
18089 case ISD::SETNE: {
18090 const std::optional<bool> KnownEq =
18091 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
18092 if (KnownEq)
18093 NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;
18094
18095 break;
18096 }
18097 case ISD::SETULT:
18098 case ISD::SETUGE:
18099 case ISD::SETLT:
18100 case ISD::SETGE: {
18101 const std::optional<bool> KnownUge =
18102 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
18103 if (KnownUge) {
18104 if (*KnownUge) {
18105 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
18106 NewCC = CC;
18107 } else {
18108 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
18109 NewCC = CC == ISD::SETULT ? ISD::SETULE
18110 : CC == ISD::SETUGE ? ISD::SETUGT
18111 : CC == ISD::SETLT ? ISD::SETLE
18112 : ISD::SETGT;
18113 }
18114 }
18115 break;
18116 }
18117 case ISD::SETULE:
18118 case ISD::SETUGT:
18119 case ISD::SETLE:
18120 case ISD::SETGT: {
18121 const std::optional<bool> KnownUle =
18122 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
18123 if (KnownUle) {
18124 if (*KnownUle) {
18125 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
18126 NewCC = CC;
18127 } else {
18128 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
18129 NewCC = CC == ISD::SETULE ? ISD::SETULT
18130 : CC == ISD::SETUGT ? ISD::SETUGE
18131 : CC == ISD::SETLE ? ISD::SETLT
18132 : ISD::SETGE;
18133 }
18134 }
18135 break;
18136 }
18137 }
18138
18139 if (NewCC != ISD::SETCC_INVALID)
18140 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
18141 getHiHalf64(RHS, DAG), NewCC);
18142 }
18143
18144 // Eliminate setcc by using carryout from add/sub instruction
18145
18146 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
18147 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
18148 // similarly for subtraction
18149
18150 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
18151 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
18152
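// Illustrative example (not part of the upstream source): for
//   %sum = add i64 %rhs, %z
//   %cc  = icmp ult i64 %sum, %rhs
// the combine below rebuilds %sum from a 32-bit UADDO/UADDO_CARRY pair and
// returns the final carry-out directly as %cc, avoiding a separate 64-bit
// compare.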
18153 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
18154 sd_match(LHS, m_Add(m_Specific(RHS), m_Value()))) ||
18155 (CC == ISD::SETUGT &&
18156 sd_match(LHS, m_Sub(m_Specific(RHS), m_Value()))) ||
18157 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
18158 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
18159 bool IsAdd = LHS.getOpcode() == ISD::ADD;
18160
18161 SDValue Op0 = LHS.getOperand(0);
18162 SDValue Op1 = LHS.getOperand(1);
18163
18164 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
18165 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
18166
18167 SDValue Op0Hi = getHiHalf64(Op0, DAG);
18168 SDValue Op1Hi = getHiHalf64(Op1, DAG);
18169
18170 SDValue NodeLo =
18171 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
18172 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
18173
18174 SDValue CarryInHi = NodeLo.getValue(1);
18175 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
18176 SL, DAG.getVTList(MVT::i32, MVT::i1),
18177 {Op0Hi, Op1Hi, CarryInHi});
18178
18179 SDValue ResultLo = NodeLo.getValue(0);
18180 SDValue ResultHi = NodeHi.getValue(0);
18181
18182 SDValue JoinedResult =
18183 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
18184
18185 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
18186 SDValue Overflow = NodeHi.getValue(1);
18187 DCI.CombineTo(LHS.getNode(), Result);
18188 return Overflow;
18189 }
18190
18191 if (VT != MVT::f32 && VT != MVT::f64 &&
18192 (!Subtarget->has16BitInsts() || VT != MVT::f16))
18193 return SDValue();
18194
18195 // Match isinf/isfinite pattern
18196 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
18197 // (fcmp one (fabs x), inf) -> (fp_class x,
18198 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
18199 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
18200 LHS.getOpcode() == ISD::FABS) {
18201 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
18202 if (!CRHS)
18203 return SDValue();
18204
18205 const APFloat &APF = CRHS->getValueAPF();
18206 if (APF.isInfinity() && !APF.isNegative()) {
18207 const unsigned IsInfMask =
18208 SIInstrFlags::P_INFINITY | SIInstrFlags::N_INFINITY;
18209 const unsigned IsFiniteMask =
18210 SIInstrFlags::N_ZERO | SIInstrFlags::P_ZERO | SIInstrFlags::N_NORMAL |
18211 SIInstrFlags::P_NORMAL | SIInstrFlags::N_SUBNORMAL |
18212 SIInstrFlags::P_SUBNORMAL;
18213 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
18214 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
18215 DAG.getConstant(Mask, SL, MVT::i32));
18216 }
18217 }
18218
18219 if (VT == MVT::f64) {
18220 ISD::CondCode HiHalfCC = tryReduceF64CompareToHiHalf(CC, LHS, RHS, DAG);
18221 if (HiHalfCC != ISD::SETCC_INVALID)
18222 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
18223 getHiHalf64(RHS, DAG), HiHalfCC);
18224 }
18225
18226 return SDValue();
18227}
18228
18229SDValue
18230SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
18231 DAGCombinerInfo &DCI) const {
18232 SelectionDAG &DAG = DCI.DAG;
18233 SDLoc SL(N);
18234 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
18235
18236 SDValue Src = N->getOperand(0);
18237 SDValue Shift = N->getOperand(0);
18238
18239 // TODO: Extend type shouldn't matter (assuming legal types).
18240 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
18241 Shift = Shift.getOperand(0);
18242
18243 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
18244 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
18245 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
18246 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
18247 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
18248 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
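// Illustrative arithmetic (not part of the upstream source): for
// cvt_f32_ubyte0 (srl x, 16) the code below computes
// ShiftOffset = 8*0 + 16 = 16, which is a multiple of 8 and less than 32,
// so the node is rewritten as cvt_f32_ubyte2 x.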
18249 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
18250 SDValue Shifted = DAG.getZExtOrTrunc(
18251 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
18252
18253 unsigned ShiftOffset = 8 * Offset;
18254 if (Shift.getOpcode() == ISD::SHL)
18255 ShiftOffset -= C->getZExtValue();
18256 else
18257 ShiftOffset += C->getZExtValue();
18258
18259 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
18260 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
18261 MVT::f32, Shifted);
18262 }
18263 }
18264 }
18265
18266 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18267 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
18268 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
18269 // We simplified Src. If this node is not dead, visit it again so it is
18270 // folded properly.
18271 if (N->getOpcode() != ISD::DELETED_NODE)
18272 DCI.AddToWorklist(N);
18273 return SDValue(N, 0);
18274 }
18275
18276 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
18277 if (SDValue DemandedSrc =
18278 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
18279 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
18280
18281 return SDValue();
18282}
18283
18284SDValue SITargetLowering::performClampCombine(SDNode *N,
18285 DAGCombinerInfo &DCI) const {
18286 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
18287 if (!CSrc)
18288 return SDValue();
18289
18290 const MachineFunction &MF = DCI.DAG.getMachineFunction();
18291 const APFloat &F = CSrc->getValueAPF();
18292 APFloat Zero = APFloat::getZero(F.getSemantics());
18293 if (F < Zero ||
18294 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
18295 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
18296 }
18297
18298 APFloat One(F.getSemantics(), "1.0");
18299 if (F > One)
18300 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
18301
18302 return SDValue(CSrc, 0);
18303}
18304
18305SDValue SITargetLowering::performSelectCombine(SDNode *N,
18306 DAGCombinerInfo &DCI) const {
18307
18308 // Try to fold CMP + SELECT patterns with shared constants (both FP and
18309 // integer).
18310 // Detect when CMP and SELECT use the same constant and fold them to avoid
18311 // loading the constant twice. Specifically handles patterns like:
18312 // %cmp = icmp eq i32 %val, 4242
18313 // %sel = select i1 %cmp, i32 4242, i32 %other
18314 // It can be optimized to reuse %val instead of 4242 in select.
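// Illustrative result (not part of the upstream comment): the example above
// becomes
//   %cmp = icmp eq i32 %val, 4242
//   %sel = select i1 %cmp, i32 %val, i32 %other
// which is equivalent because %cmp being true implies %val == 4242, and it
// avoids encoding the non-inline literal 4242 a second time.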
18315 SDValue Cond = N->getOperand(0);
18316 SDValue TrueVal = N->getOperand(1);
18317 SDValue FalseVal = N->getOperand(2);
18318
18319 // Check if condition is a comparison.
18320 if (Cond.getOpcode() != ISD::SETCC)
18321 return SDValue();
18322
18323 SDValue LHS = Cond.getOperand(0);
18324 SDValue RHS = Cond.getOperand(1);
18325 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
18326
18327 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
18328 bool isInteger = LHS.getValueType().isInteger();
18329
18330 // Handle simple floating-point and integer types only.
18331 if (!isFloatingPoint && !isInteger)
18332 return SDValue();
18333
18334 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
18335 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
18336 if (!isEquality && !isNonEquality)
18337 return SDValue();
18338
18339 SDValue ArgVal, ConstVal;
18340 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
18341 (isInteger && isa<ConstantSDNode>(RHS))) {
18342 ConstVal = RHS;
18343 ArgVal = LHS;
18344 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
18345 (isInteger && isa<ConstantSDNode>(LHS))) {
18346 ConstVal = LHS;
18347 ArgVal = RHS;
18348 } else {
18349 return SDValue();
18350 }
18351
18352 // Skip optimization for inlinable immediates.
18353 if (isFloatingPoint) {
18354 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
18355 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
18356 return SDValue();
18357 } else {
18358 const std::optional<int64_t> Val =
18359 cast<ConstantSDNode>(ConstVal)->getAPIntValue().trySExtValue();
18360 if (Val && AMDGPU::isInlinableIntLiteral(*Val))
18361 return SDValue();
18362 }
18363
18364 // For equality and non-equality comparisons, patterns:
18365 // select (setcc x, const), const, y -> select (setcc x, const), x, y
18366 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
18367 if (!(isEquality && TrueVal == ConstVal) &&
18368 !(isNonEquality && FalseVal == ConstVal))
18369 return SDValue();
18370
18371 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
18372 SDValue SelectRHS =
18373 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
18374 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
18375 SelectLHS, SelectRHS);
18376}
18377
18378 SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
18379 DAGCombinerInfo &DCI) const {
18380 switch (N->getOpcode()) {
18381 case ISD::ADD:
18382 case ISD::SUB:
18383 case ISD::SHL:
18384 case ISD::SRL:
18385 case ISD::SRA:
18386 case ISD::AND:
18387 case ISD::OR:
18388 case ISD::XOR:
18389 case ISD::MUL:
18390 case ISD::SETCC:
18391 case ISD::SELECT:
18392 case ISD::SMIN:
18393 case ISD::SMAX:
18394 case ISD::UMIN:
18395 case ISD::UMAX:
18396 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
18397 return Res;
18398 break;
18399 default:
18400 break;
18401 }
18402
18403 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
18404 return SDValue();
18405
18406 switch (N->getOpcode()) {
18407 case ISD::ADD:
18408 return performAddCombine(N, DCI);
18409 case ISD::PTRADD:
18410 return performPtrAddCombine(N, DCI);
18411 case ISD::SUB:
18412 return performSubCombine(N, DCI);
18413 case ISD::UADDO_CARRY:
18414 case ISD::USUBO_CARRY:
18415 return performAddCarrySubCarryCombine(N, DCI);
18416 case ISD::FADD:
18417 return performFAddCombine(N, DCI);
18418 case ISD::FSUB:
18419 return performFSubCombine(N, DCI);
18420 case ISD::FDIV:
18421 return performFDivCombine(N, DCI);
18422 case ISD::FMUL:
18423 return performFMulCombine(N, DCI);
18424 case ISD::SETCC:
18425 return performSetCCCombine(N, DCI);
18426 case ISD::SELECT:
18427 if (auto Res = performSelectCombine(N, DCI))
18428 return Res;
18429 break;
18430 case ISD::FMAXNUM:
18431 case ISD::FMINNUM:
18432 case ISD::FMAXNUM_IEEE:
18433 case ISD::FMINNUM_IEEE:
18434 case ISD::FMAXIMUM:
18435 case ISD::FMINIMUM:
18436 case ISD::FMAXIMUMNUM:
18437 case ISD::FMINIMUMNUM:
18438 case ISD::SMAX:
18439 case ISD::SMIN:
18440 case ISD::UMAX:
18441 case ISD::UMIN:
18442 case AMDGPUISD::FMIN_LEGACY:
18443 case AMDGPUISD::FMAX_LEGACY:
18444 return performMinMaxCombine(N, DCI);
18445 case ISD::FMA:
18446 return performFMACombine(N, DCI);
18447 case ISD::AND:
18448 return performAndCombine(N, DCI);
18449 case ISD::OR:
18450 return performOrCombine(N, DCI);
18451 case ISD::FSHR: {
18452 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18453 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
18454 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
18455 return matchPERM(N, DCI);
18456 }
18457 break;
18458 }
18459 case ISD::XOR:
18460 return performXorCombine(N, DCI);
18461 case ISD::ANY_EXTEND:
18462 case ISD::ZERO_EXTEND:
18463 return performZeroOrAnyExtendCombine(N, DCI);
18464 case ISD::SIGN_EXTEND_INREG:
18465 return performSignExtendInRegCombine(N, DCI);
18466 case AMDGPUISD::FP_CLASS:
18467 return performClassCombine(N, DCI);
18468 case ISD::FCANONICALIZE:
18469 return performFCanonicalizeCombine(N, DCI);
18470 case AMDGPUISD::RCP:
18471 return performRcpCombine(N, DCI);
18472 case ISD::FLDEXP:
18473 case AMDGPUISD::FRACT:
18474 case AMDGPUISD::RSQ:
18475 case AMDGPUISD::RCP_LEGACY:
18476 case AMDGPUISD::RCP_IFLAG:
18477 case AMDGPUISD::RSQ_CLAMP: {
18478 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
18479 SDValue Src = N->getOperand(0);
18480 if (Src.isUndef())
18481 return Src;
18482 break;
18483 }
18484 case ISD::SINT_TO_FP:
18485 case ISD::UINT_TO_FP:
18486 return performUCharToFloatCombine(N, DCI);
18487 case ISD::FCOPYSIGN:
18488 return performFCopySignCombine(N, DCI);
18489 case AMDGPUISD::CVT_F32_UBYTE0:
18490 case AMDGPUISD::CVT_F32_UBYTE1:
18491 case AMDGPUISD::CVT_F32_UBYTE2:
18492 case AMDGPUISD::CVT_F32_UBYTE3:
18493 return performCvtF32UByteNCombine(N, DCI);
18494 case AMDGPUISD::FMED3:
18495 return performFMed3Combine(N, DCI);
18496 case AMDGPUISD::CVT_PKRTZ_F16_F32:
18497 return performCvtPkRTZCombine(N, DCI);
18498 case AMDGPUISD::CLAMP:
18499 return performClampCombine(N, DCI);
18500 case ISD::SCALAR_TO_VECTOR: {
18501 SelectionDAG &DAG = DCI.DAG;
18502 EVT VT = N->getValueType(0);
18503
18504 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
18505 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
18506 SDLoc SL(N);
18507 SDValue Src = N->getOperand(0);
18508 EVT EltVT = Src.getValueType();
18509 if (EltVT != MVT::i16)
18510 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
18511
18512 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
18513 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
18514 }
18515
18516 break;
18517 }
18518 case ISD::EXTRACT_VECTOR_ELT:
18519 return performExtractVectorEltCombine(N, DCI);
18520 case ISD::INSERT_VECTOR_ELT:
18521 return performInsertVectorEltCombine(N, DCI);
18522 case ISD::FP_ROUND:
18523 return performFPRoundCombine(N, DCI);
18524 case ISD::LOAD: {
18525 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
18526 return Widened;
18527 [[fallthrough]];
18528 }
18529 default: {
18530 if (!DCI.isBeforeLegalize()) {
18531 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
18532 return performMemSDNodeCombine(MemNode, DCI);
18533 }
18534
18535 break;
18536 }
18537 }
18538
18539 return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
18540 }
18541
18542/// Helper function for adjustWritemask
18543static unsigned SubIdx2Lane(unsigned Idx) {
18544 switch (Idx) {
18545 default:
18546 return ~0u;
18547 case AMDGPU::sub0:
18548 return 0;
18549 case AMDGPU::sub1:
18550 return 1;
18551 case AMDGPU::sub2:
18552 return 2;
18553 case AMDGPU::sub3:
18554 return 3;
18555 case AMDGPU::sub4:
18556 return 4; // Possible with TFE/LWE
18557 }
18558}
18559
18560/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
18561SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
18562 SelectionDAG &DAG) const {
18563 unsigned Opcode = Node->getMachineOpcode();
18564
18565 // Subtract 1 because the vdata output is not a MachineSDNode operand.
18566 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
18567 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
18568 return Node; // not implemented for D16
18569
18570 SDNode *Users[5] = {nullptr};
18571 unsigned Lane = 0;
18572 unsigned DmaskIdx =
18573 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
18574 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
18575 unsigned NewDmask = 0;
18576 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
18577 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
18578 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
18579 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
18580 unsigned TFCLane = 0;
18581 bool HasChain = Node->getNumValues() > 1;
18582
18583 if (OldDmask == 0) {
18584 // These are folded out, but on the chance it happens don't assert.
18585 return Node;
18586 }
18587
18588 unsigned OldBitsSet = llvm::popcount(OldDmask);
18589 // Work out which is the TFE/LWE lane if that is enabled.
18590 if (UsesTFC) {
18591 TFCLane = OldBitsSet;
18592 }
18593
18594 // Try to figure out the used register components
18595 for (SDUse &Use : Node->uses()) {
18596
18597 // Don't look at users of the chain.
18598 if (Use.getResNo() != 0)
18599 continue;
18600
18601 SDNode *User = Use.getUser();
18602
18603 // Abort if we can't understand the usage
18604 if (!User->isMachineOpcode() ||
18605 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
18606 return Node;
18607
18608 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
18609 // Note that subregs are packed, i.e. Lane==0 is the first bit set
18610 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
18611 // set, etc.
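// Illustrative example (not part of the upstream comment): if
// OldDmask == 0b1010 only the Y and W components are live, so Lane == 0
// refers to Y (the first set bit) and Lane == 1 refers to W (the second set
// bit).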
18612 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
18613 if (Lane == ~0u)
18614 return Node;
18615
18616 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
18617 if (UsesTFC && Lane == TFCLane) {
18618 Users[Lane] = User;
18619 } else {
18620 // Set which texture component corresponds to the lane.
18621 unsigned Comp;
18622 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
18623 Comp = llvm::countr_zero(Dmask);
18624 Dmask &= ~(1 << Comp);
18625 }
18626
18627 // Abort if we have more than one user per component.
18628 if (Users[Lane])
18629 return Node;
18630
18631 Users[Lane] = User;
18632 NewDmask |= 1 << Comp;
18633 }
18634 }
18635
18636 // Don't allow 0 dmask, as hardware assumes one channel enabled.
18637 bool NoChannels = !NewDmask;
18638 if (NoChannels) {
18639 if (!UsesTFC) {
18640 // No uses of the result and not using TFC. Then do nothing.
18641 return Node;
18642 }
18643 // If the original dmask has one channel - then nothing to do
18644 if (OldBitsSet == 1)
18645 return Node;
18646 // Use an arbitrary dmask - required for the instruction to work
18647 NewDmask = 1;
18648 }
18649 // Abort if there's no change
18650 if (NewDmask == OldDmask)
18651 return Node;
18652
18653 unsigned BitsSet = llvm::popcount(NewDmask);
18654
18655 // Check for TFE or LWE - increase the number of channels by one to account
18656 // for the extra return value
18657 // This will need adjustment for D16 if this is also included in
18658 // adjustWriteMask (this function) but at present D16 are excluded.
18659 unsigned NewChannels = BitsSet + UsesTFC;
18660
18661 int NewOpcode =
18662 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
18663 assert(NewOpcode != -1 &&
18664 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
18665 "failed to find equivalent MIMG op");
18666
18667 // Adjust the writemask in the node
18668 SmallVector<SDValue, 12> Ops;
18669 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
18670 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
18671 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
18672
18673 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
18674
18675 MVT ResultVT = NewChannels == 1
18676 ? SVT
18677 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
18678 : NewChannels == 5 ? 8
18679 : NewChannels);
18680 SDVTList NewVTList =
18681 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
18682
18683 MachineSDNode *NewNode =
18684 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
18685
18686 if (HasChain) {
18687 // Update chain.
18688 DAG.setNodeMemRefs(NewNode, Node->memoperands());
18689 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
18690 }
18691
18692 if (NewChannels == 1) {
18693 assert(Node->hasNUsesOfValue(1, 0));
18694 SDNode *Copy =
18695 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
18696 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
18697 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
18698 return nullptr;
18699 }
18700
18701 // Update the users of the node with the new indices
18702 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18703 SDNode *User = Users[i];
18704 if (!User) {
18705 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
18706 // Users[0] is still nullptr because channel 0 doesn't really have a use.
18707 if (i || !NoChannels)
18708 continue;
18709 } else {
18710 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
18711 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
18712 if (NewUser != User) {
18713 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
18714 DAG.RemoveDeadNode(User);
18715 }
18716 }
18717
18718 switch (Idx) {
18719 default:
18720 break;
18721 case AMDGPU::sub0:
18722 Idx = AMDGPU::sub1;
18723 break;
18724 case AMDGPU::sub1:
18725 Idx = AMDGPU::sub2;
18726 break;
18727 case AMDGPU::sub2:
18728 Idx = AMDGPU::sub3;
18729 break;
18730 case AMDGPU::sub3:
18731 Idx = AMDGPU::sub4;
18732 break;
18733 }
18734 }
18735
18736 DAG.RemoveDeadNode(Node);
18737 return nullptr;
18738}
18739
18740 static bool isFrameIndexOp(SDValue Op) {
18741 if (Op.getOpcode() == ISD::AssertZext)
18742 Op = Op.getOperand(0);
18743
18744 return isa<FrameIndexSDNode>(Op);
18745}
18746
18747/// Legalize target independent instructions (e.g. INSERT_SUBREG)
18748/// with frame index operands.
18749 /// LLVM assumes that inputs to these instructions are registers.
18750SDNode *
18751 SITargetLowering::legalizeTargetIndependentNode(SDNode *Node,
18752 SelectionDAG &DAG) const {
18753 if (Node->getOpcode() == ISD::CopyToReg) {
18754 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
18755 SDValue SrcVal = Node->getOperand(2);
18756
18757 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
18758 // to try understanding copies to physical registers.
18759 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
18760 SDLoc SL(Node);
18761 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
18762 SDValue VReg = DAG.getRegister(
18763 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
18764
18765 SDNode *Glued = Node->getGluedNode();
18766 SDValue ToVReg = DAG.getCopyToReg(
18767 Node->getOperand(0), SL, VReg, SrcVal,
18768 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
18769 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
18770 VReg, ToVReg.getValue(1));
18771 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
18772 DAG.RemoveDeadNode(Node);
18773 return ToResultReg.getNode();
18774 }
18775 }
18776
18777 SmallVector<SDValue, 8> Ops;
18778 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
18779 if (!isFrameIndexOp(Node->getOperand(i))) {
18780 Ops.push_back(Node->getOperand(i));
18781 continue;
18782 }
18783
18784 SDLoc DL(Node);
18785 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
18786 Node->getOperand(i).getValueType(),
18787 Node->getOperand(i)),
18788 0));
18789 }
18790
18791 return DAG.UpdateNodeOperands(Node, Ops);
18792}
18793
18794/// Fold the instructions after selecting them.
18795/// Returns null if users were already updated.
18796 SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
18797 SelectionDAG &DAG) const {
18798 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18799 unsigned Opcode = Node->getMachineOpcode();
18800
18801 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
18802 !TII->isGather4(Opcode) &&
18803 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
18804 return adjustWritemask(Node, DAG);
18805 }
18806
18807 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
18808 legalizeTargetIndependentNode(Node, DAG);
18809 return Node;
18810 }
18811
18812 switch (Opcode) {
18813 case AMDGPU::V_DIV_SCALE_F32_e64:
18814 case AMDGPU::V_DIV_SCALE_F64_e64: {
18815 // Satisfy the operand register constraint when one of the inputs is
18816 // undefined. Ordinarily each undef value will have its own implicit_def of
18817 // a vreg, so force these to use a single register.
18818 SDValue Src0 = Node->getOperand(1);
18819 SDValue Src1 = Node->getOperand(3);
18820 SDValue Src2 = Node->getOperand(5);
18821
18822 if ((Src0.isMachineOpcode() &&
18823 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
18824 (Src0 == Src1 || Src0 == Src2))
18825 break;
18826
18827 MVT VT = Src0.getValueType().getSimpleVT();
18828 const TargetRegisterClass *RC =
18829 getRegClassFor(VT, Src0.getNode()->isDivergent());
18830
18831 MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
18832 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
18833
18834 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
18835 Src0, SDValue());
18836
18837 // src0 must be the same register as src1 or src2, even if the value is
18838 // undefined, so make sure we don't violate this constraint.
18839 if (Src0.isMachineOpcode() &&
18840 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
18841 if (Src1.isMachineOpcode() &&
18842 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
18843 Src0 = Src1;
18844 else if (Src2.isMachineOpcode() &&
18845 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
18846 Src0 = Src2;
18847 else {
18848 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
18849 Src0 = UndefReg;
18850 Src1 = UndefReg;
18851 }
18852 } else
18853 break;
18855 SmallVector<SDValue, 9> Ops(Node->ops());
18856 Ops[1] = Src0;
18857 Ops[3] = Src1;
18858 Ops[5] = Src2;
18859 Ops.push_back(ImpDef.getValue(1));
18860 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
18861 }
18862 default:
18863 break;
18864 }
18865
18866 return Node;
18867}
18868
18869// Any MIMG instructions that use tfe or lwe require an initialization of the
18870// result register that will be written in the case of a memory access failure.
18871// The required code is also added to tie this init code to the result of the
18872// img instruction.
18875 const SIRegisterInfo &TRI = TII->getRegisterInfo();
18876 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
18877 MachineBasicBlock &MBB = *MI.getParent();
18878
18879 int DstIdx =
18880 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
18881 unsigned InitIdx = 0;
18882
18883 if (TII->isImage(MI)) {
18884 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
18885 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
18886 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
18887
18888 if (!TFE && !LWE) // intersect_ray
18889 return;
18890
18891 unsigned TFEVal = TFE ? TFE->getImm() : 0;
18892 unsigned LWEVal = LWE ? LWE->getImm() : 0;
18893 unsigned D16Val = D16 ? D16->getImm() : 0;
18894
18895 if (!TFEVal && !LWEVal)
18896 return;
18897
18898 // At least one of TFE or LWE are non-zero
18899 // We have to insert a suitable initialization of the result value and
18900 // tie this to the dest of the image instruction.
18901
18902 // Calculate which dword we have to initialize to 0.
18903 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
18904
18905 // check that dmask operand is found.
18906 assert(MO_Dmask && "Expected dmask operand in instruction");
18907
18908 unsigned dmask = MO_Dmask->getImm();
18909 // Determine the number of active lanes taking into account the
18910 // Gather4 special case
18911 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
18912
18913 bool Packed = !Subtarget->hasUnpackedD16VMem();
18914
18915 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
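// Illustrative arithmetic (not part of the upstream source): with
// dmask = 0b0111 (three active lanes) and TFE set, InitIdx is 3 + 1 = 4
// dwords for unpacked data, or ((3 + 1) >> 1) + 1 = 3 dwords when D16 data
// is packed two components per dword.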
18916
18917 // Abandon attempt if the dst size isn't large enough
18918 // - this is in fact an error but this is picked up elsewhere and
18919 // reported correctly.
18920 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18921
18922 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
18923 if (DstSize < InitIdx)
18924 return;
18925 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
18926 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
18927 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
18928 } else {
18929 return;
18930 }
18931
18932 const DebugLoc &DL = MI.getDebugLoc();
18933
18934 // Create a register for the initialization value.
18935 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
18936 unsigned NewDst = 0; // Final initialized value will be in here
18937
18938 // If PRTStrictNull feature is enabled (the default) then initialize
18939 // all the result registers to 0, otherwise just the error indication
18940 // register (VGPRn+1)
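// Illustrative example (not part of the upstream comment): continuing the
// dmask = 0b0111 + TFE case with InitIdx = 4, PRTStrictNull zero-initializes
// all four dwords (SizeLeft = 4 starting at dword 0); without it only the TFE
// status dword is cleared (SizeLeft = 1 starting at dword 3).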
18941 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
18942 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
18943
18944 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
18945 for (; SizeLeft; SizeLeft--, CurrIdx++) {
18946 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
18947 // Initialize dword
18948 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
18949 // clang-format off
18950 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
18951 .addImm(0);
18952 // clang-format on
18953 // Insert into the super-reg
18954 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
18955 .addReg(PrevDst)
18956 .addReg(SubReg)
18957 .addImm(AMDGPU::sub0 + CurrIdx);
18958
18959 PrevDst = NewDst;
18960 }
18961
18962 // Add as an implicit operand
18963 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
18964
18965 // Tie the just added implicit operand to the dst
18966 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
18967}
18968
18969/// Assign the register class depending on the number of
18970/// bits set in the writemask
18971 void SITargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
18972 SDNode *Node) const {
18973 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
18974
18975 MachineFunction *MF = MI.getMF();
18976 MachineRegisterInfo &MRI = MF->getRegInfo();
18977
18978 if (TII->isVOP3(MI.getOpcode())) {
18979 // Make sure constant bus requirements are respected.
18980 TII->legalizeOperandsVOP3(MRI, MI);
18981
18982 if (TII->isMAI(MI)) {
18983 // The ordinary src0, src1, src2 were legalized above.
18984 //
18985 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
18986 // as a separate instruction.
18987 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18988 AMDGPU::OpName::scale_src0);
18989 if (Src0Idx != -1) {
18990 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
18991 AMDGPU::OpName::scale_src1);
18992 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
18993 TII->usesConstantBus(MRI, MI, Src1Idx))
18994 TII->legalizeOpWithMove(MI, Src1Idx);
18995 }
18996 }
18997
18998 return;
18999 }
19000
19001 if (TII->isImage(MI))
19002 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
19003}
19004
19005 static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL,
19006 uint64_t Val) {
19007 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
19008 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
19009}
19010
19011 MachineSDNode *SITargetLowering::wrapAddr64Rsrc(SelectionDAG &DAG,
19012 const SDLoc &DL,
19013 SDValue Ptr) const {
19014 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19015
19016 // Build the half of the subregister with the constants before building the
19017 // full 128-bit register. If we are building multiple resource descriptors,
19018 // this will allow CSEing of the 2-component register.
19019 const SDValue Ops0[] = {
19020 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
19021 buildSMovImm32(DAG, DL, 0),
19022 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
19023 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
19024 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
19025
19026 SDValue SubRegHi = SDValue(
19027 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
19028
19029 // Combine the constants and the pointer.
19030 const SDValue Ops1[] = {
19031 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
19032 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
19033 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
19034
19035 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
19036}
19037
19038/// Return a resource descriptor with the 'Add TID' bit enabled
19039/// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
19040/// of the resource descriptor) to create an offset, which is added to
19041/// the resource pointer.
19042 MachineSDNode *SITargetLowering::buildRSRC(SelectionDAG &DAG, const SDLoc &DL,
19043 SDValue Ptr, uint32_t RsrcDword1,
19044 uint64_t RsrcDword2And3) const {
19045 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
19046 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
19047 if (RsrcDword1) {
19048 PtrHi =
19049 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
19050 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
19051 0);
19052 }
19053
19054 SDValue DataLo =
19055 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
19056 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
19057
19058 const SDValue Ops[] = {
19059 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
19060 PtrLo,
19061 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
19062 PtrHi,
19063 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
19064 DataLo,
19065 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
19066 DataHi,
19067 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
19068
19069 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
19070}
19071
19072//===----------------------------------------------------------------------===//
19073// SI Inline Assembly Support
19074//===----------------------------------------------------------------------===//
19075
19076std::pair<unsigned, const TargetRegisterClass *>
19077 SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI_,
19078 StringRef Constraint,
19079 MVT VT) const {
19080 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
19081
19082 const TargetRegisterClass *RC = nullptr;
19083 if (Constraint.size() == 1) {
19084 // Check if we cannot determine the bit size of the given value type. This
19085 // can happen, for example, in this situation where we have an empty struct
19086 // (size 0): `call void asm "", "v"({} poison)`.
19087 if (VT == MVT::Other)
19088 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19089 const unsigned BitWidth = VT.getSizeInBits();
19090 switch (Constraint[0]) {
19091 default:
19092 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19093 case 's':
19094 case 'r':
19095 switch (BitWidth) {
19096 case 16:
19097 RC = &AMDGPU::SReg_32RegClass;
19098 break;
19099 case 64:
19100 RC = &AMDGPU::SGPR_64RegClass;
19101 break;
19102 default:
19103 RC = TRI->getSGPRClassForBitWidth(BitWidth);
19104 if (!RC)
19105 return std::pair(0U, nullptr);
19106 break;
19107 }
19108 break;
19109 case 'v':
19110 switch (BitWidth) {
19111 case 1:
19112 return std::pair(0U, nullptr);
19113 case 16:
19114 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
19115 : &AMDGPU::VGPR_32_Lo256RegClass;
19116 break;
19117 default:
19118 RC = Subtarget->has1024AddressableVGPRs()
19119 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
19120 : TRI->getVGPRClassForBitWidth(BitWidth);
19121 if (!RC)
19122 return std::pair(0U, nullptr);
19123 break;
19124 }
19125 break;
19126 case 'a':
19127 if (!Subtarget->hasMAIInsts())
19128 break;
19129 switch (BitWidth) {
19130 case 1:
19131 return std::pair(0U, nullptr);
19132 case 16:
19133 RC = &AMDGPU::AGPR_32RegClass;
19134 break;
19135 default:
19136 RC = TRI->getAGPRClassForBitWidth(BitWidth);
19137 if (!RC)
19138 return std::pair(0U, nullptr);
19139 break;
19140 }
19141 break;
19142 }
19143 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
19144 const unsigned BitWidth = VT.getSizeInBits();
19145 switch (BitWidth) {
19146 case 16:
19147 RC = &AMDGPU::AV_32RegClass;
19148 break;
19149 default:
19150 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
19151 if (!RC)
19152 return std::pair(0U, nullptr);
19153 break;
19154 }
19155 }
19156
19157 // We actually support i128, i16 and f16 as inline parameters
19158 // even if they are not reported as legal
19159 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
19160 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
19161 return std::pair(0U, RC);
19162
19163 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
19164 if (Kind != '\0') {
19165 if (Kind == 'v') {
19166 RC = &AMDGPU::VGPR_32_Lo256RegClass;
19167 } else if (Kind == 's') {
19168 RC = &AMDGPU::SGPR_32RegClass;
19169 } else if (Kind == 'a') {
19170 RC = &AMDGPU::AGPR_32RegClass;
19171 }
19172
19173 if (RC) {
19174 if (NumRegs > 1) {
19175 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
19176 return std::pair(0U, nullptr);
19177
19178 uint32_t Width = NumRegs * 32;
19179 // Prohibit constraints for register ranges with a width that does not
19180 // match the required type.
19181 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
19182 return std::pair(0U, nullptr);
19183
19184 MCRegister Reg = RC->getRegister(Idx);
19185 if (SIRegisterInfo::isVGPRClass(RC))
19186 RC = TRI->getVGPRClassForBitWidth(Width);
19187 else if (SIRegisterInfo::isSGPRClass(RC))
19188 RC = TRI->getSGPRClassForBitWidth(Width);
19189 else if (SIRegisterInfo::isAGPRClass(RC))
19190 RC = TRI->getAGPRClassForBitWidth(Width);
19191 if (RC) {
19192 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
19193 if (!Reg) {
19194 // The register class does not contain the requested register,
19195 // e.g., because it is an SGPR pair that would violate alignment
19196 // requirements.
19197 return std::pair(0U, nullptr);
19198 }
19199 return std::pair(Reg, RC);
19200 }
19201 }
19202
19203 // Check for lossy scalar/vector conversions.
19204 if (VT.isVector() && VT.getSizeInBits() != 32)
19205 return std::pair(0U, nullptr);
19206 if (RC && Idx < RC->getNumRegs())
19207 return std::pair(RC->getRegister(Idx), RC);
19208 return std::pair(0U, nullptr);
19209 }
19210 }
19211
19212 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19213 if (Ret.first)
19214 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
19215
19216 return Ret;
19217}
19218
19219static bool isImmConstraint(StringRef Constraint) {
19220 if (Constraint.size() == 1) {
19221 switch (Constraint[0]) {
19222 default:
19223 break;
19224 case 'I':
19225 case 'J':
19226 case 'A':
19227 case 'B':
19228 case 'C':
19229 return true;
19230 }
19231 } else if (Constraint == "DA" || Constraint == "DB") {
19232 return true;
19233 }
19234 return false;
19235}
19236
19237 SITargetLowering::ConstraintType
19238 SITargetLowering::getConstraintType(StringRef Constraint) const {
19239 if (Constraint.size() == 1) {
19240 switch (Constraint[0]) {
19241 default:
19242 break;
19243 case 's':
19244 case 'v':
19245 case 'a':
19246 return C_RegisterClass;
19247 }
19248 } else if (Constraint.size() == 2) {
19249 if (Constraint == "VA")
19250 return C_RegisterClass;
19251 }
19252 if (isImmConstraint(Constraint)) {
19253 return C_Other;
19254 }
19255 return TargetLowering::getConstraintType(Constraint);
19256}
19257
19258static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
19259 if (Size < 64) {
19260 Val = Val & maskTrailingOnes<uint64_t>(Size);
19261 }
19262 return Val;
19263}
19264
19265 void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
19266 StringRef Constraint,
19267 std::vector<SDValue> &Ops,
19268 SelectionDAG &DAG) const {
19269 if (isImmConstraint(Constraint)) {
19270 uint64_t Val;
19271 if (getAsmOperandConstVal(Op, Val) &&
19272 checkAsmConstraintVal(Op, Constraint, Val)) {
19273 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
19274 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
19275 }
19276 } else {
19277 TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
19278 }
19279}
19280
19281 bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
19282 unsigned Size = Op.getScalarValueSizeInBits();
19283 if (Size > 64)
19284 return false;
19285
19286 if (Size == 16 && !Subtarget->has16BitInsts())
19287 return false;
19288
19289 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
19290 Val = C->getSExtValue();
19291 return true;
19292 }
19293 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
19294 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19295 return true;
19296 }
19297 if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
19298 if (Size != 16 || Op.getNumOperands() != 2)
19299 return false;
19300 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
19301 return false;
19302 if (ConstantSDNode *C = V->getConstantSplatNode()) {
19303 Val = C->getSExtValue();
19304 return true;
19305 }
19306 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
19307 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19308 return true;
19309 }
19310 }
19311
19312 return false;
19313}
19314
19315 bool SITargetLowering::checkAsmConstraintVal(SDValue Op, StringRef Constraint,
19316 uint64_t Val) const {
19317 if (Constraint.size() == 1) {
19318 switch (Constraint[0]) {
19319 case 'I':
19320 return AMDGPU::isInlinableIntLiteral(Val);
19321 case 'J':
19322 return isInt<16>(Val);
19323 case 'A':
19324 return checkAsmConstraintValA(Op, Val);
19325 case 'B':
19326 return isInt<32>(Val);
19327 case 'C':
19328 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
19329 AMDGPU::isInlinableIntLiteral(Val);
19330 default:
19331 break;
19332 }
19333 } else if (Constraint.size() == 2) {
19334 if (Constraint == "DA") {
19335 int64_t HiBits = static_cast<int32_t>(Val >> 32);
19336 int64_t LoBits = static_cast<int32_t>(Val);
19337 return checkAsmConstraintValA(Op, HiBits, 32) &&
19338 checkAsmConstraintValA(Op, LoBits, 32);
19339 }
19340 if (Constraint == "DB") {
19341 return true;
19342 }
19343 }
19344 llvm_unreachable("Invalid asm constraint");
19345}
19346
19347 bool SITargetLowering::checkAsmConstraintValA(SDValue Op, uint64_t Val,
19348 unsigned MaxSize) const {
19349 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
19350 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
19351 if (Size == 16) {
19352 MVT VT = Op.getSimpleValueType();
19353 switch (VT.SimpleTy) {
19354 default:
19355 return false;
19356 case MVT::i16:
19357 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
19358 case MVT::f16:
19359 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
19360 case MVT::bf16:
19361 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
19362 case MVT::v2i16:
19363 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
19364 case MVT::v2f16:
19365 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
19366 case MVT::v2bf16:
19367 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
19368 }
19369 }
19370 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
19371 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
19372 return true;
19373 return false;
19374}
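// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// checkAsmConstraintValA() above accepts exactly the hardware inline constants:
// for 32-bit operands that is the integers -16..64 and the FP values 0.0,
// +/-0.5, +/-1.0, +/-2.0, +/-4.0 (plus 1/(2*pi) when the subtarget has it).
// A minimal sketch of probing an f32 immediate the same way, using the
// AMDGPUBaseInfo helper already used above; the function name is made up for
// illustration.
static bool exampleIsInlineImmF32(float F, bool HasInv2Pi) {
  int32_t Bits = llvm::bit_cast<int32_t>(F);
  return llvm::AMDGPU::isInlinableLiteral32(Bits, HasInv2Pi);
}
// -----------------------------------------------------------------------------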
19375
19376static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
19377 switch (UnalignedClassID) {
19378 case AMDGPU::VReg_64RegClassID:
19379 return AMDGPU::VReg_64_Align2RegClassID;
19380 case AMDGPU::VReg_96RegClassID:
19381 return AMDGPU::VReg_96_Align2RegClassID;
19382 case AMDGPU::VReg_128RegClassID:
19383 return AMDGPU::VReg_128_Align2RegClassID;
19384 case AMDGPU::VReg_160RegClassID:
19385 return AMDGPU::VReg_160_Align2RegClassID;
19386 case AMDGPU::VReg_192RegClassID:
19387 return AMDGPU::VReg_192_Align2RegClassID;
19388 case AMDGPU::VReg_224RegClassID:
19389 return AMDGPU::VReg_224_Align2RegClassID;
19390 case AMDGPU::VReg_256RegClassID:
19391 return AMDGPU::VReg_256_Align2RegClassID;
19392 case AMDGPU::VReg_288RegClassID:
19393 return AMDGPU::VReg_288_Align2RegClassID;
19394 case AMDGPU::VReg_320RegClassID:
19395 return AMDGPU::VReg_320_Align2RegClassID;
19396 case AMDGPU::VReg_352RegClassID:
19397 return AMDGPU::VReg_352_Align2RegClassID;
19398 case AMDGPU::VReg_384RegClassID:
19399 return AMDGPU::VReg_384_Align2RegClassID;
19400 case AMDGPU::VReg_512RegClassID:
19401 return AMDGPU::VReg_512_Align2RegClassID;
19402 case AMDGPU::VReg_1024RegClassID:
19403 return AMDGPU::VReg_1024_Align2RegClassID;
19404 case AMDGPU::AReg_64RegClassID:
19405 return AMDGPU::AReg_64_Align2RegClassID;
19406 case AMDGPU::AReg_96RegClassID:
19407 return AMDGPU::AReg_96_Align2RegClassID;
19408 case AMDGPU::AReg_128RegClassID:
19409 return AMDGPU::AReg_128_Align2RegClassID;
19410 case AMDGPU::AReg_160RegClassID:
19411 return AMDGPU::AReg_160_Align2RegClassID;
19412 case AMDGPU::AReg_192RegClassID:
19413 return AMDGPU::AReg_192_Align2RegClassID;
19414 case AMDGPU::AReg_256RegClassID:
19415 return AMDGPU::AReg_256_Align2RegClassID;
19416 case AMDGPU::AReg_512RegClassID:
19417 return AMDGPU::AReg_512_Align2RegClassID;
19418 case AMDGPU::AReg_1024RegClassID:
19419 return AMDGPU::AReg_1024_Align2RegClassID;
19420 default:
19421 return -1;
19422 }
19423}
19424
19425// Figure out which registers should be reserved for stack access. Only after
19426// the function is legalized do we know all of the non-spill stack objects or if
19427 // calls are present.
19428 void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
19429 MachineRegisterInfo &MRI = MF.getRegInfo();
19430 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
19431 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
19432 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19433 const SIInstrInfo *TII = ST.getInstrInfo();
19434
19435 if (Info->isEntryFunction()) {
19436 // Callable functions have fixed registers used for stack access.
19437 reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
19438 }
19439
19440 // TODO: Move this logic to getReservedRegs()
19441 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
19442 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
19443 Register SReg = ST.isWave32()
19444 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
19445 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
19446 &AMDGPU::SGPR_64RegClass);
19447 Info->setSGPRForEXECCopy(SReg);
19448
19449 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
19450 Info->getStackPtrOffsetReg()));
19451 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
19452 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
19453
19454 // We need to worry about replacing the default register with itself in case
19455 // of MIR testcases missing the MFI.
19456 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
19457 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
19458
19459 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
19460 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
19461
19462 Info->limitOccupancy(MF);
19463
19464 if (ST.isWave32() && !MF.empty()) {
19465 for (auto &MBB : MF) {
19466 for (auto &MI : MBB) {
19467 TII->fixImplicitOperands(MI);
19468 }
19469 }
19470 }
19471
19472 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
19473 // classes if required. Ideally the register class constraints would differ
19474 // per-subtarget, but there's no easy way to achieve that right now. This is
19475 // not a problem for VGPRs because the correctly aligned VGPR class is implied
19476 // from using them as the register class for legal types.
19477 if (ST.needsAlignedVGPRs()) {
19478 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
19479 const Register Reg = Register::index2VirtReg(I);
19480 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
19481 if (!RC)
19482 continue;
19483 int NewClassID = getAlignedAGPRClassID(RC->getID());
19484 if (NewClassID != -1)
19485 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
19486 }
19487 }
19488
19490}
19491
19492 void SITargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
19493 KnownBits &Known,
19494 const APInt &DemandedElts,
19495 const SelectionDAG &DAG,
19496 unsigned Depth) const {
19497 Known.resetAll();
19498 unsigned Opc = Op.getOpcode();
19499 switch (Opc) {
19500 case ISD::INTRINSIC_WO_CHAIN: {
19501 unsigned IID = Op.getConstantOperandVal(0);
19502 switch (IID) {
19503 case Intrinsic::amdgcn_mbcnt_lo:
19504 case Intrinsic::amdgcn_mbcnt_hi: {
19505 const GCNSubtarget &ST =
19506 DAG.getMachineFunction().getSubtarget<GCNSubtarget>();
19507 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19508 // most 31 + src1.
19509 Known.Zero.setBitsFrom(
19510 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
19511 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
19512 Known = KnownBits::add(Known, Known2);
19513 return;
19514 }
19515 }
19516 break;
19517 }
19518 }
19519 AMDGPUTargetLowering::computeKnownBitsForTargetNode(
19520 Op, Known, DemandedElts, DAG, Depth);
19521}
19522
19523 void SITargetLowering::computeKnownBitsForFrameIndex(
19524 const int FI, KnownBits &Known, const MachineFunction &MF) const {
19525 TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
19526
19527 // Set the high bits to zero based on the maximum allowed scratch size per
19528 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
19529 // calculation won't overflow, so assume the sign bit is never set.
19530 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
19531}
19532
19533 static void knownBitsForWorkitemID(const GCNSubtarget &ST,
19534 GISelValueTracking &VT, KnownBits &Known,
19535 unsigned Dim) {
19536 unsigned MaxValue =
19537 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
19538 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
19539}
19540
19541 static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT,
19542 KnownBits &Known, const APInt &DemandedElts,
19543 unsigned BFEWidth, bool SExt, unsigned Depth) {
19544 const MachineRegisterInfo &MRI = VT.getMachineFunction().getRegInfo();
19545 const MachineOperand &Src1 = MI.getOperand(2);
19546
19547 unsigned Src1Cst = 0;
19548 if (Src1.isImm()) {
19549 Src1Cst = Src1.getImm();
19550 } else if (Src1.isReg()) {
19551 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
19552 if (!Cst)
19553 return;
19554 Src1Cst = Cst->Value.getZExtValue();
19555 } else {
19556 return;
19557 }
19558
19559 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
19560 // Width is always [22:16].
19561 const unsigned Offset =
19562 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
19563 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
19564
19565 if (Width >= BFEWidth) // Ill-formed.
19566 return;
19567
19568 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
19569 Depth + 1);
19570
19571 Known = Known.extractBits(Width, Offset);
19572
19573 if (SExt)
19574 Known = Known.sext(BFEWidth);
19575 else
19576 Known = Known.zext(BFEWidth);
19577}
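// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// Worked example of the S_BFE src1 encoding decoded above: the offset sits in
// the low bits (5 of them for the 32-bit form) and the width starts at bit 16.
// A packed constant of 0x0008000C therefore selects width = 8 starting at bit
// 12. Minimal sketch mirroring the masks used in knownBitsForSBFE().
static std::pair<unsigned, unsigned> exampleDecodeSBFE32(unsigned Src1Cst) {
  unsigned Offset = Src1Cst & llvm::maskTrailingOnes<unsigned>(5);
  unsigned Width = (Src1Cst >> 16) & llvm::maskTrailingOnes<unsigned>(6);
  return {Offset, Width}; // 0x0008000C -> {12, 8}
}
// -----------------------------------------------------------------------------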
19578
19579 void SITargetLowering::computeKnownBitsForTargetInstr(
19580 GISelValueTracking &VT, Register R, KnownBits &Known,
19581 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
19582 unsigned Depth) const {
19583 Known.resetAll();
19584 const MachineInstr *MI = MRI.getVRegDef(R);
19585 switch (MI->getOpcode()) {
19586 case AMDGPU::S_BFE_I32:
19587 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
19588 /*SExt=*/true, Depth);
19589 case AMDGPU::S_BFE_U32:
19590 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
19591 /*SExt=*/false, Depth);
19592 case AMDGPU::S_BFE_I64:
19593 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
19594 /*SExt=*/true, Depth);
19595 case AMDGPU::S_BFE_U64:
19596 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
19597 /*SExt=*/false, Depth);
19598 case AMDGPU::G_INTRINSIC:
19599 case AMDGPU::G_INTRINSIC_CONVERGENT: {
19600 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
19601 switch (IID) {
19602 case Intrinsic::amdgcn_workitem_id_x:
19603 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
19604 break;
19605 case Intrinsic::amdgcn_workitem_id_y:
19606 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
19607 break;
19608 case Intrinsic::amdgcn_workitem_id_z:
19609 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
19610 break;
19611 case Intrinsic::amdgcn_mbcnt_lo:
19612 case Intrinsic::amdgcn_mbcnt_hi: {
19613 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19614 // most 31 + src1.
19615 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
19616 ? getSubtarget()->getWavefrontSizeLog2()
19617 : 5);
19618 KnownBits Known2;
19619 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
19620 Depth + 1);
19621 Known = KnownBits::add(Known, Known2);
19622 break;
19623 }
19624 case Intrinsic::amdgcn_groupstaticsize: {
19625 // We can report everything over the maximum size as 0. We can't report
19626 // based on the actual size because we don't know if it's accurate or not
19627 // at any given point.
19628 Known.Zero.setHighBits(
19629 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
19630 break;
19631 }
19632 }
19633 break;
19634 }
19635 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
19636 Known.Zero.setHighBits(24);
19637 break;
19638 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
19639 Known.Zero.setHighBits(16);
19640 break;
19641 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
19642 // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
19643 // producing exactly 0 or 1.
19644 Known.Zero.setHighBits(Known.getBitWidth() - 1);
19645 break;
19646 case AMDGPU::G_AMDGPU_SMED3:
19647 case AMDGPU::G_AMDGPU_UMED3: {
19648 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
19649
19650 KnownBits Known2;
19651 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
19652 if (Known2.isUnknown())
19653 break;
19654
19655 KnownBits Known1;
19656 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
19657 if (Known1.isUnknown())
19658 break;
19659
19660 KnownBits Known0;
19661 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
19662 if (Known0.isUnknown())
19663 break;
19664
19665 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
19666 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
19667 Known.One = Known0.One & Known1.One & Known2.One;
19668 break;
19669 }
19670 }
19671}
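// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// The mbcnt bound used above comes from the common lane-index idiom: on a
// wave64 target, mbcnt.lo(~0, 0) is at most 32 (so bits >= 6 are known zero
// before src1 is added), and mbcnt.hi(~0, lo) is at most 63. A minimal device
// code sketch of the idiom, assuming Clang's amdgcn builtins.
static unsigned exampleWaveLaneIndex() {
  unsigned Lo = __builtin_amdgcn_mbcnt_lo(~0u, 0u);
  return __builtin_amdgcn_mbcnt_hi(~0u, Lo); // lane id within the wave
}
// -----------------------------------------------------------------------------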
19672
19673 Align SITargetLowering::computeKnownAlignForTargetInstr(
19674 GISelValueTracking &VT, Register R, const MachineRegisterInfo &MRI,
19675 unsigned Depth) const {
19676 const MachineInstr *MI = MRI.getVRegDef(R);
19677 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
19678 // FIXME: Can this move to generic code? What about the case where the call
19679 // site specifies a lower alignment?
19680 Intrinsic::ID IID = GI->getIntrinsicID();
19682 AttributeList Attrs =
19683 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
19684 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
19685 return *RetAlign;
19686 }
19687 return Align(1);
19688}
19689
19692 const Align CacheLineAlign = Align(64);
19693
19694 // GFX950: Prevent an 8-byte instruction at loop header from being split by
19695 // the 32-byte instruction fetch window boundary. This avoids a significant
19696 // fetch delay after backward branch. We use 32-byte alignment with max
19697 // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
19698 if (ML && !DisableLoopAlignment &&
19699 getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
19700 const MachineBasicBlock *Header = ML->getHeader();
19701 // Respect user-specified or previously set alignment.
19702 if (Header->getAlignment() != PrefAlign)
19703 return Header->getAlignment();
19704 if (needsFetchWindowAlignment(*Header))
19705 return Align(32);
19706 }
19707
19708 // Pre-GFX10 targets did not benefit from loop alignment.
19709 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
19710 getSubtarget()->hasInstFwdPrefetchBug())
19711 return PrefAlign;
19712
19713 // On GFX10 I$ is 4 x 64 bytes cache lines.
19714 // By default prefetcher keeps one cache line behind and reads two ahead.
19715 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
19716 // behind and one ahead.
19717 // Therefore we can benefit from aligning loop headers if the loop fits in 192
19718 // bytes. If the loop fits in 64 bytes it always spans no more than two cache
19719 // lines and does not need alignment.
19720 // Otherwise, if the loop is at most 128 bytes we do not need to modify the
19721 // prefetch settings; if it is at most 192 bytes we need two lines behind.
19722
19723 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19724 const MachineBasicBlock *Header = ML->getHeader();
19725 if (Header->getAlignment() != PrefAlign)
19726 return Header->getAlignment(); // Already processed.
19727
19728 unsigned LoopSize = 0;
19729 for (const MachineBasicBlock *MBB : ML->blocks()) {
19730 // If an inner loop block is aligned, assume that on average half of the
19731 // alignment size will be added as nops.
19732 if (MBB != Header)
19733 LoopSize += MBB->getAlignment().value() / 2;
19734
19735 for (const MachineInstr &MI : *MBB) {
19736 LoopSize += TII->getInstSizeInBytes(MI);
19737 if (LoopSize > 192)
19738 return PrefAlign;
19739 }
19740 }
19741
19742 if (LoopSize <= 64)
19743 return PrefAlign;
19744
19745 if (LoopSize <= 128)
19746 return CacheLineAlign;
19747
19748 // If any of the parent loops is surrounded by prefetch instructions, do not
19749 // insert a new one for the inner loop, as that would reset the parent's settings.
19750 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
19751 if (MachineBasicBlock *Exit = P->getExitBlock()) {
19752 auto I = Exit->getFirstNonDebugInstr();
19753 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
19754 return CacheLineAlign;
19755 }
19756 }
19757
19758 MachineBasicBlock *Pre = ML->getLoopPreheader();
19759 MachineBasicBlock *Exit = ML->getExitBlock();
19760
19761 if (Pre && Exit) {
19762 auto PreTerm = Pre->getFirstTerminator();
19763 if (PreTerm == Pre->begin() ||
19764 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
19765 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19766 .addImm(1); // prefetch 2 lines behind PC
19767
19768 auto ExitHead = Exit->getFirstNonDebugInstr();
19769 if (ExitHead == Exit->end() ||
19770 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
19771 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19772 .addImm(2); // prefetch 1 line behind PC
19773 }
19774
19775 return CacheLineAlign;
19776}
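// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// The GFX10 decision above, reduced to its thresholds: loops of at most 64
// bytes need no extra alignment, loops up to 128 bytes get cache-line (64-byte)
// alignment with the default prefetch, and loops up to 192 bytes additionally
// get S_INST_PREFETCH switched to keep two lines behind the PC. A minimal
// sketch of that size-to-alignment mapping only; the real code also inserts
// the prefetch instructions and respects pre-existing alignment.
static llvm::Align examplePrefLoopAlignGFX10(unsigned LoopSizeBytes,
                                             llvm::Align PrefAlign) {
  if (LoopSizeBytes <= 64 || LoopSizeBytes > 192)
    return PrefAlign;     // too small to matter, or too large to help
  return llvm::Align(64); // <= 128: keep default prefetch; <= 192: adjust it
}
// -----------------------------------------------------------------------------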
19777
19778 unsigned SITargetLowering::getMaxPermittedBytesForAlignment(
19779 MachineBasicBlock *MBB) const {
19780 // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
19781 // instruction could be split by the 32-byte fetch window boundary.
19782 // See getPrefLoopAlignment() for context.
19783 if (needsFetchWindowAlignment(*MBB))
19784 return 4;
19785 return TargetLowering::getMaxPermittedBytesForAlignment(MBB);
19786}
19787
19788bool SITargetLowering::needsFetchWindowAlignment(
19789 const MachineBasicBlock &MBB) const {
19790 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
19791 return false;
19792 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
19793 for (const MachineInstr &MI : MBB) {
19794 if (MI.isMetaInstruction())
19795 continue;
19796 // Instructions larger than 4 bytes can be split by a 32-byte boundary.
19797 return TII->getInstSizeInBytes(MI) > 4;
19798 }
19799 return false;
19800}
19801
19802[[maybe_unused]]
19803static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
19804 assert(N->getOpcode() == ISD::CopyFromReg);
19805 do {
19806 // Follow the chain until we find an INLINEASM node.
19807 N = N->getOperand(0).getNode();
19808 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
19809 return true;
19810 } while (N->getOpcode() == ISD::CopyFromReg);
19811 return false;
19812}
19813
19814 bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
19815 FunctionLoweringInfo *FLI,
19816 UniformityInfo *UA) const {
19817 switch (N->getOpcode()) {
19818 case ISD::CopyFromReg: {
19819 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
19820 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
19821 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19822 Register Reg = R->getReg();
19823
19824 // FIXME: Why does this need to consider isLiveIn?
19825 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
19826 return !TRI->isSGPRReg(MRI, Reg);
19827
19828 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
19829 return UA->isDivergentAtDef(V);
19830
19832 return !TRI->isSGPRReg(MRI, Reg);
19833 }
19834 case ISD::LOAD: {
19835 const LoadSDNode *L = cast<LoadSDNode>(N);
19836 unsigned AS = L->getAddressSpace();
19837 // A flat load may access private memory.
19838 return AS == AMDGPUAS::PRIVATE_ADDRESS || AS == AMDGPUAS::FLAT_ADDRESS;
19839 }
19840 case ISD::CALLSEQ_END:
19841 return true;
19842 case ISD::INTRINSIC_WO_CHAIN:
19843 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
19844 case ISD::INTRINSIC_W_CHAIN:
19845 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
19846 case AMDGPUISD::ATOMIC_CMP_SWAP:
19847 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
19848 case AMDGPUISD::BUFFER_ATOMIC_ADD:
19849 case AMDGPUISD::BUFFER_ATOMIC_SUB:
19850 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
19851 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
19852 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
19853 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
19854 case AMDGPUISD::BUFFER_ATOMIC_AND:
19855 case AMDGPUISD::BUFFER_ATOMIC_OR:
19856 case AMDGPUISD::BUFFER_ATOMIC_XOR:
19857 case AMDGPUISD::BUFFER_ATOMIC_INC:
19858 case AMDGPUISD::BUFFER_ATOMIC_DEC:
19859 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
19860 case AMDGPUISD::BUFFER_ATOMIC_FADD:
19861 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
19862 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
19863 // Target-specific read-modify-write atomics are sources of divergence.
19864 return true;
19865 default:
19866 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
19867 // Generic read-modify-write atomics are sources of divergence.
19868 return A->readMem() && A->writeMem();
19869 }
19870 return false;
19871 }
19872}
19873
19874 bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
19875 EVT VT) const {
19876 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
19877 case MVT::f32:
19878 return !denormalModeIsFlushAllF32(DAG.getMachineFunction());
19879 case MVT::f64:
19880 case MVT::f16:
19881 return !denormalModeIsFlushAllF64F16(DAG.getMachineFunction());
19882 default:
19883 return false;
19884 }
19885}
19886
19887 bool SITargetLowering::denormalsEnabledForType(
19888 LLT Ty, const MachineFunction &MF) const {
19889 switch (Ty.getScalarSizeInBits()) {
19890 case 32:
19891 return !denormalModeIsFlushAllF32(MF);
19892 case 64:
19893 case 16:
19894 return !denormalModeIsFlushAllF64F16(MF);
19895 default:
19896 return false;
19897 }
19898}
19899
19901 const APInt &DemandedElts,
19902 const SelectionDAG &DAG,
19903 bool SNaN,
19904 unsigned Depth) const {
19905 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
19906 const MachineFunction &MF = DAG.getMachineFunction();
19907 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
19908
19909 if (Info->getMode().DX10Clamp)
19910 return true; // Clamped to 0.
19911 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
19912 }
19913
19914 return AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(Op, DemandedElts,
19915 DAG, SNaN, Depth);
19916}
19917
19918 // On older subtargets, global FP atomic instructions have a hardcoded FP mode
19919 // and do not support FP32 denormals; only v2f16/f64 denormals are supported.
19921 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
19922 return true;
19923
19924 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
19925 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
19926 if (DenormMode == DenormalMode::getPreserveSign())
19927 return true;
19928
19929 // TODO: Remove this.
19930 return RMW->getFunction()
19931 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
19932 .getValueAsBool();
19933}
19934
19935 static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW) {
19936 LLVMContext &Ctx = RMW->getContext();
19937 StringRef MemScope =
19938 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
19939
19940 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
19941 << "Hardware instruction generated for atomic "
19942 << RMW->getOperationName(RMW->getOperation())
19943 << " operation at memory scope " << MemScope;
19944}
19945
19946static bool isV2F16OrV2BF16(Type *Ty) {
19947 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
19948 Type *EltTy = VT->getElementType();
19949 return VT->getNumElements() == 2 &&
19950 (EltTy->isHalfTy() || EltTy->isBFloatTy());
19951 }
19952
19953 return false;
19954}
19955
19956 static bool isV2F16(Type *Ty) {
19957 auto *VT = dyn_cast<FixedVectorType>(Ty);
19958 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
19959}
19960
19961 static bool isV2BF16(Type *Ty) {
19962 auto *VT = dyn_cast<FixedVectorType>(Ty);
19963 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
19964}
19965
19966/// \return true if atomicrmw integer ops work for the type.
19967static bool isAtomicRMWLegalIntTy(Type *Ty) {
19968 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
19969 unsigned BW = IT->getBitWidth();
19970 return BW == 32 || BW == 64;
19971 }
19972
19973 return false;
19974}
19975
19976/// \return true if this atomicrmw xchg type can be selected.
19977static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
19978 Type *Ty = RMW->getType();
19979 if (isAtomicRMWLegalIntTy(Ty))
19980 return true;
19981
19982 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
19983 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
19984 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
19985 return BW == 32 || BW == 64;
19986 }
19987
19988 if (Ty->isFloatTy() || Ty->isDoubleTy())
19989 return true;
19990
19991 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
19992 return VT->getNumElements() == 2 &&
19993 VT->getElementType()->getPrimitiveSizeInBits() == 16;
19994 }
19995
19996 return false;
19997}
19998
19999/// \returns true if it's valid to emit a native instruction for \p RMW, based
20000/// on the properties of the target memory.
20001static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
20002 const AtomicRMWInst *RMW,
20003 bool HasSystemScope) {
20004 // The remote/fine-grained access logic is different from the integer
20005 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
20006 // fine-grained access does not work, even for a device local allocation.
20007 //
20008 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
20009 // allocations work.
20010 if (HasSystemScope) {
20011 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
20012 RMW->hasMetadata("amdgpu.no.remote.memory"))
20013 return true;
20014 if (Subtarget.hasEmulatedSystemScopeAtomics())
20015 return true;
20016 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
20017 return true;
20018
20019 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
20020}
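// --- Illustrative example (not part of SIISelLowering.cpp) ------------------
// The checks above key off metadata that a front end or optimizer attaches to
// the atomic instruction. A minimal sketch of marking an atomicrmw as neither
// fine-grained nor remote so it qualifies for a native instruction; the
// function name is made up for illustration.
static void exampleMarkCoarseGrainedLocal(llvm::AtomicRMWInst *RMW) {
  llvm::MDNode *Empty = llvm::MDNode::get(RMW->getContext(), {});
  RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty);
  RMW->setMetadata("amdgpu.no.remote.memory", Empty);
}
// -----------------------------------------------------------------------------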
20021
20022 /// \return Action to perform on AtomicRMWInsts for integer operations.
20023 static TargetLowering::AtomicExpansionKind
20024 atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
20025 return isAtomicRMWLegalIntTy(RMW->getType())
20026 ? TargetLowering::AtomicExpansionKind::None
20027 : TargetLowering::AtomicExpansionKind::CmpXChg;
20028 }
20029
20030 /// Return if a flat address space atomicrmw can access private memory.
20031 static bool flatInstrMayAccessPrivate(const Instruction *I) {
20032 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
20033 return !MD ||
20035}
20036
20037 static TargetLowering::AtomicExpansionKind
20038 getPrivateAtomicExpansionKind(const GCNSubtarget &STI) {
20039 // For GAS, lower to flat atomic.
20040 return STI.hasGloballyAddressableScratch()
20041 ? TargetLowering::AtomicExpansionKind::CustomExpand
20042 : TargetLowering::AtomicExpansionKind::NotAtomic;
20043}
20044
20045 TargetLowering::AtomicExpansionKind
20046 SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
20047 unsigned AS = RMW->getPointerAddressSpace();
20048 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
20049 return getPrivateAtomicExpansionKind(*Subtarget);
20050
20051 // 64-bit flat atomics that dynamically reside in private memory will silently
20052 // be dropped.
20053 //
20054 // Note that we will emit a new copy of the original atomic in the expansion,
20055 // which will be incrementally relegalized.
20056 const DataLayout &DL = RMW->getFunction()->getDataLayout();
20057 if (AS == AMDGPUAS::FLAT_ADDRESS &&
20058 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
20059 flatInstrMayAccessPrivate(RMW))
20060 return AtomicExpansionKind::CustomExpand;
20061
20062 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
20063 OptimizationRemarkEmitter ORE(RMW->getFunction());
20064 ORE.emit([=]() {
20065 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
20066 });
20067 return Kind;
20068 };
20069
20070 auto SSID = RMW->getSyncScopeID();
20071 bool HasSystemScope =
20072 SSID == SyncScope::System ||
20073 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
20074
20075 auto Op = RMW->getOperation();
20076 switch (Op) {
20077 case AtomicRMWInst::Xchg:
20078 // PCIe supports add and xchg for system atomics.
20079 return isAtomicRMWLegalXChgTy(RMW)
20080 ? TargetLowering::AtomicExpansionKind::None
20081 : TargetLowering::AtomicExpansionKind::CmpXChg;
20082 case AtomicRMWInst::Add:
20083 // PCIe supports add and xchg for system atomics.
20084 return atomicSupportedIfLegalIntType(RMW);
20085 case AtomicRMWInst::Sub:
20086 case AtomicRMWInst::And:
20087 case AtomicRMWInst::Or:
20088 case AtomicRMWInst::Xor:
20089 case AtomicRMWInst::Max:
20090 case AtomicRMWInst::Min:
20091 case AtomicRMWInst::UMax:
20092 case AtomicRMWInst::UMin:
20093 case AtomicRMWInst::UIncWrap:
20094 case AtomicRMWInst::UDecWrap:
20095 case AtomicRMWInst::USubCond:
20096 case AtomicRMWInst::USubSat: {
20097 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
20098 return AtomicExpansionKind::CmpXChg;
20099 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
20100 return AtomicExpansionKind::CmpXChg;
20102 auto *IT = dyn_cast<IntegerType>(RMW->getType());
20103 if (!IT || IT->getBitWidth() != 32)
20104 return AtomicExpansionKind::CmpXChg;
20105 }
20106
20109 if (Subtarget->hasEmulatedSystemScopeAtomics())
20111
20112 // On most subtargets, for atomicrmw operations other than add/xchg,
20113 // whether or not the instructions will behave correctly depends on where
20114 // the address physically resides and what interconnect is used in the
20115 // system configuration. On some targets the instruction will nop,
20116 // and in others synchronization will only occur at degraded device scope.
20117 //
20118 // If the allocation is known local to the device, the instructions should
20119 // work correctly.
20120 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
20122
20123 // If fine-grained remote memory works at device scope, we don't need to
20124 // do anything.
20125 if (!HasSystemScope &&
20126 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
20127 return atomicSupportedIfLegalIntType(RMW);
20128
20129 // If we are targeting a remote allocated address, it depends what kind of
20130 // allocation the address belongs to.
20131 //
20132 // If the allocation is fine-grained (in host memory, or in PCIe peer
20133 // device memory), the operation will fail depending on the target.
20134 //
20135 // Note fine-grained host memory access does work on APUs or if XGMI is
20136 // used, but we do not know if we are targeting an APU or the system
20137 // configuration from the ISA version/target-cpu.
20138 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
20140
20143 // Atomic sub/or/xor do not work over PCI express, but atomic add
20144 // does. InstCombine transforms these with 0 to or, so undo that.
20145 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
20146 ConstVal && ConstVal->isNullValue())
20147 return AtomicExpansionKind::Expand;
20148 }
20149
20150 // If the allocation could be in remote, fine-grained memory, the rmw
20151 // instructions may fail. cmpxchg should work, so emit that. On some
20152 // system configurations, PCIe atomics aren't supported so cmpxchg won't
20153 // even work, so you're out of luck anyway.
20154
20155 // In summary:
20156 //
20157 // Cases that may fail:
20158 // - fine-grained pinned host memory
20159 // - fine-grained migratable host memory
20160 // - fine-grained PCIe peer device
20161 //
20162 // Cases that should work, but may be treated overly conservatively.
20163 // - fine-grained host memory on an APU
20164 // - fine-grained XGMI peer device
20165 return AtomicExpansionKind::CmpXChg;
20166 }
20167
20168 return atomicSupportedIfLegalIntType(RMW);
20169 }
20170 case AtomicRMWInst::FAdd: {
20171 Type *Ty = RMW->getType();
20172
20173 // TODO: Handle REGION_ADDRESS
20174 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20175 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
20176 // is fixed to round-to-nearest-even.
20177 //
20178 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
20179 // round-to-nearest-even.
20180 //
20181 // We ignore the rounding mode problem, even in strictfp. The C++ standard
20182 // suggests it is OK if the floating-point mode may not match the calling
20183 // thread.
20184 if (Ty->isFloatTy()) {
20185 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
20186 : AtomicExpansionKind::CmpXChg;
20187 }
20188
20189 if (Ty->isDoubleTy()) {
20190 // Ignores denormal mode, but we don't consider flushing mandatory.
20191 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
20192 : AtomicExpansionKind::CmpXChg;
20193 }
20194
20195 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20196 return AtomicExpansionKind::None;
20197
20198 return AtomicExpansionKind::CmpXChg;
20199 }
20200
20201 // LDS atomics respect the denormal mode from the mode register.
20202 //
20203 // Traditionally f32 global/buffer memory atomics would unconditionally
20204 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
20205 // flush.
20206 //
20207 // On targets with flat atomic fadd, denormals would flush depending on
20208 // whether the target address resides in LDS or global memory. We consider
20209 // this flat-maybe-flush as will-flush.
20210 if (Ty->isFloatTy() &&
20211 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
20214
20215 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
20216 // safe. The message phrasing also should be better.
20217 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
20218 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20219 // gfx942, gfx12
20220 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20221 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20222 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
20223 // gfx90a, gfx942, gfx12
20224 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20225 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20226
20227 // gfx942, gfx12
20228 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
20229 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20230 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
20231 // gfx90a, gfx942, gfx12
20232 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20233 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20234
20235 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
20236 // buffer. gfx12 does have the buffer version.
20237 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
20238 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20239 }
20240
20241 // global and flat atomic fadd f64: gfx90a, gfx942.
20242 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
20243 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20244
20245 if (AS != AMDGPUAS::FLAT_ADDRESS) {
20246 if (Ty->isFloatTy()) {
20247 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
20248 // gfx11+.
20249 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20250 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20251 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
20252 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20253 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20254 } else {
20255 // gfx908
20256 if (RMW->use_empty() &&
20257 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
20258 isV2F16(Ty))
20259 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20260 }
20261 }
20262
20263 // flat atomic fadd f32: gfx942, gfx11+.
20264 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
20265 if (Subtarget->hasFlatAtomicFaddF32Inst())
20266 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20267
20268 // If the address is in the flat address space and the type is float, we
20269 // try to expand it if the target supports both global and LDS atomic
20270 // fadd. The reason is that the expansion emits an address-space check:
20271 // if the address is in the global address space, we emit a global atomic
20272 // fadd; if it is in the shared address space, we emit an LDS atomic
20273 // fadd.
20274 if (Subtarget->hasLDSFPAtomicAddF32()) {
20275 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20276 return AtomicExpansionKind::Expand;
20277 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20278 return AtomicExpansionKind::Expand;
20279 }
20280 }
20281 }
20282
20283 return AtomicExpansionKind::CmpXChg;
20284 }
20285 case AtomicRMWInst::FMin:
20286 case AtomicRMWInst::FMax: {
20287 Type *Ty = RMW->getType();
20288
20289 // LDS float and double fmin/fmax were always supported.
20290 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20291 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
20293 }
20294
20295 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
20296 // For flat and global cases:
20297 // float, double in gfx7. Manual claims denormal support.
20298 // Removed in gfx8.
20299 // float, double restored in gfx10.
20300 // double removed again in gfx11, so only f32 for gfx11/gfx12.
20301 //
20302 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
20303 // no f32.
20304 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20305 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
20306 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20307 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
20308 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20309 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
20311 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
20312 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20313 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
20314 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20315 }
20316 }
20317
20318 return AtomicExpansionKind::CmpXChg;
20319 }
20320 case AtomicRMWInst::Nand:
20321 case AtomicRMWInst::FSub:
20322 default:
20323 return AtomicExpansionKind::CmpXChg;
20324 }
20325
20326 llvm_unreachable("covered atomicrmw op switch");
20327}
20328
20335
20342
20343 TargetLowering::AtomicExpansionKind
20344 SITargetLowering::shouldExpandAtomicCmpXchgInIR(
20345 const AtomicCmpXchgInst *CmpX) const {
20346 unsigned AddrSpace = CmpX->getPointerAddressSpace();
20347 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
20348 return getPrivateAtomicExpansionKind(*Subtarget);
20349
20350 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
20351 return AtomicExpansionKind::None;
20352
20353 const DataLayout &DL = CmpX->getDataLayout();
20354
20355 Type *ValTy = CmpX->getNewValOperand()->getType();
20356
20357 // If a 64-bit flat atomic may alias private, we need to avoid using the
20358 // atomic in the private case.
20359 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
20361}
20362
20363const TargetRegisterClass *
20364 SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
20365 const TargetRegisterClass *RC = TargetLoweringBase::getRegClassFor(VT, false);
20366 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
20367 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
20368 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
20369 : &AMDGPU::SReg_32RegClass;
20370 if (!TRI->isSGPRClass(RC) && !isDivergent)
20371 return TRI->getEquivalentSGPRClass(RC);
20372 if (TRI->isSGPRClass(RC) && isDivergent) {
20373 if (Subtarget->hasGFX90AInsts())
20374 return TRI->getEquivalentAVClass(RC);
20375 return TRI->getEquivalentVGPRClass(RC);
20376 }
20377
20378 return RC;
20379}
20380
20381// FIXME: This is a workaround for DivergenceAnalysis not understanding always
20382// uniform values (as produced by the mask results of control flow intrinsics)
20383// used outside of divergent blocks. The phi users need to also be treated as
20384// always uniform.
20385//
20386// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
20387static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
20388 unsigned WaveSize) {
20389 // FIXME: We assume we never cast the mask results of a control flow
20390 // intrinsic.
20391 // Early exit if the type won't be consistent as a compile time hack.
20392 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
20393 if (!IT || IT->getBitWidth() != WaveSize)
20394 return false;
20395
20396 if (!isa<Instruction>(V))
20397 return false;
20398 if (!Visited.insert(V).second)
20399 return false;
20400 bool Result = false;
20401 for (const auto *U : V->users()) {
20402 if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(U)) {
20403 if (V == U->getOperand(1)) {
20404 switch (Intrinsic->getIntrinsicID()) {
20405 default:
20406 Result = false;
20407 break;
20408 case Intrinsic::amdgcn_if_break:
20409 case Intrinsic::amdgcn_if:
20410 case Intrinsic::amdgcn_else:
20411 Result = true;
20412 break;
20413 }
20414 }
20415 if (V == U->getOperand(0)) {
20416 switch (Intrinsic->getIntrinsicID()) {
20417 default:
20418 Result = false;
20419 break;
20420 case Intrinsic::amdgcn_end_cf:
20421 case Intrinsic::amdgcn_loop:
20422 Result = true;
20423 break;
20424 }
20425 }
20426 } else {
20427 Result = hasCFUser(U, Visited, WaveSize);
20428 }
20429 if (Result)
20430 break;
20431 }
20432 return Result;
20433}
20434
20435 bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
20436 const Value *V) const {
20437 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
20438 if (CI->isInlineAsm()) {
20439 // FIXME: This cannot give a correct answer. This should only trigger in
20440 // the case where inline asm returns mixed SGPR and VGPR results, used
20441 // outside the defining block. We don't have a specific result to
20442 // consider, so this assumes if any value is SGPR, the overall register
20443 // also needs to be SGPR.
20444 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
20445 TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
20446 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
20447 for (auto &TC : TargetConstraints) {
20448 if (TC.Type == InlineAsm::isOutput) {
20449 ComputeConstraintToUse(TC, SDValue());
20450 const TargetRegisterClass *RC =
20451 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
20452 TC.ConstraintVT)
20453 .second;
20454 if (RC && SIRI->isSGPRClass(RC))
20455 return true;
20456 }
20457 }
20458 }
20459 }
20460 SmallPtrSet<const Value *, 16> Visited;
20461 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
20462}
20463
20464 bool SITargetLowering::hasMemSDNodeUser(SDNode *N) const {
20465 for (SDUse &Use : N->uses()) {
20466 if (MemSDNode *M = dyn_cast<MemSDNode>(Use.getUser())) {
20467 if (getBasePtrIndex(M) == Use.getOperandNo())
20468 return true;
20469 }
20470 }
20471 return false;
20472}
20473
20474 bool SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0,
20475 SDValue N1) const {
20476 if (!N0.hasOneUse())
20477 return false;
20478 // Take care of the opportunity to keep N0 uniform
20479 if (N0->isDivergent() || !N1->isDivergent())
20480 return true;
20481 // Check if we have a good chance to form the memory access pattern with the
20482 // base and offset
20483 return (DAG.isBaseWithConstantOffset(N0) &&
20484 hasMemSDNodeUser(*N0->user_begin()));
20485}
20486
20487 bool SITargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
20488 Register N0, Register N1) const {
20489 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
20490}
20491
20492 MachineMemOperand::Flags
20493 SITargetLowering::getTargetMMOFlags(const Instruction &I) const {
20494 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
20495 MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
20496 if (I.getMetadata("amdgpu.noclobber"))
20497 Flags |= MONoClobber;
20498 if (I.getMetadata("amdgpu.last.use"))
20499 Flags |= MOLastUse;
20500 return Flags;
20501}
20502
20503 void SITargetLowering::emitExpandAtomicAddrSpacePredicate(
20504 Instruction *AI) const {
20505 // Given: atomicrmw fadd ptr %addr, float %val ordering
20506 //
20507 // With this expansion we produce the following code:
20508 // [...]
20509 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
20510 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
20511 //
20512 // atomicrmw.shared:
20513 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
20514 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
20515 // float %val ordering
20516 // br label %atomicrmw.phi
20517 //
20518 // atomicrmw.check.private:
20519 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %int8ptr)
20520 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
20521 //
20522 // atomicrmw.private:
20523 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
20524 // %loaded.private = load float, ptr addrspace(5) %cast.private
20525 // %val.new = fadd float %loaded.private, %val
20526 // store float %val.new, ptr addrspace(5) %cast.private
20527 // br label %atomicrmw.phi
20528 //
20529 // atomicrmw.global:
20530 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
20531 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
20532 // float %val ordering
20533 // br label %atomicrmw.phi
20534 //
20535 // atomicrmw.phi:
20536 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
20537 // [ %loaded.private, %atomicrmw.private ],
20538 // [ %loaded.global, %atomicrmw.global ]
20539 // br label %atomicrmw.end
20540 //
20541 // atomicrmw.end:
20542 // [...]
20543 //
20544 //
20545 // For 64-bit atomics which may reside in private memory, we perform a simpler
20546 // version that only inserts the private check, and uses the flat operation.
20547
20548 IRBuilder<> Builder(AI);
20549 LLVMContext &Ctx = Builder.getContext();
20550
20551 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
20552 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
20554 Value *Addr = AI->getOperand(PtrOpIdx);
20555
20556 /// TODO: Only need to check private, then emit flat-known-not private (no
20557 /// need for shared block, or cast to global).
20558 auto *CX = dyn_cast<AtomicCmpXchgInst>(AI);
20559
20560 Align Alignment;
20561 if (RMW)
20562 Alignment = RMW->getAlign();
20563 else if (CX)
20564 Alignment = CX->getAlign();
20565 else
20566 llvm_unreachable("unhandled atomic operation");
20567
20568 // FullFlatEmulation is true if we need to issue the private, shared, and
20569 // global cases.
20570 //
20571 // If this is false, we are only dealing with the flat-targeting-private case,
20572 // where we only insert a check for private and still use the flat instruction
20573 // for global and shared.
20574
20575 bool FullFlatEmulation =
20576 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
20577 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
20578 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
20579 RMW->getType()->isDoubleTy()));
20580
20581 // If the return value isn't used, do not introduce a false use in the phi.
20582 bool ReturnValueIsUsed = !AI->use_empty();
20583
20584 BasicBlock *BB = Builder.GetInsertBlock();
20585 Function *F = BB->getParent();
20586 BasicBlock *ExitBB =
20587 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
20588 BasicBlock *SharedBB = nullptr;
20589
20590 BasicBlock *CheckPrivateBB = BB;
20591 if (FullFlatEmulation) {
20592 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
20593 CheckPrivateBB =
20594 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
20595 }
20596
20597 BasicBlock *PrivateBB =
20598 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
20599 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
20600 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
20601
20602 std::prev(BB->end())->eraseFromParent();
20603 Builder.SetInsertPoint(BB);
20604
20605 Value *LoadedShared = nullptr;
20606 if (FullFlatEmulation) {
20607 CallInst *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
20608 {Addr}, nullptr, "is.shared");
20609 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
20610 Builder.SetInsertPoint(SharedBB);
20611 Value *CastToLocal = Builder.CreateAddrSpaceCast(
20612 Addr, PointerType::get(Ctx, AMDGPUAS::LOCAL_ADDRESS));
20613
20614 Instruction *Clone = AI->clone();
20615 Clone->insertInto(SharedBB, SharedBB->end());
20616 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
20617 LoadedShared = Clone;
20618
20619 Builder.CreateBr(PhiBB);
20620 Builder.SetInsertPoint(CheckPrivateBB);
20621 }
20622
20623 CallInst *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
20624 {Addr}, nullptr, "is.private");
20625 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
20626
20627 Builder.SetInsertPoint(PrivateBB);
20628
20629 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
20630 Addr, PointerType::get(Ctx, AMDGPUAS::PRIVATE_ADDRESS));
20631
20632 Value *LoadedPrivate;
20633 if (RMW) {
20634 LoadedPrivate = Builder.CreateAlignedLoad(
20635 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
20636
20637 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
20638 LoadedPrivate, RMW->getValOperand());
20639
20640 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
20641 } else {
20642 auto [ResultLoad, Equal] =
20643 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
20644 CX->getNewValOperand(), CX->getAlign());
20645
20646 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
20647 ResultLoad, 0);
20648 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
20649 }
20650
20651 Builder.CreateBr(PhiBB);
20652
20653 Builder.SetInsertPoint(GlobalBB);
20654
20655 // Continue using a flat instruction if we only emitted the check for private.
20656 Instruction *LoadedGlobal = AI;
20657 if (FullFlatEmulation) {
20658 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
20659 Addr, PointerType::get(Ctx, AMDGPUAS::GLOBAL_ADDRESS));
20660 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
20661 }
20662
20663 AI->removeFromParent();
20664 AI->insertInto(GlobalBB, GlobalBB->end());
20665
20666 // The new atomicrmw may go through another round of legalization later.
20667 if (!FullFlatEmulation) {
20668 // We inserted the runtime check already, make sure we do not try to
20669 // re-expand this.
20670 // TODO: Should union with any existing metadata.
20671 MDBuilder MDB(F->getContext());
20672 MDNode *RangeNotPrivate =
20675 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
20676 RangeNotPrivate);
20677 }
20678
20679 Builder.CreateBr(PhiBB);
20680
20681 Builder.SetInsertPoint(PhiBB);
20682
20683 if (ReturnValueIsUsed) {
20684 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
20685 AI->replaceAllUsesWith(Loaded);
20686 if (FullFlatEmulation)
20687 Loaded->addIncoming(LoadedShared, SharedBB);
20688 Loaded->addIncoming(LoadedPrivate, PrivateBB);
20689 Loaded->addIncoming(LoadedGlobal, GlobalBB);
20690 Loaded->takeName(AI);
20691 }
20692
20693 Builder.CreateBr(ExitBB);
20694}
20695
20696 static void convertScratchAtomicToFlatAtomic(Instruction *I,
20697 unsigned PtrOpIdx) {
20698 Value *PtrOp = I->getOperand(PtrOpIdx);
20701
20702 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
20703 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
20704 I->getIterator());
20705 I->setOperand(PtrOpIdx, ASCast);
20706}
20707
20710
20713
20716 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
20717 ConstVal && ConstVal->isNullValue()) {
20718 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
20719 AI->setOperation(AtomicRMWInst::Add);
20720
20721 // We may still need the private-alias-flat handling below.
20722
20723 // TODO: Skip this for cases where we cannot access remote memory.
20724 }
20725 }
20726
20727 // The non-flat expansions should only perform the de-canonicalization of
20728 // identity values.
20730 return;
20731
20732 emitExpandAtomicAddrSpacePredicate(AI);
20733}
20734
20741
20742 void SITargetLowering::emitExpandAtomicLoad(LoadInst *LI) const {
20743 if (LI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20744 return convertScratchAtomicToFlatAtomic(LI, LI->getPointerOperandIndex());
20745
20746 llvm_unreachable(
20747 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
20748}
20749
20750 void SITargetLowering::emitExpandAtomicStore(StoreInst *SI) const {
20751 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20752 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
20753
20755 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
20756}
20757
20758 LoadInst *
20759 SITargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
20760 IRBuilder<> Builder(AI);
20761 auto Order = AI->getOrdering();
20762
20763 // The optimization removes the store aspect of the atomicrmw. Therefore, the
20764 // cache must be flushed if the atomic ordering had release semantics. This
20765 // does not necessarily require a fence; a release fence just happens to
20766 // perform that flush. Avoid replacing an atomicrmw that has release semantics.
20767 if (isReleaseOrStronger(Order))
20768 return nullptr;
20769
20770 LoadInst *LI = Builder.CreateAlignedLoad(
20771 AI->getType(), AI->getPointerOperand(), AI->getAlign());
20772 LI->setAtomic(Order, AI->getSyncScopeID());
20773 LI->copyMetadata(*AI);
20774 LI->takeName(AI);
20775 AI->replaceAllUsesWith(LI);
20776 AI->eraseFromParent();
20777 return LI;
20778}
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static ISD::CondCode tryReduceF64CompareToHiHalf(const ISD::CondCode CC, const SDValue LHS, const SDValue RHS, const SelectionDAG &DAG)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
static uint64_t getIdentityValueForWaveReduction(unsigned Opc)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:483
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1179
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5899
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1594
bool isNegative() const
Definition APFloat.h:1538
bool isNormal() const
Definition APFloat.h:1542
APInt bitcastToAPInt() const
Definition APFloat.h:1430
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1197
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1157
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1138
bool isInfinity() const
Definition APFloat.h:1535
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1408
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:342
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
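A short illustration of the APInt mask helpers listed above (the widths and bit positions are arbitrary examples):
  #include "llvm/ADT/APInt.h"
  using namespace llvm;
  static void apIntSketch() {
    APInt Mid = APInt::getBitsSet(32, 8, 16);     // bits [8, 16) set: 0x0000FF00
    APInt Hi = APInt::getHighBitsSet(32, 4);      // top 4 bits set: 0xF0000000
    unsigned TZ = Mid.countr_zero();              // 8 trailing zero bits
    bool SignSet = Hi.isSignBitSet();             // true, bit 31 is set
    bool GE = Hi.uge(Mid);                        // unsigned compare: true
    (void)TZ; (void)SignSet; (void)GE;
  }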
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:338
const Function * getParent() const
Definition Argument.h:44
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being accessed by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
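A hedged sketch of how an AtomicRMWInst is typically queried when choosing an expansion; address space 0 is the flat address space on AMDGPU, but the predicate itself is illustrative rather than the exact logic of this file:
  #include "llvm/IR/Instructions.h"
  using namespace llvm;
  // Return true for an FP add/sub/min/max atomicrmw on a flat (addrspace 0) pointer.
  static bool isFlatFPAtomicSketch(const AtomicRMWInst *RMW) {
    switch (RMW->getOperation()) {
    case AtomicRMWInst::FAdd:
    case AtomicRMWInst::FSub:
    case AtomicRMWInst::FMin:
    case AtomicRMWInst::FMax:
      return RMW->getPointerAddressSpace() == 0;
    default:
      return false;
    }
  }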
bool isCompareAndSwap() const
Returns true if this SDNode represents a cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
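The CCState members above follow the usual argument-analysis pattern; a minimal sketch assuming a calling-convention function CC_Hypothetical (a stand-in for whatever CCAssignFnForCall returns):
  #include "llvm/ADT/SmallVector.h"
  #include "llvm/CodeGen/CallingConvLower.h"
  #include "llvm/CodeGen/MachineFunction.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;
  static void analyzeArgsSketch(MachineFunction &MF, CallingConv::ID CC, bool IsVarArg,
                                const SmallVectorImpl<ISD::InputArg> &Ins,
                                CCAssignFn *CC_Hypothetical) {
    SmallVector<CCValAssign, 16> ArgLocs;
    CCState CCInfo(CC, IsVarArg, MF, ArgLocs, MF.getFunction().getContext());
    CCInfo.AnalyzeFormalArguments(Ins, CC_Hypothetical);
    for (const CCValAssign &VA : ArgLocs) {
      if (VA.isRegLoc())
        (void)VA.getLocReg();       // argument assigned to a register
      else
        (void)VA.getLocMemOffset(); // argument assigned to a stack slot
    }
  }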
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
@ ICMP_NE
not equal
Definition InstrTypes.h:698
bool isSigned() const
Definition InstrTypes.h:930
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:770
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:776
const APFloat & getValueAPF() const
bool isExactlyValue(double V) const
We don't rely on operator== working on double values, as it returns true for things that are clearly ...
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common code.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:218
A debug info location.
Definition DebugLoc.h:123
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLowering::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:892
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:763
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:804
Argument * getArg(unsigned i) const
Definition Function.h:886
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergentAtDef(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:141
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:569
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2858
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
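A short example of the low-level type (LLT) helpers above; the bit widths and address space are arbitrary, and the header path follows recent LLVM trees:
  #include "llvm/CodeGenTypes/LowLevelType.h"
  using namespace llvm;
  static void lltSketch() {
    LLT S32 = LLT::scalar(32);                 // 32-bit scalar
    LLT P1 = LLT::pointer(1, 64);              // 64-bit pointer in address space 1
    unsigned Bits = S32.getScalarSizeInBits(); // 32
    LLT S16 = S32.changeElementSize(16);       // scalar case: new 16-bit element type
    (void)Bits; (void)P1; (void)S16;
  }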
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
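A hedged sketch combining getOrInsertSyncScopeID with LoadInst::setAtomic; the "agent" scope name is an AMDGPU example and the ordering is arbitrary:
  #include "llvm/IR/Instructions.h"
  #include "llvm/IR/LLVMContext.h"
  using namespace llvm;
  // Mark an existing load as a monotonic atomic load in the "agent" sync scope.
  static void makeAgentAtomicSketch(LoadInst *LI) {
    LLVMContext &Ctx = LI->getContext();
    SyncScope::ID AgentSSID = Ctx.getOrInsertSyncScopeID("agent");
    LI->setAtomic(AtomicOrdering::Monotonic, AgentSSID);
  }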
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
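A minimal sketch of attaching !range metadata with MDBuilder::createRange, as is commonly done for workitem-id style values; the [0, 1024) bounds are illustrative:
  #include "llvm/ADT/APInt.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/MDBuilder.h"
  using namespace llvm;
  static void attachRangeSketch(Instruction *I) {
    MDBuilder MDB(I->getContext());
    MDNode *Range = MDB.createRange(APInt(32, 0), APInt(32, 1024)); // value in [0, 1024)
    I->setMetadata(LLVMContext::MD_range, Range);
  }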
Metadata node.
Definition Metadata.h:1080
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1444
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
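A short example of the MVT helpers listed above (the concrete types are arbitrary; header path as in recent LLVM trees):
  #include "llvm/CodeGenTypes/MachineValueType.h"
  using namespace llvm;
  static void mvtSketch() {
    MVT V4I32 = MVT::getVectorVT(MVT::i32, 4);        // v4i32
    MVT I16 = MVT::getIntegerVT(16);                  // i16
    unsigned NumElts = V4I32.getVectorNumElements();  // 4
    MVT Elt = V4I32.getScalarType();                  // i32
    (void)I16; (void)NumElts; (void)Elt;
  }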
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
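These MachineInstrBuilder helpers are normally chained off BuildMI; a hedged sketch in which MoveOpc is a placeholder for a real move-immediate opcode (for example S_MOV_B32 on AMDGPU):
  #include "llvm/CodeGen/MachineInstrBuilder.h"
  #include "llvm/CodeGen/TargetInstrInfo.h"
  using namespace llvm;
  // Emit "DstReg = <MoveOpc> 0" before MI, reusing MI's debug location.
  static void emitMovImmSketch(MachineBasicBlock &MBB, MachineInstr &MI,
                               const TargetInstrInfo *TII, unsigned MoveOpc,
                               Register DstReg) {
    BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(MoveOpc), DstReg)
        .addImm(0);
  }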
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value,.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
unsigned getNumVirtRegs() const
getNumVirtRegs - Return the number of virtual registers created.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
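A small sketch of the virtual-register workflow these MachineRegisterInfo members support; the register class argument is whatever class fits the value being rewritten:
  #include "llvm/CodeGen/MachineRegisterInfo.h"
  using namespace llvm;
  // Create a fresh virtual register of class RC and redirect all uses of OldReg to it.
  static Register replaceWithFreshVReg(MachineRegisterInfo &MRI, Register OldReg,
                                       const TargetRegisterClass *RC) {
    Register NewReg = MRI.createVirtualRegister(RC);
    MRI.replaceRegWith(OldReg, NewReg);
    return NewReg;
  }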
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:252
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:246
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:249
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:280
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if the given node can be combined with others to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass into.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0 terminated array of rounding control registers that can be attached into strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it is reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align Alignment, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
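A minimal sketch of how a lowering or combine routine might use getNOT; the helper name buildBitwiseNot and its surrounding context are made up for illustration, not taken from this file.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: build ~V during custom lowering. getNOT simply
// emits (XOR V, AllOnesConstant) in V's value type.
static SDValue buildBitwiseNot(SelectionDAG &DAG, const SDLoc &DL, SDValue V) {
  return DAG.getNOT(DL, V, V.getValueType());
}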
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
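A minimal usage sketch for getSelectCC, assuming DAG, DL and two same-typed integer operands come from the caller; the helper name buildSMax is hypothetical.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: signed max(A, B) expressed as a single SELECT_CC node,
// without materializing a separate SETCC first.
static SDValue buildSMax(SelectionDAG &DAG, const SDLoc &DL, SDValue A, SDValue B) {
  return DAG.getSelectCC(DL, A, B, /*True=*/A, /*False=*/B, ISD::SETGT);
}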
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
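A minimal sketch of splatting an immediate with getSplatBuildVector; the helper name buildSplatImm is hypothetical and assumes a fixed-width integer vector VT.
#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;
// Hypothetical helper: splat Imm across all lanes of VT (e.g. v4i32) as a
// BUILD_VECTOR whose operands are all the same constant.
static SDValue buildSplatImm(SelectionDAG &DAG, const SDLoc &DL, EVT VT,
                             uint64_t Imm) {
  SDValue Elt = DAG.getConstant(Imm, DL, VT.getVectorElementType());
  return DAG.getSplatBuildVector(VT, DL, Elt);
}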
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:176
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
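A minimal sketch of the common visited-set idiom built on SmallPtrSet::insert; the helper name markVisited is hypothetical.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
using namespace llvm;
// Hypothetical helper: insert().second is true only on first insertion, which
// makes SmallPtrSet convenient for guarding against revisiting DAG nodes.
static bool markVisited(SmallPtrSet<const SDNode *, 16> &Visited,
                        const SDNode *N) {
  return Visited.insert(N).second; // true => N had not been seen before
}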
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr bool empty() const
Check if the string is empty.
Definition StringRef.h:141
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
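A minimal StringSwitch usage sketch; the function name and the string cases are made up for illustration.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
using namespace llvm;
// Hypothetical helper: map a name to a small code, with 0 as the fallback.
static unsigned classifyName(StringRef S) {
  return StringSwitch<unsigned>(S)
      .Case("sgpr", 1)
      .Case("vgpr", 2)
      .Default(0);
}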
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows.
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether a type is legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparisons with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the command line.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:445
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:313
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:370
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:275
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:110
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:35
LLVM_ABI void set(Value *Val)
Definition Value.h:883
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:549
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:399
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
bool isGFX13(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:823
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:783
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:857
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:884
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:997
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:792
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:979
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:848
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readfixedcounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:800
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:796
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined, 0 Round to 0, 1 Round to nearest (ties to even), 2 Round to ...
Definition ISDOpcodes.h:974
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:854
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:815
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:892
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:982
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:809
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:930
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:791
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:963
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:949
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:860
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:837
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > OverloadTys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_IntrinsicWOChain(const OpndPreds &...Opnds)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a BasicBlock.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:557
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:237
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1738
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2207
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and is Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:356
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
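A minimal PowerOf2Ceil sketch; the helper name roundUpPow2 is hypothetical.
#include "llvm/Support/MathExtras.h"
#include <cstdint>
// Hypothetical helper: round a byte count up to the next power of two,
// e.g. PowerOf2Ceil(17) == 32 and PowerOf2Ceil(32) == 32.
static uint64_t roundUpPow2(uint64_t Bytes) { return llvm::PowerOf2Ceil(Bytes); }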
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
Definition MathExtras.h:357
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1745
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
bool isBoolSGPR(SDValue V)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
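A minimal alignTo sketch; the helper name padTo is hypothetical.
#include "llvm/Support/Alignment.h"
#include <cstdint>
// Hypothetical helper: pad a raw byte count out to an alignment boundary,
// e.g. alignTo(10, Align(8)) == 16 and alignTo(16, Align(8)) == 16.
static uint64_t padTo(uint64_t Size, llvm::Align A) { return llvm::alignTo(Size, A); }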
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:203
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
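A minimal divideCeil sketch; the helper name numDWords is hypothetical.
#include "llvm/Support/MathExtras.h"
// Hypothetical helper: number of 32-bit words needed to hold Bits bits,
// e.g. divideCeil(96u, 32u) == 3 and divideCeil(33u, 32u) == 2.
static unsigned numDWords(unsigned Bits) { return llvm::divideCeil(Bits, 32u); }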
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
FunctionAddr VTableAddr uintptr_t uintptr_t Data
Definition InstrProf.h:221
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ AfterLegalizeTypes
Definition DAGCombine.h:17
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
FunctionAddr VTableAddr Next
Definition InstrProf.h:141
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:433
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1771
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1946
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
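A minimal maskTrailingOnes sketch; the helper name lowMask is hypothetical.
#include "llvm/Support/MathExtras.h"
#include <cstdint>
// Hypothetical helper: build a mask with the low Width bits set,
// e.g. maskTrailingOnes<uint32_t>(5) == 0x1F and maskTrailingOnes<uint32_t>(0) == 0.
static uint32_t lowMask(unsigned Width) { return llvm::maskTrailingOnes<uint32_t>(Width); }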
constexpr RegState getUndefRegState(bool B)
@ Custom
The result value requires a custom uniformity check.
Definition Uniformity.h:31
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:876
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:403
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsLT(EVT VT) const
Return true if this has less bits than VT.
Definition ValueTypes.h:308
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:251
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:486
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:420
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:264
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing...
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:165
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:109
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:239
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
static LLVM_ABI std::optional< bool > ule(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_ULE result.
static LLVM_ABI std::optional< bool > uge(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_UGE result.
bool isAllOnes() const
Returns true if value is all one bits.
Definition KnownBits.h:81
bool isKnownNeverNaN() const
Return true if it's known this can never be a nan.
static LLVM_ABI KnownFPClass bitcast(const fltSemantics &FltSemantics, const KnownBits &Bits)
Report known values for a bitcast into a float with provided semantics.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
DenormalMode FP64FP16Denormals
If this is set, neither input or output denormals are flushed for both f64 and f16/v2f16 instructions...
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
std::optional< unsigned > fallbackAddressSpace
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs