LLVM 23.0.0git
SIISelLowering.cpp
Go to the documentation of this file.
1//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// Custom DAG lowering for SI
11//
12//===----------------------------------------------------------------------===//
13
14#include "SIISelLowering.h"
15#include "AMDGPU.h"
16#include "AMDGPUInstrInfo.h"
17#include "AMDGPULaneMaskUtils.h"
18#include "AMDGPUMemoryUtils.h"
20#include "AMDGPUTargetMachine.h"
21#include "GCNSubtarget.h"
24#include "SIRegisterInfo.h"
25#include "llvm/ADT/APInt.h"
27#include "llvm/ADT/Statistic.h"
42#include "llvm/IR/IRBuilder.h"
44#include "llvm/IR/IntrinsicsAMDGPU.h"
45#include "llvm/IR/IntrinsicsR600.h"
46#include "llvm/IR/MDBuilder.h"
49#include "llvm/Support/ModRef.h"
51#include <optional>
52
53using namespace llvm;
54using namespace llvm::SDPatternMatch;
55
56#define DEBUG_TYPE "si-lower"
57
58STATISTIC(NumTailCalls, "Number of tail calls");
59
60static cl::opt<bool>
61 DisableLoopAlignment("amdgpu-disable-loop-alignment",
62 cl::desc("Do not align and prefetch loops"),
63 cl::init(false));
64
66 "amdgpu-use-divergent-register-indexing", cl::Hidden,
67 cl::desc("Use indirect register addressing for divergent indexes"),
68 cl::init(false));
69
74
79
80static unsigned findFirstFreeSGPR(CCState &CCInfo) {
81 unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
82 for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
83 if (!CCInfo.isAllocated(AMDGPU::SGPR0 + Reg)) {
84 return AMDGPU::SGPR0 + Reg;
85 }
86 }
87 llvm_unreachable("Cannot allocate sgpr");
88}
89
91 const GCNSubtarget &STI)
92 : AMDGPUTargetLowering(TM, STI, STI), Subtarget(&STI) {
93 addRegisterClass(MVT::i1, &AMDGPU::VReg_1RegClass);
94 addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass);
95
96 addRegisterClass(MVT::i32, &AMDGPU::SReg_32RegClass);
97
98 const SIRegisterInfo *TRI = STI.getRegisterInfo();
99 const TargetRegisterClass *V32RegClass =
100 TRI->getDefaultVectorSuperClassForBitWidth(32);
101 addRegisterClass(MVT::f32, V32RegClass);
102
103 addRegisterClass(MVT::v2i32, &AMDGPU::SReg_64RegClass);
104
105 const TargetRegisterClass *V64RegClass =
106 TRI->getDefaultVectorSuperClassForBitWidth(64);
107
108 addRegisterClass(MVT::f64, V64RegClass);
109 addRegisterClass(MVT::v2f32, V64RegClass);
110 addRegisterClass(MVT::Untyped, V64RegClass);
111
112 addRegisterClass(MVT::v3i32, &AMDGPU::SGPR_96RegClass);
113 addRegisterClass(MVT::v3f32, TRI->getDefaultVectorSuperClassForBitWidth(96));
114
115 addRegisterClass(MVT::v2i64, &AMDGPU::SGPR_128RegClass);
116 addRegisterClass(MVT::v2f64, &AMDGPU::SGPR_128RegClass);
117
118 addRegisterClass(MVT::v4i32, &AMDGPU::SGPR_128RegClass);
119 addRegisterClass(MVT::v4f32, TRI->getDefaultVectorSuperClassForBitWidth(128));
120
121 addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
122 addRegisterClass(MVT::v5f32, TRI->getDefaultVectorSuperClassForBitWidth(160));
123
124 addRegisterClass(MVT::v6i32, &AMDGPU::SGPR_192RegClass);
125 addRegisterClass(MVT::v6f32, TRI->getDefaultVectorSuperClassForBitWidth(192));
126
127 addRegisterClass(MVT::v3i64, &AMDGPU::SGPR_192RegClass);
128 addRegisterClass(MVT::v3f64, TRI->getDefaultVectorSuperClassForBitWidth(192));
129
130 addRegisterClass(MVT::v7i32, &AMDGPU::SGPR_224RegClass);
131 addRegisterClass(MVT::v7f32, TRI->getDefaultVectorSuperClassForBitWidth(224));
132
133 addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
134 addRegisterClass(MVT::v8f32, TRI->getDefaultVectorSuperClassForBitWidth(256));
135
136 addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
137 addRegisterClass(MVT::v4f64, TRI->getDefaultVectorSuperClassForBitWidth(256));
138
139 addRegisterClass(MVT::v9i32, &AMDGPU::SGPR_288RegClass);
140 addRegisterClass(MVT::v9f32, TRI->getDefaultVectorSuperClassForBitWidth(288));
141
142 addRegisterClass(MVT::v10i32, &AMDGPU::SGPR_320RegClass);
143 addRegisterClass(MVT::v10f32,
144 TRI->getDefaultVectorSuperClassForBitWidth(320));
145
146 addRegisterClass(MVT::v11i32, &AMDGPU::SGPR_352RegClass);
147 addRegisterClass(MVT::v11f32,
148 TRI->getDefaultVectorSuperClassForBitWidth(352));
149
150 addRegisterClass(MVT::v12i32, &AMDGPU::SGPR_384RegClass);
151 addRegisterClass(MVT::v12f32,
152 TRI->getDefaultVectorSuperClassForBitWidth(384));
153
154 addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
155 addRegisterClass(MVT::v16f32,
156 TRI->getDefaultVectorSuperClassForBitWidth(512));
157
158 addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
159 addRegisterClass(MVT::v8f64, TRI->getDefaultVectorSuperClassForBitWidth(512));
160
161 addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
162 addRegisterClass(MVT::v16f64,
163 TRI->getDefaultVectorSuperClassForBitWidth(1024));
164
165 if (Subtarget->has16BitInsts()) {
166 if (Subtarget->useRealTrue16Insts()) {
167 addRegisterClass(MVT::i16, &AMDGPU::VGPR_16RegClass);
168 addRegisterClass(MVT::f16, &AMDGPU::VGPR_16RegClass);
169 addRegisterClass(MVT::bf16, &AMDGPU::VGPR_16RegClass);
170 } else {
171 addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
172 addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
173 addRegisterClass(MVT::bf16, &AMDGPU::SReg_32RegClass);
174 }
175
176 // Unless there are also VOP3P operations, no operations are really legal.
177 addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32RegClass);
178 addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32RegClass);
179 addRegisterClass(MVT::v2bf16, &AMDGPU::SReg_32RegClass);
180 addRegisterClass(MVT::v4i16, &AMDGPU::SReg_64RegClass);
181 addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
182 addRegisterClass(MVT::v4bf16, &AMDGPU::SReg_64RegClass);
183 addRegisterClass(MVT::v8i16, &AMDGPU::SGPR_128RegClass);
184 addRegisterClass(MVT::v8f16, &AMDGPU::SGPR_128RegClass);
185 addRegisterClass(MVT::v8bf16, &AMDGPU::SGPR_128RegClass);
186 addRegisterClass(MVT::v16i16, &AMDGPU::SGPR_256RegClass);
187 addRegisterClass(MVT::v16f16, &AMDGPU::SGPR_256RegClass);
188 addRegisterClass(MVT::v16bf16, &AMDGPU::SGPR_256RegClass);
189 addRegisterClass(MVT::v32i16, &AMDGPU::SGPR_512RegClass);
190 addRegisterClass(MVT::v32f16, &AMDGPU::SGPR_512RegClass);
191 addRegisterClass(MVT::v32bf16, &AMDGPU::SGPR_512RegClass);
192 }
193
194 addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
195 addRegisterClass(MVT::v32f32,
196 TRI->getDefaultVectorSuperClassForBitWidth(1024));
197
198 computeRegisterProperties(Subtarget->getRegisterInfo());
199
202
203 // The boolean content concept here is too inflexible. Compares only ever
204 // really produce a 1-bit result. Any copy/extend from these will turn into a
205 // select, and zext/1 or sext/-1 are equally cheap. Arbitrarily choose 0/1, as
206 // it's what most targets use.
209
210 // We need to custom lower vector stores from local memory
212 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
213 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
214 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
215 MVT::i1, MVT::v32i32},
216 Custom);
217
219 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
220 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
221 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32,
222 MVT::i1, MVT::v32i32},
223 Custom);
224
225 if (isTypeLegal(MVT::bf16)) {
226 for (unsigned Opc :
235 ISD::SETCC}) {
236 setOperationAction(Opc, MVT::bf16, Promote);
237 }
238
240
242 AddPromotedToType(ISD::SELECT, MVT::bf16, MVT::i16);
243
247
248 // We only need to custom lower because we can't specify an action for bf16
249 // sources.
252 }
253
254 setTruncStoreAction(MVT::v2i32, MVT::v2i16, Expand);
255 setTruncStoreAction(MVT::v3i32, MVT::v3i16, Expand);
256 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Expand);
257 setTruncStoreAction(MVT::v8i32, MVT::v8i16, Expand);
258 setTruncStoreAction(MVT::v16i32, MVT::v16i16, Expand);
259 setTruncStoreAction(MVT::v32i32, MVT::v32i16, Expand);
260 setTruncStoreAction(MVT::v2i32, MVT::v2i8, Expand);
261 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Expand);
262 setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
263 setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
264 setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
265 setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
266 setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
267 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
268 setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
269 setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
270
271 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
272 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
273 setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
274 setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
275 setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
276 setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
277 setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
278
279 setOperationAction(ISD::GlobalAddress, {MVT::i32, MVT::i64}, Custom);
280 setOperationAction(ISD::BlockAddress, {MVT::i32, MVT::i64}, Custom);
281 setOperationAction(ISD::ExternalSymbol, {MVT::i32, MVT::i64}, Custom);
282
286 AddPromotedToType(ISD::SELECT, MVT::f64, MVT::i64);
287
288 setOperationAction(ISD::FSQRT, {MVT::f32, MVT::f64}, Custom);
289
291 {MVT::f32, MVT::i32, MVT::i64, MVT::f64, MVT::i1}, Expand);
292
294 setOperationAction(ISD::SETCC, {MVT::v2i1, MVT::v4i1}, Expand);
295 AddPromotedToType(ISD::SETCC, MVT::i1, MVT::i32);
296
298 {MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32,
299 MVT::v6i32, MVT::v7i32, MVT::v8i32, MVT::v9i32,
300 MVT::v10i32, MVT::v11i32, MVT::v12i32, MVT::v16i32},
301 Expand);
303 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
304 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v9f32,
305 MVT::v10f32, MVT::v11f32, MVT::v12f32, MVT::v16f32},
306 Expand);
307
309 {MVT::v2i1, MVT::v4i1, MVT::v2i8, MVT::v4i8, MVT::v2i16,
310 MVT::v3i16, MVT::v4i16, MVT::Other},
311 Custom);
312
315 {MVT::i1, MVT::i32, MVT::i64, MVT::f32, MVT::f64}, Expand);
316
319
322
324 Expand);
325
327
328 // We only support LOAD/STORE and vector manipulation ops for vectors
329 // with > 4 elements.
330 for (MVT VT :
331 {MVT::v8i32, MVT::v8f32, MVT::v9i32, MVT::v9f32, MVT::v10i32,
332 MVT::v10f32, MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32,
333 MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64, MVT::v4i16,
334 MVT::v4f16, MVT::v4bf16, MVT::v3i64, MVT::v3f64, MVT::v6i32,
335 MVT::v6f32, MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
336 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
337 MVT::v16bf16, MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32,
338 MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
339 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
340 switch (Op) {
341 case ISD::LOAD:
342 case ISD::STORE:
344 case ISD::BITCAST:
345 case ISD::UNDEF:
349 case ISD::IS_FPCLASS:
350 break;
355 break;
356 default:
358 break;
359 }
360 }
361 }
362
364
365 // TODO: For dynamic 64-bit vector inserts/extracts, should emit a pseudo that
366 // is expanded to avoid having two separate loops in case the index is a VGPR.
367
368 // Most operations are naturally 32-bit vector operations. We only support
369 // load and store of i64 vectors, so promote v2i64 vector operations to v4i32.
370 for (MVT Vec64 : {MVT::v2i64, MVT::v2f64}) {
372 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v4i32);
373
375 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v4i32);
376
378 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v4i32);
379
381 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
382 }
383
384 for (MVT Vec64 : {MVT::v3i64, MVT::v3f64}) {
386 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v6i32);
387
389 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v6i32);
390
392 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v6i32);
393
395 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v6i32);
396 }
397
398 for (MVT Vec64 : {MVT::v4i64, MVT::v4f64}) {
400 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
401
403 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
404
406 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
407
409 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
410 }
411
412 for (MVT Vec64 : {MVT::v8i64, MVT::v8f64}) {
414 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
415
417 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
418
420 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
421
423 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
424 }
425
426 for (MVT Vec64 : {MVT::v16i64, MVT::v16f64}) {
428 AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
429
431 AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
432
434 AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
435
437 AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
438 }
439
441 {MVT::v4i32, MVT::v4f32, MVT::v8i32, MVT::v8f32,
442 MVT::v16i32, MVT::v16f32, MVT::v32i32, MVT::v32f32},
443 Custom);
444
445 if (Subtarget->hasPkMovB32()) {
446 // TODO: 16-bit element vectors should be legal with even aligned elements.
447 // TODO: Can be legal with wider source types than the result with
448 // subregister extracts.
449 setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
450 }
451
453 // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
454 // instead lower to cndmask in SITargetLowering::LowerSELECT().
456 // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
457 // alignbit.
458 setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
459
460 setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
461 Custom);
462
463 // Avoid stack access for these.
464 // TODO: Generalize to more vector types.
466 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v2i8, MVT::v4i8,
467 MVT::v8i8, MVT::v4i16, MVT::v4f16, MVT::v4bf16},
468 Custom);
469
470 // Deal with vec3 vector operations when widened to vec4.
472 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32}, Custom);
473
474 // Deal with vec5/6/7 vector operations when widened to vec8.
476 {MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
477 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
478 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
479 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
480 Custom);
481
482 // BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling,
483 // and output demarshalling
484 setOperationAction(ISD::ATOMIC_CMP_SWAP, {MVT::i32, MVT::i64}, Custom);
485
486 // We can't return success/failure, only the old value,
487 // let LLVM add the comparison
489 Expand);
490
491 setOperationAction(ISD::ADDRSPACECAST, {MVT::i32, MVT::i64}, Custom);
492
493 setOperationAction(ISD::BITREVERSE, {MVT::i32, MVT::i64}, Legal);
494
495 // FIXME: This should be narrowed to i32, but that only happens if i64 is
496 // illegal.
497 // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
498 setOperationAction(ISD::BSWAP, {MVT::i64, MVT::i32}, Legal);
499
500 // This is s_memtime on SI and s_memrealtime on VI.
502
503 if (Subtarget->hasSMemRealTime() ||
504 Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11)
507
508 if (Subtarget->has16BitInsts()) {
511 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
514 } else {
516 }
517
518 if (Subtarget->hasMadMacF32Insts())
520
524
525 // We only really have 32-bit BFE instructions (and 16-bit on VI).
526 //
527 // On SI+ there are 64-bit BFEs, but they are scalar only and there isn't any
528 // effort to match them now. We want this to be false for i64 cases when the
529 // extraction isn't restricted to the upper or lower half. Ideally we would
530 // have some pass reduce 64-bit extracts to 32-bit if possible. Extracts that
531 // span the midpoint are probably relatively rare, so don't worry about them
532 // for now.
534
535 // Clamp modifier on add/sub
536 if (Subtarget->hasIntClamp())
538
539 if (Subtarget->hasAddNoCarryInsts())
540 setOperationAction({ISD::SADDSAT, ISD::SSUBSAT}, {MVT::i16, MVT::i32},
541 Legal);
542
545 {MVT::f32, MVT::f64}, Custom);
546
547 // These are really only legal for ieee_mode functions. We should be avoiding
548 // them for functions that don't have ieee_mode enabled, so just say they are
549 // legal.
551 {MVT::f32, MVT::f64}, Legal);
552
553 if (Subtarget->haveRoundOpsF64())
555 Legal);
556 else
558 MVT::f64, Custom);
559
561 setOperationAction({ISD::FLDEXP, ISD::STRICT_FLDEXP}, {MVT::f32, MVT::f64},
562 Legal);
563 setOperationAction(ISD::FFREXP, {MVT::f32, MVT::f64}, Custom);
564
567
568 setOperationAction(ISD::BF16_TO_FP, {MVT::i16, MVT::f32, MVT::f64}, Expand);
569 setOperationAction(ISD::FP_TO_BF16, {MVT::i16, MVT::f32, MVT::f64}, Expand);
570
572 Custom);
574 Custom);
576 Custom);
577
578 // Custom lower these because we can't specify a rule based on an illegal
579 // source bf16.
582
583 if (Subtarget->has16BitInsts()) {
586 MVT::i16, Legal);
587
588 AddPromotedToType(ISD::SIGN_EXTEND, MVT::i16, MVT::i32);
589
591 MVT::i16, Expand);
592
596 ISD::CTPOP},
597 MVT::i16, Promote);
598
600
601 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
602
604 AddPromotedToType(ISD::FP16_TO_FP, MVT::i16, MVT::i32);
606 AddPromotedToType(ISD::FP_TO_FP16, MVT::i16, MVT::i32);
607
612
614
615 // F16 - Constant Actions.
618
619 // F16 - Load/Store Actions.
621 AddPromotedToType(ISD::LOAD, MVT::f16, MVT::i16);
623 AddPromotedToType(ISD::STORE, MVT::f16, MVT::i16);
624
625 // BF16 - Load/Store Actions.
627 AddPromotedToType(ISD::LOAD, MVT::bf16, MVT::i16);
629 AddPromotedToType(ISD::STORE, MVT::bf16, MVT::i16);
630
631 // F16 - VOP1 Actions.
634 MVT::f16, Custom);
635
636 // BF16 - VOP1 Actions.
637 if (Subtarget->hasBF16TransInsts())
639
640 // F16 - VOP2 Actions.
641 setOperationAction({ISD::BR_CC, ISD::SELECT_CC}, {MVT::f16, MVT::bf16},
642 Expand);
646
647 // F16 - VOP3 Actions.
649 if (STI.hasMadF16())
651
652 for (MVT VT :
653 {MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::v4i16, MVT::v4f16,
654 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16,
655 MVT::v16f16, MVT::v16bf16, MVT::v32i16, MVT::v32f16}) {
656 for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
657 switch (Op) {
658 case ISD::LOAD:
659 case ISD::STORE:
661 case ISD::BITCAST:
662 case ISD::UNDEF:
667 case ISD::IS_FPCLASS:
668 break;
671 case ISD::FSIN:
672 case ISD::FCOS:
674 break;
675 default:
677 break;
678 }
679 }
680 }
681
682 // v_perm_b32 can handle either of these.
683 setOperationAction(ISD::BSWAP, {MVT::i16, MVT::v2i16}, Legal);
685
686 // Legalize vector types for sat conversions to select v_cvt_pk_[iu]16_f32.
687 if (Subtarget->hasVCvtPkIU16F32())
690 {MVT::v2i16, MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16},
691 Custom);
692
693 // XXX - Do these do anything? Vector constants turn into build_vector.
694 setOperationAction(ISD::Constant, {MVT::v2i16, MVT::v2f16}, Legal);
695
696 setOperationAction(ISD::UNDEF, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
697 Legal);
698
700 AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
702 AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
703
705 AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
707 AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
708
710 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2i16, MVT::i32);
712 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2f16, MVT::i32);
713
715 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2i16, MVT::i32);
717 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2f16, MVT::i32);
718
719 setOperationAction(ISD::AND, MVT::v2i16, Promote);
720 AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
721 setOperationAction(ISD::OR, MVT::v2i16, Promote);
722 AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
723 setOperationAction(ISD::XOR, MVT::v2i16, Promote);
724 AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
725
727 AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::v2i32);
729 AddPromotedToType(ISD::LOAD, MVT::v4f16, MVT::v2i32);
730 setOperationAction(ISD::LOAD, MVT::v4bf16, Promote);
731 AddPromotedToType(ISD::LOAD, MVT::v4bf16, MVT::v2i32);
732
734 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v4i16, MVT::i64);
736 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v4f16, MVT::i64);
737
739 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v4i16, MVT::i64);
741 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v4f16, MVT::i64);
742
744 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
746 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
748 AddPromotedToType(ISD::STORE, MVT::v4bf16, MVT::v2i32);
749
751 AddPromotedToType(ISD::LOAD, MVT::v8i16, MVT::v4i32);
753 AddPromotedToType(ISD::LOAD, MVT::v8f16, MVT::v4i32);
754 setOperationAction(ISD::LOAD, MVT::v8bf16, Promote);
755 AddPromotedToType(ISD::LOAD, MVT::v8bf16, MVT::v4i32);
756
758 AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::v2i32);
760 AddPromotedToType(ISD::STORE, MVT::v4f16, MVT::v2i32);
761
763 AddPromotedToType(ISD::STORE, MVT::v8i16, MVT::v4i32);
765 AddPromotedToType(ISD::STORE, MVT::v8f16, MVT::v4i32);
767 AddPromotedToType(ISD::STORE, MVT::v8bf16, MVT::v4i32);
768
769 setOperationAction(ISD::LOAD, MVT::v16i16, Promote);
770 AddPromotedToType(ISD::LOAD, MVT::v16i16, MVT::v8i32);
771 setOperationAction(ISD::LOAD, MVT::v16f16, Promote);
772 AddPromotedToType(ISD::LOAD, MVT::v16f16, MVT::v8i32);
773 setOperationAction(ISD::LOAD, MVT::v16bf16, Promote);
774 AddPromotedToType(ISD::LOAD, MVT::v16bf16, MVT::v8i32);
775
777 AddPromotedToType(ISD::STORE, MVT::v16i16, MVT::v8i32);
779 AddPromotedToType(ISD::STORE, MVT::v16f16, MVT::v8i32);
780 setOperationAction(ISD::STORE, MVT::v16bf16, Promote);
781 AddPromotedToType(ISD::STORE, MVT::v16bf16, MVT::v8i32);
782
783 setOperationAction(ISD::LOAD, MVT::v32i16, Promote);
784 AddPromotedToType(ISD::LOAD, MVT::v32i16, MVT::v16i32);
785 setOperationAction(ISD::LOAD, MVT::v32f16, Promote);
786 AddPromotedToType(ISD::LOAD, MVT::v32f16, MVT::v16i32);
787 setOperationAction(ISD::LOAD, MVT::v32bf16, Promote);
788 AddPromotedToType(ISD::LOAD, MVT::v32bf16, MVT::v16i32);
789
791 AddPromotedToType(ISD::STORE, MVT::v32i16, MVT::v16i32);
793 AddPromotedToType(ISD::STORE, MVT::v32f16, MVT::v16i32);
794 setOperationAction(ISD::STORE, MVT::v32bf16, Promote);
795 AddPromotedToType(ISD::STORE, MVT::v32bf16, MVT::v16i32);
796
798 MVT::v2i32, Expand);
800
802 MVT::v4i32, Expand);
803
805 MVT::v8i32, Expand);
806
807 setOperationAction(ISD::BUILD_VECTOR, {MVT::v2i16, MVT::v2f16, MVT::v2bf16},
808 Subtarget->hasVOP3PInsts() ? Legal : Custom);
809
810 setOperationAction(ISD::FNEG, {MVT::v2f16, MVT::v2bf16}, Legal);
811 // This isn't really legal, but this avoids the legalizer unrolling it (and
812 // allows matching fneg (fabs x) patterns)
813 setOperationAction(ISD::FABS, {MVT::v2f16, MVT::v2bf16}, Legal);
814
815 // Can do this in one BFI plus a constant materialize.
817 {MVT::v2f16, MVT::v2bf16, MVT::v4f16, MVT::v4bf16,
818 MVT::v8f16, MVT::v8bf16, MVT::v16f16, MVT::v16bf16,
819 MVT::v32f16, MVT::v32bf16},
820 Custom);
821
824 MVT::f16, Custom);
826
829 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
830 Custom);
831
833 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
834 Expand);
835
836 for (MVT Vec16 :
837 {MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::v16i16, MVT::v16f16,
838 MVT::v16bf16, MVT::v32i16, MVT::v32f16, MVT::v32bf16}) {
841 Vec16, Custom);
843 }
844 }
845
846 if (Subtarget->hasVOP3PInsts()) {
850 MVT::v2i16, Legal);
851
855 MVT::v2f16, Legal);
856
858 {MVT::v2i16, MVT::v2f16, MVT::v2bf16}, Custom);
859
861 {MVT::v4f16, MVT::v4i16, MVT::v4bf16, MVT::v8f16,
862 MVT::v8i16, MVT::v8bf16, MVT::v16f16, MVT::v16i16,
863 MVT::v16bf16, MVT::v32f16, MVT::v32i16, MVT::v32bf16},
864 Custom);
865
866 for (MVT VT : {MVT::v4i16, MVT::v8i16, MVT::v16i16, MVT::v32i16})
867 // Split vector operations.
872 VT, Custom);
873
874 for (MVT VT : {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16})
875 // Split vector operations.
878 VT, Custom);
879
882 {MVT::v2f16, MVT::v4f16}, Custom);
883
884 setOperationAction(ISD::FEXP, MVT::v2f16, Custom);
885 setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
886 Custom);
887
888 if (Subtarget->hasBF16PackedInsts()) {
891 MVT::v2bf16, Legal);
892
893 for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
894 // Split vector operations.
897 VT, Custom);
898 }
899
900 if (Subtarget->hasPackedFP32Ops()) {
902 MVT::v2f32, Legal);
904 {MVT::v4f32, MVT::v8f32, MVT::v16f32, MVT::v32f32},
905 Custom);
906 }
907 if (Subtarget->hasPackedFP64Ops()) {
911 MVT::v2f64, Legal);
914 MVT::v2f64, Custom);
919 {MVT::v4f64, MVT::v8f64, MVT::v16f64, MVT::v32f64}, Custom);
920 }
921
922 if (Subtarget->hasPackedU64Ops()) {
924 MVT::v2i64, Legal);
926 {MVT::v4i64, MVT::v8i64, MVT::v16i64, MVT::v32i64},
927 Custom);
928 }
929 }
930
932
933 if (Subtarget->has16BitInsts()) {
935 AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
937 AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
938 } else {
939 // Legalization hack.
940 setOperationAction(ISD::SELECT, {MVT::v2i16, MVT::v2f16}, Custom);
941
943 }
944
946 {MVT::v4i16, MVT::v4f16, MVT::v4bf16, MVT::v2i8, MVT::v4i8,
947 MVT::v8i8, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
948 MVT::v16i16, MVT::v16f16, MVT::v16bf16, MVT::v32i16,
949 MVT::v32f16, MVT::v32bf16},
950 Custom);
951
953
954 if (Subtarget->hasVMulU64Inst())
956 else if (Subtarget->hasScalarSMulU64())
958
959 if (Subtarget->hasMad64_32())
961
962 if (Subtarget->hasSafeSmemPrefetch() || Subtarget->hasVmemPrefInsts())
964
965 if (Subtarget->hasIEEEMinimumMaximumInsts()) {
967 {MVT::f16, MVT::f32, MVT::f64, MVT::v2f16}, Legal);
968 } else {
969 // FIXME: For nnan fmaximum, emit the fmaximum3 instead of fmaxnum
970 if (Subtarget->hasMinimum3Maximum3F32())
972
973 if (Subtarget->hasMinimum3Maximum3PKF16()) {
975
976 // If only the vector form is available, we need to widen to a vector.
977 if (!Subtarget->hasMinimum3Maximum3F16())
979 }
980 }
981
982 if (Subtarget->hasVOP3PInsts()) {
983 // We want to break these into v2f16 pieces, not scalarize.
985 {MVT::v4f16, MVT::v8f16, MVT::v16f16, MVT::v32f16},
986 Custom);
987 }
988
989 if (Subtarget->hasMinMaxI64Insts())
991 Legal);
992
994 {MVT::Other, MVT::f32, MVT::v4f32, MVT::i16, MVT::f16,
995 MVT::bf16, MVT::v2i16, MVT::v2f16, MVT::v2bf16, MVT::i128,
996 MVT::i8},
997 Custom);
998
1000 {MVT::v2f16, MVT::v2i16, MVT::v2bf16, MVT::v3f16,
1001 MVT::v3i16, MVT::v4f16, MVT::v4i16, MVT::v4bf16,
1002 MVT::v8i16, MVT::v8f16, MVT::v8bf16, MVT::Other, MVT::f16,
1003 MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
1004 Custom);
1005
1007 {MVT::Other, MVT::v2i16, MVT::v2f16, MVT::v2bf16,
1008 MVT::v3i16, MVT::v3f16, MVT::v4f16, MVT::v4i16,
1009 MVT::v4bf16, MVT::v8i16, MVT::v8f16, MVT::v8bf16,
1010 MVT::f16, MVT::i16, MVT::bf16, MVT::i8, MVT::i128},
1011 Custom);
1012
1018
1019 // TODO: Could move this to custom lowering, could benefit from combines on
1020 // extract of relevant bits.
1022
1024
1025 if (Subtarget->hasBF16ConversionInsts()) {
1026 setOperationAction(ISD::FP_ROUND, {MVT::bf16, MVT::v2bf16}, Custom);
1028 }
1029
1030 if (Subtarget->hasBF16TransInsts()) {
1032 }
1033
1034 if (Subtarget->hasCvtPkF16F32Inst()) {
1036 {MVT::v2f16, MVT::v4f16, MVT::v8f16, MVT::v16f16},
1037 Custom);
1038 }
1039
1042 ISD::SUB,
1043 ISD::MUL,
1044 ISD::FADD,
1045 ISD::FSUB,
1046 ISD::FDIV,
1047 ISD::FMUL,
1056 ISD::FMA,
1057 ISD::ABS,
1058 ISD::SMIN,
1059 ISD::SMAX,
1060 ISD::UMIN,
1061 ISD::UMAX,
1062 ISD::SETCC,
1064 ISD::SMIN,
1065 ISD::SMAX,
1066 ISD::UMIN,
1067 ISD::UMAX,
1069 ISD::AND,
1070 ISD::OR,
1071 ISD::XOR,
1072 ISD::SHL,
1073 ISD::SRL,
1074 ISD::SRA,
1075 ISD::FSHR,
1086
1087 if (Subtarget->has16BitInsts() && !Subtarget->hasMed3_16())
1089
1090 // All memory operations. Some folding on the pointer operand is done to help
1091 // matching the constant offsets in the addressing modes.
1093 ISD::STORE,
1118
1119 // FIXME: In other contexts we pretend this is a per-function property.
1121
1123}
1124
1125const GCNSubtarget *SITargetLowering::getSubtarget() const { return Subtarget; }
1126
1128 static const MCPhysReg RCRegs[] = {AMDGPU::MODE};
1129 return RCRegs;
1130}
1131
1132//===----------------------------------------------------------------------===//
1133// TargetLowering queries
1134//===----------------------------------------------------------------------===//
1135
1136// v_mad_mix* support a conversion from f16 to f32.
1137//
1138// There is only one special case when denormals are enabled we don't currently,
1139// where this is OK to use.
1140bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
1141 EVT DestVT, EVT SrcVT) const {
1142 return DestVT.getScalarType() == MVT::f32 &&
1143 ((((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
1144 (Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
1145 SrcVT.getScalarType() == MVT::f16) ||
1146 (Opcode == ISD::FMA && Subtarget->hasFmaMixBF16Insts() &&
1147 SrcVT.getScalarType() == MVT::bf16)) &&
1148 // TODO: This probably only requires no input flushing?
1150}
1151
1153 LLT DestTy, LLT SrcTy) const {
1154 return ((Opcode == TargetOpcode::G_FMAD && Subtarget->hasMadMixInsts()) ||
1155 (Opcode == TargetOpcode::G_FMA && Subtarget->hasFmaMixInsts())) &&
1156 DestTy.getScalarSizeInBits() == 32 &&
1157 SrcTy.getScalarSizeInBits() == 16 &&
1158 // TODO: This probably only requires no input flushing?
1159 denormalModeIsFlushAllF32(*MI.getMF());
1160}
1161
1163 // SI has some legal vector types, but no legal vector operations. Say no
1164 // shuffles are legal in order to prefer scalarizing some vector operations.
1165 return false;
1166}
1167
1169 CallingConv::ID CC,
1170 EVT VT) const {
1172 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1173
1174 if (VT.isVector()) {
1175 EVT ScalarVT = VT.getScalarType();
1176 unsigned Size = ScalarVT.getSizeInBits();
1177 if (Size == 16) {
1178 return Subtarget->has16BitInsts()
1179 ? MVT::getVectorVT(ScalarVT.getSimpleVT(), 2)
1180 : MVT::i32;
1181 }
1182
1183 if (Size < 16)
1184 return Subtarget->has16BitInsts() ? MVT::i16 : MVT::i32;
1185 return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
1186 }
1187
1188 if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
1189 return MVT::i32;
1190
1191 if (VT.getSizeInBits() > 32)
1192 return MVT::i32;
1193
1194 return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
1195}
1196
1198 CallingConv::ID CC,
1199 EVT VT) const {
1201 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1202
1203 if (VT.isVector()) {
1204 unsigned NumElts = VT.getVectorNumElements();
1205 EVT ScalarVT = VT.getScalarType();
1206 unsigned Size = ScalarVT.getSizeInBits();
1207
1208 // FIXME: Should probably promote 8-bit vectors to i16.
1209 if (Size == 16)
1210 return (NumElts + 1) / 2;
1211
1212 if (Size <= 32)
1213 return NumElts;
1214
1215 if (Size > 32)
1216 return NumElts * ((Size + 31) / 32);
1217 } else if (VT.getSizeInBits() > 32)
1218 return (VT.getSizeInBits() + 31) / 32;
1219
1220 return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
1221}
1222
1224 LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
1225 unsigned &NumIntermediates, MVT &RegisterVT) const {
1226 if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
1227 unsigned NumElts = VT.getVectorNumElements();
1228 EVT ScalarVT = VT.getScalarType();
1229 unsigned Size = ScalarVT.getSizeInBits();
1230 // FIXME: We should fix the ABI to be the same on targets without 16-bit
1231 // support, but unless we can properly handle 3-vectors, it will still be
1232 // inconsistent.
1233 if (Size == 16) {
1234 MVT SimpleIntermediateVT =
1236 IntermediateVT = SimpleIntermediateVT;
1237 RegisterVT = Subtarget->has16BitInsts() ? SimpleIntermediateVT : MVT::i32;
1238 NumIntermediates = (NumElts + 1) / 2;
1239 return (NumElts + 1) / 2;
1240 }
1241
1242 if (Size == 32) {
1243 RegisterVT = ScalarVT.getSimpleVT();
1244 IntermediateVT = RegisterVT;
1245 NumIntermediates = NumElts;
1246 return NumIntermediates;
1247 }
1248
1249 if (Size < 16 && Subtarget->has16BitInsts()) {
1250 // FIXME: Should probably form v2i16 pieces
1251 RegisterVT = MVT::i16;
1252 IntermediateVT = ScalarVT;
1253 NumIntermediates = NumElts;
1254 return NumIntermediates;
1255 }
1256
1257 if (Size != 16 && Size <= 32) {
1258 RegisterVT = MVT::i32;
1259 IntermediateVT = ScalarVT;
1260 NumIntermediates = NumElts;
1261 return NumIntermediates;
1262 }
1263
1264 if (Size > 32) {
1265 RegisterVT = MVT::i32;
1266 IntermediateVT = RegisterVT;
1267 NumIntermediates = NumElts * ((Size + 31) / 32);
1268 return NumIntermediates;
1269 }
1270 }
1271
1273 Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
1274}
1275
// Computes the memory value type for intrinsic data of IR type Ty, limiting
// fixed-vector types to at most MaxNumLanes elements.
// NOTE(review): the declaration line (embedded number 1276) is missing from
// this listing; call sites in this file name it memVTFromLoadIntrData.
1277 const DataLayout &DL, Type *Ty,
1278 unsigned MaxNumLanes) {
// A lane limit of zero is rejected by assertion.
1279 assert(MaxNumLanes != 0);
1280
1281 LLVMContext &Ctx = Ty->getContext();
// Fixed vectors keep their element type but use
// min(MaxNumLanes, element count) lanes.
1282 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
1283 unsigned NumElts = std::min(MaxNumLanes, VT->getNumElements());
1284 return EVT::getVectorVT(Ctx, TLI.getValueType(DL, VT->getElementType()),
1285 NumElts);
1286 }
1287
// Any other type is mapped directly by TLI.getValueType.
1288 return TLI.getValueType(DL, Ty);
1289}
1290
1291 // Peek through TFE struct returns to only use the data size.
// Non-struct types are forwarded unchanged to memVTFromLoadIntrData; for a
// struct, only contained type 0 is used.
// NOTE(review): the declaration line (embedded number 1292) is missing from
// this listing; a call site in this file names it memVTFromLoadIntrReturn.
1293 const DataLayout &DL, Type *Ty,
1294 unsigned MaxNumLanes) {
1295 auto *ST = dyn_cast<StructType>(Ty);
1296 if (!ST)
1297 return memVTFromLoadIntrData(TLI, DL, Ty, MaxNumLanes);
1298
1299 // TFE intrinsics return an aggregate type.
// The struct must have exactly two members, the second being a 32-bit integer.
1300 assert(ST->getNumContainedTypes() == 2 &&
1301 ST->getContainedType(1)->isIntegerTy(32));
1302 return memVTFromLoadIntrData(TLI, DL, ST->getContainedType(0), MaxNumLanes);
1303}
1304
1305 /// Map address space 7 to MVT::amdgpuBufferFatPointer because that's its
1306 /// in-memory representation. This return value is a custom type because there
1307 /// is no MVT::i160 and adding one breaks integer promotion logic. While this
1308 /// could cause issues during codegen, these address space 7 pointers will be
1309 /// rewritten away by then. Therefore, we can return MVT::amdgpuBufferFatPointer
1310 /// in order to allow pre-codegen passes that query TargetTransformInfo, often
1311 /// for cost modeling, to work. (This also sets us up decently for doing the
1312 /// buffer lowering in GlobalISel if SelectionDAG ever goes away.)
// NOTE(review): the declaration line (1313), the first half of the second
// condition (1316) and the final return (1319) are missing from this listing.
// A 160-bit pointer in the buffer-fat-pointer address space gets the custom
// fat-pointer type.
1314 if (AMDGPUAS::BUFFER_FAT_POINTER == AS && DL.getPointerSizeInBits(AS) == 160)
1315 return MVT::amdgpuBufferFatPointer;
// A 192-bit pointer gets the strided-pointer type; the address-space half of
// this condition is on the missing line.
1317 DL.getPointerSizeInBits(AS) == 192)
1318 return MVT::amdgpuBufferStridedPointer;
1320}
1321 /// Similarly, the in-memory representation of a p7 is {p8, i32}, aka
1322 /// v8i32 when padding is added.
1323 /// The in-memory representation of a p9 is {p8, i32, i32}, which is
1324 /// also v8i32 with padding.
// NOTE(review): the declaration line (1325), the address-space half of the
// second condition (1328) and the final return (1331) are missing from this
// listing.
// Both the 160-bit and the 192-bit pointer cases return v8i32.
1326 if ((AMDGPUAS::BUFFER_FAT_POINTER == AS &&
1327 DL.getPointerSizeInBits(AS) == 160) ||
1329 DL.getPointerSizeInBits(AS) == 192))
1330 return MVT::v8i32;
1332}
1333
1334static unsigned getIntrMemWidth(unsigned IntrID) {
1335 switch (IntrID) {
1336 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1337 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1338 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1339 return 8;
1340 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1341 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1342 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1343 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1344 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1345 case Intrinsic::amdgcn_flat_load_monitor_b32:
1346 case Intrinsic::amdgcn_global_load_monitor_b32:
1347 return 32;
1348 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1349 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1350 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1351 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1352 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1353 case Intrinsic::amdgcn_flat_load_monitor_b64:
1354 case Intrinsic::amdgcn_global_load_monitor_b64:
1355 return 64;
1356 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1357 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1358 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1359 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B:
1360 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B:
1361 case Intrinsic::amdgcn_flat_load_monitor_b128:
1362 case Intrinsic::amdgcn_global_load_monitor_b128:
1363 return 128;
1364 default:
1365 llvm_unreachable("Unknown width");
1366 }
1367}
1368
// Reads the constant-integer argument at ArgIdx of call CI and switches on it
// interpreted as an AtomicOrderingCABI value.
// NOTE(review): the declaration line, every case label and every return
// statement are missing from this listing (embedded numbers 1369, 1374-1375,
// 1377-1378, 1380-1381, 1384), so the mapping itself cannot be described from
// here. Call sites assign the result to Info.order.
1370 unsigned ArgIdx) {
1371 Value *OrderingArg = CI.getArgOperand(ArgIdx);
// The argument must be a ConstantInt; cast<> asserts otherwise.
1372 unsigned Ord = cast<ConstantInt>(OrderingArg)->getZExtValue();
1373 switch (AtomicOrderingCABI(Ord)) {
1376 break;
1379 break;
1382 break;
1383 default:
1385 }
1386}
1387
1388static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx) {
1389 MDNode *ScopeMD = cast<MDNode>(
1390 cast<MetadataAsValue>(CI.getArgOperand(ArgIdx))->getMetadata());
1391 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1392 return CI.getContext().getOrInsertSyncScopeID(Scope);
1393}
1394
// Describes the memory behaviour of target intrinsic IntrID for call CI by
// appending one or more IntrinsicInfo records to Infos. Intrinsics that do not
// match any case below leave Infos untouched.
// NOTE(review): this listing is missing the declaration line (which declares
// Infos) and a number of interior lines, including the declaration of Flags
// and several flag assignments. Comments below only describe what is visible.
1396 const CallBase &CI,
1397 MachineFunction &MF,
1398 unsigned IntrID) const {
// Flags is accumulated from call metadata; the statements run under the two
// conditions below are on lines missing from this listing.
1400 if (CI.hasMetadata(LLVMContext::MD_invariant_load))
1402 if (CI.hasMetadata(LLVMContext::MD_nontemporal))
1404 Flags |= getTargetMMOFlags(CI);
1405
// First part: intrinsics found in the resource-intrinsic table (the lookup
// expression itself is on a missing line).
1406 if (const AMDGPU::RsrcIntrinsic *RsrcIntr =
1408 AttributeSet Attr =
1410 MemoryEffects ME = Attr.getMemoryEffects();
// Nothing to describe for an intrinsic that does not access memory.
1411 if (ME.doesNotAccessMemory())
1412 return;
1413
1414 bool IsSPrefetch = IntrID == Intrinsic::amdgcn_s_buffer_prefetch_data;
1415 if (!IsSPrefetch) {
// The last call argument is read as cache-policy bits; the statement run when
// the VOLATILE bit is set is missing from this listing.
1416 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1417 if (Aux->getZExtValue() & AMDGPU::CPol::VOLATILE)
1419 }
1421
1422 IntrinsicInfo Info;
1423 // TODO: Should images get their own address space?
1425
1426 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode = nullptr;
1427 if (RsrcIntr->IsImage) {
1428 const AMDGPU::ImageDimIntrinsicInfo *Intr =
1430 BaseOpcode = AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
1431 Info.align.reset();
1432 }
1433
1434 Value *RsrcArg = CI.getArgOperand(RsrcIntr->RsrcArg);
1435 if (auto *RsrcPtrTy = dyn_cast<PointerType>(RsrcArg->getType())) {
1436 if (RsrcPtrTy->getAddressSpace() == AMDGPUAS::BUFFER_RESOURCE)
1437 // We conservatively set the memory operand of a buffer intrinsic to the
1438 // base resource pointer, so that we can access alias information about
1439 // those pointers. Cases like "this points at the same value
1440 // but with a different offset" are handled in
1441 // areMemAccessesTriviallyDisjoint.
1442 Info.ptrVal = RsrcArg;
1443 }
1444
// Read-only resource intrinsics are described as chained loads.
1445 if (ME.onlyReadsMemory()) {
1446 if (RsrcIntr->IsImage) {
1447 unsigned MaxNumLanes = 4;
1448
1449 if (!BaseOpcode->Gather4) {
1450 // If this isn't a gather, we may have excess loaded elements in the
1451 // IR type. Check the dmask for the real number of elements loaded.
1452 unsigned DMask =
1453 cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
// A zero dmask is counted as one lane.
1454 MaxNumLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1455 }
1456
1457 Info.memVT = memVTFromLoadIntrReturn(*this, MF.getDataLayout(),
1458 CI.getType(), MaxNumLanes);
1459 } else {
// Non-image loads impose no lane limit; the callee is on a missing line.
1460 Info.memVT =
1462 std::numeric_limits<unsigned>::max());
1463 }
1464
1465 // FIXME: What does alignment mean for an image?
1466 Info.opc = ISD::INTRINSIC_W_CHAIN;
1467 Info.flags = Flags | MachineMemOperand::MOLoad;
// Write-only resource intrinsics are described as void stores of argument 0.
1468 } else if (ME.onlyWritesMemory()) {
1469 Info.opc = ISD::INTRINSIC_VOID;
1470
1471 Type *DataTy = CI.getArgOperand(0)->getType();
1472 if (RsrcIntr->IsImage) {
// For image stores the dmask is argument 1; a zero dmask counts as one lane.
1473 unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
1474 unsigned DMaskLanes = DMask == 0 ? 1 : llvm::popcount(DMask);
1475 Info.memVT = memVTFromLoadIntrData(*this, MF.getDataLayout(), DataTy,
1476 DMaskLanes);
1477 } else
1478 Info.memVT = getValueType(MF.getDataLayout(), DataTy);
1479
1480 Info.flags = Flags | MachineMemOperand::MOStore;
1481 } else {
1482 // Atomic, NoReturn Sampler or prefetch
// The opcode depends on whether the call returns a value; the non-void
// alternative is on a missing line.
1483 Info.opc = CI.getType()->isVoidTy() ? ISD::INTRINSIC_VOID
1485
1486 switch (IntrID) {
1487 default:
// Treated as a load, and also as a store unless this is the scalar buffer
// prefetch.
1488 Info.flags = Flags | MachineMemOperand::MOLoad;
1489 if (!IsSPrefetch)
1490 Info.flags |= MachineMemOperand::MOStore;
1491
1492 if ((RsrcIntr->IsImage && BaseOpcode->NoReturn) || IsSPrefetch) {
1493 // Fake memory access type for no return sampler intrinsics
1494 Info.memVT = MVT::i32;
1495 } else {
1496 // XXX - Should this be volatile without known ordering?
1497 Info.flags |= MachineMemOperand::MOVolatile;
1498 Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
1499 }
1500 break;
// Buffer-to-LDS loads produce two records: the buffer load and the LDS store.
1501 case Intrinsic::amdgcn_raw_buffer_load_lds:
1502 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
1503 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
1504 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
1505 case Intrinsic::amdgcn_struct_buffer_load_lds:
1506 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
1507 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
1508 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
// Argument 2 is multiplied by 8 below to form an integer bit width.
1509 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1510
1511 // Entry 0: Load from buffer.
1512 // Don't set an offset, since the pointer value always represents the
1513 // base of the buffer.
1514 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1515 Info.flags = Flags | MachineMemOperand::MOLoad;
1516 Infos.push_back(Info);
1517
1518 // Entry 1: Store to LDS.
1519 // Instruction offset is applied, and an additional per-lane offset
1520 // which we simulate using a larger memory type.
1521 Info.memVT = EVT::getIntegerVT(
1522 CI.getContext(), Width * 8 * Subtarget->getWavefrontSize());
1523 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
// The offset is taken from the second-to-last call argument.
1524 Info.offset = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 2))
1525 ->getZExtValue();
1526 Info.fallbackAddressSpace = AMDGPUAS::LOCAL_ADDRESS;
1527 Info.flags = Flags | MachineMemOperand::MOStore;
1528 Infos.push_back(Info);
1529 return;
1530 }
// Atomic buffer loads are described as plain loads with no lane limit; the
// callee computing memVT is on a missing line.
1531 case Intrinsic::amdgcn_raw_atomic_buffer_load:
1532 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
1533 case Intrinsic::amdgcn_struct_atomic_buffer_load:
1534 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
1535 Info.memVT =
1537 std::numeric_limits<unsigned>::max());
1538 Info.flags = Flags | MachineMemOperand::MOLoad;
1539 Infos.push_back(Info);
1540 return;
1541 }
1542 }
1543 }
// Single record for every resource intrinsic that did not return above.
1544 Infos.push_back(Info);
1545 return;
1546 }
1547
// Second part: intrinsics that are not in the resource-intrinsic table.
1548 IntrinsicInfo Info;
1549 switch (IntrID) {
1550 case Intrinsic::amdgcn_ds_ordered_add:
1551 case Intrinsic::amdgcn_ds_ordered_swap: {
1552 Info.opc = ISD::INTRINSIC_W_CHAIN;
1553 Info.memVT = MVT::getVT(CI.getType());
1554 Info.ptrVal = CI.getOperand(0);
1555 Info.align.reset();
// The base assignment to Info.flags is on a line missing from this listing.
1557
// Operand 4 is the volatile flag.
1558 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(4));
1559 if (!Vol->isZero())
1560 Info.flags |= MachineMemOperand::MOVolatile;
1561
1562 Infos.push_back(Info);
1563 return;
1564 }
1565 case Intrinsic::amdgcn_ds_add_gs_reg_rtn:
1566 case Intrinsic::amdgcn_ds_sub_gs_reg_rtn: {
1567 Info.opc = ISD::INTRINSIC_W_CHAIN;
1568 Info.memVT = MVT::getVT(CI.getOperand(0)->getType());
// No IR pointer is attached; only the fallback address space is recorded.
1569 Info.ptrVal = nullptr;
1570 Info.fallbackAddressSpace = AMDGPUAS::STREAMOUT_REGISTER;
1572 Infos.push_back(Info);
1573 return;
1574 }
1575 case Intrinsic::amdgcn_ds_append:
1576 case Intrinsic::amdgcn_ds_consume: {
1577 Info.opc = ISD::INTRINSIC_W_CHAIN;
1578 Info.memVT = MVT::getVT(CI.getType());
1579 Info.ptrVal = CI.getOperand(0);
1580 Info.align.reset();
// The base assignment to Info.flags is on a line missing from this listing.
1582
// Operand 1 is the volatile flag.
1583 const ConstantInt *Vol = cast<ConstantInt>(CI.getOperand(1));
1584 if (!Vol->isZero())
1585 Info.flags |= MachineMemOperand::MOVolatile;
1586
1587 Infos.push_back(Info);
1588 return;
1589 }
1590 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1591 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64: {
// The opcode depends on which variant this is; both alternatives are on
// missing lines.
1592 Info.opc = (IntrID == Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64)
// NOTE(review): memVT is assigned twice in this case; the MVT::i64 assignment
// two lines below overwrites this one.
1595 Info.memVT = MVT::getVT(CI.getType());
1596 Info.ptrVal = CI.getOperand(0);
1597 Info.memVT = MVT::i64;
1598 Info.size = 8;
1599 Info.align.reset();
1601 Info.order = AtomicOrdering::Monotonic;
1602 Infos.push_back(Info);
1603 return;
1604 }
1605 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
1606 case Intrinsic::amdgcn_image_bvh_intersect_ray:
1607 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
1608 Info.opc = ISD::INTRINSIC_W_CHAIN;
// bvh_intersect_ray uses the call's result type directly; the other two use
// element 0 of a type obtained on a missing line.
1609 Info.memVT =
1610 MVT::getVT(IntrID == Intrinsic::amdgcn_image_bvh_intersect_ray
1611 ? CI.getType()
1613 ->getElementType(0)); // XXX: what is correct VT?
1614
1615 Info.fallbackAddressSpace = AMDGPUAS::BUFFER_RESOURCE;
1616 Info.align.reset();
// The remainder of this flag expression is on a missing line.
1617 Info.flags = Flags | MachineMemOperand::MOLoad |
1619 Infos.push_back(Info);
1620 return;
1621 }
1622 case Intrinsic::amdgcn_global_atomic_fmin_num:
1623 case Intrinsic::amdgcn_global_atomic_fmax_num:
1624 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1625 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1626 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
1627 Info.opc = ISD::INTRINSIC_W_CHAIN;
1628 Info.memVT = MVT::getVT(CI.getType());
1629 Info.ptrVal = CI.getOperand(0);
1630 Info.align.reset();
// The flag expression is on lines missing from this listing.
1631 Info.flags =
1634 Infos.push_back(Info);
1635 return;
1636 }
// Plain loads through operand 0 whose memory type is the call's result type.
1637 case Intrinsic::amdgcn_cluster_load_b32:
1638 case Intrinsic::amdgcn_cluster_load_b64:
1639 case Intrinsic::amdgcn_cluster_load_b128:
1640 case Intrinsic::amdgcn_ds_load_tr6_b96:
1641 case Intrinsic::amdgcn_ds_load_tr4_b64:
1642 case Intrinsic::amdgcn_ds_load_tr8_b64:
1643 case Intrinsic::amdgcn_ds_load_tr16_b128:
1644 case Intrinsic::amdgcn_global_load_tr6_b96:
1645 case Intrinsic::amdgcn_global_load_tr4_b64:
1646 case Intrinsic::amdgcn_global_load_tr_b64:
1647 case Intrinsic::amdgcn_global_load_tr_b128:
1648 case Intrinsic::amdgcn_ds_read_tr4_b64:
1649 case Intrinsic::amdgcn_ds_read_tr6_b96:
1650 case Intrinsic::amdgcn_ds_read_tr8_b64:
1651 case Intrinsic::amdgcn_ds_read_tr16_b64: {
1652 Info.opc = ISD::INTRINSIC_W_CHAIN;
1653 Info.memVT = MVT::getVT(CI.getType());
1654 Info.ptrVal = CI.getOperand(0);
1655 Info.align.reset();
1656 Info.flags = Flags | MachineMemOperand::MOLoad;
1657 Infos.push_back(Info);
1658 return;
1659 }
// Monitor loads: integer memory type sized by getIntrMemWidth, with ordering
// from argument 1 and sync scope from argument 2. Unlike most cases, the
// accumulated Flags are not merged in here.
1660 case Intrinsic::amdgcn_flat_load_monitor_b32:
1661 case Intrinsic::amdgcn_flat_load_monitor_b64:
1662 case Intrinsic::amdgcn_flat_load_monitor_b128:
1663 case Intrinsic::amdgcn_global_load_monitor_b32:
1664 case Intrinsic::amdgcn_global_load_monitor_b64:
1665 case Intrinsic::amdgcn_global_load_monitor_b128: {
1666 Info.opc = ISD::INTRINSIC_W_CHAIN;
1667 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1668 Info.ptrVal = CI.getOperand(0);
1669 Info.align.reset();
1670 Info.flags = MachineMemOperand::MOLoad;
1671 Info.order = parseAtomicOrderingCABIArg(CI, 1);
1672 Info.ssid = parseSyncscopeMDArg(CI, 2);
1673 Infos.push_back(Info);
1674 return;
1675 }
// Cooperative atomic loads: ordering from argument 1, sync scope from
// argument 2. The flag assignment is on a missing line.
1676 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
1677 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
1678 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
1679 Info.opc = ISD::INTRINSIC_W_CHAIN;
1680 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1681 Info.ptrVal = CI.getOperand(0);
1682 Info.align.reset();
1684 Info.order = parseAtomicOrderingCABIArg(CI, 1);
1685 Info.ssid = parseSyncscopeMDArg(CI, 2);
1686 Infos.push_back(Info);
1687 return;
1688 }
// Cooperative atomic stores: ordering from argument 2, sync scope from
// argument 3. The flag assignment is on a missing line.
1689 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
1690 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
1691 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
1692 Info.opc = ISD::INTRINSIC_VOID;
1693 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1694 Info.ptrVal = CI.getArgOperand(0);
1695 Info.align.reset();
1697 Info.order = parseAtomicOrderingCABIArg(CI, 2);
1698 Info.ssid = parseSyncscopeMDArg(CI, 3);
1699 Infos.push_back(Info);
1700 return;
1701 }
1702 case Intrinsic::amdgcn_ds_gws_init:
1703 case Intrinsic::amdgcn_ds_gws_barrier:
1704 case Intrinsic::amdgcn_ds_gws_sema_v:
1705 case Intrinsic::amdgcn_ds_gws_sema_br:
1706 case Intrinsic::amdgcn_ds_gws_sema_p:
1707 case Intrinsic::amdgcn_ds_gws_sema_release_all: {
1708 Info.opc = ISD::INTRINSIC_VOID;
1709
1710 const GCNTargetMachine &TM =
1711 static_cast<const GCNTargetMachine &>(getTargetMachine());
1712
// MFI is declared on a line missing from this listing.
1714 Info.ptrVal = MFI->getGWSPSV(TM);
1715
1716 // This is an abstract access, but we need to specify a type and size.
1717 Info.memVT = MVT::i32;
1718 Info.size = 4;
1719 Info.align = Align(4);
1720
// Only the barrier is described as a load; the other GWS operations are
// described as stores.
1721 if (IntrID == Intrinsic::amdgcn_ds_gws_barrier)
1722 Info.flags = Flags | MachineMemOperand::MOLoad;
1723 else
1724 Info.flags = Flags | MachineMemOperand::MOStore;
1725 Infos.push_back(Info);
1726 return;
1727 }
// Async loads to LDS produce two records that share memVT and offset.
1728 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1729 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1730 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1731 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1732 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1733 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1734 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1735 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
1736 // Entry 0: Load from source (global/flat).
1737 Info.opc = ISD::INTRINSIC_VOID;
1738 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1739 Info.ptrVal = CI.getArgOperand(0); // Global pointer
1740 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1741 Info.flags = Flags | MachineMemOperand::MOLoad;
1742 Infos.push_back(Info);
1743
1744 // Entry 1: Store to LDS (same offset).
1745 Info.flags = Flags | MachineMemOperand::MOStore;
1746 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1747 Infos.push_back(Info);
1748 return;
1749 }
// Async stores from LDS mirror the case above with the pointer roles swapped.
1750 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1751 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1752 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1753 case Intrinsic::amdgcn_global_store_async_from_lds_b128: {
1754 // Entry 0: Load from LDS.
1755 Info.opc = ISD::INTRINSIC_VOID;
1756 Info.memVT = EVT::getIntegerVT(CI.getContext(), getIntrMemWidth(IntrID));
1757 Info.ptrVal = CI.getArgOperand(1); // LDS pointer
1758 Info.offset = cast<ConstantInt>(CI.getArgOperand(2))->getSExtValue();
1759 Info.flags = Flags | MachineMemOperand::MOLoad;
1760 Infos.push_back(Info);
1761
1762 // Entry 1: Store to global (same offset).
1763 Info.flags = Flags | MachineMemOperand::MOStore;
1764 Info.ptrVal = CI.getArgOperand(0); // Global pointer
1765 Infos.push_back(Info);
1766 return;
1767 }
1768 case Intrinsic::amdgcn_av_load_b128:
1769 case Intrinsic::amdgcn_av_store_b128: {
1770 bool IsStore = IntrID == Intrinsic::amdgcn_av_store_b128;
1771 Info.opc = IsStore ? ISD::INTRINSIC_VOID : ISD::INTRINSIC_W_CHAIN;
1772 Info.memVT = MVT::v4i32;
1773 Info.ptrVal = CI.getArgOperand(0);
1774 Info.align = Align(16);
// The right-hand side of this flag update is on a missing line.
1775 Info.flags |=
1777 // Pretend to be atomic so that SIMemoryLegalizer::expandStore sets cache
1778 // flags appropriately.
1779 Info.order = AtomicOrdering::Monotonic;
1780
// The sync scope is read from the last call argument, decoded the same way
// parseSyncscopeMDArg does.
1781 LLVMContext &Ctx = CI.getContext();
1782 unsigned ScopeIdx = CI.arg_size() - 1;
1783 MDNode *ScopeMD = cast<MDNode>(
1784 cast<MetadataAsValue>(CI.getArgOperand(ScopeIdx))->getMetadata());
1785 StringRef Scope = cast<MDString>(ScopeMD->getOperand(0))->getString();
1786 Info.ssid = Ctx.getOrInsertSyncScopeID(Scope);
1787 Infos.push_back(Info);
1788 return;
1789 }
1790 case Intrinsic::amdgcn_load_to_lds:
1791 case Intrinsic::amdgcn_load_async_to_lds:
1792 case Intrinsic::amdgcn_global_load_lds:
1793 case Intrinsic::amdgcn_global_load_async_lds: {
// Argument 2 is multiplied by 8 below to form an integer bit width.
1794 unsigned Width = cast<ConstantInt>(CI.getArgOperand(2))->getZExtValue();
1795 auto *Aux = cast<ConstantInt>(CI.getArgOperand(CI.arg_size() - 1));
1796 bool IsVolatile = Aux->getZExtValue() & AMDGPU::CPol::VOLATILE;
// The statement run for volatile accesses is on a missing line.
1797 if (IsVolatile)
1799
1800 // Entry 0: Load from source (global/flat).
1801 Info.opc = ISD::INTRINSIC_VOID;
1802 Info.memVT = EVT::getIntegerVT(CI.getContext(), Width * 8);
1803 Info.ptrVal = CI.getArgOperand(0); // Source pointer
1804 Info.offset = cast<ConstantInt>(CI.getArgOperand(3))->getSExtValue();
1805 Info.flags = Flags | MachineMemOperand::MOLoad;
1806 Infos.push_back(Info);
1807
1808 // Entry 1: Store to LDS.
1809 // Same offset from the instruction, but an additional per-lane offset is
1810 // added. Represent that using a wider memory type.
1811 Info.memVT = EVT::getIntegerVT(CI.getContext(),
1812 Width * 8 * Subtarget->getWavefrontSize());
1813 Info.ptrVal = CI.getArgOperand(1); // LDS destination pointer
1814 Info.flags = Flags | MachineMemOperand::MOStore;
1815 Infos.push_back(Info);
1816 return;
1817 }
1818 case Intrinsic::amdgcn_ds_bvh_stack_rtn:
1819 case Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn:
1820 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn:
1821 case Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn: {
1822 Info.opc = ISD::INTRINSIC_W_CHAIN;
1823
1824 const GCNTargetMachine &TM =
1825 static_cast<const GCNTargetMachine &>(getTargetMachine());
1826
// NOTE(review): the BVH stack access is attached to the same GWS pseudo source
// value as the ds_gws_* intrinsics — confirm this sharing is intended. MFI is
// declared on a line missing from this listing.
1828 Info.ptrVal = MFI->getGWSPSV(TM);
1829
1830 // This is an abstract access, but we need to specify a type and size.
1831 Info.memVT = MVT::i32;
1832 Info.size = 4;
1833 Info.align = Align(4);
1834
// The flag assignment is on a line missing from this listing.
1836 Infos.push_back(Info);
1837 return;
1838 }
// Prefetches are described as one-byte loads through argument 0.
1839 case Intrinsic::amdgcn_s_prefetch_data:
1840 case Intrinsic::amdgcn_s_prefetch_inst:
1841 case Intrinsic::amdgcn_flat_prefetch:
1842 case Intrinsic::amdgcn_global_prefetch: {
1843 Info.opc = ISD::INTRINSIC_VOID;
1844 Info.memVT = EVT::getIntegerVT(CI.getContext(), 8);
1845 Info.ptrVal = CI.getArgOperand(0);
1846 Info.flags = Flags | MachineMemOperand::MOLoad;
1847 Infos.push_back(Info);
1848 return;
1849 }
// Any other intrinsic gets no memory description.
1850 default:
1851 return;
1852 }
1853}
1854
// Appends extra target-specific operands to Ops for intrinsic call I. Only
// amdgcn_addrspacecast_nonnull is handled; every other intrinsic leaves Ops
// unchanged.
// NOTE(review): the declaration line (1855) and the switch head (1857) are
// missing from this listing.
1856 const CallInst &I, SmallVectorImpl<SDValue> &Ops, SelectionDAG &DAG) const {
1858 case Intrinsic::amdgcn_addrspacecast_nonnull: {
1859 // The DAG's ValueType loses the addrspaces.
1860 // Add them as 2 extra Constant operands "from" and "to".
// Source address space comes from operand 0's type, destination from the
// call's result type; both are pushed as i32 target constants.
1861 unsigned SrcAS = I.getOperand(0)->getType()->getPointerAddressSpace();
1862 unsigned DstAS = I.getType()->getPointerAddressSpace();
1863 Ops.push_back(DAG.getTargetConstant(SrcAS, SDLoc(), MVT::i32));
1864 Ops.push_back(DAG.getTargetConstant(DstAS, SDLoc(), MVT::i32));
1865 break;
1866 }
1867 default:
1868 break;
1869 }
1870}
1871
// Identifies the pointer argument of memory intrinsic II. On success the
// pointer is appended to Ops, AccessTy is set to the call's result type and
// true is returned; intrinsics not listed below return false without touching
// either output.
// NOTE(review): the first two declaration lines (embedded numbers 1872-1873)
// are missing from this listing.
1874 Type *&AccessTy) const {
1875 Value *Ptr = nullptr;
1876 switch (II->getIntrinsicID()) {
// Intrinsics for which argument 0 is selected as the pointer.
1877 case Intrinsic::amdgcn_cluster_load_b128:
1878 case Intrinsic::amdgcn_cluster_load_b64:
1879 case Intrinsic::amdgcn_cluster_load_b32:
1880 case Intrinsic::amdgcn_ds_append:
1881 case Intrinsic::amdgcn_ds_consume:
1882 case Intrinsic::amdgcn_ds_load_tr8_b64:
1883 case Intrinsic::amdgcn_ds_load_tr16_b128:
1884 case Intrinsic::amdgcn_ds_load_tr4_b64:
1885 case Intrinsic::amdgcn_ds_load_tr6_b96:
1886 case Intrinsic::amdgcn_ds_read_tr4_b64:
1887 case Intrinsic::amdgcn_ds_read_tr6_b96:
1888 case Intrinsic::amdgcn_ds_read_tr8_b64:
1889 case Intrinsic::amdgcn_ds_read_tr16_b64:
1890 case Intrinsic::amdgcn_ds_ordered_add:
1891 case Intrinsic::amdgcn_ds_ordered_swap:
1892 case Intrinsic::amdgcn_ds_atomic_async_barrier_arrive_b64:
1893 case Intrinsic::amdgcn_ds_atomic_barrier_arrive_rtn_b64:
1894 case Intrinsic::amdgcn_flat_atomic_fmax_num:
1895 case Intrinsic::amdgcn_flat_atomic_fmin_num:
1896 case Intrinsic::amdgcn_global_atomic_fmax_num:
1897 case Intrinsic::amdgcn_global_atomic_fmin_num:
1898 case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
1899 case Intrinsic::amdgcn_global_load_tr_b64:
1900 case Intrinsic::amdgcn_global_load_tr_b128:
1901 case Intrinsic::amdgcn_global_load_tr4_b64:
1902 case Intrinsic::amdgcn_global_load_tr6_b96:
1903 case Intrinsic::amdgcn_global_store_async_from_lds_b8:
1904 case Intrinsic::amdgcn_global_store_async_from_lds_b32:
1905 case Intrinsic::amdgcn_global_store_async_from_lds_b64:
1906 case Intrinsic::amdgcn_global_store_async_from_lds_b128:
1907 case Intrinsic::amdgcn_av_load_b128:
1908 case Intrinsic::amdgcn_av_store_b128:
1909 Ptr = II->getArgOperand(0);
1910 break;
// Load-to-LDS intrinsics: argument 1 is selected as the pointer.
1911 case Intrinsic::amdgcn_load_to_lds:
1912 case Intrinsic::amdgcn_load_async_to_lds:
1913 case Intrinsic::amdgcn_global_load_lds:
1914 case Intrinsic::amdgcn_global_load_async_lds:
1915 case Intrinsic::amdgcn_global_load_async_to_lds_b8:
1916 case Intrinsic::amdgcn_global_load_async_to_lds_b32:
1917 case Intrinsic::amdgcn_global_load_async_to_lds_b64:
1918 case Intrinsic::amdgcn_global_load_async_to_lds_b128:
1919 case Intrinsic::amdgcn_cluster_load_async_to_lds_b8:
1920 case Intrinsic::amdgcn_cluster_load_async_to_lds_b32:
1921 case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
1922 case Intrinsic::amdgcn_cluster_load_async_to_lds_b128:
1923 Ptr = II->getArgOperand(1);
1924 break;
1925 default:
1926 return false;
1927 }
1928 AccessTy = II->getType();
1929 Ops.push_back(Ptr);
1930 return true;
1931}
1932
// Decides whether addressing mode AM is usable with a FLAT-style instruction
// in address space AddrSpace. A scaled index is never accepted; an immediate
// offset is accepted only when the subtarget has flat instruction offsets and
// the instruction info reports it as legal.
// NOTE(review): the declaration line (1933) and the line with embedded number
// 1941 are missing from this listing.
1934 unsigned AddrSpace) const {
1935 if (!Subtarget->hasFlatInstOffsets()) {
1936 // Flat instructions do not have offsets, and only have the register
1937 // address.
1938 return AM.BaseOffs == 0 && AM.Scale == 0;
1939 }
1940
// Choose the FLAT variant from the address space: global, private (scratch),
// or plain flat for everything else.
1942 FlatAddrSpace FlatVariant =
1943 AddrSpace == AMDGPUAS::GLOBAL_ADDRESS ? FlatAddrSpace::FlatGlobal
1944 : AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ? FlatAddrSpace::FlatScratch
1945 : FlatAddrSpace::FLAT;
1946
1947 return AM.Scale == 0 &&
1948 (AM.BaseOffs == 0 || Subtarget->getInstrInfo()->isLegalFLATOffset(
1949 AM.BaseOffs, AddrSpace, FlatVariant));
1950}
1951
// Decides whether addressing mode AM is legal for a global memory access.
// NOTE(review): the declaration line (1952) and the return statements of the
// first two branches (1954, 1966) are missing from this listing; only the
// MUBUF fallback at the end is visible.
// Subtargets with flat-global instructions take the first (missing) return.
1953 if (Subtarget->hasFlatGlobalInsts())
1955
1956 if (!Subtarget->hasAddr64() || Subtarget->useFlatForGlobal()) {
1957 // Assume that we will use FLAT for all global memory accesses
1958 // on VI.
1959 // FIXME: This assumption is currently wrong. On VI we still use
1960 // MUBUF instructions for the r + i addressing mode. As currently
1961 // implemented, the MUBUF instructions only work on buffer < 4GB.
1962 // It may be possible to support > 4GB buffers with MUBUF instructions,
1963 // by setting the stride value in the resource descriptor which would
1964 // increase the size limit to (stride * 4GB). However, this is risky,
1965 // because it has never been validated.
1967 }
1968
// Otherwise global accesses are judged by the MUBUF rules.
1969 return isLegalMUBUFAddressingMode(AM);
1970}
1971
1972bool SITargetLowering::isLegalMUBUFAddressingMode(const AddrMode &AM) const {
1973 // MUBUF / MTBUF instructions have a 12-bit unsigned byte offset, and
1974 // additionally can do r + r + i with addr64. 32-bit has more addressing
1975 // mode options. Depending on the resource constant, it can also do
1976 // (i64 r0) + (i32 r1) * (i14 i).
1977 //
1978 // Private arrays end up using a scratch buffer most of the time, so also
1979 // assume those use MUBUF instructions. Scratch loads / stores are currently
1980 // implemented as mubuf instructions with offen bit set, so slightly
1981 // different than the normal addr64.
1982 const SIInstrInfo *TII = Subtarget->getInstrInfo();
1983 if (!TII->isLegalMUBUFImmOffset(AM.BaseOffs))
1984 return false;
1985
1986 // FIXME: Since we can split immediate into soffset and immediate offset,
1987 // would it make sense to allow any immediate?
1988
1989 switch (AM.Scale) {
1990 case 0: // r + i or just i, depending on HasBaseReg.
1991 return true;
1992 case 1:
1993 return true; // We have r + r or r + i.
1994 case 2:
1995 if (AM.HasBaseReg) {
1996 // Reject 2 * r + r.
1997 return false;
1998 }
1999
2000 // Allow 2 * r as r + r
2001 // Or 2 * r + i is allowed as r + r + i.
2002 return true;
2003 default: // Don't allow n * r
2004 return false;
2005 }
2006}
2007
// Decides whether addressing mode AM is legal for an access of type Ty in
// address space AS, dispatching on the address space to the rules below.
// NOTE(review): the declaration line (2008) and several interior lines
// (2020-2022, 2063, 2083, 2104, 2110) are missing from this listing, so some
// conditions and one return are only partly visible.
2009 const AddrMode &AM, Type *Ty,
2010 unsigned AS,
2011 Instruction *I) const {
2012 // No global is ever allowed as a base.
2013 if (AM.BaseGV)
2014 return false;
2015
2016 if (AS == AMDGPUAS::GLOBAL_ADDRESS)
2017 return isLegalGlobalAddressingMode(AM);
2018
// Constant address space, plus further address spaces named on missing lines.
2019 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
2023 // If the offset isn't a multiple of 4, it probably isn't going to be
2024 // correctly aligned.
2025 // FIXME: Can we get the real alignment here?
2026 if (AM.BaseOffs % 4 != 0)
2027 return isLegalMUBUFAddressingMode(AM);
2028
2029 if (!Subtarget->hasScalarSubwordLoads()) {
2030 // There are no SMRD extloads, so if we have to do a small type access we
2031 // will use a MUBUF load.
2032 // FIXME?: We also need to do this if unaligned, but we don't know the
2033 // alignment here.
2034 if (Ty->isSized() && DL.getTypeStoreSize(Ty) < 4)
2035 return isLegalGlobalAddressingMode(AM);
2036 }
2037
// The encodable immediate range depends on the hardware generation.
2038 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS) {
2039 // SMRD instructions have an 8-bit, dword offset on SI.
2040 if (!isUInt<8>(AM.BaseOffs / 4))
2041 return false;
2042 } else if (Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS) {
2043 // On CI+, this can also be a 32-bit literal constant offset. If it fits
2044 // in 8-bits, it can use a smaller encoding.
2045 if (!isUInt<32>(AM.BaseOffs / 4))
2046 return false;
2047 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX9) {
2048 // On VI, these use the SMEM format and the offset is 20-bit in bytes.
2049 if (!isUInt<20>(AM.BaseOffs))
2050 return false;
2051 } else if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX12) {
2052 // On GFX9 the offset is signed 21-bit in bytes (but must not be negative
2053 // for S_BUFFER_* instructions).
2054 if (!isInt<21>(AM.BaseOffs))
2055 return false;
2056 } else {
2057 // On GFX12, all offsets are signed 24-bit in bytes.
2058 if (!isInt<24>(AM.BaseOffs))
2059 return false;
2060 }
2061
// Part of this address-space condition is on a missing line.
2062 if ((AS == AMDGPUAS::CONSTANT_ADDRESS ||
2064 AM.BaseOffs < 0) {
2065 // Scalar (non-buffer) loads can only use a negative offset if
2066 // soffset+offset is non-negative. Since the compiler can only prove that
2067 // in a few special cases, it is safer to claim that negative offsets are
2068 // not supported.
2069 return false;
2070 }
2071
2072 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2073 return true;
2074
2075 if (AM.Scale == 1 && AM.HasBaseReg)
2076 return true;
2077
2078 return false;
2079 }
2080
// Private accesses: the branch taken when flat scratch is enabled is on a
// missing line; otherwise the MUBUF rules apply.
2081 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
2082 return Subtarget->hasFlatScratchEnabled()
2084 : isLegalMUBUFAddressingMode(AM);
2085
2086 if (AS == AMDGPUAS::LOCAL_ADDRESS ||
2087 (AS == AMDGPUAS::REGION_ADDRESS && Subtarget->hasGDS())) {
2088 // Basic, single offset DS instructions allow a 16-bit unsigned immediate
2089 // field.
2090 // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
2091 // an 8-bit dword offset but we don't know the alignment here.
2092 if (!isUInt<16>(AM.BaseOffs))
2093 return false;
2094
2095 if (AM.Scale == 0) // r + i or just i, depending on HasBaseReg.
2096 return true;
2097
2098 if (AM.Scale == 1 && AM.HasBaseReg)
2099 return true;
2100
2101 return false;
2102 }
2103
// The condition opening this block (2104) and its return (2110) are missing
// from this listing.
2105 // For an unknown address space, this usually means that this is for some
2106 // reason being used for pure arithmetic, and not based on some addressing
2107 // computation. We don't have instructions that compute pointers with any
2108 // addressing modes, so treat them as having no offset like flat
2109 // instructions.
2111 }
2112
2113 // Assume a user alias of global for unknown address spaces.
2114 return isLegalGlobalAddressingMode(AM);
2115}
2116
// Bounds the size of a merged store of type MemVT by address space.
// NOTE(review): the declaration line (2117) and the conditions guarding the
// first and third returns (2119, 2125) are missing from this listing, so the
// address spaces those two limits apply to cannot be stated from here.
2118 const MachineFunction &MF) const {
// First limit: at most 4 * 32 = 128 bits.
2120 return (MemVT.getSizeInBits() <= 4 * 32);
// Private accesses are limited by the subtarget's maximum private element
// size, converted from bytes to bits.
2121 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
2122 unsigned MaxPrivateBits = 8 * getSubtarget()->getMaxPrivateElementSize();
2123 return (MemVT.getSizeInBits() <= MaxPrivateBits);
2124 }
// Third limit: at most 2 * 32 = 64 bits.
2126 return (MemVT.getSizeInBits() <= 2 * 32);
// No limit otherwise.
2127 return true;
2128}
2129
2131 unsigned Size, unsigned AddrSpace, Align Alignment,
2132 MachineMemOperand::Flags Flags, unsigned *IsFast) const {
 // Decides whether a memory access of Size bits in AddrSpace is permitted
 // when it is only known to be aligned to Alignment. When IsFast is non-null
 // it receives a "speed rank" (see the long comment in the 64-bit LDS case);
 // it is reset to 0 up front so every early `return false` leaves it at 0.
 // Flags is not consulted anywhere in this body.
2133 if (IsFast)
2134 *IsFast = 0;
2135
2136 if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
2137 AddrSpace == AMDGPUAS::REGION_ADDRESS) {
2138 // Check if alignment requirements for ds_read/write instructions are
2139 // disabled.
2140 if (!Subtarget->hasUnalignedDSAccessEnabled() && Alignment < Align(4))
2141 return false;
2142
 // Size is in bits, so divideCeil(Size, 8) is the access size in bytes.
2143 Align RequiredAlignment(
2144 PowerOf2Ceil(divideCeil(Size, 8))); // Natural alignment.
2145 if (Subtarget->hasLDSMisalignedBugInWGPMode() && Size > 32 &&
2146 Alignment < RequiredAlignment)
2147 return false;
2148
2149 // Either, the alignment requirements are "enabled", or there is an
2150 // unaligned LDS access related hardware bug though alignment requirements
2151 // are "disabled". In either case, we need to check for proper alignment
2152 // requirements.
2153 //
2154 switch (Size) {
2155 case 64:
2156 // SI has a hardware bug in the LDS / GDS bounds checking: if the base
2157 // address is negative, then the instruction is incorrectly treated as
2158 // out-of-bounds even if base + offsets is in bounds. Split vectorized
2159 // loads here to avoid emitting ds_read2_b32. We may re-combine the
2160 // load later in the SILoadStoreOptimizer.
2161 if (!Subtarget->hasUsableDSOffset() && Alignment < Align(8))
2162 return false;
2163
2164 // 8 byte accessing via ds_read/write_b64 require 8-byte alignment, but we
2165 // can do a 4 byte aligned, 8 byte access in a single operation using
2166 // ds_read2/write2_b32 with adjacent offsets.
2167 RequiredAlignment = Align(4);
2168
2169 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2170 // We will either select ds_read_b64/ds_write_b64 or ds_read2_b32/
2171 // ds_write2_b32 depending on the alignment. In either case with either
2172 // alignment there is no faster way of doing this.
2173
2174 // The numbers returned here and below are not additive, it is a 'speed
2175 // rank'. They are just meant to be compared to decide if a certain way
2176 // of lowering an operation is faster than another. For that purpose
2177 // naturally aligned operation gets its bitsize to indicate that "it
2178 // operates with a speed comparable to N-bit wide load". With the full
2179 // alignment ds128 is slower than ds96 for example. If underaligned it
2180 // is comparable to a speed of a single dword access, which would then
2181 // mean 32 < 128 and it is faster to issue a wide load regardless.
2182 // 1 is simply "slow, don't do it". I.e. comparing an aligned load to a
2183 // wider load which will not be aligned anymore the latter is slower.
 // NOTE(review): RequiredAlignment was just set to Align(4), so whenever
 // the first test fails, `Alignment < Align(4)` necessarily holds and the
 // trailing `1` arm cannot be selected in this case; the result is
 // effectively 64 or 32.
2184 if (IsFast)
2185 *IsFast = (Alignment >= RequiredAlignment) ? 64
2186 : (Alignment < Align(4)) ? 32
2187 : 1;
2188 return true;
2189 }
2190
2191 break;
2192 case 96:
2193 if (!Subtarget->hasDS96AndDS128())
2194 return false;
2195
2196 // 12 byte accessing via ds_read/write_b96 require 16-byte alignment on
2197 // gfx8 and older.
2198
2199 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2200 // Naturally aligned access is fastest. However, also report it is Fast
2201 // if memory is aligned less than DWORD. A narrow load or store will be
2202 // equally slow as a single ds_read_b96/ds_write_b96, but there will
2203 // be more of them, so overall we will pay less penalty issuing a single
2204 // instruction.
2205
2206 // See comment on the values above.
 // RequiredAlignment is still the natural alignment here (16 bytes for a
 // 96-bit access), so alignments of 4 or 8 bytes get the "slow" rank 1.
2207 if (IsFast)
2208 *IsFast = (Alignment >= RequiredAlignment) ? 96
2209 : (Alignment < Align(4)) ? 32
2210 : 1;
2211 return true;
2212 }
2213
2214 break;
2215 case 128:
2216 if (!Subtarget->hasDS96AndDS128() || !Subtarget->useDS128())
2217 return false;
2218
2219 // 16 byte accessing via ds_read/write_b128 require 16-byte alignment on
2220 // gfx8 and older, but we can do a 8 byte aligned, 16 byte access in a
2221 // single operation using ds_read2/write2_b64.
2222 RequiredAlignment = Align(8);
2223
2224 if (Subtarget->hasUnalignedDSAccessEnabled()) {
2225 // Naturally aligned access is fastest. However, also report it is Fast
2226 // if memory is aligned less than DWORD. A narrow load or store will be
2227 // equally slow as a single ds_read_b128/ds_write_b128, but there
2228 // will be more of them, so overall we will pay less penalty issuing a
2229 // single instruction.
2230
2231 // See comment on the values above.
2232 if (IsFast)
2233 *IsFast = (Alignment >= RequiredAlignment) ? 128
2234 : (Alignment < Align(4)) ? 32
2235 : 1;
2236 return true;
2237 }
2238
2239 break;
2240 default:
 // Any other size wider than a dword is rejected for LDS/region accesses.
2241 if (Size > 32)
2242 return false;
2243
2244 break;
2245 }
2246
2247 // See comment on the values above.
2248 // Note that we have a single-dword or sub-dword here, so if underaligned
2249 // it is a slowest possible access, hence returned value is 0.
 // NOTE(review): the 64/96/128-bit cases also reach this point when
 // unaligned DS access is not enabled, so "single-dword or sub-dword" does
 // not describe every path that gets here.
2250 if (IsFast)
2251 *IsFast = (Alignment >= RequiredAlignment) ? Size : 0;
2252
2253 return Alignment >= RequiredAlignment ||
2254 Subtarget->hasUnalignedDSAccessEnabled();
2255 }
2256
2257 // FIXME: We have to be conservative here and assume that flat operations
2258 // will access scratch. If we had access to the IR function, then we
2259 // could determine if any private memory was used in the function.
2260 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
2261 AddrSpace == AMDGPUAS::FLAT_ADDRESS) {
2262 bool AlignedBy4 = Alignment >= Align(4);
2263 if (Subtarget->hasUnalignedScratchAccessEnabled()) {
2264 if (IsFast)
2265 *IsFast = AlignedBy4 ? Size : 1;
2266 return true;
2267 }
2268
 // The bool is stored into the unsigned rank, i.e. 1 when dword-aligned
 // and 0 otherwise.
2269 if (IsFast)
2270 *IsFast = AlignedBy4;
2271
2272 return AlignedBy4;
2273 }
2274
2275 // So long as they are correct, wide global memory operations perform better
2276 // than multiple smaller memory ops -- even when misaligned
 // The rank is set to Size before legality is decided, so it is reported
 // even when this branch returns false.
2277 if (AMDGPU::isExtendedGlobalAddrSpace(AddrSpace)) {
2278 if (IsFast)
2279 *IsFast = Size;
2280
2281 return Alignment >= Align(4) ||
2282 Subtarget->hasUnalignedBufferAccessEnabled();
2283 }
2284
2285 // Ensure robust out-of-bounds guarantees for buffer accesses are met when the
2286 // "amdgpu.buffer.oob.mode" module flag has not enabled relaxed untyped-buffer
2287 // OOB semantics. Normally hardware will ensure proper
2288 // out-of-bounds behavior, but in the edge case where an access starts
2289 // out-of-bounds and then enters in-bounds, the entire access would be treated
2290 // as out-of-bounds. Prevent misaligned memory accesses by requiring the
2291 // natural alignment of buffer accesses.
 // A buffer access that passes this check is not accepted yet; it still
 // falls through to the generic dword checks below.
2292 if (AddrSpace == AMDGPUAS::BUFFER_FAT_POINTER ||
2293 AddrSpace == AMDGPUAS::BUFFER_RESOURCE ||
2294 AddrSpace == AMDGPUAS::BUFFER_STRIDED_POINTER) {
2295 if (!Subtarget->hasRelaxedBufferOOBMode() &&
2296 Alignment < Align(PowerOf2Ceil(divideCeil(Size, 8))))
2297 return false;
2298 }
2299
2300 // Smaller than dword value must be aligned.
2301 if (Size < 32)
2302 return false;
2303
2304 // 8.1.6 - For Dword or larger reads or writes, the two LSBs of the
2305 // byte-address are ignored, thus forcing Dword alignment.
2306 // This applies to private, global, and constant memory.
2307 if (IsFast)
2308 *IsFast = 1;
2309
 // NOTE(review): `Size >= 32` is always true here because of the early
 // return for Size < 32 above; only the alignment test decides the result.
2310 return Size >= 32 && Alignment >= Align(4);
2311}
2312
2314 EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
2315 unsigned *IsFast) const {
2317 Alignment, Flags, IsFast);
2318}
2319
2321 LLVMContext &Context, const MemOp &Op,
2322 const AttributeList &FuncAttributes) const {
 // Picks the value type to use when expanding the memory operation Op:
 // v4i32 for operations of at least 16 bytes, v2i32 for at least 8 bytes,
 // both only when the destination is known to be 4-byte aligned; otherwise
 // MVT::Other, which leaves the choice to the caller's default. Context and
 // FuncAttributes are not consulted in this body.
2323 // FIXME: Should account for address space here.
2324
2325 // The default fallback uses the private pointer size as a guess for a type to
2326 // use. Make sure we switch these to 64-bit accesses.
2327
2328 if (Op.size() >= 16 &&
2329 Op.isDstAligned(Align(4))) // XXX: Should only do for global
2330 return MVT::v4i32;
2331
2332 if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
2333 return MVT::v2i32;
2334
2335 // Use the default.
2336 return MVT::Other;
2337}
2338
 // Reports whether the memory operand attached to N carries the MONoClobber
 // flag. N is converted with cast<MemSDNode>, so callers are expected to pass
 // a memory node.
2340 const MemSDNode *MemNode = cast<MemSDNode>(N);
 // The masked flag value is converted to the function's return type.
2341 return MemNode->getMemOperand()->getFlags() & MONoClobber;
2342}
2343
2348
2350 unsigned DestAS) const {
 // Reports whether a cast from address space SrcAS to DestAS is free.
 // Casts out of the flat address space are handled here; every other pair is
 // answered by the target machine's isNoopAddrSpaceCast.
2351 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
2352 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
2353 Subtarget->hasGloballyAddressableScratch()) {
2354 // Flat -> private requires subtracting src_flat_scratch_base_lo.
2355 return false;
2356 }
2357
2358 // Flat -> private/local is a simple truncate.
2359 // Flat -> global is no-op
 // Note that every remaining destination is reported as free here, not
 // only the private/local/global cases named above.
2360 return true;
2361 }
2362
2363 const GCNTargetMachine &TM =
2364 static_cast<const GCNTargetMachine &>(getTargetMachine());
2365 return TM.isNoopAddrSpaceCast(SrcAS, DestAS);
2366}
2367
2375
2377 Type *Ty) const {
 // Unconditionally answers true; neither the constant nor Ty is inspected.
2378 // FIXME: Could be smarter if called for vector constants.
2379 return true;
2380}
2381
2383 unsigned Index) const {
2385 return false;
2386
2387 // TODO: Add more cases that are cheap.
2388 return Index == 0;
2389}
2390
2391bool SITargetLowering::isExtractVecEltCheap(EVT VT, unsigned Index) const {
2392 // TODO: This should be more aggressive, particular for 16-bit element
2393 // vectors. However there are some mixed improvements and regressions.
2394 EVT EltTy = VT.getVectorElementType();
2395 unsigned MinAlign = Subtarget->useRealTrue16Insts() ? 16 : 32;
2396 return EltTy.getSizeInBits() % MinAlign == 0;
2397}
2398
2400 if (Subtarget->has16BitInsts() && VT == MVT::i16) {
2401 switch (Op) {
2402 case ISD::LOAD:
2403 case ISD::STORE:
2404 return true;
2405 default:
2406 return false;
2407 }
2408 }
2409
2410 // SimplifySetCC uses this function to determine whether or not it should
2411 // create setcc with i1 operands. We don't have instructions for i1 setcc.
2412 if (VT == MVT::i1 && Op == ISD::SETCC)
2413 return false;
2414
2416}
2417
2420 // This isn't really a constant pool but close enough.
2423 return PtrInfo;
2424}
2425
2426SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
2427 const SDLoc &SL,
2428 SDValue Chain,
2429 uint64_t Offset) const {
2430 const DataLayout &DL = DAG.getDataLayout();
2434
2435 auto [InputPtrReg, RC, ArgTy] =
2436 Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
2437
2438 // We may not have the kernarg segment argument if we have no kernel
2439 // arguments.
2440 if (!InputPtrReg)
2441 return DAG.getConstant(Offset, SL, PtrVT);
2442
2444 SDValue BasePtr = DAG.getCopyFromReg(
2445 Chain, SL, MRI.getLiveInVirtReg(InputPtrReg->getRegister()), PtrVT);
2446
2447 return DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Offset));
2448}
2449
2450SDValue SITargetLowering::getImplicitArgPtr(SelectionDAG &DAG,
2451 const SDLoc &SL) const {
2454 return lowerKernArgParameterPtr(DAG, SL, DAG.getEntryNode(), Offset);
2455}
2456
2457SDValue SITargetLowering::getLDSKernelId(SelectionDAG &DAG,
2458 const SDLoc &SL) const {
2459
2461 std::optional<uint32_t> KnownSize =
2463 if (KnownSize.has_value())
2464 return DAG.getConstant(*KnownSize, SL, MVT::i32);
2465 return SDValue();
2466}
2467
2468SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
2469 const SDLoc &SL, SDValue Val,
2470 bool Signed,
2471 const ISD::InputArg *Arg) const {
2472 // First, if it is a widened vector, narrow it.
2473 if (VT.isVector() &&
2475 EVT NarrowedVT =
2478 Val = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, NarrowedVT, Val,
2479 DAG.getConstant(0, SL, MVT::i32));
2480 }
2481
2482 // Then convert the vector elements or scalar value.
2483 if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) && VT.bitsLT(MemVT)) {
2484 unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
2485 Val = DAG.getNode(Opc, SL, MemVT, Val, DAG.getValueType(VT));
2486 }
2487
2488 if (MemVT.isFloatingPoint()) {
2489 if (VT.isFloatingPoint()) {
2490 Val = getFPExtOrFPRound(DAG, Val, SL, VT);
2491 } else {
2492 assert(!MemVT.isVector());
2493 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
2494 SDValue Cast = DAG.getBitcast(IntVT, Val);
2495 Val = DAG.getAnyExtOrTrunc(Cast, SL, VT);
2496 }
2497 } else if (Signed)
2498 Val = DAG.getSExtOrTrunc(Val, SL, VT);
2499 else
2500 Val = DAG.getZExtOrTrunc(Val, SL, VT);
2501
2502 return Val;
2503}
2504
2505SDValue SITargetLowering::lowerKernargMemParameter(
2506 SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
2507 uint64_t Offset, Align Alignment, bool Signed,
2508 const ISD::InputArg *Arg) const {
2509
2510 MachinePointerInfo PtrInfo =
2512
2513 // Try to avoid using an extload by loading earlier than the argument address,
2514 // and extracting the relevant bits. The load should hopefully be merged with
2515 // the previous argument.
2516 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
2517 // TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
2518 int64_t AlignDownOffset = alignDown(Offset, 4);
2519 int64_t OffsetDiff = Offset - AlignDownOffset;
2520
2521 EVT IntVT = MemVT.changeTypeToInteger();
2522
2523 // TODO: If we passed in the base kernel offset we could have a better
2524 // alignment than 4, but we don't really need it.
2525 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, AlignDownOffset);
2526 SDValue Load = DAG.getLoad(MVT::i32, SL, Chain, Ptr,
2527 PtrInfo.getWithOffset(AlignDownOffset), Align(4),
2530
2531 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, SL, MVT::i32);
2532 SDValue Extract = DAG.getNode(ISD::SRL, SL, MVT::i32, Load, ShiftAmt);
2533
2534 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, SL, IntVT, Extract);
2535 ArgVal = DAG.getNode(ISD::BITCAST, SL, MemVT, ArgVal);
2536 ArgVal = convertArgType(DAG, VT, MemVT, SL, ArgVal, Signed, Arg);
2537
2538 return DAG.getMergeValues({ArgVal, Load.getValue(1)}, SL);
2539 }
2540
2541 SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
2542 SDValue Load = DAG.getLoad(
2543 MemVT, SL, Chain, Ptr, PtrInfo.getWithOffset(Offset), Alignment,
2545
2546 SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
2547 return DAG.getMergeValues({Val, Load.getValue(1)}, SL);
2548}
2549
2550/// Coerce an argument which was passed in a different ABI type to the original
2551/// expected value type.
2552SDValue SITargetLowering::convertABITypeToValueType(SelectionDAG &DAG,
2553 SDValue Val,
2554 CCValAssign &VA,
2555 const SDLoc &SL) const {
2556 EVT ValVT = VA.getValVT();
2557
2558 // If this is an 8 or 16-bit value, it is really passed promoted
2559 // to 32 bits. Insert an assert[sz]ext to capture this, then
2560 // truncate to the right size.
2561 switch (VA.getLocInfo()) {
2562 case CCValAssign::Full:
2563 return Val;
2564 case CCValAssign::BCvt:
2565 return DAG.getNode(ISD::BITCAST, SL, ValVT, Val);
2566 case CCValAssign::SExt:
2567 Val = DAG.getNode(ISD::AssertSext, SL, VA.getLocVT(), Val,
2568 DAG.getValueType(ValVT));
2569 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2570 case CCValAssign::ZExt:
2571 Val = DAG.getNode(ISD::AssertZext, SL, VA.getLocVT(), Val,
2572 DAG.getValueType(ValVT));
2573 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2574 case CCValAssign::AExt:
2575 return DAG.getNode(ISD::TRUNCATE, SL, ValVT, Val);
2576 default:
2577 llvm_unreachable("Unknown loc info!");
2578 }
2579}
2580
2581SDValue SITargetLowering::lowerStackParameter(SelectionDAG &DAG,
2582 CCValAssign &VA, const SDLoc &SL,
2583 SDValue Chain,
2584 const ISD::InputArg &Arg) const {
2585 MachineFunction &MF = DAG.getMachineFunction();
2586 MachineFrameInfo &MFI = MF.getFrameInfo();
2587
2588 if (Arg.Flags.isByVal()) {
2589 unsigned Size = Arg.Flags.getByValSize();
2590 int FrameIdx = MFI.CreateFixedObject(Size, VA.getLocMemOffset(), false);
2591 return DAG.getFrameIndex(FrameIdx, MVT::i32);
2592 }
2593
2594 unsigned ArgOffset = VA.getLocMemOffset();
2595 unsigned ArgSize = VA.getValVT().getStoreSize();
2596
2597 int FI = MFI.CreateFixedObject(ArgSize, ArgOffset, true);
2598
2599 // Create load nodes to retrieve arguments from the stack.
2600 SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
2601
2602 // For NON_EXTLOAD, generic code in getLoad assert(ValVT == MemVT)
2604 MVT MemVT = VA.getValVT();
2605
2606 switch (VA.getLocInfo()) {
2607 default:
2608 break;
2609 case CCValAssign::BCvt:
2610 MemVT = VA.getLocVT();
2611 break;
2612 case CCValAssign::SExt:
2613 ExtType = ISD::SEXTLOAD;
2614 break;
2615 case CCValAssign::ZExt:
2616 ExtType = ISD::ZEXTLOAD;
2617 break;
2618 case CCValAssign::AExt:
2619 ExtType = ISD::EXTLOAD;
2620 break;
2621 }
2622
2623 SDValue ArgValue = DAG.getExtLoad(
2624 ExtType, SL, VA.getLocVT(), Chain, FIN,
2626
2627 SDValue ConvertedVal = convertABITypeToValueType(DAG, ArgValue, VA, SL);
2628 if (ConvertedVal == ArgValue)
2629 return ConvertedVal;
2630
2631 return DAG.getMergeValues({ConvertedVal, ArgValue.getValue(1)}, SL);
2632}
2633
2634SDValue SITargetLowering::lowerWorkGroupId(
2635 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2638 AMDGPUFunctionArgInfo::PreloadedValue ClusterWorkGroupIdPV) const {
2639 if (!Subtarget->hasClusters())
2640 return getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2641
2642 // Clusters are supported. Return the global position in the grid. If clusters
2643 // are enabled, WorkGroupIdPV returns the cluster ID not the workgroup ID.
2644
2645 // WorkGroupIdXYZ = ClusterId == 0 ?
2646 // ClusterIdXYZ :
2647 // ClusterIdXYZ * (ClusterMaxIdXYZ + 1) + ClusterWorkGroupIdXYZ
2648 SDValue ClusterIdXYZ = getPreloadedValue(DAG, MFI, VT, WorkGroupIdPV);
2649 SDLoc SL(ClusterIdXYZ);
2650 SDValue ClusterMaxIdXYZ = getPreloadedValue(DAG, MFI, VT, ClusterMaxIdPV);
2651 SDValue One = DAG.getConstant(1, SL, VT);
2652 SDValue ClusterSizeXYZ = DAG.getNode(ISD::ADD, SL, VT, ClusterMaxIdXYZ, One);
2653 SDValue ClusterWorkGroupIdXYZ =
2654 getPreloadedValue(DAG, MFI, VT, ClusterWorkGroupIdPV);
2655 SDValue GlobalIdXYZ =
2656 DAG.getNode(ISD::ADD, SL, VT, ClusterWorkGroupIdXYZ,
2657 DAG.getNode(ISD::MUL, SL, VT, ClusterIdXYZ, ClusterSizeXYZ));
2658
2659 switch (MFI.getClusterDims().getKind()) {
2662 return GlobalIdXYZ;
2664 return ClusterIdXYZ;
2666 using namespace AMDGPU::Hwreg;
2667 SDValue ClusterIdField =
2668 DAG.getTargetConstant(HwregEncoding::encode(ID_IB_STS2, 6, 4), SL, VT);
2669 SDNode *GetReg =
2670 DAG.getMachineNode(AMDGPU::S_GETREG_B32_const, SL, VT, ClusterIdField);
2671 SDValue ClusterId(GetReg, 0);
2672 SDValue Zero = DAG.getConstant(0, SL, VT);
2673 return DAG.getNode(ISD::SELECT_CC, SL, VT, ClusterId, Zero, ClusterIdXYZ,
2674 GlobalIdXYZ, DAG.getCondCode(ISD::SETEQ));
2675 }
2676 }
2677
2678 llvm_unreachable("nothing should reach here");
2679}
2680
2681SDValue SITargetLowering::getPreloadedValue(
2682 SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT,
2684 const ArgDescriptor *Reg = nullptr;
2685 const TargetRegisterClass *RC;
2686 LLT Ty;
2687
2689 const ArgDescriptor WorkGroupIDX =
2690 ArgDescriptor::createRegister(AMDGPU::TTMP9);
2691 // If GridZ is not programmed in an entry function then the hardware will set
2692 // it to all zeros, so there is no need to mask the GridY value in the low
2693 // order bits.
2694 const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister(
2695 AMDGPU::TTMP7,
2696 AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? ~0u : 0xFFFFu);
2697 const ArgDescriptor WorkGroupIDZ =
2698 ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u);
2699 const ArgDescriptor ClusterWorkGroupIDX =
2700 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000000Fu);
2701 const ArgDescriptor ClusterWorkGroupIDY =
2702 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000000F0u);
2703 const ArgDescriptor ClusterWorkGroupIDZ =
2704 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00000F00u);
2705 const ArgDescriptor ClusterWorkGroupMaxIDX =
2706 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0000F000u);
2707 const ArgDescriptor ClusterWorkGroupMaxIDY =
2708 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x000F0000u);
2709 const ArgDescriptor ClusterWorkGroupMaxIDZ =
2710 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x00F00000u);
2711 const ArgDescriptor ClusterWorkGroupMaxFlatID =
2712 ArgDescriptor::createRegister(AMDGPU::TTMP6, 0x0F000000u);
2713
2714 auto LoadConstant = [&](unsigned N) {
2715 return DAG.getConstant(N, SDLoc(), VT);
2716 };
2717
2718 if (Subtarget->hasArchitectedSGPRs() &&
2720 AMDGPU::ClusterDimsAttr ClusterDims = MFI.getClusterDims();
2721 bool HasFixedDims = ClusterDims.isFixedDims();
2722
2723 switch (PVID) {
2725 Reg = &WorkGroupIDX;
2726 RC = &AMDGPU::SReg_32RegClass;
2727 Ty = LLT::scalar(32);
2728 break;
2730 Reg = &WorkGroupIDY;
2731 RC = &AMDGPU::SReg_32RegClass;
2732 Ty = LLT::scalar(32);
2733 break;
2735 Reg = &WorkGroupIDZ;
2736 RC = &AMDGPU::SReg_32RegClass;
2737 Ty = LLT::scalar(32);
2738 break;
2740 if (HasFixedDims && ClusterDims.getDims()[0] == 1)
2741 return LoadConstant(0);
2742 Reg = &ClusterWorkGroupIDX;
2743 RC = &AMDGPU::SReg_32RegClass;
2744 Ty = LLT::scalar(32);
2745 break;
2747 if (HasFixedDims && ClusterDims.getDims()[1] == 1)
2748 return LoadConstant(0);
2749 Reg = &ClusterWorkGroupIDY;
2750 RC = &AMDGPU::SReg_32RegClass;
2751 Ty = LLT::scalar(32);
2752 break;
2754 if (HasFixedDims && ClusterDims.getDims()[2] == 1)
2755 return LoadConstant(0);
2756 Reg = &ClusterWorkGroupIDZ;
2757 RC = &AMDGPU::SReg_32RegClass;
2758 Ty = LLT::scalar(32);
2759 break;
2761 if (HasFixedDims)
2762 return LoadConstant(ClusterDims.getDims()[0] - 1);
2763 Reg = &ClusterWorkGroupMaxIDX;
2764 RC = &AMDGPU::SReg_32RegClass;
2765 Ty = LLT::scalar(32);
2766 break;
2768 if (HasFixedDims)
2769 return LoadConstant(ClusterDims.getDims()[1] - 1);
2770 Reg = &ClusterWorkGroupMaxIDY;
2771 RC = &AMDGPU::SReg_32RegClass;
2772 Ty = LLT::scalar(32);
2773 break;
2775 if (HasFixedDims)
2776 return LoadConstant(ClusterDims.getDims()[2] - 1);
2777 Reg = &ClusterWorkGroupMaxIDZ;
2778 RC = &AMDGPU::SReg_32RegClass;
2779 Ty = LLT::scalar(32);
2780 break;
2782 Reg = &ClusterWorkGroupMaxFlatID;
2783 RC = &AMDGPU::SReg_32RegClass;
2784 Ty = LLT::scalar(32);
2785 break;
2786 default:
2787 break;
2788 }
2789 }
2790
2791 if (!Reg)
2792 std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
2793 if (!Reg) {
2795 // It's possible for a kernarg intrinsic call to appear in a kernel with
2796 // no allocated segment, in which case we do not add the user sgpr
2797 // argument, so just return null.
2798 return DAG.getConstant(0, SDLoc(), VT);
2799 }
2800
2801 // It's undefined behavior if a function marked with the amdgpu-no-*
2802 // attributes uses the corresponding intrinsic.
2803 return DAG.getPOISON(VT);
2804 }
2805
2806 return loadInputValue(DAG, RC, VT, SDLoc(DAG.getEntryNode()), *Reg);
2807}
2808
2810 CallingConv::ID CallConv,
2811 ArrayRef<ISD::InputArg> Ins, BitVector &Skipped,
2812 FunctionType *FType,
2813 SIMachineFunctionInfo *Info) {
 // Copies the entries of Ins into Splits, dropping pixel-shader inputs that
 // are neither used nor already allocated. For each dropped input the
 // original argument index is recorded in Skipped. Inputs that are kept are
 // marked allocated in Info, and additionally enabled when used. FType is
 // not consulted in this body.
 //
 // PSInputNum counts PS inputs seen so far; only the first 16 (0..15) get
 // the PS-input treatment, later ones are copied through unchanged.
2814 for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
2815 const ISD::InputArg *Arg = &Ins[I];
2816
2817 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2818 "vector type argument should have been split");
2819
2820 // First check if it's a PS input addr.
2821 if (CallConv == CallingConv::AMDGPU_PS && !Arg->Flags.isInReg() &&
2822 PSInputNum <= 15) {
 // The skip decision is made once, from the first part of the argument.
2823 bool SkipArg = !Arg->Used && !Info->isPSInputAllocated(PSInputNum);
2824
2825 // Inconveniently only the first part of the split is marked as isSplit,
2826 // so skip to the end. We only want to increment PSInputNum once for the
2827 // entire split argument.
2828 if (Arg->Flags.isSplit()) {
 // NOTE(review): this advances I without comparing it against E; it
 // relies on every split argument having a part flagged isSplitEnd.
2829 while (!Arg->Flags.isSplitEnd()) {
2830 assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
2831 "unexpected vector split in ps argument type");
2832 if (!SkipArg)
2833 Splits.push_back(*Arg);
2834 Arg = &Ins[++I];
2835 }
2836 }
2837
 // Arg now refers to the last part of the argument (or the argument
 // itself when it was not split).
2838 if (SkipArg) {
2839 // We can safely skip PS inputs.
2840 Skipped.set(Arg->getOrigArgIndex());
2841 ++PSInputNum;
2842 continue;
2843 }
2844
2845 Info->markPSInputAllocated(PSInputNum);
2846 if (Arg->Used)
2847 Info->markPSInputEnabled(PSInputNum);
2848
2849 ++PSInputNum;
2850 }
2851
 // Reached for every non-skipped argument; for split PS inputs this appends
 // the final part, the earlier parts having been appended in the loop above.
2852 Splits.push_back(*Arg);
2853 }
2854}
2855
2856// Allocate special inputs passed in VGPRs.
// Handles the work-item IDs X, Y and Z, each only when Info reports it as
// present. With packed TIDs all three live in VGPR0 as 10-bit fields (masks
// 0x3ff, 0x3ff << 10, 0x3ff << 20); otherwise X, Y and Z use VGPR0, VGPR1
// and VGPR2. Registers that are used are added as live-ins typed as 32-bit
// scalars and marked allocated in CCInfo. TRI is not consulted in this body.
2858 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2859 SIMachineFunctionInfo &Info) const {
2860 const LLT S32 = LLT::scalar(32);
2861 MachineRegisterInfo &MRI = MF.getRegInfo();
2862
2863 if (Info.hasWorkItemIDX()) {
2864 Register Reg = AMDGPU::VGPR0;
2865 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2866
2867 CCInfo.AllocateReg(Reg);
 // X is restricted to the low 10 bits only when the register is packed and
 // Y is also present; otherwise the whole register is treated as X.
2868 unsigned Mask =
2869 (Subtarget->hasPackedTID() && Info.hasWorkItemIDY()) ? 0x3ff : ~0u;
2870 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
2871 }
2872
2873 if (Info.hasWorkItemIDY()) {
2874 assert(Info.hasWorkItemIDX());
2875 if (Subtarget->hasPackedTID()) {
 // Packed: no new register, Y is bits 10..19 of VGPR0 (set up above for X).
2876 Info.setWorkItemIDY(
2877 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 10));
2878 } else {
2879 unsigned Reg = AMDGPU::VGPR1;
2880 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2881
2882 CCInfo.AllocateReg(Reg);
2883 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg));
2884 }
2885 }
2886
2887 if (Info.hasWorkItemIDZ()) {
2888 assert(Info.hasWorkItemIDX() && Info.hasWorkItemIDY());
2889 if (Subtarget->hasPackedTID()) {
 // Packed: Z is bits 20..29 of VGPR0.
2890 Info.setWorkItemIDZ(
2891 ArgDescriptor::createRegister(AMDGPU::VGPR0, 0x3ff << 20));
2892 } else {
2893 unsigned Reg = AMDGPU::VGPR2;
2894 MRI.setType(MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass), S32);
2895
2896 CCInfo.AllocateReg(Reg);
2897 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg));
2898 }
2899 }
2900}
2901
2902// Try to allocate a VGPR at the end of the argument list, or if no argument
2903// VGPRs are left allocating a stack slot.
2904// If \p Mask is is given it indicates bitfield position in the register.
2905// If \p Arg is given use it with new ]p Mask instead of allocating new.
2906static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
2907 ArgDescriptor Arg = ArgDescriptor()) {
2908 if (Arg.isSet())
2909 return ArgDescriptor::createArg(Arg, Mask);
2910
2911 ArrayRef<MCPhysReg> ArgVGPRs = ArrayRef(AMDGPU::VGPR_32RegClass.begin(), 32);
2912 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
2913 if (RegIdx == ArgVGPRs.size()) {
2914 // Spill to stack required.
2915 int64_t Offset = CCInfo.AllocateStack(4, Align(4));
2916
2917 return ArgDescriptor::createStack(Offset, Mask);
2918 }
2919
2920 unsigned Reg = ArgVGPRs[RegIdx];
2921 Reg = CCInfo.AllocateReg(Reg);
2922 assert(Reg != AMDGPU::NoRegister);
2923
2924 MachineFunction &MF = CCInfo.getMachineFunction();
2925 Register LiveInVReg = MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
2926 MF.getRegInfo().setType(LiveInVReg, LLT::scalar(32));
2927 return ArgDescriptor::createRegister(Reg, Mask);
2928}
2929
2931 const TargetRegisterClass *RC,
2932 unsigned NumArgRegs) {
2933 ArrayRef<MCPhysReg> ArgSGPRs = ArrayRef(RC->begin(), 32);
2934 unsigned RegIdx = CCInfo.getFirstUnallocated(ArgSGPRs);
2935 if (RegIdx == ArgSGPRs.size())
2936 report_fatal_error("ran out of SGPRs for arguments");
2937
2938 unsigned Reg = ArgSGPRs[RegIdx];
2939 Reg = CCInfo.AllocateReg(Reg);
2940 assert(Reg != AMDGPU::NoRegister);
2941
2942 MachineFunction &MF = CCInfo.getMachineFunction();
2943 MF.addLiveIn(Reg, RC);
2945}
2946
2947// If this has a fixed position, we still should allocate the register in the
2948// CCInfo state. Technically we could get away with this for values passed
2949// outside of the normal argument range.
//
// Marks Reg as allocated in CCInfo and adds it to the function's live-ins with
// register class RC.
2951 const TargetRegisterClass *RC,
2952 MCRegister Reg) {
 // AllocateReg's result replaces Reg; the assert checks that the allocation
 // produced a register (no check remains when asserts are compiled out).
2953 Reg = CCInfo.AllocateReg(Reg);
2954 assert(Reg != AMDGPU::NoRegister);
2955 MachineFunction &MF = CCInfo.getMachineFunction();
2956 MF.addLiveIn(Reg, RC);
2957}
2958
2959static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg) {
2960 if (Arg) {
2961 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_32RegClass,
2962 Arg.getRegister());
2963 } else
2964 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_32RegClass, 32);
2965}
2966
2967static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg) {
2968 if (Arg) {
2969 allocateFixedSGPRInputImpl(CCInfo, &AMDGPU::SGPR_64RegClass,
2970 Arg.getRegister());
2971 } else
2972 Arg = allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
2973}
2974
2975/// Allocate implicit function VGPR arguments at the end of allocated user
2976/// arguments.
///
/// The work-item IDs X, Y and Z are each handled only when Info reports them
/// as present. Each is given a 10-bit field (0x3ff shifted by 0, 10 and 20).
/// The descriptor of the previous ID is passed on to the next request, so IDs
/// allocated in sequence share one location. MF and TRI are not consulted in
/// this body.
2978 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
2979 SIMachineFunctionInfo &Info) const {
2980 const unsigned Mask = 0x3ff;
 // Default-constructed; stays that way until one of the IDs is allocated.
2981 ArgDescriptor Arg;
2982
2983 if (Info.hasWorkItemIDX()) {
2984 Arg = allocateVGPR32Input(CCInfo, Mask);
2985 Info.setWorkItemIDX(Arg);
2986 }
2987
2988 if (Info.hasWorkItemIDY()) {
 // Reuses X's location when X was allocated above, else allocates anew.
2989 Arg = allocateVGPR32Input(CCInfo, Mask << 10, Arg);
2990 Info.setWorkItemIDY(Arg);
2991 }
2992
2993 if (Info.hasWorkItemIDZ())
2994 Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
2995}
2996
2997/// Allocate implicit function VGPR arguments in fixed registers.
///
/// All three work-item IDs are placed in VGPR31 as 10-bit fields: X uses
/// mask 0x3ff, Y uses 0x3ff << 10 and Z uses 0x3ff << 20. Unlike the
/// non-fixed variant, the IDs are set unconditionally. MF and TRI are not
/// consulted in this body.
2999 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
3000 SIMachineFunctionInfo &Info) const {
3001 Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
 // A null result from AllocateReg is treated as a fatal error.
3002 if (!Reg)
3003 report_fatal_error("failed to allocate VGPR for implicit arguments");
3004
3005 const unsigned Mask = 0x3ff;
3006 Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
3007 Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
3008 Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
3009}
3010
3012 CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI,
3013 SIMachineFunctionInfo &Info) const {
 // Allocates the SGPR-passed special inputs that this function needs, in a
 // fixed order: the 64-bit values (dispatch ptr, queue ptr, implicit arg
 // ptr, dispatch ID), then the 32-bit values (work-group IDs X/Y/Z, LDS
 // kernel ID). Each helper either reserves the register already recorded in
 // the ArgInfo entry or allocates one and stores it there, so ArgInfo is
 // updated in place. MF and TRI are not consulted in this body.
3014 auto &ArgInfo = Info.getArgInfo();
3015 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
3016
3017 // TODO: Unify handling with private memory pointers.
3018 if (UserSGPRInfo.hasDispatchPtr())
3019 allocateSGPR64Input(CCInfo, ArgInfo.DispatchPtr);
3020
3021 if (UserSGPRInfo.hasQueuePtr())
3022 allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr);
3023
3024 // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
3025 // constant offset from the kernarg segment.
3026 if (Info.hasImplicitArgPtr())
3027 allocateSGPR64Input(CCInfo, ArgInfo.ImplicitArgPtr);
3028
3029 if (UserSGPRInfo.hasDispatchID())
3030 allocateSGPR64Input(CCInfo, ArgInfo.DispatchID);
3031
3032 // flat_scratch_init is not applicable for non-kernel functions.
3033
3034 if (Info.hasWorkGroupIDX())
3035 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDX);
3036
3037 if (Info.hasWorkGroupIDY())
3038 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDY);
3039
3040 if (Info.hasWorkGroupIDZ())
3041 allocateSGPR32Input(CCInfo, ArgInfo.WorkGroupIDZ);
3042
3043 if (Info.hasLDSKernelId())
3044 allocateSGPR32Input(CCInfo, ArgInfo.LDSKernelId);
3045}
3046
3047// Allocate special inputs passed in user SGPRs.
//
// Every input that UserSGPRInfo reports as present is handled the same way:
// Info hands out the register, the register is recorded as a live-in of MF
// with the matching SGPR register class, and it is marked allocated in CCInfo.
//
// NOTE(review): listing line 3048 (the first signature line, carrying the
// function name and the leading parameter) is absent from this listing.
3049 MachineFunction &MF,
3050 const SIRegisterInfo &TRI,
3051 SIMachineFunctionInfo &Info) const {
3052 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info.getUserSGPRInfo();
3053 if (UserSGPRInfo.hasImplicitBufferPtr()) {
3054 Register ImplicitBufferPtrReg = Info.addImplicitBufferPtr(TRI);
3055 MF.addLiveIn(ImplicitBufferPtrReg, &AMDGPU::SGPR_64RegClass);
3056 CCInfo.AllocateReg(ImplicitBufferPtrReg);
3057 }
3058
3059 // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
3060 if (UserSGPRInfo.hasPrivateSegmentBuffer()) {
3061 Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
3062 MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
3063 CCInfo.AllocateReg(PrivateSegmentBufferReg);
3064 }
3065
3066 if (UserSGPRInfo.hasDispatchPtr()) {
3067 Register DispatchPtrReg = Info.addDispatchPtr(TRI);
3068 MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
3069 CCInfo.AllocateReg(DispatchPtrReg);
3070 }
3071
3072 if (UserSGPRInfo.hasQueuePtr()) {
3073 Register QueuePtrReg = Info.addQueuePtr(TRI);
3074 MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
3075 CCInfo.AllocateReg(QueuePtrReg);
3076 }
3077
3078 if (UserSGPRInfo.hasKernargSegmentPtr()) {
3079 MachineRegisterInfo &MRI = MF.getRegInfo();
3080 Register InputPtrReg = Info.addKernargSegmentPtr(TRI);
3081 CCInfo.AllocateReg(InputPtrReg);
3082
// Unlike the other inputs, the virtual register returned by addLiveIn is
// kept in a local here.
// NOTE(review): listing line 3084 is absent; presumably it consumes MRI and
// VReg, which are otherwise unused in the visible text.
3083 Register VReg = MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
3085 }
3086
3087 if (UserSGPRInfo.hasDispatchID()) {
3088 Register DispatchIDReg = Info.addDispatchID(TRI);
3089 MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
3090 CCInfo.AllocateReg(DispatchIDReg);
3091 }
3092
// The flat scratch init register is not allocated when the subtarget reports
// the AMDPAL OS.
3093 if (UserSGPRInfo.hasFlatScratchInit() && !getSubtarget()->isAmdPalOS()) {
3094 Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
3095 MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
3096 CCInfo.AllocateReg(FlatScratchInitReg);
3097 }
3098
3099 if (UserSGPRInfo.hasPrivateSegmentSize()) {
3100 Register PrivateSegmentSizeReg = Info.addPrivateSegmentSize(TRI);
3101 MF.addLiveIn(PrivateSegmentSizeReg, &AMDGPU::SGPR_32RegClass);
3102 CCInfo.AllocateReg(PrivateSegmentSizeReg);
3103 }
3104
3105 // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
3106 // these from the dispatch pointer.
3107}
3108
3109// Allocate pre-loaded kernel arguments. Arguments to be preloaded must be
3110// sequential starting from the first argument.
//
// Walks the IR arguments of the function in order and, for each one marked
// inreg, requests user SGPRs from Info for every matching entry of Ins. The
// walk stops at the first argument that is not inreg, that does not line up
// with the next entry of Ins, or that does not fit in the remaining free user
// SGPRs.
//
// NOTE(review): listing lines 3111 and 3113 are absent, so the function name
// and the declarations of the Ins and MF parameters are not visible here.
3112 CCState &CCInfo, SmallVectorImpl<CCValAssign> &ArgLocs,
3114 const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
3115 Function &F = MF.getFunction();
// Byte offset just past the most recently preloaded argument; starts at the
// subtarget's explicit kernel argument offset.
3116 unsigned LastExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
3117 GCNUserSGPRUsageInfo &SGPRInfo = Info.getUserSGPRInfo();
3118 bool InPreloadSequence = true;
// Index into Ins; one IR argument may correspond to several Ins entries.
3119 unsigned InIdx = 0;
3120 bool AlignedForImplictArgs = false;
3121 unsigned ImplicitArgOffset = 0;
3122 for (auto &Arg : F.args()) {
3123 if (!InPreloadSequence || !Arg.hasInRegAttr())
3124 break;
3125
3126 unsigned ArgIdx = Arg.getArgNo();
3127 // Don't preload non-original args or parts not in the current preload
3128 // sequence.
3129 if (InIdx < Ins.size() &&
3130 (!Ins[InIdx].isOrigArg() || Ins[InIdx].getOrigArgIndex() != ArgIdx))
3131 break;
3132
// Visit every Ins entry that originates from this IR argument.
3133 for (; InIdx < Ins.size() && Ins[InIdx].isOrigArg() &&
3134 Ins[InIdx].getOrigArgIndex() == ArgIdx;
3135 InIdx++) {
// NOTE(review): the assert indexes ArgLocs by ArgIdx while the location used
// below is ArgLocs[InIdx]; confirm which index is intended.
3136 assert(ArgLocs[ArgIdx].isMemLoc());
3137 auto &ArgLoc = ArgLocs[InIdx];
3138 const Align KernelArgBaseAlign = Align(16);
3139 unsigned ArgOffset = ArgLoc.getLocMemOffset();
3140 Align Alignment = commonAlignment(KernelArgBaseAlign, ArgOffset);
// Number of 32-bit registers needed: the location type's size in bits,
// rounded up to a multiple of 32.
3141 unsigned NumAllocSGPRs =
3142 alignTo(ArgLoc.getLocVT().getFixedSizeInBits(), 32) / 32;
3143
3144 // Fix alignment for hidden arguments.
// The adjustment is computed once, at the first hidden argument, as the
// distance from LastExplicitArgOffset up to the implicit-arg-pointer
// alignment, and is then added to the offset of every hidden argument.
3145 if (Arg.hasAttribute("amdgpu-hidden-argument")) {
3146 if (!AlignedForImplictArgs) {
3147 ImplicitArgOffset =
3148 alignTo(LastExplicitArgOffset,
3149 Subtarget->getAlignmentForImplicitArgPtr()) -
3150 LastExplicitArgOffset;
3151 AlignedForImplictArgs = true;
3152 }
3153 ArgOffset += ImplicitArgOffset;
3154 }
3155
3156 // Arg is preloaded into the previous SGPR.
// Applies when the store size is under 4 bytes and the alignment is under 4;
// the entry reuses the first register of the preceding entry and no new
// register is requested.
3157 if (ArgLoc.getLocVT().getStoreSize() < 4 && Alignment < 4) {
3158 assert(InIdx >= 1 && "No previous SGPR");
3159 Info.getArgInfo().PreloadKernArgs[InIdx].Regs.push_back(
3160 Info.getArgInfo().PreloadKernArgs[InIdx - 1].Regs[0]);
3161 continue;
3162 }
3163
// Gap in bytes between the end of the previous preloaded argument and this
// one, expressed as whole 4-byte SGPRs.
3164 unsigned Padding = ArgOffset - LastExplicitArgOffset;
3165 unsigned PaddingSGPRs = alignTo(Padding, 4) / 4;
3166 // Check for free user SGPRs for preloading.
3167 if (PaddingSGPRs + NumAllocSGPRs > SGPRInfo.getNumFreeUserSGPRs()) {
3168 InPreloadSequence = false;
3169 break;
3170 }
3171
3172 // Preload this argument.
3173 const TargetRegisterClass *RC =
3174 TRI.getSGPRClassForBitWidth(NumAllocSGPRs * 32);
3175 SmallVectorImpl<MCRegister> *PreloadRegs =
3176 Info.addPreloadedKernArg(TRI, RC, NumAllocSGPRs, InIdx, PaddingSGPRs);
3177
// When more than one register came back, each one is registered as a
// live-in using the 32-bit SGPR class instead of RC.
3178 if (PreloadRegs->size() > 1)
3179 RC = &AMDGPU::SGPR_32RegClass;
3180 for (auto &Reg : *PreloadRegs) {
3181 assert(Reg);
3182 MF.addLiveIn(Reg, RC);
3183 CCInfo.AllocateReg(Reg);
3184 }
3185
// Advance past this argument, counting its size in whole SGPRs.
3186 LastExplicitArgOffset = NumAllocSGPRs * 4 + ArgOffset;
3187 }
3188 }
3189}
3190
// If Info reports an LDS kernel id input, adds its register through Info,
// records it as a 32-bit SGPR live-in of MF and marks it allocated in CCInfo.
// NOTE(review): listing line 3191 (function name and leading parameters) is
// absent from this listing.
3192 const SIRegisterInfo &TRI,
3193 SIMachineFunctionInfo &Info) const {
3194 // Always allocate this last since it is a synthetic preload.
3195 if (Info.hasLDSKernelId()) {
3196 Register Reg = Info.addLDSKernelId();
3197 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3198 CCInfo.AllocateReg(Reg);
3199 }
3200}
3201
3202// Allocate special input registers that are initialized per-wave.
//
// Handles, in order: padding of user SGPRs on subtargets with the
// UserSGPRInit16 wave32 bug, the workgroup ID registers (only without
// architected SGPRs), the workgroup info register, and the private segment
// wave byte offset register. Each register is recorded as a 32-bit SGPR
// live-in of MF and marked allocated in CCInfo.
//
// NOTE(review): listing lines 3203-3204 (function name and leading
// parameters) are absent from this listing.
3205 CallingConv::ID CallConv,
3206 bool IsShader) const {
3207 bool HasArchitectedSGPRs = Subtarget->hasArchitectedSGPRs();
3208 if (Subtarget->hasUserSGPRInit16BugInWave32() && !IsShader) {
3209 // Note: user SGPRs are handled by the front-end for graphics shaders
3210 // Pad up the used user SGPRs with dead inputs.
3211
3212 // TODO: NumRequiredSystemSGPRs computation should be adjusted appropriately
3213 // before enabling architected SGPRs for workgroup IDs.
3214 assert(!HasArchitectedSGPRs && "Unhandled feature for the subtarget");
3215
3216 unsigned CurrentUserSGPRs = Info.getNumUserSGPRs();
3217 // Note we do not count the PrivateSegmentWaveByteOffset. We do not want to
3218 // rely on it to reach 16 since if we end up having no stack usage, it will
3219 // not really be added.
// Each has*() result contributes 0 or 1 to the count.
3220 unsigned NumRequiredSystemSGPRs =
3221 Info.hasWorkGroupIDX() + Info.hasWorkGroupIDY() +
3222 Info.hasWorkGroupIDZ() + Info.hasWorkGroupInfo();
// Add reserved user SGPRs until user + required system SGPRs reaches 16.
3223 for (unsigned i = NumRequiredSystemSGPRs + CurrentUserSGPRs; i < 16; ++i) {
3224 Register Reg = Info.addReservedUserSGPR();
3225 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3226 CCInfo.AllocateReg(Reg);
3227 }
3228 }
3229
// Workgroup ID registers are only allocated here when the subtarget does not
// have architected SGPRs.
3230 if (!HasArchitectedSGPRs) {
3231 if (Info.hasWorkGroupIDX()) {
3232 Register Reg = Info.addWorkGroupIDX();
3233 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3234 CCInfo.AllocateReg(Reg);
3235 }
3236
3237 if (Info.hasWorkGroupIDY()) {
3238 Register Reg = Info.addWorkGroupIDY();
3239 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3240 CCInfo.AllocateReg(Reg);
3241 }
3242
3243 if (Info.hasWorkGroupIDZ()) {
3244 Register Reg = Info.addWorkGroupIDZ();
3245 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3246 CCInfo.AllocateReg(Reg);
3247 }
3248 }
3249
3250 if (Info.hasWorkGroupInfo()) {
3251 Register Reg = Info.addWorkGroupInfo();
3252 MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass);
3253 CCInfo.AllocateReg(Reg);
3254 }
3255
3256 if (Info.hasPrivateSegmentWaveByteOffset()) {
3257 // Scratch wave offset passed in system SGPR.
3258 unsigned PrivateSegmentWaveByteOffsetReg;
3259
// Shaders take the register already recorded in Info, falling back to the
// first SGPR not yet allocated in CCInfo; all other callers let Info add it.
3260 if (IsShader) {
3261 PrivateSegmentWaveByteOffsetReg =
3262 Info.getPrivateSegmentWaveByteOffsetSystemSGPR();
3263
3264 // This is true if the scratch wave byte offset doesn't have a fixed
3265 // location.
3266 if (PrivateSegmentWaveByteOffsetReg == AMDGPU::NoRegister) {
3267 PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
3268 Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
3269 }
3270 } else
3271 PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
3272
3273 MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
3274 CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
3275 }
3276
// On subtargets with the bug, non-shader functions must end up with at least
// 16 preloaded SGPRs.
3277 assert(!Subtarget->hasUserSGPRInit16BugInWave32() || IsShader ||
3278 Info.getNumPreloadedSGPRs() >= 16);
3279}
3280
// Chooses the registers recorded in Info for the scratch resource descriptor,
// the stack pointer offset and, when a frame pointer is needed, the frame
// offset.
//
// NOTE(review): listing lines 3281 (function name and leading parameter),
// 3298, 3312 and 3347 are absent from this listing; the statements they held
// are not visible here.
3282 MachineFunction &MF,
3283 const SIRegisterInfo &TRI,
3284 SIMachineFunctionInfo &Info) {
3285 // Now that we've figured out where the scratch register inputs are, see if
3286 // should reserve the arguments and use them directly.
3287 MachineFrameInfo &MFI = MF.getFrameInfo();
3288 bool HasStackObjects = MFI.hasStackObjects();
3289 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
3290
3291 // Record that we know we have non-spill stack objects so we don't need to
3292 // check all stack objects later.
3293 if (HasStackObjects)
3294 Info.setHasNonSpillStackObjects(true);
3295
3296 // Everything live out of a block is spilled with fast regalloc, so it's
3297 // almost certain that spilling will be required.
// NOTE(review): the condition guarding the next assignment (listing line
// 3298) is absent from this listing.
3299 HasStackObjects = true;
3300
3301 // For now assume stack access is needed in any callee functions, so we need
3302 // the scratch registers to pass in.
3303 bool RequiresStackAccess = HasStackObjects || MFI.hasCalls();
3304
// The scratch resource register is only set up when flat scratch is not
// enabled on the subtarget.
3305 if (!ST.hasFlatScratchEnabled()) {
3306 if (RequiresStackAccess && ST.isAmdHsaOrMesa(MF.getFunction())) {
3307 // If we have stack objects, we unquestionably need the private buffer
3308 // resource. For the Code Object V2 ABI, this will be the first 4 user
3309 // SGPR inputs. We can reserve those and use them directly.
3310
// NOTE(review): the initializer of this declaration (listing line 3312) is
// absent from this listing.
3311 Register PrivateSegmentBufferReg =
3313 Info.setScratchRSrcReg(PrivateSegmentBufferReg);
3314 } else {
3315 unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
3316 // We tentatively reserve the last registers (skipping the last registers
3317 // which may contain VCC, FLAT_SCR, and XNACK). After register allocation,
3318 // we'll replace these with the ones immediately after those which were
3319 // really allocated. In the prologue copies will be inserted from the
3320 // argument to these reserved registers.
3321
3322 // Without HSA, relocations are used for the scratch pointer and the
3323 // buffer resource setup is always inserted in the prologue. Scratch wave
3324 // offset is still in an input SGPR.
3325 Info.setScratchRSrcReg(ReservedBufferReg);
3326 }
3327 }
3328
3329 MachineRegisterInfo &MRI = MF.getRegInfo();
3330
3331 // For entry functions we have to set up the stack pointer if we use it,
3332 // whereas non-entry functions get this "for free". This means there is no
3333 // intrinsic advantage to using S32 over S34 in cases where we do not have
3334 // calls but do need a frame pointer (i.e. if we are requested to have one
3335 // because frame pointer elimination is disabled). To keep things simple we
3336 // only ever use S32 as the call ABI stack pointer, and so using it does not
3337 // imply we need a separate frame pointer.
3338 //
3339 // Try to use s32 as the SP, but move it if it would interfere with input
3340 // arguments. This won't work with calls though.
3341 //
3342 // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
3343 // registers.
3344 if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
3345 Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
3346 } else {
3348
3349 if (MFI.hasCalls())
3350 report_fatal_error("call in graphics shader with too many input SGPRs");
3351
// Fall back to the first 32-bit SGPR that is not a live-in.
3352 for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
3353 if (!MRI.isLiveIn(Reg)) {
3354 Info.setStackPtrOffsetReg(Reg);
3355 break;
3356 }
3357 }
3358
// A value of SP_REG here is treated as "no register was found".
3359 if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
3360 report_fatal_error("failed to find register for SP");
3361 }
3362
3363 // hasFP should be accurate for entry functions even before the frame is
3364 // finalized, because it does not rely on the known stack size, only
3365 // properties like whether variable sized objects are present.
3366 if (ST.getFrameLowering()->hasFP(MF)) {
3367 Info.setFrameOffsetReg(AMDGPU::SGPR33);
3368 }
3369}
3370
// The result is the negation of Info->isEntryFunction().
// NOTE(review): listing lines 3371-3372 (the signature and the definition of
// Info) are absent from this listing.
3373 return !Info->isEntryFunction();
3374}
3375
3377
// For every register in the list returned by getCalleeSavedRegsViaCopy for
// the parent function of Entry, inserts a COPY into a fresh virtual register
// at the start of Entry and a COPY back into the physical register before the
// first terminator of each block in Exits. Does nothing when that list is
// null.
//
// NOTE(review): listing lines 3378 (function name) and 3381 (presumably the
// definition of TRI) are absent from this listing.
3379 MachineBasicBlock *Entry,
3380 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
3382
3383 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
3384 if (!IStart)
3385 return;
3386
3387 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
3388 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
3389 MachineBasicBlock::iterator MBBI = Entry->begin();
// The register list is terminated by a zero entry.
3390 for (const MCPhysReg *I = IStart; *I; ++I) {
// Choose the virtual register class by the width of the physical register;
// only 64-bit and 32-bit scalar registers are expected.
3391 const TargetRegisterClass *RC = nullptr;
3392 if (AMDGPU::SReg_64RegClass.contains(*I))
3393 RC = &AMDGPU::SGPR_64RegClass;
3394 else if (AMDGPU::SReg_32RegClass.contains(*I))
3395 RC = &AMDGPU::SGPR_32RegClass;
3396 else
3397 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3398
3399 Register NewVR = MRI->createVirtualRegister(RC);
3400 // Create copy from CSR to a virtual register.
3401 Entry->addLiveIn(*I);
3402 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
3403 .addReg(*I);
3404
3405 // Insert the copy-back instructions right before the terminator.
3406 for (auto *Exit : Exits)
3407 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
3408 TII->get(TargetOpcode::COPY), *I)
3409 .addReg(NewVR);
3410 }
3411}
3412
// Lowers the incoming formal arguments described by Ins: one SDValue per
// handled entry is appended to InVals, and the returned value is either the
// incoming Chain or a TokenFactor of the chains collected along the way.
//
// NOTE(review): many lines are absent from this listing (for example 3413,
// 3417, 3419, 3421-3422, 3426, 3431-3432 and 3539), including the start of
// the signature and the declarations of several locals used below (e.g. MF,
// Info, TRI, FType, ArgLocs, Splits, Chains, MRI).
3414 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
3415 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3416 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
3418
3420 const Function &Fn = MF.getFunction();
// Once set, every argument in the main loop below is lowered to poison.
3423 bool IsError = false;
3424
3425 if (Subtarget->isAmdHsaOS() && AMDGPU::isGraphics(CallConv)) {
3427 Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc()));
3428 IsError = true;
3429 }
3430
3433 BitVector Skipped(Ins.size());
3434 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
3435 *DAG.getContext());
3436
3437 bool IsGraphics = AMDGPU::isGraphics(CallConv);
3438 bool IsKernel = AMDGPU::isKernel(CallConv);
3439 bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
3440
// Sanity checks only: graphics calling conventions are expected not to
// request the compute-style special inputs listed in the asserts.
3441 if (IsGraphics) {
3442 const GCNUserSGPRUsageInfo &UserSGPRInfo = Info->getUserSGPRInfo();
3443 assert(!UserSGPRInfo.hasDispatchPtr() &&
3444 !UserSGPRInfo.hasKernargSegmentPtr() && !Info->hasWorkGroupInfo() &&
3445 !Info->hasLDSKernelId() && !Info->hasWorkItemIDX() &&
3446 !Info->hasWorkItemIDY() && !Info->hasWorkItemIDZ());
3447 (void)UserSGPRInfo;
3448 if (!Subtarget->hasFlatScratchEnabled())
3449 assert(!UserSGPRInfo.hasFlatScratchInit());
3450 if ((CallConv != CallingConv::AMDGPU_CS &&
3451 CallConv != CallingConv::AMDGPU_Gfx &&
3452 CallConv != CallingConv::AMDGPU_Gfx_WholeWave) ||
3453 !Subtarget->hasArchitectedSGPRs())
3454 assert(!Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
3455 !Info->hasWorkGroupIDZ());
3456 }
3457
3458 bool IsWholeWaveFunc = Info->isWholeWaveFunction();
3459
3460 if (CallConv == CallingConv::AMDGPU_PS) {
3461 processPSInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
3462
3463 // At least one interpolation mode must be enabled or else the GPU will
3464 // hang.
3465 //
3466 // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
3467 // set PSInputAddr, the user wants to enable some bits after the compilation
3468 // based on run-time states. Since we can't know what the final PSInputEna
3469 // will look like, we shouldn't do anything here and the user should take
3470 // responsibility for the correct programming.
3471 //
3472 // Otherwise, the following restrictions apply:
3473 // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
3474 // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
3475 // enabled too.
3476 if ((Info->getPSInputAddr() & 0x7F) == 0 ||
3477 ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11))) {
3478 CCInfo.AllocateReg(AMDGPU::VGPR0);
3479 CCInfo.AllocateReg(AMDGPU::VGPR1);
3480 Info->markPSInputAllocated(0);
3481 Info->markPSInputEnabled(0);
3482 }
3483 if (Subtarget->isAmdPalOS()) {
3484 // For isAmdPalOS, the user does not enable some bits after compilation
3485 // based on run-time states; the register values being generated here are
3486 // the final ones set in hardware. Therefore we need to apply the
3487 // workaround to PSInputAddr and PSInputEnable together. (The case where
3488 // a bit is set in PSInputAddr but not PSInputEnable is where the
3489 // frontend set up an input arg for a particular interpolation mode, but
3490 // nothing uses that input arg. Really we should have an earlier pass
3491 // that removes such an arg.)
3492 unsigned PsInputBits = Info->getPSInputAddr() & Info->getPSInputEnable();
3493 if ((PsInputBits & 0x7F) == 0 ||
3494 ((PsInputBits & 0xF) == 0 && (PsInputBits >> 11 & 1)))
3495 Info->markPSInputEnabled(llvm::countr_zero(Info->getPSInputAddr()));
3496 }
3497 } else if (IsKernel) {
3498 assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
3499 } else {
// For whole wave functions the first entry of Ins is left out of Splits.
3500 Splits.append(IsWholeWaveFunc ? std::next(Ins.begin()) : Ins.begin(),
3501 Ins.end());
3502 }
3503
3504 if (IsKernel)
3505 analyzeFormalArgumentsCompute(CCInfo, Ins);
3506
3507 if (IsEntryFunc) {
3508 allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
3509 allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
3510 if (IsKernel && Subtarget->hasKernargPreload())
3511 allocatePreloadKernArgSGPRs(CCInfo, ArgLocs, Ins, MF, *TRI, *Info);
3512
3513 allocateLDSKernelId(CCInfo, MF, *TRI, *Info);
3514 } else if (!IsGraphics) {
3515 // For the fixed ABI, pass workitem IDs in the last argument register.
3516 allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
3517
3518 // FIXME: Sink this into allocateSpecialInputSGPRs
3519 if (!Subtarget->hasFlatScratchEnabled())
3520 CCInfo.AllocateReg(Info->getScratchRSrcReg());
3521
3522 allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
3523 }
3524
3525 if (!IsKernel) {
3526 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
3527 CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
3528
3529 // This assumes the registers are allocated by CCInfo in ascending order
3530 // with no gaps.
3531 Info->setNumWaveDispatchSGPRs(
3532 CCInfo.getFirstUnallocated(AMDGPU::SGPR_32RegClass.getRegisters()));
3533 Info->setNumWaveDispatchVGPRs(
3534 CCInfo.getFirstUnallocated(AMDGPU::VGPR_32RegClass.getRegisters()));
3535 } else if (Info->getNumKernargPreloadedSGPRs()) {
3536 Info->setNumWaveDispatchSGPRs(Info->getNumUserSGPRs());
3537 }
3538
3540
// Whole wave functions get a WHOLE_WAVE_SETUP node whose i1 result becomes
// the first entry of InVals and whose chain result is collected in Chains.
3541 if (IsWholeWaveFunc) {
3542 SDValue Setup = DAG.getNode(AMDGPUISD::WHOLE_WAVE_SETUP, DL,
3543 {MVT::i1, MVT::Other}, Chain);
3544 InVals.push_back(Setup.getValue(0));
3545 Chains.push_back(Setup.getValue(1));
3546 }
3547
3548 // FIXME: This is the minimum kernel argument alignment. We should improve
3549 // this to the maximum alignment of the arguments.
3550 //
3551 // FIXME: Alignment of explicit arguments totally broken with non-0 explicit
3552 // kern arg offset.
3553 const Align KernelArgBaseAlign = Align(16);
3554
// Main loop: i indexes Ins (starting at 1 for whole wave functions), while
// ArgIdx indexes ArgLocs and only advances for entries that are not replaced
// by poison.
3555 for (unsigned i = IsWholeWaveFunc ? 1 : 0, e = Ins.size(), ArgIdx = 0; i != e;
3556 ++i) {
3557 const ISD::InputArg &Arg = Ins[i];
3558 if ((Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) || IsError) {
3559 InVals.push_back(DAG.getPOISON(Arg.VT));
3560 continue;
3561 }
3562
3563 CCValAssign &VA = ArgLocs[ArgIdx++];
3564 MVT VT = VA.getLocVT();
3565
// Case 1: memory-located argument of an entry function.
3566 if (IsEntryFunc && VA.isMemLoc()) {
3567 VT = Ins[i].VT;
3568 EVT MemVT = VA.getLocVT();
3569
3570 const uint64_t Offset = VA.getLocMemOffset();
3571 Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
3572
// byref arguments yield a pointer built from the argument's offset instead of
// a loaded value.
3573 if (Arg.Flags.isByRef()) {
3574 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, Chain, Offset);
3575
3576 const GCNTargetMachine &TM =
3577 static_cast<const GCNTargetMachine &>(getTargetMachine());
// NOTE(review): the body of this branch (listing lines 3580-3581) is absent
// from this listing.
3578 if (!TM.isNoopAddrSpaceCast(AMDGPUAS::CONSTANT_ADDRESS,
3579 Arg.Flags.getPointerAddrSpace())) {
3582 }
3583
3584 InVals.push_back(Ptr);
3585 continue;
3586 }
3587
3588 SDValue NewArg;
// Arguments with an entry in PreloadKernArgs are read from their preload
// registers; everything else goes through lowerKernargMemParameter.
3589 if (Arg.isOrigArg() && Info->getArgInfo().PreloadKernArgs.count(i)) {
3590 if (MemVT.getStoreSize() < 4 && Alignment < 4) {
3591 // In this case the argument is packed into the previous preload SGPR.
3592 int64_t AlignDownOffset = alignDown(Offset, 4);
3593 int64_t OffsetDiff = Offset - AlignDownOffset;
3594 EVT IntVT = MemVT.changeTypeToInteger();
3595
// NOTE(review): the rest of this declaration (listing lines 3597-3598) is
// absent from this listing.
3596 const SIMachineFunctionInfo *Info =
3599 Register Reg =
3600 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs[0];
3601
3602 assert(Reg);
3603 Register VReg = MRI.getLiveInVirtReg(Reg);
3604 SDValue Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3605
// Shift the packed value down by its byte offset within the 32-bit register,
// then truncate to the argument's integer width.
3606 SDValue ShiftAmt = DAG.getConstant(OffsetDiff * 8, DL, MVT::i32);
3607 SDValue Extract = DAG.getNode(ISD::SRL, DL, MVT::i32, Copy, ShiftAmt);
3608
3609 SDValue ArgVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Extract);
3610 ArgVal = DAG.getNode(ISD::BITCAST, DL, MemVT, ArgVal);
3611 NewArg = convertArgType(DAG, VT, MemVT, DL, ArgVal,
3612 Ins[i].Flags.isSExt(), &Ins[i]);
3613
3614 NewArg = DAG.getMergeValues({NewArg, Copy.getValue(1)}, DL);
3615 } else {
// NOTE(review): the rest of this declaration (listing lines 3617-3618) is
// absent from this listing.
3616 const SIMachineFunctionInfo *Info =
3619 const SmallVectorImpl<MCRegister> &PreloadRegs =
3620 Info->getArgInfo().PreloadKernArgs.find(i)->getSecond().Regs;
3621
3622 SDValue Copy;
3623 if (PreloadRegs.size() == 1) {
3624 Register VReg = MRI.getLiveInVirtReg(PreloadRegs[0]);
3625 const TargetRegisterClass *RC = MRI.getRegClass(VReg);
3626 NewArg = DAG.getCopyFromReg(
3627 Chain, DL, VReg,
3629 TRI->getRegSizeInBits(*RC)));
3630
3631 } else {
3632 // If the kernarg alignment does not match the alignment of the SGPR
3633 // tuple RC that can accommodate this argument, it will be built up
3634 // via copies from the individual SGPRs that the argument was
3635 // preloaded to.
// NOTE(review): the declaration of Elts (listing line 3636) is absent from
// this listing.
3637 for (auto Reg : PreloadRegs) {
3638 Register VReg = MRI.getLiveInVirtReg(Reg);
3639 Copy = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i32);
3640 Elts.push_back(Copy);
3641 }
3642 NewArg =
3643 DAG.getBuildVector(EVT::getVectorVT(*DAG.getContext(), MVT::i32,
3644 PreloadRegs.size()),
3645 DL, Elts);
3646 }
3647
3648 // If the argument was preloaded to multiple consecutive 32-bit
3649 // registers because of misalignment between addressable SGPR tuples
3650 // and the argument size, we can still assume that because of kernarg
3651 // segment alignment restrictions that NewArg's size is the same as
3652 // MemVT and just do a bitcast. If MemVT is less than 32-bits we add a
3653 // truncate since we cannot preload to less than a single SGPR and the
3654 // MemVT may be smaller.
3655 EVT MemVTInt =
3657 if (MemVT.bitsLT(NewArg.getSimpleValueType()))
3658 NewArg = DAG.getNode(ISD::TRUNCATE, DL, MemVTInt, NewArg);
3659
3660 NewArg = DAG.getBitcast(MemVT, NewArg);
3661 NewArg = convertArgType(DAG, VT, MemVT, DL, NewArg,
3662 Ins[i].Flags.isSExt(), &Ins[i]);
3663 NewArg = DAG.getMergeValues({NewArg, Chain}, DL);
3664 }
3665 } else {
3666 // Hidden arguments that are in the kernel signature must be preloaded
3667 // to user SGPRs. Print a diagnostic error if a hidden argument is in
3668 // the argument list and is not preloaded.
3669 if (Arg.isOrigArg()) {
3670 Argument *OrigArg = Fn.getArg(Arg.getOrigArgIndex());
3671 if (OrigArg->hasAttribute("amdgpu-hidden-argument")) {
3673 *OrigArg->getParent(),
3674 "hidden argument in kernel signature was not preloaded",
3675 DL.getDebugLoc()));
3676 }
3677 }
3678
3679 NewArg =
3680 lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset,
3681 Alignment, Ins[i].Flags.isSExt(), &Ins[i]);
3682 }
// Result 1 of NewArg is the chain in every path above.
3683 Chains.push_back(NewArg.getValue(1));
3684
3685 auto *ParamTy =
3686 dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
3687 if (Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS &&
3688 ParamTy &&
3689 (ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
3690 ParamTy->getAddressSpace() == AMDGPUAS::REGION_ADDRESS)) {
3691 // On SI local pointers are just offsets into LDS, so they are always
3692 // less than 16-bits. On CI and newer they could potentially be
3693 // real pointers, so we can't guarantee their size.
3694 NewArg = DAG.getNode(ISD::AssertZext, DL, NewArg.getValueType(), NewArg,
3695 DAG.getValueType(MVT::i16));
3696 }
3697
3698 InVals.push_back(NewArg);
3699 continue;
3700 }
// Case 2: memory-located argument of a non-entry function. The chain of the
// loaded value is collected unless the argument is byval.
3701 if (!IsEntryFunc && VA.isMemLoc()) {
3702 SDValue Val = lowerStackParameter(DAG, VA, DL, Chain, Arg);
3703 InVals.push_back(Val);
3704 if (!Arg.Flags.isByVal())
3705 Chains.push_back(Val.getValue(1));
3706 continue;
3707 }
3708
// Case 3: register-located argument; only 32-bit VGPRs and SGPRs are
// expected.
3709 assert(VA.isRegLoc() && "Parameter must be in a register!");
3710
3711 Register Reg = VA.getLocReg();
3712 const TargetRegisterClass *RC = nullptr;
3713 if (AMDGPU::VGPR_32RegClass.contains(Reg))
3714 RC = &AMDGPU::VGPR_32RegClass;
3715 else if (AMDGPU::SGPR_32RegClass.contains(Reg))
3716 RC = &AMDGPU::SGPR_32RegClass;
3717 else
3718 llvm_unreachable("Unexpected register class in LowerFormalArguments!");
3719
3720 Reg = MF.addLiveIn(Reg, RC);
3721 SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
// An inreg argument that was assigned a VGPR gets a readfirstlane intrinsic
// applied to it.
// NOTE(review): the start of the statement building that node (listing line
// 3727) is absent from this listing.
3722 if (Arg.Flags.isInReg() && RC == &AMDGPU::VGPR_32RegClass) {
3723 // FIXME: Need to forward the chains created by `CopyFromReg`s, make sure
3724 // they will read physical regs before any side effect instructions.
3725 SDValue ReadFirstLane =
3726 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3728 ReadFirstLane, Val);
3729 }
3730
3731 if (Arg.Flags.isSRet()) {
3732 // The return object should be reasonably addressable.
3733
3734 // FIXME: This helps when the return is a real sret. If it is a
3735 // automatically inserted sret (i.e. CanLowerReturn returns false), an
3736 // extra copy is inserted in SelectionDAGBuilder which obscures this.
// NOTE(review): the initializer of NumBits (listing line 3738) is absent from
// this listing.
3737 unsigned NumBits =
3739 Val = DAG.getNode(
3740 ISD::AssertZext, DL, VT, Val,
3741 DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), NumBits)));
3742 }
3743
3744 Val = convertABITypeToValueType(DAG, Val, VA, DL);
3745 InVals.push_back(Val);
3746 }
3747
3748 // Start adding system SGPRs.
3749 if (IsEntryFunc)
3750 allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsGraphics);
3751
3752 unsigned StackArgSize = CCInfo.getStackSize();
3753 Info->setBytesInStackArgArea(StackArgSize);
3754
3755 return Chains.empty() ? Chain
3756 : DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
3757}
3758
3759// TODO: If return values can't fit in registers, we should return as many as
3760// possible in registers before passing on stack.
//
// Returns true unconditionally for entry-function calling conventions.
// Otherwise returns false when CCInfo.CheckReturn rejects Outs, or when any
// 32-bit VGPR with an index in [MaxNumVGPRs, TotalNumVGPRs) ended up
// allocated; returns true in all remaining cases.
//
// NOTE(review): listing lines 3761 (function name) and 3771 (declaration of
// RVLocs) are absent from this listing.
3762 CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
3763 const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context,
3764 const Type *RetTy) const {
3765 // Replacing returns with sret/stack usage doesn't make sense for shaders.
3766 // FIXME: Also sort of a workaround for custom vector splitting in LowerReturn
3767 // for shaders. Vector types should be explicitly handled by CC.
3768 if (AMDGPU::isEntryFunctionCC(CallConv))
3769 return true;
3770
3772 CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
3773 if (!CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, IsVarArg)))
3774 return false;
3775
3776 // We must use the stack if return would require unavailable registers.
3777 unsigned MaxNumVGPRs = Subtarget->getMaxNumVGPRs(MF);
3778 unsigned TotalNumVGPRs = Subtarget->getAddressableNumArchVGPRs();
3779 for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i)
3780 if (CCInfo.isAllocated(AMDGPU::VGPR_32RegClass.getRegister(i)))
3781 return false;
3782
3783 return true;
3784}
3785
// Lowers a return: kernels are forwarded to AMDGPUTargetLowering::LowerReturn;
// otherwise each value in OutVals is converted as its assigned location
// requires, copied into its return register, and a single return-style node
// is built from the collected operands.
//
// NOTE(review): listing lines 3787, 3789, 3792-3794, 3807, 3817 and 3850 are
// absent from this listing, including the function name, the Chain/CallConv/
// Outs parameters and the declarations of Info, TRI, RVLocs and RetOps.
3786SDValue
3788 bool isVarArg,
3790 const SmallVectorImpl<SDValue> &OutVals,
3791 const SDLoc &DL, SelectionDAG &DAG) const {
3795
3796 if (AMDGPU::isKernel(CallConv)) {
3797 return AMDGPUTargetLowering::LowerReturn(Chain, CallConv, isVarArg, Outs,
3798 OutVals, DL, DAG);
3799 }
3800
3801 bool IsShader = AMDGPU::isShader(CallConv);
3802
3803 Info->setIfReturnsVoid(Outs.empty());
// A shader that returns nothing ends the wave instead of returning.
3804 bool IsWaveEnd = Info->returnsVoid() && IsShader;
3805
3806 // CCValAssign - represent the assignment of the return value to a location.
3808
3809 // CCState - Info about the registers and stack slots.
3810 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
3811 *DAG.getContext());
3812
3813 // Analyze outgoing return values.
3814 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
3815
3816 SDValue Glue;
3818 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
3819
3820 SDValue ReadFirstLane =
3821 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
3822 // Copy the result values into the output registers.
3823 for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
3824 ++I, ++RealRVLocIdx) {
3825 CCValAssign &VA = RVLocs[I];
3826 assert(VA.isRegLoc() && "Can only return in registers!");
3827 // TODO: Partially return in registers if return values don't fit.
3828 SDValue Arg = OutVals[RealRVLocIdx];
3829
3830 // Copied from other backends.
3831 switch (VA.getLocInfo()) {
3832 case CCValAssign::Full:
3833 break;
3834 case CCValAssign::BCvt:
3835 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
3836 break;
3837 case CCValAssign::SExt:
3838 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
3839 break;
3840 case CCValAssign::ZExt:
3841 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
3842 break;
3843 case CCValAssign::AExt:
3844 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
3845 break;
3846 default:
3847 llvm_unreachable("Unknown loc info!");
3848 }
// Values returned in an SGPR get the readfirstlane intrinsic applied.
// NOTE(review): the start of that statement (listing line 3850) is absent
// from this listing.
3849 if (TRI->isSGPRPhysReg(VA.getLocReg()))
3851 ReadFirstLane, Arg);
// Copies are glued together through Glue.
3852 Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), Arg, Glue);
3853 Glue = Chain.getValue(1);
3854 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3855 }
3856
3857 // FIXME: Does sret work properly?
// Non-entry functions also list the registers returned by
// getCalleeSavedRegsViaCopy as operands of the return node.
3858 if (!Info->isEntryFunction()) {
3859 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3860 const MCPhysReg *I =
3861 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3862 if (I) {
3863 for (; *I; ++I) {
3864 if (AMDGPU::SReg_64RegClass.contains(*I))
3865 RetOps.push_back(DAG.getRegister(*I, MVT::i64));
3866 else if (AMDGPU::SReg_32RegClass.contains(*I))
3867 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3868 else
3869 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3870 }
3871 }
3872 }
3873
3874 // Update chain and glue.
3875 RetOps[0] = Chain;
3876 if (Glue.getNode())
3877 RetOps.push_back(Glue);
3878
// Opcode choice: ENDPGM for a wave end; otherwise WHOLE_WAVE_RETURN for whole
// wave functions, RETURN_TO_EPILOG for shaders, and RET_GLUE for the rest.
3879 unsigned Opc = AMDGPUISD::ENDPGM;
3880 if (!IsWaveEnd)
3881 Opc = Info->isWholeWaveFunction() ? AMDGPUISD::WHOLE_WAVE_RETURN
3882 : IsShader ? AMDGPUISD::RETURN_TO_EPILOG
3883 : AMDGPUISD::RET_GLUE;
3884 return DAG.getNode(Opc, DL, MVT::Other, RetOps);
3885}
3886
// Copies the values returned by a call out of their assigned physical
// registers, converts each from its location type to its value type, appends
// the results to InVals, and returns the updated chain.
//
// NOTE(review): listing lines 3887 (function name) and 3895 (declaration of
// RVLocs) are absent from this listing. IsThisReturn and ThisVal are not
// referenced in the visible body.
3888 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool IsVarArg,
3889 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
3890 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool IsThisReturn,
3891 SDValue ThisVal) const {
3892 CCAssignFn *RetCC = CCAssignFnForReturn(CallConv, IsVarArg);
3893
3894 // Assign locations to each value returned by this call.
3896 CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
3897 *DAG.getContext());
3898 CCInfo.AnalyzeCallResult(Ins, RetCC);
3899
3900 // Copy all of the result registers out of their specified physreg.
3901 for (CCValAssign VA : RVLocs) {
3902 SDValue Val;
3903
// Each copy threads both the chain and the glue from the previous one.
3904 if (VA.isRegLoc()) {
3905 Val =
3906 DAG.getCopyFromReg(Chain, DL, VA.getLocReg(), VA.getLocVT(), InGlue);
3907 Chain = Val.getValue(1);
3908 InGlue = Val.getValue(2);
3909 } else if (VA.isMemLoc()) {
3910 report_fatal_error("TODO: return values in memory");
3911 } else
3912 llvm_unreachable("unknown argument location type");
3913
// Undo the extension applied on the callee side: extended values are
// truncated back to the value type, with an Assert[SZ]ext node recording the
// known extension for the zext/sext cases.
3914 switch (VA.getLocInfo()) {
3915 case CCValAssign::Full:
3916 break;
3917 case CCValAssign::BCvt:
3918 Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val);
3919 break;
3920 case CCValAssign::ZExt:
3921 Val = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Val,
3922 DAG.getValueType(VA.getValVT()));
3923 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3924 break;
3925 case CCValAssign::SExt:
3926 Val = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Val,
3927 DAG.getValueType(VA.getValVT()));
3928 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3929 break;
3930 case CCValAssign::AExt:
3931 Val = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Val);
3932 break;
3933 default:
3934 llvm_unreachable("Unknown loc info!");
3935 }
3936
3937 InVals.push_back(Val);
3938 }
3939
3940 return Chain;
3941}
3942
3943// Add code to pass special inputs required depending on used features separate
3944// from the explicit user arguments present in the IR.
3946 CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info,
3947 SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass,
3948 SmallVectorImpl<SDValue> &MemOpChains, SDValue Chain) const {
3949 // If we don't have a call site, this was a call inserted by
3950 // legalization. These can never use special inputs.
3951 if (!CLI.CB)
3952 return;
3953
3954 SelectionDAG &DAG = CLI.DAG;
3955 const SDLoc &DL = CLI.DL;
3956 const Function &F = DAG.getMachineFunction().getFunction();
3957
3958 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
3959 const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
3960
3961 const AMDGPUFunctionArgInfo &CalleeArgInfo =
3963
3964 // TODO: Unify with private memory register handling. This is complicated by
3965 // the fact that at least in kernels, the input argument is not necessarily
3966 // in the same location as the input.
3967 // clang-format off
3968 static constexpr std::pair<AMDGPUFunctionArgInfo::PreloadedValue,
3969 std::array<StringLiteral, 2>> ImplicitAttrs[] = {
3970 {AMDGPUFunctionArgInfo::DISPATCH_PTR, {"amdgpu-no-dispatch-ptr", ""}},
3971 {AMDGPUFunctionArgInfo::QUEUE_PTR, {"amdgpu-no-queue-ptr", ""}},
3972 {AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR, {"amdgpu-no-implicitarg-ptr", ""}},
3973 {AMDGPUFunctionArgInfo::DISPATCH_ID, {"amdgpu-no-dispatch-id", ""}},
3974 {AMDGPUFunctionArgInfo::WORKGROUP_ID_X, {"amdgpu-no-workgroup-id-x", "amdgpu-no-cluster-id-x"}},
3975 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Y, {"amdgpu-no-workgroup-id-y", "amdgpu-no-cluster-id-y"}},
3976 {AMDGPUFunctionArgInfo::WORKGROUP_ID_Z, {"amdgpu-no-workgroup-id-z", "amdgpu-no-cluster-id-z"}},
3977 {AMDGPUFunctionArgInfo::LDS_KERNEL_ID, {"amdgpu-no-lds-kernel-id", ""}},
3978 };
3979 // clang-format on
3980
3981 for (auto [InputID, Attrs] : ImplicitAttrs) {
3982 // If the callee does not use the attribute value, skip copying the value.
3983 if (all_of(Attrs, [&](StringRef Attr) {
3984 return Attr.empty() || CLI.CB->hasFnAttr(Attr);
3985 }))
3986 continue;
3987
3988 const auto [OutgoingArg, ArgRC, ArgTy] =
3989 CalleeArgInfo.getPreloadedValue(InputID);
3990 if (!OutgoingArg)
3991 continue;
3992
3993 const auto [IncomingArg, IncomingArgRC, Ty] =
3994 CallerArgInfo.getPreloadedValue(InputID);
3995 assert(IncomingArgRC == ArgRC);
3996
3997 // All special arguments are ints for now.
3998 EVT ArgVT = TRI->getSpillSize(*ArgRC) == 8 ? MVT::i64 : MVT::i32;
3999 SDValue InputReg;
4000
4001 if (IncomingArg) {
4002 InputReg = loadInputValue(DAG, ArgRC, ArgVT, DL, *IncomingArg);
4003 } else if (InputID == AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR) {
4004 // The implicit arg ptr is special because it doesn't have a corresponding
4005 // input for kernels, and is computed from the kernarg segment pointer.
4006 InputReg = getImplicitArgPtr(DAG, DL);
4007 } else if (InputID == AMDGPUFunctionArgInfo::LDS_KERNEL_ID) {
4008 std::optional<uint32_t> Id =
4010 if (Id.has_value()) {
4011 InputReg = DAG.getConstant(*Id, DL, ArgVT);
4012 } else {
4013 InputReg = DAG.getPOISON(ArgVT);
4014 }
4015 } else {
4016 // We may have proven the input wasn't needed, although the ABI is
4017 // requiring it. We just need to allocate the register appropriately.
4018 InputReg = DAG.getPOISON(ArgVT);
4019 }
4020
4021 if (OutgoingArg->isRegister()) {
4022 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4023 if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
4024 report_fatal_error("failed to allocate implicit input argument");
4025 } else {
4026 unsigned SpecialArgOffset =
4027 CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
4028 SDValue ArgStore =
4029 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
4030 MemOpChains.push_back(ArgStore);
4031 }
4032 }
4033
4034 // Pack workitem IDs into a single register or pass it as is if already
4035 // packed.
4036
4037 auto [OutgoingArg, ArgRC, Ty] =
4039 if (!OutgoingArg)
4040 std::tie(OutgoingArg, ArgRC, Ty) =
4042 if (!OutgoingArg)
4043 std::tie(OutgoingArg, ArgRC, Ty) =
4045 if (!OutgoingArg)
4046 return;
4047
4048 const ArgDescriptor *IncomingArgX = std::get<0>(
4050 const ArgDescriptor *IncomingArgY = std::get<0>(
4052 const ArgDescriptor *IncomingArgZ = std::get<0>(
4054
4055 SDValue InputReg;
4056 SDLoc SL;
4057
4058 const bool NeedWorkItemIDX = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-x");
4059 const bool NeedWorkItemIDY = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-y");
4060 const bool NeedWorkItemIDZ = !CLI.CB->hasFnAttr("amdgpu-no-workitem-id-z");
4061
4062 // If incoming ids are not packed we need to pack them.
4063 if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX &&
4064 NeedWorkItemIDX) {
4065 if (Subtarget->getMaxWorkitemID(F, 0) != 0) {
4066 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
4067 } else {
4068 InputReg = DAG.getConstant(0, DL, MVT::i32);
4069 }
4070 }
4071
4072 if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY &&
4073 NeedWorkItemIDY && Subtarget->getMaxWorkitemID(F, 1) != 0) {
4074 SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
4075 Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
4076 DAG.getShiftAmountConstant(10, MVT::i32, SL));
4077 InputReg = InputReg.getNode()
4078 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y)
4079 : Y;
4080 }
4081
4082 if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ &&
4083 NeedWorkItemIDZ && Subtarget->getMaxWorkitemID(F, 2) != 0) {
4084 SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
4085 Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
4086 DAG.getShiftAmountConstant(20, MVT::i32, SL));
4087 InputReg = InputReg.getNode()
4088 ? DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Z)
4089 : Z;
4090 }
4091
4092 if (!InputReg && (NeedWorkItemIDX || NeedWorkItemIDY || NeedWorkItemIDZ)) {
4093 if (!IncomingArgX && !IncomingArgY && !IncomingArgZ) {
4094 // We're in a situation where the outgoing function requires the workitem
4095 // ID, but the calling function does not have it (e.g. a graphics function
4096 // calling a C calling convention function). This is illegal, but we need
4097 // to produce something.
4098 InputReg = DAG.getPOISON(MVT::i32);
4099 } else {
4100 // Workitem IDs are already packed; any of the present incoming arguments
4101 // will carry all required fields.
4102 ArgDescriptor IncomingArg =
4103 ArgDescriptor::createArg(IncomingArgX ? *IncomingArgX
4104 : IncomingArgY ? *IncomingArgY
4105 : *IncomingArgZ,
4106 ~0u);
4107 InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, IncomingArg);
4108 }
4109 }
4110
4111 if (OutgoingArg->isRegister()) {
4112 if (InputReg)
4113 RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
4114
4115 CCInfo.AllocateReg(OutgoingArg->getRegister());
4116 } else {
4117 unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
4118 if (InputReg) {
4119 SDValue ArgStore =
4120 storeStackInputValue(DAG, DL, Chain, InputReg, SpecialArgOffset);
4121 MemOpChains.push_back(ArgStore);
4122 }
4123 }
4124}
4125
4127 SDValue Callee, CallingConv::ID CalleeCC, bool IsVarArg,
4129 const SmallVectorImpl<SDValue> &OutVals,
4130 const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
4131 if (AMDGPU::isChainCC(CalleeCC))
4132 return true;
4133
4134 if (!AMDGPU::mayTailCallThisCC(CalleeCC))
4135 return false;
4136
4137 // For a divergent call target, we need to do a waterfall loop over the
4138 // possible callees which precludes us from using a simple jump.
4139 if (Callee->isDivergent())
4140 return false;
4141
4143 const Function &CallerF = MF.getFunction();
4144 CallingConv::ID CallerCC = CallerF.getCallingConv();
4146 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
4147
4148 // Kernels aren't callable, and don't have a live in return address so it
4149 // doesn't make sense to do a tail call with entry functions.
4150 if (!CallerPreserved)
4151 return false;
4152
4153 bool CCMatch = CallerCC == CalleeCC;
4154
4156 if (AMDGPU::canGuaranteeTCO(CalleeCC) && CCMatch)
4157 return true;
4158 return false;
4159 }
4160
4161 // TODO: Can we handle var args?
4162 if (IsVarArg)
4163 return false;
4164
4165 for (const Argument &Arg : CallerF.args()) {
4166 if (Arg.hasByValAttr())
4167 return false;
4168 }
4169
4170 LLVMContext &Ctx = *DAG.getContext();
4171
4172 // Check that the call results are passed in the same way.
4173 if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, Ctx, Ins,
4174 CCAssignFnForCall(CalleeCC, IsVarArg),
4175 CCAssignFnForCall(CallerCC, IsVarArg)))
4176 return false;
4177
4178 // The callee has to preserve all registers the caller needs to preserve.
4179 if (!CCMatch) {
4180 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
4181 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
4182 return false;
4183 }
4184
4185 // Nothing more to check if the callee is taking no arguments.
4186 if (Outs.empty())
4187 return true;
4188
4190 CCState CCInfo(CalleeCC, IsVarArg, MF, ArgLocs, Ctx);
4191
4192 // FIXME: We are not allocating special input registers, so we will be
4193 // deciding based on incorrect register assignments.
4194 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, IsVarArg));
4195
4196 const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
4197 // If the stack arguments for this call do not fit into our own save area then
4198 // the call cannot be made tail.
4199 // TODO: Is this really necessary?
4200 if (CCInfo.getStackSize() > FuncInfo->getBytesInStackArgArea())
4201 return false;
4202
4203 for (const auto &[CCVA, ArgVal] : zip_equal(ArgLocs, OutVals)) {
4204 // FIXME: What about inreg arguments that end up passed in memory?
4205 if (!CCVA.isRegLoc())
4206 continue;
4207
4208 // If we are passing an argument in an SGPR, and the value is divergent,
4209 // this call requires a waterfall loop.
4210 if (ArgVal->isDivergent() && TRI->isSGPRPhysReg(CCVA.getLocReg())) {
4211 LLVM_DEBUG(
4212 dbgs() << "Cannot tail call due to divergent outgoing argument in "
4213 << printReg(CCVA.getLocReg(), TRI) << '\n');
4214 return false;
4215 }
4216 }
4217
4218 const MachineRegisterInfo &MRI = MF.getRegInfo();
4219 return parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals);
4220}
4221
4223 if (!CI->isTailCall())
4224 return false;
4225
4226 const Function *ParentFn = CI->getFunction();
4228 return false;
4229 return true;
4230}
4231
4232namespace {
4233// Indices of the special arguments of a chain call. These are tagging along
4234// at the end of the arguments list(s), after the SGPR and VGPR arguments
4235// (index 0 and 1 respectively), which is why the first special index is 2.
4236enum ChainCallArgIdx {
4237 Exec = 2, // Value to put in EXEC; must be a wavefront-size-wide integer.
4238 Flags, // Constant flags; if zero, no further special arguments are allowed.
4239 NumVGPRs, // Required when Flags has only bit 0 set (dynamic VGPR mode).
4240 FallbackExec, // Required together with NumVGPRs in dynamic VGPR mode.
4241 FallbackCallee // Last special argument; also required in dynamic VGPR mode.
4242};
4243} // anonymous namespace
4244
4245// The wave scratch offset register is used as the global base pointer.
4247 SmallVectorImpl<SDValue> &InVals) const {
4248 CallingConv::ID CallConv = CLI.CallConv;
4249 bool IsChainCallConv = AMDGPU::isChainCC(CallConv);
4250
4251 SelectionDAG &DAG = CLI.DAG;
4252
4253 const SDLoc &DL = CLI.DL;
4254 SDValue Chain = CLI.Chain;
4255 SDValue Callee = CLI.Callee;
4256
4257 llvm::SmallVector<SDValue, 6> ChainCallSpecialArgs;
4258 bool UsesDynamicVGPRs = false;
4259 if (IsChainCallConv) {
4260 // The last arguments should be the value that we need to put in EXEC,
4261 // followed by the flags and any other arguments with special meanings.
4262 // Pop them out of CLI.Outs and CLI.OutVals before we do any processing so
4263 // we don't treat them like the "real" arguments.
4264 auto RequestedExecIt =
4265 llvm::find_if(CLI.Outs, [](const ISD::OutputArg &Arg) {
4266 return Arg.OrigArgIndex == 2;
4267 });
4268 assert(RequestedExecIt != CLI.Outs.end() && "No node for EXEC");
4269
4270 size_t SpecialArgsBeginIdx = RequestedExecIt - CLI.Outs.begin();
4271 CLI.OutVals.erase(CLI.OutVals.begin() + SpecialArgsBeginIdx,
4272 CLI.OutVals.end());
4273 CLI.Outs.erase(RequestedExecIt, CLI.Outs.end());
4274
4275 assert(CLI.Outs.back().OrigArgIndex < 2 &&
4276 "Haven't popped all the special args");
4277
4278 TargetLowering::ArgListEntry RequestedExecArg =
4279 CLI.Args[ChainCallArgIdx::Exec];
4280 if (!RequestedExecArg.Ty->isIntegerTy(Subtarget->getWavefrontSize()))
4281 return lowerUnhandledCall(CLI, InVals, "Invalid value for EXEC");
4282
4283 // Convert constants into TargetConstants, so they become immediate operands
4284 // instead of being selected into S_MOV.
4285 auto PushNodeOrTargetConstant = [&](TargetLowering::ArgListEntry Arg) {
4286 if (const auto *ArgNode = dyn_cast<ConstantSDNode>(Arg.Node)) {
4287 ChainCallSpecialArgs.push_back(DAG.getTargetConstant(
4288 ArgNode->getAPIntValue(), DL, ArgNode->getValueType(0)));
4289 } else
4290 ChainCallSpecialArgs.push_back(Arg.Node);
4291 };
4292
4293 PushNodeOrTargetConstant(RequestedExecArg);
4294
4295 // Process any other special arguments depending on the value of the flags.
4296 TargetLowering::ArgListEntry Flags = CLI.Args[ChainCallArgIdx::Flags];
4297
4298 const APInt &FlagsValue = cast<ConstantSDNode>(Flags.Node)->getAPIntValue();
4299 if (FlagsValue.isZero()) {
4300 if (CLI.Args.size() > ChainCallArgIdx::Flags + 1)
4301 return lowerUnhandledCall(CLI, InVals,
4302 "no additional args allowed if flags == 0");
4303 } else if (FlagsValue.isOneBitSet(0)) {
4304 if (CLI.Args.size() != ChainCallArgIdx::FallbackCallee + 1) {
4305 return lowerUnhandledCall(CLI, InVals, "expected 3 additional args");
4306 }
4307
4308 if (!Subtarget->isWave32()) {
4309 return lowerUnhandledCall(
4310 CLI, InVals, "dynamic VGPR mode is only supported for wave32");
4311 }
4312
4313 UsesDynamicVGPRs = true;
4314 std::for_each(CLI.Args.begin() + ChainCallArgIdx::NumVGPRs,
4315 CLI.Args.end(), PushNodeOrTargetConstant);
4316 }
4317 }
4318
4320 SmallVector<SDValue, 32> &OutVals = CLI.OutVals;
4322 bool &IsTailCall = CLI.IsTailCall;
4323 bool IsVarArg = CLI.IsVarArg;
4324 bool IsSibCall = false;
4326
4327 if (Callee.isUndef() || isNullConstant(Callee)) {
4328 if (!CLI.IsTailCall) {
4329 for (ISD::InputArg &Arg : CLI.Ins)
4330 InVals.push_back(DAG.getPOISON(Arg.VT));
4331 }
4332
4333 return Chain;
4334 }
4335
4336 if (IsVarArg) {
4337 return lowerUnhandledCall(CLI, InVals,
4338 "unsupported call to variadic function ");
4339 }
4340
4341 if (!CLI.CB)
4342 return lowerUnhandledCall(CLI, InVals, "unsupported libcall legalization");
4343
4344 if (IsTailCall && MF.getTarget().Options.GuaranteedTailCallOpt) {
4345 return lowerUnhandledCall(CLI, InVals,
4346 "unsupported required tail call to function ");
4347 }
4348
4349 if (IsTailCall) {
4350 IsTailCall = isEligibleForTailCallOptimization(Callee, CallConv, IsVarArg,
4351 Outs, OutVals, Ins, DAG);
4352 if (!IsTailCall &&
4353 ((CLI.CB && CLI.CB->isMustTailCall()) || IsChainCallConv)) {
4354 report_fatal_error("failed to perform tail call elimination on a call "
4355 "site marked musttail or on llvm.amdgcn.cs.chain");
4356 }
4357
4358 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4359
4360 // A sibling call is one where we're under the usual C ABI and not planning
4361 // to change that but can still do a tail call:
4362 if (!TailCallOpt && IsTailCall)
4363 IsSibCall = true;
4364
4365 if (IsTailCall)
4366 ++NumTailCalls;
4367 }
4368
4371 SmallVector<SDValue, 8> MemOpChains;
4372
4373 // Analyze operands of the call, assigning locations to each operand.
4375 CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
4376 CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
4377
4378 if (CallConv != CallingConv::AMDGPU_Gfx && !AMDGPU::isChainCC(CallConv) &&
4380 // With a fixed ABI, allocate fixed registers before user arguments.
4381 passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
4382 }
4383
4384 // Mark the scratch resource descriptor as allocated so the CC analysis
4385 // does not assign user arguments to these registers, matching the callee.
4386 if (!Subtarget->hasFlatScratchEnabled())
4387 CCInfo.AllocateReg(Info->getScratchRSrcReg());
4388
4389 CCInfo.AnalyzeCallOperands(Outs, AssignFn);
4390
4391 // Get a count of how many bytes are to be pushed on the stack.
4392 unsigned NumBytes = CCInfo.getStackSize();
4393
4394 if (IsSibCall) {
4395 // Since we're not changing the ABI to make this a tail call, the memory
4396 // operands are already available in the caller's incoming argument space.
4397 NumBytes = 0;
4398 }
4399
4400 // FPDiff is the byte offset of the call's argument area from the callee's.
4401 // Stores to callee stack arguments will be placed in FixedStackSlots offset
4402 // by this amount for a tail call. In a sibling call it must be 0 because the
4403 // caller will deallocate the entire stack and the callee still expects its
4404 // arguments to begin at SP+0. Completely unused for non-tail calls.
4405 int32_t FPDiff = 0;
4406 MachineFrameInfo &MFI = MF.getFrameInfo();
4407 auto *TRI = Subtarget->getRegisterInfo();
4408
4409 // Adjust the stack pointer for the new arguments...
4410 // These operations are automatically eliminated by the prolog/epilog pass
4411 if (!IsSibCall)
4412 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
4413
4414 if (!IsSibCall || IsChainCallConv) {
4415 if (!Subtarget->hasFlatScratchEnabled()) {
4416 SmallVector<SDValue, 4> CopyFromChains;
4417
4418 // In the HSA case, this should be an identity copy.
4419 SDValue ScratchRSrcReg =
4420 DAG.getCopyFromReg(Chain, DL, Info->getScratchRSrcReg(), MVT::v4i32);
4421 RegsToPass.emplace_back(IsChainCallConv
4422 ? AMDGPU::SGPR48_SGPR49_SGPR50_SGPR51
4423 : AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3,
4424 ScratchRSrcReg);
4425 CopyFromChains.push_back(ScratchRSrcReg.getValue(1));
4426 Chain = DAG.getTokenFactor(DL, CopyFromChains);
4427 }
4428 }
4429
4430 const unsigned NumSpecialInputs = RegsToPass.size();
4431
4432 MVT PtrVT = MVT::i32;
4433
4434 // Walk the register/memloc assignments, inserting copies/loads.
4435 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4436 CCValAssign &VA = ArgLocs[i];
4437 SDValue Arg = OutVals[i];
4438
4439 // Promote the value if needed.
4440 switch (VA.getLocInfo()) {
4441 case CCValAssign::Full:
4442 break;
4443 case CCValAssign::BCvt:
4444 Arg = DAG.getNode(ISD::BITCAST, DL, VA.getLocVT(), Arg);
4445 break;
4446 case CCValAssign::ZExt:
4447 Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
4448 break;
4449 case CCValAssign::SExt:
4450 Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
4451 break;
4452 case CCValAssign::AExt:
4453 Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
4454 break;
4455 case CCValAssign::FPExt:
4456 Arg = DAG.getNode(ISD::FP_EXTEND, DL, VA.getLocVT(), Arg);
4457 break;
4458 default:
4459 llvm_unreachable("Unknown loc info!");
4460 }
4461
4462 if (VA.isRegLoc()) {
4463 RegsToPass.push_back(std::pair(VA.getLocReg(), Arg));
4464 } else {
4465 assert(VA.isMemLoc());
4466
4467 SDValue DstAddr;
4468 MachinePointerInfo DstInfo;
4469
4470 unsigned LocMemOffset = VA.getLocMemOffset();
4471 int32_t Offset = LocMemOffset;
4472
4473 SDValue PtrOff = DAG.getConstant(Offset, DL, PtrVT);
4474 MaybeAlign Alignment;
4475
4476 if (IsTailCall) {
4477 ISD::ArgFlagsTy Flags = Outs[i].Flags;
4478 unsigned OpSize = Flags.isByVal() ? Flags.getByValSize()
4479 : VA.getValVT().getStoreSize();
4480
4481 // FIXME: We can have better than the minimum byval required alignment.
4482 Alignment =
4483 Flags.isByVal()
4484 ? Flags.getNonZeroByValAlign()
4485 : commonAlignment(Subtarget->getStackAlignment(), Offset);
4486
4487 Offset = Offset + FPDiff;
4488 int FI = MFI.CreateFixedObject(OpSize, Offset, true);
4489
4490 DstAddr = DAG.getFrameIndex(FI, PtrVT);
4491 DstInfo = MachinePointerInfo::getFixedStack(MF, FI);
4492
4493 // Make sure any stack arguments overlapping with where we're storing
4494 // are loaded before this eventual operation. Otherwise they'll be
4495 // clobbered.
4496
4497 // FIXME: Why is this really necessary? This seems to just result in a
4498 // lot of code to copy the stack and write them back to the same
4499 // locations, which are supposed to be immutable?
4500 Chain = addTokenForArgument(Chain, DAG, MFI, FI);
4501 } else {
4502 // Stores to the argument stack area are relative to the stack pointer.
4503 SDValue SP = DAG.getCopyFromReg(Chain, DL, Info->getStackPtrOffsetReg(),
4504 MVT::i32);
4505 DstAddr = DAG.getNode(ISD::ADD, DL, MVT::i32, SP, PtrOff);
4506 DstInfo = MachinePointerInfo::getStack(MF, LocMemOffset);
4507 Alignment =
4508 commonAlignment(Subtarget->getStackAlignment(), LocMemOffset);
4509 }
4510
4511 if (Outs[i].Flags.isByVal()) {
4512 SDValue SizeNode =
4513 DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
4514 SDValue Cpy =
4515 DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
4516 Outs[i].Flags.getNonZeroByValAlign(),
4517 Outs[i].Flags.getNonZeroByValAlign(),
4518 /*isVol = */ false, /*AlwaysInline = */ true,
4519 /*CI=*/nullptr, std::nullopt, DstInfo,
4521
4522 MemOpChains.push_back(Cpy);
4523 } else {
4524 SDValue Store =
4525 DAG.getStore(Chain, DL, Arg, DstAddr, DstInfo, Alignment);
4526 MemOpChains.push_back(Store);
4527 }
4528 }
4529 }
4530
4531 if (!MemOpChains.empty())
4532 Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
4533
4534 SDValue ReadFirstLaneID =
4535 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4536
4537 SDValue TokenGlue;
4538 if (CLI.ConvergenceControlToken) {
4539 TokenGlue = DAG.getNode(ISD::CONVERGENCECTRL_GLUE, DL, MVT::Glue,
4541 }
4542
4543 // Build a sequence of copy-to-reg nodes chained together with token chain
4544 // and flag operands which copy the outgoing args into the appropriate regs.
4545 SDValue InGlue;
4546
4547 unsigned ArgIdx = 0;
4548 for (auto [Reg, Val] : RegsToPass) {
4549 if (ArgIdx++ >= NumSpecialInputs &&
4550 (IsChainCallConv || !Val->isDivergent()) && TRI->isSGPRPhysReg(Reg)) {
4551 // For chain calls, the inreg arguments are required to be
4552 // uniform. Speculatively insert a readfirstlane in case we cannot prove
4553 // they are uniform.
4554 //
4555 // For other calls, if an inreg argument is known to be uniform,
4556 // speculatively insert a readfirstlane in case it is in a VGPR.
4557 //
4558 // FIXME: We need to execute this in a waterfall loop if it is a divergent
4559 // value, so let that continue to produce invalid code.
4560
4561 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Val});
4562 if (TokenGlue)
4563 ReadfirstlaneArgs.push_back(TokenGlue);
4565 ReadfirstlaneArgs);
4566 }
4567
4568 Chain = DAG.getCopyToReg(Chain, DL, Reg, Val, InGlue);
4569 InGlue = Chain.getValue(1);
4570 }
4571
4572 // We don't usually want to end the call-sequence here because we would tidy
4573 // the frame up *after* the call, however in the ABI-changing tail-call case
4574 // we've carefully laid out the parameters so that when sp is reset they'll be
4575 // in the correct location.
4576 if (IsTailCall && !IsSibCall) {
4577 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, 0, InGlue, DL);
4578 InGlue = Chain.getValue(1);
4579 }
4580
4581 std::vector<SDValue> Ops({Chain});
4582
4583 // Add a redundant copy of the callee global which will not be legalized, as
4584 // we need direct access to the callee later.
4586 const GlobalValue *GV = GSD->getGlobal();
4587 Ops.push_back(Callee);
4588 Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
4589 } else {
4590 if (IsTailCall) {
4591 // isEligibleForTailCallOptimization considered whether the call target is
4592 // divergent, but we may still end up with a uniform value in a VGPR.
4593 // Insert a readfirstlane just in case.
4594 SDValue ReadFirstLaneID =
4595 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
4596
4597 SmallVector<SDValue, 3> ReadfirstlaneArgs({ReadFirstLaneID, Callee});
4598 if (TokenGlue)
4599 ReadfirstlaneArgs.push_back(TokenGlue); // Wire up convergence token.
4600 Callee = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Callee.getValueType(),
4601 ReadfirstlaneArgs);
4602 }
4603
4604 Ops.push_back(Callee);
4605 Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
4606 }
4607
4608 if (IsTailCall) {
4609 // Each tail call may have to adjust the stack by a different amount, so
4610 // this information must travel along with the operation for eventual
4611 // consumption by emitEpilogue.
4612 Ops.push_back(DAG.getTargetConstant(FPDiff, DL, MVT::i32));
4613 }
4614
4615 if (IsChainCallConv)
4616 llvm::append_range(Ops, ChainCallSpecialArgs);
4617
4618 // Add argument registers to the end of the list so that they are known live
4619 // into the call.
4620 for (auto &[Reg, Val] : RegsToPass)
4621 Ops.push_back(DAG.getRegister(Reg, Val.getValueType()));
4622
4623 // Add a register mask operand representing the call-preserved registers.
4624 const uint32_t *Mask = TRI->getCallPreservedMask(MF, CallConv);
4625 assert(Mask && "Missing call preserved mask for calling convention");
4626 Ops.push_back(DAG.getRegisterMask(Mask));
4627
4628 if (SDValue Token = CLI.ConvergenceControlToken) {
4630 GlueOps.push_back(Token);
4631 if (InGlue)
4632 GlueOps.push_back(InGlue);
4633
4634 InGlue = SDValue(DAG.getMachineNode(TargetOpcode::CONVERGENCECTRL_GLUE, DL,
4635 MVT::Glue, GlueOps),
4636 0);
4637 }
4638
4639 if (InGlue)
4640 Ops.push_back(InGlue);
4641
4642 // If we're doing a tail call, use a TC_RETURN here rather than an
4643 // actual call instruction.
4644 if (IsTailCall) {
4645 MFI.setHasTailCall();
4646 unsigned OPC = AMDGPUISD::TC_RETURN;
4647 switch (CallConv) {
4649 OPC = AMDGPUISD::TC_RETURN_GFX;
4650 break;
4653 OPC = UsesDynamicVGPRs ? AMDGPUISD::TC_RETURN_CHAIN_DVGPR
4654 : AMDGPUISD::TC_RETURN_CHAIN;
4655 break;
4656 }
4657
4658 // If the caller is a whole wave function, we need to use a special opcode
4659 // so we can patch up EXEC.
4660 if (Info->isWholeWaveFunction())
4661 OPC = AMDGPUISD::TC_RETURN_GFX_WholeWave;
4662
4663 return DAG.getNode(OPC, DL, MVT::Other, Ops);
4664 }
4665
4666 // Returns a chain and a flag for retval copy to use.
4667 SDValue Call = DAG.getNode(AMDGPUISD::CALL, DL, {MVT::Other, MVT::Glue}, Ops);
4668 Chain = Call.getValue(0);
4669 InGlue = Call.getValue(1);
4670
4671 uint64_t CalleePopBytes = NumBytes;
4672 Chain = DAG.getCALLSEQ_END(Chain, 0, CalleePopBytes, InGlue, DL);
4673 if (!Ins.empty())
4674 InGlue = Chain.getValue(1);
4675
4676 // Handle result values, copying them out of physregs into vregs that we
4677 // return.
4678 return LowerCallResult(Chain, InGlue, CallConv, IsVarArg, Ins, DL, DAG,
4679 InVals, /*IsThisReturn=*/false, SDValue());
4680}
4681
4682// This is similar to the default implementation in ExpandDYNAMIC_STACKALLOC,
4683// except for:
4684// 1. Stack growth direction(default: downwards, AMDGPU: upwards), and
4685// 2. Scale size where, scale = wave-reduction(alloca-size) * wave-size
4687 SelectionDAG &DAG) const {
4688 const MachineFunction &MF = DAG.getMachineFunction();
4690
4691 SDLoc dl(Op);
4692 EVT VT = Op.getValueType();
4693 SDValue Chain = Op.getOperand(0);
4694 Register SPReg = Info->getStackPtrOffsetReg();
4695
4696 // Chain the dynamic stack allocation so that it doesn't modify the stack
4697 // pointer when other instructions are using the stack.
4698 Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
4699
4700 SDValue Size = Op.getOperand(1);
4701 SDValue BaseAddr = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
4702 Align Alignment = cast<ConstantSDNode>(Op.getOperand(2))->getAlignValue();
4703
4704 const TargetFrameLowering *TFL = Subtarget->getFrameLowering();
4706 "Stack grows upwards for AMDGPU");
4707
4708 Chain = BaseAddr.getValue(1);
4709 // When using flat-scratch, the stack offset is unscaled.
4710 const bool HasFlatScratch = Subtarget->hasFlatScratchEnabled();
4711 const unsigned WavefrontSizeLog2 = Subtarget->getWavefrontSizeLog2();
4712
4713 Align StackAlign = TFL->getStackAlign();
4714 if (Alignment > StackAlign) {
4715 uint64_t ScaledAlignment = Alignment.value()
4716 << (HasFlatScratch ? 0 : WavefrontSizeLog2);
4717 uint64_t StackAlignMask = ScaledAlignment - 1;
4718 SDValue TmpAddr = DAG.getNode(ISD::ADD, dl, VT, BaseAddr,
4719 DAG.getConstant(StackAlignMask, dl, VT));
4720 BaseAddr = DAG.getNode(ISD::AND, dl, VT, TmpAddr,
4721 DAG.getSignedConstant(-ScaledAlignment, dl, VT));
4722 }
4723
4724 assert(Size.getValueType() == MVT::i32 && "Size must be 32-bit");
4725 SDValue NewSP;
4727 // Increase the stack pointer by the size of the alloca.
4728 // If not using flat-scratch, we have to scale the size by the wave-size.
4729 SDValue ScaledSize =
4730 HasFlatScratch
4731 ? Size
4732 : DAG.getNode(ISD::SHL, dl, VT, Size,
4733 DAG.getConstant(WavefrontSizeLog2, dl, MVT::i32));
4734 NewSP = DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value
4735 } else {
4736 // For dynamic sized alloca, perform wave-wide reduction to get max of
4737 // alloca size(divergent), and then scale it (when not using flat-scratch)
4738 // by wave-size.
4739 SDValue WaveReduction =
4740 DAG.getTargetConstant(Intrinsic::amdgcn_wave_reduce_umax, dl, MVT::i32);
4741 Size = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, WaveReduction,
4742 Size, DAG.getTargetConstant(0, dl, MVT::i32));
4743 SDValue ScaledSize = Size;
4744 if (!HasFlatScratch) {
4745 ScaledSize =
4746 DAG.getNode(ISD::SHL, dl, VT, Size,
4747 DAG.getConstant(WavefrontSizeLog2, dl, MVT::i32));
4748 }
4749 NewSP =
4750 DAG.getNode(ISD::ADD, dl, VT, BaseAddr, ScaledSize); // Value in vgpr.
4751 SDValue ReadFirstLaneID =
4752 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, dl, MVT::i32);
4753 NewSP = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32, ReadFirstLaneID,
4754 NewSP);
4755 }
4756
4757 Chain = DAG.getCopyToReg(Chain, dl, SPReg, NewSP); // Output chain
4758 SDValue CallSeqEnd = DAG.getCALLSEQ_END(Chain, 0, 0, SDValue(), dl);
4759
4760 return DAG.getMergeValues({BaseAddr, CallSeqEnd}, dl);
4761}
4762
4764 if (Op.getValueType() != MVT::i32)
4765 return Op; // Defer to cannot select error.
4766
4768 SDLoc SL(Op);
4769
4770 SDValue CopyFromSP = DAG.getCopyFromReg(Op->getOperand(0), SL, SP, MVT::i32);
4771
4772 // Convert from wave uniform to swizzled vector address. This should protect
4773 // from any edge cases where the stacksave result isn't directly used with
4774 // stackrestore.
4775 SDValue VectorAddress =
4776 DAG.getNode(AMDGPUISD::WAVE_ADDRESS, SL, MVT::i32, CopyFromSP);
4777 return DAG.getMergeValues({VectorAddress, CopyFromSP.getValue(1)}, SL);
4778}
4779
4781 SelectionDAG &DAG) const {
4782 SDLoc SL(Op);
4783 assert(Op.getValueType() == MVT::i32);
4784
4785 uint32_t BothRoundHwReg =
4787 SDValue GetRoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4788
4789 SDValue IntrinID =
4790 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4791 SDValue GetReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, Op->getVTList(),
4792 Op.getOperand(0), IntrinID, GetRoundBothImm);
4793
4794 // There are two rounding modes, one for f32 and one for f64/f16. We only
4795 // report in the standard value range if both are the same.
4796 //
4797 // The raw values also differ from the expected FLT_ROUNDS values. Nearest
4798 // ties away from zero is not supported, and the other values are rotated by
4799 // 1.
4800 //
4801 // If the two rounding modes are not the same, report a target defined value.
4802
4803 // Mode register rounding mode fields:
4804 //
4805 // [1:0] Single-precision round mode.
4806 // [3:2] Double/Half-precision round mode.
4807 //
4808 // 0=nearest even; 1= +infinity; 2= -infinity, 3= toward zero.
4809 //
4810 // Hardware Spec
4811 // Toward-0 3 0
4812 // Nearest Even 0 1
4813 // +Inf 1 2
4814 // -Inf 2 3
4815 // NearestAway0 N/A 4
4816 //
4817 // We have to handle 16 permutations of a 4-bit value, so we create a 64-bit
4818 // table we can index by the raw hardware mode.
4819 //
4820 // (trunc (FltRoundConversionTable >> MODE.fp_round)) & 0xf
4821
4822 SDValue BitTable =
4824
4825 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4826 SDValue RoundModeTimesNumBits =
4827 DAG.getNode(ISD::SHL, SL, MVT::i32, GetReg, Two);
4828
4829 // TODO: We could possibly avoid a 64-bit shift and use a simpler table if we
4830 // knew only one mode was demanded.
4831 SDValue TableValue =
4832 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4833 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4834
4835 SDValue EntryMask = DAG.getConstant(0xf, SL, MVT::i32);
4836 SDValue TableEntry =
4837 DAG.getNode(ISD::AND, SL, MVT::i32, TruncTable, EntryMask);
4838
4839 // There's a gap in the 4-bit encoded table and actual enum values, so offset
4840 // if it's an extended value.
4841 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4842 SDValue IsStandardValue =
4843 DAG.getSetCC(SL, MVT::i1, TableEntry, Four, ISD::SETULT);
4844 SDValue EnumOffset = DAG.getNode(ISD::ADD, SL, MVT::i32, TableEntry, Four);
4845 SDValue Result = DAG.getNode(ISD::SELECT, SL, MVT::i32, IsStandardValue,
4846 TableEntry, EnumOffset);
4847
4848 return DAG.getMergeValues({Result, GetReg.getValue(1)}, SL);
4849}
4850
4852 SelectionDAG &DAG) const {
4853 SDLoc SL(Op);
4854
4855 SDValue NewMode = Op.getOperand(1);
4856 assert(NewMode.getValueType() == MVT::i32);
4857
4858 // Index a table of 4-bit entries mapping from the C FLT_ROUNDS values to the
4859 // hardware MODE.fp_round values.
4860 if (auto *ConstMode = dyn_cast<ConstantSDNode>(NewMode)) {
4861 uint32_t ClampedVal = std::min(
4862 static_cast<uint32_t>(ConstMode->getZExtValue()),
4864 NewMode = DAG.getConstant(
4865 AMDGPU::decodeFltRoundToHWConversionTable(ClampedVal), SL, MVT::i32);
4866 } else {
4867 // If we know the input can only be one of the supported standard modes in
4868 // the range 0-3, we can use a simplified mapping to hardware values.
4869 KnownBits KB = DAG.computeKnownBits(NewMode);
4870 const bool UseReducedTable = KB.countMinLeadingZeros() >= 30;
4871 // The supported standard values are 0-3. The extended values start at 8. We
4872 // need to offset by 4 if the value is in the extended range.
4873
4874 if (UseReducedTable) {
4875 // Only the low 16 bits (four 4-bit entries) are needed for modes 0-3.
4876 SDValue BitTable = DAG.getConstant(
4877 AMDGPU::FltRoundToHWConversionTable & 0xffff, SL, MVT::i32);
4878
4879 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4880 SDValue RoundModeTimesNumBits =
4881 DAG.getNode(ISD::SHL, SL, MVT::i32, NewMode, Two);
4882
4883 NewMode =
4884 DAG.getNode(ISD::SRL, SL, MVT::i32, BitTable, RoundModeTimesNumBits);
4885
4886 // TODO: SimplifyDemandedBits on the setreg source here can likely reduce
4887 // the table extracted bits into inline immediates.
4888 } else {
4889 // table_index = umin(value, value - 4)
4890 // MODE.fp_round = (bit_table >> (table_index << 2)) & 0xf
4891 SDValue BitTable =
4893
4894 SDValue Four = DAG.getConstant(4, SL, MVT::i32);
4895 SDValue OffsetEnum = DAG.getNode(ISD::SUB, SL, MVT::i32, NewMode, Four);
4896 SDValue IndexVal =
4897 DAG.getNode(ISD::UMIN, SL, MVT::i32, NewMode, OffsetEnum);
4898
4899 SDValue Two = DAG.getConstant(2, SL, MVT::i32);
4900 SDValue RoundModeTimesNumBits =
4901 DAG.getNode(ISD::SHL, SL, MVT::i32, IndexVal, Two);
4902
4903 SDValue TableValue =
4904 DAG.getNode(ISD::SRL, SL, MVT::i64, BitTable, RoundModeTimesNumBits);
4905 SDValue TruncTable = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, TableValue);
4906
4907 // No need to mask out the high bits since the setreg will ignore them
4908 // anyway.
4909 NewMode = TruncTable;
4910 }
4911
4912 // Insert a readfirstlane in case the value is a VGPR. We could do this
4913 // earlier and keep more operations scalar, but that interferes with
4914 // combining the source.
4915 SDValue ReadFirstLaneID =
4916 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
4917 NewMode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
4918 ReadFirstLaneID, NewMode);
4919 }
4920
4921 // N.B. The setreg will be later folded into s_round_mode on supported
4922 // targets.
4923 SDValue IntrinID =
4924 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
4925 uint32_t BothRoundHwReg =
4927 SDValue RoundBothImm = DAG.getTargetConstant(BothRoundHwReg, SL, MVT::i32);
4928
4929 SDValue SetReg =
4930 DAG.getNode(ISD::INTRINSIC_VOID, SL, Op->getVTList(), Op.getOperand(0),
4931 IntrinID, RoundBothImm, NewMode);
4932
4933 return SetReg;
4934}
4935
4937 if (Op->isDivergent() &&
4938 (!Subtarget->hasVmemPrefInsts() || !Op.getConstantOperandVal(4)))
4939 // Cannot do I$ prefetch with divergent pointer.
4940 return SDValue();
4941
4942 switch (cast<MemSDNode>(Op)->getAddressSpace()) {
4946 break;
4948 if (Subtarget->hasSafeSmemPrefetch())
4949 break;
4950 [[fallthrough]];
4951 default:
4952 return SDValue();
4953 }
4954
4955 // I$ prefetch
4956 if (!Subtarget->hasSafeSmemPrefetch() && !Op.getConstantOperandVal(4))
4957 return SDValue();
4958
4959 return Op;
4960}
4961
4962 // Work around DAG legality rules only based on the result type.
//
// Handles an FP extend whose source scalar type is bf16. Any other source type
// is returned unchanged. For bf16, the source is bitcast to the integer type
// of the same shape and converted with ISD::BF16_TO_FP to the result type.
// The strict variant is not implemented and hits llvm_unreachable.
4964 bool IsStrict = Op.getOpcode() == ISD::STRICT_FP_EXTEND;
// The strict form reads its source from operand 1 instead of operand 0.
// NOTE(review): presumably operand 0 is the chain in the strict form - confirm.
4965 SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
4966 EVT SrcVT = Src.getValueType();
4967
// Nothing to do unless the source element type is bf16.
4968 if (SrcVT.getScalarType() != MVT::bf16)
4969 return Op;
4970
4971 SDLoc SL(Op);
// Reinterpret the bf16 bits as an integer of the same width (and the same
// vector shape, if any).
4972 SDValue BitCast =
4973 DAG.getNode(ISD::BITCAST, SL, SrcVT.changeTypeToInteger(), Src);
4974
4975 EVT DstVT = Op.getValueType();
// There is no strict counterpart of BF16_TO_FP to emit here.
4976 if (IsStrict)
4977 llvm_unreachable("Need STRICT_BF16_TO_FP");
4978
4979 return DAG.getNode(ISD::BF16_TO_FP, SL, DstVT, BitCast);
4980 }
4981
4983 SDLoc SL(Op);
4984 if (Op.getValueType() != MVT::i64)
4985 return Op;
4986
4987 uint32_t ModeHwReg =
4989 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
4990 uint32_t TrapHwReg =
4992 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
4993
4994 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::Other);
4995 SDValue IntrinID =
4996 DAG.getTargetConstant(Intrinsic::amdgcn_s_getreg, SL, MVT::i32);
4997 SDValue GetModeReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
4998 Op.getOperand(0), IntrinID, ModeHwRegImm);
4999 SDValue GetTrapReg = DAG.getNode(ISD::INTRINSIC_W_CHAIN, SL, VTList,
5000 Op.getOperand(0), IntrinID, TrapHwRegImm);
5001 SDValue TokenReg =
5002 DAG.getNode(ISD::TokenFactor, SL, MVT::Other, GetModeReg.getValue(1),
5003 GetTrapReg.getValue(1));
5004
5005 SDValue CvtPtr =
5006 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, GetModeReg, GetTrapReg);
5007 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
5008
5009 return DAG.getMergeValues({Result, TokenReg}, SL);
5010}
5011
5013 SDLoc SL(Op);
5014 if (Op.getOperand(1).getValueType() != MVT::i64)
5015 return Op;
5016
5017 SDValue Input = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op.getOperand(1));
5018 SDValue NewModeReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
5019 DAG.getConstant(0, SL, MVT::i32));
5020 SDValue NewTrapReg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Input,
5021 DAG.getConstant(1, SL, MVT::i32));
5022
5023 SDValue ReadFirstLaneID =
5024 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, SL, MVT::i32);
5025 NewModeReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
5026 ReadFirstLaneID, NewModeReg);
5027 NewTrapReg = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
5028 ReadFirstLaneID, NewTrapReg);
5029
5030 unsigned ModeHwReg =
5032 SDValue ModeHwRegImm = DAG.getTargetConstant(ModeHwReg, SL, MVT::i32);
5033 unsigned TrapHwReg =
5035 SDValue TrapHwRegImm = DAG.getTargetConstant(TrapHwReg, SL, MVT::i32);
5036
5037 SDValue IntrinID =
5038 DAG.getTargetConstant(Intrinsic::amdgcn_s_setreg, SL, MVT::i32);
5039 SDValue SetModeReg =
5040 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
5041 IntrinID, ModeHwRegImm, NewModeReg);
5042 SDValue SetTrapReg =
5043 DAG.getNode(ISD::INTRINSIC_VOID, SL, MVT::Other, Op.getOperand(0),
5044 IntrinID, TrapHwRegImm, NewTrapReg);
5045 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, SetTrapReg, SetModeReg);
5046}
5047
5049 const MachineFunction &MF) const {
5050 const Function &Fn = MF.getFunction();
5051
5053 .Case("m0", AMDGPU::M0)
5054 .Case("exec", AMDGPU::EXEC)
5055 .Case("exec_lo", AMDGPU::EXEC_LO)
5056 .Case("exec_hi", AMDGPU::EXEC_HI)
5057 .Case("flat_scratch", AMDGPU::FLAT_SCR)
5058 .Case("flat_scratch_lo", AMDGPU::FLAT_SCR_LO)
5059 .Case("flat_scratch_hi", AMDGPU::FLAT_SCR_HI)
5060 .Default(Register());
5061 if (!Reg)
5062 return Reg;
5063
5064 if (!Subtarget->hasFlatScrRegister() &&
5065 Subtarget->getRegisterInfo()->regsOverlap(Reg, AMDGPU::FLAT_SCR)) {
5066 Fn.getContext().emitError(Twine("invalid register \"" + StringRef(RegName) +
5067 "\" for subtarget."));
5068 }
5069
5070 switch (Reg) {
5071 case AMDGPU::M0:
5072 case AMDGPU::EXEC_LO:
5073 case AMDGPU::EXEC_HI:
5074 case AMDGPU::FLAT_SCR_LO:
5075 case AMDGPU::FLAT_SCR_HI:
5076 if (VT.getSizeInBits() == 32)
5077 return Reg;
5078 break;
5079 case AMDGPU::EXEC:
5080 case AMDGPU::FLAT_SCR:
5081 if (VT.getSizeInBits() == 64)
5082 return Reg;
5083 break;
5084 default:
5085 llvm_unreachable("missing register type checking");
5086 }
5087
5089 Twine("invalid type for register \"" + StringRef(RegName) + "\"."));
5090}
5091
5092// If kill is not the last instruction, split the block so kill is always a
5093// proper terminator.
5096 MachineBasicBlock *BB) const {
5097 MachineBasicBlock *SplitBB = BB->splitAt(MI, /*UpdateLiveIns=*/true);
5099 MI.setDesc(TII->getKillTerminatorFromPseudo(MI.getOpcode()));
5100 return SplitBB;
5101}
5102
5103 // Split block \p MBB at \p MI, so as to insert a loop. If \p InstInLoop is true,
5104// \p MI will be the only instruction in the loop body block. Otherwise, it will
5105// be the first instruction in the remainder block.
5106//
5107/// \returns { LoopBody, Remainder }
5108static std::pair<MachineBasicBlock *, MachineBasicBlock *>
5110 MachineFunction *MF = MBB.getParent();
5112
5113 // To insert the loop we need to split the block. Move everything after this
5114 // point to a new block, and insert a new empty block between the two.
5116 MachineBasicBlock *RemainderBB = MF->CreateMachineBasicBlock();
5118 ++MBBI;
5119
5120 MF->insert(MBBI, LoopBB);
5121 MF->insert(MBBI, RemainderBB);
5122
5123 LoopBB->addSuccessor(LoopBB);
5124 LoopBB->addSuccessor(RemainderBB);
5125
5126 // Move the rest of the block into a new block.
5127 RemainderBB->transferSuccessorsAndUpdatePHIs(&MBB);
5128
5129 if (InstInLoop) {
5130 auto Next = std::next(I);
5131
5132 // Move instruction to loop body.
5133 LoopBB->splice(LoopBB->begin(), &MBB, I, Next);
5134
5135 // Move the rest of the block.
5136 RemainderBB->splice(RemainderBB->begin(), &MBB, Next, MBB.end());
5137 } else {
5138 RemainderBB->splice(RemainderBB->begin(), &MBB, I, MBB.end());
5139 }
5140
5141 MBB.addSuccessor(LoopBB);
5142
5143 return std::pair(LoopBB, RemainderBB);
5144}
5145
5146/// Insert \p MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
5148 MachineBasicBlock *MBB = MI.getParent();
5150 auto I = MI.getIterator();
5151 auto E = std::next(I);
5152
5153 // clang-format off
5154 BuildMI(*MBB, E, MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
5155 .addImm(0);
5156 // clang-format on
5157
5158 MIBundleBuilder Bundler(*MBB, I, E);
5159 finalizeBundle(*MBB, Bundler.begin());
5160}
5161
5164 MachineBasicBlock *BB) const {
5165 const DebugLoc &DL = MI.getDebugLoc();
5166
5168
5170
5171 // Apparently kill flags are only valid if the def is in the same block?
5172 if (MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::data0))
5173 Src->setIsKill(false);
5174
5175 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, *BB, true);
5176
5177 MachineBasicBlock::iterator I = LoopBB->end();
5178
5179 const unsigned EncodedReg = AMDGPU::Hwreg::HwregEncoding::encode(
5181
5182 // Clear TRAP_STS.MEM_VIOL
5183 BuildMI(*LoopBB, LoopBB->begin(), DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
5184 .addImm(0)
5185 .addImm(EncodedReg);
5186
5188
5189 Register Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5190
5191 // Load and check TRAP_STS.MEM_VIOL
5192 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_GETREG_B32), Reg)
5193 .addImm(EncodedReg);
5194
5195 // FIXME: Do we need to use an isel pseudo that may clobber scc?
5196 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CMP_LG_U32))
5197 .addReg(Reg, RegState::Kill)
5198 .addImm(0);
5199 // clang-format off
5200 BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
5201 .addMBB(LoopBB);
5202 // clang-format on
5203
5204 return RemainderBB;
5205}
5206
5207 // Do a v_movrels_b32 or v_movreld_b32 for each unique value of \p Idx in the
5208// wavefront. If the value is uniform and just happens to be in a VGPR, this
5209// will only do one iteration. In the worst case, this will loop 64 times.
5210//
5211// TODO: Just use v_readlane_b32 if we know the VGPR has a uniform value.
5214 MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB,
5215 const DebugLoc &DL, const MachineOperand &Idx,
5216 unsigned InitReg, unsigned ResultReg, unsigned PhiReg,
5217 unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode,
5218 Register &SGPRIdxReg) {
5219
5220 MachineFunction *MF = OrigBB.getParent();
5221 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5222 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5225
5226 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5227 Register PhiExec = MRI.createVirtualRegister(BoolRC);
5228 Register NewExec = MRI.createVirtualRegister(BoolRC);
5229 Register CurrentIdxReg =
5230 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5231 Register CondReg = MRI.createVirtualRegister(BoolRC);
5232
5233 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiReg)
5234 .addReg(InitReg)
5235 .addMBB(&OrigBB)
5236 .addReg(ResultReg)
5237 .addMBB(&LoopBB);
5238
5239 BuildMI(LoopBB, I, DL, TII->get(TargetOpcode::PHI), PhiExec)
5240 .addReg(InitSaveExecReg)
5241 .addMBB(&OrigBB)
5242 .addReg(NewExec)
5243 .addMBB(&LoopBB);
5244
5245 // Read the next variant <- also loop target.
5246 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), CurrentIdxReg)
5247 .addReg(Idx.getReg(), getUndefRegState(Idx.isUndef()));
5248
5249 // Compare the index just read by V_READFIRSTLANE_B32 against the Idx values.
5250 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
5251 .addReg(CurrentIdxReg)
5252 .addReg(Idx.getReg(), {}, Idx.getSubReg());
5253
5254 // Update EXEC, save the original EXEC value to NewExec.
5255 BuildMI(LoopBB, I, DL, TII->get(LMC.AndSaveExecOpc), NewExec)
5256 .addReg(CondReg, RegState::Kill);
5257
5258 MRI.setSimpleHint(NewExec, CondReg);
5259
5260 if (UseGPRIdxMode) {
5261 if (Offset == 0) {
5262 SGPRIdxReg = CurrentIdxReg;
5263 } else {
5264 SGPRIdxReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
5265 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), SGPRIdxReg)
5266 .addReg(CurrentIdxReg, RegState::Kill)
5267 .addImm(Offset);
5268 }
5269 } else {
5270 // Move index from CurrentIdxReg into M0
5271 if (Offset == 0) {
5272 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5273 .addReg(CurrentIdxReg, RegState::Kill);
5274 } else {
5275 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5276 .addReg(CurrentIdxReg, RegState::Kill)
5277 .addImm(Offset);
5278 }
5279 }
5280
5281 // Update EXEC, switch all done bits to 0 and all todo bits to 1.
5282 MachineInstr *InsertPt =
5283 BuildMI(LoopBB, I, DL, TII->get(LMC.XorTermOpc), LMC.ExecReg)
5284 .addReg(LMC.ExecReg)
5285 .addReg(NewExec);
5286
5287 // XXX - s_xor_b64 sets scc to 1 if the result is nonzero, so can we use
5288 // s_cbranch_scc0?
5289
5290 // Loop back to V_READFIRSTLANE_B32 if there are still variants to cover.
5291 // clang-format off
5292 BuildMI(LoopBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
5293 .addMBB(&LoopBB);
5294 // clang-format on
5295
5296 return InsertPt->getIterator();
5297}
5298
5299// This has slightly sub-optimal regalloc when the source vector is killed by
5300// the read. The register allocator does not understand that the kill is
5301 // per-workitem, so the vector is kept alive for the whole loop and we end up
5302 // not re-using a subregister from it, using 1 more VGPR than necessary. That
5303 // extra VGPR was saved when this was expanded after register allocation.
5306 unsigned InitResultReg, unsigned PhiReg, int Offset,
5307 bool UseGPRIdxMode, Register &SGPRIdxReg) {
5308 MachineFunction *MF = MBB.getParent();
5309 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5310 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5311 MachineRegisterInfo &MRI = MF->getRegInfo();
5312 const DebugLoc &DL = MI.getDebugLoc();
5314
5315 const auto *BoolXExecRC = TRI->getWaveMaskRegClass();
5316 Register DstReg = MI.getOperand(0).getReg();
5317 Register SaveExec = MRI.createVirtualRegister(BoolXExecRC);
5318 Register TmpExec = MRI.createVirtualRegister(BoolXExecRC);
5320
5321 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), TmpExec);
5322
5323 // Save the EXEC mask
5324 // clang-format off
5325 BuildMI(MBB, I, DL, TII->get(LMC.MovOpc), SaveExec)
5326 .addReg(LMC.ExecReg);
5327 // clang-format on
5328
5329 auto [LoopBB, RemainderBB] = splitBlockForLoop(MI, MBB, false);
5330
5331 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5332
5333 auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
5334 InitResultReg, DstReg, PhiReg, TmpExec,
5335 Offset, UseGPRIdxMode, SGPRIdxReg);
5336
5337 MachineBasicBlock *LandingPad = MF->CreateMachineBasicBlock();
5339 ++MBBI;
5340 MF->insert(MBBI, LandingPad);
5341 LoopBB->removeSuccessor(RemainderBB);
5342 LandingPad->addSuccessor(RemainderBB);
5343 LoopBB->addSuccessor(LandingPad);
5344 MachineBasicBlock::iterator First = LandingPad->begin();
5345 // clang-format off
5346 BuildMI(*LandingPad, First, DL, TII->get(LMC.MovOpc), LMC.ExecReg)
5347 .addReg(SaveExec);
5348 // clang-format on
5349
5350 return InsPt;
5351}
5352
5353 // Returns subreg index, offset
//
// The element count is the register size of \p SuperRC in bits divided by 32.
// An in-range \p Offset is folded completely into the returned sub-register
// index (via SIRegisterInfo::getSubRegFromChannel) and the returned offset is
// 0. An out-of-range \p Offset (negative, or >= the element count) is returned
// unchanged, paired with sub0.
// NOTE(review): \p VecReg is not referenced in the visible body.
5354 static std::pair<unsigned, int>
5356 const TargetRegisterClass *SuperRC, unsigned VecReg,
5357 int Offset) {
// Number of 32-bit elements in a register of class SuperRC.
5358 int NumElts = TRI.getRegSizeInBits(*SuperRC) / 32;
5359
5360 // Skip out of bounds offsets, or else we would end up using an undefined
5361 // register.
5362 if (Offset >= NumElts || Offset < 0)
5363 return std::pair(AMDGPU::sub0, Offset);
5364
// In range: the whole offset becomes a static sub-register selection.
5365 return std::pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
5366 }
5367
5370 int Offset) {
5371 MachineBasicBlock *MBB = MI.getParent();
5372 const DebugLoc &DL = MI.getDebugLoc();
5374
5375 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5376
5377 assert(Idx->getReg() != AMDGPU::NoRegister);
5378
5379 if (Offset == 0) {
5380 // clang-format off
5381 BuildMI(*MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::M0)
5382 .add(*Idx);
5383 // clang-format on
5384 } else {
5385 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
5386 .add(*Idx)
5387 .addImm(Offset);
5388 }
5389}
5390
5393 int Offset) {
5394 MachineBasicBlock *MBB = MI.getParent();
5395 const DebugLoc &DL = MI.getDebugLoc();
5397
5398 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5399
5400 if (Offset == 0)
5401 return Idx->getReg();
5402
5403 Register Tmp = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
5404 BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), Tmp)
5405 .add(*Idx)
5406 .addImm(Offset);
5407 return Tmp;
5408}
5409
5412 const GCNSubtarget &ST) {
5413 const SIInstrInfo *TII = ST.getInstrInfo();
5414 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5415 MachineFunction *MF = MBB.getParent();
5416 MachineRegisterInfo &MRI = MF->getRegInfo();
5417
5418 Register Dst = MI.getOperand(0).getReg();
5419 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5420 Register SrcReg = TII->getNamedOperand(MI, AMDGPU::OpName::src)->getReg();
5421 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5422
5423 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcReg);
5424 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5425
5426 unsigned SubReg;
5427 std::tie(SubReg, Offset) =
5428 computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
5429
5430 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5431
5432 // Check for a SGPR index.
5433 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5435 const DebugLoc &DL = MI.getDebugLoc();
5436
5437 if (UseGPRIdxMode) {
5438 // TODO: Look at the uses to avoid the copy. This may require rescheduling
5439 // to avoid interfering with other uses, so probably requires a new
5440 // optimization pass.
5441 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5442
5443 const MCInstrDesc &GPRIDXDesc =
5444 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5445 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5446 .addReg(SrcReg)
5447 .addReg(Idx)
5448 .addImm(SubReg);
5449 } else {
5451
5452 BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5453 .addReg(SrcReg, {}, SubReg)
5454 .addReg(SrcReg, RegState::Implicit);
5455 }
5456
5457 MI.eraseFromParent();
5458
5459 return &MBB;
5460 }
5461
5462 // Control flow needs to be inserted if indexing with a VGPR.
5463 const DebugLoc &DL = MI.getDebugLoc();
5465
5466 Register PhiReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5467 Register InitReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5468
5469 BuildMI(MBB, I, DL, TII->get(TargetOpcode::IMPLICIT_DEF), InitReg);
5470
5471 Register SGPRIdxReg;
5472 auto InsPt = loadM0FromVGPR(TII, MBB, MI, InitReg, PhiReg, Offset,
5473 UseGPRIdxMode, SGPRIdxReg);
5474
5475 MachineBasicBlock *LoopBB = InsPt->getParent();
5476
5477 if (UseGPRIdxMode) {
5478 const MCInstrDesc &GPRIDXDesc =
5479 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), true);
5480
5481 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5482 .addReg(SrcReg)
5483 .addReg(SGPRIdxReg)
5484 .addImm(SubReg);
5485 } else {
5486 BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOVRELS_B32_e32), Dst)
5487 .addReg(SrcReg, {}, SubReg)
5488 .addReg(SrcReg, RegState::Implicit);
5489 }
5490
5491 MI.eraseFromParent();
5492
5493 return LoopBB;
5494}
5495
5498 const GCNSubtarget &ST) {
5499 const SIInstrInfo *TII = ST.getInstrInfo();
5500 const SIRegisterInfo &TRI = TII->getRegisterInfo();
5501 MachineFunction *MF = MBB.getParent();
5502 MachineRegisterInfo &MRI = MF->getRegInfo();
5503
5504 Register Dst = MI.getOperand(0).getReg();
5505 const MachineOperand *SrcVec = TII->getNamedOperand(MI, AMDGPU::OpName::src);
5506 const MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::idx);
5507 const MachineOperand *Val = TII->getNamedOperand(MI, AMDGPU::OpName::val);
5508 int Offset = TII->getNamedOperand(MI, AMDGPU::OpName::offset)->getImm();
5509 const TargetRegisterClass *VecRC = MRI.getRegClass(SrcVec->getReg());
5510 const TargetRegisterClass *IdxRC = MRI.getRegClass(Idx->getReg());
5511
5512 // This can be an immediate, but will be folded later.
5513 assert(Val->getReg());
5514
5515 unsigned SubReg;
5516 std::tie(SubReg, Offset) =
5517 computeIndirectRegAndOffset(TRI, VecRC, SrcVec->getReg(), Offset);
5518 const bool UseGPRIdxMode = ST.useVGPRIndexMode();
5519
5520 if (Idx->getReg() == AMDGPU::NoRegister) {
5522 const DebugLoc &DL = MI.getDebugLoc();
5523
5524 assert(Offset == 0);
5525
5526 BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dst)
5527 .add(*SrcVec)
5528 .add(*Val)
5529 .addImm(SubReg);
5530
5531 MI.eraseFromParent();
5532 return &MBB;
5533 }
5534
5535 // Check for a SGPR index.
5536 if (TII->getRegisterInfo().isSGPRClass(IdxRC)) {
5538 const DebugLoc &DL = MI.getDebugLoc();
5539
5540 if (UseGPRIdxMode) {
5541 Register Idx = getIndirectSGPRIdx(TII, MRI, MI, Offset);
5542
5543 const MCInstrDesc &GPRIDXDesc =
5544 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5545 BuildMI(MBB, I, DL, GPRIDXDesc, Dst)
5546 .addReg(SrcVec->getReg())
5547 .add(*Val)
5548 .addReg(Idx)
5549 .addImm(SubReg);
5550 } else {
5552
5553 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5554 TRI.getRegSizeInBits(*VecRC), 32, false);
5555 BuildMI(MBB, I, DL, MovRelDesc, Dst)
5556 .addReg(SrcVec->getReg())
5557 .add(*Val)
5558 .addImm(SubReg);
5559 }
5560 MI.eraseFromParent();
5561 return &MBB;
5562 }
5563
5564 // Control flow needs to be inserted if indexing with a VGPR.
5565 if (Val->isReg())
5566 MRI.clearKillFlags(Val->getReg());
5567
5568 const DebugLoc &DL = MI.getDebugLoc();
5569
5570 Register PhiReg = MRI.createVirtualRegister(VecRC);
5571
5572 Register SGPRIdxReg;
5573 auto InsPt = loadM0FromVGPR(TII, MBB, MI, SrcVec->getReg(), PhiReg, Offset,
5574 UseGPRIdxMode, SGPRIdxReg);
5575 MachineBasicBlock *LoopBB = InsPt->getParent();
5576
5577 if (UseGPRIdxMode) {
5578 const MCInstrDesc &GPRIDXDesc =
5579 TII->getIndirectGPRIDXPseudo(TRI.getRegSizeInBits(*VecRC), false);
5580
5581 BuildMI(*LoopBB, InsPt, DL, GPRIDXDesc, Dst)
5582 .addReg(PhiReg)
5583 .add(*Val)
5584 .addReg(SGPRIdxReg)
5585 .addImm(SubReg);
5586 } else {
5587 const MCInstrDesc &MovRelDesc = TII->getIndirectRegWriteMovRelPseudo(
5588 TRI.getRegSizeInBits(*VecRC), 32, false);
5589 BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
5590 .addReg(PhiReg)
5591 .add(*Val)
5592 .addImm(SubReg);
5593 }
5594
5595 MI.eraseFromParent();
5596 return LoopBB;
5597}
5598
5600 MachineBasicBlock *BB) {
5601 // For targets older than GFX12, we emit a sequence of 32-bit operations.
5602 // For GFX12, we emit s_add_u64 and s_sub_u64.
5603 MachineFunction *MF = BB->getParent();
5604 const SIInstrInfo *TII = MF->getSubtarget<GCNSubtarget>().getInstrInfo();
5605 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5607 const DebugLoc &DL = MI.getDebugLoc();
5608 MachineOperand &Dest = MI.getOperand(0);
5609 MachineOperand &Src0 = MI.getOperand(1);
5610 MachineOperand &Src1 = MI.getOperand(2);
5611 bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
5612 if (ST.hasScalarAddSub64()) {
5613 unsigned Opc = IsAdd ? AMDGPU::S_ADD_U64 : AMDGPU::S_SUB_U64;
5614 // clang-format off
5615 BuildMI(*BB, MI, DL, TII->get(Opc), Dest.getReg())
5616 .add(Src0)
5617 .add(Src1);
5618 // clang-format on
5619 } else {
5620 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5621 const TargetRegisterClass *BoolRC = TRI->getBoolRC();
5622
5623 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5624 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5625
5626 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5627 MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5628 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5629 MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5630
5631 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5632 MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
5633 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5634 MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
5635
5636 unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
5637 unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
5638 BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
5639 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
5640 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
5641 .addReg(DestSub0)
5642 .addImm(AMDGPU::sub0)
5643 .addReg(DestSub1)
5644 .addImm(AMDGPU::sub1);
5645 }
5646 MI.eraseFromParent();
5647 return BB;
5648}
5649
// Expands \p MI, a select over a register pair, into two V_CNDMASK_B32_e64
// instructions (one for the sub0 half, one for the sub1 half) whose results
// are recombined into the destination with a REG_SEQUENCE. \p MI is erased
// once the replacement sequence has been inserted in front of it.
// Operands read from \p MI: 0 = destination, 1 = src0, 2 = src1,
// 3 = condition register.
5651 MachineFunction *MF = BB->getParent();
5652 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
5653 const SIInstrInfo *TII = ST.getInstrInfo();
5654 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5655 MachineRegisterInfo &MRI = MF->getRegInfo();
5656 const DebugLoc &DL = MI.getDebugLoc();
5657 Register Dst = MI.getOperand(0).getReg();
5658 const MachineOperand &Src0 = MI.getOperand(1);
5659 const MachineOperand &Src1 = MI.getOperand(2);
5660 Register SrcCond = MI.getOperand(3).getReg();
5661
// Temporaries for the two 32-bit results and for a copy of the condition in
// the wave-mask register class.
5662 Register DstLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5663 Register DstHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
5664 const TargetRegisterClass *CondRC = TRI->getWaveMaskRegClass();
5665 Register SrcCondCopy = MRI.createVirtualRegister(CondRC);
5666
// Look up the register classes of the src0/src1 operands from the instruction
// description, narrowed to their allocatable classes.
5667 int Src0Idx =
5668 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
5669 int Src1Idx =
5670 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1);
5671 const TargetRegisterClass *Src0RC =
5672 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src0Idx));
5673 const TargetRegisterClass *Src1RC =
5674 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), Src1Idx));
5675
// NOTE(review): Src0SubRC is derived with sub0 and Src1SubRC with sub1, yet
// each is used for both the sub0 and the sub1 extracts below. Presumably both
// sub-register indices yield the same class - confirm.
5676 const TargetRegisterClass *Src0SubRC =
5677 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
5678 const TargetRegisterClass *Src1SubRC =
5679 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
5680
// Split each source into its sub0 and sub1 halves.
5681 MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
5682 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
5683 MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
5684 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
5685
5686 MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
5687 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
5688 MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
5689 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
5690
// Both selects below read the same copy of the condition.
// NOTE(review): the addImm(0) operands preceding each source are presumably
// the source-modifier fields of V_CNDMASK_B32_e64 - confirm.
5691 BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), SrcCondCopy).addReg(SrcCond);
5692 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstLo)
5693 .addImm(0)
5694 .add(Src0Sub0)
5695 .addImm(0)
5696 .add(Src1Sub0)
5697 .addReg(SrcCondCopy);
5698
5699 BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstHi)
5700 .addImm(0)
5701 .add(Src0Sub1)
5702 .addImm(0)
5703 .add(Src1Sub1)
5704 .addReg(SrcCondCopy);
5705
// Reassemble the two halves into the original destination register.
5706 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE), Dst)
5707 .addReg(DstLo)
5708 .addImm(AMDGPU::sub0)
5709 .addReg(DstHi)
5710 .addImm(AMDGPU::sub1);
5711 MI.eraseFromParent();
5712 }
5713
// Maps a wave-reduction opcode \p Opc to the constant used as that reduction's
// identity value: the opposite extreme of the value range for integer min/max,
// all-zero bits for add/sub/or/xor, all-one bits for and, and the fixed bit
// patterns noted inline for the floating-point opcodes. Both 32-bit and 64-bit
// opcodes are covered; any opcode not listed hits llvm_unreachable.
// NOTE(review): the return type is declared on a line not visible here; the
// 64-bit cases imply it is at least 64 bits wide - confirm how the signed
// 32-bit limits are expected to be widened.
5715 switch (Opc) {
// 32-bit integer min/max.
5716 case AMDGPU::S_MIN_U32:
5717 return std::numeric_limits<uint32_t>::max();
5718 case AMDGPU::S_MIN_I32:
5719 return std::numeric_limits<int32_t>::max();
5720 case AMDGPU::S_MAX_U32:
5721 return std::numeric_limits<uint32_t>::min();
5722 case AMDGPU::S_MAX_I32:
5723 return std::numeric_limits<int32_t>::min();
// 32-bit floating-point add/sub, given as raw bit patterns.
5724 case AMDGPU::V_ADD_F32_e64: // -0.0
5725 return 0x80000000;
5726 case AMDGPU::V_SUB_F32_e64: // +0.0
5727 return 0x0;
// 32-bit integer add/sub and bitwise or/xor: zero.
5728 case AMDGPU::S_ADD_I32:
5729 case AMDGPU::S_SUB_I32:
5730 case AMDGPU::S_OR_B32:
5731 case AMDGPU::S_XOR_B32:
5732 return std::numeric_limits<uint32_t>::min();
// 32-bit bitwise and: all bits set.
5733 case AMDGPU::S_AND_B32:
5734 return std::numeric_limits<uint32_t>::max();
5735 case AMDGPU::V_MIN_F32_e64:
5736 case AMDGPU::V_MAX_F32_e64:
5737 return 0x7fc00000; // qNAN
// 64-bit integer min/max are keyed by the compare opcode that implements them.
5738 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5739 return std::numeric_limits<uint64_t>::max();
5740 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5741 return std::numeric_limits<int64_t>::max();
5742 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5743 return std::numeric_limits<uint64_t>::min();
5744 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5745 return std::numeric_limits<int64_t>::min();
5746 case AMDGPU::V_MIN_F64_e64:
5747 case AMDGPU::V_MAX_F64_e64:
5748 case AMDGPU::V_MIN_NUM_F64_e64:
5749 case AMDGPU::V_MAX_NUM_F64_e64:
5750 return 0x7FF8000000000000; // qNAN
// 64-bit integer add/sub and bitwise or/xor: zero.
5751 case AMDGPU::S_ADD_U64_PSEUDO:
5752 case AMDGPU::S_SUB_U64_PSEUDO:
5753 case AMDGPU::S_OR_B64:
5754 case AMDGPU::S_XOR_B64:
5755 return std::numeric_limits<uint64_t>::min();
// 64-bit bitwise and: all bits set.
5756 case AMDGPU::S_AND_B64:
5757 return std::numeric_limits<uint64_t>::max();
5758 case AMDGPU::V_ADD_F64_e64:
5759 case AMDGPU::V_ADD_F64_pseudo_e64:
5760 return 0x8000000000000000; // -0.0
5761 default:
5762 llvm_unreachable("Unexpected opcode in getIdentityValueForWaveReduction");
5763 }
5764 }
5765
5766static bool is32bitWaveReduceOperation(unsigned Opc) {
5767 return Opc == AMDGPU::S_MIN_U32 || Opc == AMDGPU::S_MIN_I32 ||
5768 Opc == AMDGPU::S_MAX_U32 || Opc == AMDGPU::S_MAX_I32 ||
5769 Opc == AMDGPU::S_ADD_I32 || Opc == AMDGPU::S_SUB_I32 ||
5770 Opc == AMDGPU::S_AND_B32 || Opc == AMDGPU::S_OR_B32 ||
5771 Opc == AMDGPU::S_XOR_B32 || Opc == AMDGPU::V_MIN_F32_e64 ||
5772 Opc == AMDGPU::V_MAX_F32_e64 || Opc == AMDGPU::V_ADD_F32_e64 ||
5773 Opc == AMDGPU::V_SUB_F32_e64;
5774}
5775
// True when \p Opc is one of the floating-point opcodes listed below: F32
// min/max/add/sub, F64 min/max (both the plain and the _NUM forms), and F64
// add (plain and pseudo forms). There is no F64 sub entry in this list.
5777 return Opc == AMDGPU::V_MIN_F32_e64 || Opc == AMDGPU::V_MAX_F32_e64 ||
5778 Opc == AMDGPU::V_ADD_F32_e64 || Opc == AMDGPU::V_SUB_F32_e64 ||
5779 Opc == AMDGPU::V_MIN_F64_e64 || Opc == AMDGPU::V_MAX_F64_e64 ||
5780 Opc == AMDGPU::V_MIN_NUM_F64_e64 || Opc == AMDGPU::V_MAX_NUM_F64_e64 ||
5781 Opc == AMDGPU::V_ADD_F64_e64 || Opc == AMDGPU::V_ADD_F64_pseudo_e64;
5782 }
5783
5784static std::tuple<unsigned, unsigned>
5786 unsigned DPPOpc;
5787 switch (Opc) {
5788 case AMDGPU::S_MIN_U32:
5789 DPPOpc = AMDGPU::V_MIN_U32_dpp;
5790 break;
5791 case AMDGPU::S_MIN_I32:
5792 DPPOpc = AMDGPU::V_MIN_I32_dpp;
5793 break;
5794 case AMDGPU::S_MAX_U32:
5795 DPPOpc = AMDGPU::V_MAX_U32_dpp;
5796 break;
5797 case AMDGPU::S_MAX_I32:
5798 DPPOpc = AMDGPU::V_MAX_I32_dpp;
5799 break;
5800 case AMDGPU::S_ADD_I32:
5801 case AMDGPU::S_SUB_I32:
5802 DPPOpc = ST.hasAddNoCarryInsts() ? AMDGPU::V_ADD_U32_dpp
5803 : AMDGPU::V_ADD_CO_U32_dpp;
5804 break;
5805 case AMDGPU::S_AND_B32:
5806 DPPOpc = AMDGPU::V_AND_B32_dpp;
5807 break;
5808 case AMDGPU::S_OR_B32:
5809 DPPOpc = AMDGPU::V_OR_B32_dpp;
5810 break;
5811 case AMDGPU::S_XOR_B32:
5812 DPPOpc = AMDGPU::V_XOR_B32_dpp;
5813 break;
5814 case AMDGPU::V_ADD_F32_e64:
5815 case AMDGPU::V_SUB_F32_e64:
5816 DPPOpc = AMDGPU::V_ADD_F32_dpp;
5817 break;
5818 case AMDGPU::V_MIN_F32_e64:
5819 DPPOpc = AMDGPU::V_MIN_F32_dpp;
5820 break;
5821 case AMDGPU::V_MAX_F32_e64:
5822 DPPOpc = AMDGPU::V_MAX_F32_dpp;
5823 break;
5824 case AMDGPU::V_CMP_LT_U64_e64: // umin.u64
5825 case AMDGPU::V_CMP_LT_I64_e64: // min.i64
5826 case AMDGPU::V_CMP_GT_U64_e64: // umax.u64
5827 case AMDGPU::V_CMP_GT_I64_e64: // max.i64
5828 case AMDGPU::S_ADD_U64_PSEUDO:
5829 case AMDGPU::S_SUB_U64_PSEUDO:
5830 case AMDGPU::S_AND_B64:
5831 case AMDGPU::S_OR_B64:
5832 case AMDGPU::S_XOR_B64:
5833 case AMDGPU::V_MIN_NUM_F64_e64:
5834 case AMDGPU::V_MIN_F64_e64:
5835 case AMDGPU::V_MAX_NUM_F64_e64:
5836 case AMDGPU::V_MAX_F64_e64:
5837 case AMDGPU::V_ADD_F64_pseudo_e64:
5838 case AMDGPU::V_ADD_F64_e64:
5839 DPPOpc = AMDGPU::V_MOV_B64_DPP_PSEUDO;
5840 break;
5841 default:
5842 llvm_unreachable("unhandled lane op");
5843 }
5844 unsigned ClampOpc = Opc;
5845 if (!ST.getInstrInfo()->isVALU(Opc, /*AllowLDSDMA=*/true)) {
5846 if (Opc == AMDGPU::S_SUB_I32)
5847 ClampOpc = AMDGPU::S_ADD_I32;
5848 if (Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO)
5849 ClampOpc = AMDGPU::V_ADD_CO_U32_e64;
5850 else if (Opc == AMDGPU::S_AND_B64)
5851 ClampOpc = AMDGPU::V_AND_B32_e64;
5852 else if (Opc == AMDGPU::S_OR_B64)
5853 ClampOpc = AMDGPU::V_OR_B32_e64;
5854 else if (Opc == AMDGPU::S_XOR_B64)
5855 ClampOpc = AMDGPU::V_XOR_B32_e64;
5856 else
5857 ClampOpc = ST.getInstrInfo()->getVALUOp(ClampOpc);
5858 }
5859 return {DPPOpc, ClampOpc};
5860}
5861
5862static std::pair<Register, Register>
5864 const TargetRegisterClass *SrcRC, const GCNSubtarget &ST,
5865 MachineRegisterInfo &MRI) {
5866 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5867 const SIInstrInfo *TII = ST.getInstrInfo();
5868 const TargetRegisterClass *SrcSubRC =
5869 TRI->getSubRegisterClass(SrcRC, AMDGPU::sub0);
5870 Register Op1L =
5871 TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub0, SrcSubRC);
5872 Register Op1H =
5873 TII->buildExtractSubReg(MI, MRI, Op, SrcRC, AMDGPU::sub1, SrcSubRC);
5874 return {Op1L, Op1H};
5875}
5876
5879 const GCNSubtarget &ST,
5880 unsigned Opc) {
5882 const SIRegisterInfo *TRI = ST.getRegisterInfo();
5883 const DebugLoc &DL = MI.getDebugLoc();
5884 const SIInstrInfo *TII = ST.getInstrInfo();
5885
5886 // Reduction operations depend on whether the input operand is SGPR or VGPR.
5887 Register SrcReg = MI.getOperand(1).getReg();
5888 bool isSGPR = TRI->isSGPRClass(MRI.getRegClass(SrcReg));
5889 Register DstReg = MI.getOperand(0).getReg();
5890 unsigned Stratergy = static_cast<unsigned>(MI.getOperand(2).getImm());
5891 enum WAVE_REDUCE_STRATEGY : unsigned { DEFAULT = 0, ITERATIVE = 1, DPP = 2 };
5892 MachineBasicBlock *RetBB = nullptr;
5893 unsigned MIOpc = MI.getOpcode();
5894 auto BuildRegSequence = [&](MachineBasicBlock &BB,
5896 Register Src0, Register Src1) {
5897 auto RegSequence =
5898 BuildMI(BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dst)
5899 .addReg(Src0)
5900 .addImm(AMDGPU::sub0)
5901 .addReg(Src1)
5902 .addImm(AMDGPU::sub1);
5903 return RegSequence;
5904 };
5905 if (isSGPR) {
5906 switch (Opc) {
5907 case AMDGPU::S_MIN_U32:
5908 case AMDGPU::S_MIN_I32:
5909 case AMDGPU::V_MIN_F32_e64:
5910 case AMDGPU::S_MAX_U32:
5911 case AMDGPU::S_MAX_I32:
5912 case AMDGPU::V_MAX_F32_e64:
5913 case AMDGPU::S_AND_B32:
5914 case AMDGPU::S_OR_B32: {
5915 // Idempotent operations.
5916 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B32), DstReg).addReg(SrcReg);
5917 RetBB = &BB;
5918 break;
5919 }
5920 case AMDGPU::V_CMP_LT_U64_e64: // umin
5921 case AMDGPU::V_CMP_LT_I64_e64: // min
5922 case AMDGPU::V_CMP_GT_U64_e64: // umax
5923 case AMDGPU::V_CMP_GT_I64_e64: // max
5924 case AMDGPU::V_MIN_F64_e64:
5925 case AMDGPU::V_MIN_NUM_F64_e64:
5926 case AMDGPU::V_MAX_F64_e64:
5927 case AMDGPU::V_MAX_NUM_F64_e64:
5928 case AMDGPU::S_AND_B64:
5929 case AMDGPU::S_OR_B64: {
5930 // Idempotent operations.
5931 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MOV_B64), DstReg).addReg(SrcReg);
5932 RetBB = &BB;
5933 break;
5934 }
5935 case AMDGPU::S_XOR_B32:
5936 case AMDGPU::S_XOR_B64:
5937 case AMDGPU::S_ADD_I32:
5938 case AMDGPU::S_ADD_U64_PSEUDO:
5939 case AMDGPU::V_ADD_F32_e64:
5940 case AMDGPU::V_ADD_F64_e64:
5941 case AMDGPU::V_ADD_F64_pseudo_e64:
5942 case AMDGPU::S_SUB_I32:
5943 case AMDGPU::S_SUB_U64_PSEUDO:
5944 case AMDGPU::V_SUB_F32_e64: {
5945 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
5946 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
5947 Register ExecMask = MRI.createVirtualRegister(WaveMaskRegClass);
5948 Register NumActiveLanes =
5949 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5950
5951 bool IsWave32 = ST.isWave32();
5952 unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
5953 MCRegister ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
5954 unsigned BitCountOpc =
5955 IsWave32 ? AMDGPU::S_BCNT1_I32_B32 : AMDGPU::S_BCNT1_I32_B64;
5956
5957 BuildMI(BB, MI, DL, TII->get(MovOpc), ExecMask).addReg(ExecReg);
5958
5959 auto NewAccumulator =
5960 BuildMI(BB, MI, DL, TII->get(BitCountOpc), NumActiveLanes)
5961 .addReg(ExecMask);
5962
5963 switch (Opc) {
5964 case AMDGPU::S_XOR_B32:
5965 case AMDGPU::S_XOR_B64: {
5966 // Performing an XOR operation on a uniform value
5967 // depends on the parity of the number of active lanes.
5968 // For even parity, the result will be 0, for odd
5969 // parity the result will be the same as the input value.
5970 Register ParityRegister =
5971 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5972
5973 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_AND_B32), ParityRegister)
5974 .addReg(NewAccumulator->getOperand(0).getReg())
5975 .addImm(1)
5976 .setOperandDead(3); // Dead scc
5977 if (Opc == AMDGPU::S_XOR_B32) {
5978 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
5979 .addReg(SrcReg)
5980 .addReg(ParityRegister);
5981 } else {
5982 Register DestSub0 =
5983 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5984 Register DestSub1 =
5985 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
5986 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
5987 MRI.getRegClass(SrcReg), ST, MRI);
5988 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
5989 .addReg(Op1L)
5990 .addReg(ParityRegister);
5991 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub1)
5992 .addReg(Op1H)
5993 .addReg(ParityRegister);
5994 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
5995 }
5996 break;
5997 }
5998 case AMDGPU::S_SUB_I32: {
5999 Register NegatedVal = MRI.createVirtualRegister(DstRegClass);
6000
6001 // Take the negation of the source operand.
6002 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedVal)
6003 .addImm(0)
6004 .addReg(SrcReg);
6005 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
6006 .addReg(NegatedVal)
6007 .addReg(NewAccumulator->getOperand(0).getReg());
6008 break;
6009 }
6010 case AMDGPU::S_ADD_I32: {
6011 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DstReg)
6012 .addReg(SrcReg)
6013 .addReg(NewAccumulator->getOperand(0).getReg());
6014 break;
6015 }
6016 case AMDGPU::S_ADD_U64_PSEUDO:
6017 case AMDGPU::S_SUB_U64_PSEUDO: {
6018 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6019 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6020 Register Op1H_Op0L_Reg =
6021 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6022 Register Op1L_Op0H_Reg =
6023 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6024 Register CarryReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6025 Register AddReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6026 Register NegatedValLo =
6027 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6028 Register NegatedValHi =
6029 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6030 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
6031 MRI.getRegClass(SrcReg), ST, MRI);
6032 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6033 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedValLo)
6034 .addImm(0)
6035 .addReg(NewAccumulator->getOperand(0).getReg())
6036 .setOperandDead(3); // Dead scc
6037 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ASHR_I32), NegatedValHi)
6038 .addReg(NegatedValLo)
6039 .addImm(31)
6040 .setOperandDead(3); // Dead scc
6041 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1L_Op0H_Reg)
6042 .addReg(Op1L)
6043 .addReg(NegatedValHi);
6044 }
6045 Register LowOpcode = Opc == AMDGPU::S_SUB_U64_PSEUDO
6046 ? NegatedValLo
6047 : NewAccumulator->getOperand(0).getReg();
6048 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), DestSub0)
6049 .addReg(Op1L)
6050 .addReg(LowOpcode);
6051 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_HI_U32), CarryReg)
6052 .addReg(Op1L)
6053 .addReg(LowOpcode);
6054 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_MUL_I32), Op1H_Op0L_Reg)
6055 .addReg(Op1H)
6056 .addReg(LowOpcode);
6057
6058 Register HiVal = Opc == AMDGPU::S_SUB_U64_PSEUDO ? AddReg : DestSub1;
6059 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), HiVal)
6060 .addReg(CarryReg)
6061 .addReg(Op1H_Op0L_Reg)
6062 .setOperandDead(3); // Dead scc
6063
6064 if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6065 BuildMI(BB, MI, DL, TII->get(AMDGPU::S_ADD_U32), DestSub1)
6066 .addReg(HiVal)
6067 .addReg(Op1L_Op0H_Reg)
6068 .setOperandDead(3); // Dead scc
6069 }
6070 BuildRegSequence(BB, MI, DstReg, DestSub0, DestSub1);
6071 break;
6072 }
6073 case AMDGPU::V_ADD_F32_e64:
6074 case AMDGPU::V_ADD_F64_e64:
6075 case AMDGPU::V_ADD_F64_pseudo_e64:
6076 case AMDGPU::V_SUB_F32_e64: {
6077 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
6078 const TargetRegisterClass *VregRC = TII->getRegClass(TII->get(Opc), 0);
6079 Register ActiveLanesVreg = MRI.createVirtualRegister(VregRC);
6080 Register DstVreg = MRI.createVirtualRegister(VregRC);
6081 // Get number of active lanes as a float val.
6082 BuildMI(BB, MI, DL,
6083 TII->get(is32BitOpc ? AMDGPU::V_CVT_F32_I32_e64
6084 : AMDGPU::V_CVT_F64_I32_e64),
6085 ActiveLanesVreg)
6086 .addReg(NewAccumulator->getOperand(0).getReg())
6087 .addImm(0) // clamp
6088 .addImm(0); // output-modifier
6089
6090 // Take negation of input for SUB reduction
6091 unsigned srcMod = (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6092 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64)
6095 unsigned MulOpc = is32BitOpc ? AMDGPU::V_MUL_F32_e64
6096 : ST.getGeneration() >= AMDGPUSubtarget::GFX12
6097 ? AMDGPU::V_MUL_F64_pseudo_e64
6098 : AMDGPU::V_MUL_F64_e64;
6099 auto DestVregInst = BuildMI(BB, MI, DL, TII->get(MulOpc),
6100 DstVreg)
6101 .addImm(srcMod) // src0 modifier
6102 .addReg(SrcReg)
6103 .addImm(SISrcMods::NONE) // src1 modifier
6104 .addReg(ActiveLanesVreg)
6105 .addImm(SISrcMods::NONE) // clamp
6106 .addImm(SISrcMods::NONE); // output-mod
6107 if (is32BitOpc) {
6108 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), DstReg)
6109 .addReg(DstVreg);
6110 } else {
6111 Register LaneValueLoReg =
6112 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6113 Register LaneValueHiReg =
6114 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6115 auto [Op1L, Op1H] =
6116 ExtractSubRegs(MI, DestVregInst->getOperand(0), VregRC, ST, MRI);
6117 // lane value input should be in an sgpr
6118 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6119 LaneValueLoReg)
6120 .addReg(Op1L);
6121 BuildMI(BB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6122 LaneValueHiReg)
6123 .addReg(Op1H);
6124 NewAccumulator =
6125 BuildRegSequence(BB, MI, DstReg, LaneValueLoReg, LaneValueHiReg);
6126 }
6127 }
6128 }
6129 RetBB = &BB;
6130 }
6131 }
6132 } else {
6134 Register SrcReg = MI.getOperand(1).getReg();
6135 bool is32BitOpc = is32bitWaveReduceOperation(Opc);
6137 bool NeedsMovDPP = !is32BitOpc;
6138 // Create virtual registers required for lowering.
6139 const TargetRegisterClass *WaveMaskRegClass = TRI->getWaveMaskRegClass();
6140 const TargetRegisterClass *DstRegClass = MRI.getRegClass(DstReg);
6141 const TargetRegisterClass *SrcRegClass = MRI.getRegClass(SrcReg);
6142 bool IsWave32 = ST.isWave32();
6143 unsigned MovOpcForExec = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
6144 unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
6145 if (Stratergy == WAVE_REDUCE_STRATEGY::ITERATIVE ||
6146 !ST.hasDPP()) { // If target doesn't support DPP operations, default to
6147 // iterative strategy
6148
6149 // To reduce the VGPR using the iterative approach, we need to iterate
6150 // over all the active lanes. The lowering consists of ComputeLoop,
6151 // which iterates over only the active lanes. We use a copy of the EXEC
6152 // register as the induction variable, and each iteration modifies it
6153 // using bitset0 so that we get the next active lane for the next iteration.
6154
6155 // Create Control flow for loop
6156 // Split MI's Machine Basic block into For loop
6157 auto [ComputeLoop, ComputeEnd] = splitBlockForLoop(MI, BB, true);
6158
6159 Register LoopIterator = MRI.createVirtualRegister(WaveMaskRegClass);
6160 Register IdentityValReg = MRI.createVirtualRegister(DstRegClass);
6161 Register AccumulatorReg = MRI.createVirtualRegister(DstRegClass);
6162 Register ActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
6163 Register NewActiveBitsReg = MRI.createVirtualRegister(WaveMaskRegClass);
6164 Register FF1Reg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6165 Register LaneValueReg = MRI.createVirtualRegister(DstRegClass);
6166
6167 // Create initial values of induction variable from Exec, Accumulator and
6168 // insert branch instr to newly created ComputeBlock
6169 BuildMI(BB, I, DL, TII->get(MovOpcForExec), LoopIterator).addReg(ExecReg);
6170 uint64_t IdentityValue =
6171 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6172 ? 0x0 // +0.0 for double sub reduction
6174 BuildMI(BB, I, DL,
6175 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6176 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6177 IdentityValReg)
6178 .addImm(IdentityValue);
6179 // clang-format off
6180 BuildMI(BB, I, DL, TII->get(AMDGPU::S_BRANCH))
6181 .addMBB(ComputeLoop);
6182 // clang-format on
6183
6184 // Start constructing ComputeLoop
6185 I = ComputeLoop->begin();
6186 auto Accumulator =
6187 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), AccumulatorReg)
6188 .addReg(IdentityValReg)
6189 .addMBB(&BB);
6190 auto ActiveBits =
6191 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::PHI), ActiveBitsReg)
6192 .addReg(LoopIterator)
6193 .addMBB(&BB);
6194
6195 I = ComputeLoop->end();
6196 MachineInstr *NewAccumulator;
6197 // Perform the computations
6198 unsigned SFFOpc =
6199 IsWave32 ? AMDGPU::S_FF1_I32_B32 : AMDGPU::S_FF1_I32_B64;
6200 BuildMI(*ComputeLoop, I, DL, TII->get(SFFOpc), FF1Reg)
6201 .addReg(ActiveBitsReg);
6202 if (is32BitOpc) {
6203 Register OpDstReg = DstReg;
6204 bool hasSrc0Modifier = AMDGPU::getNamedOperandIdx(
6205 Opc, AMDGPU::OpName::src0_modifiers) != -1;
6206 bool hasSrc1Modifier = AMDGPU::getNamedOperandIdx(
6207 Opc, AMDGPU::OpName::src1_modifiers) != -1;
6208 bool hasClamp =
6209 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1;
6210 bool hasOpSel =
6211 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel) != -1;
6212 bool hasOMod =
6213 AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1;
6214 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6215 LaneValueReg)
6216 .addReg(SrcReg)
6217 .addReg(FF1Reg);
6218 if (ST.getInstrInfo()->isVALU(Opc, /*AllowLDSDMA=*/true)) {
6219 // Get the Lane Value in VGPR to avoid the Constant Bus Restriction
6220 Register LaneValVgpr = MRI.createVirtualRegister(SrcRegClass);
6221 Register VgprResultReg = MRI.createVirtualRegister(SrcRegClass);
6222 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), LaneValVgpr)
6223 .addReg(LaneValueReg);
6224 OpDstReg = VgprResultReg;
6225 LaneValueReg = LaneValVgpr;
6226 }
6227 auto OpInstr = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), OpDstReg);
6228 if (hasSrc0Modifier)
6229 OpInstr.addImm(SISrcMods::NONE); // src0 modifier
6230 OpInstr.addReg(AccumulatorReg); // src0
6231 if (hasSrc1Modifier)
6232 OpInstr.addImm(SISrcMods::NONE); // src1 modifier
6233 OpInstr.addReg(LaneValueReg); // src1
6234 if (hasClamp)
6235 OpInstr.addImm(0); // clamp
6236 if (hasOpSel)
6237 OpInstr.addImm(0); // opsel
6238 if (hasOMod)
6239 OpInstr.addImm(0); // omod
6240 if (ST.getInstrInfo()->isVALU(Opc, /*AllowLDSDMA=*/true)) {
6241 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
6242 DstReg)
6243 .addReg(OpDstReg);
6244 }
6245 } else {
6246 Register LaneValueLoReg =
6247 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6248 Register LaneValueHiReg =
6249 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6250 Register LaneValReg =
6251 MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
6252 auto [Op1L, Op1H] = ExtractSubRegs(MI, MI.getOperand(1),
6253 MRI.getRegClass(SrcReg), ST, MRI);
6254 // lane value input should be in an sgpr
6255 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6256 LaneValueLoReg)
6257 .addReg(Op1L)
6258 .addReg(FF1Reg);
6259 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::V_READLANE_B32),
6260 LaneValueHiReg)
6261 .addReg(Op1H)
6262 .addReg(FF1Reg);
6263 auto LaneValue = BuildRegSequence(*ComputeLoop, I, LaneValReg,
6264 LaneValueLoReg, LaneValueHiReg);
6265 switch (Opc) {
6266 case AMDGPU::S_OR_B64:
6267 case AMDGPU::S_AND_B64:
6268 case AMDGPU::S_XOR_B64: {
6269 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6270 .addReg(Accumulator->getOperand(0).getReg())
6271 .addReg(LaneValue->getOperand(0).getReg())
6272 .setOperandDead(3); // Dead scc
6273 break;
6274 }
6275 case AMDGPU::V_CMP_GT_I64_e64:
6276 case AMDGPU::V_CMP_GT_U64_e64:
6277 case AMDGPU::V_CMP_LT_I64_e64:
6278 case AMDGPU::V_CMP_LT_U64_e64: {
6279 Register LaneMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6280 Register ComparisonResultReg =
6281 MRI.createVirtualRegister(WaveMaskRegClass);
6282 int SrcIdx =
6283 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6284 const TargetRegisterClass *VregClass =
6285 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6286 Register AccumulatorVReg = MRI.createVirtualRegister(VregClass);
6287 auto [SrcReg0Sub0, SrcReg0Sub1] = ExtractSubRegs(
6288 MI, Accumulator->getOperand(0), VregClass, ST, MRI);
6289 BuildRegSequence(*ComputeLoop, I, AccumulatorVReg, SrcReg0Sub0,
6290 SrcReg0Sub1);
6291 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), LaneMaskReg)
6292 .addReg(LaneValue->getOperand(0).getReg())
6293 .addReg(AccumulatorVReg);
6294
6295 unsigned AndOpc = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
6296 BuildMI(*ComputeLoop, I, DL, TII->get(AndOpc), ComparisonResultReg)
6297 .addReg(LaneMaskReg)
6298 .addReg(ActiveBitsReg);
6299
6300 NewAccumulator = BuildMI(*ComputeLoop, I, DL,
6301 TII->get(AMDGPU::S_CSELECT_B64), DstReg)
6302 .addReg(LaneValue->getOperand(0).getReg())
6303 .addReg(Accumulator->getOperand(0).getReg());
6304 break;
6305 }
6306 case AMDGPU::V_MIN_F64_e64:
6307 case AMDGPU::V_MIN_NUM_F64_e64:
6308 case AMDGPU::V_MAX_F64_e64:
6309 case AMDGPU::V_MAX_NUM_F64_e64:
6310 case AMDGPU::V_ADD_F64_e64:
6311 case AMDGPU::V_ADD_F64_pseudo_e64: {
6312 int SrcIdx =
6313 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src);
6314 const TargetRegisterClass *VregRC =
6315 TRI->getAllocatableClass(TII->getRegClass(MI.getDesc(), SrcIdx));
6316 Register AccumulatorVReg = MRI.createVirtualRegister(VregRC);
6317 Register DstVreg = MRI.createVirtualRegister(VregRC);
6318 Register LaneValLo =
6319 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6320 Register LaneValHi =
6321 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6322 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::COPY), AccumulatorVReg)
6323 .addReg(Accumulator->getOperand(0).getReg());
6324 unsigned Modifier =
6325 MI.getOpcode() == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64
6328 auto DstVregInst =
6329 BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstVreg)
6330 .addImm(Modifier) // src0 modifiers
6331 .addReg(LaneValue->getOperand(0).getReg())
6332 .addImm(SISrcMods::NONE) // src1 modifiers
6333 .addReg(AccumulatorVReg)
6334 .addImm(SISrcMods::NONE) // clamp
6335 .addImm(SISrcMods::NONE); // omod
6336 auto ReadLaneLo =
6337 BuildMI(*ComputeLoop, I, DL,
6338 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValLo);
6339 auto ReadLaneHi =
6340 BuildMI(*ComputeLoop, I, DL,
6341 TII->get(AMDGPU::V_READFIRSTLANE_B32), LaneValHi);
6342 MachineBasicBlock::iterator Iters = *ReadLaneLo;
6343 auto [Op1L, Op1H] = ExtractSubRegs(*Iters, DstVregInst->getOperand(0),
6344 VregRC, ST, MRI);
6345 ReadLaneLo.addReg(Op1L);
6346 ReadLaneHi.addReg(Op1H);
6347 NewAccumulator =
6348 BuildRegSequence(*ComputeLoop, I, DstReg, LaneValLo, LaneValHi);
6349 break;
6350 }
6351 case AMDGPU::S_ADD_U64_PSEUDO:
6352 case AMDGPU::S_SUB_U64_PSEUDO: {
6353 NewAccumulator = BuildMI(*ComputeLoop, I, DL, TII->get(Opc), DstReg)
6354 .addReg(Accumulator->getOperand(0).getReg())
6355 .addReg(LaneValue->getOperand(0).getReg());
6356 ComputeLoop =
6357 expand64BitScalarArithmetic(*NewAccumulator, ComputeLoop);
6358 break;
6359 }
6360 }
6361 }
6362 // Manipulate the iterator to get the next active lane
6363 unsigned BITSETOpc =
6364 IsWave32 ? AMDGPU::S_BITSET0_B32 : AMDGPU::S_BITSET0_B64;
6365 BuildMI(*ComputeLoop, I, DL, TII->get(BITSETOpc), NewActiveBitsReg)
6366 .addReg(FF1Reg)
6367 .addReg(ActiveBitsReg);
6368
6369 // Add phi nodes
6370 Accumulator.addReg(DstReg).addMBB(ComputeLoop);
6371 ActiveBits.addReg(NewActiveBitsReg).addMBB(ComputeLoop);
6372
6373 // Creating branching
6374 unsigned CMPOpc = IsWave32 ? AMDGPU::S_CMP_LG_U32 : AMDGPU::S_CMP_LG_U64;
6375 BuildMI(*ComputeLoop, I, DL, TII->get(CMPOpc))
6376 .addReg(NewActiveBitsReg)
6377 .addImm(0);
6378 BuildMI(*ComputeLoop, I, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
6379 .addMBB(ComputeLoop);
6380
6381 RetBB = ComputeEnd;
6382 } else {
6383 assert(ST.hasDPP() && "Sub Target does not support DPP Operations");
6384 MachineBasicBlock *CurrBB = &BB;
6385 Register SrcWithIdentity = MRI.createVirtualRegister(SrcRegClass);
6386 Register IdentityVGPR = MRI.createVirtualRegister(SrcRegClass);
6387 Register IdentitySGPR = MRI.createVirtualRegister(DstRegClass);
6388 Register DPPRowShr1 = MRI.createVirtualRegister(SrcRegClass);
6389 Register DPPRowShr2 = MRI.createVirtualRegister(SrcRegClass);
6390 Register DPPRowShr4 = MRI.createVirtualRegister(SrcRegClass);
6391 Register DPPRowShr8 = MRI.createVirtualRegister(SrcRegClass);
6392 Register RowBcast15 = MRI.createVirtualRegister(SrcRegClass);
6393 Register ReducedValSGPR = MRI.createVirtualRegister(DstRegClass);
6394 Register NegatedReducedVal = MRI.createVirtualRegister(DstRegClass);
6395 Register RowBcast31 = MRI.createVirtualRegister(SrcRegClass);
6396 Register UndefExec = MRI.createVirtualRegister(WaveMaskRegClass);
6397 Register FinalDPPResult;
6398 MachineInstr *SrcWithIdentityInstr;
6399 MachineInstr *LastBcastInstr;
6400 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), UndefExec);
6401
6403 BuildMI(*CurrBB, MI, DL,
6404 TII->get(is32BitOpc ? AMDGPU::S_MOV_B32
6405 : AMDGPU::S_MOV_B64_IMM_PSEUDO),
6406 IdentitySGPR)
6407 .addImm(IdentityValue);
6408 auto IdentityCopyInstr =
6409 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::COPY), IdentityVGPR)
6410 .addReg(IdentitySGPR);
6411 auto DPPClampOpcPair = getDPPOpcForWaveReduction(Opc, ST);
6412 unsigned DPPOpc = std::get<0>(DPPClampOpcPair);
6413 unsigned ClampOpc = std::get<1>(DPPClampOpcPair);
6414 auto BuildSetInactiveInstr = [&](Register Dst, Register Src0,
6415 Register Src1) {
6416 return BuildMI(BB, MI, DL, TII->get(AMDGPU::V_SET_INACTIVE_B32),
6417 Dst)
6418 .addImm(0) // src0 modifiers
6419 .addReg(Src0) // src0
6420 .addImm(0) // src1 modifiers
6421 .addReg(Src1) // identity value for inactive lanes
6422 .addReg(UndefExec); // bool i1
6423 };
6424 auto BuildDPPMachineInstr = [&](Register Dst, Register Src,
6425 unsigned DPPCtrl) {
6426 auto DPPInstr =
6427 BuildMI(*CurrBB, MI, DL, TII->get(DPPOpc), Dst).addReg(Src); // old
6428 if (isFPOp && !NeedsMovDPP)
6429 DPPInstr.addImm(SISrcMods::NONE); // src0 modifier
6430 DPPInstr.addReg(Src); // src0
6431 if (isFPOp && !NeedsMovDPP)
6432 DPPInstr.addImm(SISrcMods::NONE); // src1 modifier
6433 if (!NeedsMovDPP)
6434 DPPInstr.addReg(Src); // src1
6435 if (AMDGPU::getNamedOperandIdx(DPPOpc, AMDGPU::OpName::clamp) >= 0)
6436 DPPInstr.addImm(0); // clamp
6437 DPPInstr
6438 .addImm(DPPCtrl) // dpp-ctrl
6439 .addImm(0xf) // row-mask
6440 .addImm(0xf) // bank-mask
6441 .addImm(0); // bound-control
6442 };
6443 auto BuildClampInstr = [&](Register Dst, Register Src0, Register Src1,
6444 bool isAddSub = false,
6445 bool needsCarryIn = false,
6446 Register CarryIn = Register()) {
6447 unsigned InstrOpc = ClampOpc;
6448 Register CarryOutReg = MRI.createVirtualRegister(WaveMaskRegClass);
6449 if (needsCarryIn)
6450 InstrOpc = AMDGPU::V_ADDC_U32_e64;
6451 auto ClampInstr = BuildMI(*CurrBB, MI, DL, TII->get(InstrOpc), Dst);
6452 if (isFPOp)
6453 ClampInstr.addImm(SISrcMods::NONE); // src0 mod
6454 if (isAddSub) {
6455 if (needsCarryIn)
6456 ClampInstr.addReg(CarryOutReg,
6458 RegState::Dead); // killed carry-out reg
6459 else
6460 ClampInstr.addReg(CarryOutReg, RegState::Define); // carry-out reg
6461 }
6462 ClampInstr.addReg(Src0); // src0
6463 if (isFPOp)
6464 ClampInstr.addImm(SISrcMods::NONE); // src1 mod
6465 ClampInstr.addReg(Src1); // src1
6466 if (needsCarryIn)
6467 ClampInstr.addReg(CarryIn, RegState::Kill); // carry-in reg
6468 if (AMDGPU::getNamedOperandIdx(InstrOpc, AMDGPU::OpName::clamp) >= 0)
6469 ClampInstr.addImm(0); // clamp
6470 if (isFPOp)
6471 ClampInstr.addImm(0); // omod
6472 LastBcastInstr = ClampInstr;
6473 return CarryOutReg;
6474 };
6475 auto BuildPostDPPInstr = [&](Register Src0, Register Src1) {
6476 bool isAddSubOpc =
6477 Opc == AMDGPU::S_ADD_U64_PSEUDO || Opc == AMDGPU::S_SUB_U64_PSEUDO;
6478 bool isBitWiseOpc = Opc == AMDGPU::S_AND_B64 ||
6479 Opc == AMDGPU::S_OR_B64 || Opc == AMDGPU::S_XOR_B64;
6480 Register ReturnReg = MRI.createVirtualRegister(SrcRegClass);
6481 if (isAddSubOpc || isBitWiseOpc) {
6482 Register ResLo = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6483 Register ResHi = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6484 MachineOperand Src0Operand =
6485 MachineOperand::CreateReg(Src0, /*isDef=*/false);
6486 MachineOperand Src1Operand =
6487 MachineOperand::CreateReg(Src1, /*isDef=*/false);
6488 auto [Src0Lo, Src0Hi] =
6489 ExtractSubRegs(MI, Src0Operand, SrcRegClass, ST, MRI);
6490 auto [Src1Lo, Src1Hi] =
6491 ExtractSubRegs(MI, Src1Operand, SrcRegClass, ST, MRI);
6492 Register CarryReg = BuildClampInstr(
6493 ResLo, Src0Lo, Src1Lo, isAddSubOpc, /*needsCarryIn*/ false);
6494 BuildClampInstr(ResHi, Src0Hi, Src1Hi, isAddSubOpc,
6495 /*needsCarryIn*/ isAddSubOpc, CarryReg);
6496 BuildRegSequence(*CurrBB, MI, ReturnReg, ResLo, ResHi);
6497 } else {
6498 if (isFPOp) {
6499 BuildMI(*CurrBB, MI, DL, TII->get(Opc), ReturnReg)
6500 .addImm(SISrcMods::NONE) // src0 modifiers
6501 .addReg(Src0)
6502 .addImm(SISrcMods::NONE) // src1 modifiers
6503 .addReg(Src1)
6504 .addImm(SISrcMods::NONE) // clamp
6505 .addImm(SISrcMods::NONE); // omod
6506 } else {
6507 Register CmpMaskReg = MRI.createVirtualRegister(WaveMaskRegClass);
6508 BuildMI(*CurrBB, MI, DL, TII->get(Opc), CmpMaskReg)
6509 .addReg(Src0) // src0
6510 .addReg(Src1); // src1
6511 LastBcastInstr =
6512 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B64_PSEUDO),
6513 ReturnReg)
6514 .addReg(Src1) // src0
6515 .addReg(Src0) // src1
6516 .addReg(CmpMaskReg); // src2
6517 expand64BitV_CNDMASK(*LastBcastInstr, CurrBB);
6518 }
6519 }
6520 return ReturnReg;
6521 };
6522
6523 // Set inactive lanes to the identity value.
6524 if (is32BitOpc) {
6525 SrcWithIdentityInstr =
6526 BuildSetInactiveInstr(SrcWithIdentity, SrcReg, IdentityVGPR);
6527 } else {
6528 Register SrcWithIdentitylo =
6529 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6530 Register SrcWithIdentityhi =
6531 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6532 auto [Reg0Sub0, Reg0Sub1] = ExtractSubRegs(
6533 MI, IdentityCopyInstr->getOperand(0), SrcRegClass, ST, MRI);
6534 auto [SrcReg0Sub0, SrcReg0Sub1] =
6535 ExtractSubRegs(MI, MI.getOperand(1), SrcRegClass, ST, MRI);
6536 MachineInstr *SetInactiveLoInstr =
6537 BuildSetInactiveInstr(SrcWithIdentitylo, SrcReg0Sub0, Reg0Sub0);
6538 MachineInstr *SetInactiveHiInstr =
6539 BuildSetInactiveInstr(SrcWithIdentityhi, SrcReg0Sub1, Reg0Sub1);
6540 SrcWithIdentityInstr =
6541 BuildRegSequence(*CurrBB, MI, SrcWithIdentity,
6542 SetInactiveLoInstr->getOperand(0).getReg(),
6543 SetInactiveHiInstr->getOperand(0).getReg());
6544 }
6545 // DPP reduction
6546 Register SrcWithIdentityReg =
6547 SrcWithIdentityInstr->getOperand(0).getReg();
6548 BuildDPPMachineInstr(DPPRowShr1, SrcWithIdentityReg,
6550 if (NeedsMovDPP)
6551 DPPRowShr1 = BuildPostDPPInstr(SrcWithIdentityReg, DPPRowShr1);
6552
6553 BuildDPPMachineInstr(DPPRowShr2, DPPRowShr1,
6555 if (NeedsMovDPP)
6556 DPPRowShr2 = BuildPostDPPInstr(DPPRowShr1, DPPRowShr2);
6557
6558 BuildDPPMachineInstr(DPPRowShr4, DPPRowShr2,
6560 if (NeedsMovDPP)
6561 DPPRowShr4 = BuildPostDPPInstr(DPPRowShr2, DPPRowShr4);
6562
6563 BuildDPPMachineInstr(DPPRowShr8, DPPRowShr4,
6565 if (NeedsMovDPP)
6566 DPPRowShr8 = BuildPostDPPInstr(DPPRowShr4, DPPRowShr8);
6567
6568 if (ST.hasDPPBroadcasts()) {
6569 BuildDPPMachineInstr(RowBcast15, DPPRowShr8, AMDGPU::DPP::BCAST15);
6570 if (NeedsMovDPP)
6571 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, RowBcast15);
6572 } else {
6573 // magic constant: 0x1E0
6574 // To Set BIT_MODE : bit 15 = 0
6575 // XOR mask : bit [14:10] = 0
6576 // OR mask : bit [9:5] = 15
6577 // AND mask : bit [4:0] = 0
6578 if (is32BitOpc) {
6579 Register SwizzledValue =
6580 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6581 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6582 SwizzledValue)
6583 .addReg(DPPRowShr8) // addr
6584 .addImm(0x1E0) // swizzle offset (i16)
6585 .addImm(0x0); // gds (i1)
6586 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue);
6587 } else {
6588 Register SwizzledValuelo =
6589 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6590 Register SwizzledValuehi =
6591 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6592 Register SwizzledValue64 = MRI.createVirtualRegister(SrcRegClass);
6593 MachineOperand DPPRowShr8Op =
6594 MachineOperand::CreateReg(DPPRowShr8, /*isDef=*/false);
6595 auto [Op1L, Op1H] =
6596 ExtractSubRegs(MI, DPPRowShr8Op, SrcRegClass, ST, MRI);
6597 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6598 SwizzledValuelo)
6599 .addReg(Op1L) // addr
6600 .addImm(0x1E0) // swizzle offset (i16)
6601 .addImm(0x0); // gds (i1)
6602 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_SWIZZLE_B32),
6603 SwizzledValuehi)
6604 .addReg(Op1H) // addr
6605 .addImm(0x1E0) // swizzle offset (i16)
6606 .addImm(0x0); // gds (i1)
6607 BuildRegSequence(*CurrBB, MI, SwizzledValue64, SwizzledValuelo,
6608 SwizzledValuehi);
6609 if (NeedsMovDPP)
6610 RowBcast15 = BuildPostDPPInstr(DPPRowShr8, SwizzledValue64);
6611 else
6612 BuildClampInstr(RowBcast15, DPPRowShr8, SwizzledValue64);
6613 }
6614 }
6615 FinalDPPResult = RowBcast15;
6616 if (!IsWave32) {
6617 if (ST.hasDPPBroadcasts()) {
6618 BuildDPPMachineInstr(RowBcast31, RowBcast15, AMDGPU::DPP::BCAST31);
6619 if (NeedsMovDPP)
6620 RowBcast31 = BuildPostDPPInstr(RowBcast15, RowBcast31);
6621 } else {
6622 Register ShiftedThreadID =
6623 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6624 Register PermuteByteOffset =
6625 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6626 Register PermutedValue = MRI.createVirtualRegister(SrcRegClass);
6627 Register Lane32Offset =
6628 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6629 Register WordSizeConst =
6630 MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
6631 Register ThreadIDRegLo =
6632 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6633 Register ThreadIDReg =
6634 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6635 // Get the thread ID.
6636 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_LO_U32_B32_e64),
6637 ThreadIDRegLo)
6638 .addImm(-1)
6639 .addImm(0);
6640 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MBCNT_HI_U32_B32_e64),
6641 ThreadIDReg)
6642 .addImm(-1)
6643 .addReg(ThreadIDRegLo);
6644 // shift each lane over by 32 positions, so value in 31st lane is
6645 // present in 63rd lane.
6646 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), Lane32Offset)
6647 .addImm(0x20);
6648 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_ADD_U32_e64),
6649 ShiftedThreadID)
6650 .addReg(ThreadIDReg)
6651 .addReg(Lane32Offset)
6652 .addImm(0); // clamp
6653 // multiply by reg size.
6654 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), WordSizeConst)
6655 .addImm(0x4);
6656 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_MUL_LO_U32_e64),
6657 PermuteByteOffset)
6658 .addReg(WordSizeConst)
6659 .addReg(ShiftedThreadID);
6660 // Permute the lanes
6661 if (is32BitOpc) {
6662 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6663 PermutedValue)
6664 .addReg(PermuteByteOffset) // addr
6665 .addReg(RowBcast15) // data
6666 .addImm(0); // offset
6667 } else {
6668 Register PermutedValuelo =
6669 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6670 Register PermutedValuehi =
6671 MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6672 MachineOperand RowBcast15Op =
6673 MachineOperand::CreateReg(RowBcast15, /*isDef=*/false);
6674 auto [RowBcast15Lo, RowBcast15Hi] =
6675 ExtractSubRegs(MI, RowBcast15Op, SrcRegClass, ST, MRI);
6676 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6677 PermutedValuelo)
6678 .addReg(PermuteByteOffset) // addr
6679 .addReg(RowBcast15Lo) // data
6680 .addImm(0x0); // offset
6681 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::DS_PERMUTE_B32),
6682 PermutedValuehi)
6683 .addReg(PermuteByteOffset) // addr
6684 .addReg(RowBcast15Hi) // data
6685 .addImm(0x0); // offset
6686 BuildRegSequence(*CurrBB, MI, PermutedValue, PermutedValuelo,
6687 PermutedValuehi);
6688 }
6689 if (NeedsMovDPP)
6690 RowBcast31 = BuildPostDPPInstr(RowBcast15, PermutedValue);
6691 else
6692 BuildClampInstr(RowBcast31, RowBcast15, PermutedValue);
6693 }
6694 FinalDPPResult = RowBcast31;
6695 }
6696 if (MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32 ||
6697 MIOpc == AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64) {
6698 Register NegatedValVGPR = MRI.createVirtualRegister(SrcRegClass);
6699 // Opc for f32 reduction is V_SUB_F32.
6700 // For f64, there is no equivalent V_SUB_F64 opcode, so use
6701 // V_ADD_F64/V_ADD_F64_pseudo, and negate the second operand.
6702 BuildMI(*CurrBB, MI, DL, TII->get(Opc),
6703 NegatedValVGPR)
6704 .addImm(SISrcMods::NONE) // src0 mods
6705 .addReg(IdentityVGPR) // src0
6706 .addImm(is32BitOpc ? SISrcMods::NONE : SISrcMods::NEG) // src1 mods
6707 .addReg(IsWave32 ? RowBcast15 : RowBcast31) // src1
6708 .addImm(SISrcMods::NONE) // clamp
6709 .addImm(SISrcMods::NONE); // omod
6710 FinalDPPResult = NegatedValVGPR;
6711 }
6712 // The final reduced value is in the last lane.
6713 if (is32BitOpc) {
6714 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6715 ReducedValSGPR)
6716 .addReg(FinalDPPResult)
6717 .addImm(ST.getWavefrontSize() - 1);
6718 } else {
6719 Register LaneValueLoReg =
6720 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6721 Register LaneValueHiReg =
6722 MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6723 const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
6724 MachineOperand FinalDPPResultOperand =
6725 MachineOperand::CreateReg(FinalDPPResult, /*isDef=*/false);
6726 auto [Op1L, Op1H] =
6727 ExtractSubRegs(MI, FinalDPPResultOperand, SrcRC, ST, MRI);
6728 // lane value input should be in an sgpr
6729 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6730 LaneValueLoReg)
6731 .addReg(Op1L)
6732 .addImm(ST.getWavefrontSize() - 1);
6733 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::V_READLANE_B32),
6734 LaneValueHiReg)
6735 .addReg(Op1H)
6736 .addImm(ST.getWavefrontSize() - 1);
6737 BuildRegSequence(*CurrBB, MI, ReducedValSGPR, LaneValueLoReg,
6738 LaneValueHiReg);
6739 }
6740 if (Opc == AMDGPU::S_SUB_I32) {
6741 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::S_SUB_I32), NegatedReducedVal)
6742 .addImm(0)
6743 .addReg(ReducedValSGPR);
6744 } else if (Opc == AMDGPU::S_SUB_U64_PSEUDO) {
6745 auto NegatedValInstr =
6746 BuildMI(*CurrBB, MI, DL, TII->get(Opc), NegatedReducedVal)
6747 .addImm(0)
6748 .addReg(ReducedValSGPR);
6749 CurrBB = expand64BitScalarArithmetic(*NegatedValInstr, CurrBB);
6750 }
6751 // Mark the final result as a whole-wave-mode calculation.
6752 BuildMI(*CurrBB, MI, DL, TII->get(AMDGPU::STRICT_WWM), DstReg)
6753 .addReg(Opc == AMDGPU::S_SUB_I32 || Opc == AMDGPU::S_SUB_U64_PSEUDO
6754 ? NegatedReducedVal
6755 : ReducedValSGPR);
6756 RetBB = CurrBB;
6757 }
6758 }
6759 MI.eraseFromParent();
6760 return RetBB;
6761}
6762
6765 MachineBasicBlock *BB) const {
6766 MachineFunction *MF = BB->getParent();
6768 const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
6770 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
6771 MachineRegisterInfo &MRI = MF->getRegInfo();
6772 const DebugLoc &DL = MI.getDebugLoc();
6773
6774 switch (MI.getOpcode()) {
6775 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U32:
6776 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_U32);
6777 case AMDGPU::WAVE_REDUCE_UMIN_PSEUDO_U64:
6778 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_U64_e64);
6779 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I32:
6780 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MIN_I32);
6781 case AMDGPU::WAVE_REDUCE_MIN_PSEUDO_I64:
6782 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_LT_I64_e64);
6783 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F32:
6784 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MIN_F32_e64);
6785 case AMDGPU::WAVE_REDUCE_FMIN_PSEUDO_F64:
6786 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6787 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6788 ? AMDGPU::V_MIN_NUM_F64_e64
6789 : AMDGPU::V_MIN_F64_e64);
6790 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U32:
6791 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_U32);
6792 case AMDGPU::WAVE_REDUCE_UMAX_PSEUDO_U64:
6793 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_U64_e64);
6794 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I32:
6795 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_MAX_I32);
6796 case AMDGPU::WAVE_REDUCE_MAX_PSEUDO_I64:
6797 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_CMP_GT_I64_e64);
6798 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F32:
6799 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_MAX_F32_e64);
6800 case AMDGPU::WAVE_REDUCE_FMAX_PSEUDO_F64:
6801 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6802 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6803 ? AMDGPU::V_MAX_NUM_F64_e64
6804 : AMDGPU::V_MAX_F64_e64);
6805 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_I32:
6806 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_I32);
6807 case AMDGPU::WAVE_REDUCE_ADD_PSEUDO_U64:
6808 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_ADD_U64_PSEUDO);
6809 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F32:
6810 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_ADD_F32_e64);
6811 case AMDGPU::WAVE_REDUCE_FADD_PSEUDO_F64:
6812 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6813 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6814 ? AMDGPU::V_ADD_F64_pseudo_e64
6815 : AMDGPU::V_ADD_F64_e64);
6816 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_I32:
6817 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_I32);
6818 case AMDGPU::WAVE_REDUCE_SUB_PSEUDO_U64:
6819 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_SUB_U64_PSEUDO);
6820 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F32:
6821 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::V_SUB_F32_e64);
6822 case AMDGPU::WAVE_REDUCE_FSUB_PSEUDO_F64:
6823 // There is no S/V_SUB_F64 opcode. Double type subtraction is expanded as
6824 // fadd + neg, by setting the NEG bit in the instruction.
6825 return lowerWaveReduce(MI, *BB, *getSubtarget(),
6826 ST.getGeneration() >= AMDGPUSubtarget::GFX12
6827 ? AMDGPU::V_ADD_F64_pseudo_e64
6828 : AMDGPU::V_ADD_F64_e64);
6829 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B32:
6830 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B32);
6831 case AMDGPU::WAVE_REDUCE_AND_PSEUDO_B64:
6832 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_AND_B64);
6833 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B32:
6834 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B32);
6835 case AMDGPU::WAVE_REDUCE_OR_PSEUDO_B64:
6836 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_OR_B64);
6837 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B32:
6838 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B32);
6839 case AMDGPU::WAVE_REDUCE_XOR_PSEUDO_B64:
6840 return lowerWaveReduce(MI, *BB, *getSubtarget(), AMDGPU::S_XOR_B64);
6841 case AMDGPU::S_UADDO_PSEUDO:
6842 case AMDGPU::S_USUBO_PSEUDO: {
6843 MachineOperand &Dest0 = MI.getOperand(0);
6844 MachineOperand &Dest1 = MI.getOperand(1);
6845 MachineOperand &Src0 = MI.getOperand(2);
6846 MachineOperand &Src1 = MI.getOperand(3);
6847
6848 unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
6849 ? AMDGPU::S_ADD_U32
6850 : AMDGPU::S_SUB_U32;
6851 // clang-format off
6852 BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg())
6853 .add(Src0)
6854 .add(Src1);
6855 // clang-format on
6856
6857 unsigned SelOpc =
6858 Subtarget->isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
6859 BuildMI(*BB, MI, DL, TII->get(SelOpc), Dest1.getReg()).addImm(-1).addImm(0);
6860
6861 MI.eraseFromParent();
6862 return BB;
6863 }
6864 case AMDGPU::S_ADD_U64_PSEUDO:
6865 case AMDGPU::S_SUB_U64_PSEUDO: {
6866 return expand64BitScalarArithmetic(MI, BB);
6867 }
6868 case AMDGPU::V_ADD_U64_PSEUDO:
6869 case AMDGPU::V_SUB_U64_PSEUDO: {
6870 bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
6871
6872 MachineOperand &Dest = MI.getOperand(0);
6873 MachineOperand &Src0 = MI.getOperand(1);
6874 MachineOperand &Src1 = MI.getOperand(2);
6875
6876 if (ST.hasAddSubU64Insts()) {
6877 auto I = BuildMI(*BB, MI, DL,
6878 TII->get(IsAdd ? AMDGPU::V_ADD_U64_e64
6879 : AMDGPU::V_SUB_U64_e64),
6880 Dest.getReg())
6881 .add(Src0)
6882 .add(Src1)
6883 .addImm(0); // clamp
6884 TII->legalizeOperands(*I);
6885 MI.eraseFromParent();
6886 return BB;
6887 }
6888
6889 if (IsAdd && ST.hasLshlAddU64Inst()) {
6890 auto Add = BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_LSHL_ADD_U64_e64),
6891 Dest.getReg())
6892 .add(Src0)
6893 .addImm(0)
6894 .add(Src1);
6895 TII->legalizeOperands(*Add);
6896 MI.eraseFromParent();
6897 return BB;
6898 }
6899
6900 const auto *CarryRC = TRI->getWaveMaskRegClass();
6901
6902 Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6903 Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
6904
6905 Register CarryReg = MRI.createVirtualRegister(CarryRC);
6906 Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
6907
6908 const TargetRegisterClass *Src0RC = Src0.isReg()
6909 ? MRI.getRegClass(Src0.getReg())
6910 : &AMDGPU::VReg_64RegClass;
6911 const TargetRegisterClass *Src1RC = Src1.isReg()
6912 ? MRI.getRegClass(Src1.getReg())
6913 : &AMDGPU::VReg_64RegClass;
6914
6915 const TargetRegisterClass *Src0SubRC =
6916 TRI->getSubRegisterClass(Src0RC, AMDGPU::sub0);
6917 const TargetRegisterClass *Src1SubRC =
6918 TRI->getSubRegisterClass(Src1RC, AMDGPU::sub1);
6919
6920 MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
6921 MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
6922 MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
6923 MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
6924
6925 MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
6926 MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
6927 MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
6928 MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
6929
6930 unsigned LoOpc =
6931 IsAdd ? AMDGPU::V_ADD_CO_U32_e64 : AMDGPU::V_SUB_CO_U32_e64;
6932 MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
6933 .addReg(CarryReg, RegState::Define)
6934 .add(SrcReg0Sub0)
6935 .add(SrcReg1Sub0)
6936 .addImm(0); // clamp bit
6937
6938 unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
6939 MachineInstr *HiHalf =
6940 BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
6941 .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
6942 .add(SrcReg0Sub1)
6943 .add(SrcReg1Sub1)
6944 .addReg(CarryReg, RegState::Kill)
6945 .addImm(0); // clamp bit
6946
6947 BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
6948 .addReg(DestSub0)
6949 .addImm(AMDGPU::sub0)
6950 .addReg(DestSub1)
6951 .addImm(AMDGPU::sub1);
6952 TII->legalizeOperands(*LoHalf);
6953 TII->legalizeOperands(*HiHalf);
6954 MI.eraseFromParent();
6955 return BB;
6956 }
6957 case AMDGPU::S_ADD_CO_PSEUDO:
6958 case AMDGPU::S_SUB_CO_PSEUDO: {
6959 // This pseudo has a chance to be selected
6960 // only from uniform add/subcarry node. All the VGPR operands
6961 // therefore assumed to be splat vectors.
6963 MachineOperand &Dest = MI.getOperand(0);
6964 MachineOperand &CarryDest = MI.getOperand(1);
6965 MachineOperand &Src0 = MI.getOperand(2);
6966 MachineOperand &Src1 = MI.getOperand(3);
6967 MachineOperand &Src2 = MI.getOperand(4);
6968 if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
6969 Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6970 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
6971 .addReg(Src0.getReg());
6972 Src0.setReg(RegOp0);
6973 }
6974 if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
6975 Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6976 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
6977 .addReg(Src1.getReg());
6978 Src1.setReg(RegOp1);
6979 }
6980 Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
6981 if (TRI->isVectorRegister(MRI, Src2.getReg())) {
6982 BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
6983 .addReg(Src2.getReg());
6984 Src2.setReg(RegOp2);
6985 }
6986
6987 if (ST.isWave64()) {
6988 if (ST.hasScalarCompareEq64()) {
6989 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
6990 .addReg(Src2.getReg())
6991 .addImm(0);
6992 } else {
6993 const TargetRegisterClass *Src2RC = MRI.getRegClass(Src2.getReg());
6994 const TargetRegisterClass *SubRC =
6995 TRI->getSubRegisterClass(Src2RC, AMDGPU::sub0);
6996 MachineOperand Src2Sub0 = TII->buildExtractSubRegOrImm(
6997 MII, MRI, Src2, Src2RC, AMDGPU::sub0, SubRC);
6998 MachineOperand Src2Sub1 = TII->buildExtractSubRegOrImm(
6999 MII, MRI, Src2, Src2RC, AMDGPU::sub1, SubRC);
7000 Register Src2_32 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7001
7002 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_OR_B32), Src2_32)
7003 .add(Src2Sub0)
7004 .add(Src2Sub1);
7005
7006 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
7007 .addReg(Src2_32, RegState::Kill)
7008 .addImm(0);
7009 }
7010 } else {
7011 BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U32))
7012 .addReg(Src2.getReg())
7013 .addImm(0);
7014 }
7015
7016 unsigned Opc = MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO
7017 ? AMDGPU::S_ADDC_U32
7018 : AMDGPU::S_SUBB_U32;
7019
7020 BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
7021
7022 unsigned SelOpc =
7023 ST.isWave64() ? AMDGPU::S_CSELECT_B64 : AMDGPU::S_CSELECT_B32;
7024
7025 BuildMI(*BB, MII, DL, TII->get(SelOpc), CarryDest.getReg())
7026 .addImm(-1)
7027 .addImm(0);
7028
7029 MI.eraseFromParent();
7030 return BB;
7031 }
7032 case AMDGPU::SI_INIT_M0: {
7033 MachineOperand &M0Init = MI.getOperand(0);
7034 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
7035 TII->get(M0Init.isReg() ? AMDGPU::COPY : AMDGPU::S_MOV_B32),
7036 AMDGPU::M0)
7037 .add(M0Init);
7038 MI.eraseFromParent();
7039 return BB;
7040 }
7041 case AMDGPU::S_BARRIER_SIGNAL_ISFIRST_IMM: {
7042 // Set SCC to true, in case the barrier instruction gets converted to a NOP.
7043 BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
7044 TII->get(AMDGPU::S_CMP_EQ_U32))
7045 .addImm(0)
7046 .addImm(0);
7047 return BB;
7048 }
7049 case AMDGPU::GET_GROUPSTATICSIZE: {
7050 assert(getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
7051 getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL);
7052 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_MOV_B32))
7053 .add(MI.getOperand(0))
7054 .addImm(MFI->getLDSSize());
7055 MI.eraseFromParent();
7056 return BB;
7057 }
7058 case AMDGPU::GET_SHADERCYCLESHILO: {
7059 assert(MF->getSubtarget<GCNSubtarget>().hasShaderCyclesHiLoRegisters());
7060 // The algorithm is:
7061 //
7062 // hi1 = getreg(SHADER_CYCLES_HI)
7063 // lo1 = getreg(SHADER_CYCLES_LO)
7064 // hi2 = getreg(SHADER_CYCLES_HI)
7065 //
7066 // If hi1 == hi2 then there was no overflow and the result is hi2:lo1.
7067 // Otherwise there was overflow and the result is hi2:0. In both cases the
7068 // result should represent the actual time at some point during the sequence
7069 // of three getregs.
7070 using namespace AMDGPU::Hwreg;
7071 Register RegHi1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7072 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi1)
7073 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
7074 Register RegLo1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7075 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegLo1)
7076 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES, 0, 32));
7077 Register RegHi2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7078 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_GETREG_B32), RegHi2)
7079 .addImm(HwregEncoding::encode(ID_SHADER_CYCLES_HI, 0, 32));
7080 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CMP_EQ_U32))
7081 .addReg(RegHi1)
7082 .addReg(RegHi2);
7083 Register RegLo = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
7084 BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B32), RegLo)
7085 .addReg(RegLo1)
7086 .addImm(0);
7087 BuildMI(*BB, MI, DL, TII->get(AMDGPU::REG_SEQUENCE))
7088 .add(MI.getOperand(0))
7089 .addReg(RegLo)
7090 .addImm(AMDGPU::sub0)
7091 .addReg(RegHi2)
7092 .addImm(AMDGPU::sub1);
7093 MI.eraseFromParent();
7094 return BB;
7095 }
7096 case AMDGPU::SI_INDIRECT_SRC_V1:
7097 case AMDGPU::SI_INDIRECT_SRC_V2:
7098 case AMDGPU::SI_INDIRECT_SRC_V3:
7099 case AMDGPU::SI_INDIRECT_SRC_V4:
7100 case AMDGPU::SI_INDIRECT_SRC_V5:
7101 case AMDGPU::SI_INDIRECT_SRC_V6:
7102 case AMDGPU::SI_INDIRECT_SRC_V7:
7103 case AMDGPU::SI_INDIRECT_SRC_V8:
7104 case AMDGPU::SI_INDIRECT_SRC_V9:
7105 case AMDGPU::SI_INDIRECT_SRC_V10:
7106 case AMDGPU::SI_INDIRECT_SRC_V11:
7107 case AMDGPU::SI_INDIRECT_SRC_V12:
7108 case AMDGPU::SI_INDIRECT_SRC_V16:
7109 case AMDGPU::SI_INDIRECT_SRC_V32:
7110 return emitIndirectSrc(MI, *BB, *getSubtarget());
7111 case AMDGPU::SI_INDIRECT_DST_V1:
7112 case AMDGPU::SI_INDIRECT_DST_V2:
7113 case AMDGPU::SI_INDIRECT_DST_V3:
7114 case AMDGPU::SI_INDIRECT_DST_V4:
7115 case AMDGPU::SI_INDIRECT_DST_V5:
7116 case AMDGPU::SI_INDIRECT_DST_V6:
7117 case AMDGPU::SI_INDIRECT_DST_V7:
7118 case AMDGPU::SI_INDIRECT_DST_V8:
7119 case AMDGPU::SI_INDIRECT_DST_V9:
7120 case AMDGPU::SI_INDIRECT_DST_V10:
7121 case AMDGPU::SI_INDIRECT_DST_V11:
7122 case AMDGPU::SI_INDIRECT_DST_V12:
7123 case AMDGPU::SI_INDIRECT_DST_V16:
7124 case AMDGPU::SI_INDIRECT_DST_V32:
7125 return emitIndirectDst(MI, *BB, *getSubtarget());
7126 case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
7127 case AMDGPU::SI_KILL_I1_PSEUDO:
7128 return splitKillBlock(MI, BB);
7129 case AMDGPU::V_CNDMASK_B64_PSEUDO: {
7131 return BB;
7132 }
7133 case AMDGPU::SI_BR_UNDEF: {
7134 MachineInstr *Br = BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CBRANCH_SCC1))
7135 .add(MI.getOperand(0));
7136 Br->getOperand(1).setIsUndef(); // read undef SCC
7137 MI.eraseFromParent();
7138 return BB;
7139 }
7140 case AMDGPU::ADJCALLSTACKUP:
7141 case AMDGPU::ADJCALLSTACKDOWN: {
7143 MachineInstrBuilder MIB(*MF, &MI);
7144 MIB.addReg(Info->getStackPtrOffsetReg(), RegState::ImplicitDefine)
7145 .addReg(Info->getStackPtrOffsetReg(), RegState::Implicit);
7146 return BB;
7147 }
7148 case AMDGPU::SI_CALL_ISEL: {
7149 unsigned ReturnAddrReg = TII->getRegisterInfo().getReturnAddressReg(*MF);
7150
7152 MIB = BuildMI(*BB, MI, DL, TII->get(AMDGPU::SI_CALL), ReturnAddrReg);
7153
7154 for (const MachineOperand &MO : MI.operands())
7155 MIB.add(MO);
7156
7157 MIB.cloneMemRefs(MI);
7158 MI.eraseFromParent();
7159 return BB;
7160 }
7161 case AMDGPU::V_ADD_CO_U32_e32:
7162 case AMDGPU::V_SUB_CO_U32_e32:
7163 case AMDGPU::V_SUBREV_CO_U32_e32: {
7164 // TODO: Define distinct V_*_I32_Pseudo instructions instead.
7165 unsigned Opc = MI.getOpcode();
7166
7167 bool NeedClampOperand = false;
7168 if (TII->pseudoToMCOpcode(Opc) == -1) {
7170 NeedClampOperand = true;
7171 }
7172
7173 auto I = BuildMI(*BB, MI, DL, TII->get(Opc), MI.getOperand(0).getReg());
7174 if (TII->isVOP3(*I)) {
7175 I.addReg(TRI->getVCC(), RegState::Define);
7176 }
7177 I.add(MI.getOperand(1)).add(MI.getOperand(2));
7178 if (NeedClampOperand)
7179 I.addImm(0); // clamp bit for e64 encoding
7180
7181 TII->legalizeOperands(*I);
7182
7183 MI.eraseFromParent();
7184 return BB;
7185 }
7186 case AMDGPU::V_ADDC_U32_e32:
7187 case AMDGPU::V_SUBB_U32_e32:
7188 case AMDGPU::V_SUBBREV_U32_e32:
7189 // These instructions have an implicit use of vcc which counts towards the
7190 // constant bus limit.
7191 TII->legalizeOperands(MI);
7192 return BB;
7193 case AMDGPU::DS_GWS_INIT:
7194 case AMDGPU::DS_GWS_SEMA_BR:
7195 case AMDGPU::DS_GWS_BARRIER:
7196 case AMDGPU::DS_GWS_SEMA_V:
7197 case AMDGPU::DS_GWS_SEMA_P:
7198 case AMDGPU::DS_GWS_SEMA_RELEASE_ALL:
7199 // A s_waitcnt 0 is required to be the instruction immediately following.
7200 if (getSubtarget()->hasGWSAutoReplay()) {
7202 return BB;
7203 }
7204
7205 return emitGWSMemViolTestLoop(MI, BB);
7206 case AMDGPU::S_SETREG_B32: {
7207 // Try to optimize cases that only set the denormal mode or rounding mode.
7208 //
7209 // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
7210 // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
7211 // instead.
7212 //
7213 // FIXME: This could be predicates on the immediate, but tablegen doesn't
7214 // allow you to have a no side effect instruction in the output of a
7215 // sideeffecting pattern.
7216 auto [ID, Offset, Width] =
7217 AMDGPU::Hwreg::HwregEncoding::decode(MI.getOperand(1).getImm());
7219 return BB;
7220
7221 const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
7222 const unsigned SetMask = WidthMask << Offset;
7223
7224 if (getSubtarget()->hasDenormModeInst()) {
7225 unsigned SetDenormOp = 0;
7226 unsigned SetRoundOp = 0;
7227
7228 // The dedicated instructions can only set the whole denorm or round mode
7229 // at once, not a subset of bits in either.
7230 if (SetMask ==
7232 // If this fully sets both the round and denorm mode, emit the two
7233 // dedicated instructions for these.
7234 SetRoundOp = AMDGPU::S_ROUND_MODE;
7235 SetDenormOp = AMDGPU::S_DENORM_MODE;
7236 } else if (SetMask == AMDGPU::Hwreg::FP_ROUND_MASK) {
7237 SetRoundOp = AMDGPU::S_ROUND_MODE;
7238 } else if (SetMask == AMDGPU::Hwreg::FP_DENORM_MASK) {
7239 SetDenormOp = AMDGPU::S_DENORM_MODE;
7240 }
7241
7242 if (SetRoundOp || SetDenormOp) {
7243 MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
7244 if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
7245 unsigned ImmVal = Def->getOperand(1).getImm();
7246 if (SetRoundOp) {
7247 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
7248 .addImm(ImmVal & 0xf);
7249
7250 // If we also have the denorm mode, get just the denorm mode bits.
7251 ImmVal >>= 4;
7252 }
7253
7254 if (SetDenormOp) {
7255 BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
7256 .addImm(ImmVal & 0xf);
7257 }
7258
7259 MI.eraseFromParent();
7260 return BB;
7261 }
7262 }
7263 }
7264
7265 // If only FP bits are touched, use the no side effects pseudo.
7266 if ((SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
7267 AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask)
7268 MI.setDesc(TII->get(AMDGPU::S_SETREG_B32_mode));
7269
7270 return BB;
7271 }
7272 case AMDGPU::S_INVERSE_BALLOT_U32:
7273 case AMDGPU::S_INVERSE_BALLOT_U64:
7274 // These opcodes only exist to let SIFixSGPRCopies insert a readfirstlane if
7275 // necessary. After that they are equivalent to a COPY.
7276 MI.setDesc(TII->get(AMDGPU::COPY));
7277 return BB;
7278 case AMDGPU::ENDPGM_TRAP: {
7279 if (BB->succ_empty() && std::next(MI.getIterator()) == BB->end()) {
7280 MI.setDesc(TII->get(AMDGPU::S_ENDPGM));
7281 MI.addOperand(MachineOperand::CreateImm(0));
7282 return BB;
7283 }
7284
7285 // We need a block split to make the real endpgm a terminator. We also don't
7286 // want to break phis in successor blocks, so we can't just delete to the
7287 // end of the block.
7288
7289 MachineBasicBlock *SplitBB = BB->splitAt(MI, false /*UpdateLiveIns*/);
7291 MF->push_back(TrapBB);
7292 // clang-format off
7293 BuildMI(*TrapBB, TrapBB->end(), DL, TII->get(AMDGPU::S_ENDPGM))
7294 .addImm(0);
7295 BuildMI(*BB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
7296 .addMBB(TrapBB);
7297 // clang-format on
7298
7299 BB->addSuccessor(TrapBB);
7300 MI.eraseFromParent();
7301 return SplitBB;
7302 }
7303 case AMDGPU::SIMULATED_TRAP: {
7304 assert(Subtarget->hasPrivEnabledTrap2NopBug());
7305 MachineBasicBlock *SplitBB =
7306 TII->insertSimulatedTrap(MRI, *BB, MI, MI.getDebugLoc());
7307 MI.eraseFromParent();
7308 return SplitBB;
7309 }
7310 case AMDGPU::SI_TCRETURN_GFX_WholeWave:
7311 case AMDGPU::SI_WHOLE_WAVE_FUNC_RETURN: {
7313
7314 // During ISel, it's difficult to propagate the original EXEC mask to use as
7315 // an input to SI_WHOLE_WAVE_FUNC_RETURN. Set it up here instead.
7316 MachineInstr *Setup = TII->getWholeWaveFunctionSetup(*BB->getParent());
7317 assert(Setup && "Couldn't find SI_SETUP_WHOLE_WAVE_FUNC");
7318 Register OriginalExec = Setup->getOperand(0).getReg();
7319 MF->getRegInfo().clearKillFlags(OriginalExec);
7320 MI.getOperand(0).setReg(OriginalExec);
7321 return BB;
7322 }
7323 default:
7324 if (TII->isImage(MI) || TII->isMUBUF(MI)) {
7325 if (!MI.mayStore())
7327 return BB;
7328 }
7330 }
7331}
7332
// NOTE(review): the line opening this definition (listing line 7333) is absent
// from this extract, so the name and parameter list cannot be confirmed here.
// The visible body does not inspect any argument and always answers true.
7334 // This currently forces unfolding various combinations of fsub into fma with
7335 // free fneg'd operands. As long as we have fast FMA (controlled by
7336 // isFMAFasterThanFMulAndFAdd), we should perform these.
7337
7338 // When fma is quarter rate, for f64 where add / sub are at best half rate,
7339 // most of these combines appear to be cycle neutral but save on instruction
7340 // count / code size.
7341 return true;
7342}
7343
7345
7347 EVT VT) const {
// NOTE(review): the first line of this signature (listing line 7346) is absent
// from this extract; `Ctx` is presumably a parameter declared there — confirm.
// A non-vector VT is answered with a plain i1.
7348 if (!VT.isVector()) {
7349 return MVT::i1;
7350 }
// A vector VT is answered with a vector of i1 that has as many elements as VT.
7351 return EVT::getVectorVT(Ctx, MVT::i1, VT.getVectorNumElements());
7352}
7353
// NOTE(review): the signature line (listing line 7354) is absent from this
// extract; `VT` is presumably a parameter declared there — confirm.
7355 // TODO: Should i16 be used always if legal? For now it would force VALU
7356 // shifts.
// Answers i16 only when VT is exactly i16; every other VT gets i32.
7357 return (VT == MVT::i16) ? MVT::i16 : MVT::i32;
7358}
7359
// NOTE(review): the signature line (listing line 7360) is absent from this
// extract; `Ty` is presumably a parameter declared there — confirm.
// Re-sizes the elements of Ty to 16 bits when its scalar size is at most 16
// bits and the subtarget reports 16-bit instructions; otherwise to 32 bits.
7361 return (Ty.getScalarSizeInBits() <= 16 && Subtarget->has16BitInsts())
7362 ? Ty.changeElementSize(16)
7363 : Ty.changeElementSize(32);
7364}
7365
7366// Answering this is somewhat tricky and depends on the specific device, since
7367// devices have different rates for fma or for all f64 operations.
7368//
7369// v_fma_f64 and v_mul_f64 always take the same number of cycles as each other
7370// regardless of which device (although the number of cycles differs between
7371// devices), so it is always profitable for f64.
7372//
7373// v_fma_f32 takes 4 or 16 cycles depending on the device, so it is profitable
7374// only on full rate devices. Normally, we should prefer selecting v_mad_f32
7375// which we can always do even without fused FP ops since it returns the same
7376// result as the separate operations and since it is always full
7377// rate. Therefore, we lie and report that it is not faster for f32. v_mad_f32
7378// however does not support denormals, so we do report fma as faster if we have
7379// a fast fma device and require denormals.
7380//
7382 EVT VT) const {
// NOTE(review): the first line of this signature (listing line 7381) is absent
// from this extract; `MF` is presumably a parameter declared there — confirm.
// The decision is made on the scalar element type, so vectors are treated like
// their element type.
7383 VT = VT.getScalarType();
7384
7385 switch (VT.getSimpleVT().SimpleTy) {
7386 case MVT::f32: {
7387 // If mad is not available this depends only on if f32 fma is full rate.
7388 if (!Subtarget->hasMadMacF32Insts())
7389 return Subtarget->hasFastFMAF32();
7390
7391 // Otherwise f32 mad is always full rate and returns the same result as
7392 // the separate operations so should be preferred over fma.
7393 // However, mad does not support denormals.
// NOTE(review): listing line 7394 is absent from this extract. As shown, the
// return below is unconditional and the return after it is unreachable;
// presumably the missing line guards this return on the f32 denormal mode —
// confirm against the upstream file.
7395 return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
7396
7397 // If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
7398 return Subtarget->hasFastFMAF32() && Subtarget->hasDLInsts();
7399 }
7400 case MVT::f64:
7401 return true;
7402 case MVT::f16:
7403 case MVT::bf16:
// 16-bit types: faster only with 16-bit instructions and when the f64/f16
// denormal mode is not flush-all.
7404 return Subtarget->has16BitInsts() && !denormalModeIsFlushAllF64F16(MF);
7405 default:
7406 break;
7407 }
7408
// Any other scalar type is reported as not faster.
7409 return false;
7410}
7411
7413 LLT Ty) const {
// NOTE(review): the first line of this signature (listing line 7412) is absent
// from this extract; `MF` is presumably a parameter declared there — confirm.
// Maps the scalar bit width of Ty onto the matching floating-point MVT and
// forwards the question to isFMAFasterThanFMulAndFAdd for that MVT.
7414 switch (Ty.getScalarSizeInBits()) {
7415 case 16:
7416 return isFMAFasterThanFMulAndFAdd(MF, MVT::f16);
7417 case 32:
7418 return isFMAFasterThanFMulAndFAdd(MF, MVT::f32);
7419 case 64:
7420 return isFMAFasterThanFMulAndFAdd(MF, MVT::f64);
7421 default:
7422 break;
7423 }
7424
// Widths other than 16, 32 and 64 bits are reported as not faster.
7425 return false;
7426}
7427
// NOTE(review): the signature line (listing line 7428) is absent from this
// extract; `MI` and `Ty` are presumably parameters declared there — confirm.
// Only scalar types are ever accepted.
7429 if (!Ty.isScalar())
7430 return false;
7431
// 16-bit: requires the subtarget's mad f16 support and a flush-all f64/f16
// denormal mode for the function containing MI.
7432 if (Ty.getScalarSizeInBits() == 16)
7433 return Subtarget->hasMadF16() && denormalModeIsFlushAllF64F16(*MI.getMF());
// 32-bit: requires the subtarget's mad/mac f32 instructions and a flush-all
// f32 denormal mode for the function containing MI.
7434 if (Ty.getScalarSizeInBits() == 32)
7435 return Subtarget->hasMadMacF32Insts() &&
7436 denormalModeIsFlushAllF32(*MI.getMF());
7437
// Every other width is rejected.
7438 return false;
7439}
7440
7442 const SDNode *N) const {
// NOTE(review): the first line of this signature (listing line 7441) is absent
// from this extract, so any parameters before `N` cannot be confirmed here.
7443 // TODO: Check future ftz flag
7444 // v_mad_f32/v_mac_f32 do not support denormals.
// The decision is keyed on the type of N's first result value.
7445 EVT VT = N->getValueType(0);
7446 if (VT == MVT::f32)
7447 return Subtarget->hasMadMacF32Insts() &&
// NOTE(review): listing line 7448 (the right-hand side of the `&&` above) is
// absent from this extract; presumably an f32 denormal-mode check — confirm.
7449 if (VT == MVT::f16) {
7450 return Subtarget->hasMadF16() &&
// NOTE(review): listing line 7451 (the right-hand side of the `&&` above) is
// absent from this extract; presumably an f16 denormal-mode check — confirm.
7452 }
7453
// Any other value type is rejected.
7454 return false;
7455}
7456
7457//===----------------------------------------------------------------------===//
7458// Custom DAG Lowering Operations
7459//===----------------------------------------------------------------------===//
7460
7461// Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7462// wider vector type is legal.
7464 SelectionDAG &DAG) const {
// NOTE(review): the first line of this signature (listing line 7463) is absent
// from this extract; `Op` is presumably a parameter declared there — confirm.
7465 unsigned Opc = Op.getOpcode();
7466 EVT VT = Op.getValueType();
// NOTE(review): listing line 7467 is absent from this extract.
7468
// Split vector operand 0 into low and high halves, and compute the matching
// pair of result types for VT.
7469 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
7470 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VT);
7471
7472 SDLoc SL(Op);
7473
7474 // Forward any trailing scalar operands unchanged to both halves.
7475 SmallVector<SDValue, 2> LoOps = {Lo};
7476 SmallVector<SDValue, 2> HiOps = {Hi};
// Everything after operand 0 is appended as-is to both operand lists.
7477 auto TrailingOps = drop_begin(Op->ops());
7478 LoOps.append(TrailingOps.begin(), TrailingOps.end());
7479 HiOps.append(TrailingOps.begin(), TrailingOps.end());
7480
// Re-emit the same opcode on each half, keeping the original node flags.
7481 SDValue OpLo = DAG.getNode(Opc, SL, LoVT, LoOps, Op->getFlags());
7482 SDValue OpHi = DAG.getNode(Opc, SL, HiVT, HiOps, Op->getFlags());
7483
// Join the two half results back into a value of the original type.
7484 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7485}
7486
7487// Enable lowering of ROTR for vxi32 types. This is a workaround for a
7488// regression whereby extra unnecessary instructions were added to codegen
7489// for rotr operations, caused by legalising v2i32 or. This resulted in extra
7490// instructions to extract the result from the vector.
// NOTE(review): the signature line (listing line 7491) is absent from this
// extract; `Op` and `DAG` are presumably parameters declared there — confirm.
// VT is only read by the assert below, hence [[maybe_unused]].
7492 [[maybe_unused]] EVT VT = Op.getValueType();
7493
7494 assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
7495 VT == MVT::v16i32) &&
7496 "Unexpected ValueType.");
7497
// Hand the node to SelectionDAG::UnrollVectorOp and return its result.
7498 return DAG.UnrollVectorOp(Op.getNode());
7499}
7500
7501 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
7502 // wider vector type is legal.
 //
 // Two-operand variant: both vector operands are split into low and high
 // halves, the opcode is emitted on each pair of halves with the original node
 // flags, and the two results are concatenated back into VT.
7504 SelectionDAG &DAG) const {
7505 unsigned Opc = Op.getOpcode();
7506 EVT VT = Op.getValueType();
7508
 // Split both vector operands into low/high halves.
7509 auto [Lo0, Hi0] = DAG.SplitVectorOperand(Op.getNode(), 0);
7510 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7511
7512 SDLoc SL(Op);
7513
 // Each half-width node takes its result type from the split first operand.
7514 SDValue OpLo =
7515 DAG.getNode(Opc, SL, Lo0.getValueType(), Lo0, Lo1, Op->getFlags());
7516 SDValue OpHi =
7517 DAG.getNode(Opc, SL, Hi0.getValueType(), Hi0, Hi1, Op->getFlags());
7518
 // Reassemble the full-width result from the two halves.
7519 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7520}
7521
7523 SelectionDAG &DAG) const {
 // Three-operand variant of the vector splitting helpers: the operation is
 // emitted on the low and high halves of its operands and the two results are
 // concatenated back into the original result type.
7524 unsigned Opc = Op.getOpcode();
7525 EVT VT = Op.getValueType();
7527
 // Operand 0 may be a scalar; in that case the same value feeds both halves.
7528 SDValue Op0 = Op.getOperand(0);
7529 auto [Lo0, Hi0] = Op0.getValueType().isVector()
7530 ? DAG.SplitVectorOperand(Op.getNode(), 0)
7531 : std::pair(Op0, Op0);
7532
 // Operands 1 and 2 are always split as vectors.
7533 auto [Lo1, Hi1] = DAG.SplitVectorOperand(Op.getNode(), 1);
7534 auto [Lo2, Hi2] = DAG.SplitVectorOperand(Op.getNode(), 2);
7535
7536 SDLoc SL(Op);
 // Result types of the two halves, derived from the full result type.
7537 auto ResVT = DAG.GetSplitDestVTs(VT);
7538
7539 SDValue OpLo =
7540 DAG.getNode(Opc, SL, ResVT.first, Lo0, Lo1, Lo2, Op->getFlags());
7541 SDValue OpHi =
7542 DAG.getNode(Opc, SL, ResVT.second, Hi0, Hi1, Hi2, Op->getFlags());
7543
 // Reassemble the full-width result from the two halves.
7544 return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
7545}
7546
7548 switch (Op.getOpcode()) {
7549 default:
7551 case ISD::BRCOND:
7552 return LowerBRCOND(Op, DAG);
7553 case ISD::RETURNADDR:
7554 return LowerRETURNADDR(Op, DAG);
7555 case ISD::SPONENTRY:
7556 return LowerSPONENTRY(Op, DAG);
7557 case ISD::LOAD: {
7558 SDValue Result = LowerLOAD(Op, DAG);
7559 assert((!Result.getNode() || Result.getNode()->getNumValues() == 2) &&
7560 "Load should return a value and a chain");
7561 return Result;
7562 }
7563 case ISD::FSQRT: {
7564 EVT VT = Op.getValueType();
7565 if (VT == MVT::f32)
7566 return lowerFSQRTF32(Op, DAG);
7567 if (VT == MVT::f64)
7568 return lowerFSQRTF64(Op, DAG);
7569 return SDValue();
7570 }
7571 case ISD::FSIN:
7572 case ISD::FCOS:
7573 return LowerTrig(Op, DAG);
7574 case ISD::SELECT:
7575 return LowerSELECT(Op, DAG);
7576 case ISD::FDIV:
7577 return LowerFDIV(Op, DAG);
7578 case ISD::FFREXP:
7579 return LowerFFREXP(Op, DAG);
7581 return LowerATOMIC_CMP_SWAP(Op, DAG);
7582 case ISD::STORE:
7583 return LowerSTORE(Op, DAG);
7584 case ISD::GlobalAddress: {
7587 return LowerGlobalAddress(MFI, Op, DAG);
7588 }
7589 case ISD::BlockAddress:
7590 return LowerBlockAddress(Op, DAG);
7592 return LowerExternalSymbol(Op, DAG);
7594 return LowerINTRINSIC_WO_CHAIN(Op, DAG);
7596 return LowerINTRINSIC_W_CHAIN(Op, DAG);
7598 return LowerINTRINSIC_VOID(Op, DAG);
7599 case ISD::ADDRSPACECAST:
7600 return lowerADDRSPACECAST(Op, DAG);
7602 return lowerINSERT_SUBVECTOR(Op, DAG);
7604 return lowerINSERT_VECTOR_ELT(Op, DAG);
7606 return lowerEXTRACT_VECTOR_ELT(Op, DAG);
7608 return lowerVECTOR_SHUFFLE(Op, DAG);
7610 return lowerSCALAR_TO_VECTOR(Op, DAG);
7611 case ISD::BUILD_VECTOR:
7612 return lowerBUILD_VECTOR(Op, DAG);
7613 case ISD::FP_ROUND:
7615 return lowerFP_ROUND(Op, DAG);
7616 case ISD::TRAP:
7617 return lowerTRAP(Op, DAG);
7618 case ISD::DEBUGTRAP:
7619 return lowerDEBUGTRAP(Op, DAG);
7620 case ISD::ABS:
7621 case ISD::FABS:
7622 case ISD::FNEG:
7623 case ISD::FCANONICALIZE:
7624 case ISD::BSWAP:
7625 return splitUnaryVectorOp(Op, DAG);
7628 if (Op.getValueType().isVector() && Op.getValueType() != MVT::v2i16 &&
7629 Op.getOperand(0).getValueType().getScalarType() == MVT::f32)
7630 return splitUnaryVectorOp(Op, DAG);
7631 return LowerFP_TO_INT_SAT(Op, DAG);
7632 case ISD::FMINNUM:
7633 case ISD::FMAXNUM:
7634 return lowerFMINNUM_FMAXNUM(Op, DAG);
7635 case ISD::FMINIMUMNUM:
7636 case ISD::FMAXIMUMNUM:
7637 return lowerFMINIMUMNUM_FMAXIMUMNUM(Op, DAG);
7638 case ISD::FMINIMUM:
7639 case ISD::FMAXIMUM:
7640 return lowerFMINIMUM_FMAXIMUM(Op, DAG);
7641 case ISD::FLDEXP:
7642 case ISD::STRICT_FLDEXP:
7643 return lowerFLDEXP(Op, DAG);
7644 case ISD::FMA:
7645 return splitTernaryVectorOp(Op, DAG);
7646 case ISD::FP_TO_SINT:
7647 case ISD::FP_TO_UINT:
7648 if (Subtarget->hasVCvtPkIU16F32() && Op.getValueType() == MVT::i16 &&
7649 Op.getOperand(0).getValueType() == MVT::f32) {
7650 // Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
7651 return Op;
7652 }
7653 return LowerFP_TO_INT(Op, DAG);
7654 case ISD::SHL:
7655 case ISD::SRA:
7656 case ISD::SRL:
7657 case ISD::ADD:
7658 case ISD::SUB:
7659 case ISD::SMIN:
7660 case ISD::SMAX:
7661 case ISD::UMIN:
7662 case ISD::UMAX:
7663 case ISD::FADD:
7664 case ISD::FMUL:
7665 case ISD::FMINNUM_IEEE:
7666 case ISD::FMAXNUM_IEEE:
7667 case ISD::UADDSAT:
7668 case ISD::USUBSAT:
7669 case ISD::SADDSAT:
7670 case ISD::SSUBSAT:
7671 return splitBinaryVectorOp(Op, DAG);
7672 case ISD::FCOPYSIGN:
7673 return lowerFCOPYSIGN(Op, DAG);
7674 case ISD::MUL:
7675 return lowerMUL(Op, DAG);
7676 case ISD::SMULO:
7677 case ISD::UMULO:
7678 return lowerXMULO(Op, DAG);
7679 case ISD::SMUL_LOHI:
7680 case ISD::UMUL_LOHI:
7681 return lowerXMUL_LOHI(Op, DAG);
7683 return LowerDYNAMIC_STACKALLOC(Op, DAG);
7684 case ISD::STACKSAVE:
7685 return LowerSTACKSAVE(Op, DAG);
7686 case ISD::GET_ROUNDING:
7687 return lowerGET_ROUNDING(Op, DAG);
7688 case ISD::SET_ROUNDING:
7689 return lowerSET_ROUNDING(Op, DAG);
7690 case ISD::PREFETCH:
7691 return lowerPREFETCH(Op, DAG);
7692 case ISD::FP_EXTEND:
7694 return lowerFP_EXTEND(Op, DAG);
7695 case ISD::GET_FPENV:
7696 return lowerGET_FPENV(Op, DAG);
7697 case ISD::SET_FPENV:
7698 return lowerSET_FPENV(Op, DAG);
7699 case ISD::ROTR:
7700 return lowerROTR(Op, DAG);
7701 case ISD::INLINEASM:
7702 return LowerINLINEASM(Op, DAG);
7703 }
7704 return SDValue();
7705}
7706
7707 // Used for D16: Casts the result of an instruction into the right vector,
7708 // packs values if loads return unpacked values.
7710 const SDLoc &DL, SelectionDAG &DAG,
7711 bool Unpacked) {
 // Scalar results are returned untouched.
7712 if (!LoadVT.isVector())
7713 return Result;
7714
7715 // Cast back to the original packed type or to a larger type that is a
7716 // multiple of 32 bit for D16. Widening the return type is required for
7717 // legalization.
7718 EVT FittingLoadVT = LoadVT;
7719 if ((LoadVT.getVectorNumElements() % 2) == 1) {
7720 FittingLoadVT =
7722 LoadVT.getVectorNumElements() + 1);
7723 }
7724
7725 if (Unpacked) { // From v2i32/v4i32 back to v2f16/v4f16.
7726 // Truncate to v2i16/v4i16.
7727 EVT IntLoadVT = FittingLoadVT.changeTypeToInteger();
7728
7729 // Workaround legalizer not scalarizing truncate after vector op
7730 // legalization but not creating intermediate vector trunc.
7732 DAG.ExtractVectorElements(Result, Elts);
7733 for (SDValue &Elt : Elts)
7734 Elt = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Elt);
7735
7736 // Pad illegal v1i16/v3i16 with one poison element (to v2i16/v4i16).
7737 if ((LoadVT.getVectorNumElements() % 2) == 1)
7738 Elts.push_back(DAG.getPOISON(MVT::i16));
7739
7740 Result = DAG.getBuildVector(IntLoadVT, DL, Elts);
7741
7742 // Bitcast to original type (v2f16/v4f16).
7743 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7744 }
7745
7746 // Cast back to the original packed type.
7747 return DAG.getNode(ISD::BITCAST, DL, FittingLoadVT, Result);
7748}
7749
/// Re-emit the memory node \p M with an equivalent result type and convert the
/// loaded value back afterwards.
///
/// For vector results the equivalent type uses i32 elements when the subtarget
/// reports unpacked D16 VMEM, and otherwise gains one extra element when the
/// element count is odd. When \p IsIntrinsic is set the new node uses
/// ISD::INTRINSIC_W_CHAIN instead of \p Opcode. The return value merges the
/// adjusted value with result 1 of the new load.
7750SDValue SITargetLowering::adjustLoadValueType(unsigned Opcode, MemSDNode *M,
7751 SelectionDAG &DAG,
7753 bool IsIntrinsic) const {
7754 SDLoc DL(M);
7755
7756 bool Unpacked = Subtarget->hasUnpackedD16VMem();
7757 EVT LoadVT = M->getValueType(0);
7758
 // Scalar loads keep their type; only vectors need an equivalent type.
7759 EVT EquivLoadVT = LoadVT;
7760 if (LoadVT.isVector()) {
7761 if (Unpacked) {
7762 EquivLoadVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
7763 LoadVT.getVectorNumElements());
7764 } else if ((LoadVT.getVectorNumElements() % 2) == 1) {
7765 // Widen v3f16 to legal type
7766 EquivLoadVT =
7768 LoadVT.getVectorNumElements() + 1);
7769 }
7770 }
7771
7772 // Change from v4f16/v2f16 to EquivLoadVT.
7773 SDVTList VTList = DAG.getVTList(EquivLoadVT, MVT::Other);
7774
7776 IsIntrinsic ? (unsigned)ISD::INTRINSIC_W_CHAIN : Opcode, DL, VTList, Ops,
7777 M->getMemoryVT(), M->getMemOperand());
7778
 // Convert the loaded value back towards the original load type.
7779 SDValue Adjusted = adjustLoadValueTypeImpl(Load, LoadVT, DL, DAG, Unpacked);
7780
 // Return the adjusted value together with result 1 (MVT::Other) of the load.
7781 return DAG.getMergeValues({Adjusted, Load.getValue(1)}, DL);
7782}
7783
7784SDValue SITargetLowering::lowerIntrinsicLoad(MemSDNode *M, bool IsFormat,
7785 SelectionDAG &DAG,
7786 ArrayRef<SDValue> Ops) const {
7787 SDLoc DL(M);
7788 EVT LoadVT = M->getValueType(0);
7789 EVT EltType = LoadVT.getScalarType();
7790 EVT IntVT = LoadVT.changeTypeToInteger();
7791
7792 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
7793
7794 assert(M->getNumValues() == 2 || M->getNumValues() == 3);
7795 bool IsTFE = M->getNumValues() == 3;
7796
7797 unsigned Opc = IsFormat ? (IsTFE ? AMDGPUISD::BUFFER_LOAD_FORMAT_TFE
7798 : AMDGPUISD::BUFFER_LOAD_FORMAT)
7799 : IsTFE ? AMDGPUISD::BUFFER_LOAD_TFE
7800 : AMDGPUISD::BUFFER_LOAD;
7801
7802 if (IsD16) {
7803 return adjustLoadValueType(AMDGPUISD::BUFFER_LOAD_FORMAT_D16, M, DAG, Ops);
7804 }
7805
7806 // Handle BUFFER_LOAD_BYTE/UBYTE/SHORT/USHORT overloaded intrinsics
7807 if (!IsD16 && !LoadVT.isVector() && EltType.getSizeInBits() < 32)
7808 return handleByteShortBufferLoads(DAG, LoadVT, DL, Ops, M->getMemOperand(),
7809 IsTFE);
7810
7811 if (isTypeLegal(LoadVT)) {
7812 return getMemIntrinsicNode(Opc, DL, M->getVTList(), Ops, IntVT,
7813 M->getMemOperand(), DAG);
7814 }
7815
7816 EVT CastVT = getEquivalentMemType(*DAG.getContext(), LoadVT);
7817 SDVTList VTList = DAG.getVTList(CastVT, MVT::Other);
7818 SDValue MemNode = getMemIntrinsicNode(Opc, DL, VTList, Ops, CastVT,
7819 M->getMemOperand(), DAG);
7820 return DAG.getMergeValues(
7821 {DAG.getNode(ISD::BITCAST, DL, LoadVT, MemNode), MemNode.getValue(1)},
7822 DL);
7823}
7824
7826 SelectionDAG &DAG) {
 // Lowers an integer-compare intrinsic node to AMDGPUISD::SETCC. Operands 1
 // and 2 are the values to compare and operand 3 is the predicate constant.
7827 EVT VT = N->getValueType(0);
7828 unsigned CondCode = N->getConstantOperandVal(3);
 // A predicate that is not an integer predicate yields poison.
7829 if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
7830 return DAG.getPOISON(VT);
7831
7832 ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
7833
7834 SDValue LHS = N->getOperand(1);
7835 SDValue RHS = N->getOperand(2);
7836
7837 SDLoc DL(N);
7838
 // When i16 is not a legal type, promote both sides and compare in i32.
7839 EVT CmpVT = LHS.getValueType();
7840 if (CmpVT == MVT::i16 && !TLI.isTypeLegal(MVT::i16)) {
7841 unsigned PromoteOp =
7843 LHS = DAG.getNode(PromoteOp, DL, MVT::i32, LHS);
7844 RHS = DAG.getNode(PromoteOp, DL, MVT::i32, RHS);
7845 }
7846
7847 ISD::CondCode CCOpcode = getICmpCondCode(IcInput);
7848
 // The compare is produced as an integer as wide as the wavefront size.
7849 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7850 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7851
7852 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, DL, CCVT, LHS, RHS,
7853 DAG.getCondCode(CCOpcode));
 // Zero-extend or truncate when the requested result width differs.
7854 if (VT.bitsEq(CCVT))
7855 return SetCC;
7856 return DAG.getZExtOrTrunc(SetCC, DL, VT);
7857}
7858
7860 SelectionDAG &DAG) {
 // Lowers a floating-point-compare intrinsic node to AMDGPUISD::SETCC.
 // Operands 1 and 2 are the values to compare and operand 3 is the predicate
 // constant.
7861 EVT VT = N->getValueType(0);
7862
 // A predicate that is not a floating-point predicate yields poison.
7863 unsigned CondCode = N->getConstantOperandVal(3);
7864 if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
7865 return DAG.getPOISON(VT);
7866
7867 SDValue Src0 = N->getOperand(1);
7868 SDValue Src1 = N->getOperand(2);
7869 EVT CmpVT = Src0.getValueType();
7870 SDLoc SL(N);
7871
 // When f16 is not a legal type, extend both sides and compare in f32.
7872 if (CmpVT == MVT::f16 && !TLI.isTypeLegal(CmpVT)) {
7873 Src0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src0);
7874 Src1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src1);
7875 }
7876
7877 FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
7878 ISD::CondCode CCOpcode = getFCmpCondCode(IcInput);
 // The compare is produced as an integer as wide as the wavefront size.
7879 unsigned WavefrontSize = TLI.getSubtarget()->getWavefrontSize();
7880 EVT CCVT = EVT::getIntegerVT(*DAG.getContext(), WavefrontSize);
7881 SDValue SetCC = DAG.getNode(AMDGPUISD::SETCC, SL, CCVT, Src0, Src1,
7882 DAG.getCondCode(CCOpcode));
 // Zero-extend or truncate when the requested result width differs.
7883 if (VT.bitsEq(CCVT))
7884 return SetCC;
7885 return DAG.getZExtOrTrunc(SetCC, SL, VT);
7886}
7887
7889 SelectionDAG &DAG) {
 // Lowers a ballot intrinsic node whose source is operand 1. Three shapes are
 // recognised: a SETCC source, a constant 0 or 1 source, and the general
 // case, which compares the source against zero.
7890 EVT VT = N->getValueType(0);
7891 SDValue Src = N->getOperand(1);
7892 SDLoc SL(N);
7893
7894 if (Src.getOpcode() == ISD::SETCC) {
7895 SDValue Op0 = Src.getOperand(0);
7896 SDValue Op1 = Src.getOperand(1);
7897 // Need to expand bfloat to float for comparison (setcc).
7898 if (Op0.getValueType() == MVT::bf16) {
7899 Op0 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op0);
7900 Op1 = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op1);
7901 }
7902 // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
7903 return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Op0, Op1, Src.getOperand(2));
7904 }
7905 if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
7906 // (ballot 0) -> 0
7907 if (Arg->isZero())
7908 return DAG.getConstant(0, SL, VT);
7909
7910 // (ballot 1) -> EXEC/EXEC_LO
7911 if (Arg->isOne()) {
 // Only 32-bit and 64-bit results map to an EXEC register here; any other
 // width returns an empty SDValue.
7912 Register Exec;
7913 if (VT.getScalarSizeInBits() == 32)
7914 Exec = AMDGPU::EXEC_LO;
7915 else if (VT.getScalarSizeInBits() == 64)
7916 Exec = AMDGPU::EXEC;
7917 else
7918 return SDValue();
7919
7920 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
7921 }
 // Any other constant falls through to the general lowering below.
7922 }
7923
7924 // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
7925 // ISD::SETNE)
7926 return DAG.getNode(
7927 AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
7928 DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
7929}
7930
7932 EVT VT);
7933
7935 SelectionDAG &DAG) {
7936 EVT VT = N->getValueType(0);
7937 unsigned ValSize = VT.getSizeInBits();
7938 unsigned IID = N->getConstantOperandVal(0);
7939 bool IsPermLane16 = IID == Intrinsic::amdgcn_permlane16 ||
7940 IID == Intrinsic::amdgcn_permlanex16;
7941 bool IsSetInactive = IID == Intrinsic::amdgcn_set_inactive ||
7942 IID == Intrinsic::amdgcn_set_inactive_chain_arg;
7943 bool IsPermlaneShuffle = IID == Intrinsic::amdgcn_permlane_bcast ||
7944 IID == Intrinsic::amdgcn_permlane_up ||
7945 IID == Intrinsic::amdgcn_permlane_down ||
7946 IID == Intrinsic::amdgcn_permlane_xor;
7947 SDLoc SL(N);
7948 MVT IntVT = MVT::getIntegerVT(ValSize);
7949 const GCNSubtarget *ST = TLI.getSubtarget();
7950
7951 if ((IsPermLane16 && !ST->hasPermlane16Insts()) ||
7952 (IID == Intrinsic::amdgcn_mov_dpp8 && !ST->hasDPP8()))
7953 return emitRemovedIntrinsicError(DAG, SL, VT);
7954
7955 unsigned SplitSize = 32;
7956 if (IID == Intrinsic::amdgcn_update_dpp && (ValSize % 64 == 0) &&
7957 ST->hasDPALU_DPP() &&
7958 AMDGPU::isLegalDPALU_DPPControl(*ST, N->getConstantOperandVal(3)))
7959 SplitSize = 64;
7960
7961 auto createLaneOp = [&DAG, &SL, N, IID](SDValue Src0, SDValue Src1,
7962 SDValue Src2, MVT ValT) -> SDValue {
7963 SmallVector<SDValue, 8> Operands;
7964 switch (IID) {
7965 case Intrinsic::amdgcn_permlane16:
7966 case Intrinsic::amdgcn_permlanex16:
7967 case Intrinsic::amdgcn_update_dpp:
7968 Operands.push_back(N->getOperand(6));
7969 Operands.push_back(N->getOperand(5));
7970 Operands.push_back(N->getOperand(4));
7971 [[fallthrough]];
7972 case Intrinsic::amdgcn_writelane:
7973 case Intrinsic::amdgcn_permlane_bcast:
7974 case Intrinsic::amdgcn_permlane_up:
7975 case Intrinsic::amdgcn_permlane_down:
7976 case Intrinsic::amdgcn_permlane_xor:
7977 Operands.push_back(Src2);
7978 [[fallthrough]];
7979 case Intrinsic::amdgcn_readlane:
7980 case Intrinsic::amdgcn_set_inactive:
7981 case Intrinsic::amdgcn_set_inactive_chain_arg:
7982 case Intrinsic::amdgcn_mov_dpp8:
7983 Operands.push_back(Src1);
7984 [[fallthrough]];
7985 case Intrinsic::amdgcn_readfirstlane:
7986 case Intrinsic::amdgcn_permlane64:
7987 Operands.push_back(Src0);
7988 break;
7989 default:
7990 llvm_unreachable("unhandled lane op");
7991 }
7992
7993 Operands.push_back(DAG.getTargetConstant(IID, SL, MVT::i32));
7994 std::reverse(Operands.begin(), Operands.end());
7995
7996 if (SDNode *GL = N->getGluedNode()) {
7997 assert(GL->getOpcode() == ISD::CONVERGENCECTRL_GLUE);
7998 GL = GL->getOperand(0).getNode();
7999 Operands.push_back(DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
8000 SDValue(GL, 0)));
8001 }
8002
8003 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, ValT, Operands);
8004 };
8005
8006 SDValue Src0 = N->getOperand(1);
8007 SDValue Src1, Src2;
8008 if (IID == Intrinsic::amdgcn_readlane || IID == Intrinsic::amdgcn_writelane ||
8009 IID == Intrinsic::amdgcn_mov_dpp8 ||
8010 IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16 ||
8011 IsPermlaneShuffle) {
8012 Src1 = N->getOperand(2);
8013 if (IID == Intrinsic::amdgcn_writelane ||
8014 IID == Intrinsic::amdgcn_update_dpp || IsPermLane16 ||
8015 IsPermlaneShuffle)
8016 Src2 = N->getOperand(3);
8017 }
8018
8019 if (ValSize == SplitSize) {
8020 // Already legal
8021 return SDValue();
8022 }
8023
8024 if (ValSize < 32) {
8025 bool IsFloat = VT.isFloatingPoint();
8026 Src0 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src0) : Src0,
8027 SL, MVT::i32);
8028
8029 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16) {
8030 Src1 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src1) : Src1,
8031 SL, MVT::i32);
8032 }
8033
8034 if (IID == Intrinsic::amdgcn_writelane) {
8035 Src2 = DAG.getAnyExtOrTrunc(IsFloat ? DAG.getBitcast(IntVT, Src2) : Src2,
8036 SL, MVT::i32);
8037 }
8038
8039 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, MVT::i32);
8040 SDValue Trunc = DAG.getAnyExtOrTrunc(LaneOp, SL, IntVT);
8041 return IsFloat ? DAG.getBitcast(VT, Trunc) : Trunc;
8042 }
8043
8044 if (ValSize % SplitSize != 0)
8045 return SDValue();
8046
8047 auto unrollLaneOp = [&DAG, &SL](SDNode *N) -> SDValue {
8048 EVT VT = N->getValueType(0);
8049 unsigned NE = VT.getVectorNumElements();
8050 EVT EltVT = VT.getVectorElementType();
8052 unsigned NumOperands = N->getNumOperands();
8053 SmallVector<SDValue, 4> Operands(NumOperands);
8054 SDNode *GL = N->getGluedNode();
8055
8056 // only handle convergencectrl_glue
8058
8059 for (unsigned i = 0; i != NE; ++i) {
8060 for (unsigned j = 0, e = GL ? NumOperands - 1 : NumOperands; j != e;
8061 ++j) {
8062 SDValue Operand = N->getOperand(j);
8063 EVT OperandVT = Operand.getValueType();
8064 if (OperandVT.isVector()) {
8065 // A vector operand; extract a single element.
8066 EVT OperandEltVT = OperandVT.getVectorElementType();
8067 Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, OperandEltVT,
8068 Operand, DAG.getVectorIdxConstant(i, SL));
8069 } else {
8070 // A scalar operand; just use it as is.
8071 Operands[j] = Operand;
8072 }
8073 }
8074
8075 if (GL)
8076 Operands[NumOperands - 1] =
8077 DAG.getNode(ISD::CONVERGENCECTRL_GLUE, SL, MVT::Glue,
8078 SDValue(GL->getOperand(0).getNode(), 0));
8079
8080 Scalars.push_back(DAG.getNode(N->getOpcode(), SL, EltVT, Operands));
8081 }
8082
8083 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NE);
8084 return DAG.getBuildVector(VecVT, SL, Scalars);
8085 };
8086
8087 if (VT.isVector()) {
8088 switch (MVT::SimpleValueType EltTy =
8090 case MVT::i32:
8091 case MVT::f32:
8092 if (SplitSize == 32) {
8093 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VT.getSimpleVT());
8094 return unrollLaneOp(LaneOp.getNode());
8095 }
8096 [[fallthrough]];
8097 case MVT::i16:
8098 case MVT::f16:
8099 case MVT::bf16: {
8100 unsigned SubVecNumElt =
8101 SplitSize / VT.getVectorElementType().getSizeInBits();
8102 MVT SubVecVT = MVT::getVectorVT(EltTy, SubVecNumElt);
8104 SDValue Src0SubVec, Src1SubVec, Src2SubVec;
8105 for (unsigned i = 0, EltIdx = 0; i < ValSize / SplitSize; i++) {
8106 Src0SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src0,
8107 DAG.getConstant(EltIdx, SL, MVT::i32));
8108
8109 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive ||
8110 IsPermLane16) {
8111 Src1SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src1,
8112 DAG.getConstant(EltIdx, SL, MVT::i32));
8113
8114 Pieces.push_back(
8115 createLaneOp(Src0SubVec, Src1SubVec, Src2, SubVecVT));
8116 } else if (IID == Intrinsic::amdgcn_writelane) {
8117 Src2SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, SubVecVT, Src2,
8118 DAG.getConstant(EltIdx, SL, MVT::i32));
8119 Pieces.push_back(
8120 createLaneOp(Src0SubVec, Src1, Src2SubVec, SubVecVT));
8121 } else {
8122 Pieces.push_back(createLaneOp(Src0SubVec, Src1, Src2, SubVecVT));
8123 }
8124
8125 EltIdx += SubVecNumElt;
8126 }
8127 return DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, Pieces);
8128 }
8129 default:
8130 // Handle all other cases by bitcasting to i32 vectors
8131 break;
8132 }
8133 }
8134
8135 MVT VecVT =
8136 MVT::getVectorVT(MVT::getIntegerVT(SplitSize), ValSize / SplitSize);
8137 Src0 = DAG.getBitcast(VecVT, Src0);
8138
8139 if (IID == Intrinsic::amdgcn_update_dpp || IsSetInactive || IsPermLane16)
8140 Src1 = DAG.getBitcast(VecVT, Src1);
8141
8142 if (IID == Intrinsic::amdgcn_writelane)
8143 Src2 = DAG.getBitcast(VecVT, Src2);
8144
8145 SDValue LaneOp = createLaneOp(Src0, Src1, Src2, VecVT);
8146 SDValue UnrolledLaneOp = unrollLaneOp(LaneOp.getNode());
8147 return DAG.getBitcast(VT, UnrolledLaneOp);
8148}
8149
8151 SelectionDAG &DAG) {
 // Lowers a lane shuffle whose value is operand 1 and whose lane index is
 // operand 2, using ds_bpermute. Only 32-bit results are handled; other sizes
 // return an empty SDValue.
8152 EVT VT = N->getValueType(0);
8153
8154 if (VT.getSizeInBits() != 32)
8155 return SDValue();
8156
8157 SDLoc SL(N);
8158
8159 SDValue Value = N->getOperand(1);
8160 SDValue Index = N->getOperand(2);
8161
8162 // ds_bpermute requires index to be multiplied by 4
8163 SDValue ShiftAmount = DAG.getShiftAmountConstant(2, MVT::i32, SL);
8164 SDValue ShiftedIndex =
8165 DAG.getNode(ISD::SHL, SL, Index.getValueType(), Index, ShiftAmount);
8166
8167 // Intrinsics will require i32 to operate on
8168 SDValue ValueI32 = DAG.getBitcast(MVT::i32, Value);
8169
 // Builds an INTRINSIC_WO_CHAIN node: the intrinsic ID is placed first as a
 // target constant, followed by the supplied arguments.
8170 auto MakeIntrinsic = [&DAG, &SL](unsigned IID, MVT RetVT,
8171 SmallVector<SDValue> IntrinArgs) -> SDValue {
8172 SmallVector<SDValue> Operands(1);
8173 Operands[0] = DAG.getTargetConstant(IID, SL, MVT::i32);
8174 Operands.append(IntrinArgs);
8175 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, RetVT, Operands);
8176 };
8177
8178 // If we can bpermute across the whole wave, then just do that
8180 SDValue BPermute = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8181 {ShiftedIndex, ValueI32});
8182 return DAG.getBitcast(VT, BPermute);
8183 }
8184
8185 assert(TLI.getSubtarget()->isWave64());
8186
8187 // Otherwise, we need to make use of whole wave mode
8188 SDValue PoisonVal = DAG.getPOISON(ValueI32->getValueType(0));
8189
8190 // Set inactive lanes to poison
8191 SDValue WWMValue = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8192 {ValueI32, PoisonVal});
8193 SDValue WWMIndex = MakeIntrinsic(Intrinsic::amdgcn_set_inactive, MVT::i32,
8194 {ShiftedIndex, PoisonVal});
8195
 // permlane64 result of the value, used as the "other half" source below.
8196 SDValue Swapped =
8197 MakeIntrinsic(Intrinsic::amdgcn_permlane64, MVT::i32, {WWMValue});
8198
8199 // Get permutation of each half, then we'll select which one to use
8200 SDValue BPermSameHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute, MVT::i32,
8201 {WWMIndex, WWMValue});
8202 SDValue BPermOtherHalf = MakeIntrinsic(Intrinsic::amdgcn_ds_bpermute,
8203 MVT::i32, {WWMIndex, Swapped});
8204 SDValue BPermOtherHalfWWM =
8205 MakeIntrinsic(Intrinsic::amdgcn_wwm, MVT::i32, {BPermOtherHalf});
8206
8207 // Select which side to take the permute from
8208 SDValue ThreadIDMask = DAG.getAllOnesConstant(SL, MVT::i32);
8209 // We can get away with only using mbcnt_lo here since we're only
8210 // trying to detect which side of 32 each lane is on, and mbcnt_lo
8211 // returns 32 for lanes 32-63.
8212 SDValue ThreadID =
8213 MakeIntrinsic(Intrinsic::amdgcn_mbcnt_lo, MVT::i32,
8214 {ThreadIDMask, DAG.getTargetConstant(0, SL, MVT::i32)});
8215
 // (ThreadID ^ Index) & 32 is zero exactly when ThreadID and the requested
 // index agree in bit 5; in that case the same-half permute is selected.
8216 SDValue SameOrOtherHalf =
8217 DAG.getNode(ISD::AND, SL, MVT::i32,
8218 DAG.getNode(ISD::XOR, SL, MVT::i32, ThreadID, Index),
8219 DAG.getTargetConstant(32, SL, MVT::i32));
8220 SDValue UseSameHalf =
8221 DAG.getSetCC(SL, MVT::i1, SameOrOtherHalf,
8222 DAG.getConstant(0, SL, MVT::i32), ISD::SETEQ);
8223 SDValue Result = DAG.getSelect(SL, MVT::i32, UseSameHalf, BPermSameHalf,
8224 BPermOtherHalfWWM);
 // Return the selected value in the caller's original 32-bit type.
8225 return DAG.getBitcast(VT, Result);
8226}
8227
8230 SelectionDAG &DAG) const {
8231 switch (N->getOpcode()) {
8233 if (SDValue Res = lowerINSERT_VECTOR_ELT(SDValue(N, 0), DAG))
8234 Results.push_back(Res);
8235 return;
8236 }
8238 if (SDValue Res = lowerEXTRACT_VECTOR_ELT(SDValue(N, 0), DAG))
8239 Results.push_back(Res);
8240 return;
8241 }
8243 unsigned IID = N->getConstantOperandVal(0);
8244 switch (IID) {
8245 case Intrinsic::amdgcn_make_buffer_rsrc:
8246 Results.push_back(lowerPointerAsRsrcIntrin(N, DAG));
8247 return;
8248 case Intrinsic::amdgcn_cvt_pkrtz: {
8249 SDValue Src0 = N->getOperand(1);
8250 SDValue Src1 = N->getOperand(2);
8251 SDLoc SL(N);
8252 SDValue Cvt =
8253 DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1);
8254 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
8255 return;
8256 }
8257 case Intrinsic::amdgcn_cvt_pknorm_i16:
8258 case Intrinsic::amdgcn_cvt_pknorm_u16:
8259 case Intrinsic::amdgcn_cvt_pk_i16:
8260 case Intrinsic::amdgcn_cvt_pk_u16: {
8261 SDValue Src0 = N->getOperand(1);
8262 SDValue Src1 = N->getOperand(2);
8263 SDLoc SL(N);
8264 unsigned Opcode;
8265
8266 if (IID == Intrinsic::amdgcn_cvt_pknorm_i16)
8267 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
8268 else if (IID == Intrinsic::amdgcn_cvt_pknorm_u16)
8269 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
8270 else if (IID == Intrinsic::amdgcn_cvt_pk_i16)
8271 Opcode = AMDGPUISD::CVT_PK_I16_I32;
8272 else
8273 Opcode = AMDGPUISD::CVT_PK_U16_U32;
8274
8275 EVT VT = N->getValueType(0);
8276 if (isTypeLegal(VT))
8277 Results.push_back(DAG.getNode(Opcode, SL, VT, Src0, Src1));
8278 else {
8279 SDValue Cvt = DAG.getNode(Opcode, SL, MVT::i32, Src0, Src1);
8280 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, Cvt));
8281 }
8282 return;
8283 }
8284 case Intrinsic::amdgcn_s_buffer_load: {
8285 // Lower llvm.amdgcn.s.buffer.load.(i8, u8) intrinsics. First, we generate
8286 // s_buffer_load_u8 for signed and unsigned load instructions. Next, DAG
8287 // combiner tries to merge the s_buffer_load_u8 with a sext instruction
8288 // (performSignExtendInRegCombine()) and it replaces s_buffer_load_u8 with
8289 // s_buffer_load_i8.
8290 if (!Subtarget->hasScalarSubwordLoads())
8291 return;
8292 SDValue Op = SDValue(N, 0);
8293 SDValue Rsrc = Op.getOperand(1);
8294 SDValue Offset = Op.getOperand(2);
8295 SDValue CachePolicy = Op.getOperand(3);
8296 EVT VT = Op.getValueType();
8297 assert(VT == MVT::i8 && "Expected 8-bit s_buffer_load intrinsics.\n");
8298 SDLoc DL(Op);
8300 const DataLayout &DataLayout = DAG.getDataLayout();
8301 Align Alignment =
8307 VT.getStoreSize(), Alignment);
8308 SDValue LoadVal;
8309 if (!Offset->isDivergent()) {
8310 SDValue Ops[] = {Rsrc, // source register
8311 Offset, CachePolicy};
8312 SDValue BufferLoad =
8313 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_UBYTE, DL,
8314 DAG.getVTList(MVT::i32), Ops, VT, MMO);
8315 LoadVal = DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
8316 } else {
8317 SDValue Ops[] = {
8318 DAG.getEntryNode(), // Chain
8319 Rsrc, // rsrc
8320 DAG.getConstant(0, DL, MVT::i32), // vindex
8321 {}, // voffset
8322 {}, // soffset
8323 {}, // offset
8324 CachePolicy, // cachepolicy
8325 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
8326 };
8327 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
8328 LoadVal = handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
8329 }
8330 Results.push_back(LoadVal);
8331 return;
8332 }
8333 case Intrinsic::amdgcn_dead: {
8334 for (unsigned I = 0, E = N->getNumValues(); I < E; ++I)
8335 Results.push_back(DAG.getPOISON(N->getValueType(I)));
8336 return;
8337 }
8338 }
8339 break;
8340 }
8342 if (SDValue Res = LowerINTRINSIC_W_CHAIN(SDValue(N, 0), DAG)) {
8343 if (Res.getOpcode() == ISD::MERGE_VALUES) {
8344 // FIXME: Hacky
8345 for (unsigned I = 0; I < Res.getNumOperands(); I++) {
8346 Results.push_back(Res.getOperand(I));
8347 }
8348 } else {
8349 Results.push_back(Res);
8350 Results.push_back(Res.getValue(1));
8351 }
8352 return;
8353 }
8354
8355 break;
8356 }
8357 case ISD::SELECT: {
8358 SDLoc SL(N);
8359 EVT VT = N->getValueType(0);
8360 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
8361 SDValue LHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(1));
8362 SDValue RHS = DAG.getNode(ISD::BITCAST, SL, NewVT, N->getOperand(2));
8363
8364 EVT SelectVT = NewVT;
8365 if (NewVT.bitsLT(MVT::i32)) {
8366 LHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, LHS);
8367 RHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, RHS);
8368 SelectVT = MVT::i32;
8369 }
8370
8371 SDValue NewSelect =
8372 DAG.getNode(ISD::SELECT, SL, SelectVT, N->getOperand(0), LHS, RHS);
8373
8374 if (NewVT != SelectVT)
8375 NewSelect = DAG.getNode(ISD::TRUNCATE, SL, NewVT, NewSelect);
8376 Results.push_back(DAG.getNode(ISD::BITCAST, SL, VT, NewSelect));
8377 return;
8378 }
8379 case ISD::FNEG: {
8380 if (N->getValueType(0) != MVT::v2f16)
8381 break;
8382
8383 SDLoc SL(N);
8384 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
8385
8386 SDValue Op = DAG.getNode(ISD::XOR, SL, MVT::i32, BC,
8387 DAG.getConstant(0x80008000, SL, MVT::i32));
8388 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
8389 return;
8390 }
8391 case ISD::FABS: {
8392 if (N->getValueType(0) != MVT::v2f16)
8393 break;
8394
8395 SDLoc SL(N);
8396 SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::i32, N->getOperand(0));
8397
8398 SDValue Op = DAG.getNode(ISD::AND, SL, MVT::i32, BC,
8399 DAG.getConstant(0x7fff7fff, SL, MVT::i32));
8400 Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Op));
8401 return;
8402 }
8403 case ISD::FSQRT: {
8404 if (N->getValueType(0) != MVT::f16)
8405 break;
8406 Results.push_back(lowerFSQRTF16(SDValue(N, 0), DAG));
8407 break;
8408 }
8409 default:
8411 break;
8412 }
8413}
8414
8415/// Helper function for LowerBRCOND
8416static SDNode *findUser(SDValue Value, unsigned Opcode) {
8417
8418 for (SDUse &U : Value->uses()) {
8419 if (U.get() != Value)
8420 continue;
8421
8422 if (U.getUser()->getOpcode() == Opcode)
8423 return U.getUser();
8424 }
8425 return nullptr;
8426}
8427
8428unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
8429 if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
8430 switch (Intr->getConstantOperandVal(1)) {
8431 case Intrinsic::amdgcn_if:
8432 return AMDGPUISD::IF;
8433 case Intrinsic::amdgcn_else:
8434 return AMDGPUISD::ELSE;
8435 case Intrinsic::amdgcn_loop:
8436 return AMDGPUISD::LOOP;
8437 case Intrinsic::amdgcn_end_cf:
8438 llvm_unreachable("should not occur");
8439 default:
8440 return 0;
8441 }
8442 }
8443
8444 // break, if_break, else_break are all only used as inputs to loop, not
8445 // directly as branch conditions.
8446 return 0;
8447}
8448
8455
8457 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS())
8458 return false;
8459
8460 // FIXME: Either avoid relying on address space here or change the default
8461 // address space for functions to avoid the explicit check.
8462 return (GV->getValueType()->isFunctionTy() ||
8465}
8466
8468 return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
8469}
8470
8472 if (!GV->hasExternalLinkage())
8473 return true;
8474
8475 // With object linking, external LDS declarations need relocations so the
8476 // linker can assign their offsets.
8478 if (const auto *GVar = dyn_cast<GlobalVariable>(GV)) {
8479 if (GVar->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
8480 assert(GVar->isDeclaration() && "AS3 GVs should be declaration here "
8481 "when object linking is enabled");
8482 return false;
8483 }
8484 }
8485 }
8486
8487 const auto OS = getTargetMachine().getTargetTriple().getOS();
8488 return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
8489}
8490
8491/// This transforms the control flow intrinsics to get the branch destination as
8492/// the last parameter; it also swaps the branch target with BR's if the need arises.
///
/// Returns \p BRCOND unchanged when its condition does not come from a
/// control-flow intrinsic recognized by isCFIntrinsic; otherwise returns the
/// chain that follows the replacement node and its register copies.
8493SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, SelectionDAG &DAG) const {
8494 SDLoc DL(BRCOND);
8495
// Operand 1 of BRCOND is the condition, operand 2 the branch target.
8496 SDNode *Intr = BRCOND.getOperand(1).getNode();
8497 SDValue Target = BRCOND.getOperand(2);
8498 SDNode *BR = nullptr;
8499 SDNode *SetCC = nullptr;
8500
// Look through a wrapping SETCC or XOR to reach the node that produces the
// condition. In those two cases Target stays the BRCOND's own target.
8501 switch (Intr->getOpcode()) {
8502 case ISD::SETCC: {
8503 // As long as we negate the condition everything is fine
8504 SetCC = Intr;
8505 Intr = SetCC->getOperand(0).getNode();
8506 break;
8507 }
8508 case ISD::XOR: {
8509 // Similar to SETCC, if we have (xor c, -1), we will be fine.
8510 SDValue LHS = Intr->getOperand(0);
8511 SDValue RHS = Intr->getOperand(1);
// NOTE(review): this accepts any non-zero constant RHS; that equals the -1
// mentioned above only when the value is a single bit - confirm.
8512 if (auto *C = dyn_cast<ConstantSDNode>(RHS); C && C->getZExtValue()) {
8513 Intr = LHS.getNode();
8514 break;
8515 }
8516 [[fallthrough]];
8517 }
8518 default: {
8519 // Get the target from BR if we don't negate the condition
8520 BR = findUser(BRCOND, ISD::BR);
8521 assert(BR && "brcond missing unconditional branch user");
8522 Target = BR->getOperand(1);
8523 }
8524 }
8525
8526 unsigned CFNode = isCFIntrinsic(Intr);
8527 if (CFNode == 0) {
8528 // This is a uniform branch so we don't need to legalize.
8529 return BRCOND;
8530 }
8531
// NOTE(review): source line 8533 (the second half of this condition) is
// absent from this rendering.
8532 bool HaveChain = Intr->getOpcode() == ISD::INTRINSIC_VOID ||
8534
// A SETCC wrapper is only expected in the form "setcc x, 1, ne".
8535 assert(!SetCC ||
8536 (SetCC->getConstantOperandVal(1) == 1 &&
8537 cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
8538 ISD::SETNE));
8539
8540 // operands of the new intrinsic call
// NOTE(review): source line 8541 (the declaration of `Ops`) is absent from
// this rendering.
8542 if (HaveChain)
8543 Ops.push_back(BRCOND.getOperand(0));
8544
// Forward the intrinsic's operands, skipping the first two when it has a
// chain and only the first otherwise, then append the branch target last.
8545 Ops.append(Intr->op_begin() + (HaveChain ? 2 : 1), Intr->op_end());
8546 Ops.push_back(Target);
8547
// The new node produces every value of the intrinsic except its first one.
8548 ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
8549
8550 // build the new intrinsic call
8551 SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
8552
8553 if (!HaveChain) {
8554 SDValue Ops[] = {SDValue(Result, 0), BRCOND.getOperand(0)};
8555
// NOTE(review): source line 8556 (the statement consuming this local `Ops`,
// which shadows the outer one) is absent from this rendering.
8557 }
8558
8559 if (BR) {
8560 // Give the branch instruction our target
// The BR is re-created with the BRCOND's original target (operand 2), i.e.
// the two targets are swapped.
8561 SDValue Ops[] = {BR->getOperand(0), BRCOND.getOperand(2)};
8562 SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
8563 DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
8564 }
8565
// The last value of the new node is used as the chain.
8566 SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
8567
8568 // Copy the intrinsic results to registers
// Value i of the old intrinsic corresponds to value i - 1 of the new node,
// because the first value was dropped. Each old CopyToReg is bypassed by
// redirecting the users of its value 0 to its own operand 0.
8569 for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) {
8570 SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg);
8571 if (!CopyToReg)
8572 continue;
8573
8574 Chain = DAG.getCopyToReg(Chain, DL, CopyToReg->getOperand(1),
8575 SDValue(Result, i - 1), SDValue());
8576
8577 DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0));
8578 }
8579
8580 // Remove the old intrinsic from the chain
8581 DAG.ReplaceAllUsesOfValueWith(SDValue(Intr, Intr->getNumValues() - 1),
8582 Intr->getOperand(0));
8583
8584 return Chain;
8585}
8586
8587SDValue SITargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const {
8588 MVT VT = Op.getSimpleValueType();
8589 SDLoc DL(Op);
8590 // Checking the depth
8591 if (Op.getConstantOperandVal(0) != 0)
8592 return DAG.getConstant(0, DL, VT);
8593
8594 MachineFunction &MF = DAG.getMachineFunction();
8595 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8596 // Check for kernel and shader functions
8597 if (Info->isEntryFunction())
8598 return DAG.getConstant(0, DL, VT);
8599
8600 MachineFrameInfo &MFI = MF.getFrameInfo();
8601 // There is a call to @llvm.returnaddress in this function
8602 MFI.setReturnAddressIsTaken(true);
8603
8604 const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
8605 // Get the return address reg and mark it as an implicit live-in
8606 Register Reg = MF.addLiveIn(TRI->getReturnAddressReg(MF),
8607 getRegClassFor(VT, Op.getNode()->isDivergent()));
8608
8609 return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
8610}
8611
8612SDValue SITargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
8613 MachineFunction &MF = DAG.getMachineFunction();
8614 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
8615
8616 // For functions that set up their own stack, select the GET_STACK_BASE
8617 // pseudo.
8618 if (MFI->isBottomOfStack())
8619 return Op;
8620
8621 // For everything else, create a dummy stack object.
8622 int FI = MF.getFrameInfo().CreateFixedObject(1, 0, /*IsImmutable=*/false);
8623 return DAG.getFrameIndex(FI, Op.getValueType());
8624}
8625
8626SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG, SDValue Op,
8627 const SDLoc &DL, EVT VT) const {
8628 return Op.getValueType().bitsLE(VT)
8629 ? DAG.getNode(ISD::FP_EXTEND, DL, VT, Op)
8630 : DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
8631 DAG.getTargetConstant(0, DL, MVT::i32));
8632}
8633
8634SDValue SITargetLowering::splitFP_ROUNDVectorOp(SDValue Op,
8635 SelectionDAG &DAG) const {
8636 EVT DstVT = Op.getValueType();
8637 unsigned NumElts = DstVT.getVectorNumElements();
8638 assert(NumElts > 2 && isPowerOf2_32(NumElts));
8639
8640 auto [Lo, Hi] = DAG.SplitVectorOperand(Op.getNode(), 0);
8641
8642 SDLoc DL(Op);
8643 unsigned Opc = Op.getOpcode();
8644 SDValue Flags = Op.getOperand(1);
8645 EVT HalfDstVT =
8646 EVT::getVectorVT(*DAG.getContext(), DstVT.getScalarType(), NumElts / 2);
8647 SDValue OpLo = DAG.getNode(Opc, DL, HalfDstVT, Lo, Flags);
8648 SDValue OpHi = DAG.getNode(Opc, DL, HalfDstVT, Hi, Flags);
8649
8650 return DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, OpLo, OpHi);
8651}
8652
8653SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
8654 SDValue Src = Op.getOperand(0);
8655 EVT SrcVT = Src.getValueType();
8656 EVT DstVT = Op.getValueType();
8657
8658 if (DstVT.isVectorOf(MVT::f16)) {
8659 assert(Subtarget->hasCvtPkF16F32Inst() && "support v_cvt_pk_f16_f32");
8660 if (SrcVT.getScalarType() != MVT::f32)
8661 return SDValue();
8662 return SrcVT == MVT::v2f32 ? Op : splitFP_ROUNDVectorOp(Op, DAG);
8663 }
8664
8665 if (SrcVT.getScalarType() != MVT::f64)
8666 return Op;
8667
8668 SDLoc DL(Op);
8669 if (DstVT == MVT::f16) {
8670 // TODO: Handle strictfp
8671 if (Op.getOpcode() != ISD::FP_ROUND)
8672 return Op;
8673
8674 if (!Subtarget->has16BitInsts()) {
8675 SDValue FpToFp16 = DAG.getNode(ISD::FP_TO_FP16, DL, MVT::i32, Src);
8676 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8677 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8678 }
8679 if (Op->getFlags().hasApproximateFuncs()) {
8680 SDValue Flags = Op.getOperand(1);
8681 SDValue Src32 = DAG.getNode(ISD::FP_ROUND, DL, MVT::f32, Src, Flags);
8682 return DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, Src32, Flags);
8683 }
8684 SDValue FpToFp16 = LowerF64ToF16Safe(Src, DL, DAG);
8685 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToFp16);
8686 return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);
8687 }
8688
8689 assert(DstVT.getScalarType() == MVT::bf16 &&
8690 "custom lower FP_ROUND for f16 or bf16");
8691 assert(Subtarget->hasBF16ConversionInsts() && "f32 -> bf16 is legal");
8692
8693 // Round-inexact-to-odd f64 to f32, then do the final rounding using the
8694 // hardware f32 -> bf16 instruction.
8695 EVT F32VT = SrcVT.changeElementType(*DAG.getContext(), MVT::f32);
8696 SDValue Rod = expandRoundInexactToOdd(F32VT, Src, DL, DAG);
8697 return DAG.getNode(ISD::FP_ROUND, DL, DstVT, Rod,
8698 DAG.getTargetConstant(0, DL, MVT::i32));
8699}
8700
8701SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
8702 SelectionDAG &DAG) const {
8703 EVT VT = Op.getValueType();
8704 const MachineFunction &MF = DAG.getMachineFunction();
8705 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8706 bool IsIEEEMode = Info->getMode().IEEE;
8707
8708 // FIXME: Assert during selection that this is only selected for
8709 // ieee_mode. Currently a combine can produce the ieee version for non-ieee
8710 // mode functions, but this happens to be OK since it's only done in cases
8711 // where there is known no sNaN.
8712 if (IsIEEEMode)
8713 return expandFMINNUM_FMAXNUM(Op.getNode(), DAG);
8714
8715 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8716 VT == MVT::v16bf16)
8717 return splitBinaryVectorOp(Op, DAG);
8718 return Op;
8719}
8720
8721SDValue
8722SITargetLowering::lowerFMINIMUMNUM_FMAXIMUMNUM(SDValue Op,
8723 SelectionDAG &DAG) const {
8724 EVT VT = Op.getValueType();
8725 const MachineFunction &MF = DAG.getMachineFunction();
8726 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
8727 bool IsIEEEMode = Info->getMode().IEEE;
8728
8729 if (IsIEEEMode)
8730 return expandFMINIMUMNUM_FMAXIMUMNUM(Op.getNode(), DAG);
8731
8732 if (VT == MVT::v4f16 || VT == MVT::v8f16 || VT == MVT::v16f16 ||
8733 VT == MVT::v16bf16)
8734 return splitBinaryVectorOp(Op, DAG);
8735 return Op;
8736}
8737
8738SDValue SITargetLowering::lowerFMINIMUM_FMAXIMUM(SDValue Op,
8739 SelectionDAG &DAG) const {
8740 EVT VT = Op.getValueType();
8741 if (VT.isVector())
8742 return splitBinaryVectorOp(Op, DAG);
8743
8744 assert(!Subtarget->hasIEEEMinimumMaximumInsts() &&
8745 !Subtarget->hasMinimum3Maximum3F16() &&
8746 Subtarget->hasMinimum3Maximum3PKF16() && VT == MVT::f16 &&
8747 "should not need to widen f16 minimum/maximum to v2f16");
8748
8749 // Widen f16 operation to v2f16
8750
8751 // fminimum f16:x, f16:y ->
8752 // extract_vector_elt (fminimum (v2f16 (scalar_to_vector x))
8753 // (v2f16 (scalar_to_vector y))), 0
8754 SDLoc SL(Op);
8755 SDValue WideSrc0 =
8756 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(0));
8757 SDValue WideSrc1 =
8758 DAG.getNode(ISD::SCALAR_TO_VECTOR, SL, MVT::v2f16, Op.getOperand(1));
8759
8760 SDValue Widened =
8761 DAG.getNode(Op.getOpcode(), SL, MVT::v2f16, WideSrc0, WideSrc1);
8762
8763 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::f16, Widened,
8764 DAG.getConstant(0, SL, MVT::i32));
8765}
8766
8767SDValue SITargetLowering::lowerFLDEXP(SDValue Op, SelectionDAG &DAG) const {
8768 bool IsStrict = Op.getOpcode() == ISD::STRICT_FLDEXP;
8769 EVT VT = Op.getValueType();
8770 assert(VT == MVT::f16);
8771
8772 SDValue Exp = Op.getOperand(IsStrict ? 2 : 1);
8773 EVT ExpVT = Exp.getValueType();
8774 if (ExpVT == MVT::i16)
8775 return Op;
8776
8777 SDLoc DL(Op);
8778
8779 // Correct the exponent type for f16 to i16.
8780 // Clamp the range of the exponent to the instruction's range.
8781
8782 // TODO: This should be a generic narrowing legalization, and can easily be
8783 // for GlobalISel.
8784
8785 SDValue MinExp = DAG.getSignedConstant(minIntN(16), DL, ExpVT);
8786 SDValue ClampMin = DAG.getNode(ISD::SMAX, DL, ExpVT, Exp, MinExp);
8787
8788 SDValue MaxExp = DAG.getSignedConstant(maxIntN(16), DL, ExpVT);
8789 SDValue Clamp = DAG.getNode(ISD::SMIN, DL, ExpVT, ClampMin, MaxExp);
8790
8791 SDValue TruncExp = DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Clamp);
8792
8793 if (IsStrict) {
8794 return DAG.getNode(ISD::STRICT_FLDEXP, DL, {VT, MVT::Other},
8795 {Op.getOperand(0), Op.getOperand(1), TruncExp});
8796 }
8797
8798 return DAG.getNode(ISD::FLDEXP, DL, VT, Op.getOperand(0), TruncExp);
8799}
8800
// NOTE(review): the signature line (source line 8801) is absent from this
// rendering; presumably `static unsigned getExtOpcodeForPromotedOp(SDValue
// Op)`, as that name is called with an SDValue by the promoteUniform*
// functions in this file - confirm.
//
// Picks the extension opcode to use on the operands of Op when the operation
// is promoted to a wider integer type. Opcodes not listed are unreachable.
8802 switch (Op->getOpcode()) {
// Signed operations: sign-extend.
8803 case ISD::ABS:
8804 case ISD::SRA:
8805 case ISD::SMIN:
8806 case ISD::SMAX:
8807 return ISD::SIGN_EXTEND;
// Unsigned operations: zero-extend.
8808 case ISD::SRL:
8809 case ISD::UMIN:
8810 case ISD::UMAX:
8811 case ISD::USUBSAT:
8812 return ISD::ZERO_EXTEND;
8813 case ISD::ADD:
8814 case ISD::SUB:
8815 case ISD::AND:
8816 case ISD::OR:
8817 case ISD::XOR:
8818 case ISD::SHL:
8819 case ISD::SELECT:
8820 case ISD::MUL:
8821 // operation result won't be influenced by garbage high bits.
8822 // TODO: are all of those cases correct, and are there more?
8823 return ISD::ANY_EXTEND;
8824 case ISD::SETCC: {
8825 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
// NOTE(review): source line 8826 (the return statement that uses CC) is
// absent from this rendering; presumably the choice depends on the
// signedness of the condition code - confirm.
8827 }
8828 default:
8829 llvm_unreachable("unexpected opcode!");
8830 }
8831}
8832
8833SDValue
8834SITargetLowering::promoteUniformUnaryOpToI32(SDValue Op,
8835 DAGCombinerInfo &DCI) const {
8836 EVT OpTy = Op.getValueType();
8837 SelectionDAG &DAG = DCI.DAG;
8838 EVT ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8839
8840 if (isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8841 return SDValue();
8842
8843 SDLoc DL(Op);
8844 SDValue Input = Op.getOperand(0);
8845 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8846 Input = DAG.getNode(ExtOp, DL, ExtTy, Input);
8847
8848 SDValue NewVal = DAG.getNode(Op.getOpcode(), DL, ExtTy, Input);
8849
8850 return DAG.getNode(ISD::TRUNCATE, DL, OpTy, NewVal);
8851}
8852
8853SDValue SITargetLowering::promoteUniformOpToI32(SDValue Op,
8854 DAGCombinerInfo &DCI) const {
8855 const unsigned Opc = Op.getOpcode();
8856 assert(Opc == ISD::ADD || Opc == ISD::SUB || Opc == ISD::SHL ||
8857 Opc == ISD::SRL || Opc == ISD::SRA || Opc == ISD::AND ||
8858 Opc == ISD::OR || Opc == ISD::XOR || Opc == ISD::MUL ||
8859 Opc == ISD::SETCC || Opc == ISD::SELECT || Opc == ISD::SMIN ||
8860 Opc == ISD::SMAX || Opc == ISD::UMIN || Opc == ISD::UMAX ||
8861 Opc == ISD::USUBSAT);
8862
8863 EVT OpTy = (Opc != ISD::SETCC) ? Op.getValueType()
8864 : Op->getOperand(0).getValueType();
8865 auto &DAG = DCI.DAG;
8866 auto ExtTy = OpTy.changeElementType(*DAG.getContext(), MVT::i32);
8867
8868 if (DCI.isBeforeLegalizeOps() ||
8869 isNarrowingProfitable(Op.getNode(), ExtTy, OpTy))
8870 return SDValue();
8871
8872 SDLoc DL(Op);
8873 SDValue LHS;
8874 SDValue RHS;
8875 if (Opc == ISD::SELECT) {
8876 LHS = Op->getOperand(1);
8877 RHS = Op->getOperand(2);
8878 } else {
8879 LHS = Op->getOperand(0);
8880 RHS = Op->getOperand(1);
8881 }
8882
8883 const unsigned ExtOp = getExtOpcodeForPromotedOp(Op);
8884 LHS = DAG.getNode(ExtOp, DL, ExtTy, {LHS});
8885
8886 // Special case: for shifts, the RHS always needs a zext.
8887 if (Opc == ISD::SHL || Opc == ISD::SRL || Opc == ISD::SRA)
8888 RHS = DAG.getNode(ISD::ZERO_EXTEND, DL, ExtTy, {RHS});
8889 else
8890 RHS = DAG.getNode(ExtOp, DL, ExtTy, {RHS});
8891
8892 // setcc always return i1/i1 vec so no need to truncate after.
8893 if (Opc == ISD::SETCC) {
8894 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
8895 return DAG.getSetCC(DL, Op.getValueType(), LHS, RHS, CC);
8896 }
8897
8898 // For other ops, we extend the operation's return type as well so we need to
8899 // truncate back to the original type.
8900 SDValue NewVal;
8901 if (Opc == ISD::SELECT)
8902 NewVal = DAG.getNode(ISD::SELECT, DL, ExtTy, {Op->getOperand(0), LHS, RHS});
8903 else
8904 NewVal = DAG.getNode(Opc, DL, ExtTy, {LHS, RHS});
8905
8906 return DAG.getZExtOrTrunc(NewVal, DL, OpTy);
8907}
8908
8909SDValue SITargetLowering::lowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
8910 SDValue Mag = Op.getOperand(0);
8911 EVT MagVT = Mag.getValueType();
8912
8913 if (MagVT.getVectorNumElements() > 2)
8914 return splitBinaryVectorOp(Op, DAG);
8915
8916 SDValue Sign = Op.getOperand(1);
8917 EVT SignVT = Sign.getValueType();
8918
8919 if (MagVT == SignVT)
8920 return Op;
8921
8922 // fcopysign v2f16:mag, v2f32:sign ->
8923 // fcopysign v2f16:mag,
8924 // bitcast (trunc (srl (bitcast sign to v2i32), 16) to v2i16)
8925
8926 SDLoc SL(Op);
8927 SDValue SignAsInt32 = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Sign);
8928 SDValue ShiftAmt = DAG.getShiftAmountConstant(16, MVT::v2i32, SL);
8929 SDValue SignShifted =
8930 DAG.getNode(ISD::SRL, SL, MVT::v2i32, SignAsInt32, ShiftAmt);
8931 SDValue SignAsInt16 = DAG.getNode(ISD::TRUNCATE, SL, MVT::v2i16, SignShifted);
8932
8933 SDValue SignAsHalf16 = DAG.getNode(ISD::BITCAST, SL, MagVT, SignAsInt16);
8934
8935 return DAG.getNode(ISD::FCOPYSIGN, SL, MagVT, Mag, SignAsHalf16);
8936}
8937
8938// Custom lowering for vector multiplications and s_mul_u64.
8939SDValue SITargetLowering::lowerMUL(SDValue Op, SelectionDAG &DAG) const {
8940 EVT VT = Op.getValueType();
8941
8942 // Split vector operands.
8943 if (VT.isVector())
8944 return splitBinaryVectorOp(Op, DAG);
8945
8946 assert(VT == MVT::i64 && "The following code is a special for s_mul_u64");
8947
8948 // There are four ways to lower s_mul_u64:
8949 //
8950 // 1. If all the operands are uniform, then we lower it as it is.
8951 //
8952 // 2. If the operands are divergent, then we have to split s_mul_u64 in 32-bit
8953 // multiplications because there is not a vector equivalent of s_mul_u64.
8954 //
8955 // 3. If the cost model decides that it is more efficient to use vector
8956 // registers, then we have to split s_mul_u64 in 32-bit multiplications.
8957 // This happens in splitScalarSMULU64() in SIInstrInfo.cpp .
8958 //
8959 // 4. If the cost model decides to use vector registers and both of the
8960 // operands are zero-extended/sign-extended from 32-bits, then we split the
8961 // s_mul_u64 in two 32-bit multiplications. The problem is that it is not
8962 // possible to check if the operands are zero-extended or sign-extended in
8963 // SIInstrInfo.cpp. For this reason, here, we replace s_mul_u64 with
8964 // s_mul_u64_u32_pseudo if both operands are zero-extended and we replace
8965 // s_mul_u64 with s_mul_i64_i32_pseudo if both operands are sign-extended.
8966 // If the cost model decides that we have to use vector registers, then
8967 // splitScalarSMulPseudo() (in SIInstrInfo.cpp) split s_mul_u64_u32/
8968 // s_mul_i64_i32_pseudo in two vector multiplications. If the cost model
8969 // decides that we should use scalar registers, then s_mul_u64_u32_pseudo/
8970 // s_mul_i64_i32_pseudo is lowered as s_mul_u64 in expandPostRAPseudo() in
8971 // SIInstrInfo.cpp .
8972
8973 if (Op->isDivergent())
8974 return SDValue();
8975
8976 SDValue Op0 = Op.getOperand(0);
8977 SDValue Op1 = Op.getOperand(1);
8978 // If all the operands are zero-enteted to 32-bits, then we replace s_mul_u64
8979 // with s_mul_u64_u32_pseudo. If all the operands are sign-extended to
8980 // 32-bits, then we replace s_mul_u64 with s_mul_i64_i32_pseudo.
8981 KnownBits Op0KnownBits = DAG.computeKnownBits(Op0);
8982 unsigned Op0LeadingZeros = Op0KnownBits.countMinLeadingZeros();
8983 KnownBits Op1KnownBits = DAG.computeKnownBits(Op1);
8984 unsigned Op1LeadingZeros = Op1KnownBits.countMinLeadingZeros();
8985 SDLoc SL(Op);
8986 if (Op0LeadingZeros >= 32 && Op1LeadingZeros >= 32)
8987 return SDValue(
8988 DAG.getMachineNode(AMDGPU::S_MUL_U64_U32_PSEUDO, SL, VT, Op0, Op1), 0);
8989 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op0);
8990 unsigned Op1SignBits = DAG.ComputeNumSignBits(Op1);
8991 if (Op0SignBits >= 33 && Op1SignBits >= 33)
8992 return SDValue(
8993 DAG.getMachineNode(AMDGPU::S_MUL_I64_I32_PSEUDO, SL, VT, Op0, Op1), 0);
8994 // If all the operands are uniform, then we lower s_mul_u64 as it is.
8995 return Op;
8996}
8997
8998SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
8999 EVT VT = Op.getValueType();
9000 SDLoc SL(Op);
9001 SDValue LHS = Op.getOperand(0);
9002 SDValue RHS = Op.getOperand(1);
9003 bool isSigned = Op.getOpcode() == ISD::SMULO;
9004
9005 if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
9006 const APInt &C = RHSC->getAPIntValue();
9007 // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
9008 if (C.isPowerOf2()) {
9009 // smulo(x, signed_min) is same as umulo(x, signed_min).
9010 bool UseArithShift = isSigned && !C.isMinSignedValue();
9011 SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
9012 SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
9013 SDValue Overflow =
9014 DAG.getSetCC(SL, MVT::i1,
9015 DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL, SL, VT,
9016 Result, ShiftAmt),
9017 LHS, ISD::SETNE);
9018 return DAG.getMergeValues({Result, Overflow}, SL);
9019 }
9020 }
9021
9022 SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
9023 SDValue Top =
9024 DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU, SL, VT, LHS, RHS);
9025
9026 SDValue Sign = isSigned
9027 ? DAG.getNode(ISD::SRA, SL, VT, Result,
9028 DAG.getConstant(VT.getScalarSizeInBits() - 1,
9029 SL, MVT::i32))
9030 : DAG.getConstant(0, SL, VT);
9031 SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
9032
9033 return DAG.getMergeValues({Result, Overflow}, SL);
9034}
9035
9036SDValue SITargetLowering::lowerXMUL_LOHI(SDValue Op, SelectionDAG &DAG) const {
9037 if (Op->isDivergent()) {
9038 // Select to V_MAD_[IU]64_[IU]32.
9039 return Op;
9040 }
9041 if (Subtarget->hasSMulHi()) {
9042 // Expand to S_MUL_I32 + S_MUL_HI_[IU]32.
9043 return SDValue();
9044 }
9045 // The multiply is uniform but we would have to use V_MUL_HI_[IU]32 to
9046 // calculate the high part, so we might as well do the whole thing with
9047 // V_MAD_[IU]64_[IU]32.
9048 return Op;
9049}
9050
9051SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
9052 if (!Subtarget->hasTrapHandler() ||
9053 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA)
9054 return lowerTrapEndpgm(Op, DAG);
9055
9056 return Subtarget->supportsGetDoorbellID() ? lowerTrapHsa(Op, DAG)
9057 : lowerTrapHsaQueuePtr(Op, DAG);
9058}
9059
9060SDValue SITargetLowering::lowerTrapEndpgm(SDValue Op, SelectionDAG &DAG) const {
9061 SDLoc SL(Op);
9062 SDValue Chain = Op.getOperand(0);
9063 return DAG.getNode(AMDGPUISD::ENDPGM_TRAP, SL, MVT::Other, Chain);
9064}
9065
/// Build a load of type \p VT, with alignment \p Alignment, from the kernel
/// argument area at the offset that getImplicitParameterOffset reports for
/// the implicit parameter \p Param. The load hangs off the entry node.
9066SDValue
9067SITargetLowering::loadImplicitKernelArgument(SelectionDAG &DAG, MVT VT,
9068 const SDLoc &DL, Align Alignment,
9069 ImplicitParameter Param) const {
9070 MachineFunction &MF = DAG.getMachineFunction();
9071 uint64_t Offset = getImplicitParameterOffset(MF, Param);
9072 SDValue Ptr = lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), Offset);
// NOTE(review): source line 9074 (the initializer of PtrInfo) is absent from
// this rendering.
9073 MachinePointerInfo PtrInfo =
9075 return DAG.getLoad(
9076 VT, DL, DAG.getEntryNode(), Ptr, PtrInfo.getWithOffset(Offset), Alignment,
// NOTE(review): source line 9077 (the remaining getLoad arguments,
// presumably the memory operand flags) is absent from this rendering.
9078}
9079
/// Lower a trap to an AMDGPUISD::TRAP node carrying the LLVMAMDHSATrap ID,
/// after copying the queue pointer into SGPR0_SGPR1.
9080SDValue SITargetLowering::lowerTrapHsaQueuePtr(SDValue Op,
9081 SelectionDAG &DAG) const {
9082 SDLoc SL(Op);
9083 SDValue Chain = Op.getOperand(0);
9084
9085 SDValue QueuePtr;
9086 // For code object version 5, QueuePtr is passed through implicit kernarg.
9087 const Module *M = DAG.getMachineFunction().getFunction().getParent();
// NOTE(review): source line 9088 (the `if` condition selecting the implicit
// kernarg path, presumably a code-object-version check on M) is absent from
// this rendering.
9089 QueuePtr =
9090 loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR);
9091 } else {
// Otherwise the queue pointer comes from its user SGPR, if one was assigned.
9092 MachineFunction &MF = DAG.getMachineFunction();
9093 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9094 Register UserSGPR = Info->getQueuePtrUserSGPR();
9095
9096 if (UserSGPR == AMDGPU::NoRegister) {
9097 // We probably are in a function incorrectly marked with
9098 // amdgpu-no-queue-ptr. This is undefined. We don't want to delete the
9099 // trap, so just use a null pointer.
9100 QueuePtr = DAG.getConstant(0, SL, MVT::i64);
9101 } else {
9102 QueuePtr = CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR,
9103 MVT::i64);
9104 }
9105 }
9106
// Copy the queue pointer into SGPR0_SGPR1; the empty SDValue() requests a
// glue result, which is passed on to the trap node below.
9107 SDValue SGPR01 = DAG.getRegister(AMDGPU::SGPR0_SGPR1, MVT::i64);
9108 SDValue ToReg = DAG.getCopyToReg(Chain, SL, SGPR01, QueuePtr, SDValue());
9109
// Trap operands: chain (the copy), trap ID, the register holding the queue
// pointer, and the glue from the copy.
9110 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
9111 SDValue Ops[] = {ToReg, DAG.getTargetConstant(TrapID, SL, MVT::i16), SGPR01,
9112 ToReg.getValue(1)};
9113 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
9114}
9115
9116SDValue SITargetLowering::lowerTrapHsa(SDValue Op, SelectionDAG &DAG) const {
9117 SDLoc SL(Op);
9118 SDValue Chain = Op.getOperand(0);
9119
9120 // We need to simulate the 's_trap 2' instruction on targets that run in
9121 // PRIV=1 (where it is treated as a nop).
9122 if (Subtarget->hasPrivEnabledTrap2NopBug())
9123 return DAG.getNode(AMDGPUISD::SIMULATED_TRAP, SL, MVT::Other, Chain);
9124
9125 uint64_t TrapID = static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSATrap);
9126 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
9127 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
9128}
9129
9130SDValue SITargetLowering::lowerDEBUGTRAP(SDValue Op, SelectionDAG &DAG) const {
9131 SDLoc SL(Op);
9132 SDValue Chain = Op.getOperand(0);
9133 MachineFunction &MF = DAG.getMachineFunction();
9134
9135 if (!Subtarget->hasTrapHandler() ||
9136 Subtarget->getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbi::AMDHSA) {
9137 LLVMContext &Ctx = MF.getFunction().getContext();
9138 Ctx.diagnose(DiagnosticInfoUnsupported(MF.getFunction(),
9139 "debugtrap handler not supported",
9140 Op.getDebugLoc(), DS_Warning));
9141 return Chain;
9142 }
9143
9144 uint64_t TrapID =
9145 static_cast<uint64_t>(GCNSubtarget::TrapID::LLVMAMDHSADebugTrap);
9146 SDValue Ops[] = {Chain, DAG.getTargetConstant(TrapID, SL, MVT::i16)};
9147 return DAG.getNode(AMDGPUISD::TRAP, SL, MVT::Other, Ops);
9148}
9149
9150/// When a divergent value (in VGPR) is passed to an inline asm with an SGPR
9151/// constraint ('s'), we need to insert v_readfirstlane to move the value from
9152/// VGPR to SGPR. This is done by modifying the CopyToReg nodes in the glue
9153/// chain that feed into the INLINEASM node.
///
/// The INLINEASM node itself is always returned unchanged; only the
/// CopyToReg nodes reachable through its last operand are updated in place.
9154SDValue SITargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const {
9155 unsigned NumOps = Op.getNumOperands();
9156
9157 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
9158 SmallSet<Register, 8> SGPRInputRegs;
9159
// Pass 1: collect the registers used as SGPR inputs. Operands come in
// groups of one flag word followed by NumVals register operands, hence the
// stride of 1 + NumVals. The last operand is excluded from the scan (it is
// the one followed as glue in pass 2).
9160 unsigned NumVals = 0;
9161 for (unsigned I = InlineAsm::Op_FirstOperand; I < NumOps - 1;
9162 I += 1 + NumVals) {
9163 const InlineAsm::Flag Flags(Op.getConstantOperandVal(I));
9164 NumVals = Flags.getNumOperandRegisters();
9165
// A group is an SGPR input when it is a register use with a register class
// constraint naming an SGPR class.
9166 unsigned RCID;
9167 bool IsSGPRInput = Flags.getKind() == InlineAsm::Kind::RegUse &&
9168 NumVals > 0 && Flags.hasRegClassConstraint(RCID) &&
9169 TRI->isSGPRClass(TRI->getRegClass(RCID));
9170
9171 for (unsigned J = 0; J < NumVals; ++J) {
9172 SDValue Val = Op.getOperand(I + 1 + J);
// NOTE(review): source line 9174 (the initializer of RegNode, presumably a
// dyn_cast of Val to RegisterSDNode) is absent from this rendering.
9173 if (const RegisterSDNode *RegNode =
9175 Register Reg = RegNode->getReg();
// Physical SGPRs count as SGPR inputs regardless of the group's flags.
9176 if (IsSGPRInput || (Reg.isPhysical() && TRI->isSGPRPhysReg(Reg)))
9177 SGPRInputRegs.insert(Reg);
9178 }
9179 }
9180 }
9181
9182 if (SGPRInputRegs.empty())
9183 return Op;
9184
9185 // Walk the glue chain and insert readfirstlane for divergent SGPR inputs.
9186 SDLoc DL(Op);
9187 SDNode *N = Op.getOperand(NumOps - 1).getNode();
9188
// Pass 2: the walk stops at the first node that is not a CopyToReg.
9189 while (N && N->getOpcode() == ISD::CopyToReg) {
9190 Register Reg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
9191 SDValue SrcVal = N->getOperand(2);
9192
9193 // Insert readfirstlane if copying a divergent value to an SGPR input.
9194 if (SrcVal->isDivergent() && SGPRInputRegs.count(Reg)) {
9195 SDValue ReadFirstLaneID =
9196 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
// NOTE(review): source line 9198 (the start of the getNode call building
// the readfirstlane intrinsic node) is absent from this rendering.
9197 SDValue ReadFirstLane =
9199 ReadFirstLaneID, SrcVal);
9200
// Rebuild the operand list with the source replaced, keeping the incoming
// glue when the copy has one.
9201 SmallVector<SDValue, 4> Ops = {N->getOperand(0), N->getOperand(1),
9202 ReadFirstLane};
9203 if (N->getNumOperands() > 3)
9204 Ops.push_back(N->getOperand(3)); // Glue input
9205
9206 DAG.UpdateNodeOperands(N, Ops);
9207 }
9208
9209 // Follow glue chain to next CopyToReg.
// The first glue-typed operand, if any, names the next node; without one
// the walk ends.
9210 SDNode *Next = nullptr;
9211 for (unsigned I = 0, E = N->getNumOperands(); I != E; ++I) {
9212 if (N->getOperand(I).getValueType() == MVT::Glue) {
9213 Next = N->getOperand(I).getNode();
9214 break;
9215 }
9216 }
9217 N = Next;
9218 }
9219
9220 return Op;
9221}
9222
/// Produce an i32 value holding the aperture of address space \p AS.
/// LOCAL_ADDRESS selects the shared/group variant; any other value of \p AS
/// selects the private variant.
///
/// Sources, in order of preference: the aperture registers, an implicit
/// kernel argument, or a load from the queue descriptor reached through the
/// queue pointer. Poison is returned when the latter is needed but no
/// queue-pointer user SGPR is available.
9223SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
9224 SelectionDAG &DAG) const {
9225 if (Subtarget->hasApertureRegs()) {
9226 const unsigned ApertureRegNo = (AS == AMDGPUAS::LOCAL_ADDRESS)
9227 ? AMDGPU::SRC_SHARED_BASE
9228 : AMDGPU::SRC_PRIVATE_BASE;
9229 assert((ApertureRegNo != AMDGPU::SRC_PRIVATE_BASE ||
9230 !Subtarget->hasGloballyAddressableScratch()) &&
9231 "Cannot use src_private_base with globally addressable scratch!");
9232 // Note: this feature (register) is broken. When used as a 32-bit operand,
9233 // it returns a wrong value (all zeroes?). The real value is in the upper 32
9234 // bits.
9235 //
9236 // To work around the issue, emit a 64 bit copy from this register
9237 // then extract the high bits. Note that this shouldn't even result in a
9238 // shift being emitted and simply become a pair of registers (e.g.):
9239 // s_mov_b64 s[6:7], src_shared_base
9240 // v_mov_b32_e32 v1, s7
9241 SDValue Copy =
9242 DAG.getCopyFromReg(DAG.getEntryNode(), DL, ApertureRegNo, MVT::v2i32);
9243 return DAG.getExtractVectorElt(DL, MVT::i32, Copy, 1);
9244 }
9245
9246 // For code object version 5, private_base and shared_base are passed through
9247 // implicit kernargs.
9248 const Module *M = DAG.getMachineFunction().getFunction().getParent();
// NOTE(review): source lines 9249-9251 (the `if` opening the scope closed on
// line 9253 and the declaration of `Param`) are absent from this rendering.
9252 return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param);
9253 }
9254
9255 MachineFunction &MF = DAG.getMachineFunction();
9256 SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
9257 Register UserSGPR = Info->getQueuePtrUserSGPR();
9258 if (UserSGPR == AMDGPU::NoRegister) {
9259 // We probably are in a function incorrectly marked with
9260 // amdgpu-no-queue-ptr. This is undefined.
9261 return DAG.getPOISON(MVT::i32);
9262 }
9263
9264 SDValue QueuePtr =
9265 CreateLiveInRegister(DAG, &AMDGPU::SReg_64RegClass, UserSGPR, MVT::i64);
9266
9267 // Offset into amd_queue_t for group_segment_aperture_base_hi /
9268 // private_segment_aperture_base_hi.
9269 uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
9270
9271 SDValue Ptr =
9272 DAG.getObjectPtrOffset(DL, QueuePtr, TypeSize::getFixed(StructOffset));
9273
9274 // TODO: Use custom target PseudoSourceValue.
9275 // TODO: We should use the value from the IR intrinsic call, but it might not
9276 // be available and how do we get it?
// The load is described as a constant-address-space access, chained on value
// 1 of QueuePtr, with the alignment implied by a 64-byte aligned base plus
// StructOffset.
9277 MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
9278 return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
9279 commonAlignment(Align(64), StructOffset),
// NOTE(review): source lines 9280-9281 (the remaining getLoad arguments,
// presumably the memory operand flags) are absent from this rendering.
9282}
9283
9284/// Return true if the value is a known valid address, such that a null check is
9285/// not necessary.
// NOTE(review): source line 9286 (the first line of the signature) is absent
// from this rendering; the callers in this file invoke it as
// isKnownNonNull(Op, DAG, TM, SrcAS), so the leading parameters are
// presumably the value `Val` and the SelectionDAG - confirm.
9287 const AMDGPUTargetMachine &TM, unsigned AddrSpace) {
// NOTE(review): source line 9288 (the condition guarding this early return)
// is absent from this rendering.
9289 return true;
9290
// A constant is known non-null when it differs from the null pointer value
// of AddrSpace.
9291 if (auto *ConstVal = dyn_cast<ConstantSDNode>(Val))
9292 return ConstVal->getSExtValue() != AMDGPU::getNullPointerValue(AddrSpace);
9293
9294 // TODO: Search through arithmetic, handle arguments and loads
9295 // marked nonnull.
9296 return false;
9297}
9298
9299SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
9300 SelectionDAG &DAG) const {
9301 SDLoc SL(Op);
9302
9303 const AMDGPUTargetMachine &TM =
9304 static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
9305
9306 unsigned DestAS, SrcAS;
9307 SDValue Src;
9308 bool IsNonNull = false;
9309 if (const auto *ASC = dyn_cast<AddrSpaceCastSDNode>(Op)) {
9310 SrcAS = ASC->getSrcAddressSpace();
9311 Src = ASC->getOperand(0);
9312 DestAS = ASC->getDestAddressSpace();
9313 } else {
9314 assert(Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN &&
9315 Op.getConstantOperandVal(0) ==
9316 Intrinsic::amdgcn_addrspacecast_nonnull);
9317 Src = Op->getOperand(1);
9318 SrcAS = Op->getConstantOperandVal(2);
9319 DestAS = Op->getConstantOperandVal(3);
9320 IsNonNull = true;
9321 }
9322
9323 SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
9324
9325 // flat -> local/private
9326 if (SrcAS == AMDGPUAS::FLAT_ADDRESS) {
9327 if (DestAS == AMDGPUAS::LOCAL_ADDRESS ||
9328 DestAS == AMDGPUAS::PRIVATE_ADDRESS) {
9329 SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
9330
9331 if (DestAS == AMDGPUAS::PRIVATE_ADDRESS &&
9332 Subtarget->hasGloballyAddressableScratch()) {
9333 // flat -> private with globally addressable scratch: subtract
9334 // src_flat_scratch_base_lo.
9335 SDValue FlatScratchBaseLo(
9336 DAG.getMachineNode(
9337 AMDGPU::S_MOV_B32, SL, MVT::i32,
9338 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_LO, MVT::i32)),
9339 0);
9340 Ptr = DAG.getNode(ISD::SUB, SL, MVT::i32, Ptr, FlatScratchBaseLo);
9341 }
9342
9343 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
9344 return Ptr;
9345
9346 unsigned NullVal = AMDGPU::getNullPointerValue(DestAS);
9347 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
9348 SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
9349
9350 return DAG.getNode(ISD::SELECT, SL, MVT::i32, NonNull, Ptr,
9351 SegmentNullPtr);
9352 }
9353 }
9354
9355 // local/private -> flat
9356 if (DestAS == AMDGPUAS::FLAT_ADDRESS) {
9357 if (SrcAS == AMDGPUAS::LOCAL_ADDRESS ||
9358 SrcAS == AMDGPUAS::PRIVATE_ADDRESS) {
9359 SDValue CvtPtr;
9360 if (SrcAS == AMDGPUAS::PRIVATE_ADDRESS &&
9361 Subtarget->hasGloballyAddressableScratch()) {
9362 // For wave32: Addr = (TID[4:0] << 52) + FLAT_SCRATCH_BASE + privateAddr
9363 // For wave64: Addr = (TID[5:0] << 51) + FLAT_SCRATCH_BASE + privateAddr
9364 SDValue AllOnes = DAG.getSignedTargetConstant(-1, SL, MVT::i32);
9365 SDValue ThreadID = DAG.getConstant(0, SL, MVT::i32);
9366 ThreadID = DAG.getNode(
9367 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
9368 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_lo, SL, MVT::i32),
9369 AllOnes, ThreadID);
9370 if (Subtarget->isWave64())
9371 ThreadID = DAG.getNode(
9372 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
9373 DAG.getTargetConstant(Intrinsic::amdgcn_mbcnt_hi, SL, MVT::i32),
9374 AllOnes, ThreadID);
9375 SDValue ShAmt = DAG.getShiftAmountConstant(
9376 57 - 32 - Subtarget->getWavefrontSizeLog2(), MVT::i32, SL);
9377 SDValue SrcHi = DAG.getNode(ISD::SHL, SL, MVT::i32, ThreadID, ShAmt);
9378 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, SrcHi);
9379 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
9380 // Accessing src_flat_scratch_base_lo as a 64-bit operand gives the full
9381 // 64-bit hi:lo value.
9382 SDValue FlatScratchBase = {
9383 DAG.getMachineNode(
9384 AMDGPU::S_MOV_B64, SL, MVT::i64,
9385 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE, MVT::i64)),
9386 0};
9387 CvtPtr = DAG.getNode(ISD::ADD, SL, MVT::i64, CvtPtr, FlatScratchBase);
9388 } else {
9389 SDValue Aperture = getSegmentAperture(SrcAS, SL, DAG);
9390 CvtPtr = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
9391 CvtPtr = DAG.getNode(ISD::BITCAST, SL, MVT::i64, CvtPtr);
9392 }
9393
9394 if (IsNonNull || isKnownNonNull(Op, DAG, TM, SrcAS))
9395 return CvtPtr;
9396
9397 unsigned NullVal = AMDGPU::getNullPointerValue(SrcAS);
9398 SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
9399
9400 SDValue NonNull =
9401 DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
9402
9403 return DAG.getNode(ISD::SELECT, SL, MVT::i64, NonNull, CvtPtr,
9404 FlatNullPtr);
9405 }
9406 }
9407
9408 if (SrcAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9409 Op.getValueType() == MVT::i64) {
9410 const SIMachineFunctionInfo *Info =
9411 DAG.getMachineFunction().getInfo<SIMachineFunctionInfo>();
9412 if (Info->get32BitAddressHighBits() == 0)
9413 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, Src);
9414
9415 SDValue Hi = DAG.getConstant(Info->get32BitAddressHighBits(), SL, MVT::i32);
9416 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Hi);
9417 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
9418 }
9419
9420 if (DestAS == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
9421 Src.getValueType() == MVT::i64)
9422 return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
9423
9424 // global <-> flat are no-ops and never emitted.
9425
9426 // Invalid casts are poison.
9427 return DAG.getPOISON(Op->getValueType(0));
9428}
9429
9430// This lowers an INSERT_SUBVECTOR by extracting the individual elements from
9431// the small vector and inserting them into the big vector. That is better than
9432// the default expansion of doing it via a stack slot. Even though the use of
9433// the stack slot would be optimized away afterwards, the stack slot itself
9434// remains.
//
// Operand 0 of Op is the vector being inserted into, operand 1 the subvector
// to insert and operand 2 the insertion index, which is read as a constant.
// The returned value has the type of operand 0.
9435SDValue SITargetLowering::lowerINSERT_SUBVECTOR(SDValue Op,
9436 SelectionDAG &DAG) const {
9437 SDValue Vec = Op.getOperand(0);
9438 SDValue Ins = Op.getOperand(1);
9439 SDValue Idx = Op.getOperand(2);
9440 EVT VecVT = Vec.getValueType();
9441 EVT InsVT = Ins.getValueType();
9442 EVT EltVT = VecVT.getVectorElementType();
9443 unsigned InsNumElts = InsVT.getVectorNumElements();
9444 unsigned IdxVal = Idx->getAsZExtVal();
9445 SDLoc SL(Op);
9446
// Fast path: 16-bit elements inserted at an even element index are moved as
// whole 32-bit values, so the loop below runs InsNumElts / 2 times instead of
// InsNumElts times.
9447 if (EltVT.getScalarSizeInBits() == 16 && IdxVal % 2 == 0) {
9448 // Insert 32-bit registers at a time.
9449 assert(InsNumElts % 2 == 0 && "expect legal vector types");
9450
9451 unsigned VecNumElts = VecVT.getVectorNumElements();
9452 EVT NewVecVT =
9453 EVT::getVectorVT(*DAG.getContext(), MVT::i32, VecNumElts / 2);
// NOTE(review): listing line 9455 is missing between the next two lines, so
// the ':' arm of this conditional is only partly visible here. Judging by the
// uses below, NewInsVT is i32 for a 2-element insert and otherwise an i32
// vector with InsNumElts / 2 elements - confirm against the real source.
9454 EVT NewInsVT = InsNumElts == 2 ? MVT::i32
9456 MVT::i32, InsNumElts / 2);
9457
// Reinterpret both vectors as (vectors of) i32.
9458 Vec = DAG.getNode(ISD::BITCAST, SL, NewVecVT, Vec);
9459 Ins = DAG.getNode(ISD::BITCAST, SL, NewInsVT, Ins);
9460
9461 for (unsigned I = 0; I != InsNumElts / 2; ++I) {
9462 SDValue Elt;
// With exactly two 16-bit elements, Ins is already a single i32 scalar.
9463 if (InsNumElts == 2) {
9464 Elt = Ins;
9465 } else {
9466 Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Ins,
9467 DAG.getConstant(I, SL, MVT::i32));
9468 }
// IdxVal counts 16-bit elements; IdxVal / 2 is the matching i32 index.
9469 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NewVecVT, Vec, Elt,
9470 DAG.getConstant(IdxVal / 2 + I, SL, MVT::i32));
9471 }
9472
9473 return DAG.getNode(ISD::BITCAST, SL, VecVT, Vec);
9474 }
9475
// General path: copy the subvector one element at a time.
9476 for (unsigned I = 0; I != InsNumElts; ++I) {
9477 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Ins,
9478 DAG.getConstant(I, SL, MVT::i32));
9479 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, VecVT, Vec, Elt,
9480 DAG.getConstant(IdxVal + I, SL, MVT::i32));
9481 }
9482 return Vec;
9483}
9484
9485SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
9486 SelectionDAG &DAG) const {
9487 SDValue Vec = Op.getOperand(0);
9488 SDValue InsVal = Op.getOperand(1);
9489 SDValue Idx = Op.getOperand(2);
9490 EVT VecVT = Vec.getValueType();
9491 EVT EltVT = VecVT.getVectorElementType();
9492 unsigned VecSize = VecVT.getSizeInBits();
9493 unsigned EltSize = EltVT.getSizeInBits();
9494 SDLoc SL(Op);
9495
9496 // Specially handle the case of v4i16 with static indexing.
9497 unsigned NumElts = VecVT.getVectorNumElements();
9498 auto *KIdx = dyn_cast<ConstantSDNode>(Idx);
9499 if (NumElts == 4 && EltSize == 16 && KIdx) {
9500 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Vec);
9501
9502 SDValue LoHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
9503 DAG.getConstant(0, SL, MVT::i32));
9504 SDValue HiHalf = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BCVec,
9505 DAG.getConstant(1, SL, MVT::i32));
9506
9507 SDValue LoVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, LoHalf);
9508 SDValue HiVec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i16, HiHalf);
9509
9510 unsigned Idx = KIdx->getZExtValue();
9511 bool InsertLo = Idx < 2;
9512 SDValue InsHalf = DAG.getNode(
9513 ISD::INSERT_VECTOR_ELT, SL, MVT::v2i16, InsertLo ? LoVec : HiVec,
9514 DAG.getNode(ISD::BITCAST, SL, MVT::i16, InsVal),
9515 DAG.getConstant(InsertLo ? Idx : (Idx - 2), SL, MVT::i32));
9516
9517 InsHalf = DAG.getNode(ISD::BITCAST, SL, MVT::i32, InsHalf);
9518
9519 SDValue Concat =
9520 InsertLo ? DAG.getBuildVector(MVT::v2i32, SL, {InsHalf, HiHalf})
9521 : DAG.getBuildVector(MVT::v2i32, SL, {LoHalf, InsHalf});
9522
9523 return DAG.getNode(ISD::BITCAST, SL, VecVT, Concat);
9524 }
9525
9526 // Static indexing does not lower to stack access, and hence there is no need
9527 // for special custom lowering to avoid stack access.
9528 if (isa<ConstantSDNode>(Idx))
9529 return SDValue();
9530
9531 // Avoid stack access for dynamic indexing by custom lowering to
9532 // v_bfi_b32 (v_bfm_b32 16, (shl idx, 16)), val, vec
9533
9534 assert(VecSize <= 64 && "Expected target vector size to be <= 64 bits");
9535
9536 MVT IntVT = MVT::getIntegerVT(VecSize);
9537
9538 // Convert vector index to bit-index and get the required bit mask.
9539 assert(isPowerOf2_32(EltSize));
9540 const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
9541 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
9542 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
9543 SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
9544 DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
9545
9546 // 1. Create a congruent vector with the target value in each element.
9547 SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,
9548 DAG.getSplatBuildVector(VecVT, SL, InsVal));
9549
9550 // 2. Mask off all other indices except the required index within (1).
9551 SDValue LHS = DAG.getNode(ISD::AND, SL, IntVT, BFM, ExtVal);
9552
9553 // 3. Mask off the required index within the target vector.
9554 SDValue BCVec = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
9555 SDValue RHS =
9556 DAG.getNode(ISD::AND, SL, IntVT, DAG.getNOT(SL, BFM, IntVT), BCVec);
9557
9558 // 4. Get (2) and (3) ORed into the target vector.
9559 SDValue BFI =
9560 DAG.getNode(ISD::OR, SL, IntVT, LHS, RHS, SDNodeFlags::Disjoint);
9561
9562 return DAG.getNode(ISD::BITCAST, SL, VecVT, BFI);
9563}
9564
9565SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
9566 SelectionDAG &DAG) const {
9567 SDLoc SL(Op);
9568
9569 EVT ResultVT = Op.getValueType();
9570 SDValue Vec = Op.getOperand(0);
9571 SDValue Idx = Op.getOperand(1);
9572 EVT VecVT = Vec.getValueType();
9573 unsigned VecSize = VecVT.getSizeInBits();
9574 EVT EltVT = VecVT.getVectorElementType();
9575
9576 DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
9577
9578 // Make sure we do any optimizations that will make it easier to fold
9579 // source modifiers before obscuring it with bit operations.
9580
9581 // XXX - Why doesn't this get called when vector_shuffle is expanded?
9582 if (SDValue Combined = performExtractVectorEltCombine(Op.getNode(), DCI))
9583 return Combined;
9584
9585 if (VecSize == 128 || VecSize == 256 || VecSize == 512) {
9586 SDValue Lo, Hi;
9587 auto [LoVT, HiVT] = DAG.GetSplitDestVTs(VecVT);
9588
9589 if (VecSize == 128) {
9590 SDValue V2 = DAG.getBitcast(MVT::v2i64, Vec);
9591 Lo = DAG.getBitcast(LoVT,
9592 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9593 DAG.getConstant(0, SL, MVT::i32)));
9594 Hi = DAG.getBitcast(HiVT,
9595 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9596 DAG.getConstant(1, SL, MVT::i32)));
9597 } else if (VecSize == 256) {
9598 SDValue V2 = DAG.getBitcast(MVT::v4i64, Vec);
9599 SDValue Parts[4];
9600 for (unsigned P = 0; P < 4; ++P) {
9601 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9602 DAG.getConstant(P, SL, MVT::i32));
9603 }
9604
9605 Lo = DAG.getBitcast(LoVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
9606 Parts[0], Parts[1]));
9607 Hi = DAG.getBitcast(HiVT, DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i64,
9608 Parts[2], Parts[3]));
9609 } else {
9610 assert(VecSize == 512);
9611
9612 SDValue V2 = DAG.getBitcast(MVT::v8i64, Vec);
9613 SDValue Parts[8];
9614 for (unsigned P = 0; P < 8; ++P) {
9615 Parts[P] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i64, V2,
9616 DAG.getConstant(P, SL, MVT::i32));
9617 }
9618
9619 Lo = DAG.getBitcast(LoVT,
9620 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
9621 Parts[0], Parts[1], Parts[2], Parts[3]));
9622 Hi = DAG.getBitcast(HiVT,
9623 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v4i64,
9624 Parts[4], Parts[5], Parts[6], Parts[7]));
9625 }
9626
9627 EVT IdxVT = Idx.getValueType();
9628 unsigned NElem = VecVT.getVectorNumElements();
9629 assert(isPowerOf2_32(NElem));
9630 SDValue IdxMask = DAG.getConstant(NElem / 2 - 1, SL, IdxVT);
9631 SDValue NewIdx = DAG.getNode(ISD::AND, SL, IdxVT, Idx, IdxMask);
9632 SDValue Half = DAG.getSelectCC(SL, Idx, IdxMask, Hi, Lo, ISD::SETUGT);
9633 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Half, NewIdx);
9634 }
9635
9636 assert(VecSize <= 64);
9637
9638 MVT IntVT = MVT::getIntegerVT(VecSize);
9639
9640 // If Vec is just a SCALAR_TO_VECTOR, then use the scalar integer directly.
9641 SDValue VecBC = peekThroughBitcasts(Vec);
9642 if (VecBC.getOpcode() == ISD::SCALAR_TO_VECTOR) {
9643 SDValue Src = VecBC.getOperand(0);
9644 Src = DAG.getBitcast(Src.getValueType().changeTypeToInteger(), Src);
9645 Vec = DAG.getAnyExtOrTrunc(Src, SL, IntVT);
9646 }
9647
9648 unsigned EltSize = EltVT.getSizeInBits();
9649 assert(isPowerOf2_32(EltSize));
9650
9651 SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
9652
9653 // Convert vector index to bit-index (* EltSize)
9654 SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
9655
9656 SDValue BC = DAG.getNode(ISD::BITCAST, SL, IntVT, Vec);
9657 SDValue Elt = DAG.getNode(ISD::SRL, SL, IntVT, BC, ScaledIdx);
9658
9659 if (ResultVT == MVT::f16 || ResultVT == MVT::bf16) {
9660 SDValue Result = DAG.getNode(ISD::TRUNCATE, SL, MVT::i16, Elt);
9661 return DAG.getNode(ISD::BITCAST, SL, ResultVT, Result);
9662 }
9663
9664 return DAG.getAnyExtOrTrunc(Elt, SL, ResultVT);
9665}
9666
9667static bool elementPairIsContiguous(ArrayRef<int> Mask, int Elt) {
9668 assert(Elt % 2 == 0);
9669 return Mask[Elt + 1] == Mask[Elt] + 1 && (Mask[Elt] % 2 == 0);
9670}
9671
9672static bool elementPairIsOddToEven(ArrayRef<int> Mask, int Elt) {
9673 assert(Elt % 2 == 0);
9674 return Mask[Elt] >= 0 && Mask[Elt + 1] >= 0 && (Mask[Elt] & 1) &&
9675 !(Mask[Elt + 1] & 1);
9676}
9677
// Custom lowering for VECTOR_SHUFFLE.
//
// The result is assembled two lanes at a time: each pair of result lanes
// becomes one 2-element piece (PackVT), and all pieces are joined with a
// single CONCAT_VECTORS. A piece is produced by the first matching strategy:
//   1. an EXTRACT_SUBVECTOR of one source operand,
//   2. a 2-element VECTOR_SHUFFLE of aligned subvector extracts,
//   3. two scalar EXTRACT_VECTOR_ELTs combined by a build vector.
// NOTE(review): listing lines 9722, 9725 and 9734 are missing below (the
// declaration of Pieces and the second half of the first two conditions), so
// the exact conditions for strategies 1 and 2 are not visible here.
9678SDValue SITargetLowering::lowerVECTOR_SHUFFLE(SDValue Op,
9679 SelectionDAG &DAG) const {
9680 SDLoc SL(Op);
9681 EVT ResultVT = Op.getValueType();
9682 ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(Op);
9683 MVT EltVT = ResultVT.getVectorElementType().getSimpleVT();
// Every piece has exactly two elements.
9684 const int NewSrcNumElts = 2;
9685 MVT PackVT = MVT::getVectorVT(EltVT, NewSrcNumElts);
// Mask values >= SrcNumElts refer to the second shuffle operand.
9686 int SrcNumElts = Op.getOperand(0).getValueType().getVectorNumElements();
9687
9688 // Break up the shuffle into registers sized pieces.
9689 //
9690 // We're trying to form sub-shuffles that the register allocation pipeline
9691 // won't be able to figure out, like how to use v_pk_mov_b32 to do a register
9692 // blend or 16-bit op_sel. It should be able to figure out how to reassemble a
9693 // pair of copies into a consecutive register copy, so use the ordinary
9694 // extract_vector_elt lowering unless we can use the shuffle.
9695 //
9696 // TODO: This is a bit of hack, and we should probably always use
9697 // extract_subvector for the largest possible subvector we can (or at least
9698 // use it for PackVT aligned pieces). However we have worse support for
9699 // combines on them don't directly treat extract_subvector / insert_subvector
9700 // as legal. The DAG scheduler also ends up doing a worse job with the
9701 // extract_subvectors.
9702 const bool ShouldUseConsecutiveExtract = EltVT.getSizeInBits() == 16;
9703
9704 // vector_shuffle <0,1,6,7> lhs, rhs
9705 // -> concat_vectors (extract_subvector lhs, 0), (extract_subvector rhs, 2)
9706 //
9707 // vector_shuffle <6,7,2,3> lhs, rhs
9708 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 2)
9709 //
9710 // vector_shuffle <6,7,0,1> lhs, rhs
9711 // -> concat_vectors (extract_subvector rhs, 2), (extract_subvector lhs, 0)
9712
9713 // Avoid scalarizing when both halves are reading from consecutive elements.
9714
9715 // If we're treating 2 element shuffles as legal, also create odd-to-even
9716 // shuffles of neighboring pairs.
9717 //
9718 // vector_shuffle <3,2,7,6> lhs, rhs
9719 // -> concat_vectors vector_shuffle <1, 0> (extract_subvector lhs, 0)
9720 // vector_shuffle <1, 0> (extract_subvector rhs, 2)
9721
// NOTE(review): listing line 9722 (presumably the declaration of Pieces) is
// missing here.
9723 for (int I = 0, N = ResultVT.getVectorNumElements(); I != N; I += 2) {
// Strategy 1: take both lanes with a single EXTRACT_SUBVECTOR.
// NOTE(review): listing line 9725, the rest of this condition, is missing.
9724 if (ShouldUseConsecutiveExtract &&
9726 const int Idx = SVN->getMaskElt(I);
// Translate the combined mask index into (operand number, element index).
9727 int VecIdx = Idx < SrcNumElts ? 0 : 1;
9728 int EltIdx = Idx < SrcNumElts ? Idx : Idx - SrcNumElts;
9729 SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT,
9730 SVN->getOperand(VecIdx),
9731 DAG.getConstant(EltIdx, SL, MVT::i32));
9732 Pieces.push_back(SubVec);
// Strategy 2: an odd-then-even pair becomes a 2-element shuffle of aligned
// subvectors.
// NOTE(review): listing line 9734, the rest of this condition, is missing.
9733 } else if (elementPairIsOddToEven(SVN->getMask(), I) &&
9735 int Idx0 = SVN->getMaskElt(I);
9736 int Idx1 = SVN->getMaskElt(I + 1);
9737
// Resolve each lane to its source operand and an operand-relative index.
9738 SDValue SrcOp0 = SVN->getOperand(0);
9739 SDValue SrcOp1 = SrcOp0;
9740 if (Idx0 >= SrcNumElts) {
9741 SrcOp0 = SVN->getOperand(1);
9742 Idx0 -= SrcNumElts;
9743 }
9744
9745 if (Idx1 >= SrcNumElts) {
9746 SrcOp1 = SVN->getOperand(1);
9747 Idx1 -= SrcNumElts;
9748 }
9749
// Round each index down to a multiple of NewSrcNumElts (clears bit 0).
9750 int AlignedIdx0 = Idx0 & ~(NewSrcNumElts - 1);
9751 int AlignedIdx1 = Idx1 & ~(NewSrcNumElts - 1);
9752
9753 // Extract nearest even aligned piece.
9754 SDValue SubVec0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp0,
9755 DAG.getConstant(AlignedIdx0, SL, MVT::i32));
9756 SDValue SubVec1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, PackVT, SrcOp1,
9757 DAG.getConstant(AlignedIdx1, SL, MVT::i32));
9758
// Lane positions inside the extracted 2-element subvectors.
9759 int NewMaskIdx0 = Idx0 - AlignedIdx0;
9760 int NewMaskIdx1 = Idx1 - AlignedIdx1;
9761
9762 SDValue Result0 = SubVec0;
9763 SDValue Result1 = SubVec0;
9764
// If the two lanes come from different subvectors, the second lane indexes
// into the second shuffle operand, so its mask index is offset by
// NewSrcNumElts. Otherwise the second operand is unused and set to poison.
9765 if (SubVec0 != SubVec1) {
9766 NewMaskIdx1 += NewSrcNumElts;
9767 Result1 = SubVec1;
9768 } else {
9769 Result1 = DAG.getPOISON(PackVT);
9770 }
9771
9772 SDValue Shuf = DAG.getVectorShuffle(PackVT, SL, Result0, Result1,
9773 {NewMaskIdx0, NewMaskIdx1});
9774 Pieces.push_back(Shuf);
9775 } else {
// Strategy 3: extract the two lanes individually and pair them up.
9776 const int Idx0 = SVN->getMaskElt(I);
9777 const int Idx1 = SVN->getMaskElt(I + 1);
9778 int VecIdx0 = Idx0 < SrcNumElts ? 0 : 1;
9779 int VecIdx1 = Idx1 < SrcNumElts ? 0 : 1;
9780 int EltIdx0 = Idx0 < SrcNumElts ? Idx0 : Idx0 - SrcNumElts;
9781 int EltIdx1 = Idx1 < SrcNumElts ? Idx1 : Idx1 - SrcNumElts;
9782
// NOTE(review): the signed constant is presumably used because a negative
// (undef) mask entry can reach this branch - confirm.
9783 SDValue Vec0 = SVN->getOperand(VecIdx0);
9784 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec0,
9785 DAG.getSignedConstant(EltIdx0, SL, MVT::i32));
9786
9787 SDValue Vec1 = SVN->getOperand(VecIdx1);
9788 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec1,
9789 DAG.getSignedConstant(EltIdx1, SL, MVT::i32));
9790 Pieces.push_back(DAG.getBuildVector(PackVT, SL, {Elt0, Elt1}));
9791 }
9792 }
9793
9794 return DAG.getNode(ISD::CONCAT_VECTORS, SL, ResultVT, Pieces);
9795}
9796
// Lowers SCALAR_TO_VECTOR to a build vector whose first element is the scalar
// operand and whose remaining elements are poison.
9797SDValue SITargetLowering::lowerSCALAR_TO_VECTOR(SDValue Op,
9798 SelectionDAG &DAG) const {
9799 SDValue SVal = Op.getOperand(0);
9800 EVT ResultVT = Op.getValueType();
9801 EVT SValVT = SVal.getValueType();
// Despite its name, UndefVal is a poison value of the scalar's type.
9802 SDValue UndefVal = DAG.getPOISON(SValVT);
9803 SDLoc SL(Op);
9804
// NOTE(review): listing line 9805 (presumably the declaration of VElts) is
// missing here.
9806 VElts.push_back(SVal);
// Element 0 holds the scalar; every remaining element is poison.
9807 for (int I = 1, E = ResultVT.getVectorNumElements(); I < E; ++I)
9808 VElts.push_back(UndefVal);
9809
9810 return DAG.getBuildVector(ResultVT, SL, VElts);
9811}
9812
// Custom lowering for BUILD_VECTOR.
//
// A 2 x 16-bit vector (v2f16 / v2i16 / v2bf16) is packed into one i32 with
// integer operations: element 0 in the low 16 bits, element 1 in the high 16
// bits. Any other type is split into 2-element parts, each bitcast to an
// integer, and the parts are combined by a build vector of those integers.
9813SDValue SITargetLowering::lowerBUILD_VECTOR(SDValue Op,
9814 SelectionDAG &DAG) const {
9815 SDLoc SL(Op);
9816 EVT VT = Op.getValueType();
9817
9818 if (VT == MVT::v2f16 || VT == MVT::v2i16 || VT == MVT::v2bf16) {
9819 assert(!Subtarget->hasVOP3PInsts() && "this should be legal");
9820
9821 SDValue Lo = Op.getOperand(0);
9822 SDValue Hi = Op.getOperand(1);
9823
9824 // Avoid adding defined bits with the zero_extend.
9825 if (Hi.isUndef()) {
9826 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9827 SDValue ExtLo = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Lo);
9828 return DAG.getNode(ISD::BITCAST, SL, VT, ExtLo);
9829 }
9830
// Move the high element into bits [31:16].
9831 Hi = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Hi);
9832 Hi = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Hi);
9833
9834 SDValue ShlHi = DAG.getNode(ISD::SHL, SL, MVT::i32, Hi,
9835 DAG.getConstant(16, SL, MVT::i32));
// With an undef low element the shifted high element is the whole result.
9836 if (Lo.isUndef())
9837 return DAG.getNode(ISD::BITCAST, SL, VT, ShlHi);
9838
9839 Lo = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Lo);
9840 Lo = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Lo);
9841
// Lo occupies only the low 16 bits and ShlHi only the high 16 bits, so the
// OR can carry the Disjoint flag.
9842 SDValue Or =
9843 DAG.getNode(ISD::OR, SL, MVT::i32, Lo, ShlHi, SDNodeFlags::Disjoint);
9844 return DAG.getNode(ISD::BITCAST, SL, VT, Or);
9845 }
9846
9847 // Split into 2-element chunks.
9848 const unsigned NumParts = VT.getVectorNumElements() / 2;
9849 EVT PartVT = MVT::getVectorVT(VT.getVectorElementType().getSimpleVT(), 2);
9850 MVT PartIntVT = MVT::getIntegerVT(PartVT.getSizeInBits());
9851
// NOTE(review): listing line 9852 (presumably the declaration of Casts) is
// missing here.
9853 for (unsigned P = 0; P < NumParts; ++P) {
9854 SDValue Vec = DAG.getBuildVector(
9855 PartVT, SL, {Op.getOperand(P * 2), Op.getOperand(P * 2 + 1)});
9856 Casts.push_back(DAG.getNode(ISD::BITCAST, SL, PartIntVT, Vec));
9857 }
9858
// Combine the integer-typed parts and reinterpret them as the result type.
9859 SDValue Blend =
9860 DAG.getBuildVector(MVT::getVectorVT(PartIntVT, NumParts), SL, Casts);
9861 return DAG.getNode(ISD::BITCAST, SL, VT, Blend);
9862}
9863
// Decides whether a constant offset may be folded into the global address GA.
// NOTE(review): this listing is missing the signature line (9864), the
// condition guarding the named-barrier block (9868) and the tail of the final
// return expression (9886-9888); only the visible parts are described.
9865 const GlobalAddressSDNode *GA) const {
9866 // Named barriers have fixed, non-relocated LDS addresses, so a constant
9867 // offset into an array of them can be folded into the address.
9869 const auto *GV = dyn_cast<GlobalVariable>(GA->getGlobal());
9870 return GV && AMDGPU::isNamedBarrier(*GV);
9871 }
9872
9873 // OSes that use ELF REL relocations (instead of RELA) can only store a
9874 // 32-bit addend in the instruction, so it is not safe to allow offset folding
9875 // which can create arbitrary 64-bit addends. (This is only a problem for
9876 // R_AMDGPU_*32_HI relocations since other relocation types are unaffected by
9877 // the high 32 bits of the addend.)
9878 //
9879 // This should be kept in sync with how HasRelocationAddend is initialized in
9880 // the constructor of ELFAMDGPUAsmBackend.
9881 if (!Subtarget->isAmdHsaOS())
9882 return false;
9883
9884 // We can fold offsets for anything that doesn't require a GOT relocation.
9885 return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
9889}
9890
// Builds a pc-relative address computation for the global GV at byte offset
// Offset, yielding a value of type PtrVT. GAFlags is used as the target flag
// on the target global address node(s); MO_NONE is the default.
// NOTE(review): listing line 9892, carrying the function name and leading
// parameters, is missing. The body uses DAG and GV, and callers elsewhere in
// this file pass (DAG, GV, DL, Offset, PtrVT[, GAFlags]).
9891static SDValue
9893 const SDLoc &DL, int64_t Offset, EVT PtrVT,
9894 unsigned GAFlags = SIInstrInfo::MO_NONE) {
9895 assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
9896 // In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
9897 // lowered to the following code sequence:
9898 //
9899 // For constant address space:
9900 // s_getpc_b64 s[0:1]
9901 // s_add_u32 s0, s0, $symbol
9902 // s_addc_u32 s1, s1, 0
9903 //
9904 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9905 // a fixup or relocation is emitted to replace $symbol with a literal
9906 // constant, which is a pc-relative offset from the encoding of the $symbol
9907 // operand to the global variable.
9908 //
9909 // For global address space:
9910 // s_getpc_b64 s[0:1]
9911 // s_add_u32 s0, s0, $symbol@{gotpc}rel32@lo
9912 // s_addc_u32 s1, s1, $symbol@{gotpc}rel32@hi
9913 //
9914 // s_getpc_b64 returns the address of the s_add_u32 instruction and then
9915 // fixups or relocations are emitted to replace $symbol@*@lo and
9916 // $symbol@*@hi with lower 32 bits and higher 32 bits of a literal constant,
9917 // which is a 64-bit pc-relative offset from the encoding of the $symbol
9918 // operand to the global variable.
// On subtargets with 64-bit literals a single PC_ADD_REL_OFFSET64 node with
// one 64-bit target address is emitted.
// NOTE(review): GAFlags + 2 (and GAFlags + 1 below) presumably pick the
// 64-bit and "hi" variants that follow the given flag in the SIInstrInfo
// enum - confirm the enum layout.
9919 if (((const GCNSubtarget &)DAG.getSubtarget()).has64BitLiterals()) {
9920 assert(GAFlags != SIInstrInfo::MO_NONE);
9921
9922 SDValue Ptr =
9923 DAG.getTargetGlobalAddress(GV, DL, MVT::i64, Offset, GAFlags + 2);
9924 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET64, DL, PtrVT, Ptr);
9925 }
9926
// Otherwise the address is described by a low and a high 32-bit operand.
// With MO_NONE the high operand is the constant 0.
9927 SDValue PtrLo = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags);
9928 SDValue PtrHi;
9929 if (GAFlags == SIInstrInfo::MO_NONE)
9930 PtrHi = DAG.getTargetConstant(0, DL, MVT::i32);
9931 else
9932 PtrHi = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, Offset, GAFlags + 1);
9933 return DAG.getNode(AMDGPUISD::PC_ADD_REL_OFFSET, DL, PtrVT, PtrLo, PtrHi);
9934}
9935
// Lowers a GlobalAddress node. The visible code produces, depending on the
// global and the target OS:
//   - GET_GROUPSTATICSIZE for a zero-sized global with external linkage
//     (dynamic shared memory),
//   - an AMDGPUISD::LDS node wrapping a 32-bit target global address,
//   - an absolute 64-bit address on AMDPAL / Mesa3D,
//   - a pc-relative address when shouldEmitFixup or shouldEmitPCReloc holds,
//   - otherwise a load of the address through a pc-relative GOT entry.
// NOTE(review): many listing lines are missing from this function (9944-9948,
// 9959, 9966, 9969, 9971, 9977, 9999, 10002, 10004, 10008, 10011-10012), so
// several conditions and call arguments are not visible here.
9936SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI,
9937 SDValue Op,
9938 SelectionDAG &DAG) const {
9939 GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
9940 SDLoc DL(GSD);
9941 EVT PtrVT = Op.getValueType();
9942
9943 const GlobalValue *GV = GSD->getGlobal();
// NOTE(review): the start of the enclosing condition is missing here.
9949 GV->hasExternalLinkage()) {
9950 const GlobalVariable &GVar = *cast<GlobalVariable>(GV);
9951 // HIP uses an unsized array `extern __shared__ T s[]` or similar
9952 // zero-sized type in other languages to declare the dynamic shared
9953 // memory whose size is not known at compile time. They will be
9954 // allocated by the runtime and placed directly after the statically
9955 // allocated ones. They all share the same offset.
9956 if (GVar.getGlobalSize(GVar.getDataLayout()) == 0) {
9957 assert(PtrVT == MVT::i32 && "32-bit pointer is expected.");
9958 // Adjust alignment for that dynamic shared memory array.
// NOTE(review): the definition of F (listing line 9959) is missing.
9960 MFI->setDynLDSAlign(F, GVar);
9961 MFI->setUsesDynamicLDS(true);
9962 return SDValue(
9963 DAG.getMachineNode(AMDGPU::GET_GROUPSTATICSIZE, DL, PtrVT), 0);
9964 }
9965 }
9967 }
9968
// NOTE(review): the condition opening this block and the final argument of
// the call below are missing.
9970 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, GSD->getOffset(),
9972 return DAG.getNode(AMDGPUISD::LDS, DL, MVT::i32, GA);
9973 }
9974
// AMDPAL / Mesa3D: materialize the absolute address with S_MOV, either as
// one 64-bit move or as two 32-bit moves joined by BUILD_PAIR.
9975 if (Subtarget->isAmdPalOS() || Subtarget->isMesa3DOS()) {
9976 if (Subtarget->has64BitLiterals()) {
9978 GV, DL, MVT::i64, GSD->getOffset(), SIInstrInfo::MO_ABS64);
9979 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B64, DL, MVT::i64, Addr),
9980 0);
9981 }
9982
9983 SDValue AddrLo = DAG.getTargetGlobalAddress(
9984 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_LO);
9985 AddrLo = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrLo), 0};
9986
9987 SDValue AddrHi = DAG.getTargetGlobalAddress(
9988 GV, DL, MVT::i32, GSD->getOffset(), SIInstrInfo::MO_ABS32_HI);
9989 AddrHi = {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, AddrHi), 0};
9990
9991 return DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, AddrLo, AddrHi);
9992 }
9993
// Fixup case: pc-relative address with the default (MO_NONE) flags.
9994 if (shouldEmitFixup(GV))
9995 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT);
9996
// NOTE(review): the flag argument of this call (listing line 9999) is
// missing.
9997 if (shouldEmitPCReloc(GV))
9998 return buildPCRelGlobalAddress(DAG, GV, DL, GSD->getOffset(), PtrVT,
10000
// Fallback: compute the pc-relative address of the GOT entry (offset 0) and
// load the real address from it.
// NOTE(review): several argument lines in this tail are missing.
10001 SDValue GOTAddr = buildPCRelGlobalAddress(DAG, GV, DL, 0, PtrVT,
10003 PointerType *PtrTy =
10005 const DataLayout &DataLayout = DAG.getDataLayout();
10006 Align Alignment = DataLayout.getABITypeAlign(PtrTy);
10007 MachinePointerInfo PtrInfo =
10009
10010 return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
10013}
10014
10015SDValue SITargetLowering::LowerExternalSymbol(SDValue Op,
10016 SelectionDAG &DAG) const {
10017 // TODO: Handle this. It should be mostly the same as LowerGlobalAddress.
10018 const Function &Fn = DAG.getMachineFunction().getFunction();
10019 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10020 Fn, "unsupported external symbol", Op.getDebugLoc()));
10021 return DAG.getPOISON(Op.getValueType());
10022}
10023
// Creates an SI_INIT_M0 machine node with operands (V, Chain) and result
// types (MVT::Other, MVT::Glue), and returns its result 0 (the MVT::Other
// value).
// NOTE(review): the first line of the signature (listing line 10024) is
// missing; DAG and Chain are used below but declared on that line.
10025 const SDLoc &DL, SDValue V) const {
10026 // We can't use S_MOV_B32 directly, because there is no way to specify m0 as
10027 // the destination register.
10028 //
10029 // We can't use CopyToReg, because MachineCSE won't combine COPY instructions,
10030 // so we will end up with redundant moves to m0.
10031 //
10032 // We use a pseudo to ensure we emit s_mov_b32 with m0 as the direct result.
10033
10034 // A Null SDValue creates a glue result.
10035 SDNode *M0 = DAG.getMachineNode(AMDGPU::SI_INIT_M0, DL, MVT::Other, MVT::Glue,
10036 V, Chain);
10037 return SDValue(M0, 0);
10038}
10039
10040SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
10041 MVT VT,
10042 unsigned Offset) const {
10043 SDLoc SL(Op);
10044 SDValue Param = lowerKernargMemParameter(
10045 DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
10046 // The local size values will have the hi 16-bits as zero.
10047 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
10048 DAG.getValueType(VT));
10049}
10050
// Emits the diagnostic "non-hsa intrinsic with hsa target" at DL's debug
// location and returns a poison value of type VT.
// NOTE(review): the signature line (10051) and the start of the diagnostic
// call (10053-10054) are missing from this listing.
10052 EVT VT) {
10055 "non-hsa intrinsic with hsa target", DL.getDebugLoc()));
10056 return DAG.getPOISON(VT);
10057}
10058
// Emits the diagnostic "intrinsic not supported on subtarget" at DL's debug
// location and returns a poison value of type VT.
// NOTE(review): the signature line (10059) and the start of the diagnostic
// call (10061-10062) are missing from this listing.
10060 EVT VT) {
10063 "intrinsic not supported on subtarget", DL.getDebugLoc()));
10064 return DAG.getPOISON(VT);
10065}
10066
// Packs Elts into a single value made of f32 elements: each element that is
// not already f32 is bitcast to f32. Up to 12 elements give a vector of
// exactly that many f32; 13 to 16 elements give v16f32 padded with poison.
// A single element is returned as a scalar rather than as a vector.
// NOTE(review): the first line of the signature (listing line 10067) is
// missing; DAG and DL are used below but declared on that line.
10068 ArrayRef<SDValue> Elts) {
10069 assert(!Elts.empty());
10070 MVT Type;
10071 unsigned NumElts = Elts.size();
10072
10073 if (NumElts <= 12) {
10074 Type = MVT::getVectorVT(MVT::f32, NumElts);
10075 } else {
// More than 12 elements: round up to 16.
10076 assert(Elts.size() <= 16);
10077 Type = MVT::v16f32;
10078 NumElts = 16;
10079 }
10080
10081 SmallVector<SDValue, 16> VecElts(NumElts);
10082 for (unsigned i = 0; i < Elts.size(); ++i) {
10083 SDValue Elt = Elts[i];
10084 if (Elt.getValueType() != MVT::f32)
10085 Elt = DAG.getBitcast(MVT::f32, Elt);
10086 VecElts[i] = Elt;
10087 }
// Fill any padding slots with poison.
10088 for (unsigned i = Elts.size(); i < NumElts; ++i)
10089 VecElts[i] = DAG.getPOISON(MVT::f32);
10090
10091 if (NumElts == 1)
10092 return VecElts[0];
10093 return DAG.getBuildVector(Type, DL, VecElts);
10094}
10095
// Builds a vector of type CastVT from the elements of Src (or from Src itself
// when it is a scalar) followed by ExtraElts poison elements of Src's scalar
// type.
// NOTE(review): the loop below counts ExtraElts down to zero, so a negative
// ExtraElts would not terminate as intended - callers presumably never pass
// one; confirm.
10096static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
10097 SDValue Src, int ExtraElts) {
10098 EVT SrcVT = Src.getValueType();
10099
// NOTE(review): listing line 10100 (presumably the declaration of Elts) is
// missing here.
10101
10102 if (SrcVT.isVector())
10103 DAG.ExtractVectorElements(Src, Elts);
10104 else
10105 Elts.push_back(Src);
10106
// Despite its name, Undef is a poison value.
10107 SDValue Undef = DAG.getPOISON(SrcVT.getScalarType());
10108 while (ExtraElts--)
10109 Elts.push_back(Undef);
10110
10111 return DAG.getBuildVector(CastVT, DL, Elts);
10112}
10113
10114// Re-construct the required return value for an image load intrinsic.
10115// This is more complicated due to the optional use of TexFailCtrl, which means
10116// the required return type is an aggregate.
//
// Result is the node whose value 0 holds the loaded dwords; ResultTypes[0] is
// the data type the caller expects. The returned value is the converted data,
// merged with the TexFail dword and/or Result's value 1 where applicable (see
// the three returns at the end).
// NOTE(review): the first line of the signature (listing line 10117) is
// missing; DAG and Result are used below but declared on that line.
10118 ArrayRef<EVT> ResultTypes, bool IsTexFail,
10119 bool Unpacked, bool IsD16, int DMaskPop,
10120 int NumVDataDwords, bool IsAtomicPacked16Bit,
10121 const SDLoc &DL) {
10122 // Determine the required return type. This is the same regardless of
10123 // IsTexFail flag
10124 EVT ReqRetVT = ResultTypes[0];
10125 int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
// Packed 16-bit data stores two elements per dword, so round up to dwords.
10126 int NumDataDwords = ((IsD16 && !Unpacked) || IsAtomicPacked16Bit)
10127 ? (ReqRetNumElts + 1) / 2
10128 : ReqRetNumElts;
10129
// Number of dwords extracted from Result for the DMaskPop components.
10130 int MaskPopDwords = (!IsD16 || Unpacked) ? DMaskPop : (DMaskPop + 1) / 2;
10131
10132 MVT DataDwordVT =
10133 NumDataDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
10134
10135 MVT MaskPopVT =
10136 MaskPopDwords == 1 ? MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
10137
10138 SDValue Data(Result, 0);
10139 SDValue TexFail;
10140
// Take the leading MaskPopDwords dwords of the result as the data.
10141 if (DMaskPop > 0 && Data.getValueType() != MaskPopVT) {
10142 SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
10143 if (MaskPopVT.isVector()) {
10144 Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
10145 SDValue(Result, 0), ZeroIdx);
10146 } else {
10147 Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
10148 SDValue(Result, 0), ZeroIdx);
10149 }
10150 }
10151
// Pad with poison up to the number of dwords the requested type needs.
10152 if (DataDwordVT.isVector() && !IsAtomicPacked16Bit)
10153 Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
10154 NumDataDwords - MaskPopDwords);
10155
10156 if (IsD16)
10157 Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
10158
10159 EVT LegalReqRetVT = ReqRetVT;
10160 if (!ReqRetVT.isVector()) {
// Scalar result: convert to an integer type and truncate to the integer form
// of the requested type; the final bitcast below restores the requested type.
10161 if (!Data.getValueType().isInteger())
10162 Data = DAG.getNode(ISD::BITCAST, DL,
10163 Data.getValueType().changeTypeToInteger(), Data);
10164 Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
10165 } else {
10166 // We need to widen the return vector to a legal type
10167 if ((ReqRetVT.getVectorNumElements() % 2) == 1 &&
10168 ReqRetVT.getVectorElementType().getSizeInBits() == 16) {
// NOTE(review): listing line 10170 (the start of the right-hand side) is
// missing; the visible argument adds one element to the element count.
10169 LegalReqRetVT =
10171 ReqRetVT.getVectorNumElements() + 1);
10172 }
10173 }
10174 Data = DAG.getNode(ISD::BITCAST, DL, LegalReqRetVT, Data);
10175
// The TexFail dword is read from index MaskPopDwords of the raw result,
// i.e. immediately after the data dwords.
10176 if (IsTexFail) {
10177 TexFail =
10178 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SDValue(Result, 0),
10179 DAG.getConstant(MaskPopDwords, DL, MVT::i32));
10180
10181 return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
10182 }
10183
// A node with a single value has no second result to forward.
10184 if (Result->getNumValues() == 1)
10185 return Data;
10186
10187 return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
10188}
10189
10190static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
10191 SDValue *LWE, bool &IsTexFail) {
10192 auto *TexFailCtrlConst = cast<ConstantSDNode>(TexFailCtrl.getNode());
10193
10194 uint64_t Value = TexFailCtrlConst->getZExtValue();
10195 if (Value) {
10196 IsTexFail = true;
10197 }
10198
10199 SDLoc DL(TexFailCtrlConst);
10200 *TFE = DAG.getTargetConstant((Value & 0x1) ? 1 : 0, DL, MVT::i32);
10201 Value &= ~(uint64_t)0x1;
10202 *LWE = DAG.getTargetConstant((Value & 0x2) ? 1 : 0, DL, MVT::i32);
10203 Value &= ~(uint64_t)0x2;
10204
10205 return Value == 0;
10206}
10207
// Packs the 16-bit operands Op[DimIdx .. EndIdx) into 32-bit values and
// appends each one, bitcast to f32, to PackedAddrs. Two neighbouring operands
// normally share one dword (built as a PackVectorVT vector); an operand that
// has no partner is any-extended to 32 bits on its own.
// NOTE(review): the first line of the signature (listing line 10208) is
// missing; DAG and Op are used below but declared on that line.
10209 MVT PackVectorVT,
10210 SmallVectorImpl<SDValue> &PackedAddrs,
10211 unsigned DimIdx, unsigned EndIdx,
10212 unsigned NumGradients) {
10213 SDLoc DL(Op);
10214 for (unsigned I = DimIdx; I < EndIdx; I++) {
10215 SDValue Addr = Op.getOperand(I);
10216
10217 // Gradients are packed with undef for each coordinate.
10218 // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
10219 // 1D: undef,dx/dh; undef,dx/dv
10220 // 2D: dy/dh,dx/dh; dy/dv,dx/dv
10221 // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
// An operand is emitted alone when it is the last one in the range, or when
// each half of the gradients has an odd number of components and this
// operand is the last component of either half.
10222 if (((I + 1) >= EndIdx) ||
10223 ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
10224 I == DimIdx + NumGradients - 1))) {
10225 if (Addr.getValueType() != MVT::i16)
10226 Addr = DAG.getBitcast(MVT::i16, Addr);
10227 Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
10228 } else {
// Pair this operand with the next one and skip over the partner.
10229 Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
10230 I++;
10231 }
10232 Addr = DAG.getBitcast(MVT::f32, Addr);
10233 PackedAddrs.push_back(Addr);
10234 }
10235}
10236
// Lower the image intrinsic node Op, described by Intr, to a MIMG machine
// node. WithChain selects the offset of the first intrinsic argument (2 when
// set, 1 otherwise). Op itself is returned unchanged whenever the operands are
// rejected: unsupported D16 or A16 use, a result type too small for the dmask,
// unknown texfailctrl or cache-policy bits, an unexpected rsrc/sampler type,
// or no matching machine opcode.
// NOTE(review): this listing is missing several lines of the function (the
// declaration of the Intr parameter and of the VAddrs, Ops and Elt containers,
// plus a few initializer expressions); confirm details against upstream.
10237SDValue SITargetLowering::lowerImage(SDValue Op,
10239 SelectionDAG &DAG, bool WithChain) const {
10240 SDLoc DL(Op);
10241 MachineFunction &MF = DAG.getMachineFunction();
10242 const GCNSubtarget *ST = &MF.getSubtarget<GCNSubtarget>();
10243 unsigned IntrOpcode = Intr->BaseOpcode;
10244 // For image atomic: use no-return opcode if result is unused.
10245 if (Intr->AtomicNoRetBaseOpcode != Intr->BaseOpcode &&
10246 !Op.getNode()->hasAnyUseOfValue(0))
10247 IntrOpcode = Intr->AtomicNoRetBaseOpcode;
10248 const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
10250 const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
10251 bool IsGFX10Plus = AMDGPU::isGFX10Plus(*Subtarget);
10252 bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
10253 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
10254 bool IsGFX13 = AMDGPU::isGFX13(*Subtarget);
10255
// ResultTypes is edited below until it describes the results of the machine
// node; OrigResultTypes keeps the intrinsic's own result list, which is used
// when building the values returned to the caller.
10256 SmallVector<EVT, 3> ResultTypes(Op->values());
10257 SmallVector<EVT, 3> OrigResultTypes(Op->values());
// A no-return atomic produces no data value, so drop the first result type.
10258 if (BaseOpcode->NoReturn && BaseOpcode->Atomic)
10259 ResultTypes.erase(&ResultTypes[0]);
10260
10261 bool IsD16 = false;
10262 bool IsG16 = false;
10263 bool IsA16 = false;
10264 SDValue VData;
10265 int NumVDataDwords = 0;
10266 bool AdjustRetType = false;
10267 bool IsAtomicPacked16Bit = false;
10268
10269 // Offset of intrinsic arguments
10270 const unsigned ArgOffset = WithChain ? 2 : 1;
10271
10272 unsigned DMask;
10273 unsigned DMaskLanes = 0;
10274
// Atomics: the data operand is operand 2 (plus operand 3 for AtomicX2), and
// both the dmask and the data dword count follow from the data width alone.
10275 if (BaseOpcode->Atomic) {
10276 VData = Op.getOperand(2);
10277
10278 IsAtomicPacked16Bit =
10279 (IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16 ||
10280 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_F16_NORTN ||
10281 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16 ||
10282 IntrOpcode == AMDGPU::IMAGE_ATOMIC_PK_ADD_BF16_NORTN);
10283
10284 bool Is64Bit = VData.getValueSizeInBits() == 64;
10285 if (BaseOpcode->AtomicX2) {
10286 SDValue VData2 = Op.getOperand(3);
10287 VData = DAG.getBuildVector(Is64Bit ? MVT::v2i64 : MVT::v2i32, DL,
10288 {VData, VData2});
10289 if (Is64Bit)
10290 VData = DAG.getBitcast(MVT::v4i32, VData);
10291
10292 if (!BaseOpcode->NoReturn)
10293 ResultTypes[0] = Is64Bit ? MVT::v2i64 : MVT::v2i32;
10294
10295 DMask = Is64Bit ? 0xf : 0x3;
10296 NumVDataDwords = Is64Bit ? 4 : 2;
10297 } else {
10298 DMask = Is64Bit ? 0x3 : 0x1;
10299 NumVDataDwords = Is64Bit ? 2 : 1;
10300 }
10301 } else {
// Non-atomics: the dmask is an explicit constant operand. Gather4 always
// counts as four lanes; otherwise the lane count is the dmask popcount.
10302 DMask = Op->getConstantOperandVal(ArgOffset + Intr->DMaskIndex);
10303 DMaskLanes = BaseOpcode->Gather4 ? 4 : llvm::popcount(DMask);
10304
10305 if (BaseOpcode->Store) {
10306 VData = Op.getOperand(2);
10307
10308 MVT StoreVT = VData.getSimpleValueType();
10309 if (StoreVT.getScalarType() == MVT::f16) {
10310 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10311 return Op; // D16 is unsupported for this instruction
10312
10313 IsD16 = true;
10314 VData = handleD16VData(VData, DAG, true);
10315 }
10316
// Round the (possibly repacked) store data size up to whole dwords.
10317 NumVDataDwords = (VData.getValueType().getSizeInBits() + 31) / 32;
10318 } else if (!BaseOpcode->NoReturn) {
10319 // Work out the num dwords based on the dmask popcount and underlying type
10320 // and whether packing is supported.
10321 MVT LoadVT = ResultTypes[0].getSimpleVT();
10322 if (LoadVT.getScalarType() == MVT::f16) {
10323 if (!Subtarget->hasD16Images() || !BaseOpcode->HasD16)
10324 return Op; // D16 is unsupported for this instruction
10325
10326 IsD16 = true;
10327 }
10328
10329 // Confirm that the return type is large enough for the dmask specified
10330 if ((LoadVT.isVector() && LoadVT.getVectorNumElements() < DMaskLanes) ||
10331 (!LoadVT.isVector() && DMaskLanes > 1))
10332 return Op;
10333
10334 // The sq block of gfx8 and gfx9 do not estimate register use correctly
10335 // for d16 image_gather4, image_gather4_l, and image_gather4_lz
10336 // instructions.
10337 if (IsD16 && !Subtarget->hasUnpackedD16VMem() &&
10338 !(BaseOpcode->Gather4 && Subtarget->hasImageGather4D16Bug()))
10339 NumVDataDwords = (DMaskLanes + 1) / 2;
10340 else
10341 NumVDataDwords = DMaskLanes;
10342
10343 AdjustRetType = true;
10344 }
10345 }
10346
10347 unsigned VAddrEnd = ArgOffset + Intr->VAddrEnd;
10349
10350 // Check for 16 bit addresses or derivatives and pack if true.
10351 MVT VAddrVT =
10352 Op.getOperand(ArgOffset + Intr->GradientStart).getSimpleValueType();
10353 MVT VAddrScalarVT = VAddrVT.getScalarType();
10354 MVT GradPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10355 IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10356
10357 VAddrVT = Op.getOperand(ArgOffset + Intr->CoordStart).getSimpleValueType();
10358 VAddrScalarVT = VAddrVT.getScalarType();
10359 MVT AddrPackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
10360 IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
10361
10362 // Push back extra arguments.
10363 for (unsigned I = Intr->VAddrStart; I < Intr->GradientStart; I++) {
10364 if (IsA16 && (Op.getOperand(ArgOffset + I).getValueType() == MVT::f16)) {
10365 assert(I == Intr->BiasIndex && "Got unexpected 16-bit extra argument");
10366 // Special handling of bias when A16 is on. Bias is of type half but
10367 // occupies full 32-bit.
10368 SDValue Bias = DAG.getBuildVector(
10369 MVT::v2f16, DL,
10370 {Op.getOperand(ArgOffset + I), DAG.getPOISON(MVT::f16)});
10371 VAddrs.push_back(Bias);
10372 } else {
10373 assert((!IsA16 || Intr->NumBiasArgs == 0 || I != Intr->BiasIndex) &&
10374 "Bias needs to be converted to 16 bit in A16 mode");
10375 VAddrs.push_back(Op.getOperand(ArgOffset + I));
10376 }
10377 }
10378
10379 if (BaseOpcode->Gradients && !ST->hasG16() && (IsA16 != IsG16)) {
10380 // 16 bit gradients are supported, but are tied to the A16 control
10381 // so both gradients and addresses must be 16 bit
10382 LLVM_DEBUG(
10383 dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
10384 "require 16 bit args for both gradients and addresses");
10385 return Op;
10386 }
10387
10388 if (IsA16) {
10389 if (!ST->hasA16()) {
10390 LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
10391 "support 16 bit addresses\n");
10392 return Op;
10393 }
10394 }
10395
10396 // We've dealt with incorrect input so we know that if IsA16, IsG16
10397 // are set then we have to compress/pack operands (either address,
10398 // gradient or both)
10399 // In the case where a16 and gradients are tied (no G16 support) then we
10400 // have already verified that both IsA16 and IsG16 are true
10401 if (BaseOpcode->Gradients && IsG16 && ST->hasG16()) {
10402 // Activate g16
10403 const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
10405 IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
10406 }
10407
10408 // Add gradients (packed or unpacked)
10409 if (IsG16) {
10410 // Pack the gradients
10411 // const int PackEndIdx = IsA16 ? VAddrEnd : (ArgOffset + Intr->CoordStart);
10412 packImage16bitOpsToDwords(DAG, Op, GradPackVectorVT, VAddrs,
10413 ArgOffset + Intr->GradientStart,
10414 ArgOffset + Intr->CoordStart, Intr->NumGradients);
10415 } else {
10416 for (unsigned I = ArgOffset + Intr->GradientStart;
10417 I < ArgOffset + Intr->CoordStart; I++)
10418 VAddrs.push_back(Op.getOperand(I));
10419 }
10420
10421 // Add addresses (packed or unpacked)
10422 if (IsA16) {
10423 packImage16bitOpsToDwords(DAG, Op, AddrPackVectorVT, VAddrs,
10424 ArgOffset + Intr->CoordStart, VAddrEnd,
10425 0 /* No gradients */);
10426 } else {
10427 // Add uncompressed address
10428 for (unsigned I = ArgOffset + Intr->CoordStart; I < VAddrEnd; I++)
10429 VAddrs.push_back(Op.getOperand(I));
10430 }
10431
10432 // If the register allocator cannot place the address registers contiguously
10433 // without introducing moves, then using the non-sequential address encoding
10434 // is always preferable, since it saves VALU instructions and is usually a
10435 // wash in terms of code size or even better.
10436 //
10437 // However, we currently have no way of hinting to the register allocator that
10438 // MIMG addresses should be placed contiguously when it is possible to do so,
10439 // so force non-NSA for the common 2-address case as a heuristic.
10440 //
10441 // SIShrinkInstructions will convert NSA encodings to non-NSA after register
10442 // allocation when possible.
10443 //
10444 // Partial NSA is allowed on GFX11+ where the final register is a contiguous
10445 // set of the remaining addresses.
10446 const unsigned NSAMaxSize = ST->getNSAMaxSize(BaseOpcode->Sampler);
10447 const bool HasPartialNSAEncoding = ST->hasPartialNSAEncoding();
10448 const bool UseNSA = ST->hasNSAEncoding() &&
10449 VAddrs.size() >= ST->getNSAThreshold(MF) &&
10450 (VAddrs.size() <= NSAMaxSize || HasPartialNSAEncoding);
10451 const bool UsePartialNSA =
10452 UseNSA && HasPartialNSAEncoding && VAddrs.size() > NSAMaxSize;
10453
// VAddr holds the single combined address vector. It stays empty for full
// NSA, where each address is passed as its own operand; for partial NSA it
// combines only the addresses from index NSAMaxSize - 1 onwards.
10454 SDValue VAddr;
10455 if (UsePartialNSA) {
10456 VAddr = getBuildDwordsVector(DAG, DL,
10457 ArrayRef(VAddrs).drop_front(NSAMaxSize - 1));
10458 } else if (!UseNSA) {
10459 VAddr = getBuildDwordsVector(DAG, DL, VAddrs);
10460 }
10461
10462 SDValue True = DAG.getTargetConstant(1, DL, MVT::i1);
10463 SDValue False = DAG.getTargetConstant(0, DL, MVT::i1);
// Unorm is forced on for opcodes without a sampler; otherwise it comes from
// the intrinsic's constant unorm operand.
10464 SDValue Unorm;
10465 if (!BaseOpcode->Sampler) {
10466 Unorm = True;
10467 } else {
10468 uint64_t UnormConst =
10469 Op.getConstantOperandVal(ArgOffset + Intr->UnormIndex);
10470
10471 Unorm = UnormConst ? True : False;
10472 }
10473
10474 SDValue TFE;
10475 SDValue LWE;
10476 SDValue TexFail = Op.getOperand(ArgOffset + Intr->TexFailCtrlIndex);
10477 bool IsTexFail = false;
// parseTexFail returns false when unknown texfailctrl bits are set.
10478 if (!parseTexFail(TexFail, DAG, &TFE, &LWE, IsTexFail))
10479 return Op;
10480
10481 if (IsTexFail) {
10482 if (!DMaskLanes) {
10483 // Expecting to get an error flag since TFC is on - and dmask is 0
10484 // Force dmask to be at least 1 otherwise the instruction will fail
10485 DMask = 0x1;
10486 DMaskLanes = 1;
10487 NumVDataDwords = 1;
10488 }
// Reserve one extra result dword (presumably for the texfail status value).
10489 NumVDataDwords += 1;
10490 AdjustRetType = true;
10491 }
10492
10493 // Has something earlier tagged that the return type needs adjusting
10494 // This happens if the instruction is a load or has set TexFailCtrl flags
10495 if (AdjustRetType) {
10496 // NumVDataDwords reflects the true number of dwords required in the return
10497 // type
10498 if (DMaskLanes == 0 && !BaseOpcode->Store) {
10499 // This is a no-op load. This can be eliminated
10500 SDValue Undef = DAG.getPOISON(Op.getValueType());
10501 if (isa<MemSDNode>(Op))
10502 return DAG.getMergeValues({Undef, Op.getOperand(0)}, DL);
10503 return Undef;
10504 }
10505
10506 EVT NewVT = NumVDataDwords > 1 ? EVT::getVectorVT(*DAG.getContext(),
10507 MVT::i32, NumVDataDwords)
10508 : MVT::i32;
10509
10510 ResultTypes[0] = NewVT;
10511 if (ResultTypes.size() == 3) {
10512 // Original result was aggregate type used for TexFailCtrl results
10513 // The actual instruction returns as a vector type which has now been
10514 // created. Remove the aggregate result.
10515 ResultTypes.erase(&ResultTypes[1]);
10516 }
10517 }
10518
10519 unsigned CPol = Op.getConstantOperandVal(ArgOffset + Intr->CachePolicyIndex);
10520 // Keep GLC only when the atomic's result is actually used.
10521 if (BaseOpcode->Atomic && !BaseOpcode->NoReturn)
10523 if (CPol & ~((IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12) |
10525 return Op;
10526
// Assemble the machine-node operands; the order of the pushes below is the
// operand order of the selected instruction.
10528 if (BaseOpcode->Store || BaseOpcode->Atomic)
10529 Ops.push_back(VData); // vdata
10530 if (UsePartialNSA) {
10531 append_range(Ops, ArrayRef(VAddrs).take_front(NSAMaxSize - 1));
10532 Ops.push_back(VAddr);
10533 } else if (UseNSA)
10534 append_range(Ops, VAddrs);
10535 else
10536 Ops.push_back(VAddr);
10537 SDValue Rsrc = Op.getOperand(ArgOffset + Intr->RsrcIndex);
10538 EVT RsrcVT = Rsrc.getValueType();
10539 if (RsrcVT != MVT::v4i32 && RsrcVT != MVT::v8i32)
10540 return Op;
10541 Ops.push_back(Rsrc);
10542 if (BaseOpcode->Sampler) {
10543 SDValue Samp = Op.getOperand(ArgOffset + Intr->SampIndex);
10544 if (Samp.getValueType() != MVT::v4i32)
10545 return Op;
10546 Ops.push_back(Samp);
10547 }
10548 Ops.push_back(DAG.getTargetConstant(DMask, DL, MVT::i32));
10549 if (IsGFX10Plus)
10550 Ops.push_back(DAG.getTargetConstant(DimInfo->Encoding, DL, MVT::i32));
10551 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10552 Ops.push_back(Unorm);
10553 Ops.push_back(DAG.getTargetConstant(CPol, DL, MVT::i32));
10554 Ops.push_back(IsA16 && // r128, a16 for gfx9
10555 ST->hasFeature(AMDGPU::FeatureR128A16)
10556 ? True
10557 : False);
10558 if (IsGFX10Plus)
10559 Ops.push_back(IsA16 ? True : False);
10560
// Subtargets with GFX90A instructions get no tfe operand; a request for TFE
// there is diagnosed as unsupported instead.
10561 if (!Subtarget->hasGFX90AInsts())
10562 Ops.push_back(TFE); // tfe
10563 else if (TFE->getAsZExtVal()) {
10564 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10566 "TFE is not supported on this GPU", DL.getDebugLoc()));
10567 }
10568
10569 if (!IsGFX12Plus || BaseOpcode->Sampler || BaseOpcode->MSAA)
10570 Ops.push_back(LWE); // lwe
10571 if (!IsGFX10Plus)
10572 Ops.push_back(DimInfo->DA ? True : False);
10573 if (BaseOpcode->HasD16)
10574 Ops.push_back(IsD16 ? True : False);
10575 if (isa<MemSDNode>(Op))
10576 Ops.push_back(Op.getOperand(0)); // chain
10577
// With NSA every entry of VAddrs is counted as one address dword; otherwise
// the count is the size of the combined VAddr vector in dwords.
10578 int NumVAddrDwords =
10579 UseNSA ? VAddrs.size() : VAddr.getValueType().getSizeInBits() / 32;
10580 int Opcode = -1;
10581
// Pick the concrete opcode for the target's MIMG encoding from the data and
// address dword counts, checking the newest generation first.
10582 if (IsGFX13) {
10583 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx13,
10584 NumVDataDwords, NumVAddrDwords);
10585 } else if (IsGFX12Plus) {
10586 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx12,
10587 NumVDataDwords, NumVAddrDwords);
10588 } else if (IsGFX11Plus) {
10589 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10590 UseNSA ? AMDGPU::MIMGEncGfx11NSA
10591 : AMDGPU::MIMGEncGfx11Default,
10592 NumVDataDwords, NumVAddrDwords);
10593 } else if (IsGFX10Plus) {
10594 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
10595 UseNSA ? AMDGPU::MIMGEncGfx10NSA
10596 : AMDGPU::MIMGEncGfx10Default,
10597 NumVDataDwords, NumVAddrDwords);
10598 } else {
10599 if (Subtarget->hasGFX90AInsts()) {
10600 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx90a,
10601 NumVDataDwords, NumVAddrDwords);
10602 if (Opcode == -1) {
10603 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10605 "requested image instruction is not supported on this GPU",
10606 DL.getDebugLoc()));
10607
// After the diagnostic, return poison for every non-chain result and forward
// the incoming chain so the rest of the DAG remains well formed.
10608 unsigned Idx = 0;
10609 SmallVector<SDValue, 3> RetValues(OrigResultTypes.size());
10610 for (EVT VT : OrigResultTypes) {
10611 if (VT == MVT::Other)
10612 RetValues[Idx++] = Op.getOperand(0); // Chain
10613 else
10614 RetValues[Idx++] = DAG.getPOISON(VT);
10615 }
10616
10617 return DAG.getMergeValues(RetValues, DL);
10618 }
10619 }
// Fall back to the GFX8 encoding (VOLCANIC_ISLANDS and later) and finally to
// the GFX6 encoding.
10620 if (Opcode == -1 &&
10621 Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10622 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
10623 NumVDataDwords, NumVAddrDwords);
10624 if (Opcode == -1)
10625 Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
10626 NumVDataDwords, NumVAddrDwords);
10627 }
10628 if (Opcode == -1)
10629 return Op;
10630
10631 MachineSDNode *NewNode = DAG.getMachineNode(Opcode, DL, ResultTypes, Ops);
// Carry the memory operand of the original node over to the machine node.
10632 if (auto *MemOp = dyn_cast<MemSDNode>(Op)) {
10633 MachineMemOperand *MemRef = MemOp->getMemOperand();
10634 DAG.setNodeMemRefs(NewNode, {MemRef});
10635 }
10636
10637 if (BaseOpcode->NoReturn) {
// A no-return atomic still has to supply the intrinsic's data result, so a
// poison value of the original type is paired with result 0 of the new node.
10638 if (BaseOpcode->Atomic)
10639 return DAG.getMergeValues(
10640 {DAG.getPOISON(OrigResultTypes[0]), SDValue(NewNode, 0)}, DL);
10641
10642 return SDValue(NewNode, 0);
10643 }
10644
// AtomicX2 returns a two-element vector; only element 0 is handed back as
// the data result, together with result 1 of the new node.
10645 if (BaseOpcode->AtomicX2) {
10647 DAG.ExtractVectorElements(SDValue(NewNode, 0), Elt, 0, 1);
10648 return DAG.getMergeValues({Elt[0], SDValue(NewNode, 1)}, DL);
10649 }
10650
10651 return constructRetValue(DAG, NewNode, OrigResultTypes, IsTexFail,
10652 Subtarget->hasUnpackedD16VMem(), IsD16, DMaskLanes,
10653 NumVDataDwords, IsAtomicPacked16Bit, DL);
10654}
10655
// Lower a scalar buffer load of type VT from resource Rsrc at Offset with the
// given CachePolicy. A non-divergent offset yields SBUFFER_LOAD style nodes;
// a divergent offset falls back to one or more BUFFER_LOAD nodes whose results
// are concatenated when VT has 8 or 16 elements.
// NOTE(review): this listing is missing a few lines of the function (the
// memory-operand flags, the WidenedVT initializer and the declaration of
// Loads); confirm details against upstream.
10656SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
10657 SDValue Offset, SDValue CachePolicy,
10658 SelectionDAG &DAG) const {
10659 MachineFunction &MF = DAG.getMachineFunction();
10660
10661 const DataLayout &DataLayout = DAG.getDataLayout();
10662 Align Alignment =
10663 DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
10664
// Memory operand with no pointer info, sized and aligned for the whole VT.
10665 MachineMemOperand *MMO = MF.getMachineMemOperand(
10666 MachinePointerInfo(),
10669 VT.getStoreSize(), Alignment);
10670
10671 if (!Offset->isDivergent()) {
10672 SDValue Ops[] = {Rsrc, Offset, CachePolicy};
10673
10674 // Lower llvm.amdgcn.s.buffer.load.{i16, u16} intrinsics. Initially, the
10675 // s_buffer_load_u16 instruction is emitted for both signed and unsigned
10676 // loads. Later, DAG combiner tries to combine s_buffer_load_u16 with sext
10677 // and generates s_buffer_load_i16 (performSignExtendInRegCombine).
10678 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10679 SDValue BufferLoad =
10680 DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD_USHORT, DL,
10681 DAG.getVTList(MVT::i32), Ops, VT, MMO);
10682 return DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
10683 }
10684
10685 // Widen vec3 load to vec4.
10686 if (VT.isVector() && VT.getVectorNumElements() == 3 &&
10687 !Subtarget->hasScalarDwordx3Loads()) {
10688 EVT WidenedVT =
10690 auto WidenedOp = DAG.getMemIntrinsicNode(
10691 AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
10692 MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
// Only the leading sub-vector of the widened result, of the originally
// requested type, is returned.
10693 auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
10694 DAG.getVectorIdxConstant(0, DL));
10695 return Subvector;
10696 }
10697
10698 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_LOAD, DL,
10699 DAG.getVTList(VT), Ops, VT, MMO);
10700 }
10701
10702 // We have a divergent offset. Emit a MUBUF buffer load instead. We can
10703 // assume that the buffer is unswizzled.
// The three empty entries (indices 3..5) are placeholders that the
// setBufferOffsets calls below are handed via &Ops[3].
10704 SDValue Ops[] = {
10705 DAG.getEntryNode(), // Chain
10706 Rsrc, // rsrc
10707 DAG.getConstant(0, DL, MVT::i32), // vindex
10708 {}, // voffset
10709 {}, // soffset
10710 {}, // offset
10711 CachePolicy, // cachepolicy
10712 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
10713 };
10714 if (VT == MVT::i16 && Subtarget->hasScalarSubwordLoads()) {
10715 setBufferOffsets(Offset, DAG, &Ops[3], Align(4));
10716 return handleByteShortBufferLoads(DAG, VT, DL, Ops, MMO);
10717 }
10718
10720 unsigned NumLoads = 1;
10721 MVT LoadVT = VT.getSimpleVT();
10722 unsigned NumElts = LoadVT.isVector() ? LoadVT.getVectorNumElements() : 1;
10723 assert((LoadVT.getScalarType() == MVT::i32 ||
10724 LoadVT.getScalarType() == MVT::f32));
10725
// 8- and 16-element results are split into several 4-element loads.
10726 if (NumElts == 8 || NumElts == 16) {
10727 NumLoads = NumElts / 4;
10728 LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
10729 }
10730
10731 SDVTList VTList = DAG.getVTList({LoadVT, MVT::Other});
10732
10733 // Use the alignment to ensure that the required offsets will fit into the
10734 // immediate offsets.
10735 setBufferOffsets(Offset, DAG, &Ops[3],
10736 NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
10737
// Each successive load uses an immediate offset, and a memory-operand offset,
// 16 bytes past the previous one.
10738 uint64_t InstOffset = Ops[5]->getAsZExtVal();
10739 unsigned LoadSize = LoadVT.getStoreSize();
10740 for (unsigned i = 0; i < NumLoads; ++i) {
10741 Ops[5] = DAG.getTargetConstant(InstOffset + 16 * i, DL, MVT::i32);
10742 MachineMemOperand *LoadMMO = MF.getMachineMemOperand(MMO, 16 * i, LoadSize);
10743 Loads.push_back(getMemIntrinsicNode(AMDGPUISD::BUFFER_LOAD, DL, VTList, Ops,
10744 LoadVT, LoadMMO, DAG));
10745 }
10746
10747 if (NumElts == 8 || NumElts == 16)
10748 return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
10749
10750 return Loads[0];
10751}
10752
10753SDValue SITargetLowering::lowerWaveID(SelectionDAG &DAG, SDValue Op) const {
10754 // With architected SGPRs, waveIDinGroup is in TTMP8[29:25].
10755 if (!Subtarget->hasArchitectedSGPRs())
10756 return {};
10757 SDLoc SL(Op);
10758 MVT VT = MVT::i32;
10759 SDValue TTMP8 = DAG.getCopyFromReg(DAG.getEntryNode(), SL, AMDGPU::TTMP8, VT);
10760 return DAG.getNode(AMDGPUISD::BFE_U32, SL, VT, TTMP8,
10761 DAG.getConstant(25, SL, VT), DAG.getConstant(5, SL, VT));
10762}
10763
10764SDValue SITargetLowering::lowerConstHwRegRead(SelectionDAG &DAG, SDValue Op,
10765 AMDGPU::Hwreg::Id HwReg,
10766 unsigned LowBit,
10767 unsigned Width) const {
10768 SDLoc SL(Op);
10769 using namespace AMDGPU::Hwreg;
10770 return {DAG.getMachineNode(
10771 AMDGPU::S_GETREG_B32_const, SL, MVT::i32,
10772 DAG.getTargetConstant(HwregEncoding::encode(HwReg, LowBit, Width),
10773 SL, MVT::i32)),
10774 0};
10775}
10776
10777SDValue SITargetLowering::lowerWorkitemID(SelectionDAG &DAG, SDValue Op,
10778 unsigned Dim,
10779 const ArgDescriptor &Arg) const {
10780 SDLoc SL(Op);
10781 MachineFunction &MF = DAG.getMachineFunction();
10782 unsigned MaxID = Subtarget->getMaxWorkitemID(MF.getFunction(), Dim);
10783 if (MaxID == 0)
10784 return DAG.getConstant(0, SL, MVT::i32);
10785
10786 // It's undefined behavior if a function marked with the amdgpu-no-*
10787 // attributes uses the corresponding intrinsic.
10788 if (!Arg)
10789 return DAG.getPOISON(Op->getValueType(0));
10790
10791 SDValue Val = loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
10792 SDLoc(DAG.getEntryNode()), Arg);
10793
10794 // Don't bother inserting AssertZext for packed IDs since we're emitting the
10795 // masking operations anyway.
10796 //
10797 // TODO: We could assert the top bit is 0 for the source copy.
10798 if (Arg.isMasked())
10799 return Val;
10800
10801 // Preserve the known bits after expansion to a copy.
10802 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), llvm::bit_width(MaxID));
10803 return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Val,
10804 DAG.getValueType(SmallVT));
10805}
10806
10807SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
10808 SelectionDAG &DAG) const {
10809 MachineFunction &MF = DAG.getMachineFunction();
10810 auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
10811
10812 EVT VT = Op.getValueType();
10813 SDLoc DL(Op);
10814 unsigned IntrinsicID = Op.getConstantOperandVal(0);
10815
10816 // TODO: Should this propagate fast-math-flags?
10817
10818 switch (IntrinsicID) {
10819 case Intrinsic::amdgcn_wave_reduce_min:
10820 case Intrinsic::amdgcn_wave_reduce_umin:
10821 case Intrinsic::amdgcn_wave_reduce_fmin:
10822 case Intrinsic::amdgcn_wave_reduce_max:
10823 case Intrinsic::amdgcn_wave_reduce_umax:
10824 case Intrinsic::amdgcn_wave_reduce_fmax:
10825 case Intrinsic::amdgcn_wave_reduce_add:
10826 case Intrinsic::amdgcn_wave_reduce_fadd:
10827 case Intrinsic::amdgcn_wave_reduce_sub:
10828 case Intrinsic::amdgcn_wave_reduce_fsub:
10829 case Intrinsic::amdgcn_wave_reduce_and:
10830 case Intrinsic::amdgcn_wave_reduce_or:
10831 case Intrinsic::amdgcn_wave_reduce_xor: {
10832 EVT SrcVT = Op.getOperand(1).getValueType();
10833 if (SrcVT.getFixedSizeInBits() == 16) {
10834 bool IsFPOp = SrcVT.isFloatingPoint();
10835 bool NeedsSignExt = IntrinsicID == Intrinsic::amdgcn_wave_reduce_min ||
10836 IntrinsicID == Intrinsic::amdgcn_wave_reduce_max ||
10837 IntrinsicID == Intrinsic::amdgcn_wave_reduce_add ||
10838 IntrinsicID == Intrinsic::amdgcn_wave_reduce_sub;
10839 unsigned ExtOpc = IsFPOp ? ISD::FP_EXTEND
10840 : NeedsSignExt ? ISD::SIGN_EXTEND
10842 auto SrcType = IsFPOp ? MVT::f16 : MVT::i16;
10843 auto ExtType = IsFPOp ? MVT::f32 : MVT::i32;
10844 SDValue ExtendedSrc = DAG.getNode(ExtOpc, DL, ExtType, Op.getOperand(1));
10845 SDValue Strategy = Op.getOperand(2);
10847 Op.getOperand(0), ExtendedSrc, Strategy);
10848 if (IsFPOp)
10849 return DAG.getNode(ISD::FP_ROUND, DL, SrcType, Result,
10850 DAG.getTargetConstant(1, DL, MVT::i32));
10851 else
10852 return DAG.getNode(ISD::TRUNCATE, DL, SrcType, Result);
10853 }
10854 return SDValue();
10855 }
10856 case Intrinsic::amdgcn_implicit_buffer_ptr: {
10857 if (getSubtarget()->isAmdHsaOrMesa(MF.getFunction()))
10858 return emitNonHSAIntrinsicError(DAG, DL, VT);
10859 return getPreloadedValue(DAG, *MFI, VT,
10861 }
10862 case Intrinsic::amdgcn_dispatch_ptr:
10863 case Intrinsic::amdgcn_queue_ptr: {
10864 if (!Subtarget->isAmdHsaOrMesa(MF.getFunction())) {
10865 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
10866 MF.getFunction(), "unsupported hsa intrinsic without hsa target",
10867 DL.getDebugLoc()));
10868 return DAG.getPOISON(VT);
10869 }
10870
10871 auto RegID = IntrinsicID == Intrinsic::amdgcn_dispatch_ptr
10874 return getPreloadedValue(DAG, *MFI, VT, RegID);
10875 }
10876 case Intrinsic::amdgcn_implicitarg_ptr: {
10877 if (MFI->isEntryFunction())
10878 return getImplicitArgPtr(DAG, DL);
10879 return getPreloadedValue(DAG, *MFI, VT,
10881 }
10882 case Intrinsic::amdgcn_kernarg_segment_ptr: {
10883 if (!AMDGPU::isKernel(MF.getFunction())) {
10884 // This only makes sense to call in a kernel, so just lower to null.
10885 return DAG.getConstant(0, DL, VT);
10886 }
10887
10888 return getPreloadedValue(DAG, *MFI, VT,
10890 }
10891 case Intrinsic::amdgcn_dispatch_id: {
10892 return getPreloadedValue(DAG, *MFI, VT, AMDGPUFunctionArgInfo::DISPATCH_ID);
10893 }
10894 case Intrinsic::amdgcn_rcp:
10895 return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
10896 case Intrinsic::amdgcn_rsq:
10897 return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10898 case Intrinsic::amdgcn_rsq_legacy:
10899 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10900 return emitRemovedIntrinsicError(DAG, DL, VT);
10901 return SDValue();
10902 case Intrinsic::amdgcn_rcp_legacy:
10903 if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
10904 return emitRemovedIntrinsicError(DAG, DL, VT);
10905 return DAG.getNode(AMDGPUISD::RCP_LEGACY, DL, VT, Op.getOperand(1));
10906 case Intrinsic::amdgcn_fma_legacy:
10907 if (!Subtarget->hasFmaLegacy32Insts())
10908 return emitRemovedIntrinsicError(DAG, DL, VT);
10909 return SDValue();
10910 case Intrinsic::amdgcn_sudot4:
10911 case Intrinsic::amdgcn_sudot8:
10912 if (!Subtarget->hasDot8Insts())
10913 return emitRemovedIntrinsicError(DAG, DL, VT);
10914 return SDValue();
10915 case Intrinsic::amdgcn_tanh:
10916 if (!Subtarget->hasTanhInsts())
10917 return emitRemovedIntrinsicError(DAG, DL, VT);
10918 return SDValue();
10919 case Intrinsic::amdgcn_rsq_clamp: {
10920 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
10921 return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
10922
10923 Type *Type = VT.getTypeForEVT(*DAG.getContext());
10924 APFloat Max = APFloat::getLargest(Type->getFltSemantics());
10925 APFloat Min = APFloat::getLargest(Type->getFltSemantics(), true);
10926
10927 SDValue Rsq = DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
10928 SDValue Tmp =
10929 DAG.getNode(ISD::FMINNUM, DL, VT, Rsq, DAG.getConstantFP(Max, DL, VT));
10930 return DAG.getNode(ISD::FMAXNUM, DL, VT, Tmp,
10931 DAG.getConstantFP(Min, DL, VT));
10932 }
10933 case Intrinsic::r600_read_ngroups_x:
10934 if (Subtarget->isAmdHsaOS())
10935 return emitNonHSAIntrinsicError(DAG, DL, VT);
10936
10937 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10939 false);
10940 case Intrinsic::r600_read_ngroups_y:
10941 if (Subtarget->isAmdHsaOS())
10942 return emitNonHSAIntrinsicError(DAG, DL, VT);
10943
10944 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10946 false);
10947 case Intrinsic::r600_read_ngroups_z:
10948 if (Subtarget->isAmdHsaOS())
10949 return emitNonHSAIntrinsicError(DAG, DL, VT);
10950
10951 return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
10953 false);
10954 case Intrinsic::r600_read_local_size_x:
10955 if (Subtarget->isAmdHsaOS())
10956 return emitNonHSAIntrinsicError(DAG, DL, VT);
10957
10958 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10960 case Intrinsic::r600_read_local_size_y:
10961 if (Subtarget->isAmdHsaOS())
10962 return emitNonHSAIntrinsicError(DAG, DL, VT);
10963
10964 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10966 case Intrinsic::r600_read_local_size_z:
10967 if (Subtarget->isAmdHsaOS())
10968 return emitNonHSAIntrinsicError(DAG, DL, VT);
10969
10970 return lowerImplicitZextParam(DAG, Op, MVT::i16,
10972 case Intrinsic::amdgcn_workgroup_id_x:
10973 return lowerWorkGroupId(DAG, *MFI, VT,
10977 case Intrinsic::amdgcn_workgroup_id_y:
10978 return lowerWorkGroupId(DAG, *MFI, VT,
10982 case Intrinsic::amdgcn_workgroup_id_z:
10983 return lowerWorkGroupId(DAG, *MFI, VT,
10987 case Intrinsic::amdgcn_cluster_id_x:
10988 return Subtarget->hasClusters()
10989 ? getPreloadedValue(DAG, *MFI, VT,
10991 : DAG.getPOISON(VT);
10992 case Intrinsic::amdgcn_cluster_id_y:
10993 return Subtarget->hasClusters()
10994 ? getPreloadedValue(DAG, *MFI, VT,
10996 : DAG.getPOISON(VT);
10997 case Intrinsic::amdgcn_cluster_id_z:
10998 return Subtarget->hasClusters()
10999 ? getPreloadedValue(DAG, *MFI, VT,
11001 : DAG.getPOISON(VT);
11002 case Intrinsic::amdgcn_cluster_workgroup_id_x:
11003 return Subtarget->hasClusters()
11004 ? getPreloadedValue(
11005 DAG, *MFI, VT,
11007 : DAG.getPOISON(VT);
11008 case Intrinsic::amdgcn_cluster_workgroup_id_y:
11009 return Subtarget->hasClusters()
11010 ? getPreloadedValue(
11011 DAG, *MFI, VT,
11013 : DAG.getPOISON(VT);
11014 case Intrinsic::amdgcn_cluster_workgroup_id_z:
11015 return Subtarget->hasClusters()
11016 ? getPreloadedValue(
11017 DAG, *MFI, VT,
11019 : DAG.getPOISON(VT);
11020 case Intrinsic::amdgcn_cluster_workgroup_flat_id:
11021 return Subtarget->hasClusters()
11022 ? lowerConstHwRegRead(DAG, Op, AMDGPU::Hwreg::ID_IB_STS2, 21, 4)
11023 : SDValue();
11024 case Intrinsic::amdgcn_cluster_workgroup_max_id_x:
11025 return Subtarget->hasClusters()
11026 ? getPreloadedValue(
11027 DAG, *MFI, VT,
11029 : DAG.getPOISON(VT);
11030 case Intrinsic::amdgcn_cluster_workgroup_max_id_y:
11031 return Subtarget->hasClusters()
11032 ? getPreloadedValue(
11033 DAG, *MFI, VT,
11035 : DAG.getPOISON(VT);
11036 case Intrinsic::amdgcn_cluster_workgroup_max_id_z:
11037 return Subtarget->hasClusters()
11038 ? getPreloadedValue(
11039 DAG, *MFI, VT,
11041 : DAG.getPOISON(VT);
11042 case Intrinsic::amdgcn_cluster_workgroup_max_flat_id:
11043 return Subtarget->hasClusters()
11044 ? getPreloadedValue(
11045 DAG, *MFI, VT,
11047 : DAG.getPOISON(VT);
11048 case Intrinsic::amdgcn_wave_id:
11049 return lowerWaveID(DAG, Op);
11050 case Intrinsic::amdgcn_lds_kernel_id: {
11051 if (MFI->isEntryFunction())
11052 return getLDSKernelId(DAG, DL);
11053 return getPreloadedValue(DAG, *MFI, VT,
11055 }
11056 case Intrinsic::amdgcn_workitem_id_x:
11057 return lowerWorkitemID(DAG, Op, 0, MFI->getArgInfo().WorkItemIDX);
11058 case Intrinsic::amdgcn_workitem_id_y:
11059 return lowerWorkitemID(DAG, Op, 1, MFI->getArgInfo().WorkItemIDY);
11060 case Intrinsic::amdgcn_workitem_id_z:
11061 return lowerWorkitemID(DAG, Op, 2, MFI->getArgInfo().WorkItemIDZ);
11062 case Intrinsic::amdgcn_wavefrontsize:
11063 return DAG.getConstant(MF.getSubtarget<GCNSubtarget>().getWavefrontSize(),
11064 SDLoc(Op), MVT::i32);
11065 case Intrinsic::amdgcn_s_buffer_load: {
11066 unsigned CPol = Op.getConstantOperandVal(3);
11067 // s_buffer_load, because of how it's optimized, can't be volatile
11068 // so reject ones with the volatile bit set.
11069 if (CPol & ~((Subtarget->getGeneration() >= AMDGPUSubtarget::GFX12)
11072 return Op;
11073 return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2),
11074 Op.getOperand(3), DAG);
11075 }
11076 case Intrinsic::amdgcn_fdiv_fast:
11077 return lowerFDIV_FAST(Op, DAG);
11078 case Intrinsic::amdgcn_sin:
11079 return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
11080
11081 case Intrinsic::amdgcn_cos:
11082 return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));
11083
11084 case Intrinsic::amdgcn_mul_u24:
11085 return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1),
11086 Op.getOperand(2));
11087 case Intrinsic::amdgcn_mul_i24:
11088 return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1),
11089 Op.getOperand(2));
11090
11091 case Intrinsic::amdgcn_log_clamp: {
11092 if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
11093 return SDValue();
11094
11095 return emitRemovedIntrinsicError(DAG, DL, VT);
11096 }
11097 case Intrinsic::amdgcn_fract:
11098 return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1));
11099
11100 case Intrinsic::amdgcn_class:
11101 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, VT, Op.getOperand(1),
11102 Op.getOperand(2));
11103 case Intrinsic::amdgcn_div_fmas:
11104 return DAG.getNode(AMDGPUISD::DIV_FMAS, DL, VT, Op.getOperand(1),
11105 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
11106
11107 case Intrinsic::amdgcn_div_fixup:
11108 return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT, Op.getOperand(1),
11109 Op.getOperand(2), Op.getOperand(3));
11110
11111 case Intrinsic::amdgcn_div_scale: {
11112 const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
11113
11114 // Translate to the operands expected by the machine instruction. The
11115 // first parameter must be the same as the first instruction.
11116 SDValue Numerator = Op.getOperand(1);
11117 SDValue Denominator = Op.getOperand(2);
11118
11119 // Note this order is opposite of the machine instruction's operations,
11120 // which is s0.f = Quotient, s1.f = Denominator, s2.f = Numerator. The
11121 // intrinsic has the numerator as the first operand to match a normal
11122 // division operation.
11123
11124 SDValue Src0 = Param->isAllOnes() ? Numerator : Denominator;
11125
11126 return DAG.getNode(AMDGPUISD::DIV_SCALE, DL, Op->getVTList(), Src0,
11127 Denominator, Numerator);
11128 }
11129 case Intrinsic::amdgcn_icmp: {
11130 // There is a Pat that handles this variant, so return it as-is.
11131 if (Op.getOperand(1).getValueType() == MVT::i1 &&
11132 Op.getConstantOperandVal(2) == 0 &&
11133 Op.getConstantOperandVal(3) == ICmpInst::Predicate::ICMP_NE)
11134 return Op;
11135 return lowerICMPIntrinsic(*this, Op.getNode(), DAG);
11136 }
11137 case Intrinsic::amdgcn_fcmp: {
11138 return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
11139 }
11140 case Intrinsic::amdgcn_ballot:
11141 return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
11142 case Intrinsic::amdgcn_fmed3:
11143 return DAG.getNode(AMDGPUISD::FMED3, DL, VT, Op.getOperand(1),
11144 Op.getOperand(2), Op.getOperand(3), Op->getFlags());
11145 case Intrinsic::amdgcn_fdot2:
11146 return DAG.getNode(AMDGPUISD::FDOT2, DL, VT, Op.getOperand(1),
11147 Op.getOperand(2), Op.getOperand(3), Op.getOperand(4));
11148 case Intrinsic::amdgcn_fmul_legacy:
11149 return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT, Op.getOperand(1),
11150 Op.getOperand(2));
11151 case Intrinsic::amdgcn_sbfe:
11152 return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT, Op.getOperand(1),
11153 Op.getOperand(2), Op.getOperand(3));
11154 case Intrinsic::amdgcn_ubfe:
11155 return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT, Op.getOperand(1),
11156 Op.getOperand(2), Op.getOperand(3));
11157 case Intrinsic::amdgcn_cvt_pkrtz:
11158 case Intrinsic::amdgcn_cvt_pknorm_i16:
11159 case Intrinsic::amdgcn_cvt_pknorm_u16:
11160 case Intrinsic::amdgcn_cvt_pk_i16:
11161 case Intrinsic::amdgcn_cvt_pk_u16: {
11162 // FIXME: Stop adding cast if v2f16/v2i16 are legal.
11163 EVT VT = Op.getValueType();
11164 unsigned Opcode;
11165
11166 if (IntrinsicID == Intrinsic::amdgcn_cvt_pkrtz)
11167 Opcode = AMDGPUISD::CVT_PKRTZ_F16_F32;
11168 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_i16)
11169 Opcode = AMDGPUISD::CVT_PKNORM_I16_F32;
11170 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pknorm_u16)
11171 Opcode = AMDGPUISD::CVT_PKNORM_U16_F32;
11172 else if (IntrinsicID == Intrinsic::amdgcn_cvt_pk_i16)
11173 Opcode = AMDGPUISD::CVT_PK_I16_I32;
11174 else
11175 Opcode = AMDGPUISD::CVT_PK_U16_U32;
11176
11177 if (isTypeLegal(VT))
11178 return DAG.getNode(Opcode, DL, VT, Op.getOperand(1), Op.getOperand(2));
11179
11180 SDValue Node =
11181 DAG.getNode(Opcode, DL, MVT::i32, Op.getOperand(1), Op.getOperand(2));
11182 return DAG.getNode(ISD::BITCAST, DL, VT, Node);
11183 }
11184 case Intrinsic::amdgcn_fmad_ftz:
11185 return DAG.getNode(AMDGPUISD::FMAD_FTZ, DL, VT, Op.getOperand(1),
11186 Op.getOperand(2), Op.getOperand(3));
11187
11188 case Intrinsic::amdgcn_if_break:
11189 return SDValue(DAG.getMachineNode(AMDGPU::SI_IF_BREAK, DL, VT,
11190 Op->getOperand(1), Op->getOperand(2)),
11191 0);
11192
11193 case Intrinsic::amdgcn_groupstaticsize: {
11195 if (OS == Triple::AMDHSA || OS == Triple::AMDPAL)
11196 return Op;
11197
11198 const Module *M = MF.getFunction().getParent();
11199 const GlobalValue *GV =
11200 Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize);
11201 SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0,
11203 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
11204 }
11205 case Intrinsic::amdgcn_is_shared:
11206 case Intrinsic::amdgcn_is_private: {
11207 SDLoc SL(Op);
11208 SDValue SrcVec =
11209 DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
11210 SDValue SrcHi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, SrcVec,
11211 DAG.getConstant(1, SL, MVT::i32));
11212
11213 unsigned AS = (IntrinsicID == Intrinsic::amdgcn_is_shared)
11215 : AMDGPUAS::PRIVATE_ADDRESS;
11216 if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
11217 Subtarget->hasGloballyAddressableScratch()) {
11218 SDValue FlatScratchBaseHi(
11219 DAG.getMachineNode(
11220 AMDGPU::S_MOV_B32, DL, MVT::i32,
11221 DAG.getRegister(AMDGPU::SRC_FLAT_SCRATCH_BASE_HI, MVT::i32)),
11222 0);
11223 // Test bits 63..58 against the aperture address.
11224 return DAG.getSetCC(
11225 SL, MVT::i1,
11226 DAG.getNode(ISD::XOR, SL, MVT::i32, SrcHi, FlatScratchBaseHi),
11227 DAG.getConstant(1u << 26, SL, MVT::i32), ISD::SETULT);
11228 }
11229
11230 SDValue Aperture = getSegmentAperture(AS, SL, DAG);
11231 return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
11232 }
11233 case Intrinsic::amdgcn_perm:
11234 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op.getOperand(1),
11235 Op.getOperand(2), Op.getOperand(3));
11236 case Intrinsic::amdgcn_reloc_constant: {
11237 Module *M = MF.getFunction().getParent();
11238 const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
11239 auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
11240 auto *RelocSymbol = cast<GlobalVariable>(
11241 M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
11242 SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
11244 return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
11245 }
11246 case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16:
11247 case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16:
11248 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16:
11249 case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16:
11250 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8:
11251 case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8:
11252 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8:
11253 case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: {
11254 if (Op.getOperand(4).getValueType() == MVT::i32)
11255 return SDValue();
11256
11257 SDLoc SL(Op);
11258 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32);
11259 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11260 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11261 Op.getOperand(3), IndexKeyi32);
11262 }
11263 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8:
11264 case Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8:
11265 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8:
11266 case Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8:
11267 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8:
11268 case Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8:
11269 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8:
11270 case Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8: {
11271 if (Op.getOperand(4).getValueType() == MVT::i64)
11272 return SDValue();
11273
11274 SDLoc SL(Op);
11275 auto IndexKeyi64 =
11276 Op.getOperand(4).getValueType() == MVT::v2i32
11277 ? DAG.getBitcast(MVT::i64, Op.getOperand(4))
11278 : DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i64);
11279 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11280 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11281 Op.getOperand(3), IndexKeyi64, Op.getOperand(5),
11282 Op.getOperand(6)});
11283 }
11284 case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
11285 case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
11286 case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
11287 case Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16:
11288 case Intrinsic::amdgcn_swmmac_f32_16x16x64_f16:
11289 case Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8: {
11290 EVT IndexKeyTy = IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8
11291 ? MVT::i64
11292 : MVT::i32;
11293 if (Op.getOperand(6).getValueType() == IndexKeyTy)
11294 return SDValue();
11295
11296 SDLoc SL(Op);
11297 auto IndexKey =
11298 Op.getOperand(6).getValueType().isVector()
11299 ? DAG.getBitcast(IndexKeyTy, Op.getOperand(6))
11300 : DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, IndexKeyTy);
11302 Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11303 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11304 IndexKey, Op.getOperand(7), Op.getOperand(8)};
11305 if (IntrinsicID == Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8)
11306 Args.push_back(Op.getOperand(9));
11307 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), Args);
11308 }
11309 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4:
11310 case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8:
11311 case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: {
11312 if (Op.getOperand(6).getValueType() == MVT::i32)
11313 return SDValue();
11314
11315 SDLoc SL(Op);
11316 auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32);
11317 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(),
11318 {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2),
11319 Op.getOperand(3), Op.getOperand(4), Op.getOperand(5),
11320 IndexKeyi32, Op.getOperand(7)});
11321 }
11322 case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
11323 case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4: {
11324 unsigned AFmt = (unsigned)Op.getConstantOperandVal(1);
11325 unsigned BFmt = (unsigned)Op.getConstantOperandVal(3);
11326 unsigned AScaleFmt = (unsigned)Op.getConstantOperandVal(8);
11327 unsigned BScaleFmt = (unsigned)Op.getConstantOperandVal(11);
11328 if (!AMDGPU::isValidWMMAScaleFmtCombination(AFmt, AScaleFmt, BFmt,
11329 BScaleFmt)) {
11331 "invalid matrix and scale format combination in wmma call");
11332 Op->print(errs());
11333 errs() << '\n';
11334 }
11335 return SDValue();
11336 }
11337 case Intrinsic::amdgcn_addrspacecast_nonnull:
11338 return lowerADDRSPACECAST(Op, DAG);
11339 case Intrinsic::amdgcn_readlane:
11340 case Intrinsic::amdgcn_readfirstlane:
11341 case Intrinsic::amdgcn_writelane:
11342 case Intrinsic::amdgcn_permlane16:
11343 case Intrinsic::amdgcn_permlanex16:
11344 case Intrinsic::amdgcn_permlane64:
11345 case Intrinsic::amdgcn_set_inactive:
11346 case Intrinsic::amdgcn_set_inactive_chain_arg:
11347 case Intrinsic::amdgcn_mov_dpp8:
11348 case Intrinsic::amdgcn_update_dpp:
11349 case Intrinsic::amdgcn_permlane_bcast:
11350 case Intrinsic::amdgcn_permlane_up:
11351 case Intrinsic::amdgcn_permlane_down:
11352 case Intrinsic::amdgcn_permlane_xor:
11353 return lowerLaneOp(*this, Op.getNode(), DAG);
11354 case Intrinsic::amdgcn_dead: {
11356 for (const EVT ValTy : Op.getNode()->values())
11357 Poisons.push_back(DAG.getPOISON(ValTy));
11358 return DAG.getMergeValues(Poisons, SDLoc(Op));
11359 }
11360 case Intrinsic::amdgcn_wave_shuffle:
11361 return lowerWaveShuffle(*this, Op.getNode(), DAG);
11362 default:
11363 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
11365 return lowerImage(Op, ImageDimIntr, DAG, false);
11366
11367 return Op;
11368 }
11369}
11370
11371// On targets not supporting constant in soffset field, turn zero to
11372// SGPR_NULL to avoid generating an extra s_mov with zero.
11374 const GCNSubtarget *Subtarget) {
11375 if (Subtarget->hasRestrictedSOffset() && isNullConstant(SOffset))
11376 return DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32);
11377 return SOffset;
11378}
11379
11380SDValue SITargetLowering::lowerRawBufferAtomicIntrin(SDValue Op,
11381 SelectionDAG &DAG,
11382 unsigned NewOpcode) const {
11383 SDLoc DL(Op);
11384
11385 SDValue VData = Op.getOperand(2);
11386 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11387 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11388 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11389 SDValue Ops[] = {
11390 Op.getOperand(0), // Chain
11391 VData, // vdata
11392 Rsrc, // rsrc
11393 DAG.getConstant(0, DL, MVT::i32), // vindex
11394 VOffset, // voffset
11395 SOffset, // soffset
11396 Offset, // offset
11397 Op.getOperand(6), // cachepolicy
11398 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11399 };
11400
11401 auto *M = cast<MemSDNode>(Op);
11402
11403 EVT MemVT = VData.getValueType();
11404 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
11405 M->getMemOperand());
11406}
11407
11408SDValue
11409SITargetLowering::lowerStructBufferAtomicIntrin(SDValue Op, SelectionDAG &DAG,
11410 unsigned NewOpcode) const {
11411 SDLoc DL(Op);
11412
11413 SDValue VData = Op.getOperand(2);
11414 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
11415 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11416 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11417 SDValue Ops[] = {
11418 Op.getOperand(0), // Chain
11419 VData, // vdata
11420 Rsrc, // rsrc
11421 Op.getOperand(4), // vindex
11422 VOffset, // voffset
11423 SOffset, // soffset
11424 Offset, // offset
11425 Op.getOperand(7), // cachepolicy
11426 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11427 };
11428
11429 auto *M = cast<MemSDNode>(Op);
11430
11431 EVT MemVT = VData.getValueType();
11432 return DAG.getMemIntrinsicNode(NewOpcode, DL, Op->getVTList(), Ops, MemVT,
11433 M->getMemOperand());
11434}
11435
11436SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
11437 SelectionDAG &DAG) const {
11438 unsigned IntrID = Op.getConstantOperandVal(1);
11439 SDLoc DL(Op);
11440
11441 switch (IntrID) {
11442 case Intrinsic::amdgcn_ds_ordered_add:
11443 case Intrinsic::amdgcn_ds_ordered_swap: {
11444 MemSDNode *M = cast<MemSDNode>(Op);
11445 SDValue Chain = M->getOperand(0);
11446 SDValue M0 = M->getOperand(2);
11447 SDValue Value = M->getOperand(3);
11448 unsigned IndexOperand = M->getConstantOperandVal(7);
11449 unsigned WaveRelease = M->getConstantOperandVal(8);
11450 unsigned WaveDone = M->getConstantOperandVal(9);
11451
11452 unsigned OrderedCountIndex = IndexOperand & 0x3f;
11453 IndexOperand &= ~0x3f;
11454 unsigned CountDw = 0;
11455
11456 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10) {
11457 CountDw = (IndexOperand >> 24) & 0xf;
11458 IndexOperand &= ~(0xf << 24);
11459
11460 if (CountDw < 1 || CountDw > 4) {
11461 const Function &Fn = DAG.getMachineFunction().getFunction();
11462 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11463 Fn, "ds_ordered_count: dword count must be between 1 and 4",
11464 DL.getDebugLoc()));
11465 CountDw = 1;
11466 }
11467 }
11468
11469 if (IndexOperand) {
11470 const Function &Fn = DAG.getMachineFunction().getFunction();
11471 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11472 Fn, "ds_ordered_count: bad index operand", DL.getDebugLoc()));
11473 }
11474
11475 if (WaveDone && !WaveRelease) {
11476 // TODO: Move this to IR verifier
11477 const Function &Fn = DAG.getMachineFunction().getFunction();
11478 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
11479 Fn, "ds_ordered_count: wave_done requires wave_release",
11480 DL.getDebugLoc()));
11481 }
11482
11483 unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
11484 unsigned ShaderType =
11486 unsigned Offset0 = OrderedCountIndex << 2;
11487 unsigned Offset1 = WaveRelease | (WaveDone << 1) | (Instruction << 4);
11488
11489 if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10)
11490 Offset1 |= (CountDw - 1) << 6;
11491
11492 if (Subtarget->getGeneration() < AMDGPUSubtarget::GFX11)
11493 Offset1 |= ShaderType << 2;
11494
11495 unsigned Offset = Offset0 | (Offset1 << 8);
11496
11497 SDValue Ops[] = {
11498 Chain, Value, DAG.getTargetConstant(Offset, DL, MVT::i16),
11499 copyToM0(DAG, Chain, DL, M0).getValue(1), // Glue
11500 };
11501 return DAG.getMemIntrinsicNode(AMDGPUISD::DS_ORDERED_COUNT, DL,
11502 M->getVTList(), Ops, M->getMemoryVT(),
11503 M->getMemOperand());
11504 }
11505 case Intrinsic::amdgcn_raw_buffer_load:
11506 case Intrinsic::amdgcn_raw_ptr_buffer_load:
11507 case Intrinsic::amdgcn_raw_atomic_buffer_load:
11508 case Intrinsic::amdgcn_raw_ptr_atomic_buffer_load:
11509 case Intrinsic::amdgcn_raw_buffer_load_format:
11510 case Intrinsic::amdgcn_raw_ptr_buffer_load_format: {
11511 const bool IsFormat =
11512 IntrID == Intrinsic::amdgcn_raw_buffer_load_format ||
11513 IntrID == Intrinsic::amdgcn_raw_ptr_buffer_load_format;
11514
11515 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11516 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11517 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
11518 SDValue Ops[] = {
11519 Op.getOperand(0), // Chain
11520 Rsrc, // rsrc
11521 DAG.getConstant(0, DL, MVT::i32), // vindex
11522 VOffset, // voffset
11523 SOffset, // soffset
11524 Offset, // offset
11525 Op.getOperand(5), // cachepolicy, swizzled buffer
11526 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11527 };
11528
11529 auto *M = cast<MemSDNode>(Op);
11530 return lowerIntrinsicLoad(M, IsFormat, DAG, Ops);
11531 }
11532 case Intrinsic::amdgcn_struct_buffer_load:
11533 case Intrinsic::amdgcn_struct_ptr_buffer_load:
11534 case Intrinsic::amdgcn_struct_buffer_load_format:
11535 case Intrinsic::amdgcn_struct_ptr_buffer_load_format:
11536 case Intrinsic::amdgcn_struct_atomic_buffer_load:
11537 case Intrinsic::amdgcn_struct_ptr_atomic_buffer_load: {
11538 const bool IsFormat =
11539 IntrID == Intrinsic::amdgcn_struct_buffer_load_format ||
11540 IntrID == Intrinsic::amdgcn_struct_ptr_buffer_load_format;
11541
11542 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11543 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11544 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11545 SDValue Ops[] = {
11546 Op.getOperand(0), // Chain
11547 Rsrc, // rsrc
11548 Op.getOperand(3), // vindex
11549 VOffset, // voffset
11550 SOffset, // soffset
11551 Offset, // offset
11552 Op.getOperand(6), // cachepolicy, swizzled buffer
11553 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11554 };
11555
11556 return lowerIntrinsicLoad(cast<MemSDNode>(Op), IsFormat, DAG, Ops);
11557 }
11558 case Intrinsic::amdgcn_raw_tbuffer_load:
11559 case Intrinsic::amdgcn_raw_ptr_tbuffer_load: {
11560 MemSDNode *M = cast<MemSDNode>(Op);
11561 EVT LoadVT = Op.getValueType();
11562 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11563 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(3), DAG);
11564 auto SOffset = selectSOffset(Op.getOperand(4), DAG, Subtarget);
11565
11566 SDValue Ops[] = {
11567 Op.getOperand(0), // Chain
11568 Rsrc, // rsrc
11569 DAG.getConstant(0, DL, MVT::i32), // vindex
11570 VOffset, // voffset
11571 SOffset, // soffset
11572 Offset, // offset
11573 Op.getOperand(5), // format
11574 Op.getOperand(6), // cachepolicy, swizzled buffer
11575 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11576 };
11577
11578 if (LoadVT.getScalarType() == MVT::f16)
11579 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11580 Ops);
11581 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11582 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11583 DAG);
11584 }
11585 case Intrinsic::amdgcn_struct_tbuffer_load:
11586 case Intrinsic::amdgcn_struct_ptr_tbuffer_load: {
11587 MemSDNode *M = cast<MemSDNode>(Op);
11588 EVT LoadVT = Op.getValueType();
11589 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
11590 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
11591 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
11592
11593 SDValue Ops[] = {
11594 Op.getOperand(0), // Chain
11595 Rsrc, // rsrc
11596 Op.getOperand(3), // vindex
11597 VOffset, // voffset
11598 SOffset, // soffset
11599 Offset, // offset
11600 Op.getOperand(6), // format
11601 Op.getOperand(7), // cachepolicy, swizzled buffer
11602 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11603 };
11604
11605 if (LoadVT.getScalarType() == MVT::f16)
11606 return adjustLoadValueType(AMDGPUISD::TBUFFER_LOAD_FORMAT_D16, M, DAG,
11607 Ops);
11608 return getMemIntrinsicNode(AMDGPUISD::TBUFFER_LOAD_FORMAT, DL,
11609 Op->getVTList(), Ops, LoadVT, M->getMemOperand(),
11610 DAG);
11611 }
11612 case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
11613 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
11614 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
11615 case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
11616 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
11617 return lowerStructBufferAtomicIntrin(Op, DAG,
11618 AMDGPUISD::BUFFER_ATOMIC_FADD);
11619 case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
11620 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
11621 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
11622 case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
11623 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmin:
11624 return lowerStructBufferAtomicIntrin(Op, DAG,
11625 AMDGPUISD::BUFFER_ATOMIC_FMIN);
11626 case Intrinsic::amdgcn_raw_buffer_atomic_fmax:
11627 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
11628 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMAX);
11629 case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
11630 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
11631 return lowerStructBufferAtomicIntrin(Op, DAG,
11632 AMDGPUISD::BUFFER_ATOMIC_FMAX);
11633 case Intrinsic::amdgcn_raw_buffer_atomic_swap:
11634 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_swap:
11635 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SWAP);
11636 case Intrinsic::amdgcn_raw_buffer_atomic_add:
11637 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_add:
11638 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11639 case Intrinsic::amdgcn_raw_buffer_atomic_sub:
11640 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub:
11641 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11642 case Intrinsic::amdgcn_raw_buffer_atomic_smin:
11643 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smin:
11644 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMIN);
11645 case Intrinsic::amdgcn_raw_buffer_atomic_umin:
11646 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umin:
11647 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMIN);
11648 case Intrinsic::amdgcn_raw_buffer_atomic_smax:
11649 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_smax:
11650 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SMAX);
11651 case Intrinsic::amdgcn_raw_buffer_atomic_umax:
11652 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_umax:
11653 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_UMAX);
11654 case Intrinsic::amdgcn_raw_buffer_atomic_and:
11655 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_and:
11656 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11657 case Intrinsic::amdgcn_raw_buffer_atomic_or:
11658 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_or:
11659 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11660 case Intrinsic::amdgcn_raw_buffer_atomic_xor:
11661 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_xor:
11662 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11663 case Intrinsic::amdgcn_raw_buffer_atomic_inc:
11664 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_inc:
11665 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11666 case Intrinsic::amdgcn_raw_buffer_atomic_dec:
11667 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
11668 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11669 case Intrinsic::amdgcn_struct_buffer_atomic_swap:
11670 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
11671 return lowerStructBufferAtomicIntrin(Op, DAG,
11672 AMDGPUISD::BUFFER_ATOMIC_SWAP);
11673 case Intrinsic::amdgcn_struct_buffer_atomic_add:
11674 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_add:
11675 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_ADD);
11676 case Intrinsic::amdgcn_struct_buffer_atomic_sub:
11677 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub:
11678 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_SUB);
11679 case Intrinsic::amdgcn_struct_buffer_atomic_smin:
11680 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smin:
11681 return lowerStructBufferAtomicIntrin(Op, DAG,
11682 AMDGPUISD::BUFFER_ATOMIC_SMIN);
11683 case Intrinsic::amdgcn_struct_buffer_atomic_umin:
11684 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umin:
11685 return lowerStructBufferAtomicIntrin(Op, DAG,
11686 AMDGPUISD::BUFFER_ATOMIC_UMIN);
11687 case Intrinsic::amdgcn_struct_buffer_atomic_smax:
11688 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_smax:
11689 return lowerStructBufferAtomicIntrin(Op, DAG,
11690 AMDGPUISD::BUFFER_ATOMIC_SMAX);
11691 case Intrinsic::amdgcn_struct_buffer_atomic_umax:
11692 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_umax:
11693 return lowerStructBufferAtomicIntrin(Op, DAG,
11694 AMDGPUISD::BUFFER_ATOMIC_UMAX);
11695 case Intrinsic::amdgcn_struct_buffer_atomic_and:
11696 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_and:
11697 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_AND);
11698 case Intrinsic::amdgcn_struct_buffer_atomic_or:
11699 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_or:
11700 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_OR);
11701 case Intrinsic::amdgcn_struct_buffer_atomic_xor:
11702 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_xor:
11703 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_XOR);
11704 case Intrinsic::amdgcn_struct_buffer_atomic_inc:
11705 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_inc:
11706 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_INC);
11707 case Intrinsic::amdgcn_struct_buffer_atomic_dec:
11708 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
11709 return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
11710 case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
11711 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
11712 return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
11713 case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
11714 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
11715 return lowerStructBufferAtomicIntrin(Op, DAG,
11716 AMDGPUISD::BUFFER_ATOMIC_CSUB);
11717 case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
11718 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
11719 return lowerRawBufferAtomicIntrin(Op, DAG,
11720 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11721 case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
11722 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
11723 return lowerStructBufferAtomicIntrin(Op, DAG,
11724 AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
11725 case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
11726 case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
11727 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
11728 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
11729 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
11730 SDValue Ops[] = {
11731 Op.getOperand(0), // Chain
11732 Op.getOperand(2), // src
11733 Op.getOperand(3), // cmp
11734 Rsrc, // rsrc
11735 DAG.getConstant(0, DL, MVT::i32), // vindex
11736 VOffset, // voffset
11737 SOffset, // soffset
11738 Offset, // offset
11739 Op.getOperand(7), // cachepolicy
11740 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
11741 };
11742 EVT VT = Op.getValueType();
11743 auto *M = cast<MemSDNode>(Op);
11744
11745 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11746 Op->getVTList(), Ops, VT,
11747 M->getMemOperand());
11748 }
11749 case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
11750 case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cmpswap: {
11751 SDValue Rsrc = bufferRsrcPtrToVector(Op->getOperand(4), DAG);
11752 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(6), DAG);
11753 auto SOffset = selectSOffset(Op.getOperand(7), DAG, Subtarget);
11754 SDValue Ops[] = {
11755 Op.getOperand(0), // Chain
11756 Op.getOperand(2), // src
11757 Op.getOperand(3), // cmp
11758 Rsrc, // rsrc
11759 Op.getOperand(5), // vindex
11760 VOffset, // voffset
11761 SOffset, // soffset
11762 Offset, // offset
11763 Op.getOperand(8), // cachepolicy
11764 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
11765 };
11766 EVT VT = Op.getValueType();
11767 auto *M = cast<MemSDNode>(Op);
11768
11769 return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
11770 Op->getVTList(), Ops, VT,
11771 M->getMemOperand());
11772 }
11773 case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
11774 case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
11775 MemSDNode *M = cast<MemSDNode>(Op);
11776 SDValue NodePtr = M->getOperand(2);
11777 SDValue RayExtent = M->getOperand(3);
11778 SDValue InstanceMask = M->getOperand(4);
11779 SDValue RayOrigin = M->getOperand(5);
11780 SDValue RayDir = M->getOperand(6);
11781 SDValue Offsets = M->getOperand(7);
11782 SDValue TDescr = M->getOperand(8);
11783
11784 assert(NodePtr.getValueType() == MVT::i64);
11785 assert(RayDir.getValueType() == MVT::v3f32);
11786
11787 if (!Subtarget->hasBVHDualAndBVH8Insts()) {
11788 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11789 return SDValue();
11790 }
11791
11792 bool IsBVH8 = IntrID == Intrinsic::amdgcn_image_bvh8_intersect_ray;
11793 const unsigned NumVDataDwords = 10;
11794 const unsigned NumVAddrDwords = IsBVH8 ? 11 : 12;
11795 int Opcode = AMDGPU::getMIMGOpcode(
11796 IsBVH8 ? AMDGPU::IMAGE_BVH8_INTERSECT_RAY
11797 : AMDGPU::IMAGE_BVH_DUAL_INTERSECT_RAY,
11798 AMDGPU::MIMGEncGfx12, NumVDataDwords, NumVAddrDwords);
11799 assert(Opcode != -1);
11800
11802 Ops.push_back(NodePtr);
11803 Ops.push_back(DAG.getBuildVector(
11804 MVT::v2i32, DL,
11805 {DAG.getBitcast(MVT::i32, RayExtent),
11806 DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, InstanceMask)}));
11807 Ops.push_back(RayOrigin);
11808 Ops.push_back(RayDir);
11809 Ops.push_back(Offsets);
11810 Ops.push_back(TDescr);
11811 Ops.push_back(M->getChain());
11812
11813 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11814 MachineMemOperand *MemRef = M->getMemOperand();
11815 DAG.setNodeMemRefs(NewNode, {MemRef});
11816 return SDValue(NewNode, 0);
11817 }
11818 case Intrinsic::amdgcn_image_bvh_intersect_ray: {
11819 MemSDNode *M = cast<MemSDNode>(Op);
11820 SDValue NodePtr = M->getOperand(2);
11821 SDValue RayExtent = M->getOperand(3);
11822 SDValue RayOrigin = M->getOperand(4);
11823 SDValue RayDir = M->getOperand(5);
11824 SDValue RayInvDir = M->getOperand(6);
11825 SDValue TDescr = M->getOperand(7);
11826
11827 assert(NodePtr.getValueType() == MVT::i32 ||
11828 NodePtr.getValueType() == MVT::i64);
11829 assert(RayDir.getValueType() == MVT::v3f16 ||
11830 RayDir.getValueType() == MVT::v3f32);
11831
11832 if (!Subtarget->hasGFX10_AEncoding()) {
11833 emitRemovedIntrinsicError(DAG, DL, Op.getValueType());
11834 return SDValue();
11835 }
11836
11837 const bool IsGFX11 = AMDGPU::isGFX11(*Subtarget);
11838 const bool IsGFX11Plus = AMDGPU::isGFX11Plus(*Subtarget);
11839 const bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
11840 const bool IsA16 = RayDir.getValueType().getVectorElementType() == MVT::f16;
11841 const bool Is64 = NodePtr.getValueType() == MVT::i64;
11842 const unsigned NumVDataDwords = 4;
11843 const unsigned NumVAddrDwords = IsA16 ? (Is64 ? 9 : 8) : (Is64 ? 12 : 11);
11844 const unsigned NumVAddrs = IsGFX11Plus ? (IsA16 ? 4 : 5) : NumVAddrDwords;
11845 const bool UseNSA = (Subtarget->hasNSAEncoding() &&
11846 NumVAddrs <= Subtarget->getNSAMaxSize()) ||
11847 IsGFX12Plus;
11848 const unsigned BaseOpcodes[2][2] = {
11849 {AMDGPU::IMAGE_BVH_INTERSECT_RAY, AMDGPU::IMAGE_BVH_INTERSECT_RAY_a16},
11850 {AMDGPU::IMAGE_BVH64_INTERSECT_RAY,
11851 AMDGPU::IMAGE_BVH64_INTERSECT_RAY_a16}};
11852 int Opcode;
11853 if (UseNSA) {
11854 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11855 IsGFX12Plus ? AMDGPU::MIMGEncGfx12
11856 : IsGFX11 ? AMDGPU::MIMGEncGfx11NSA
11857 : AMDGPU::MIMGEncGfx10NSA,
11858 NumVDataDwords, NumVAddrDwords);
11859 } else {
11860 assert(!IsGFX12Plus);
11861 Opcode = AMDGPU::getMIMGOpcode(BaseOpcodes[Is64][IsA16],
11862 IsGFX11 ? AMDGPU::MIMGEncGfx11Default
11863 : AMDGPU::MIMGEncGfx10Default,
11864 NumVDataDwords, NumVAddrDwords);
11865 }
11866 assert(Opcode != -1);
11867
11869
11870 auto packLanes = [&DAG, &Ops, &DL](SDValue Op, bool IsAligned) {
11872 DAG.ExtractVectorElements(Op, Lanes, 0, 3);
11873 if (Lanes[0].getValueSizeInBits() == 32) {
11874 for (unsigned I = 0; I < 3; ++I)
11875 Ops.push_back(DAG.getBitcast(MVT::i32, Lanes[I]));
11876 } else {
11877 if (IsAligned) {
11878 Ops.push_back(DAG.getBitcast(
11879 MVT::i32,
11880 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[0], Lanes[1]})));
11881 Ops.push_back(Lanes[2]);
11882 } else {
11883 SDValue Elt0 = Ops.pop_back_val();
11884 Ops.push_back(DAG.getBitcast(
11885 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL, {Elt0, Lanes[0]})));
11886 Ops.push_back(DAG.getBitcast(
11887 MVT::i32,
11888 DAG.getBuildVector(MVT::v2f16, DL, {Lanes[1], Lanes[2]})));
11889 }
11890 }
11891 };
11892
11893 if (UseNSA && IsGFX11Plus) {
11894 Ops.push_back(NodePtr);
11895 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11896 Ops.push_back(RayOrigin);
11897 if (IsA16) {
11898 SmallVector<SDValue, 3> DirLanes, InvDirLanes, MergedLanes;
11899 DAG.ExtractVectorElements(RayDir, DirLanes, 0, 3);
11900 DAG.ExtractVectorElements(RayInvDir, InvDirLanes, 0, 3);
11901 for (unsigned I = 0; I < 3; ++I) {
11902 MergedLanes.push_back(DAG.getBitcast(
11903 MVT::i32, DAG.getBuildVector(MVT::v2f16, DL,
11904 {DirLanes[I], InvDirLanes[I]})));
11905 }
11906 Ops.push_back(DAG.getBuildVector(MVT::v3i32, DL, MergedLanes));
11907 } else {
11908 Ops.push_back(RayDir);
11909 Ops.push_back(RayInvDir);
11910 }
11911 } else {
11912 if (Is64)
11913 DAG.ExtractVectorElements(DAG.getBitcast(MVT::v2i32, NodePtr), Ops, 0,
11914 2);
11915 else
11916 Ops.push_back(NodePtr);
11917
11918 Ops.push_back(DAG.getBitcast(MVT::i32, RayExtent));
11919 packLanes(RayOrigin, true);
11920 packLanes(RayDir, true);
11921 packLanes(RayInvDir, false);
11922 }
11923
11924 if (!UseNSA) {
11925 // Build a single vector containing all the operands so far prepared.
11926 if (NumVAddrDwords > 12) {
11927 SDValue Undef = DAG.getPOISON(MVT::i32);
11928 Ops.append(16 - Ops.size(), Undef);
11929 }
11930 assert(Ops.size() >= 8 && Ops.size() <= 12);
11931 SDValue MergedOps =
11932 DAG.getBuildVector(MVT::getVectorVT(MVT::i32, Ops.size()), DL, Ops);
11933 Ops.clear();
11934 Ops.push_back(MergedOps);
11935 }
11936
11937 Ops.push_back(TDescr);
11938 Ops.push_back(DAG.getTargetConstant(IsA16, DL, MVT::i1));
11939 Ops.push_back(M->getChain());
11940
11941 auto *NewNode = DAG.getMachineNode(Opcode, DL, M->getVTList(), Ops);
11942 MachineMemOperand *MemRef = M->getMemOperand();
11943 DAG.setNodeMemRefs(NewNode, {MemRef});
11944 return SDValue(NewNode, 0);
11945 }
11946 case Intrinsic::amdgcn_global_atomic_fmin_num:
11947 case Intrinsic::amdgcn_global_atomic_fmax_num:
11948 case Intrinsic::amdgcn_flat_atomic_fmin_num:
11949 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11950 MemSDNode *M = cast<MemSDNode>(Op);
11951 SDValue Ops[] = {
11952 M->getOperand(0), // Chain
11953 M->getOperand(2), // Ptr
11954 M->getOperand(3) // Value
11955 };
11956 unsigned Opcode = 0;
11957 switch (IntrID) {
11958 case Intrinsic::amdgcn_global_atomic_fmin_num:
11959 case Intrinsic::amdgcn_flat_atomic_fmin_num: {
11960 Opcode = ISD::ATOMIC_LOAD_FMIN;
11961 break;
11962 }
11963 case Intrinsic::amdgcn_global_atomic_fmax_num:
11964 case Intrinsic::amdgcn_flat_atomic_fmax_num: {
11965 Opcode = ISD::ATOMIC_LOAD_FMAX;
11966 break;
11967 }
11968 default:
11969 llvm_unreachable("unhandled atomic opcode");
11970 }
11971 return DAG.getAtomic(Opcode, SDLoc(Op), M->getMemoryVT(), M->getVTList(),
11972 Ops, M->getMemOperand());
11973 }
11974 case Intrinsic::amdgcn_s_alloc_vgpr: {
11975 SDValue NumVGPRs = Op.getOperand(2);
11976 if (!NumVGPRs->isDivergent())
11977 return Op;
11978
11979 SDValue ReadFirstLaneID =
11980 DAG.getTargetConstant(Intrinsic::amdgcn_readfirstlane, DL, MVT::i32);
11981 NumVGPRs = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
11982 ReadFirstLaneID, NumVGPRs);
11983
11984 return DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, Op->getVTList(),
11985 Op.getOperand(0), Op.getOperand(1), NumVGPRs);
11986 }
11987 case Intrinsic::amdgcn_s_get_barrier_state:
11988 case Intrinsic::amdgcn_s_get_named_barrier_state: {
11989 SDValue Chain = Op->getOperand(0);
11991 unsigned Opc;
11992
11993 if (isa<ConstantSDNode>(Op->getOperand(2))) {
11994 uint64_t BarID = cast<ConstantSDNode>(Op->getOperand(2))->getZExtValue();
11995 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state)
11996 BarID = (BarID >> 4) & 0x3F;
11997 Opc = AMDGPU::S_GET_BARRIER_STATE_IMM;
11998 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
11999 Ops.push_back(K);
12000 Ops.push_back(Chain);
12001 } else {
12002 Opc = AMDGPU::S_GET_BARRIER_STATE_M0;
12003 if (IntrID == Intrinsic::amdgcn_s_get_named_barrier_state) {
12004 SDValue M0Val;
12005 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, Op->getOperand(2),
12006 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12007 M0Val = SDValue(
12008 DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
12009 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12010 0);
12011 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12012 } else
12013 Ops.push_back(copyToM0(DAG, Chain, DL, Op->getOperand(2)).getValue(0));
12014 }
12015
12016 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12017 return SDValue(NewMI, 0);
12018 }
12019 case Intrinsic::amdgcn_cooperative_atomic_load_32x4B:
12020 case Intrinsic::amdgcn_cooperative_atomic_load_16x8B:
12021 case Intrinsic::amdgcn_cooperative_atomic_load_8x16B: {
12022 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12023 SDValue Chain = Op->getOperand(0);
12024 SDValue Ptr = Op->getOperand(2);
12025 EVT VT = Op->getValueType(0);
12026 return DAG.getAtomicLoad(ISD::NON_EXTLOAD, DL, MII->getMemoryVT(), VT,
12027 Chain, Ptr, MII->getMemOperand());
12028 }
12029 case Intrinsic::amdgcn_av_load_b128: {
12030 if (!Subtarget->hasFlatGlobalInsts()) {
12031 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
12033 "llvm.amdgcn.av.load.b128 not supported on subtarget",
12034 DL.getDebugLoc()));
12035 return DAG.getMergeValues(
12036 {DAG.getPOISON(Op->getValueType(0)), Op->getOperand(0)}, DL);
12037 }
12038 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12039 SDValue Chain = Op->getOperand(0);
12040 SDValue Ptr = Op->getOperand(2);
12041 EVT VT = Op->getValueType(0);
12042 // Lower to a regular ISD::LOAD. The MachineMemOperand carries Monotonic
12043 // ordering and syncscope so that SIMemoryLegalizer sets cache policy bits.
12044 // Address space filtering in the load_global/load_flat PatFrags selects
12045 // the correct GLOBAL vs FLAT instruction.
12046 return DAG.getLoad(VT, DL, Chain, Ptr, MII->getMemOperand());
12047 }
12048 case Intrinsic::amdgcn_flat_load_monitor_b32:
12049 case Intrinsic::amdgcn_flat_load_monitor_b64:
12050 case Intrinsic::amdgcn_flat_load_monitor_b128: {
12051 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12052 SDValue Chain = Op->getOperand(0);
12053 SDValue Ptr = Op->getOperand(2);
12054 return DAG.getMemIntrinsicNode(AMDGPUISD::FLAT_LOAD_MONITOR, DL,
12055 Op->getVTList(), {Chain, Ptr},
12056 MII->getMemoryVT(), MII->getMemOperand());
12057 }
12058 case Intrinsic::amdgcn_global_load_monitor_b32:
12059 case Intrinsic::amdgcn_global_load_monitor_b64:
12060 case Intrinsic::amdgcn_global_load_monitor_b128: {
12061 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12062 SDValue Chain = Op->getOperand(0);
12063 SDValue Ptr = Op->getOperand(2);
12064 return DAG.getMemIntrinsicNode(AMDGPUISD::GLOBAL_LOAD_MONITOR, DL,
12065 Op->getVTList(), {Chain, Ptr},
12066 MII->getMemoryVT(), MII->getMemOperand());
12067 }
12068 default:
12069
12070 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12072 return lowerImage(Op, ImageDimIntr, DAG, true);
12073
12074 return SDValue();
12075 }
12076}
12077
12078// Call DAG.getMemIntrinsicNode for a load, but first widen a dwordx3 type to
12079// dwordx4 if on SI and handle TFE loads.
12080SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
12081 SDVTList VTList,
12082 ArrayRef<SDValue> Ops, EVT MemVT,
12083 MachineMemOperand *MMO,
12084 SelectionDAG &DAG) const {
12085 LLVMContext &C = *DAG.getContext();
12086 MachineFunction &MF = DAG.getMachineFunction();
12087 EVT VT = VTList.VTs[0];
12088
12089 assert(VTList.NumVTs == 2 || VTList.NumVTs == 3);
12090 bool IsTFE = VTList.NumVTs == 3;
12091 if (IsTFE) {
12092 unsigned NumValueDWords = divideCeil(VT.getSizeInBits(), 32);
12093 unsigned NumOpDWords = NumValueDWords + 1;
12094 EVT OpDWordsVT = EVT::getVectorVT(C, MVT::i32, NumOpDWords);
12095 SDVTList OpDWordsVTList = DAG.getVTList(OpDWordsVT, VTList.VTs[2]);
12096 MachineMemOperand *OpDWordsMMO =
12097 MF.getMachineMemOperand(MMO, 0, NumOpDWords * 4);
12098 SDValue Op = getMemIntrinsicNode(Opcode, DL, OpDWordsVTList, Ops,
12099 OpDWordsVT, OpDWordsMMO, DAG);
12100 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12101 DAG.getVectorIdxConstant(NumValueDWords, DL));
12102 SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
12103 SDValue ValueDWords =
12104 NumValueDWords == 1
12105 ? DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op, ZeroIdx)
12107 EVT::getVectorVT(C, MVT::i32, NumValueDWords), Op,
12108 ZeroIdx);
12109 SDValue Value = DAG.getNode(ISD::BITCAST, DL, VT, ValueDWords);
12110 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
12111 }
12112
12113 if (!Subtarget->hasDwordx3LoadStores() &&
12114 (VT == MVT::v3i32 || VT == MVT::v3f32)) {
12115 EVT WidenedVT = EVT::getVectorVT(C, VT.getVectorElementType(), 4);
12116 EVT WidenedMemVT = EVT::getVectorVT(C, MemVT.getVectorElementType(), 4);
12117 MachineMemOperand *WidenedMMO = MF.getMachineMemOperand(MMO, 0, 16);
12118 SDVTList WidenedVTList = DAG.getVTList(WidenedVT, VTList.VTs[1]);
12119 SDValue Op = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
12120 WidenedMemVT, WidenedMMO);
12122 DAG.getVectorIdxConstant(0, DL));
12123 return DAG.getMergeValues({Value, SDValue(Op.getNode(), 1)}, DL);
12124 }
12125
12126 return DAG.getMemIntrinsicNode(Opcode, DL, VTList, Ops, MemVT, MMO);
12127}
12128
12129SDValue SITargetLowering::handleD16VData(SDValue VData, SelectionDAG &DAG,
12130 bool ImageStore) const {
12131 EVT StoreVT = VData.getValueType();
12132
12133 // No change for f16 and legal vector D16 types.
12134 if (!StoreVT.isVector())
12135 return VData;
12136
12137 SDLoc DL(VData);
12138 unsigned NumElements = StoreVT.getVectorNumElements();
12139
12140 if (Subtarget->hasUnpackedD16VMem()) {
12141 // We need to unpack the packed data to store.
12142 EVT IntStoreVT = StoreVT.changeTypeToInteger();
12143 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
12144
12145 EVT EquivStoreVT =
12146 EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElements);
12147 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, EquivStoreVT, IntVData);
12148 return DAG.UnrollVectorOp(ZExt.getNode());
12149 }
12150
12151 // The sq block of gfx8.1 does not estimate register use correctly for d16
12152 // image store instructions. The data operand is computed as if it were not a
12153 // d16 image instruction.
12154 if (ImageStore && Subtarget->hasImageStoreD16Bug()) {
12155 // Bitcast to i16
12156 EVT IntStoreVT = StoreVT.changeTypeToInteger();
12157 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
12158
12159 // Decompose into scalars
12161 DAG.ExtractVectorElements(IntVData, Elts);
12162
12163 // Group pairs of i16 into v2i16 and bitcast to i32
12164 SmallVector<SDValue, 4> PackedElts;
12165 for (unsigned I = 0; I < Elts.size() / 2; I += 1) {
12166 SDValue Pair =
12167 DAG.getBuildVector(MVT::v2i16, DL, {Elts[I * 2], Elts[I * 2 + 1]});
12168 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
12169 PackedElts.push_back(IntPair);
12170 }
12171 if ((NumElements % 2) == 1) {
12172 // Handle v3i16
12173 unsigned I = Elts.size() / 2;
12174 SDValue Pair = DAG.getBuildVector(MVT::v2i16, DL,
12175 {Elts[I * 2], DAG.getPOISON(MVT::i16)});
12176 SDValue IntPair = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Pair);
12177 PackedElts.push_back(IntPair);
12178 }
12179
12180 // Pad using UNDEF
12181 PackedElts.resize(Elts.size(), DAG.getPOISON(MVT::i32));
12182
12183 // Build final vector
12184 EVT VecVT =
12185 EVT::getVectorVT(*DAG.getContext(), MVT::i32, PackedElts.size());
12186 return DAG.getBuildVector(VecVT, DL, PackedElts);
12187 }
12188
12189 if (NumElements == 3) {
12190 EVT IntStoreVT =
12192 SDValue IntVData = DAG.getNode(ISD::BITCAST, DL, IntStoreVT, VData);
12193
12194 EVT WidenedStoreVT = EVT::getVectorVT(
12195 *DAG.getContext(), StoreVT.getVectorElementType(), NumElements + 1);
12196 EVT WidenedIntVT = EVT::getIntegerVT(*DAG.getContext(),
12197 WidenedStoreVT.getStoreSizeInBits());
12198 SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenedIntVT, IntVData);
12199 return DAG.getNode(ISD::BITCAST, DL, WidenedStoreVT, ZExt);
12200 }
12201
12202 assert(isTypeLegal(StoreVT));
12203 return VData;
12204}
12205
12206static bool isAsyncLDSDMA(Intrinsic::ID Intr) {
12207 switch (Intr) {
12208 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12209 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12210 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12211 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds:
12212 case Intrinsic::amdgcn_load_async_to_lds:
12213 case Intrinsic::amdgcn_global_load_async_lds:
12214 return true;
12215 }
12216 return false;
12217}
12218
12219SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
12220 SelectionDAG &DAG) const {
12221 SDLoc DL(Op);
12222 SDValue Chain = Op.getOperand(0);
12223 unsigned IntrinsicID = Op.getConstantOperandVal(1);
12224
12225 switch (IntrinsicID) {
12226 case Intrinsic::amdgcn_exp_compr: {
12227 if (!Subtarget->hasCompressedExport()) {
12228 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
12230 "intrinsic not supported on subtarget", DL.getDebugLoc()));
12231 }
12232 SDValue Src0 = Op.getOperand(4);
12233 SDValue Src1 = Op.getOperand(5);
12234 // Hack around illegal type on SI by directly selecting it.
12235 if (isTypeLegal(Src0.getValueType()))
12236 return SDValue();
12237
12238 const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
12239 SDValue Undef = DAG.getPOISON(MVT::f32);
12240 const SDValue Ops[] = {
12241 Op.getOperand(2), // tgt
12242 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
12243 DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
12244 Undef, // src2
12245 Undef, // src3
12246 Op.getOperand(7), // vm
12247 DAG.getTargetConstant(1, DL, MVT::i1), // compr
12248 Op.getOperand(3), // en
12249 Op.getOperand(0) // Chain
12250 };
12251
12252 unsigned Opc = Done->isZero() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
12253 return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
12254 }
12255
12256 case Intrinsic::amdgcn_struct_tbuffer_store:
12257 case Intrinsic::amdgcn_struct_ptr_tbuffer_store: {
12258 SDValue VData = Op.getOperand(2);
12259 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
12260 if (IsD16)
12261 VData = handleD16VData(VData, DAG);
12262 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12263 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
12264 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
12265 SDValue Ops[] = {
12266 Chain,
12267 VData, // vdata
12268 Rsrc, // rsrc
12269 Op.getOperand(4), // vindex
12270 VOffset, // voffset
12271 SOffset, // soffset
12272 Offset, // offset
12273 Op.getOperand(7), // format
12274 Op.getOperand(8), // cachepolicy, swizzled buffer
12275 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
12276 };
12277 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12278 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12279 MemSDNode *M = cast<MemSDNode>(Op);
12280 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12281 M->getMemoryVT(), M->getMemOperand());
12282 }
12283
12284 case Intrinsic::amdgcn_raw_tbuffer_store:
12285 case Intrinsic::amdgcn_raw_ptr_tbuffer_store: {
12286 SDValue VData = Op.getOperand(2);
12287 bool IsD16 = (VData.getValueType().getScalarType() == MVT::f16);
12288 if (IsD16)
12289 VData = handleD16VData(VData, DAG);
12290 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12291 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
12292 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
12293 SDValue Ops[] = {
12294 Chain,
12295 VData, // vdata
12296 Rsrc, // rsrc
12297 DAG.getConstant(0, DL, MVT::i32), // vindex
12298 VOffset, // voffset
12299 SOffset, // soffset
12300 Offset, // offset
12301 Op.getOperand(6), // format
12302 Op.getOperand(7), // cachepolicy, swizzled buffer
12303 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
12304 };
12305 unsigned Opc = IsD16 ? AMDGPUISD::TBUFFER_STORE_FORMAT_D16
12306 : AMDGPUISD::TBUFFER_STORE_FORMAT;
12307 MemSDNode *M = cast<MemSDNode>(Op);
12308 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12309 M->getMemoryVT(), M->getMemOperand());
12310 }
12311
12312 case Intrinsic::amdgcn_raw_buffer_store:
12313 case Intrinsic::amdgcn_raw_ptr_buffer_store:
12314 case Intrinsic::amdgcn_raw_buffer_store_format:
12315 case Intrinsic::amdgcn_raw_ptr_buffer_store_format: {
12316 const bool IsFormat =
12317 IntrinsicID == Intrinsic::amdgcn_raw_buffer_store_format ||
12318 IntrinsicID == Intrinsic::amdgcn_raw_ptr_buffer_store_format;
12319
12320 SDValue VData = Op.getOperand(2);
12321 EVT VDataVT = VData.getValueType();
12322 EVT EltType = VDataVT.getScalarType();
12323 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
12324 if (IsD16) {
12325 VData = handleD16VData(VData, DAG);
12326 VDataVT = VData.getValueType();
12327 }
12328
12329 if (!isTypeLegal(VDataVT)) {
12330 VData =
12331 DAG.getNode(ISD::BITCAST, DL,
12332 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
12333 }
12334
12335 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12336 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(4), DAG);
12337 auto SOffset = selectSOffset(Op.getOperand(5), DAG, Subtarget);
12338 SDValue Ops[] = {
12339 Chain,
12340 VData,
12341 Rsrc,
12342 DAG.getConstant(0, DL, MVT::i32), // vindex
12343 VOffset, // voffset
12344 SOffset, // soffset
12345 Offset, // offset
12346 Op.getOperand(6), // cachepolicy, swizzled buffer
12347 DAG.getTargetConstant(0, DL, MVT::i1), // idxen
12348 };
12349 unsigned Opc =
12350 IsFormat ? AMDGPUISD::BUFFER_STORE_FORMAT : AMDGPUISD::BUFFER_STORE;
12351 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12352 MemSDNode *M = cast<MemSDNode>(Op);
12353
12354 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12355 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12356 return handleByteShortBufferStores(DAG, VDataVT, DL, Ops, M);
12357
12358 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12359 M->getMemoryVT(), M->getMemOperand());
12360 }
12361
12362 case Intrinsic::amdgcn_struct_buffer_store:
12363 case Intrinsic::amdgcn_struct_ptr_buffer_store:
12364 case Intrinsic::amdgcn_struct_buffer_store_format:
12365 case Intrinsic::amdgcn_struct_ptr_buffer_store_format: {
12366 const bool IsFormat =
12367 IntrinsicID == Intrinsic::amdgcn_struct_buffer_store_format ||
12368 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_store_format;
12369
12370 SDValue VData = Op.getOperand(2);
12371 EVT VDataVT = VData.getValueType();
12372 EVT EltType = VDataVT.getScalarType();
12373 bool IsD16 = IsFormat && (EltType.getSizeInBits() == 16);
12374
12375 if (IsD16) {
12376 VData = handleD16VData(VData, DAG);
12377 VDataVT = VData.getValueType();
12378 }
12379
12380 if (!isTypeLegal(VDataVT)) {
12381 VData =
12382 DAG.getNode(ISD::BITCAST, DL,
12383 getEquivalentMemType(*DAG.getContext(), VDataVT), VData);
12384 }
12385
12386 auto Rsrc = bufferRsrcPtrToVector(Op.getOperand(3), DAG);
12387 auto [VOffset, Offset] = splitBufferOffsets(Op.getOperand(5), DAG);
12388 auto SOffset = selectSOffset(Op.getOperand(6), DAG, Subtarget);
12389 SDValue Ops[] = {
12390 Chain,
12391 VData,
12392 Rsrc,
12393 Op.getOperand(4), // vindex
12394 VOffset, // voffset
12395 SOffset, // soffset
12396 Offset, // offset
12397 Op.getOperand(7), // cachepolicy, swizzled buffer
12398 DAG.getTargetConstant(1, DL, MVT::i1), // idxen
12399 };
12400 unsigned Opc =
12401 !IsFormat ? AMDGPUISD::BUFFER_STORE : AMDGPUISD::BUFFER_STORE_FORMAT;
12402 Opc = IsD16 ? AMDGPUISD::BUFFER_STORE_FORMAT_D16 : Opc;
12403 MemSDNode *M = cast<MemSDNode>(Op);
12404
12405 // Handle BUFFER_STORE_BYTE/SHORT overloaded intrinsics
12406 EVT VDataType = VData.getValueType().getScalarType();
12407 if (!IsD16 && !VDataVT.isVector() && EltType.getSizeInBits() < 32)
12408 return handleByteShortBufferStores(DAG, VDataType, DL, Ops, M);
12409
12410 return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops,
12411 M->getMemoryVT(), M->getMemOperand());
12412 }
12413 case Intrinsic::amdgcn_raw_buffer_load_lds:
12414 case Intrinsic::amdgcn_raw_buffer_load_async_lds:
12415 case Intrinsic::amdgcn_raw_ptr_buffer_load_lds:
12416 case Intrinsic::amdgcn_raw_ptr_buffer_load_async_lds:
12417 case Intrinsic::amdgcn_struct_buffer_load_lds:
12418 case Intrinsic::amdgcn_struct_buffer_load_async_lds:
12419 case Intrinsic::amdgcn_struct_ptr_buffer_load_lds:
12420 case Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds: {
12421 if (!Subtarget->hasVMemToLDSLoad())
12422 return SDValue();
12423 unsigned Opc;
12424 bool HasVIndex =
12425 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds ||
12426 IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_async_lds ||
12427 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_lds ||
12428 IntrinsicID == Intrinsic::amdgcn_struct_ptr_buffer_load_async_lds;
12429 unsigned OpOffset = HasVIndex ? 1 : 0;
12430 SDValue VOffset = Op.getOperand(5 + OpOffset);
12431 bool HasVOffset = !isNullConstant(VOffset);
12432 unsigned Size = Op->getConstantOperandVal(4);
12433
12434 switch (Size) {
12435 default:
12436 return SDValue();
12437 case 1:
12438 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_BOTHEN
12439 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_IDXEN
12440 : HasVOffset ? AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFEN
12441 : AMDGPU::BUFFER_LOAD_UBYTE_LDS_OFFSET;
12442 break;
12443 case 2:
12444 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_BOTHEN
12445 : AMDGPU::BUFFER_LOAD_USHORT_LDS_IDXEN
12446 : HasVOffset ? AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFEN
12447 : AMDGPU::BUFFER_LOAD_USHORT_LDS_OFFSET;
12448 break;
12449 case 4:
12450 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_BOTHEN
12451 : AMDGPU::BUFFER_LOAD_DWORD_LDS_IDXEN
12452 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFEN
12453 : AMDGPU::BUFFER_LOAD_DWORD_LDS_OFFSET;
12454 break;
12455 case 12:
12456 if (!Subtarget->hasLDSLoadB96_B128())
12457 return SDValue();
12458 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_BOTHEN
12459 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_IDXEN
12460 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFEN
12461 : AMDGPU::BUFFER_LOAD_DWORDX3_LDS_OFFSET;
12462 break;
12463 case 16:
12464 if (!Subtarget->hasLDSLoadB96_B128())
12465 return SDValue();
12466 Opc = HasVIndex ? HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_BOTHEN
12467 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_IDXEN
12468 : HasVOffset ? AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFEN
12469 : AMDGPU::BUFFER_LOAD_DWORDX4_LDS_OFFSET;
12470 break;
12471 }
12472
12473 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
12474
12476
12477 if (HasVIndex && HasVOffset)
12478 Ops.push_back(DAG.getBuildVector(MVT::v2i32, DL,
12479 {Op.getOperand(5), // VIndex
12480 VOffset}));
12481 else if (HasVIndex)
12482 Ops.push_back(Op.getOperand(5));
12483 else if (HasVOffset)
12484 Ops.push_back(VOffset);
12485
12486 SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(2), DAG);
12487 Ops.push_back(Rsrc);
12488 Ops.push_back(Op.getOperand(6 + OpOffset)); // soffset
12489 Ops.push_back(Op.getOperand(7 + OpOffset)); // imm offset
12490 bool IsGFX12Plus = AMDGPU::isGFX12Plus(*Subtarget);
12491 unsigned Aux = Op.getConstantOperandVal(8 + OpOffset);
12492 Ops.push_back(DAG.getTargetConstant(
12493 Aux & (IsGFX12Plus ? AMDGPU::CPol::ALL : AMDGPU::CPol::ALL_pregfx12),
12494 DL, MVT::i8)); // cpol
12495 Ops.push_back(DAG.getTargetConstant(
12496 Aux & (IsGFX12Plus ? AMDGPU::CPol::SWZ : AMDGPU::CPol::SWZ_pregfx12)
12497 ? 1
12498 : 0,
12499 DL, MVT::i8)); // swz
12500 Ops.push_back(
12501 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
12502 Ops.push_back(M0Val.getValue(0)); // Chain
12503 Ops.push_back(M0Val.getValue(1)); // Glue
12504
12505 auto *M = cast<MemSDNode>(Op);
12506 auto *Load = DAG.getMachineNode(Opc, DL, M->getVTList(), Ops);
12507 DAG.setNodeMemRefs(Load, M->memoperands());
12508
12509 return SDValue(Load, 0);
12510 }
12511 // Buffers are handled by LowerBufferFatPointers, and we're going to go
12512 // for "trust me" that the remaining cases are global pointers until
12513 // such time as we can put two mem operands on an intrinsic.
12514 case Intrinsic::amdgcn_load_to_lds:
12515 case Intrinsic::amdgcn_load_async_to_lds:
12516 case Intrinsic::amdgcn_global_load_lds:
12517 case Intrinsic::amdgcn_global_load_async_lds: {
12518 if (!Subtarget->hasVMemToLDSLoad())
12519 return SDValue();
12520
12521 unsigned Opc;
12522 unsigned Size = Op->getConstantOperandVal(4);
12523 switch (Size) {
12524 default:
12525 return SDValue();
12526 case 1:
12527 Opc = AMDGPU::GLOBAL_LOAD_LDS_UBYTE;
12528 break;
12529 case 2:
12530 Opc = AMDGPU::GLOBAL_LOAD_LDS_USHORT;
12531 break;
12532 case 4:
12533 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORD;
12534 break;
12535 case 12:
12536 if (!Subtarget->hasLDSLoadB96_B128())
12537 return SDValue();
12538 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX3;
12539 break;
12540 case 16:
12541 if (!Subtarget->hasLDSLoadB96_B128())
12542 return SDValue();
12543 Opc = AMDGPU::GLOBAL_LOAD_LDS_DWORDX4;
12544 break;
12545 }
12546
12547 SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(3));
12548
12550
12551 SDValue Addr = Op.getOperand(2); // Global ptr
12552 SDValue VOffset;
12553 // Try to split SAddr and VOffset. Global and LDS pointers share the same
12554 // immediate offset, so we cannot use a regular SelectGlobalSAddr().
12555 if (Addr->isDivergent() && Addr->isAnyAdd()) {
12556 SDValue LHS = Addr.getOperand(0);
12557 SDValue RHS = Addr.getOperand(1);
12558
12559 if (LHS->isDivergent())
12560 std::swap(LHS, RHS);
12561
12562 if (!LHS->isDivergent() && RHS.getOpcode() == ISD::ZERO_EXTEND &&
12563 RHS.getOperand(0).getValueType() == MVT::i32) {
12564 // add (i64 sgpr), (zero_extend (i32 vgpr))
12565 Addr = LHS;
12566 VOffset = RHS.getOperand(0);
12567 }
12568 }
12569
12570 Ops.push_back(Addr);
12571 if (!Addr->isDivergent()) {
12573 if (!VOffset)
12574 VOffset =
12575 SDValue(DAG.getMachineNode(AMDGPU::V_MOV_B32_e32, DL, MVT::i32,
12576 DAG.getTargetConstant(0, DL, MVT::i32)),
12577 0);
12578 Ops.push_back(VOffset);
12579 }
12580
12581 Ops.push_back(Op.getOperand(5)); // Offset
12582
12583 unsigned Aux = Op.getConstantOperandVal(6);
12584 Ops.push_back(DAG.getTargetConstant(Aux & ~AMDGPU::CPol::VIRTUAL_BITS, DL,
12585 MVT::i32)); // CPol
12586 Ops.push_back(
12587 DAG.getTargetConstant(isAsyncLDSDMA(IntrinsicID), DL, MVT::i8));
12588
12589 Ops.push_back(M0Val.getValue(0)); // Chain
12590 Ops.push_back(M0Val.getValue(1)); // Glue
12591
12592 auto *M = cast<MemSDNode>(Op);
12593 auto *Load = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12594 DAG.setNodeMemRefs(Load, M->memoperands());
12595
12596 return SDValue(Load, 0);
12597 }
12598 case Intrinsic::amdgcn_end_cf:
12599 return SDValue(DAG.getMachineNode(AMDGPU::SI_END_CF, DL, MVT::Other,
12600 Op->getOperand(2), Chain),
12601 0);
12602 case Intrinsic::amdgcn_s_barrier_signal_var: {
12603 // Member count of 0 means to re-use a previous member count,
12604 // which, if the named barrier is statically chosen, means we can use
12605 // the immarg form. Otherwise, fall through to constructing M0 as for
12606 // s_barrier_init.
12607 SDValue CntOp = Op->getOperand(3);
12608 auto *CntC = dyn_cast<ConstantSDNode>(CntOp);
12609 if (CntC && CntC->isZero()) {
12610 SDValue Chain = Op->getOperand(0);
12611 SDValue BarOp = Op->getOperand(2);
12613
12614 std::optional<uint64_t> BarVal;
12615 if (auto *C = dyn_cast<ConstantSDNode>(BarOp))
12616 BarVal = C->getZExtValue();
12617 else if (auto *GA = dyn_cast<GlobalAddressSDNode>(BarOp))
12619 *GA->getGlobal()))
12620 BarVal = *Addr + GA->getOffset();
12621
12622 if (BarVal) {
12623 unsigned BarID = (*BarVal >> 4) & 0x3F;
12624 Ops.push_back(DAG.getTargetConstant(BarID, DL, MVT::i32));
12625 Ops.push_back(Chain);
12626 auto *NewMI = DAG.getMachineNode(AMDGPU::S_BARRIER_SIGNAL_IMM, DL,
12627 Op->getVTList(), Ops);
12628 return SDValue(NewMI, 0);
12629 }
12630 }
12631 [[fallthrough]];
12632 }
12633 case Intrinsic::amdgcn_s_barrier_init: {
12634 // these two intrinsics have two operands: barrier pointer and member count
12635 SDValue Chain = Op->getOperand(0);
12637 SDValue BarOp = Op->getOperand(2);
12638 SDValue CntOp = Op->getOperand(3);
12639 SDValue M0Val;
12640 unsigned Opc = IntrinsicID == Intrinsic::amdgcn_s_barrier_init
12641 ? AMDGPU::S_BARRIER_INIT_M0
12642 : AMDGPU::S_BARRIER_SIGNAL_M0;
12643 // extract the BarrierID from bits 4-9 of BarOp
12644 SDValue BarID;
12645 BarID = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12646 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12647 BarID =
12648 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, BarID,
12649 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12650 0);
12651 // Member count should be put into M0[ShAmt:+6]
12652 // Barrier ID should be put into M0[5:0]
12653 M0Val =
12654 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, CntOp,
12655 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12656 0);
12657 constexpr unsigned ShAmt = 16;
12658 M0Val = DAG.getNode(ISD::SHL, DL, MVT::i32, CntOp,
12659 DAG.getShiftAmountConstant(ShAmt, MVT::i32, DL));
12660
12661 M0Val = SDValue(
12662 DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, M0Val, BarID), 0);
12663
12664 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12665
12666 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12667 return SDValue(NewMI, 0);
12668 }
12669 case Intrinsic::amdgcn_s_wakeup_barrier: {
12670 if (!Subtarget->hasSWakeupBarrier())
12671 return SDValue();
12672 [[fallthrough]];
12673 }
12674 case Intrinsic::amdgcn_s_barrier_join: {
12675 // these three intrinsics have one operand: barrier pointer
12676 SDValue Chain = Op->getOperand(0);
12678 SDValue BarOp = Op->getOperand(2);
12679 unsigned Opc;
12680
12681 if (isa<ConstantSDNode>(BarOp)) {
12682 uint64_t BarVal = cast<ConstantSDNode>(BarOp)->getZExtValue();
12683 switch (IntrinsicID) {
12684 default:
12685 return SDValue();
12686 case Intrinsic::amdgcn_s_barrier_join:
12687 Opc = AMDGPU::S_BARRIER_JOIN_IMM;
12688 break;
12689 case Intrinsic::amdgcn_s_wakeup_barrier:
12690 Opc = AMDGPU::S_WAKEUP_BARRIER_IMM;
12691 break;
12692 }
12693 // extract the BarrierID from bits 4-9 of the immediate
12694 unsigned BarID = (BarVal >> 4) & 0x3F;
12695 SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32);
12696 Ops.push_back(K);
12697 Ops.push_back(Chain);
12698 } else {
12699 switch (IntrinsicID) {
12700 default:
12701 return SDValue();
12702 case Intrinsic::amdgcn_s_barrier_join:
12703 Opc = AMDGPU::S_BARRIER_JOIN_M0;
12704 break;
12705 case Intrinsic::amdgcn_s_wakeup_barrier:
12706 Opc = AMDGPU::S_WAKEUP_BARRIER_M0;
12707 break;
12708 }
12709 // extract the BarrierID from bits 4-9 of BarOp, copy to M0[5:0]
12710 SDValue M0Val;
12711 M0Val = DAG.getNode(ISD::SRL, DL, MVT::i32, BarOp,
12712 DAG.getShiftAmountConstant(4, MVT::i32, DL));
12713 M0Val =
12714 SDValue(DAG.getMachineNode(AMDGPU::S_AND_B32, DL, MVT::i32, M0Val,
12715 DAG.getTargetConstant(0x3F, DL, MVT::i32)),
12716 0);
12717 Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0));
12718 }
12719
12720 auto *NewMI = DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops);
12721 return SDValue(NewMI, 0);
12722 }
12723 case Intrinsic::amdgcn_s_prefetch_data:
12724 case Intrinsic::amdgcn_s_prefetch_inst: {
12725 // For non-global address space preserve the chain and remove the call.
12727 return Op.getOperand(0);
12728 return Op;
12729 }
12730 case Intrinsic::amdgcn_s_buffer_prefetch_data: {
12731 SDValue Ops[] = {
12732 Chain, bufferRsrcPtrToVector(Op.getOperand(2), DAG),
12733 Op.getOperand(3), // offset
12734 Op.getOperand(4), // length
12735 };
12736
12737 MemSDNode *M = cast<MemSDNode>(Op);
12738 return DAG.getMemIntrinsicNode(AMDGPUISD::SBUFFER_PREFETCH_DATA, DL,
12739 Op->getVTList(), Ops, M->getMemoryVT(),
12740 M->getMemOperand());
12741 }
12742 case Intrinsic::amdgcn_cooperative_atomic_store_32x4B:
12743 case Intrinsic::amdgcn_cooperative_atomic_store_16x8B:
12744 case Intrinsic::amdgcn_cooperative_atomic_store_8x16B: {
12745 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12746 SDValue Chain = Op->getOperand(0);
12747 SDValue Ptr = Op->getOperand(2);
12748 SDValue Val = Op->getOperand(3);
12749 return DAG.getAtomic(ISD::ATOMIC_STORE, DL, MII->getMemoryVT(), Chain, Val,
12750 Ptr, MII->getMemOperand());
12751 }
12752 case Intrinsic::amdgcn_av_store_b128: {
12753 if (!Subtarget->hasFlatGlobalInsts()) {
12754 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
12756 "llvm.amdgcn.av.store.b128 not supported on subtarget",
12757 DL.getDebugLoc()));
12758 return Op->getOperand(0); // return the input chain
12759 }
12760 MemIntrinsicSDNode *MII = cast<MemIntrinsicSDNode>(Op);
12761 SDValue Chain = Op->getOperand(0);
12762 SDValue Ptr = Op->getOperand(2);
12763 SDValue Val = Op->getOperand(3);
12764 return DAG.getStore(Chain, DL, Val, Ptr, MII->getMemOperand());
12765 }
12766 default: {
12767 if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
12769 return lowerImage(Op, ImageDimIntr, DAG, true);
12770
12771 return Op;
12772 }
12773 }
12774}
12775
12776// Return whether the operation has NoUnsignedWrap property.
12777static bool isNoUnsignedWrap(SDValue Addr) {
12778 return (Addr.getOpcode() == ISD::ADD &&
12779 Addr->getFlags().hasNoUnsignedWrap()) ||
12780 Addr->getOpcode() == ISD::OR;
12781}
12782
12784 EVT PtrVT) const {
12785 return PtrVT == MVT::i64;
12786}
12787
12789 EVT PtrVT) const {
12790 return true;
12791}
12792
12793// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
12794// offset (the offset that is included in bounds checking and swizzling, to be
12795// split between the instruction's voffset and immoffset fields) and soffset
12796// (the offset that is excluded from bounds checking and swizzling, to go in
12797// the instruction's soffset field). This function takes the first kind of
12798// offset and figures out how to split it between voffset and immoffset.
12799std::pair<SDValue, SDValue>
12800SITargetLowering::splitBufferOffsets(SDValue Offset, SelectionDAG &DAG) const {
12801 SDLoc DL(Offset);
12802 const unsigned MaxImm = SIInstrInfo::getMaxMUBUFImmOffset(*Subtarget);
12803 SDValue N0 = Offset;
12804 ConstantSDNode *C1 = nullptr;
12805
12806 if ((C1 = dyn_cast<ConstantSDNode>(N0)))
12807 N0 = SDValue();
12808 else if (DAG.isBaseWithConstantOffset(N0)) {
12809 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12810 // being added, so we can only safely match a 32-bit addition with no
12811 // unsigned overflow.
12812 bool CheckNUW = Subtarget->hasGFX1250Insts();
12813 if (!CheckNUW || isNoUnsignedWrap(N0)) {
12814 C1 = cast<ConstantSDNode>(N0.getOperand(1));
12815 N0 = N0.getOperand(0);
12816 }
12817 }
12818
12819 if (C1) {
12820 unsigned ImmOffset = C1->getZExtValue();
12821 // If the immediate value is too big for the immoffset field, put only bits
12822 // that would normally fit in the immoffset field. The remaining value that
12823 // is copied/added for the voffset field is a large power of 2, and it
12824 // stands more chance of being CSEd with the copy/add for another similar
12825 // load/store.
12826 // However, do not do that rounding down if that is a negative
12827 // number, as it appears to be illegal to have a negative offset in the
12828 // vgpr, even if adding the immediate offset makes it positive.
12829 unsigned Overflow = ImmOffset & ~MaxImm;
12830 ImmOffset -= Overflow;
12831 if ((int32_t)Overflow < 0) {
12832 Overflow += ImmOffset;
12833 ImmOffset = 0;
12834 }
12835 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(ImmOffset, DL, MVT::i32));
12836 if (Overflow) {
12837 auto OverflowVal = DAG.getConstant(Overflow, DL, MVT::i32);
12838 if (!N0)
12839 N0 = OverflowVal;
12840 else {
12841 SDValue Ops[] = {N0, OverflowVal};
12842 N0 = DAG.getNode(ISD::ADD, DL, MVT::i32, Ops);
12843 }
12844 }
12845 }
12846 if (!N0)
12847 N0 = DAG.getConstant(0, DL, MVT::i32);
12848 if (!C1)
12849 C1 = cast<ConstantSDNode>(DAG.getTargetConstant(0, DL, MVT::i32));
12850 return {N0, SDValue(C1, 0)};
12851}
12852
12853// Analyze a combined offset from an amdgcn_s_buffer_load intrinsic and store
12854// the three offsets (voffset, soffset and instoffset) into the SDValue[3] array
12855// pointed to by Offsets.
12856void SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
12857 SelectionDAG &DAG, SDValue *Offsets,
12858 Align Alignment) const {
12859 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
12860 SDLoc DL(CombinedOffset);
12861 if (auto *C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
12862 uint32_t Imm = C->getZExtValue();
12863 uint32_t SOffset, ImmOffset;
12864 if (TII->splitMUBUFOffset(Imm, SOffset, ImmOffset, Alignment)) {
12865 Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
12866 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12867 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12868 return;
12869 }
12870 }
12871 if (DAG.isBaseWithConstantOffset(CombinedOffset)) {
12872 // On GFX1250+, voffset and immoffset are zero-extended from 32 bits before
12873 // being added, so we can only safely match a 32-bit addition with no
12874 // unsigned overflow.
12875 bool CheckNUW = Subtarget->hasGFX1250Insts();
12876 SDValue N0 = CombinedOffset.getOperand(0);
12877 SDValue N1 = CombinedOffset.getOperand(1);
12878 uint32_t SOffset, ImmOffset;
12879 int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
12880 if (Offset >= 0 && (!CheckNUW || isNoUnsignedWrap(CombinedOffset)) &&
12881 TII->splitMUBUFOffset(Offset, SOffset, ImmOffset, Alignment)) {
12882 Offsets[0] = N0;
12883 Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
12884 Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
12885 return;
12886 }
12887 }
12888
12889 SDValue SOffsetZero = Subtarget->hasRestrictedSOffset()
12890 ? DAG.getRegister(AMDGPU::SGPR_NULL, MVT::i32)
12891 : DAG.getConstant(0, DL, MVT::i32);
12892
12893 Offsets[0] = CombinedOffset;
12894 Offsets[1] = SOffsetZero;
12895 Offsets[2] = DAG.getTargetConstant(0, DL, MVT::i32);
12896}
12897
12898SDValue SITargetLowering::bufferRsrcPtrToVector(SDValue MaybePointer,
12899 SelectionDAG &DAG) const {
12900 if (!MaybePointer.getValueType().isScalarInteger())
12901 return MaybePointer;
12902
12903 SDValue Rsrc = DAG.getBitcast(MVT::v4i32, MaybePointer);
12904 return Rsrc;
12905}
12906
12907// Wrap a global or flat pointer into a buffer intrinsic using the flags
12908// specified in the intrinsic.
12909SDValue SITargetLowering::lowerPointerAsRsrcIntrin(SDNode *Op,
12910 SelectionDAG &DAG) const {
12911 SDLoc Loc(Op);
12912
12913 SDValue Pointer = Op->getOperand(1);
12914 SDValue Stride = Op->getOperand(2);
12915 SDValue NumRecords = Op->getOperand(3);
12916 SDValue Flags = Op->getOperand(4);
12917
12918 SDValue ExtStride = DAG.getAnyExtOrTrunc(Stride, Loc, MVT::i32);
12919 SDValue Rsrc;
12920
12921 if (Subtarget->has45BitNumRecordsBufferResource()) {
12922 SDValue Zero = DAG.getConstant(0, Loc, MVT::i32);
12923 // Build the lower 64-bit value, which has a 57-bit base and the lower 7-bit
12924 // num_records.
12925 SDValue ExtPointer = DAG.getAnyExtOrTrunc(Pointer, Loc, MVT::i64);
12926 SDValue NumRecordsLHS =
12927 DAG.getNode(ISD::SHL, Loc, MVT::i64, NumRecords,
12928 DAG.getShiftAmountConstant(57, MVT::i32, Loc));
12929 SDValue LowHalf =
12930 DAG.getNode(ISD::OR, Loc, MVT::i64, ExtPointer, NumRecordsLHS);
12931
12932 // Build the higher 64-bit value, which has the higher 38-bit num_records,
12933 // 6-bit zero (omit), 16-bit stride and scale and 4-bit flag.
12934 SDValue NumRecordsRHS =
12935 DAG.getNode(ISD::SRL, Loc, MVT::i64, NumRecords,
12936 DAG.getShiftAmountConstant(7, MVT::i32, Loc));
12937 SDValue ShiftedStride =
12938 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12939 DAG.getShiftAmountConstant(12, MVT::i32, Loc));
12940 SDValue ExtShiftedStrideVec =
12941 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedStride);
12942 SDValue ExtShiftedStride =
12943 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedStrideVec);
12944 SDValue ShiftedFlags =
12945 DAG.getNode(ISD::SHL, Loc, MVT::i32, Flags,
12946 DAG.getShiftAmountConstant(28, MVT::i32, Loc));
12947 SDValue ExtShiftedFlagsVec =
12948 DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i32, Zero, ShiftedFlags);
12949 SDValue ExtShiftedFlags =
12950 DAG.getNode(ISD::BITCAST, Loc, MVT::i64, ExtShiftedFlagsVec);
12951 SDValue CombinedFields =
12952 DAG.getNode(ISD::OR, Loc, MVT::i64, NumRecordsRHS, ExtShiftedStride);
12953 SDValue HighHalf =
12954 DAG.getNode(ISD::OR, Loc, MVT::i64, CombinedFields, ExtShiftedFlags);
12955
12956 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v2i64, LowHalf, HighHalf);
12957 } else {
12958 NumRecords = DAG.getAnyExtOrTrunc(NumRecords, Loc, MVT::i32);
12959 auto [LowHalf, HighHalf] =
12960 DAG.SplitScalar(Pointer, Loc, MVT::i32, MVT::i32);
12961 SDValue Mask = DAG.getConstant(0x0000ffff, Loc, MVT::i32);
12962 SDValue Masked = DAG.getNode(ISD::AND, Loc, MVT::i32, HighHalf, Mask);
12963 SDValue ShiftedStride =
12964 DAG.getNode(ISD::SHL, Loc, MVT::i32, ExtStride,
12965 DAG.getShiftAmountConstant(16, MVT::i32, Loc));
12966 SDValue NewHighHalf =
12967 DAG.getNode(ISD::OR, Loc, MVT::i32, Masked, ShiftedStride);
12968
12969 Rsrc = DAG.getNode(ISD::BUILD_VECTOR, Loc, MVT::v4i32, LowHalf, NewHighHalf,
12970 NumRecords, Flags);
12971 }
12972
12973 SDValue RsrcPtr = DAG.getNode(ISD::BITCAST, Loc, MVT::i128, Rsrc);
12974 return RsrcPtr;
12975}
12976
12977// Handle 8 bit and 16 bit buffer loads
// Emits a byte or short buffer-load node (the *_TFE variant when IsTFE is
// set), chosen by whether LoadVT's scalar type is i8. The loaded 32-bit data is
// truncated to the integer form of LoadVT and bitcast to LoadVT. The result is
// a merge of {value, chain}, or {value, status, chain} in the TFE case.
12978SDValue SITargetLowering::handleByteShortBufferLoads(SelectionDAG &DAG,
12979 EVT LoadVT, SDLoc DL,
// NOTE(review): `Ops`, used below, is not declared on any line visible here;
// presumably it is a parameter of this function - confirm against the header.
12981 MachineMemOperand *MMO,
12982 bool IsTFE) const {
12983 EVT IntVT = LoadVT.changeTypeToInteger();
12984
12985 if (IsTFE) {
12986 unsigned Opc = (LoadVT.getScalarType() == MVT::i8)
12987 ? AMDGPUISD::BUFFER_LOAD_UBYTE_TFE
12988 : AMDGPUISD::BUFFER_LOAD_USHORT_TFE;
12989 MachineFunction &MF = DAG.getMachineFunction();
// Derive a memory operand from MMO with offset 0 and size 8 for the v2i32
// result of the TFE node.
12990 MachineMemOperand *OpMMO = MF.getMachineMemOperand(MMO, 0, 8);
12991 SDVTList VTs = DAG.getVTList(MVT::v2i32, MVT::Other);
12992 SDValue Op = getMemIntrinsicNode(Opc, DL, VTs, Ops, MVT::v2i32, OpMMO, DAG);
// Element 1 of the v2i32 result is taken as the status, element 0 as the data.
12993 SDValue Status = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12994 DAG.getConstant(1, DL, MVT::i32));
12995 SDValue Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, Op,
12996 DAG.getConstant(0, DL, MVT::i32));
12997 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IntVT, Data);
12998 SDValue Value = DAG.getNode(ISD::BITCAST, DL, LoadVT, Trunc);
// Result 1 of the load node (the MVT::Other value) is returned as the chain.
12999 return DAG.getMergeValues({Value, Status, SDValue(Op.getNode(), 1)}, DL);
13000 }
13001
13002 unsigned Opc = LoadVT.getScalarType() == MVT::i8
13003 ? AMDGPUISD::BUFFER_LOAD_UBYTE
13004 : AMDGPUISD::BUFFER_LOAD_USHORT;
13005
// Non-TFE path: the node produces an i32 plus a chain; IntVT is passed as the
// memory type.
13006 SDVTList ResList = DAG.getVTList(MVT::i32, MVT::Other);
13007 SDValue BufferLoad =
13008 DAG.getMemIntrinsicNode(Opc, DL, ResList, Ops, IntVT, MMO);
13009 SDValue LoadVal = DAG.getNode(ISD::TRUNCATE, DL, IntVT, BufferLoad);
13010 LoadVal = DAG.getNode(ISD::BITCAST, DL, LoadVT, LoadVal);
13011
13012 return DAG.getMergeValues({LoadVal, BufferLoad.getValue(1)}, DL);
13013}
13014
13015// Handle 8 bit and 16 bit buffer stores
13016SDValue SITargetLowering::handleByteShortBufferStores(SelectionDAG &DAG,
13017 EVT VDataType, SDLoc DL,
13018 SDValue Ops[],
13019 MemSDNode *M) const {
13020 if (VDataType == MVT::f16 || VDataType == MVT::bf16)
13021 Ops[1] = DAG.getNode(ISD::BITCAST, DL, MVT::i16, Ops[1]);
13022
13023 SDValue BufferStoreExt = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Ops[1]);
13024 Ops[1] = BufferStoreExt;
13025 unsigned Opc = (VDataType == MVT::i8) ? AMDGPUISD::BUFFER_STORE_BYTE
13026 : AMDGPUISD::BUFFER_STORE_SHORT;
13027 ArrayRef<SDValue> OpsRef = ArrayRef(&Ops[0], 9);
13028 return DAG.getMemIntrinsicNode(Opc, DL, M->getVTList(), OpsRef, VDataType,
13029 M->getMemOperand());
13030}
13031
13033 SDValue Op, const SDLoc &SL, EVT VT) {
13034 if (VT.bitsLT(Op.getValueType()))
13035 return DAG.getNode(ISD::TRUNCATE, SL, VT, Op);
13036
13037 switch (ExtType) {
13038 case ISD::SEXTLOAD:
13039 return DAG.getNode(ISD::SIGN_EXTEND, SL, VT, Op);
13040 case ISD::ZEXTLOAD:
13041 return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, Op);
13042 case ISD::EXTLOAD:
13043 return DAG.getNode(ISD::ANY_EXTEND, SL, VT, Op);
13044 case ISD::NON_EXTLOAD:
13045 return Op;
13046 }
13047
13048 llvm_unreachable("invalid ext type");
13049}
13050
13051// Try to turn 8 and 16-bit scalar loads into SMEM eligible 32-bit loads.
13052// TODO: Skip this on GFX12 which does have scalar sub-dword loads.
// Returns an empty SDValue when the load is not widened. Otherwise returns a
// merge of {converted value, chain of the new i32 load}.
13053SDValue SITargetLowering::widenLoad(LoadSDNode *Ld,
13054 DAGCombinerInfo &DCI) const {
13055 SelectionDAG &DAG = DCI.DAG;
// Only uniform loads with at least 4-byte alignment are considered.
13056 if (Ld->getAlign() < Align(4) || Ld->isDivergent())
13057 return SDValue();
13058
13059 // FIXME: Constant loads should all be marked invariant.
13060 unsigned AS = Ld->getAddressSpace();
// NOTE(review): one line of this condition is not visible in this listing.
// From what is shown, a global-address load qualifies only if it is invariant.
13061 if (AS != AMDGPUAS::CONSTANT_ADDRESS &&
13063 (AS != AMDGPUAS::GLOBAL_ADDRESS || !Ld->isInvariant()))
13064 return SDValue();
13065
13066 // Don't do this early, since it may interfere with adjacent load merging for
13067 // illegal types. We can avoid losing alignment information for exotic types
13068 // pre-legalize.
13069 EVT MemVT = Ld->getMemoryVT();
// Memory types of 32 bits or more are never widened here.
13070 if ((MemVT.isSimple() && !DCI.isAfterLegalizeDAG()) ||
13071 MemVT.getSizeInBits() >= 32)
13072 return SDValue();
13073
13074 SDLoc SL(Ld);
13075
13076 assert((!MemVT.isVector() || Ld->getExtensionType() == ISD::NON_EXTLOAD) &&
13077 "unexpected vector extload");
13078
13079 // TODO: Drop only high part of range.
// Replacement load: unindexed, non-extending, i32 result and i32 memory type,
// reusing the original chain, pointer, offset, pointer info, alignment, flags
// and AA info.
13080 SDValue Ptr = Ld->getBasePtr();
13081 SDValue NewLoad = DAG.getLoad(
13082 ISD::UNINDEXED, ISD::NON_EXTLOAD, MVT::i32, SL, Ld->getChain(), Ptr,
13083 Ld->getOffset(), Ld->getPointerInfo(), MVT::i32, Ld->getAlign(),
13084 Ld->getMemOperand()->getFlags(), Ld->getAAInfo(),
13085 nullptr); // Drop ranges
13086
// TruncVT is the integer type with as many bits as the original memory type.
13087 EVT TruncVT = EVT::getIntegerVT(*DAG.getContext(), MemVT.getSizeInBits());
13088 if (MemVT.isFloatingPoint()) {
// NOTE(review): the start of this statement is not visible in this listing;
// the message suggests an assert that fp loads are not extending - confirm.
13090 "unexpected fp extload");
13091 TruncVT = MemVT.changeTypeToInteger();
13092 }
13093
// Reproduce the original extension semantics on the 32-bit value.
13094 SDValue Cvt = NewLoad;
13095 if (Ld->getExtensionType() == ISD::SEXTLOAD) {
13096 Cvt = DAG.getNode(ISD::SIGN_EXTEND_INREG, SL, MVT::i32, NewLoad,
13097 DAG.getValueType(TruncVT));
13098 } else if (Ld->getExtensionType() == ISD::ZEXTLOAD ||
// NOTE(review): the rest of this condition and the body of the final else are
// on lines not visible in this listing.
13100 Cvt = DAG.getZeroExtendInReg(NewLoad, SL, TruncVT);
13101 } else {
13103 }
13104
// IntVT is the integer type with the same width as the load's result type.
13105 EVT VT = Ld->getValueType(0);
13106 EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
13107
13108 DCI.AddToWorklist(Cvt.getNode());
13109
13110 // We may need to handle exotic cases, such as i16->i64 extloads, so insert
13111 // the appropriate extension from the 32-bit load.
13112 Cvt = getLoadExtOrTrunc(DAG, Ld->getExtensionType(), Cvt, SL, IntVT);
13113 DCI.AddToWorklist(Cvt.getNode());
13114
13115 // Handle conversion back to floating point if necessary.
13116 Cvt = DAG.getNode(ISD::BITCAST, SL, VT, Cvt);
13117
13118 return DAG.getMergeValues({Cvt, NewLoad.getValue(1)}, SL);
13119}
13120
13122 const SIMachineFunctionInfo &Info) {
13123 // TODO: Should check if the address can definitely not access stack.
13124 if (Info.isEntryFunction())
13125 return Info.getUserSGPRInfo().hasFlatScratchInit();
13126 return true;
13127}
13128
// Custom lowering for load nodes. Sub-32-bit non-extending loads are widened to
// an i32 extending load and truncated. Vector loads are split, widened,
// scalarized or expanded depending on address space, alignment and element
// count. Every path that needs no change returns an empty SDValue
// (presumably leaving the node to default handling - confirm with the caller).
13129SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
13130 SDLoc DL(Op);
13131 LoadSDNode *Load = cast<LoadSDNode>(Op);
13132 ISD::LoadExtType ExtType = Load->getExtensionType();
13133 EVT MemVT = Load->getMemoryVT();
13134 MachineMemOperand *MMO = Load->getMemOperand();
13135
13136 if (ExtType == ISD::NON_EXTLOAD && MemVT.getSizeInBits() < 32) {
// A legal i16 load needs no custom handling.
13137 if (MemVT == MVT::i16 && isTypeLegal(MVT::i16))
13138 return SDValue();
13139
13140 // FIXME: Copied from PPC
13141 // First, load into 32 bits, then truncate to 1 bit.
13142
13143 SDValue Chain = Load->getChain();
13144 SDValue BasePtr = Load->getBasePtr();
13145
// i1 is loaded through an i8 memory type; every other type that reaches this
// point uses i16.
13146 EVT RealMemVT = (MemVT == MVT::i1) ? MVT::i8 : MVT::i16;
13147
13148 SDValue NewLD = DAG.getExtLoad(ISD::EXTLOAD, DL, MVT::i32, Chain, BasePtr,
13149 RealMemVT, MMO);
13150
13151 if (!MemVT.isVector()) {
13152 SDValue Ops[] = {DAG.getNode(ISD::TRUNCATE, DL, MemVT, NewLD),
13153 NewLD.getValue(1)};
13154
13155 return DAG.getMergeValues(Ops, DL);
13156 }
13157
// Vector case: element I is bit I of the loaded word, obtained by shifting
// right by I and truncating to i1.
// NOTE(review): the declaration of `Elts` is on a line not visible here.
13159 for (unsigned I = 0, N = MemVT.getVectorNumElements(); I != N; ++I) {
13160 SDValue Elt = DAG.getNode(ISD::SRL, DL, MVT::i32, NewLD,
13161 DAG.getConstant(I, DL, MVT::i32));
13162
13163 Elts.push_back(DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Elt));
13164 }
13165
13166 SDValue Ops[] = {DAG.getBuildVector(MemVT, DL, Elts), NewLD.getValue(1)};
13167
13168 return DAG.getMergeValues(Ops, DL);
13169 }
13170
// Everything below applies to vector memory types only.
13171 if (!MemVT.isVector())
13172 return SDValue();
13173
13174 assert(Op.getValueType().getVectorElementType() == MVT::i32 &&
13175 "Custom lowering for non-i32 vectors hasn't been implemented.");
13176
13177 Align Alignment = Load->getAlign();
13178 unsigned AS = Load->getAddressSpace();
// On subtargets with the LDS-misaligned bug in WGP mode, under-aligned flat
// loads wider than 32 bits are split.
13179 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13180 AS == AMDGPUAS::FLAT_ADDRESS &&
13181 Alignment.value() < MemVT.getStoreSize() && MemVT.getSizeInBits() > 32) {
13182 return SplitVectorLoad(Op, DAG);
13183 }
13184
13185 MachineFunction &MF = DAG.getMachineFunction();
13186 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
13187 // If there is a possibility that flat instruction access scratch memory
13188 // then we need to use the same legalization rules we use for private.
// NOTE(review): the two alternatives of this conditional assignment are on
// lines not visible in this listing.
13189 if (AS == AMDGPUAS::FLAT_ADDRESS &&
13190 !Subtarget->hasMultiDwordFlatScratchAddressing())
13191 AS = addressMayBeAccessedAsPrivate(Load->getMemOperand(), *MFI)
13194
13195 unsigned NumElements = MemVT.getVectorNumElements();
13196
// NOTE(review): one line of this condition is not visible in this listing.
13197 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
13199 (AS == AMDGPUAS::GLOBAL_ADDRESS &&
13200 Subtarget->getScalarizeGlobalBehavior() && Load->isSimple() &&
13201 (Load->isInvariant() || isMemOpHasNoClobberedMemOperand(Load)))) {
// Uniform loads with at least 4-byte alignment and fewer than 32 elements are
// kept if the vector type is a power of two (or has 3 elements on subtargets
// with scalar dwordx3 loads); otherwise they are widened or split.
13202 if ((!Op->isDivergent() || AMDGPU::isUniformMMO(MMO)) &&
13203 Alignment >= Align(4) && NumElements < 32) {
13204 if (MemVT.isPow2VectorType() ||
13205 (Subtarget->hasScalarDwordx3Loads() && NumElements == 3))
13206 return SDValue();
13207 return WidenOrSplitVectorLoad(Op, DAG);
13208 }
13209 // Non-uniform loads will be selected to MUBUF instructions, so they
13210 // have the same legalization requirements as global and private
13211 // loads.
13212 //
13213 }
// NOTE(review): the remaining lines of this condition are not visible here.
13214 if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
13217 if (NumElements > 4)
13218 return SplitVectorLoad(Op, DAG);
13219 // v3 loads not supported on SI.
13220 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13221 return WidenOrSplitVectorLoad(Op, DAG);
13222
13223 // v3 and v4 loads are supported for private and global memory.
13224 return SDValue();
13225 }
13226 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13227 // Depending on the setting of the private_element_size field in the
13228 // resource descriptor, we can only make private accesses up to a certain
13229 // size.
13230 switch (Subtarget->getMaxPrivateElementSize()) {
13231 case 4: {
13232 auto [Op0, Op1] = scalarizeVectorLoad(Load, DAG);
13233 return DAG.getMergeValues({Op0, Op1}, DL);
13234 }
13235 case 8:
13236 if (NumElements > 2)
13237 return SplitVectorLoad(Op, DAG);
13238 return SDValue();
13239 case 16:
13240 // Same as global/flat
13241 if (NumElements > 4)
13242 return SplitVectorLoad(Op, DAG);
13243 // v3 loads not supported on SI.
13244 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13245 return WidenOrSplitVectorLoad(Op, DAG);
13246
13247 return SDValue();
13248 default:
13249 llvm_unreachable("unsupported private_element_size");
13250 }
13251 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
13252 unsigned Fast = 0;
13253 auto Flags = Load->getMemOperand()->getFlags();
// NOTE(review): the head of this call is on a line not visible here. The load
// is kept as is only when the check succeeds and reports Fast > 1.
13255 Load->getAlign(), Flags, &Fast) &&
13256 Fast > 1)
13257 return SDValue();
13258
13259 if (MemVT.isVector())
13260 return SplitVectorLoad(Op, DAG);
13261 }
13262
// NOTE(review): the head of this condition is on a line not visible here; when
// it holds, the load is expanded with expandUnalignedLoad.
13264 MemVT, *Load->getMemOperand())) {
13265 auto [Op0, Op1] = expandUnalignedLoad(Load, DAG);
13266 return DAG.getMergeValues({Op0, Op1}, DL);
13267 }
13268
13269 return SDValue();
13270}
13271
13272SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
13273 EVT VT = Op.getValueType();
13274 if (VT.getSizeInBits() == 128 || VT.getSizeInBits() == 256 ||
13275 VT.getSizeInBits() == 512)
13276 return splitTernaryVectorOp(Op, DAG);
13277
13278 assert(VT.getSizeInBits() == 64);
13279
13280 SDLoc DL(Op);
13281 SDValue Cond = DAG.getFreeze(Op.getOperand(0));
13282
13283 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
13284 SDValue One = DAG.getConstant(1, DL, MVT::i32);
13285
13286 SDValue LHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(1));
13287 SDValue RHS = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Op.getOperand(2));
13288
13289 SDValue Lo0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, Zero);
13290 SDValue Lo1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, Zero);
13291
13292 SDValue Lo = DAG.getSelect(DL, MVT::i32, Cond, Lo0, Lo1);
13293
13294 SDValue Hi0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, LHS, One);
13295 SDValue Hi1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, RHS, One);
13296
13297 SDValue Hi = DAG.getSelect(DL, MVT::i32, Cond, Hi0, Hi1);
13298
13299 SDValue Res = DAG.getBuildVector(MVT::v2i32, DL, {Lo, Hi});
13300 return DAG.getNode(ISD::BITCAST, DL, VT, Res);
13301}
13302
13303// Catch division cases where we can use shortcuts with rcp and rsq
13304// instructions.
13305SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
13306 SelectionDAG &DAG) const {
13307 SDLoc SL(Op);
13308 SDValue LHS = Op.getOperand(0);
13309 SDValue RHS = Op.getOperand(1);
13310 EVT VT = Op.getValueType();
13311 const SDNodeFlags Flags = Op->getFlags();
13312
13313 bool AllowInaccurateRcp = Flags.hasApproximateFuncs();
13314
13315 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
13316 // Without !fpmath accuracy information, we can't do more because we don't
13317 // know exactly whether rcp is accurate enough to meet !fpmath requirement.
13318 // f16 is always accurate enough
13319 if (!AllowInaccurateRcp && VT != MVT::f16 && VT != MVT::bf16)
13320 return SDValue();
13321
13322 if (CLHS->isOne()) {
13323 // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
13324 // the CI documentation has a worst case error of 1 ulp.
13325 // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
13326 // use it as long as we aren't trying to use denormals.
13327 //
13328 // v_rcp_f16 and v_rsq_f16 DO support denormals and 0.51ulp.
13329
13330 // 1.0 / sqrt(x) -> rsq(x)
13331
13332 // XXX - Is afn sufficient to do this for f64? The maximum ULP
13333 // error seems really high at 2^29 ULP.
13334 // 1.0 / x -> rcp(x)
13335 return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
13336 }
13337
13338 // Same as for 1.0, but expand the sign out of the constant.
13339 if (CLHS->isMinusOne()) {
13340 // -1.0 / x -> rcp (fneg x)
13341 SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
13342 return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
13343 }
13344 }
13345
13346 // For f16 and bf16 require afn or arcp.
13347 // For f32 require afn.
13348 if (!AllowInaccurateRcp &&
13349 ((VT != MVT::f16 && VT != MVT::bf16) || !Flags.hasAllowReciprocal()))
13350 return SDValue();
13351
13352 // Turn into multiply by the reciprocal.
13353 // x / y -> x * (1.0 / y)
13354 SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
13355 return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
13356}
13357
13358SDValue SITargetLowering::lowerFastUnsafeFDIV64(SDValue Op,
13359 SelectionDAG &DAG) const {
13360 SDLoc SL(Op);
13361 SDValue X = Op.getOperand(0);
13362 SDValue Y = Op.getOperand(1);
13363 EVT VT = Op.getValueType();
13364 const SDNodeFlags Flags = Op->getFlags();
13365
13366 bool AllowInaccurateDiv = Flags.hasApproximateFuncs();
13367 if (!AllowInaccurateDiv)
13368 return SDValue();
13369
13370 const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(X);
13371 bool IsNegRcp = CLHS && CLHS->isMinusOne();
13372
13373 // Pull out the negation so it folds for free into the source modifiers.
13374 if (IsNegRcp)
13375 X = DAG.getConstantFP(1.0, SL, VT);
13376
13377 SDValue NegY = IsNegRcp ? Y : DAG.getNode(ISD::FNEG, SL, VT, Y);
13378 SDValue One = DAG.getConstantFP(1.0, SL, VT);
13379
13380 SDValue R = DAG.getNode(AMDGPUISD::RCP, SL, VT, Y);
13381 if (IsNegRcp)
13382 R = DAG.getNode(ISD::FNEG, SL, VT, R);
13383
13384 SDValue Tmp0 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
13385
13386 R = DAG.getNode(ISD::FMA, SL, VT, Tmp0, R, R);
13387 SDValue Tmp1 = DAG.getNode(ISD::FMA, SL, VT, NegY, R, One);
13388 R = DAG.getNode(ISD::FMA, SL, VT, Tmp1, R, R);
13389
13390 // Skip the last 2 correction terms for reciprocal.
13391 if (IsNegRcp || (CLHS && CLHS->isOne()))
13392 return R;
13393
13394 SDValue Ret = DAG.getNode(ISD::FMUL, SL, VT, X, R);
13395 SDValue Tmp2 = DAG.getNode(ISD::FMA, SL, VT, NegY, Ret, X);
13396 return DAG.getNode(ISD::FMA, SL, VT, Tmp2, R, Ret);
13397}
13398
13399static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13400 EVT VT, SDValue A, SDValue B, SDValue GlueChain,
13401 SDNodeFlags Flags) {
13402 if (GlueChain->getNumValues() <= 1) {
13403 return DAG.getNode(Opcode, SL, VT, A, B, Flags);
13404 }
13405
13406 assert(GlueChain->getNumValues() == 3);
13407
13408 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
13409 switch (Opcode) {
13410 default:
13411 llvm_unreachable("no chain equivalent for opcode");
13412 case ISD::FMUL:
13413 Opcode = AMDGPUISD::FMUL_W_CHAIN;
13414 break;
13415 }
13416
13417 return DAG.getNode(Opcode, SL, VTList,
13418 {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
13419 Flags);
13420}
13421
13422static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
13423 EVT VT, SDValue A, SDValue B, SDValue C,
13424 SDValue GlueChain, SDNodeFlags Flags) {
13425 if (GlueChain->getNumValues() <= 1) {
13426 return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
13427 }
13428
13429 assert(GlueChain->getNumValues() == 3);
13430
13431 SDVTList VTList = DAG.getVTList(VT, MVT::Other, MVT::Glue);
13432 switch (Opcode) {
13433 default:
13434 llvm_unreachable("no chain equivalent for opcode");
13435 case ISD::FMA:
13436 Opcode = AMDGPUISD::FMA_W_CHAIN;
13437 break;
13438 }
13439
13440 return DAG.getNode(Opcode, SL, VTList,
13441 {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
13442 Flags);
13443}
13444
// Lowers a 16-bit floating-point division. The fast rcp-based lowering is tried
// first. Otherwise both operands are extended to f32. bf16 is then divided in
// f32 and rounded back. f16 uses the rcp-based sequence spelled out below and
// finishes with DIV_FIXUP on the original operands.
13445SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
13446 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13447 return FastLowered;
13448
13449 SDLoc SL(Op);
13450 EVT VT = Op.getValueType();
13451 SDValue LHS = Op.getOperand(0);
13452 SDValue RHS = Op.getOperand(1);
13453
13454 SDValue LHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, LHS);
13455 SDValue RHSExt = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, RHS);
13456
// bf16: emit an f32 FDIV carrying the original flags and round the result.
13457 if (VT == MVT::bf16) {
13458 SDValue ExtDiv =
13459 DAG.getNode(ISD::FDIV, SL, MVT::f32, LHSExt, RHSExt, Op->getFlags());
13460 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ExtDiv,
13461 DAG.getTargetConstant(0, SL, MVT::i32));
13462 }
13463
13464 assert(VT == MVT::f16);
13465
13466 // a32.u = opx(V_CVT_F32_F16, a.u); // CVT to F32
13467 // b32.u = opx(V_CVT_F32_F16, b.u); // CVT to F32
13468 // r32.u = opx(V_RCP_F32, b32.u); // rcp = 1 / d
13469 // q32.u = opx(V_MUL_F32, a32.u, r32.u); // q = n * rcp
13470 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13471 // q32.u = opx(V_MAD_F32, e32.u, r32.u, q32.u); // q = n * rcp
13472 // e32.u = opx(V_MAD_F32, (b32.u^_neg32), q32.u, a32.u); // err = -d * q + n
13473 // tmp.u = opx(V_MUL_F32, e32.u, r32.u);
13474 // tmp.u = opx(V_AND_B32, tmp.u, 0xff800000)
13475 // q32.u = opx(V_ADD_F32, tmp.u, q32.u);
13476 // q16.u = opx(V_CVT_F16_F32, q32.u);
13477 // q16.u = opx(V_DIV_FIXUP_F16, q16.u, b.u, a.u); // q = touchup(q, d, n)
13478
13479 // We will use ISD::FMA on targets that don't support ISD::FMAD.
// NOTE(review): the initializer of FMADOpCode is on a line not visible in this
// listing; per the comment above it selects between ISD::FMAD and ISD::FMA.
13480 unsigned FMADOpCode =
13482 SDValue NegRHSExt = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHSExt);
13483 SDValue Rcp =
13484 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, RHSExt, Op->getFlags());
13485 SDValue Quot =
13486 DAG.getNode(ISD::FMUL, SL, MVT::f32, LHSExt, Rcp, Op->getFlags());
// err = -d * q + n, then q = err * rcp + q, then err is recomputed.
13487 SDValue Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13488 Op->getFlags());
13489 Quot = DAG.getNode(FMADOpCode, SL, MVT::f32, Err, Rcp, Quot, Op->getFlags());
13490 Err = DAG.getNode(FMADOpCode, SL, MVT::f32, NegRHSExt, Quot, LHSExt,
13491 Op->getFlags());
// tmp = err * rcp, masked with 0xff800000 on its integer bit pattern before it
// is added to the quotient.
13492 SDValue Tmp = DAG.getNode(ISD::FMUL, SL, MVT::f32, Err, Rcp, Op->getFlags());
13493 SDValue TmpCast = DAG.getNode(ISD::BITCAST, SL, MVT::i32, Tmp);
13494 TmpCast = DAG.getNode(ISD::AND, SL, MVT::i32, TmpCast,
13495 DAG.getConstant(0xff800000, SL, MVT::i32));
13496 Tmp = DAG.getNode(ISD::BITCAST, SL, MVT::f32, TmpCast);
13497 Quot = DAG.getNode(ISD::FADD, SL, MVT::f32, Tmp, Quot, Op->getFlags());
13498 SDValue RDst = DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Quot,
13499 DAG.getTargetConstant(0, SL, MVT::i32));
// DIV_FIXUP receives the rounded quotient, the denominator and the numerator.
13500 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f16, RDst, RHS, LHS,
13501 Op->getFlags());
13502}
13503
13504// Faster 2.5 ULP division that does not support denormals.
13505SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
13506 SDNodeFlags Flags = Op->getFlags();
13507 SDLoc SL(Op);
13508 SDValue LHS = Op.getOperand(1);
13509 SDValue RHS = Op.getOperand(2);
13510
13511 // TODO: The combiner should probably handle elimination of redundant fabs.
13513 ? RHS
13514 : DAG.getNode(ISD::FABS, SL, MVT::f32, RHS, Flags);
13515
13516 const APFloat K0Val(0x1p+96f);
13517 const SDValue K0 = DAG.getConstantFP(K0Val, SL, MVT::f32);
13518
13519 const APFloat K1Val(0x1p-32f);
13520 const SDValue K1 = DAG.getConstantFP(K1Val, SL, MVT::f32);
13521
13522 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
13523
13524 EVT SetCCVT =
13525 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
13526
13527 SDValue r2 = DAG.getSetCC(SL, SetCCVT, r1, K0, ISD::SETOGT);
13528
13529 SDValue r3 = DAG.getNode(ISD::SELECT, SL, MVT::f32, r2, K1, One, Flags);
13530
13531 r1 = DAG.getNode(ISD::FMUL, SL, MVT::f32, RHS, r3, Flags);
13532
13533 // rcp does not support denormals.
13534 SDValue r0 = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, r1, Flags);
13535
13536 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f32, LHS, r0, Flags);
13537
13538 return DAG.getNode(ISD::FMUL, SL, MVT::f32, r3, Mul, Flags);
13539}
13540
13541// Returns immediate value for setting the F32 denorm mode when using the
13542// S_DENORM_MODE instruction.
13544 const SIMachineFunctionInfo *Info,
13545 const GCNSubtarget *ST) {
13546 assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
13547 uint32_t DPDenormModeDefault = Info->getMode().fpDenormModeDPValue();
13548 uint32_t Mode = SPDenormMode | (DPDenormModeDefault << 2);
13549 return DAG.getTargetConstant(Mode, SDLoc(), MVT::i32);
13550}
13551
13552SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
13553 if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
13554 return FastLowered;
13555
13556 // The selection matcher assumes anything with a chain selecting to a
13557 // mayRaiseFPException machine instruction. Since we're introducing a chain
13558 // here, we need to explicitly report nofpexcept for the regular fdiv
13559 // lowering.
13560 SDNodeFlags Flags = Op->getFlags();
13561 Flags.setNoFPExcept(true);
13562
13563 SDLoc SL(Op);
13564 SDValue LHS = Op.getOperand(0);
13565 SDValue RHS = Op.getOperand(1);
13566
13567 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
13568
13569 SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
13570
13571 SDValue DenominatorScaled =
13572 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {RHS, RHS, LHS}, Flags);
13573 SDValue NumeratorScaled =
13574 DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, {LHS, RHS, LHS}, Flags);
13575
13576 // Denominator is scaled to not be denormal, so using rcp is ok.
13577 SDValue ApproxRcp =
13578 DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32, DenominatorScaled, Flags);
13579 SDValue NegDivScale0 =
13580 DAG.getNode(ISD::FNEG, SL, MVT::f32, DenominatorScaled, Flags);
13581
13582 using namespace AMDGPU::Hwreg;
13583 const unsigned Denorm32Reg = HwregEncoding::encode(ID_MODE, 4, 2);
13584 const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
13585
13586 const MachineFunction &MF = DAG.getMachineFunction();
13587 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
13588 const DenormalMode DenormMode = Info->getMode().FP32Denormals;
13589
13590 const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
13591 const bool HasDynamicDenormals =
13592 (DenormMode.Input == DenormalMode::Dynamic) ||
13593 (DenormMode.Output == DenormalMode::Dynamic);
13594
13595 SDValue SavedDenormMode;
13596
13597 if (!PreservesDenormals) {
13598 // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
13599 // lowering. The chain dependence is insufficient, and we need glue. We do
13600 // not need the glue variants in a strictfp function.
13601
13602 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13603
13604 SDValue Glue = DAG.getEntryNode();
13605 if (HasDynamicDenormals) {
13606 SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
13607 DAG.getVTList(MVT::i32, MVT::Glue),
13608 {BitField, Glue});
13609 SavedDenormMode = SDValue(GetReg, 0);
13610
13611 Glue = DAG.getMergeValues(
13612 {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
13613 }
13614
13615 SDNode *EnableDenorm;
13616 if (Subtarget->hasDenormModeInst()) {
13617 const SDValue EnableDenormValue =
13618 getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
13619
13620 EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
13621 EnableDenormValue)
13622 .getNode();
13623 } else {
13624 const SDValue EnableDenormValue =
13625 DAG.getConstant(FP_DENORM_FLUSH_NONE, SL, MVT::i32);
13626 EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
13627 {EnableDenormValue, BitField, Glue});
13628 }
13629
13630 SDValue Ops[3] = {NegDivScale0, SDValue(EnableDenorm, 0),
13631 SDValue(EnableDenorm, 1)};
13632
13633 NegDivScale0 = DAG.getMergeValues(Ops, SL);
13634 }
13635
13636 SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
13637 ApproxRcp, One, NegDivScale0, Flags);
13638
13639 SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
13640 ApproxRcp, Fma0, Flags);
13641
13642 SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled, Fma1,
13643 Fma1, Flags);
13644
13645 SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
13646 NumeratorScaled, Mul, Flags);
13647
13648 SDValue Fma3 =
13649 getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2, Flags);
13650
13651 SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
13652 NumeratorScaled, Fma3, Flags);
13653
13654 if (!PreservesDenormals) {
13655 SDNode *DisableDenorm;
13656 if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
13657 const SDValue DisableDenormValue = getSPDenormModeValue(
13658 FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
13659
13660 SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
13661 DisableDenorm =
13662 DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
13663 Fma4.getValue(1), DisableDenormValue, Fma4.getValue(2))
13664 .getNode();
13665 } else {
13666 assert(HasDynamicDenormals == (bool)SavedDenormMode);
13667 const SDValue DisableDenormValue =
13668 HasDynamicDenormals
13669 ? SavedDenormMode
13670 : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
13671
13672 DisableDenorm = DAG.getMachineNode(
13673 AMDGPU::S_SETREG_B32, SL, MVT::Other,
13674 {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
13675 }
13676
13677 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
13678 SDValue(DisableDenorm, 0), DAG.getRoot());
13679 DAG.setRoot(OutputChain);
13680 }
13681
13682 SDValue Scale = NumeratorScaled.getValue(1);
13683 SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
13684 {Fma4, Fma1, Fma3, Scale}, Flags);
13685
13686 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
13687}
13688
13689SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
13690 if (SDValue FastLowered = lowerFastUnsafeFDIV64(Op, DAG))
13691 return FastLowered;
13692
13693 SDLoc SL(Op);
13694 SDValue X = Op.getOperand(0);
13695 SDValue Y = Op.getOperand(1);
13696
13697 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
13698
13699 SDVTList ScaleVT = DAG.getVTList(MVT::f64, MVT::i1);
13700
13701 SDValue DivScale0 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, Y, Y, X);
13702
13703 SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f64, DivScale0);
13704
13705 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f64, DivScale0);
13706
13707 SDValue Fma0 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Rcp, One);
13708
13709 SDValue Fma1 = DAG.getNode(ISD::FMA, SL, MVT::f64, Rcp, Fma0, Rcp);
13710
13711 SDValue Fma2 = DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Fma1, One);
13712
13713 SDValue DivScale1 = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT, X, Y, X);
13714
13715 SDValue Fma3 = DAG.getNode(ISD::FMA, SL, MVT::f64, Fma1, Fma2, Fma1);
13716 SDValue Mul = DAG.getNode(ISD::FMUL, SL, MVT::f64, DivScale1, Fma3);
13717
13718 SDValue Fma4 =
13719 DAG.getNode(ISD::FMA, SL, MVT::f64, NegDivScale0, Mul, DivScale1);
13720
13721 SDValue Scale;
13722
13723 if (!Subtarget->hasUsableDivScaleConditionOutput()) {
13724 // Workaround a hardware bug on SI where the condition output from div_scale
13725 // is not usable.
13726
13727 const SDValue Hi = DAG.getConstant(1, SL, MVT::i32);
13728
13729 // Figure out if the scale to use for div_fmas.
13730 SDValue NumBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
13731 SDValue DenBC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Y);
13732 SDValue Scale0BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale0);
13733 SDValue Scale1BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, DivScale1);
13734
13735 SDValue NumHi =
13736 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, NumBC, Hi);
13737 SDValue DenHi =
13738 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, DenBC, Hi);
13739
13740 SDValue Scale0Hi =
13741 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale0BC, Hi);
13742 SDValue Scale1Hi =
13743 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Scale1BC, Hi);
13744
13745 SDValue CmpDen = DAG.getSetCC(SL, MVT::i1, DenHi, Scale0Hi, ISD::SETEQ);
13746 SDValue CmpNum = DAG.getSetCC(SL, MVT::i1, NumHi, Scale1Hi, ISD::SETEQ);
13747 Scale = DAG.getNode(ISD::XOR, SL, MVT::i1, CmpNum, CmpDen);
13748 } else {
13749 Scale = DivScale1.getValue(1);
13750 }
13751
13752 SDValue Fmas =
13753 DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f64, Fma4, Fma3, Mul, Scale);
13754
13755 return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f64, Fmas, Y, X);
13756}
13757
13758SDValue SITargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const {
13759 EVT VT = Op.getValueType();
13760
13761 if (VT == MVT::f32)
13762 return LowerFDIV32(Op, DAG);
13763
13764 if (VT == MVT::f64)
13765 return LowerFDIV64(Op, DAG);
13766
13767 if (VT == MVT::f16 || VT == MVT::bf16)
13768 return LowerFDIV16(Op, DAG);
13769
13770 llvm_unreachable("Unexpected type for fdiv");
13771}
13772
13773SDValue SITargetLowering::LowerFFREXP(SDValue Op, SelectionDAG &DAG) const {
13774 SDLoc dl(Op);
13775 SDValue Val = Op.getOperand(0);
13776 EVT VT = Val.getValueType();
13777 EVT ResultExpVT = Op->getValueType(1);
13778 EVT InstrExpVT = VT == MVT::f16 ? MVT::i16 : MVT::i32;
13779
13780 SDValue Mant = DAG.getNode(
13782 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_mant, dl, MVT::i32), Val);
13783
13784 SDValue Exp = DAG.getNode(
13785 ISD::INTRINSIC_WO_CHAIN, dl, InstrExpVT,
13786 DAG.getTargetConstant(Intrinsic::amdgcn_frexp_exp, dl, MVT::i32), Val);
13787
13788 if (Subtarget->hasFractBug()) {
13789 SDValue Fabs = DAG.getNode(ISD::FABS, dl, VT, Val);
13790 SDValue Inf =
13792
13793 SDValue IsFinite = DAG.getSetCC(dl, MVT::i1, Fabs, Inf, ISD::SETOLT);
13794 SDValue Zero = DAG.getConstant(0, dl, InstrExpVT);
13795 Exp = DAG.getNode(ISD::SELECT, dl, InstrExpVT, IsFinite, Exp, Zero);
13796 Mant = DAG.getNode(ISD::SELECT, dl, VT, IsFinite, Mant, Val);
13797 }
13798
13799 SDValue CastExp = DAG.getSExtOrTrunc(Exp, dl, ResultExpVT);
13800 return DAG.getMergeValues({Mant, CastExp}, dl);
13801}
13802
13803SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
13804 SDLoc DL(Op);
13805 StoreSDNode *Store = cast<StoreSDNode>(Op);
13806 EVT VT = Store->getMemoryVT();
13807
13808 if (VT == MVT::i1) {
13809 return DAG.getTruncStore(
13810 Store->getChain(), DL,
13811 DAG.getSExtOrTrunc(Store->getValue(), DL, MVT::i32),
13812 Store->getBasePtr(), MVT::i1, Store->getMemOperand());
13813 }
13814
13815 assert(VT.isVector() &&
13816 Store->getValue().getValueType().getScalarType() == MVT::i32);
13817
13818 unsigned AS = Store->getAddressSpace();
13819 if (Subtarget->hasLDSMisalignedBugInWGPMode() &&
13820 AS == AMDGPUAS::FLAT_ADDRESS &&
13821 Store->getAlign().value() < VT.getStoreSize() &&
13822 VT.getSizeInBits() > 32) {
13823 return SplitVectorStore(Op, DAG);
13824 }
13825
13826 MachineFunction &MF = DAG.getMachineFunction();
13827 SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
13828 // If there is a possibility that flat instruction access scratch memory
13829 // then we need to use the same legalization rules we use for private.
13830 if (AS == AMDGPUAS::FLAT_ADDRESS &&
13831 !Subtarget->hasMultiDwordFlatScratchAddressing())
13832 AS = addressMayBeAccessedAsPrivate(Store->getMemOperand(), *MFI)
13835
13836 unsigned NumElements = VT.getVectorNumElements();
13838 if (NumElements > 4)
13839 return SplitVectorStore(Op, DAG);
13840 // v3 stores not supported on SI.
13841 if (NumElements == 3 && !Subtarget->hasDwordx3LoadStores())
13842 return SplitVectorStore(Op, DAG);
13843
13845 VT, *Store->getMemOperand()))
13846 return expandUnalignedStore(Store, DAG);
13847
13848 return SDValue();
13849 }
13850 if (AS == AMDGPUAS::PRIVATE_ADDRESS) {
13851 switch (Subtarget->getMaxPrivateElementSize()) {
13852 case 4:
13853 return scalarizeVectorStore(Store, DAG);
13854 case 8:
13855 if (NumElements > 2)
13856 return SplitVectorStore(Op, DAG);
13857 return SDValue();
13858 case 16:
13859 if (NumElements > 4 ||
13860 (NumElements == 3 && !Subtarget->hasFlatScratchEnabled()))
13861 return SplitVectorStore(Op, DAG);
13862 return SDValue();
13863 default:
13864 llvm_unreachable("unsupported private_element_size");
13865 }
13866 } else if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
13867 unsigned Fast = 0;
13868 auto Flags = Store->getMemOperand()->getFlags();
13870 Store->getAlign(), Flags, &Fast) &&
13871 Fast > 1)
13872 return SDValue();
13873
13874 if (VT.isVector())
13875 return SplitVectorStore(Op, DAG);
13876
13877 return expandUnalignedStore(Store, DAG);
13878 }
13879
13880 // Probably an invalid store. If so we'll end up emitting a selection error.
13881 return SDValue();
13882}
13883
13884// Avoid the full correct expansion for f32 sqrt when promoting from f16.
13885SDValue SITargetLowering::lowerFSQRTF16(SDValue Op, SelectionDAG &DAG) const {
13886 SDLoc SL(Op);
13887 assert(!Subtarget->has16BitInsts());
13888 SDNodeFlags Flags = Op->getFlags();
13889 SDValue Ext =
13890 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Op.getOperand(0), Flags);
13891
13892 SDValue SqrtID = DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, SL, MVT::i32);
13893 SDValue Sqrt =
13894 DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::f32, SqrtID, Ext, Flags);
13895
13896 return DAG.getNode(ISD::FP_ROUND, SL, MVT::f16, Sqrt,
13897 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
13898}
13899
13900SDValue SITargetLowering::lowerFSQRTF32(SDValue Op, SelectionDAG &DAG) const {
13901 SDLoc DL(Op);
13902 SDNodeFlags Flags = Op->getFlags();
13903 MVT VT = Op.getValueType().getSimpleVT();
13904 const SDValue X = Op.getOperand(0);
13905
13906 if (allowApproxFunc(DAG, Flags)) {
13907 // Instruction is 1ulp but ignores denormals.
13908 return DAG.getNode(
13910 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32), X, Flags);
13911 }
13912
13913 SDValue ScaleThreshold = DAG.getConstantFP(0x1.0p-96f, DL, VT);
13914 SDValue NeedScale = DAG.getSetCC(DL, MVT::i1, X, ScaleThreshold, ISD::SETOLT);
13915
13916 SDValue ScaleUpFactor = DAG.getConstantFP(0x1.0p+32f, DL, VT);
13917
13918 SDValue ScaledX = DAG.getNode(ISD::FMUL, DL, VT, X, ScaleUpFactor, Flags);
13919
13920 SDValue SqrtX =
13921 DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledX, X, Flags);
13922
13923 SDValue SqrtS;
13924 if (needsDenormHandlingF32(DAG, X, Flags)) {
13925 SDValue SqrtID =
13926 DAG.getTargetConstant(Intrinsic::amdgcn_sqrt, DL, MVT::i32);
13927 SqrtS = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, SqrtID, SqrtX, Flags);
13928
13929 SDValue SqrtSAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, SqrtS);
13930 SDValue SqrtSNextDownInt =
13931 DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13932 DAG.getAllOnesConstant(DL, MVT::i32));
13933 SDValue SqrtSNextDown = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextDownInt);
13934
13935 SDValue NegSqrtSNextDown =
13936 DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextDown, Flags);
13937
13938 SDValue SqrtVP =
13939 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextDown, SqrtS, SqrtX, Flags);
13940
13941 SDValue SqrtSNextUpInt = DAG.getNode(ISD::ADD, DL, MVT::i32, SqrtSAsInt,
13942 DAG.getConstant(1, DL, MVT::i32));
13943 SDValue SqrtSNextUp = DAG.getNode(ISD::BITCAST, DL, VT, SqrtSNextUpInt);
13944
13945 SDValue NegSqrtSNextUp = DAG.getNode(ISD::FNEG, DL, VT, SqrtSNextUp, Flags);
13946 SDValue SqrtVS =
13947 DAG.getNode(ISD::FMA, DL, VT, NegSqrtSNextUp, SqrtS, SqrtX, Flags);
13948
13949 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
13950 SDValue SqrtVPLE0 = DAG.getSetCC(DL, MVT::i1, SqrtVP, Zero, ISD::SETOLE);
13951
13952 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPLE0, SqrtSNextDown, SqrtS,
13953 Flags);
13954
13955 SDValue SqrtVPVSGT0 = DAG.getSetCC(DL, MVT::i1, SqrtVS, Zero, ISD::SETOGT);
13956 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, SqrtVPVSGT0, SqrtSNextUp, SqrtS,
13957 Flags);
13958 } else {
13959 SDValue SqrtR = DAG.getNode(AMDGPUISD::RSQ, DL, VT, SqrtX, Flags);
13960
13961 SqrtS = DAG.getNode(ISD::FMUL, DL, VT, SqrtX, SqrtR, Flags);
13962
13963 SDValue Half = DAG.getConstantFP(0.5f, DL, VT);
13964 SDValue SqrtH = DAG.getNode(ISD::FMUL, DL, VT, SqrtR, Half, Flags);
13965 SDValue NegSqrtH = DAG.getNode(ISD::FNEG, DL, VT, SqrtH, Flags);
13966
13967 SDValue SqrtE = DAG.getNode(ISD::FMA, DL, VT, NegSqrtH, SqrtS, Half, Flags);
13968 SqrtH = DAG.getNode(ISD::FMA, DL, VT, SqrtH, SqrtE, SqrtH, Flags);
13969 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtS, SqrtE, SqrtS, Flags);
13970
13971 SDValue NegSqrtS = DAG.getNode(ISD::FNEG, DL, VT, SqrtS, Flags);
13972 SDValue SqrtD =
13973 DAG.getNode(ISD::FMA, DL, VT, NegSqrtS, SqrtS, SqrtX, Flags);
13974 SqrtS = DAG.getNode(ISD::FMA, DL, VT, SqrtD, SqrtH, SqrtS, Flags);
13975 }
13976
13977 SDValue ScaleDownFactor = DAG.getConstantFP(0x1.0p-16f, DL, VT);
13978
13979 SDValue ScaledDown =
13980 DAG.getNode(ISD::FMUL, DL, VT, SqrtS, ScaleDownFactor, Flags);
13981
13982 SqrtS = DAG.getNode(ISD::SELECT, DL, VT, NeedScale, ScaledDown, SqrtS, Flags);
13983 SDValue IsZeroOrInf =
13984 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
13985 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
13986
13987 return DAG.getNode(ISD::SELECT, DL, VT, IsZeroOrInf, SqrtX, SqrtS, Flags);
13988}
13989
13990SDValue SITargetLowering::lowerFSQRTF64(SDValue Op, SelectionDAG &DAG) const {
13991 // For double type, the SQRT and RSQ instructions don't have required
13992 // precision, we apply Goldschmidt's algorithm to improve the result:
13993 //
13994 // y0 = rsq(x)
13995 // g0 = x * y0
13996 // h0 = 0.5 * y0
13997 //
13998 // r0 = 0.5 - h0 * g0
13999 // g1 = g0 * r0 + g0
14000 // h1 = h0 * r0 + h0
14001 //
14002 // r1 = 0.5 - h1 * g1 => d0 = x - g1 * g1
14003 // g2 = g1 * r1 + g1 g2 = d0 * h1 + g1
14004 // h2 = h1 * r1 + h1
14005 //
14006 // r2 = 0.5 - h2 * g2 => d1 = x - g2 * g2
14007 // g3 = g2 * r2 + g2 g3 = d1 * h1 + g2
14008 //
14009 // sqrt(x) = g3
14010
14011 SDNodeFlags Flags = Op->getFlags();
14012
14013 SDLoc DL(Op);
14014
14015 SDValue X = Op.getOperand(0);
14016 SDValue ZeroInt = DAG.getConstant(0, DL, MVT::i32);
14017
14018 SDValue SqrtX = X;
14019 SDValue Scaling;
14020 if (!Flags.hasApproximateFuncs()) {
14021 SDValue ScaleConstant = DAG.getConstantFP(0x1.0p-767, DL, MVT::f64);
14022 Scaling = DAG.getSetCC(DL, MVT::i1, X, ScaleConstant, ISD::SETOLT);
14023
14024 // Scale up input if it is too small.
14025 SDValue ScaleUpFactor = DAG.getConstant(256, DL, MVT::i32);
14026 SDValue ScaleUp =
14027 DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling, ScaleUpFactor, ZeroInt);
14028 SqrtX = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, X, ScaleUp, Flags);
14029 }
14030
14031 SDValue SqrtY = DAG.getNode(AMDGPUISD::RSQ, DL, MVT::f64, SqrtX);
14032
14033 SDValue SqrtS0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtX, SqrtY);
14034
14035 SDValue Half = DAG.getConstantFP(0.5, DL, MVT::f64);
14036 SDValue SqrtH0 = DAG.getNode(ISD::FMUL, DL, MVT::f64, SqrtY, Half);
14037
14038 SDValue NegSqrtH0 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtH0);
14039 SDValue SqrtR0 = DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtH0, SqrtS0, Half);
14040
14041 SDValue SqrtH1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtH0, SqrtR0, SqrtH0);
14042
14043 SDValue SqrtS1 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtS0, SqrtR0, SqrtS0);
14044
14045 SDValue NegSqrtS1 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS1);
14046 SDValue SqrtD0 =
14047 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS1, SqrtS1, SqrtX);
14048
14049 SDValue SqrtS2 = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD0, SqrtH1, SqrtS1);
14050
14051 SDValue SqrtRet = SqrtS2;
14052 if (!Flags.hasApproximateFuncs()) {
14053 SDValue NegSqrtS2 = DAG.getNode(ISD::FNEG, DL, MVT::f64, SqrtS2);
14054 SDValue SqrtD1 =
14055 DAG.getNode(ISD::FMA, DL, MVT::f64, NegSqrtS2, SqrtS2, SqrtX);
14056
14057 SqrtRet = DAG.getNode(ISD::FMA, DL, MVT::f64, SqrtD1, SqrtH1, SqrtS2);
14058
14059 SDValue ScaleDownFactor = DAG.getSignedConstant(-128, DL, MVT::i32);
14060 SDValue ScaleDown = DAG.getNode(ISD::SELECT, DL, MVT::i32, Scaling,
14061 ScaleDownFactor, ZeroInt);
14062 SqrtRet = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, SqrtRet, ScaleDown, Flags);
14063 }
14064
14065 // TODO: Check for DAZ and expand to subnormals
14066
14067 SDValue IsZeroOrInf;
14068 if (Flags.hasNoInfs()) {
14069 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
14070 IsZeroOrInf = DAG.getSetCC(DL, MVT::i1, SqrtX, Zero, ISD::SETOEQ);
14071 } else {
14072 IsZeroOrInf =
14073 DAG.getNode(ISD::IS_FPCLASS, DL, MVT::i1, SqrtX,
14074 DAG.getTargetConstant(fcZero | fcPosInf, DL, MVT::i32));
14075 }
14076
14077 // If x is +INF, +0, or -0, use its original value
14078 return DAG.getNode(ISD::SELECT, DL, MVT::f64, IsZeroOrInf, SqrtX, SqrtRet,
14079 Flags);
14080}
14081
14082SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
14083 SDLoc DL(Op);
14084 EVT VT = Op.getValueType();
14085 SDValue Arg = Op.getOperand(0);
14086 SDValue TrigVal;
14087
14088 // Propagate fast-math flags so that the multiply we introduce can be folded
14089 // if Arg is already the result of a multiply by constant.
14090 auto Flags = Op->getFlags();
14091
14092 // AMDGPUISD nodes of vector type must be unrolled here since
14093 // they will not be expanded elsewhere.
14094 auto UnrollIfVec = [&DAG](SDValue V) -> SDValue {
14095 if (!V.getValueType().isVector())
14096 return V;
14097
14098 return DAG.UnrollVectorOp(cast<SDNode>(V));
14099 };
14100
14101 SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
14102
14103 if (Subtarget->hasTrigReducedRange()) {
14104 SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
14105 TrigVal = UnrollIfVec(DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags));
14106 } else {
14107 TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
14108 }
14109
14110 switch (Op.getOpcode()) {
14111 case ISD::FCOS:
14112 TrigVal = DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
14113 break;
14114 case ISD::FSIN:
14115 TrigVal = DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
14116 break;
14117 default:
14118 llvm_unreachable("Wrong trig opcode");
14119 }
14120
14121 return UnrollIfVec(TrigVal);
14122}
14123
14124SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op,
14125 SelectionDAG &DAG) const {
14126 AtomicSDNode *AtomicNode = cast<AtomicSDNode>(Op);
14127 assert(AtomicNode->isCompareAndSwap());
14128 unsigned AS = AtomicNode->getAddressSpace();
14129
14130 // No custom lowering required for local address space
14132 return Op;
14133
14134 // Non-local address space requires custom lowering for atomic compare
14135 // and swap; cmp and swap should be in a v2i32 or v2i64 in case of _X2
14136 SDLoc DL(Op);
14137 SDValue ChainIn = Op.getOperand(0);
14138 SDValue Addr = Op.getOperand(1);
14139 SDValue Old = Op.getOperand(2);
14140 SDValue New = Op.getOperand(3);
14141 EVT VT = Op.getValueType();
14142 MVT SimpleVT = VT.getSimpleVT();
14143 MVT VecType = MVT::getVectorVT(SimpleVT, 2);
14144
14145 SDValue NewOld = DAG.getBuildVector(VecType, DL, {New, Old});
14146 SDValue Ops[] = {ChainIn, Addr, NewOld};
14147
14148 return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_CMP_SWAP, DL,
14149 Op->getVTList(), Ops, VT,
14150 AtomicNode->getMemOperand());
14151}
14152
14153//===----------------------------------------------------------------------===//
14154// Custom DAG optimizations
14155//===----------------------------------------------------------------------===//
14156
14157SDValue
14158SITargetLowering::performUCharToFloatCombine(SDNode *N,
14159 DAGCombinerInfo &DCI) const {
14160 EVT VT = N->getValueType(0);
14161 EVT ScalarVT = VT.getScalarType();
14162 if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
14163 return SDValue();
14164
14165 SelectionDAG &DAG = DCI.DAG;
14166 SDLoc DL(N);
14167
14168 SDValue Src = N->getOperand(0);
14169 EVT SrcVT = Src.getValueType();
14170
14171 // TODO: We could try to match extracting the higher bytes, which would be
14172 // easier if i8 vectors weren't promoted to i32 vectors, particularly after
14173 // types are legalized. v4i8 -> v4f32 is probably the only case to worry
14174 // about in practice.
14175 if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
14176 if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
14177 SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
14178 DCI.AddToWorklist(Cvt.getNode());
14179
14180 // For the f16 case, fold to a cast to f32 and then cast back to f16.
14181 if (ScalarVT != MVT::f32) {
14182 Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
14183 DAG.getTargetConstant(0, DL, MVT::i32));
14184 }
14185 return Cvt;
14186 }
14187 }
14188
14189 return SDValue();
14190}
14191
14192SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
14193 DAGCombinerInfo &DCI) const {
14194 SDValue MagnitudeOp = N->getOperand(0);
14195 SDValue SignOp = N->getOperand(1);
14196
14197 // The generic combine for fcopysign + fp cast is too conservative with
14198 // vectors, and also gets confused by the splitting we will perform here, so
14199 // peek through FP casts.
14200 if (SignOp.getOpcode() == ISD::FP_EXTEND ||
14201 SignOp.getOpcode() == ISD::FP_ROUND)
14202 SignOp = SignOp.getOperand(0);
14203
14204 SelectionDAG &DAG = DCI.DAG;
14205 SDLoc DL(N);
14206 EVT SignVT = SignOp.getValueType();
14207
14208 // f64 fcopysign is really an f32 copysign on the high bits, so replace the
14209 // lower half with a copy.
14210 // fcopysign f64:x, _:y -> x.lo32, (fcopysign (f32 x.hi32), _:y)
14211 EVT MagVT = MagnitudeOp.getValueType();
14212
14213 unsigned NumElts = MagVT.isVector() ? MagVT.getVectorNumElements() : 1;
14214
14215 if (MagVT.getScalarType() == MVT::f64) {
14216 EVT F32VT = MagVT.isVector()
14217 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
14218 : MVT::v2f32;
14219
14220 SDValue MagAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, MagnitudeOp);
14221
14223 for (unsigned I = 0; I != NumElts; ++I) {
14224 SDValue MagLo =
14225 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
14226 DAG.getConstant(2 * I, DL, MVT::i32));
14227 SDValue MagHi =
14228 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, MagAsVector,
14229 DAG.getConstant(2 * I + 1, DL, MVT::i32));
14230
14231 SDValue SignOpElt =
14232 MagVT.isVector()
14234 SignOp, DAG.getConstant(I, DL, MVT::i32))
14235 : SignOp;
14236
14237 SDValue HiOp =
14238 DAG.getNode(ISD::FCOPYSIGN, DL, MVT::f32, MagHi, SignOpElt);
14239
14240 SDValue Vector =
14241 DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f32, MagLo, HiOp);
14242
14243 SDValue NewElt = DAG.getNode(ISD::BITCAST, DL, MVT::f64, Vector);
14244 NewElts.push_back(NewElt);
14245 }
14246
14247 if (NewElts.size() == 1)
14248 return NewElts[0];
14249
14250 return DAG.getNode(ISD::BUILD_VECTOR, DL, MagVT, NewElts);
14251 }
14252
14253 if (SignVT.getScalarType() != MVT::f64)
14254 return SDValue();
14255
14256 // Reduce width of sign operand, we only need the highest bit.
14257 //
14258 // fcopysign f64:x, f64:y ->
14259 // fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
14260 // TODO: In some cases it might make sense to go all the way to f16.
14261
14262 EVT F32VT = MagVT.isVector()
14263 ? EVT::getVectorVT(*DAG.getContext(), MVT::f32, 2 * NumElts)
14264 : MVT::v2f32;
14265
14266 SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, F32VT, SignOp);
14267
14268 SmallVector<SDValue, 8> F32Signs;
14269 for (unsigned I = 0; I != NumElts; ++I) {
14270 // Take sign from odd elements of cast vector
14271 SDValue SignAsF32 =
14272 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
14273 DAG.getConstant(2 * I + 1, DL, MVT::i32));
14274 F32Signs.push_back(SignAsF32);
14275 }
14276
14277 SDValue NewSign =
14278 NumElts == 1
14279 ? F32Signs.back()
14281 EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumElts),
14282 F32Signs);
14283
14284 return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
14285 NewSign);
14286}
14287
14288// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
14289// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
14290// bits
14291
14292// This is a variant of
14293// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
14294//
14295// The normal DAG combiner will do this, but only if the add has one use since
14296// that would increase the number of instructions.
14297//
14298// This prevents us from seeing a constant offset that can be folded into a
14299// memory instruction's addressing mode. If we know the resulting add offset of
14300// a pointer can be folded into an addressing offset, we can replace the pointer
14301// operand with the add of new constant offset. This eliminates one of the uses,
14302// and may allow the remaining use to also be simplified.
14303//
14304SDValue SITargetLowering::performSHLPtrCombine(SDNode *N, unsigned AddrSpace,
14305 EVT MemVT,
14306 DAGCombinerInfo &DCI) const {
14307 SDValue N0 = N->getOperand(0);
14308 SDValue N1 = N->getOperand(1);
14309
14310 // We only do this to handle cases where it's profitable when there are
14311 // multiple uses of the add, so defer to the standard combine.
14312 if ((!N0->isAnyAdd() && N0.getOpcode() != ISD::OR) || N0->hasOneUse())
14313 return SDValue();
14314
14315 const ConstantSDNode *CN1 = dyn_cast<ConstantSDNode>(N1);
14316 if (!CN1)
14317 return SDValue();
14318
14319 const ConstantSDNode *CAdd = dyn_cast<ConstantSDNode>(N0.getOperand(1));
14320 if (!CAdd)
14321 return SDValue();
14322
14323 SelectionDAG &DAG = DCI.DAG;
14324
14325 if (N0->getOpcode() == ISD::OR &&
14326 !DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
14327 return SDValue();
14328
14329 // If the resulting offset is too large, we can't fold it into the
14330 // addressing mode offset.
14331 APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
14332 Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
14333
14334 AddrMode AM;
14335 AM.HasBaseReg = true;
14336 AM.BaseOffs = Offset.getSExtValue();
14337 if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
14338 return SDValue();
14339
14340 SDLoc SL(N);
14341 EVT VT = N->getValueType(0);
14342
14343 SDValue ShlX = DAG.getNode(ISD::SHL, SL, VT, N0.getOperand(0), N1);
14344 SDValue COffset = DAG.getConstant(Offset, SL, VT);
14345
14346 SDNodeFlags Flags;
14347 Flags.setNoUnsignedWrap(
14348 N->getFlags().hasNoUnsignedWrap() &&
14349 (N0.getOpcode() == ISD::OR || N0->getFlags().hasNoUnsignedWrap()));
14350
14351 // Use ISD::ADD even if the original operation was ISD::PTRADD, since we can't
14352 // be sure that the new left operand is a proper base pointer.
14353 return DAG.getNode(ISD::ADD, SL, VT, ShlX, COffset, Flags);
14354}
14355
14356/// MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset
14357/// by the chain and intrinsic ID. Theoretically we would also need to check the
14358/// specific intrinsic, but they all place the pointer operand first.
14359static unsigned getBasePtrIndex(const MemSDNode *N) {
14360 switch (N->getOpcode()) {
14361 case ISD::STORE:
14364 return 2;
14365 default:
14366 return 1;
14367 }
14368}
14369
14370SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
14371 DAGCombinerInfo &DCI) const {
14372 SelectionDAG &DAG = DCI.DAG;
14373
14374 unsigned PtrIdx = getBasePtrIndex(N);
14375 SDValue Ptr = N->getOperand(PtrIdx);
14376
14377 // TODO: We could also do this for multiplies.
14378 if (Ptr.getOpcode() == ISD::SHL) {
14379 SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), N->getAddressSpace(),
14380 N->getMemoryVT(), DCI);
14381 if (NewPtr) {
14382 SmallVector<SDValue, 8> NewOps(N->ops());
14383
14384 NewOps[PtrIdx] = NewPtr;
14385 return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
14386 }
14387 }
14388
14389 return SDValue();
14390}
14391
14392static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val) {
14393 return (Opc == ISD::AND && (Val == 0 || Val == 0xffffffff)) ||
14394 (Opc == ISD::OR && (Val == 0xffffffff || Val == 0)) ||
14395 (Opc == ISD::XOR && Val == 0);
14396}
14397
14398// Break up 64-bit bit operation of a constant into two 32-bit and/or/xor. This
14399// will typically happen anyway for a VALU 64-bit and. This exposes other 32-bit
14400// integer combine opportunities since most 64-bit operations are decomposed
14401// this way. TODO: We won't want this for SALU especially if it is an inline
14402// immediate.
14403SDValue SITargetLowering::splitBinaryBitConstantOp(
14404 DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS,
14405 const ConstantSDNode *CRHS) const {
14406 uint64_t Val = CRHS->getZExtValue();
14407 uint32_t ValLo = Lo_32(Val);
14408 uint32_t ValHi = Hi_32(Val);
14409 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14410
14411 if ((bitOpWithConstantIsReducible(Opc, ValLo) ||
14413 (CRHS->hasOneUse() && !TII->isInlineConstant(CRHS->getAPIntValue()))) {
14414 // We have 64-bit scalar and/or/xor, but do not have vector forms.
14415 if (Subtarget->has64BitLiterals() && CRHS->hasOneUse() &&
14416 !CRHS->user_begin()->isDivergent())
14417 return SDValue();
14418
14419 // If we need to materialize a 64-bit immediate, it will be split up later
14420 // anyway. Avoid creating the harder to understand 64-bit immediate
14421 // materialization.
14422 return splitBinaryBitConstantOpImpl(DCI, SL, Opc, LHS, ValLo, ValHi);
14423 }
14424
14425 return SDValue();
14426}
14427
14429 if (V.getValueType() != MVT::i1)
14430 return false;
14431 switch (V.getOpcode()) {
14432 default:
14433 break;
14434 case ISD::SETCC:
14435 case ISD::IS_FPCLASS:
14436 case AMDGPUISD::FP_CLASS:
14437 return true;
14438 case ISD::AND:
14439 case ISD::OR:
14440 case ISD::XOR:
14441 return isBoolSGPR(V.getOperand(0)) && isBoolSGPR(V.getOperand(1));
14442 case ISD::SADDO:
14443 case ISD::UADDO:
14444 case ISD::SSUBO:
14445 case ISD::USUBO:
14446 case ISD::SMULO:
14447 case ISD::UMULO:
14448 return V.getResNo() == 1;
14450 unsigned IntrinsicID = V.getConstantOperandVal(0);
14451 switch (IntrinsicID) {
14452 case Intrinsic::amdgcn_is_shared:
14453 case Intrinsic::amdgcn_is_private:
14454 return true;
14455 default:
14456 return false;
14457 }
14458
14459 return false;
14460 }
14461 }
14462 return false;
14463}
14464
// If a constant has all zeroes or all ones within each byte return it.
// Otherwise return 0.
//
// Note that an input of 0 also yields 0, so callers that test the result for
// truth treat an all-zero constant the same as a rejected one.
static uint32_t getConstantPermuteMask(uint32_t C) {
  for (unsigned Shift = 0; Shift < 32; Shift += 8) {
    const uint32_t Byte = (C >> Shift) & 0xff;
    if (Byte != 0x00 && Byte != 0xff)
      return 0; // Partial bytes selected.
  }
  return C;
}
14483
14484// Check if a node selects whole bytes from its operand 0 starting at a byte
14485// boundary while masking the rest. Returns select mask as in the v_perm_b32
14486// or -1 if not succeeded.
14487// Note byte select encoding:
14488// value 0-3 selects corresponding source byte;
14489// value 0xc selects zero;
14490// value 0xff selects 0xff.
14492 assert(V.getValueSizeInBits() == 32);
14493
14494 if (V.getNumOperands() != 2)
14495 return ~0;
14496
14497 ConstantSDNode *N1 = dyn_cast<ConstantSDNode>(V.getOperand(1));
14498 if (!N1)
14499 return ~0;
14500
14501 uint32_t C = N1->getZExtValue();
14502
14503 switch (V.getOpcode()) {
14504 default:
14505 break;
14506 case ISD::AND:
14507 if (uint32_t ConstMask = getConstantPermuteMask(C))
14508 return (0x03020100 & ConstMask) | (0x0c0c0c0c & ~ConstMask);
14509 break;
14510
14511 case ISD::OR:
14512 if (uint32_t ConstMask = getConstantPermuteMask(C))
14513 return (0x03020100 & ~ConstMask) | ConstMask;
14514 break;
14515
14516 case ISD::SHL:
14517 if (C % 8)
14518 return ~0;
14519
14520 return uint32_t((0x030201000c0c0c0cull << C) >> 32);
14521
14522 case ISD::SRL:
14523 if (C % 8)
14524 return ~0;
14525
14526 return uint32_t(0x0c0c0c0c03020100ull >> C);
14527 }
14528
14529 return ~0;
14530}
14531
14532SDValue SITargetLowering::performAndCombine(SDNode *N,
14533 DAGCombinerInfo &DCI) const {
14534 if (DCI.isBeforeLegalize())
14535 return SDValue();
14536
14537 SelectionDAG &DAG = DCI.DAG;
14538 EVT VT = N->getValueType(0);
14539 SDValue LHS = N->getOperand(0);
14540 SDValue RHS = N->getOperand(1);
14541
14542 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
14543 if (VT == MVT::i64 && CRHS) {
14544 if (SDValue Split =
14545 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::AND, LHS, CRHS))
14546 return Split;
14547 }
14548
14549 if (CRHS && VT == MVT::i32) {
14550 // and (srl x, c), mask => shl (bfe x, nb + c, mask >> nb), nb
14551 // nb = number of trailing zeroes in mask
14552 // It can be optimized out using SDWA for GFX8+ in the SDWA peephole pass,
14553 // given that we are selecting 8 or 16 bit fields starting at byte boundary.
14554 uint64_t Mask = CRHS->getZExtValue();
14555 unsigned Bits = llvm::popcount(Mask);
14556 if (getSubtarget()->hasSDWA() && LHS->getOpcode() == ISD::SRL &&
14557 (Bits == 8 || Bits == 16) && isShiftedMask_64(Mask) && !(Mask & 1)) {
14558 if (auto *CShift = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) {
14559 unsigned Shift = CShift->getZExtValue();
14560 unsigned NB = CRHS->getAPIntValue().countr_zero();
14561 unsigned Offset = NB + Shift;
14562 if ((Offset & (Bits - 1)) == 0) { // Starts at a byte or word boundary.
14563 SDLoc SL(N);
14564 SDValue BFE =
14565 DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, LHS->getOperand(0),
14566 DAG.getConstant(Offset, SL, MVT::i32),
14567 DAG.getConstant(Bits, SL, MVT::i32));
14568 EVT NarrowVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
14569 SDValue Ext = DAG.getNode(ISD::AssertZext, SL, VT, BFE,
14570 DAG.getValueType(NarrowVT));
14571 SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(LHS), VT, Ext,
14572 DAG.getConstant(NB, SDLoc(CRHS), MVT::i32));
14573 return Shl;
14574 }
14575 }
14576 }
14577
14578 // and (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
14579 if (LHS.hasOneUse() && LHS.getOpcode() == AMDGPUISD::PERM &&
14580 isa<ConstantSDNode>(LHS.getOperand(2))) {
14581 uint32_t Sel = getConstantPermuteMask(Mask);
14582 if (!Sel)
14583 return SDValue();
14584
14585 // Select 0xc for all zero bytes
14586 Sel = (LHS.getConstantOperandVal(2) & Sel) | (~Sel & 0x0c0c0c0c);
14587 SDLoc DL(N);
14588 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14589 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
14590 }
14591 }
14592
14593 // (and (fcmp ord x, x), (fcmp une (fabs x), inf)) ->
14594 // fp_class x, ~(s_nan | q_nan | n_infinity | p_infinity)
14595 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == ISD::SETCC) {
14596 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
14597 ISD::CondCode RCC = cast<CondCodeSDNode>(RHS.getOperand(2))->get();
14598
14599 SDValue X = LHS.getOperand(0);
14600 SDValue Y = RHS.getOperand(0);
14601 if (Y.getOpcode() != ISD::FABS || Y.getOperand(0) != X ||
14602 !isTypeLegal(X.getValueType()))
14603 return SDValue();
14604
14605 if (LCC == ISD::SETO) {
14606 if (X != LHS.getOperand(1))
14607 return SDValue();
14608
14609 if (RCC == ISD::SETUNE) {
14610 const ConstantFPSDNode *C1 =
14611 dyn_cast<ConstantFPSDNode>(RHS.getOperand(1));
14612 if (!C1 || !C1->isInfinity() || C1->isNegative())
14613 return SDValue();
14614
14615 const uint32_t Mask = SIInstrFlags::N_NORMAL |
14619
14620 static_assert(
14623 0x3ff) == Mask,
14624 "mask not equal");
14625
14626 SDLoc DL(N);
14627 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, X,
14628 DAG.getConstant(Mask, DL, MVT::i32));
14629 }
14630 }
14631 }
14632
14633 if (RHS.getOpcode() == ISD::SETCC && LHS.getOpcode() == AMDGPUISD::FP_CLASS)
14634 std::swap(LHS, RHS);
14635
14636 if (LHS.getOpcode() == ISD::SETCC && RHS.getOpcode() == AMDGPUISD::FP_CLASS &&
14637 RHS.hasOneUse()) {
14638 ISD::CondCode LCC = cast<CondCodeSDNode>(LHS.getOperand(2))->get();
14639 // and (fcmp seto), (fp_class x, mask) -> fp_class x, mask & ~(p_nan |
14640 // n_nan) and (fcmp setuo), (fp_class x, mask) -> fp_class x, mask & (p_nan
14641 // | n_nan)
14642 const ConstantSDNode *Mask = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
14643 if ((LCC == ISD::SETO || LCC == ISD::SETUO) && Mask &&
14644 (RHS.getOperand(0) == LHS.getOperand(0) &&
14645 LHS.getOperand(0) == LHS.getOperand(1))) {
14646 const unsigned OrdMask = SIInstrFlags::S_NAN | SIInstrFlags::Q_NAN;
14647 unsigned NewMask = LCC == ISD::SETO ? Mask->getZExtValue() & ~OrdMask
14648 : Mask->getZExtValue() & OrdMask;
14649
14650 SDLoc DL(N);
14651 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, RHS.getOperand(0),
14652 DAG.getConstant(NewMask, DL, MVT::i32));
14653 }
14654 }
14655
14656 if (VT == MVT::i32 && (RHS.getOpcode() == ISD::SIGN_EXTEND ||
14657 LHS.getOpcode() == ISD::SIGN_EXTEND)) {
14658 // and x, (sext cc from i1) => select cc, x, 0
14659 if (RHS.getOpcode() != ISD::SIGN_EXTEND)
14660 std::swap(LHS, RHS);
14661 if (isBoolSGPR(RHS.getOperand(0)))
14662 return DAG.getSelect(SDLoc(N), MVT::i32, RHS.getOperand(0), LHS,
14663 DAG.getConstant(0, SDLoc(N), MVT::i32));
14664 }
14665
14666 // and (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
14667 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
14668 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
14669 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
14670 uint32_t LHSMask = getPermuteMask(LHS);
14671 uint32_t RHSMask = getPermuteMask(RHS);
14672 if (LHSMask != ~0u && RHSMask != ~0u) {
14673 // Canonicalize the expression in an attempt to have fewer unique masks
14674 // and therefore fewer registers used to hold the masks.
14675 if (LHSMask > RHSMask) {
14676 std::swap(LHSMask, RHSMask);
14677 std::swap(LHS, RHS);
14678 }
14679
14680 // Select 0xc for each lane used from source operand. Zero has 0xc mask
14681 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
14682 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14683 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
14684
14685 // Check of we need to combine values from two sources within a byte.
14686 if (!(LHSUsedLanes & RHSUsedLanes) &&
14687 // If we select high and lower word keep it for SDWA.
14688 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
14689 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
14690 // Each byte in each mask is either selector mask 0-3, or has higher
14691 // bits set in either of masks, which can be 0xff for 0xff or 0x0c for
14692 // zero. If 0x0c is in either mask it shall always be 0x0c. Otherwise
14693 // mask which is not 0xff wins. By anding both masks we have a correct
14694 // result except that 0x0c shall be corrected to give 0x0c only.
14695 uint32_t Mask = LHSMask & RHSMask;
14696 for (unsigned I = 0; I < 32; I += 8) {
14697 uint32_t ByteSel = 0xff << I;
14698 if ((LHSMask & ByteSel) == 0x0c || (RHSMask & ByteSel) == 0x0c)
14699 Mask &= (0x0c << I) & 0xffffffff;
14700 }
14701
14702 // Add 4 to each active LHS lane. It will not affect any existing 0xff
14703 // or 0x0c.
14704 uint32_t Sel = Mask | (LHSUsedLanes & 0x04040404);
14705 SDLoc DL(N);
14706
14707 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
14708 RHS.getOperand(0),
14709 DAG.getConstant(Sel, DL, MVT::i32));
14710 }
14711 }
14712 }
14713
14714 return SDValue();
14715}
14716
14717// A key component of v_perm is a mapping between byte position of the src
14718// operands, and the byte position of the dest. To provide such, we need: 1. the
14719// node that provides x byte of the dest of the OR, and 2. the byte of the node
14720// used to provide that x byte. calculateByteProvider finds which node provides
14721// a certain byte of the dest of the OR, and calculateSrcByte takes that node,
14722// and finds an ultimate src and byte position For example: The supported
14723// LoadCombine pattern for vector loads is as follows
14724// t1
14725// or
14726// / \
14727// t2 t3
14728// zext shl
14729// | | \
14730// t4 t5 16
14731// or anyext
14732// / \ |
14733// t6 t7 t8
14734// srl shl or
14735// / | / \ / \
14736// t9 t10 t11 t12 t13 t14
14737// trunc* 8 trunc* 8 and and
14738// | | / | | \
14739// t15 t16 t17 t18 t19 t20
14740// trunc* 255 srl -256
14741// | / \
14742// t15 t15 16
14743//
14744// *In this example, the truncs are from i32->i16
14745//
14746// calculateByteProvider would find t6, t7, t13, and t14 for bytes 0-3
14747// respectively. calculateSrcByte would find (given node) -> ultimate src &
14748// byteposition: t6 -> t15 & 1, t7 -> t16 & 0, t13 -> t15 & 0, t14 -> t15 & 3.
14749// After finding the mapping, we can combine the tree into vperm t15, t16,
14750// 0x05000407
14751
14752// Find the source and byte position from a node.
14753// \p DestByte is the byte position of the dest of the or that the src
14754// ultimately provides. \p SrcIndex is the byte of the src that maps to this
14755// dest of the or byte. \p Depth tracks how many recursive iterations we have
14756// performed.
14757static const std::optional<ByteProvider<SDValue>>
14758calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
14759 unsigned Depth = 0) {
14760 // We may need to recursively traverse a series of SRLs
14761 if (Depth >= 6)
14762 return std::nullopt;
14763
14764 if (Op.getValueSizeInBits() < 8)
14765 return std::nullopt;
14766
14767 if (Op.getValueType().isVector())
14768 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
14769
14770 switch (Op->getOpcode()) {
14771 case ISD::TRUNCATE: {
14772 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14773 }
14774
14775 case ISD::ANY_EXTEND:
14776 case ISD::SIGN_EXTEND:
14777 case ISD::ZERO_EXTEND:
14779 SDValue NarrowOp = Op->getOperand(0);
14780 auto NarrowVT = NarrowOp.getValueType();
14781 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
14782 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14783 NarrowVT = VTSign->getVT();
14784 }
14785 if (!NarrowVT.isByteSized())
14786 return std::nullopt;
14787 uint64_t NarrowByteWidth = NarrowVT.getStoreSize();
14788
14789 if (SrcIndex >= NarrowByteWidth)
14790 return std::nullopt;
14791 return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
14792 }
14793
14794 case ISD::SRA:
14795 case ISD::SRL: {
14796 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14797 if (!ShiftOp)
14798 return std::nullopt;
14799
14800 uint64_t BitShift = ShiftOp->getZExtValue();
14801
14802 if (BitShift % 8 != 0)
14803 return std::nullopt;
14804
14805 uint64_t NewSrcIndex = SrcIndex + BitShift / 8;
14806 if (NewSrcIndex >= Op.getScalarValueSizeInBits() / 8)
14807 return std::nullopt;
14808
14809 return calculateSrcByte(Op->getOperand(0), DestByte, NewSrcIndex,
14810 Depth + 1);
14811 }
14812
14813 default: {
14814 return ByteProvider<SDValue>::getSrc(Op, DestByte, SrcIndex);
14815 }
14816 }
14817 llvm_unreachable("fully handled switch");
14818}
14819
14820// For a byte position in the result of an Or, traverse the tree and find the
14821// node (and the byte of the node) which ultimately provides this {Or,
14822// BytePosition}. \p Op is the operand we are currently examining. \p Index is
14823// the byte position of the Op that corresponds with the originally requested
14824// byte of the Or \p Depth tracks how many recursive iterations we have
14825// performed. \p StartingIndex is the originally requested byte of the Or
14826static const std::optional<ByteProvider<SDValue>>
14827calculateByteProvider(const SDValue &Op, unsigned Index, unsigned Depth,
14828 unsigned StartingIndex = 0) {
14829 // Finding Src tree of RHS of or typically requires at least 1 additional
14830 // depth
14831 if (Depth > 6)
14832 return std::nullopt;
14833
14834 unsigned BitWidth = Op.getScalarValueSizeInBits();
14835 if (BitWidth % 8 != 0)
14836 return std::nullopt;
14837 if (Index > BitWidth / 8 - 1)
14838 return std::nullopt;
14839
14840 bool IsVec = Op.getValueType().isVector();
14841 switch (Op.getOpcode()) {
14842 case ISD::OR: {
14843 if (IsVec)
14844 return std::nullopt;
14845
14846 auto RHS = calculateByteProvider(Op.getOperand(1), Index, Depth + 1,
14847 StartingIndex);
14848 if (!RHS)
14849 return std::nullopt;
14850 auto LHS = calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
14851 StartingIndex);
14852 if (!LHS)
14853 return std::nullopt;
14854 // A well formed Or will have two ByteProviders for each byte, one of which
14855 // is constant zero
14856 if (!LHS->isConstantZero() && !RHS->isConstantZero())
14857 return std::nullopt;
14858 if (!LHS || LHS->isConstantZero())
14859 return RHS;
14860 if (!RHS || RHS->isConstantZero())
14861 return LHS;
14862 return std::nullopt;
14863 }
14864
14865 case ISD::AND: {
14866 if (IsVec)
14867 return std::nullopt;
14868
14869 auto *BitMaskOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14870 if (!BitMaskOp)
14871 return std::nullopt;
14872
14873 uint32_t BitMask = BitMaskOp->getZExtValue();
14874 // Bits we expect for our StartingIndex
14875 uint32_t IndexMask = 0xFF << (Index * 8);
14876
14877 if ((IndexMask & BitMask) != IndexMask) {
14878 // If the result of the and partially provides the byte, then it
14879 // is not well formatted
14880 if (IndexMask & BitMask)
14881 return std::nullopt;
14883 }
14884
14885 return calculateSrcByte(Op->getOperand(0), StartingIndex, Index);
14886 }
14887
14888 case ISD::FSHR: {
14889 if (IsVec)
14890 return std::nullopt;
14891
14892 // fshr(X,Y,Z): (X << (BW - (Z % BW))) | (Y >> (Z % BW))
14893 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(2));
14894 if (!ShiftOp || Op.getValueType().isVector())
14895 return std::nullopt;
14896
14897 uint64_t BitsProvided = Op.getValueSizeInBits();
14898 if (BitsProvided % 8 != 0)
14899 return std::nullopt;
14900
14901 uint64_t BitShift = ShiftOp->getAPIntValue().urem(BitsProvided);
14902 if (BitShift % 8)
14903 return std::nullopt;
14904
14905 uint64_t ConcatSizeInBytes = BitsProvided / 4;
14906 uint64_t ByteShift = BitShift / 8;
14907
14908 uint64_t NewIndex = (Index + ByteShift) % ConcatSizeInBytes;
14909 uint64_t BytesProvided = BitsProvided / 8;
14910 SDValue NextOp = Op.getOperand(NewIndex >= BytesProvided ? 0 : 1);
14911 NewIndex %= BytesProvided;
14912 return calculateByteProvider(NextOp, NewIndex, Depth + 1, StartingIndex);
14913 }
14914
14915 case ISD::SRA:
14916 case ISD::SRL: {
14917 if (IsVec)
14918 return std::nullopt;
14919
14920 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14921 if (!ShiftOp)
14922 return std::nullopt;
14923
14924 uint64_t BitShift = ShiftOp->getZExtValue();
14925 if (BitShift % 8)
14926 return std::nullopt;
14927
14928 auto BitsProvided = Op.getScalarValueSizeInBits();
14929 if (BitsProvided % 8 != 0)
14930 return std::nullopt;
14931
14932 uint64_t BytesProvided = BitsProvided / 8;
14933 uint64_t ByteShift = BitShift / 8;
14934 if (Index + ByteShift < BytesProvided)
14935 return calculateSrcByte(Op->getOperand(0), StartingIndex,
14936 Index + ByteShift);
14937 // SRA's out-of-range bytes are sign bits, not constant zero.
14938 if (Op.getOpcode() == ISD::SRA)
14939 return std::nullopt;
14941 }
14942
14943 case ISD::SHL: {
14944 if (IsVec)
14945 return std::nullopt;
14946
14947 auto *ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
14948 if (!ShiftOp)
14949 return std::nullopt;
14950
14951 uint64_t BitShift = ShiftOp->getZExtValue();
14952 if (BitShift % 8 != 0)
14953 return std::nullopt;
14954 uint64_t ByteShift = BitShift / 8;
14955
14956 // If we are shifting by an amount greater than (or equal to)
14957 // the index we are trying to provide, then it provides 0s. If not,
14958 // then this bytes are not definitively 0s, and the corresponding byte
14959 // of interest is Index - ByteShift of the src
14960 return Index < ByteShift
14962 : calculateByteProvider(Op.getOperand(0), Index - ByteShift,
14963 Depth + 1, StartingIndex);
14964 }
14965 case ISD::ANY_EXTEND:
14966 case ISD::SIGN_EXTEND:
14967 case ISD::ZERO_EXTEND:
14969 case ISD::AssertZext:
14970 case ISD::AssertSext: {
14971 if (IsVec)
14972 return std::nullopt;
14973
14974 SDValue NarrowOp = Op->getOperand(0);
14975 unsigned NarrowBitWidth = NarrowOp.getValueSizeInBits();
14976 if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG ||
14977 Op->getOpcode() == ISD::AssertZext ||
14978 Op->getOpcode() == ISD::AssertSext) {
14979 auto *VTSign = cast<VTSDNode>(Op->getOperand(1));
14980 NarrowBitWidth = VTSign->getVT().getSizeInBits();
14981 }
14982 if (NarrowBitWidth % 8 != 0)
14983 return std::nullopt;
14984 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
14985
14986 if (Index >= NarrowByteWidth)
14987 return Op.getOpcode() == ISD::ZERO_EXTEND
14988 ? std::optional<ByteProvider<SDValue>>(
14990 : std::nullopt;
14991 return calculateByteProvider(NarrowOp, Index, Depth + 1, StartingIndex);
14992 }
14993
14994 case ISD::TRUNCATE: {
14995 if (IsVec)
14996 return std::nullopt;
14997
14998 uint64_t NarrowByteWidth = BitWidth / 8;
14999
15000 if (NarrowByteWidth >= Index) {
15001 return calculateByteProvider(Op.getOperand(0), Index, Depth + 1,
15002 StartingIndex);
15003 }
15004
15005 return std::nullopt;
15006 }
15007
15008 case ISD::CopyFromReg: {
15009 if (BitWidth / 8 > Index)
15010 return calculateSrcByte(Op, StartingIndex, Index);
15011
15012 return std::nullopt;
15013 }
15014
15015 case ISD::LOAD: {
15016 auto *L = cast<LoadSDNode>(Op.getNode());
15017
15018 unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
15019 if (NarrowBitWidth % 8 != 0)
15020 return std::nullopt;
15021 uint64_t NarrowByteWidth = NarrowBitWidth / 8;
15022
15023 // If the width of the load does not reach byte we are trying to provide for
15024 // and it is not a ZEXTLOAD, then the load does not provide for the byte in
15025 // question
15026 if (Index >= NarrowByteWidth) {
15027 return L->getExtensionType() == ISD::ZEXTLOAD
15028 ? std::optional<ByteProvider<SDValue>>(
15030 : std::nullopt;
15031 }
15032
15033 if (NarrowByteWidth > Index) {
15034 return calculateSrcByte(Op, StartingIndex, Index);
15035 }
15036
15037 return std::nullopt;
15038 }
15039
15040 case ISD::BSWAP: {
15041 if (IsVec)
15042 return std::nullopt;
15043
15044 return calculateByteProvider(Op->getOperand(0), BitWidth / 8 - Index - 1,
15045 Depth + 1, StartingIndex);
15046 }
15047
15049 auto *IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
15050 if (!IdxOp)
15051 return std::nullopt;
15052 auto VecIdx = IdxOp->getZExtValue();
15053 auto ScalarSize = Op.getScalarValueSizeInBits();
15054 if (ScalarSize < 32)
15055 Index = ScalarSize == 8 ? VecIdx : VecIdx * 2 + Index;
15056 return calculateSrcByte(ScalarSize >= 32 ? Op : Op.getOperand(0),
15057 StartingIndex, Index);
15058 }
15059
15060 case AMDGPUISD::PERM: {
15061 if (IsVec)
15062 return std::nullopt;
15063
15064 auto *PermMask = dyn_cast<ConstantSDNode>(Op->getOperand(2));
15065 if (!PermMask)
15066 return std::nullopt;
15067
15068 auto IdxMask =
15069 (PermMask->getZExtValue() & (0xFF << (Index * 8))) >> (Index * 8);
15070 if (IdxMask > 0x07 && IdxMask != 0x0c)
15071 return std::nullopt;
15072
15073 auto NextOp = Op.getOperand(IdxMask > 0x03 ? 0 : 1);
15074 auto NextIndex = IdxMask > 0x03 ? IdxMask % 4 : IdxMask;
15075
15076 return IdxMask != 0x0c ? calculateSrcByte(NextOp, StartingIndex, NextIndex)
15079 }
15080
15081 default: {
15082 return std::nullopt;
15083 }
15084 }
15085
15086 llvm_unreachable("fully handled switch");
15087}
15088
15089// Returns true if the Operand is a scalar and is 16 bits
15090static bool isExtendedFrom16Bits(SDValue &Operand) {
15091
15092 switch (Operand.getOpcode()) {
15093 case ISD::ANY_EXTEND:
15094 case ISD::SIGN_EXTEND:
15095 case ISD::ZERO_EXTEND: {
15096 auto OpVT = Operand.getOperand(0).getValueType();
15097 return !OpVT.isVector() && OpVT.getSizeInBits() == 16;
15098 }
15099 case ISD::LOAD: {
15100 LoadSDNode *L = cast<LoadSDNode>(Operand.getNode());
15101 auto ExtType = cast<LoadSDNode>(L)->getExtensionType();
15102 if (ExtType == ISD::ZEXTLOAD || ExtType == ISD::SEXTLOAD ||
15103 ExtType == ISD::EXTLOAD) {
15104 auto MemVT = L->getMemoryVT();
15105 return !MemVT.isVector() && MemVT.getSizeInBits() == 16;
15106 }
15107 return L->getMemoryVT().getSizeInBits() == 16;
15108 }
15109 default:
15110 return false;
15111 }
15112}
15113
// Returns true if the two byte selectors held in the low 16 bits of Mask pick
// consecutive bytes (high selector == low selector + 1) and the low selector
// is even, i.e. the pair lines up with a 16-bit half of its source.
static bool addresses16Bits(int Mask) {
  const int LoSel = Mask & 0xff;
  const int HiSel = (Mask & 0xff00) >> 8;

  assert(LoSel < 8 && HiSel < 8);

  // Are the bytes contiguous in the order of increasing addresses.
  if (HiSel - LoSel != 1)
    return false;

  // Is the first byte at location that is aligned for 16 bit instructions.
  // A counter example is taking 2 consecutive bytes starting at the 8th bit.
  // In this case, we still need code to extract the 16 bit operand, so it
  // is better to use i8 v_perm
  return LoSel % 2 == 0;
}
15131
15132// Do not lower into v_perm if the operands are actually 16 bit
15133// and the selected bits (based on PermMask) correspond with two
15134// easily addressable 16 bit operands.
15136 SDValue &OtherOp) {
15137 int Low16 = PermMask & 0xffff;
15138 int Hi16 = (PermMask & 0xffff0000) >> 16;
15139
15140 auto TempOp = peekThroughBitcasts(Op);
15141 auto TempOtherOp = peekThroughBitcasts(OtherOp);
15142
15143 auto OpIs16Bit =
15144 TempOp.getValueSizeInBits() == 16 || isExtendedFrom16Bits(TempOp);
15145 if (!OpIs16Bit)
15146 return true;
15147
15148 auto OtherOpIs16Bit = TempOtherOp.getValueSizeInBits() == 16 ||
15149 isExtendedFrom16Bits(TempOtherOp);
15150 if (!OtherOpIs16Bit)
15151 return true;
15152
15153 // Do we cleanly address both
15154 return !addresses16Bits(Low16) || !addresses16Bits(Hi16);
15155}
15156
15158 unsigned DWordOffset) {
15159 SDValue Ret;
15160
15161 auto TypeSize = Src.getValueSizeInBits().getFixedValue();
15162 // ByteProvider must be at least 8 bits
15163 assert(Src.getValueSizeInBits().isKnownMultipleOf(8));
15164
15165 if (TypeSize <= 32)
15166 return DAG.getBitcastedAnyExtOrTrunc(Src, SL, MVT::i32);
15167
15168 if (Src.getValueType().isVector()) {
15169 auto ScalarTySize = Src.getScalarValueSizeInBits();
15170 auto ScalarTy = Src.getValueType().getScalarType();
15171 if (ScalarTySize == 32) {
15172 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Src,
15173 DAG.getConstant(DWordOffset, SL, MVT::i32));
15174 }
15175 if (ScalarTySize > 32) {
15176 Ret = DAG.getNode(
15177 ISD::EXTRACT_VECTOR_ELT, SL, ScalarTy, Src,
15178 DAG.getConstant(DWordOffset / (ScalarTySize / 32), SL, MVT::i32));
15179 auto ShiftVal = 32 * (DWordOffset % (ScalarTySize / 32));
15180 if (ShiftVal)
15181 Ret = DAG.getNode(ISD::SRL, SL, Ret.getValueType(), Ret,
15182 DAG.getConstant(ShiftVal, SL, MVT::i32));
15183 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
15184 }
15185
15186 assert(ScalarTySize < 32);
15187 auto NumElements = TypeSize / ScalarTySize;
15188 auto Trunc32Elements = (ScalarTySize * NumElements) / 32;
15189 auto NormalizedTrunc = Trunc32Elements * 32 / ScalarTySize;
15190 auto NumElementsIn32 = 32 / ScalarTySize;
15191 auto NumAvailElements = DWordOffset < Trunc32Elements
15192 ? NumElementsIn32
15193 : NumElements - NormalizedTrunc;
15194
15196 DAG.ExtractVectorElements(Src, VecSrcs, DWordOffset * NumElementsIn32,
15197 NumAvailElements);
15198
15199 Ret = DAG.getBuildVector(
15200 MVT::getVectorVT(MVT::getIntegerVT(ScalarTySize), NumAvailElements), SL,
15201 VecSrcs);
15202 return Ret = DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
15203 }
15204
15205 /// Scalar Type
15206 auto ShiftVal = 32 * DWordOffset;
15207 Ret = DAG.getNode(ISD::SRL, SL, Src.getValueType(), Src,
15208 DAG.getConstant(ShiftVal, SL, MVT::i32));
15209 return DAG.getBitcastedAnyExtOrTrunc(Ret, SL, MVT::i32);
15210}
15211
15213 SelectionDAG &DAG = DCI.DAG;
15214 [[maybe_unused]] EVT VT = N->getValueType(0);
15216
15217 // VT is known to be MVT::i32, so we need to provide 4 bytes.
15218 assert(VT == MVT::i32);
15219 for (int i = 0; i < 4; i++) {
15220 // Find the ByteProvider that provides the ith byte of the result of OR
15221 std::optional<ByteProvider<SDValue>> P =
15222 calculateByteProvider(SDValue(N, 0), i, 0, /*StartingIndex = */ i);
15223 // TODO support constantZero
15224 if (!P || P->isConstantZero())
15225 return SDValue();
15226
15227 PermNodes.push_back(*P);
15228 }
15229 if (PermNodes.size() != 4)
15230 return SDValue();
15231
15232 std::pair<unsigned, unsigned> FirstSrc(0, PermNodes[0].SrcOffset / 4);
15233 std::optional<std::pair<unsigned, unsigned>> SecondSrc;
15234 uint64_t PermMask = 0x00000000;
15235 for (size_t i = 0; i < PermNodes.size(); i++) {
15236 auto PermOp = PermNodes[i];
15237 // Since the mask is applied to Src1:Src2, Src1 bytes must be offset
15238 // by sizeof(Src2) = 4
15239 int SrcByteAdjust = 4;
15240
15241 // If the Src uses a byte from a different DWORD, then it corresponds
15242 // with a difference source
15243 if (!PermOp.hasSameSrc(PermNodes[FirstSrc.first]) ||
15244 ((PermOp.SrcOffset / 4) != FirstSrc.second)) {
15245 if (SecondSrc)
15246 if (!PermOp.hasSameSrc(PermNodes[SecondSrc->first]) ||
15247 ((PermOp.SrcOffset / 4) != SecondSrc->second))
15248 return SDValue();
15249
15250 // Set the index of the second distinct Src node
15251 SecondSrc = {i, PermNodes[i].SrcOffset / 4};
15252 assert(!(PermNodes[SecondSrc->first].Src->getValueSizeInBits() % 8));
15253 SrcByteAdjust = 0;
15254 }
15255 assert((PermOp.SrcOffset % 4) + SrcByteAdjust < 8);
15257 PermMask |= ((PermOp.SrcOffset % 4) + SrcByteAdjust) << (i * 8);
15258 }
15259 SDLoc DL(N);
15260 SDValue Op = *PermNodes[FirstSrc.first].Src;
15261 Op = getDWordFromOffset(DAG, DL, Op, FirstSrc.second);
15262 assert(Op.getValueSizeInBits() == 32);
15263
15264 // Check that we are not just extracting the bytes in order from an op
15265 if (!SecondSrc) {
15266 int Low16 = PermMask & 0xffff;
15267 int Hi16 = (PermMask & 0xffff0000) >> 16;
15268
15269 bool WellFormedLow = (Low16 == 0x0504) || (Low16 == 0x0100);
15270 bool WellFormedHi = (Hi16 == 0x0706) || (Hi16 == 0x0302);
15271
15272 // The perm op would really just produce Op. So combine into Op
15273 if (WellFormedLow && WellFormedHi)
15274 return DAG.getBitcast(MVT::getIntegerVT(32), Op);
15275 }
15276
15277 SDValue OtherOp = SecondSrc ? *PermNodes[SecondSrc->first].Src : Op;
15278
15279 if (SecondSrc) {
15280 OtherOp = getDWordFromOffset(DAG, DL, OtherOp, SecondSrc->second);
15281 assert(OtherOp.getValueSizeInBits() == 32);
15282 }
15283
15284 // Check that we haven't just recreated the same FSHR node.
15285 if (N->getOpcode() == ISD::FSHR &&
15286 (N->getOperand(0) == Op || N->getOperand(0) == OtherOp) &&
15287 (N->getOperand(1) == Op || N->getOperand(1) == OtherOp))
15288 return SDValue();
15289
15290 if (hasNon16BitAccesses(PermMask, Op, OtherOp)) {
15291
15292 assert(Op.getValueType().isByteSized() &&
15293 OtherOp.getValueType().isByteSized());
15294
15295 // If the ultimate src is less than 32 bits, then we will only be
15296 // using bytes 0: Op.getValueSizeInBytes() - 1 in the or.
15297 // CalculateByteProvider would not have returned Op as source if we
15298 // used a byte that is outside its ValueType. Thus, we are free to
15299 // ANY_EXTEND as the extended bits are dont-cares.
15300 Op = DAG.getBitcastedAnyExtOrTrunc(Op, DL, MVT::i32);
15301 OtherOp = DAG.getBitcastedAnyExtOrTrunc(OtherOp, DL, MVT::i32);
15302
15303 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, Op, OtherOp,
15304 DAG.getConstant(PermMask, DL, MVT::i32));
15305 }
15306 return SDValue();
15307}
15308
// Target-specific DAG combine for an OR node.
//
// Folds attempted, in order:
//  * i1:    or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2);
//           every other i1 OR is left alone.
//  * or of an AMDGPUISD::PERM with a constant -> a PERM with a merged selector.
//  * i32, divergent, both operands single-use, V_PERM_B32 available:
//           merge the two operands' permute masks into one PERM, otherwise
//           fall back to matchPERM.
//  * v2i32: or of two build_vectors that together rebuild the same source
//           vector -> that source vector.
//  * i64 (only once operations are legalized): split an OR with a
//           zero-extended i32 into a 32-bit OR plus the untouched high half,
//           or split an OR with a constant via splitBinaryBitConstantOp.
//
// Returns the replacement value, or an empty SDValue when nothing applies.
//
// NOTE(review): the numbers at the start of each line are listing line
// numbers, not code. Two source lines are absent from this listing; they are
// flagged where they occur below.
15309SDValue SITargetLowering::performOrCombine(SDNode *N,
15310 DAGCombinerInfo &DCI) const {
15311 SelectionDAG &DAG = DCI.DAG;
15312 SDValue LHS = N->getOperand(0);
15313 SDValue RHS = N->getOperand(1);
15314
15315 EVT VT = N->getValueType(0);
15316 if (VT == MVT::i1) {
15317 // or (fp_class x, c1), (fp_class x, c2) -> fp_class x, (c1 | c2)
15318 if (LHS.getOpcode() == AMDGPUISD::FP_CLASS &&
15319 RHS.getOpcode() == AMDGPUISD::FP_CLASS) {
15320 SDValue Src = LHS.getOperand(0);
15321 if (Src != RHS.getOperand(0))
15322 return SDValue();
15323
15324 const ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
15325 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS.getOperand(1));
15326 if (!CLHS || !CRHS)
15327 return SDValue();
15328
15329 // Only 10 bits are used.
15330 static const uint32_t MaxMask = 0x3ff;
15331
15332 uint32_t NewMask =
15333 (CLHS->getZExtValue() | CRHS->getZExtValue()) & MaxMask;
15334 SDLoc DL(N);
15335 return DAG.getNode(AMDGPUISD::FP_CLASS, DL, MVT::i1, Src,
15336 DAG.getConstant(NewMask, DL, MVT::i32));
15337 }
15338
15339 return SDValue();
15340 }
15341
15342 // or (perm x, y, c1), c2 -> perm x, y, permute_mask(c1, c2)
// NOTE(review): listing line 15343, which opens this `if` condition, is
// absent here. The body reads operand 1 of N as a constant, so the missing
// line presumably checks that RHS is a ConstantSDNode -- confirm against the
// real source.
15344 LHS.getOpcode() == AMDGPUISD::PERM &&
15345 isa<ConstantSDNode>(LHS.getOperand(2))) {
15346 uint32_t Sel = getConstantPermuteMask(N->getConstantOperandVal(1));
15347 if (!Sel)
15348 return SDValue();
15349
15350 Sel |= LHS.getConstantOperandVal(2);
15351 SDLoc DL(N);
15352 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
15353 LHS.getOperand(1), DAG.getConstant(Sel, DL, MVT::i32));
15354 }
15355
15356 // or (op x, c1), (op y, c2) -> perm x, y, permute_mask(c1, c2)
15357 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
15358 if (VT == MVT::i32 && LHS.hasOneUse() && RHS.hasOneUse() &&
15359 N->isDivergent() && TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
15360
15361 // If all the uses of an or need to extract the individual elements, do not
15362 // attempt to lower into v_perm
15363 auto usesCombinedOperand = [](SDNode *OrUse) {
15364 // If we have any non-vectorized use, then it is a candidate for v_perm
15365 if (OrUse->getOpcode() != ISD::BITCAST ||
15366 !OrUse->getValueType(0).isVector())
15367 return true;
15368
15369 // If any user of the vector bitcast itself produces a non-vector value,
// then it is a candidate for v_perm
15370 for (auto *VUser : OrUse->users()) {
15371 if (!VUser->getValueType(0).isVector())
15372 return true;
15373
15374 // If the use of a vector is a store, then combining via a v_perm
15375 // is beneficial.
15376 // TODO -- whitelist more uses
15377 for (auto VectorwiseOp : {ISD::STORE, ISD::CopyToReg, ISD::CopyFromReg})
15378 if (VUser->getOpcode() == VectorwiseOp)
15379 return true;
15380 }
15381 return false;
15382 };
15383
15384 if (!any_of(N->users(), usesCombinedOperand))
15385 return SDValue();
15386
// A mask of ~0u is tested below as "no permute mask for this operand".
15387 uint32_t LHSMask = getPermuteMask(LHS);
15388 uint32_t RHSMask = getPermuteMask(RHS);
15389
15390 if (LHSMask != ~0u && RHSMask != ~0u) {
15391 // Canonicalize the expression in an attempt to have fewer unique masks
15392 // and therefore fewer registers used to hold the masks.
15393 if (LHSMask > RHSMask) {
15394 std::swap(LHSMask, RHSMask);
15395 std::swap(LHS, RHS);
15396 }
15397
15398 // Select 0xc for each lane used from source operand. Zero has 0xc mask
15399 // set, 0xff have 0xff in the mask, actual lanes are in the 0-3 range.
15400 uint32_t LHSUsedLanes = ~(LHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15401 uint32_t RHSUsedLanes = ~(RHSMask & 0x0c0c0c0c) & 0x0c0c0c0c;
15402
15403 // Check if we need to combine values from two sources within a byte.
15404 if (!(LHSUsedLanes & RHSUsedLanes) &&
15405 // If we select high and lower word keep it for SDWA.
15406 // TODO: teach SDWA to work with v_perm_b32 and remove the check.
15407 !(LHSUsedLanes == 0x0c0c0000 && RHSUsedLanes == 0x00000c0c)) {
15408 // Kill zero bytes selected by other mask. Zero value is 0xc.
15409 LHSMask &= ~RHSUsedLanes;
15410 RHSMask &= ~LHSUsedLanes;
15411 // Add 4 to each active LHS lane
15412 LHSMask |= LHSUsedLanes & 0x04040404;
15413 // Combine masks
15414 uint32_t Sel = LHSMask | RHSMask;
15415 SDLoc DL(N);
15416
15417 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, LHS.getOperand(0),
15418 RHS.getOperand(0),
15419 DAG.getConstant(Sel, DL, MVT::i32));
15420 }
15421 }
// At least one operand has no permute mask: try the byte-provider based
// matcher instead.
15422 if (LHSMask == ~0u || RHSMask == ~0u) {
15423 if (SDValue Perm = matchPERM(N, DCI))
15424 return Perm;
15425 }
15426 }
15427
15428 // Detect identity v2i32 OR and replace with identity source node.
15429 // Specifically an Or that has operands constructed from the same source node
15430 // via extract_vector_elt and build_vector. I.E.
15431 // v2i32 or(
15432 // v2i32 build_vector(
15433 // i32 extract_elt(%IdentitySrc, 0),
15434 // i32 0
15435 // ),
15436 // v2i32 build_vector(
15437 // i32 0,
15438 // i32 extract_elt(%IdentitySrc, 1)
15439 // ) )
15440 // =>
15441 // v2i32 %IdentitySrc
15442
15443 if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
15444 RHS->getOpcode() == ISD::BUILD_VECTOR) {
15445
15446 ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
15447 ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
15448
15449 // Test for and normalise build vectors.
15450 if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
15451
15452 // Get the extract_vector_element operands.
15453 SDValue LEVE = LHS->getOperand(0);
15454 SDValue REVE = RHS->getOperand(1);
15455
15456 if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
// NOTE(review): listing line 15457, the remainder of this condition, is
// absent here. The code below reads REVE's operands the same way as LEVE's,
// so it presumably applies the same opcode test to REVE -- confirm.
15458 // Check that different elements from the same vector are
15459 // extracted.
15460 if (LEVE->getOperand(0) == REVE->getOperand(0) &&
15461 LEVE->getOperand(1) != REVE->getOperand(1)) {
15462 SDValue IdentitySrc = LEVE.getOperand(0);
15463 return IdentitySrc;
15464 }
15465 }
15466 }
15467 }
15468
// Everything below applies to i64 only, and only after operation
// legalization.
15469 if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
15470 return SDValue();
15471
15472 // TODO: This could be a generic combine with a predicate for extracting the
15473 // high half of an integer being free.
15474
15475 // (or i64:x, (zero_extend i32:y)) ->
15476 // i64 (bitcast (v2i32 build_vector (or i32:y, lo_32(x)), hi_32(x)))
15477 if (LHS.getOpcode() == ISD::ZERO_EXTEND &&
15478 RHS.getOpcode() != ISD::ZERO_EXTEND)
15479 std::swap(LHS, RHS);
15480
15481 if (RHS.getOpcode() == ISD::ZERO_EXTEND) {
15482 SDValue ExtSrc = RHS.getOperand(0);
15483 EVT SrcVT = ExtSrc.getValueType();
15484 if (SrcVT == MVT::i32) {
15485 SDLoc SL(N);
15486 auto [LowLHS, HiBits] = split64BitValue(LHS, DAG);
15487 SDValue LowOr = DAG.getNode(ISD::OR, SL, MVT::i32, LowLHS, ExtSrc);
15488
15489 DCI.AddToWorklist(LowOr.getNode());
15490 DCI.AddToWorklist(HiBits.getNode());
15491
15492 SDValue Vec =
15493 DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, LowOr, HiBits);
15494 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
15495 }
15496 }
15497
// Re-read the operands from N: the local LHS/RHS may have been swapped above.
15498 const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(N->getOperand(1));
15499 if (CRHS) {
15500 if (SDValue Split = splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::OR,
15501 N->getOperand(0), CRHS))
15502 return Split;
15503 }
15504
15505 return SDValue();
15506}
15507
// Target-specific DAG combine for a XOR node.
//
// Folds attempted, in order:
//  * whatever reassociateScalarOps returns for N;
//  * i64 xor with a constant (or constant splat): splitBinaryBitConstantOp;
//  * xor of a (v2i32 vselect | i64 select) with a sign-mask constant: push
//    the xor into both select arms;
//  * i32 xor of a select with a sign-mask constant: rewrite as a select of
//    fneg'd f32 values, bitcast back to the original type.
//
// Returns the replacement value, or an empty SDValue when nothing applies.
//
// NOTE(review): the numbers at the start of each line are listing line
// numbers, not code. One source line is absent from this listing; it is
// flagged where it occurs below.
15508SDValue SITargetLowering::performXorCombine(SDNode *N,
15509 DAGCombinerInfo &DCI) const {
15510 if (SDValue RV = reassociateScalarOps(N, DCI.DAG))
15511 return RV;
15512
15513 SDValue LHS = N->getOperand(0);
15514 SDValue RHS = N->getOperand(1);
15515
15516 const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
15517 SelectionDAG &DAG = DCI.DAG;
15518
15519 EVT VT = N->getValueType(0);
15520 if (CRHS && VT == MVT::i64) {
15521 if (SDValue Split =
15522 splitBinaryBitConstantOp(DCI, SDLoc(N), ISD::XOR, LHS, CRHS))
15523 return Split;
15524 }
15525
15526 // v2i32 (xor (vselect cc, x, y), K) ->
15527 // (v2i32 vselect cc, (xor x, K), (xor y, K)) This enables the xor to be
15528 // replaced with source modifiers when the select is lowered to CNDMASK.
15529 unsigned Opc = LHS.getOpcode();
15530 if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
15531 (Opc == ISD::SELECT && VT == MVT::i64)) &&
15532 CRHS && CRHS->getAPIntValue().isSignMask()) {
15533 SDValue CC = LHS->getOperand(0);
15534 SDValue TRUE = LHS->getOperand(1);
15535 SDValue FALSE = LHS->getOperand(2);
15536 SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
15537 SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
// NOTE(review): the rebuilt node is always ISD::VSELECT, even when the
// matched node was ISD::SELECT (the i64 case accepted above). Confirm this
// is intended rather than reusing Opc.
15538 SDValue XSelect =
15539 DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
15540 return XSelect;
15541 }
15542
15543 // Make sure to apply the 64-bit constant splitting fold before trying to fold
15544 // fneg-like xors into 64-bit select.
15545 if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
15546 // This looks like an fneg, try to fold as a source modifier.
15547 if (CRHS && CRHS->getAPIntValue().isSignMask() &&
// NOTE(review): listing line 15548, the last term of this condition and its
// opening brace, is absent here; the extra predicate cannot be determined
// from this listing.
15549 // xor (select c, a, b), 0x80000000 ->
15550 // bitcast (select c, (fneg (bitcast a)), (fneg (bitcast b)))
15551 SDLoc DL(N);
15552 SDValue CastLHS =
15553 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(1));
15554 SDValue CastRHS =
15555 DAG.getNode(ISD::BITCAST, DL, MVT::f32, LHS->getOperand(2));
15556 SDValue FNegLHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastLHS);
15557 SDValue FNegRHS = DAG.getNode(ISD::FNEG, DL, MVT::f32, CastRHS);
15558 SDValue NewSelect = DAG.getNode(ISD::SELECT, DL, MVT::f32,
15559 LHS->getOperand(0), FNegLHS, FNegRHS);
15560 return DAG.getNode(ISD::BITCAST, DL, VT, NewSelect);
15561 }
15562 }
15563
15564 return SDValue();
15565}
15566
15567SDValue
15568SITargetLowering::performZeroOrAnyExtendCombine(SDNode *N,
15569 DAGCombinerInfo &DCI) const {
15570 if (!Subtarget->has16BitInsts() ||
15571 DCI.getDAGCombineLevel() < AfterLegalizeTypes)
15572 return SDValue();
15573
15574 EVT VT = N->getValueType(0);
15575 if (VT != MVT::i32)
15576 return SDValue();
15577
15578 SDValue Src = N->getOperand(0);
15579 if (Src.getValueType() != MVT::i16)
15580 return SDValue();
15581
15582 if (!Src->hasOneUse())
15583 return SDValue();
15584
15585 // TODO: We bail out below if SrcOffset is not in the first dword (>= 4). It's
15586 // possible we're missing out on some combine opportunities, but we'd need to
15587 // weigh the cost of extracting the byte from the upper dwords.
15588
15589 std::optional<ByteProvider<SDValue>> BP0 =
15590 calculateByteProvider(SDValue(N, 0), 0, 0, 0);
15591 if (!BP0 || BP0->SrcOffset >= 4 || !BP0->Src)
15592 return SDValue();
15593 SDValue V0 = *BP0->Src;
15594
15595 std::optional<ByteProvider<SDValue>> BP1 =
15596 calculateByteProvider(SDValue(N, 0), 1, 0, 1);
15597 if (!BP1 || BP1->SrcOffset >= 4 || !BP1->Src)
15598 return SDValue();
15599
15600 SDValue V1 = *BP1->Src;
15601
15602 if (V0 == V1)
15603 return SDValue();
15604
15605 SelectionDAG &DAG = DCI.DAG;
15606 SDLoc DL(N);
15607 uint32_t PermMask = 0x0c0c0c0c;
15608 if (V0) {
15609 V0 = DAG.getBitcastedAnyExtOrTrunc(V0, DL, MVT::i32);
15610 PermMask = (PermMask & ~0xFF) | (BP0->SrcOffset + 4);
15611 }
15612
15613 if (V1) {
15614 V1 = DAG.getBitcastedAnyExtOrTrunc(V1, DL, MVT::i32);
15615 PermMask = (PermMask & ~(0xFF << 8)) | (BP1->SrcOffset << 8);
15616 }
15617
15618 return DAG.getNode(AMDGPUISD::PERM, DL, MVT::i32, V0, V1,
15619 DAG.getConstant(PermMask, DL, MVT::i32));
15620}
15621
15622SDValue
15623SITargetLowering::performSignExtendInRegCombine(SDNode *N,
15624 DAGCombinerInfo &DCI) const {
15625 SDValue Src = N->getOperand(0);
15626 auto *VTSign = cast<VTSDNode>(N->getOperand(1));
15627
15628 // Combine s_buffer_load_u8 or s_buffer_load_u16 with sext and replace them
15629 // with s_buffer_load_i8 and s_buffer_load_i16 respectively.
15630 if (((Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE &&
15631 VTSign->getVT() == MVT::i8) ||
15632 (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_USHORT &&
15633 VTSign->getVT() == MVT::i16))) {
15634 assert(Subtarget->hasScalarSubwordLoads() &&
15635 "s_buffer_load_{u8, i8} are supported "
15636 "in GFX12 (or newer) architectures.");
15637 EVT VT = Src.getValueType();
15638 unsigned Opc = (Src.getOpcode() == AMDGPUISD::SBUFFER_LOAD_UBYTE)
15639 ? AMDGPUISD::SBUFFER_LOAD_BYTE
15640 : AMDGPUISD::SBUFFER_LOAD_SHORT;
15641 SDLoc DL(N);
15642 SDVTList ResList = DCI.DAG.getVTList(MVT::i32);
15643 SDValue Ops[] = {
15644 Src.getOperand(0), // source register
15645 Src.getOperand(1), // offset
15646 Src.getOperand(2) // cachePolicy
15647 };
15648 auto *M = cast<MemSDNode>(Src);
15649 SDValue BufferLoad = DCI.DAG.getMemIntrinsicNode(
15650 Opc, DL, ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15651 SDValue LoadVal = DCI.DAG.getNode(ISD::TRUNCATE, DL, VT, BufferLoad);
15652 return LoadVal;
15653 }
15654 if (((Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE &&
15655 VTSign->getVT() == MVT::i8) ||
15656 (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_USHORT &&
15657 VTSign->getVT() == MVT::i16)) &&
15658 Src.hasOneUse()) {
15659 auto *M = cast<MemSDNode>(Src);
15660 SDValue Ops[] = {Src.getOperand(0), // Chain
15661 Src.getOperand(1), // rsrc
15662 Src.getOperand(2), // vindex
15663 Src.getOperand(3), // voffset
15664 Src.getOperand(4), // soffset
15665 Src.getOperand(5), // offset
15666 Src.getOperand(6), Src.getOperand(7)};
15667 // replace with BUFFER_LOAD_BYTE/SHORT
15668 SDVTList ResList =
15669 DCI.DAG.getVTList(MVT::i32, Src.getOperand(0).getValueType());
15670 unsigned Opc = (Src.getOpcode() == AMDGPUISD::BUFFER_LOAD_UBYTE)
15671 ? AMDGPUISD::BUFFER_LOAD_BYTE
15672 : AMDGPUISD::BUFFER_LOAD_SHORT;
15673 SDValue BufferLoadSignExt = DCI.DAG.getMemIntrinsicNode(
15674 Opc, SDLoc(N), ResList, Ops, M->getMemoryVT(), M->getMemOperand());
15675 return DCI.DAG.getMergeValues(
15676 {BufferLoadSignExt, BufferLoadSignExt.getValue(1)}, SDLoc(N));
15677 }
15678 return SDValue();
15679}
15680
15681SDValue SITargetLowering::performClassCombine(SDNode *N,
15682 DAGCombinerInfo &DCI) const {
15683 SelectionDAG &DAG = DCI.DAG;
15684 SDValue Mask = N->getOperand(1);
15685
15686 // fp_class x, 0 -> false
15687 if (isNullConstant(Mask))
15688 return DAG.getConstant(0, SDLoc(N), MVT::i1);
15689
15690 if (N->getOperand(0).isUndef())
15691 return DAG.getUNDEF(MVT::i1);
15692
15693 return SDValue();
15694}
15695
// Target-specific DAG combine for a reciprocal node.
//
//  * rcp undef -> quiet NaN constant of the result type.
//  * f16 rcp (fsqrt x), with the allow-contract flag on both nodes -> rsq x,
//    carrying N's flags.
//
// NOTE(review): the numbers at the start of each line are listing line
// numbers, not code. The function's final statement is absent from this
// listing (flagged below).
15696SDValue SITargetLowering::performRcpCombine(SDNode *N,
15697 DAGCombinerInfo &DCI) const {
15698 EVT VT = N->getValueType(0);
15699 SDValue N0 = N->getOperand(0);
15700
15701 if (N0.isUndef()) {
15702 return DCI.DAG.getConstantFP(APFloat::getQNaN(VT.getFltSemantics()),
15703 SDLoc(N), VT);
15704 }
15705
15706 // TODO: Could handle f32 + amdgcn.sqrt but probably never reaches here.
15707 if ((VT == MVT::f16 && N0.getOpcode() == ISD::FSQRT) &&
15708 N->getFlags().hasAllowContract() && N0->getFlags().hasAllowContract()) {
15709 return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT, N0.getOperand(0),
15710 N->getFlags());
15711 }
15712
// NOTE(review): listing line 15713, the fall-through return statement, is
// absent here; what is returned when neither fold applies cannot be
// determined from this listing.
15714}
15715
// SelectionDAG query: conservatively decides whether the floating-point value
// Op is already canonical, looking through at most MaxDepth levels of
// operands.
//
// Summary of the visible logic:
//  * FCANONICALIZE results are canonical.
//  * FP constants are canonical unless they are signaling NaNs, or denormals
//    while the function's denormal mode for that type is not IEEE.
//  * With MaxDepth == 0 the answer is false for everything else.
//  * A fixed list of opcodes is accepted outright; sign-bit operations,
//    selects, vector build/extract/insert and bitcasts defer to their
//    operands; min/max-like operations are accepted when the subtarget
//    supports min/max denorm modes or denormals are enabled for the type, and
//    otherwise defer to their operands.
//  * Anything unhandled is accepted only if denormals are enabled for the
//    type and the value cannot be a signaling NaN (or UserFlags has no-NaNs).
//
// NOTE(review): the numbers at the start of each line are listing line
// numbers, not code. Listing line 15716 (the first line of the signature,
// including the function name and leading parameters) and several `case`
// labels are absent from this listing; they are flagged below. The recursive
// calls pass three arguments although the visible signature tail has a
// UserFlags parameter -- presumably an overload or default declared elsewhere
// supplies it; confirm against the header.
15717 SDNodeFlags UserFlags,
15718 unsigned MaxDepth) const {
15719 unsigned Opcode = Op.getOpcode();
15720 if (Opcode == ISD::FCANONICALIZE)
15721 return true;
15722
15723 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
15724 const auto &F = CFP->getValueAPF();
15725 if (F.isNaN() && F.isSignaling())
15726 return false;
15727 if (!F.isDenormal())
15728 return true;
15729
// A denormal constant only counts as canonical when the denormal mode is
// IEEE.
15730 DenormalMode Mode =
15731 DAG.getMachineFunction().getDenormalMode(F.getSemantics());
15732 return Mode == DenormalMode::getIEEE();
15733 }
15734
15735 // If source is a result of another standard FP operation it is already in
15736 // canonical form.
// Recursion budget exhausted: give the conservative answer.
15737 if (MaxDepth == 0)
15738 return false;
15739
15740 switch (Opcode) {
15741 // These will flush denorms if required.
15742 case ISD::FADD:
15743 case ISD::FSUB:
15744 case ISD::FMUL:
15745 case ISD::FCEIL:
15746 case ISD::FFLOOR:
15747 case ISD::FMA:
15748 case ISD::FMAD:
15749 case ISD::FSQRT:
15750 case ISD::FDIV:
15751 case ISD::FREM:
15752 case ISD::FP_ROUND:
15753 case ISD::FP_EXTEND:
15754 case ISD::FP16_TO_FP:
15755 case ISD::FP_TO_FP16:
15756 case ISD::BF16_TO_FP:
15757 case ISD::FP_TO_BF16:
15758 case ISD::FLDEXP:
15759 case AMDGPUISD::FMUL_LEGACY:
15760 case AMDGPUISD::FMAD_FTZ:
15761 case AMDGPUISD::RCP:
15762 case AMDGPUISD::RSQ:
15763 case AMDGPUISD::RSQ_CLAMP:
15764 case AMDGPUISD::RCP_LEGACY:
15765 case AMDGPUISD::RCP_IFLAG:
15766 case AMDGPUISD::LOG:
15767 case AMDGPUISD::EXP:
15768 case AMDGPUISD::DIV_SCALE:
15769 case AMDGPUISD::DIV_FMAS:
15770 case AMDGPUISD::DIV_FIXUP:
15771 case AMDGPUISD::FRACT:
15772 case AMDGPUISD::CVT_PKRTZ_F16_F32:
15773 case AMDGPUISD::CVT_F32_UBYTE0:
15774 case AMDGPUISD::CVT_F32_UBYTE1:
15775 case AMDGPUISD::CVT_F32_UBYTE2:
15776 case AMDGPUISD::CVT_F32_UBYTE3:
15777 case AMDGPUISD::FP_TO_FP16:
15778 case AMDGPUISD::SIN_HW:
15779 case AMDGPUISD::COS_HW:
15780 return true;
15781
15782 // It can/will be lowered or combined as a bit operation.
15783 // Need to check their input recursively to handle.
15784 case ISD::FNEG:
15785 case ISD::FABS:
15786 case ISD::FCOPYSIGN:
15787 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15788
15789 case ISD::AND:
15790 if (Op.getValueType() == MVT::i32) {
15791 // Be careful as we only know it is a bitcast floating point type. It
15792 // could be f32, v2f16, we have no way of knowing. Luckily the constant
15793 // value that we optimize for, which comes up in fp32 to bf16 conversions,
15794 // is valid to optimize for all types.
15795 if (auto *RHS = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
15796 if (RHS->getZExtValue() == 0xffff0000) {
15797 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15798 }
15799 }
15800 }
15801 break;
15802
15803 case ISD::FSIN:
15804 case ISD::FCOS:
15805 case ISD::FSINCOS:
15806 return Op.getValueType().getScalarType() != MVT::f16;
15807
15808 case ISD::FMINNUM:
15809 case ISD::FMAXNUM:
15810 case ISD::FMINNUM_IEEE:
15811 case ISD::FMAXNUM_IEEE:
15812 case ISD::FMINIMUM:
15813 case ISD::FMAXIMUM:
15814 case ISD::FMINIMUMNUM:
15815 case ISD::FMAXIMUMNUM:
15816 case AMDGPUISD::CLAMP:
15817 case AMDGPUISD::FMED3:
15818 case AMDGPUISD::FMAX3:
15819 case AMDGPUISD::FMIN3:
15820 case AMDGPUISD::FMAXIMUM3:
15821 case AMDGPUISD::FMINIMUM3: {
15822 // FIXME: Shouldn't treat the generic operations different based these.
15823 // However, we aren't really required to flush the result from
15824 // minnum/maxnum..
15825
15826 // snans will be quieted, so we only need to worry about denormals.
15827 if (Subtarget->supportsMinMaxDenormModes() ||
15828 // FIXME: denormalsEnabledForType is broken for dynamic
15829 denormalsEnabledForType(DAG, Op.getValueType()))
15830 return true;
15831
15832 // Flushing may be required.
15833 // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. For such
15834 // targets need to check their input recursively.
15835
15836 // FIXME: Does this apply with clamp? It's implemented with max.
15837 for (unsigned I = 0, E = Op.getNumOperands(); I != E; ++I) {
15838 if (!isCanonicalized(DAG, Op.getOperand(I), MaxDepth - 1))
15839 return false;
15840 }
15841
15842 return true;
15843 }
15844 case ISD::SELECT: {
15845 return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1) &&
15846 isCanonicalized(DAG, Op.getOperand(2), MaxDepth - 1);
15847 }
15848 case ISD::BUILD_VECTOR: {
15849 for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
15850 SDValue SrcOp = Op.getOperand(i);
15851 if (!isCanonicalized(DAG, SrcOp, MaxDepth - 1))
15852 return false;
15853 }
15854
15855 return true;
15856 }
// NOTE(review): listing lines 15857-15858 (the case label(s) for the return
// below) are absent here. The body defers to operand 0 only.
15859 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15860 }
// NOTE(review): listing line 15861 (the case label for the return below) is
// absent here. The body requires operands 0 and 1 to both be canonical.
15862 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1) &&
15863 isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1);
15864 }
15865 case ISD::UNDEF:
15866 // Could be anything.
15867 return false;
15868
15869 case ISD::BITCAST:
15870 // TODO: This is incorrect as it loses track of the operand's type. We may
15871 // end up effectively bitcasting from f32 to v2f16 or vice versa, and the
15872 // same bits that are canonicalized in one type need not be in the other.
15873 return isCanonicalized(DAG, Op.getOperand(0), MaxDepth - 1);
15874 case ISD::TRUNCATE: {
15875 // Hack round the mess we make when legalizing extract_vector_elt
15876 if (Op.getValueType() == MVT::i16) {
15877 SDValue TruncSrc = Op.getOperand(0);
15878 if (TruncSrc.getValueType() == MVT::i32 &&
15879 TruncSrc.getOpcode() == ISD::BITCAST &&
15880 TruncSrc.getOperand(0).getValueType() == MVT::v2f16) {
15881 return isCanonicalized(DAG, TruncSrc.getOperand(0), MaxDepth - 1);
15882 }
15883 }
15884 return false;
15885 }
// NOTE(review): listing line 15886 (the case label opening this block) is
// absent here. The block reads operand 0 as an intrinsic ID.
15887 unsigned IntrinsicID = Op.getConstantOperandVal(0);
15888 // TODO: Handle more intrinsics
15889 switch (IntrinsicID) {
15890 case Intrinsic::amdgcn_cvt_pkrtz:
15891 case Intrinsic::amdgcn_cubeid:
15892 case Intrinsic::amdgcn_frexp_mant:
15893 case Intrinsic::amdgcn_fdot2:
15894 case Intrinsic::amdgcn_rcp:
15895 case Intrinsic::amdgcn_rsq:
15896 case Intrinsic::amdgcn_rsq_clamp:
15897 case Intrinsic::amdgcn_rcp_legacy:
15898 case Intrinsic::amdgcn_rsq_legacy:
15899 case Intrinsic::amdgcn_trig_preop:
15900 case Intrinsic::amdgcn_tanh:
15901 case Intrinsic::amdgcn_log:
15902 case Intrinsic::amdgcn_exp2:
15903 case Intrinsic::amdgcn_sqrt:
15904 return true;
15905 default:
15906 break;
15907 }
15908
15909 break;
15910 }
15911 default:
15912 break;
15913 }
15914
// Fallback for every opcode that reached a `break` above.
15915 // FIXME: denormalsEnabledForType is broken for dynamic
15916 return denormalsEnabledForType(DAG, Op.getValueType()) &&
15917 (UserFlags.hasNoNaNs() || DAG.isKnownNeverSNaN(Op));
15918}
15919
// Machine-IR (generic opcode) query: conservatively decides whether the value
// defined for Reg is already canonical, looking through at most MaxDepth
// levels of defining instructions.
//
// Summary of the visible logic:
//  * G_FCANONICALIZE results are canonical.
//  * FP constants / constant splats are canonical unless signaling, or
//    denormal while the function's denormal mode for that type is not IEEE.
//  * With MaxDepth == 0 the answer is false for everything else.
//  * A fixed list of opcodes and intrinsics is accepted outright; sign-bit
//    operations defer to their source; min/max operations are accepted when
//    the subtarget supports min/max denorm modes or denormals are enabled for
//    the type, and otherwise (like G_BUILD_VECTOR) require every operand
//    after the first to be canonical.
//  * Everything else is rejected.
//
// NOTE(review): the numbers at the start of each line are listing line
// numbers, not code. Listing line 15920 (the first line of the signature,
// including the function name and the Reg/MF parameters used below) is absent
// from this listing.
15921 unsigned MaxDepth) const {
15922 const MachineRegisterInfo &MRI = MF.getRegInfo();
// NOTE(review): MI is dereferenced without a null check -- presumably every
// Reg passed here has a defining instruction; verify against callers.
15923 MachineInstr *MI = MRI.getVRegDef(Reg);
15924 unsigned Opcode = MI->getOpcode();
15925
15926 if (Opcode == AMDGPU::G_FCANONICALIZE)
15927 return true;
15928
15929 std::optional<FPValueAndVReg> FCR;
15930 // Constant splat (can be padded with undef) or scalar constant.
15931 if (mi_match(Reg, MRI, MIPatternMatch::m_GFCstOrSplat(FCR))) {
15932 if (FCR->Value.isSignaling())
15933 return false;
15934 if (!FCR->Value.isDenormal())
15935 return true;
15936
// A denormal constant only counts as canonical when the denormal mode is
// IEEE.
15937 DenormalMode Mode = MF.getDenormalMode(FCR->Value.getSemantics());
15938 return Mode == DenormalMode::getIEEE();
15939 }
15940
// Recursion budget exhausted: give the conservative answer.
15941 if (MaxDepth == 0)
15942 return false;
15943
15944 switch (Opcode) {
15945 case AMDGPU::G_FADD:
15946 case AMDGPU::G_FSUB:
15947 case AMDGPU::G_FMUL:
15948 case AMDGPU::G_FCEIL:
15949 case AMDGPU::G_FFLOOR:
15950 case AMDGPU::G_FRINT:
15951 case AMDGPU::G_FNEARBYINT:
15952 case AMDGPU::G_INTRINSIC_FPTRUNC_ROUND:
15953 case AMDGPU::G_INTRINSIC_TRUNC:
15954 case AMDGPU::G_INTRINSIC_ROUNDEVEN:
15955 case AMDGPU::G_FMA:
15956 case AMDGPU::G_FMAD:
15957 case AMDGPU::G_FSQRT:
15958 case AMDGPU::G_FDIV:
15959 case AMDGPU::G_FREM:
15960 case AMDGPU::G_FPOW:
15961 case AMDGPU::G_FPEXT:
15962 case AMDGPU::G_FLOG:
15963 case AMDGPU::G_FLOG2:
15964 case AMDGPU::G_FLOG10:
15965 case AMDGPU::G_FPTRUNC:
15966 case AMDGPU::G_AMDGPU_RCP_IFLAG:
15967 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
15968 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
15969 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
15970 case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
15971 return true;
// Sign-bit operations: only the first source operand is inspected.
15972 case AMDGPU::G_FNEG:
15973 case AMDGPU::G_FABS:
15974 case AMDGPU::G_FCOPYSIGN:
15975 return isCanonicalized(MI->getOperand(1).getReg(), MF, MaxDepth - 1);
15976 case AMDGPU::G_FMINNUM:
15977 case AMDGPU::G_FMAXNUM:
15978 case AMDGPU::G_FMINNUM_IEEE:
15979 case AMDGPU::G_FMAXNUM_IEEE:
15980 case AMDGPU::G_FMINIMUM:
15981 case AMDGPU::G_FMAXIMUM:
15982 case AMDGPU::G_FMINIMUMNUM:
15983 case AMDGPU::G_FMAXIMUMNUM: {
15984 if (Subtarget->supportsMinMaxDenormModes() ||
15985 // FIXME: denormalsEnabledForType is broken for dynamic
15986 denormalsEnabledForType(MRI.getType(Reg), MF))
15987 return true;
15988
// Otherwise share the operand walk with G_BUILD_VECTOR below.
15989 [[fallthrough]];
15990 }
15991 case AMDGPU::G_BUILD_VECTOR:
// drop_begin skips operand 0; every remaining operand must be canonical.
15992 for (const MachineOperand &MO : llvm::drop_begin(MI->operands()))
15993 if (!isCanonicalized(MO.getReg(), MF, MaxDepth - 1))
15994 return false;
15995 return true;
15996 case AMDGPU::G_INTRINSIC:
15997 case AMDGPU::G_INTRINSIC_CONVERGENT:
15998 switch (cast<GIntrinsic>(MI)->getIntrinsicID()) {
15999 case Intrinsic::amdgcn_fmul_legacy:
16000 case Intrinsic::amdgcn_fmad_ftz:
16001 case Intrinsic::amdgcn_sqrt:
16002 case Intrinsic::amdgcn_fmed3:
16003 case Intrinsic::amdgcn_sin:
16004 case Intrinsic::amdgcn_cos:
16005 case Intrinsic::amdgcn_log:
16006 case Intrinsic::amdgcn_exp2:
16007 case Intrinsic::amdgcn_log_clamp:
16008 case Intrinsic::amdgcn_rcp:
16009 case Intrinsic::amdgcn_rcp_legacy:
16010 case Intrinsic::amdgcn_rsq:
16011 case Intrinsic::amdgcn_rsq_clamp:
16012 case Intrinsic::amdgcn_rsq_legacy:
16013 case Intrinsic::amdgcn_div_scale:
16014 case Intrinsic::amdgcn_div_fmas:
16015 case Intrinsic::amdgcn_div_fixup:
16016 case Intrinsic::amdgcn_fract:
16017 case Intrinsic::amdgcn_cvt_pkrtz:
16018 case Intrinsic::amdgcn_cubeid:
16019 case Intrinsic::amdgcn_cubema:
16020 case Intrinsic::amdgcn_cubesc:
16021 case Intrinsic::amdgcn_cubetc:
16022 case Intrinsic::amdgcn_frexp_mant:
16023 case Intrinsic::amdgcn_fdot2:
16024 case Intrinsic::amdgcn_trig_preop:
16025 case Intrinsic::amdgcn_tanh:
16026 return true;
16027 default:
16028 break;
16029 }
16030
// Unlisted intrinsics are treated like any other unknown opcode.
16031 [[fallthrough]];
16032 default:
16033 return false;
16034 }
16035
16036 llvm_unreachable("invalid operation");
16037}
16038
16039// Constant fold canonicalize.
16040SDValue SITargetLowering::getCanonicalConstantFP(SelectionDAG &DAG,
16041 const SDLoc &SL, EVT VT,
16042 const APFloat &C) const {
16043 // Flush denormals to 0 if not enabled.
16044 if (C.isDenormal()) {
16045 DenormalMode Mode =
16046 DAG.getMachineFunction().getDenormalMode(C.getSemantics());
16047 if (Mode == DenormalMode::getPreserveSign()) {
16048 return DAG.getConstantFP(
16049 APFloat::getZero(C.getSemantics(), C.isNegative()), SL, VT);
16050 }
16051
16052 if (Mode != DenormalMode::getIEEE())
16053 return SDValue();
16054 }
16055
16056 if (C.isNaN()) {
16057 APFloat CanonicalQNaN = APFloat::getQNaN(C.getSemantics());
16058 if (C.isSignaling()) {
16059 // Quiet a signaling NaN.
16060 // FIXME: Is this supposed to preserve payload bits?
16061 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
16062 }
16063
16064 // Make sure it is the canonical NaN bitpattern.
16065 //
16066 // TODO: Can we use -1 as the canonical NaN value since it's an inline
16067 // immediate?
16068 if (C.bitcastToAPInt() != CanonicalQNaN.bitcastToAPInt())
16069 return DAG.getConstantFP(CanonicalQNaN, SL, VT);
16070 }
16071
16072 // Already canonical.
16073 return DAG.getConstantFP(C, SL, VT);
16074}
16075
// Helper body: true when Op is undef or a floating-point constant node.
// NOTE(review): listing line 16076, the function header for this body, is
// absent from this listing, so the helper's name and signature cannot be
// confirmed from here. The leading numbers are listing line numbers, not code.
16077 return Op.isUndef() || isa<ConstantFPSDNode>(Op);
16078}
16079
// Target-specific DAG combine for FCANONICALIZE.
//
//  * fcanonicalize undef -> quiet NaN constant.
//  * fcanonicalize of a constant or constant splat -> the folded constant
//    from getCanonicalConstantFP.
//  * fcanonicalize of a two-element v2f16 build_vector (when v2f16 is legal)
//    -> a build_vector of per-element canonicalized values, with undef
//    elements replaced as described below.
//
// Returns the replacement value, or an empty SDValue when nothing applies.
//
// NOTE(review): the numbers at the start of each line are listing line
// numbers, not code. Two source lines are absent from this listing; they are
// flagged where they occur below.
16080SDValue
16081SITargetLowering::performFCanonicalizeCombine(SDNode *N,
16082 DAGCombinerInfo &DCI) const {
16083 SelectionDAG &DAG = DCI.DAG;
16084 SDValue N0 = N->getOperand(0);
16085 EVT VT = N->getValueType(0);
16086
16087 // fcanonicalize undef -> qnan
16088 if (N0.isUndef()) {
// NOTE(review): listing line 16089, which defines QNaN, is absent here.
16090 return DAG.getConstantFP(QNaN, SDLoc(N), VT);
16091 }
16092
16093 if (ConstantFPSDNode *CFP = isConstOrConstSplatFP(N0)) {
// NOTE(review): this VT shadows the identical outer VT and looks redundant.
16094 EVT VT = N->getValueType(0);
16095 return getCanonicalConstantFP(DAG, SDLoc(N), VT, CFP->getValueAPF());
16096 }
16097
16098 // fcanonicalize (build_vector x, k) -> build_vector (fcanonicalize x),
16099 // (fcanonicalize k)
16100 //
16101 // fcanonicalize (build_vector x, undef) -> build_vector (fcanonicalize x), 0
16102
16103 // TODO: This could be better with wider vectors that will be split to v2f16,
16104 // and to consider uses since there aren't that many packed operations.
16105 if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
16106 isTypeLegal(MVT::v2f16)) {
16107 SDLoc SL(N);
16108 SDValue NewElts[2];
16109 SDValue Lo = N0.getOperand(0);
16110 SDValue Hi = N0.getOperand(1);
16111 EVT EltVT = Lo.getValueType();
16112
// NOTE(review): listing line 16113, which opens the block closed at listing
// line 16144 (presumably an `if` over Lo/Hi, which are otherwise unused), is
// absent here -- confirm against the real source.
16114 for (unsigned I = 0; I != 2; ++I) {
16115 SDValue Op = N0.getOperand(I);
16116 if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
16117 NewElts[I] =
16118 getCanonicalConstantFP(DAG, SL, EltVT, CFP->getValueAPF());
16119 } else if (Op.isUndef()) {
16120 // Handled below based on what the other operand is.
16121 NewElts[I] = Op;
16122 } else {
16123 NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
16124 }
16125 }
16126
16127 // If one half is undef, and one is constant, prefer a splat vector rather
16128 // than the normal qNaN. If it's a register, prefer 0.0 since that's
16129 // cheaper to use and may be free with a packed operation.
16130 if (NewElts[0].isUndef()) {
// NOTE(review): this inner isa<> test repeats the ternary's condition, so
// the 0.0 alternative is unreachable for element 0 and an undef element 0
// stays undef when element 1 is not a constant. The element-1 case below has
// no such guard -- confirm the asymmetry is intended.
16131 if (isa<ConstantFPSDNode>(NewElts[1]))
16132 NewElts[0] = isa<ConstantFPSDNode>(NewElts[1])
16133 ? NewElts[1]
16134 : DAG.getConstantFP(0.0f, SL, EltVT);
16135 }
16136
16137 if (NewElts[1].isUndef()) {
16138 NewElts[1] = isa<ConstantFPSDNode>(NewElts[0])
16139 ? NewElts[0]
16140 : DAG.getConstantFP(0.0f, SL, EltVT);
16141 }
16142
16143 return DAG.getBuildVector(VT, SL, NewElts);
16144 }
16145 }
16146
16147 return SDValue();
16148}
16149
16150static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
16151 switch (Opc) {
16152 case ISD::FMAXNUM:
16153 case ISD::FMAXNUM_IEEE:
16154 case ISD::FMAXIMUMNUM:
16155 return AMDGPUISD::FMAX3;
16156 case ISD::FMAXIMUM:
16157 return AMDGPUISD::FMAXIMUM3;
16158 case ISD::SMAX:
16159 return AMDGPUISD::SMAX3;
16160 case ISD::UMAX:
16161 return AMDGPUISD::UMAX3;
16162 case ISD::FMINNUM:
16163 case ISD::FMINNUM_IEEE:
16164 case ISD::FMINIMUMNUM:
16165 return AMDGPUISD::FMIN3;
16166 case ISD::FMINIMUM:
16167 return AMDGPUISD::FMINIMUM3;
16168 case ISD::SMIN:
16169 return AMDGPUISD::SMIN3;
16170 case ISD::UMIN:
16171 return AMDGPUISD::UMIN3;
16172 default:
16173 llvm_unreachable("Not a min/max opcode");
16174 }
16175}
16176
16177SDValue SITargetLowering::performIntMed3ImmCombine(SelectionDAG &DAG,
16178 const SDLoc &SL, SDValue Src,
16179 SDValue MinVal,
16180 SDValue MaxVal,
16181 bool Signed) const {
16182
16183 // med3 comes from
16184 // min(max(x, K0), K1), K0 < K1
16185 // max(min(x, K0), K1), K1 < K0
16186 //
16187 // "MinVal" and "MaxVal" respectively refer to the rhs of the
16188 // min/max op.
16189 ConstantSDNode *MinK = dyn_cast<ConstantSDNode>(MinVal);
16190 ConstantSDNode *MaxK = dyn_cast<ConstantSDNode>(MaxVal);
16191
16192 if (!MinK || !MaxK)
16193 return SDValue();
16194
16195 if (Signed) {
16196 if (MaxK->getAPIntValue().sge(MinK->getAPIntValue()))
16197 return SDValue();
16198 } else {
16199 if (MaxK->getAPIntValue().uge(MinK->getAPIntValue()))
16200 return SDValue();
16201 }
16202
16203 EVT VT = MinK->getValueType(0);
16204 unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
16205 if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16()))
16206 return DAG.getNode(Med3Opc, SL, VT, Src, MaxVal, MinVal);
16207
16208 // Note: we could also extend to i32 and use i32 med3 if i16 med3 is
16209 // not available, but this is unlikely to be profitable as constants
16210 // will often need to be materialized & extended, especially on
16211 // pre-GFX10 where VOP3 instructions couldn't take literal operands.
16212 return SDValue();
16213}
16214
16217 return C;
16218
      // Otherwise use the constant FP splat node of BV, when it has one.
16220 if (ConstantFPSDNode *C = BV->getConstantFPSplatNode())
16221 return C;
16222 }
16223
      // None of the checks above produced a constant.
16224 return nullptr;
16225}
16226
16227SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
16228 const SDLoc &SL, SDValue Op0,
16229 SDValue Op1,
16230 bool IsKnownNoNaNs) const {
16231 ConstantFPSDNode *K1 = getSplatConstantFP(Op1);
16232 if (!K1)
16233 return SDValue();
16234
16235 ConstantFPSDNode *K0 = getSplatConstantFP(Op0.getOperand(1));
16236 if (!K0)
16237 return SDValue();
16238
16239 // Ordered >= (although NaN inputs should have folded away by now).
16240 if (K0->getValueAPF() > K1->getValueAPF())
16241 return SDValue();
16242
16243 // med3 with a nan input acts like
16244 // v_min_f32(v_min_f32(S0.f32, S1.f32), S2.f32)
16245 //
16246 // So the result depends on whether the IEEE mode bit is enabled or not with a
16247 // signaling nan input.
16248 // ieee=1
16249 // s0 snan: yields s2
16250 // s1 snan: yields s2
16251 // s2 snan: qnan
16252
16253 // s0 qnan: min(s1, s2)
16254 // s1 qnan: min(s0, s2)
16255 // s2 qnan: min(s0, s1)
16256
16257 // ieee=0
16258 // s0 snan: min(s1, s2)
16259 // s1 snan: min(s0, s2)
16260 // s2 snan: qnan
16261
16262 // s0 qnan: min(s1, s2)
16263 // s1 qnan: min(s0, s2)
16264 // s2 qnan: min(s0, s1)
16265 const MachineFunction &MF = DAG.getMachineFunction();
16266 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16267
16268 // TODO: Check IEEE bit enabled. We can form fmed3 with IEEE=0 regardless of
16269 // whether the input is a signaling nan if op0 is fmaximum or fmaximumnum. We
16270 // can only form if op0 is fmaxnum_ieee if IEEE=1.
16271 EVT VT = Op0.getValueType();
16272 if (Info->getMode().DX10Clamp) {
16273 // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
16274 // hardware fmed3 behavior converting to a min.
16275 // FIXME: Should this be allowing -0.0?
16276 if (K1->isOne() && K0->isPosZero())
16277 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
16278 }
16279
16280 // med3 for f16 is only available on gfx9+, and not available for v2f16.
16281 if (VT == MVT::f32 || (VT == MVT::f16 && Subtarget->hasMed3_16())) {
16282 // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
16283 // signaling NaN gives a quiet NaN. The quiet NaN input to the min would
16284 // then give the other result, which is different from med3 with a NaN
16285 // input.
16286 SDValue Var = Op0.getOperand(0);
16287 if (!IsKnownNoNaNs && !DAG.isKnownNeverSNaN(Var))
16288 return SDValue();
16289
16290 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
16291
16292 if ((!K0->hasOneUse() || TII->isInlineConstant(K0->getValueAPF())) &&
16293 (!K1->hasOneUse() || TII->isInlineConstant(K1->getValueAPF()))) {
16294 return DAG.getNode(AMDGPUISD::FMED3, SL, K0->getValueType(0), Var,
16295 SDValue(K0, 0), SDValue(K1, 0));
16296 }
16297 }
16298
16299 return SDValue();
16300}
16301
16302/// \return true if the subtarget supports minimum3 and maximum3 with the given
16303/// base min/max opcode \p Opc for type \p VT.
16304static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc,
16305 EVT VT) {
16306 switch (Opc) {
16307 case ISD::FMINNUM:
16308 case ISD::FMAXNUM:
16309 case ISD::FMINNUM_IEEE:
16310 case ISD::FMAXNUM_IEEE:
16311 case ISD::FMINIMUMNUM:
16312 case ISD::FMAXIMUMNUM:
16313 case AMDGPUISD::FMIN_LEGACY:
16314 case AMDGPUISD::FMAX_LEGACY:
16315 return (VT == MVT::f32) || (VT == MVT::f16 && Subtarget.hasMin3Max3_16()) ||
16316 (VT == MVT::v2f16 && Subtarget.hasMin3Max3PKF16());
16317 case ISD::FMINIMUM:
16318 case ISD::FMAXIMUM:
16319 return (VT == MVT::f32 && Subtarget.hasMinimum3Maximum3F32()) ||
16320 (VT == MVT::f16 && Subtarget.hasMinimum3Maximum3F16()) ||
16321 (VT == MVT::v2f16 && Subtarget.hasMinimum3Maximum3PKF16());
16322 case ISD::SMAX:
16323 case ISD::SMIN:
16324 case ISD::UMAX:
16325 case ISD::UMIN:
16326 return (VT == MVT::i32) || (VT == MVT::i16 && Subtarget.hasMin3Max3_16());
16327 default:
16328 return false;
16329 }
16330
16331 llvm_unreachable("not a min/max opcode");
16332}
16333
/// Target combine for a two-operand min/max node \p N. In order, it tries to
/// form a three-operand min3/max3, drop a redundant umin around an ffbh, form
/// an integer med3, form an FP clamp/med3, and finally replace a no-NaNs
/// fminimum/fmaximum with a different opcode. Returns an empty SDValue when
/// nothing applies.
16334SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
16335 DAGCombinerInfo &DCI) const {
16336 SelectionDAG &DAG = DCI.DAG;
16337
16338 EVT VT = N->getValueType(0);
16339 unsigned Opc = N->getOpcode();
16340 SDValue Op0 = N->getOperand(0);
16341 SDValue Op1 = N->getOperand(1);
16342
16343 // Only do this if the inner op has one use, since otherwise this would just
16344 // increase register pressure for no benefit.
16345
16346 if (supportsMin3Max3(*Subtarget, Opc, VT)) {
      // True when Op has an operand that is itself a single-use node of the
      // same opcode, i.e. a child that could still be combined on its own.
16347 auto IsTreeWithCombinableChildren = [Opc](SDValue Op) {
16348 return (Op.getOperand(0).getOpcode() == Opc &&
16349 Op.getOperand(0).hasOneUse()) ||
16350 (Op.getOperand(1).getOpcode() == Opc &&
16351 Op.getOperand(1).hasOneUse());
16352 };
16353
      // Both operands of N are single-use nodes with N's own opcode.
16354 bool CanTreeCombineApply = Op0.getOpcode() == Opc && Op0.hasOneUse() &&
16355 Op1.getOpcode() == Opc && Op1.hasOneUse();
      // Only ever true when CanTreeCombineApply is true, so it defers the two
      // one-sided folds below only in the tree case.
16356 bool HasCombinableTreeChild =
16357 CanTreeCombineApply && (IsTreeWithCombinableChildren(Op0) ||
16358 IsTreeWithCombinableChildren(Op1));
16359
16360 // Tree reduction: when both operands are the same min/max op, restructure
16361 // to keep a 2-op node on top so higher tree levels can still combine.
16362 //
16363 // max(max(a, b), max(c, d)) -> max(max3(a, b, c), d)
16364 // min(min(a, b), min(c, d)) -> min(min3(a, b, c), d)
16365 //
16366 // Defer when either inner op is a tree node with combinable children.
16367 if (CanTreeCombineApply && !HasCombinableTreeChild) {
16368 SDLoc DL(N);
16369 SDValue Inner =
16371 Op0.getOperand(1), Op1.getOperand(0));
16372 return DAG.getNode(Opc, DL, VT, Inner, Op1.getOperand(1));
16373 }
16374
16375 // max(max(a, b), c) -> max3(a, b, c)
16376 // min(min(a, b), c) -> min3(a, b, c)
16377 // Deferred when Op0 is a tree node with combinable children.
16378 if (Op0.getOpcode() == Opc && Op0.hasOneUse() && !HasCombinableTreeChild) {
16379 SDLoc DL(N);
16380 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
16381 Op0.getOperand(0), Op0.getOperand(1), Op1);
16382 }
16383
16384 // Try commuted.
16385 // max(a, max(b, c)) -> max3(a, b, c)
16386 // min(a, min(b, c)) -> min3(a, b, c)
16387 // Deferred when Op1 is a tree node with combinable children.
16388 if (Op1.getOpcode() == Opc && Op1.hasOneUse() && !HasCombinableTreeChild) {
16389 SDLoc DL(N);
16390 return DAG.getNode(minMaxOpcToMin3Max3Opc(Opc), DL, N->getValueType(0),
16391 Op0, Op1.getOperand(0), Op1.getOperand(1));
16392 }
16393 }
16394
16395 // umin(sffbh(x), bitwidth) -> sffbh(x) if x is known to be not 0 or -1.
16396 SDValue FfbhSrc;
16397 uint64_t Clamp = 0;
16398 if (Opc == ISD::UMIN &&
16399 sd_match(Op0,
16401 sd_match(Op1, m_ConstInt(Clamp))) {
16402 unsigned BitWidth = FfbhSrc.getValueType().getScalarSizeInBits();
16403 if (Clamp >= BitWidth) {
16404 KnownBits Known = DAG.computeKnownBits(FfbhSrc);
      // A known-one bit rules out 0; a known-zero bit rules out -1.
16405 if (Known.isNonZero() && Known.Zero.getBoolValue())
16406 return Op0;
16407 }
16408 }
16409
16410 // min(max(x, K0), K1), K0 < K1 -> med3(x, K0, K1)
16411 // max(min(x, K0), K1), K1 < K0 -> med3(x, K1, K0)
      // performIntMed3ImmCombine takes the min's constant before the max's,
      // so the argument order differs between the min-outer and max-outer
      // variants below.
16412 if (Opc == ISD::SMIN && Op0.getOpcode() == ISD::SMAX && Op0.hasOneUse()) {
16413 if (SDValue Med3 = performIntMed3ImmCombine(
16414 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), true))
16415 return Med3;
16416 }
16417 if (Opc == ISD::SMAX && Op0.getOpcode() == ISD::SMIN && Op0.hasOneUse()) {
16418 if (SDValue Med3 = performIntMed3ImmCombine(
16419 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, true))
16420 return Med3;
16421 }
16422
16423 if (Opc == ISD::UMIN && Op0.getOpcode() == ISD::UMAX && Op0.hasOneUse()) {
16424 if (SDValue Med3 = performIntMed3ImmCombine(
16425 DAG, SDLoc(N), Op0->getOperand(0), Op1, Op0->getOperand(1), false))
16426 return Med3;
16427 }
16428 if (Opc == ISD::UMAX && Op0.getOpcode() == ISD::UMIN && Op0.hasOneUse()) {
16429 if (SDValue Med3 = performIntMed3ImmCombine(
16430 DAG, SDLoc(N), Op0->getOperand(0), Op0->getOperand(1), Op1, false))
16431 return Med3;
16432 }
16433
16434 // if !is_snan(x):
16435 // fminnum(fmaxnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16436 // fminnum_ieee(fmaxnum_ieee(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16437 // fminnumnum(fmaxnumnum(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16438 // fmin_legacy(fmax_legacy(x, K0), K1), K0 < K1 -> fmed3(x, K0, K1)
16439 if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
16442 (Opc == AMDGPUISD::FMIN_LEGACY &&
16443 Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
16444 (VT == MVT::f32 || VT == MVT::f64 ||
16445 (VT == MVT::f16 && Subtarget->has16BitInsts()) ||
16446 (VT == MVT::bf16 && Subtarget->hasBF16PackedInsts()) ||
16447 (VT == MVT::v2bf16 && Subtarget->hasBF16PackedInsts()) ||
16448 (VT == MVT::v2f16 && Subtarget->hasVOP3PInsts())) &&
16449 Op0.hasOneUse()) {
      // The callee does its own type filtering before forming clamp/fmed3.
16450 if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1,
16451 N->getFlags().hasNoNaNs()))
16452 return Res;
16453 }
16454
16455 // Prefer fminnum_ieee over fminimum. For gfx950, minimum/maximum are legal
16456 // for some types, but at a higher cost since it's implemented with a 3
16457 // operand form.
16458 const SDNodeFlags Flags = N->getFlags();
16459 if ((Opc == ISD::FMINIMUM || Opc == ISD::FMAXIMUM) && Flags.hasNoNaNs() &&
16460 !Subtarget->hasIEEEMinimumMaximumInsts() &&
16462 unsigned NewOpc =
16464 return DAG.getNode(NewOpc, SDLoc(N), VT, Op0, Op1, Flags);
16465 }
16466
16467 return SDValue();
16468}
16469
16473 // FIXME: Should this be allowing -0.0?
      // Accept the constant pair {+0.0, 1.0} in either operand order.
16474 return (CA->isPosZero() && CB->isOne()) ||
16475 (CA->isOne() && CB->isPosZero());
16476 }
16477 }
16478
      // Reached when the constant check above was not entered.
16479 return false;
16480}
16481
16482// FIXME: Should only worry about snans for version with chain.
16483SDValue SITargetLowering::performFMed3Combine(SDNode *N,
16484 DAGCombinerInfo &DCI) const {
16485 EVT VT = N->getValueType(0);
16486 // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
16487 // NaNs. With a NaN input, the order of the operands may change the result.
16488
16489 SelectionDAG &DAG = DCI.DAG;
16490 SDLoc SL(N);
16491
16492 SDValue Src0 = N->getOperand(0);
16493 SDValue Src1 = N->getOperand(1);
16494 SDValue Src2 = N->getOperand(2);
16495
16496 if (isClampZeroToOne(Src0, Src1)) {
16497 // const_a, const_b, x -> clamp is safe in all cases including signaling
16498 // nans.
16499 // FIXME: Should this be allowing -0.0?
16500 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
16501 }
16502
16503 const MachineFunction &MF = DAG.getMachineFunction();
16504 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
16505
16506 // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
16507 // handling no dx10-clamp?
16508 if (Info->getMode().DX10Clamp) {
16509 // If NaNs is clamped to 0, we are free to reorder the inputs.
16510
16511 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
16512 std::swap(Src0, Src1);
16513
16514 if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
16515 std::swap(Src1, Src2);
16516
16517 if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
16518 std::swap(Src0, Src1);
16519
16520 if (isClampZeroToOne(Src1, Src2))
16521 return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
16522 }
16523
16524 return SDValue();
16525}
16526
16527SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
16528 DAGCombinerInfo &DCI) const {
16529 SDValue Src0 = N->getOperand(0);
16530 SDValue Src1 = N->getOperand(1);
16531 if (Src0.isUndef() && Src1.isUndef())
16532 return DCI.DAG.getUNDEF(N->getValueType(0));
16533 return SDValue();
16534}
16535
16536// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
16537// expanded into a set of cmp/select instructions.
// EltSize is the element size in bits (it is compared against 32-bit dwords
// below), NumElem the element count, and IsDivergentIdx tells whether the
// variable index is divergent.
16539 unsigned NumElem,
16540 bool IsDivergentIdx,
16541 const GCNSubtarget *Subtarget) {
16543 return false;
16544
16545 unsigned VecSize = EltSize * NumElem;
16546
16547 // Sub-dword vectors of size 2 dword or less have better implementation.
16548 if (VecSize <= 64 && EltSize < 32)
16549 return false;
16550
16551 // Always expand the rest of sub-dword instructions, otherwise it will be
16552 // lowered via memory.
16553 if (EltSize < 32)
16554 return true;
16555
16556 // Always do this if var-idx is divergent, otherwise it will become a loop.
16557 if (IsDivergentIdx)
16558 return true;
16559
16560 // Large vectors would yield too many compares and v_cndmask_b32 instructions.
      // Estimate: one compare per element, plus one cndmask for every 32 bits
      // (rounded up) of each element.
16561 unsigned NumInsts = NumElem /* Number of compares */ +
16562 ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
16563
16564 // On some architectures (GFX9) movrel is not available and it's better
16565 // to expand.
16566 if (Subtarget->useVGPRIndexMode())
16567 return NumInsts <= 16;
16568
16569 // If movrel is available, use it instead of expanding for vector of 8
16570 // elements.
16571 if (Subtarget->hasMovrel())
16572 return NumInsts <= 15;
16573
      // Neither VGPR index mode nor movrel: always expand.
16574 return true;
16575}
16576
      // The index is taken from the last operand of N.
16578 SDValue Idx = N->getOperand(N->getNumOperands() - 1);
      // A constant index never needs the dynamic-index expansion.
16579 if (isa<ConstantSDNode>(Idx))
16580 return false;
16581
16582 SDValue Vec = N->getOperand(0);
16583 EVT VecVT = Vec.getValueType();
16584 EVT EltVT = VecVT.getVectorElementType();
16585 unsigned EltSize = EltVT.getSizeInBits();
16586 unsigned NumElem = VecVT.getVectorNumElements();
16587
      // Pass the element size in bits, the element count and the divergence
      // of the index, together with the subtarget.
16589 EltSize, NumElem, Idx->isDivergent(), getSubtarget());
16590}
16591
/// Target combine for EXTRACT_VECTOR_ELT. Operand 0 of \p N is the vector and
/// operand 1 the index. The folds are tried in the order they appear below;
/// an empty SDValue is returned when none of them fires.
16592SDValue
16593SITargetLowering::performExtractVectorEltCombine(SDNode *N,
16594 DAGCombinerInfo &DCI) const {
16595 SDValue Vec = N->getOperand(0);
16596 SelectionDAG &DAG = DCI.DAG;
16597
16598 EVT VecVT = Vec.getValueType();
16599 EVT VecEltVT = VecVT.getVectorElementType();
16600 EVT ResVT = N->getValueType(0);
16601
16602 unsigned VecSize = VecVT.getSizeInBits();
16603 unsigned VecEltSize = VecEltVT.getSizeInBits();
16604
      // Push the extract through a vector fneg/fabs: extract the element from
      // the source operand first, then apply the same opcode to the scalar.
16605 if ((Vec.getOpcode() == ISD::FNEG || Vec.getOpcode() == ISD::FABS) &&
16607 SDLoc SL(N);
16608 SDValue Idx = N->getOperand(1);
16609 SDValue Elt =
16610 DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec.getOperand(0), Idx);
16611 return DAG.getNode(Vec.getOpcode(), SL, ResVT, Elt);
16612 }
16613
16614 // (extract_vector_element (and {y0, y1}, (build_vector 0x1f, 0x1f)), index)
16615 // -> (and (extract_vector_element {y0, y1}, index), 0x1f)
16616 // There are optimisations to transform 64-bit shifts into 32-bit shifts
16617 // depending on the shift operand. See e.g. performSraCombine().
16618 // This combine ensures that the optimisation is compatible with v2i32
16619 // legalised AND.
16620 if (VecVT == MVT::v2i32 && Vec->getOpcode() == ISD::AND &&
16621 Vec->getOperand(1)->getOpcode() == ISD::BUILD_VECTOR) {
16622
      // NOTE(review): a mask other than 0x1f returns here and so skips every
      // later fold in this function - confirm that is intended.
16624 if (!C || C->getZExtValue() != 0x1f)
16625 return SDValue();
16626
16627 SDLoc SL(N);
16628 SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
16629 SDValue EVE = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32,
16630 Vec->getOperand(0), N->getOperand(1));
16631 SDValue A = DAG.getNode(ISD::AND, SL, MVT::i32, EVE, AndMask);
      // NOTE(review): the uses of N are replaced directly and control then
      // falls through to the folds below instead of returning - confirm that
      // continuing to operate on N afterwards is intended.
16632 DAG.ReplaceAllUsesWith(N, A.getNode());
16633 }
16634
16635 // ScalarRes = EXTRACT_VECTOR_ELT ((vector-BINOP Vec1, Vec2), Idx)
16636 // =>
16637 // Vec1Elt = EXTRACT_VECTOR_ELT(Vec1, Idx)
16638 // Vec2Elt = EXTRACT_VECTOR_ELT(Vec2, Idx)
16639 // ScalarRes = scalar-BINOP Vec1Elt, Vec2Elt
16640 if (Vec.hasOneUse() && DCI.isBeforeLegalize() && VecEltVT == ResVT) {
16641 SDLoc SL(N);
16642 SDValue Idx = N->getOperand(1);
16643 unsigned Opc = Vec.getOpcode();
16644
16645 switch (Opc) {
16646 default:
16647 break;
16648 // TODO: Support other binary operations.
16649 case ISD::FADD:
16650 case ISD::FSUB:
16651 case ISD::FMUL:
16652 case ISD::ADD:
16653 case ISD::UMIN:
16654 case ISD::UMAX:
16655 case ISD::SMIN:
16656 case ISD::SMAX:
16657 case ISD::FMAXNUM:
16658 case ISD::FMINNUM:
16659 case ISD::FMAXNUM_IEEE:
16660 case ISD::FMINNUM_IEEE:
16661 case ISD::FMAXIMUM:
16662 case ISD::FMINIMUM: {
16663 SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
16664 Vec.getOperand(0), Idx);
16665 SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT,
16666 Vec.getOperand(1), Idx);
16667
      // Queue the new extracts for combining; the scalar op keeps the flags
      // of the vector op.
16668 DCI.AddToWorklist(Elt0.getNode());
16669 DCI.AddToWorklist(Elt1.getNode());
16670 return DAG.getNode(Opc, SL, ResVT, Elt0, Elt1, Vec->getFlags());
16671 }
16672 }
16673 }
16674
16675 // EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
16677 SDLoc SL(N);
16678 SDValue Idx = N->getOperand(1);
16679 SDValue V;
      // Start from element 0, then for every later element I select it over
      // the running value when Idx == I.
16680 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16681 SDValue IC = DAG.getVectorIdxConstant(I, SL);
16682 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, ResVT, Vec, IC);
16683 if (I == 0)
16684 V = Elt;
16685 else
16686 V = DAG.getSelectCC(SL, Idx, IC, Elt, V, ISD::SETEQ);
16687 }
16688 return V;
16689 }
16690
16691 // EXTRACT_VECTOR_ELT (v2i32 bitcast (i64/f64:k), Idx)
16692 // =>
16693 // i32:Lo(k) if Idx == 0, or
16694 // i32:Hi(k) if Idx == 1
      // From here on Idx is the constant index node, or null for a variable
      // index.
16695 auto *Idx = dyn_cast<ConstantSDNode>(N->getOperand(1));
16696 if (Vec.getOpcode() == ISD::BITCAST && VecVT == MVT::v2i32 && Idx) {
16697 SDLoc SL(N);
16698 SDValue PeekThrough = Vec.getOperand(0);
16699 auto *KImm = dyn_cast<ConstantSDNode>(PeekThrough);
16700 if (KImm && KImm->getValueType(0).getSizeInBits() == 64) {
16701 uint64_t KImmValue = KImm->getZExtValue();
16702 return DAG.getConstant(
16703 (KImmValue >> (32 * Idx->getZExtValue())) & 0xffffffff, SL, MVT::i32);
16704 }
16705 auto *KFPImm = dyn_cast<ConstantFPSDNode>(PeekThrough);
16706 if (KFPImm && KFPImm->getValueType(0).getSizeInBits() == 64) {
      // Same as the integer case, applied to the bit pattern of the FP value.
16707 uint64_t KFPImmValue =
16708 KFPImm->getValueAPF().bitcastToAPInt().getZExtValue();
16709 return DAG.getConstant((KFPImmValue >> (32 * Idx->getZExtValue())) &
16710 0xffffffff,
16711 SL, MVT::i32);
16712 }
16713 }
16714
      // The remaining fold only runs before legalization.
16715 if (!DCI.isBeforeLegalize())
16716 return SDValue();
16717
16718 // Try to turn sub-dword accesses of vectors into accesses of the same 32-bit
16719 // elements. This exposes more load reduction opportunities by replacing
16720 // multiple small extract_vector_elements with a single 32-bit extract.
16721 if (isa<MemSDNode>(Vec) && VecEltSize <= 16 && VecEltVT.isByteSized() &&
16722 VecSize > 32 && VecSize % 32 == 0 && Idx) {
16723 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VecVT);
16724
      // Split the bit position of the element into the index of the 32-bit
      // element containing it and the bit offset inside that element.
16725 unsigned BitIndex = Idx->getZExtValue() * VecEltSize;
16726 unsigned EltIdx = BitIndex / 32;
16727 unsigned LeftoverBitIdx = BitIndex % 32;
16728 SDLoc SL(N);
16729
16730 SDValue Cast = DAG.getNode(ISD::BITCAST, SL, NewVT, Vec);
16731 DCI.AddToWorklist(Cast.getNode());
16732
16733 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Cast,
16734 DAG.getConstant(EltIdx, SL, MVT::i32));
16735 DCI.AddToWorklist(Elt.getNode());
16736 SDValue Srl = DAG.getNode(ISD::SRL, SL, MVT::i32, Elt,
16737 DAG.getConstant(LeftoverBitIdx, SL, MVT::i32));
16738 DCI.AddToWorklist(Srl.getNode());
16739
16740 EVT VecEltAsIntVT = VecEltVT.changeTypeToInteger();
16741 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VecEltAsIntVT, Srl);
16742 DCI.AddToWorklist(Trunc.getNode());
16743
16744 if (VecEltVT == ResVT) {
16745 return DAG.getNode(ISD::BITCAST, SL, VecEltVT, Trunc);
16746 }
16747
      // The result type differs from the element type, so it must be a scalar
      // integer that the truncated value can be any-extended or truncated to.
16748 assert(ResVT.isScalarInteger());
16749 return DAG.getAnyExtOrTrunc(Trunc, SL, ResVT);
16750 }
16751
16752 return SDValue();
16753}
16754
/// Target combine for INSERT_VECTOR_ELT. Operand 0 of \p N is the vector,
/// operand 1 the inserted value and operand 2 the index. When the expansion
/// is taken, the result is a BUILD_VECTOR in which each lane selects between
/// the inserted value and the original element.
16755SDValue
16756SITargetLowering::performInsertVectorEltCombine(SDNode *N,
16757 DAGCombinerInfo &DCI) const {
16758 SDValue Vec = N->getOperand(0);
16759 SDValue Idx = N->getOperand(2);
16760 EVT VecVT = Vec.getValueType();
16761 EVT EltVT = VecVT.getVectorElementType();
16762
16763 // INSERT_VECTOR_ELT (<n x e>, var-idx)
16764 // => BUILD_VECTOR n x select (e, const-idx)
16766 return SDValue();
16767
16768 SelectionDAG &DAG = DCI.DAG;
16769 SDLoc SL(N);
16770 SDValue Ins = N->getOperand(1);
16771 EVT IdxVT = Idx.getValueType();
16772
      // Lane I becomes (Idx == I) ? Ins : Vec[I]; the lane constants use the
      // type of the index operand.
16774 for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
16775 SDValue IC = DAG.getConstant(I, SL, IdxVT);
16776 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
16777 SDValue V = DAG.getSelectCC(SL, Idx, IC, Ins, Elt, ISD::SETEQ);
16778 Ops.push_back(V);
16779 }
16780
16781 return DAG.getBuildVector(VecVT, SL, Ops);
16782}
16783
16784/// Return the source of an fp_extend from f16 to f32, or a converted FP
16785/// constant.
/// Returns an empty SDValue when \p Src is neither of those.
      // Only the source type of the fp_extend is checked here.
16787 if (Src.getOpcode() == ISD::FP_EXTEND &&
16788 Src.getOperand(0).getValueType() == MVT::f16) {
16789 return Src.getOperand(0);
16790 }
16791
16792 if (auto *CFP = dyn_cast<ConstantFPSDNode>(Src)) {
16793 APFloat Val = CFP->getValueAPF();
16794 bool LosesInfo = true;
      // NOTE(review): the statement that updates Val and LosesInfo is not
      // visible in this listing - presumably a conversion of Val to half
      // precision; confirm. The constant is only used when LosesInfo was
      // cleared.
16796 if (!LosesInfo)
16797 return DAG.getConstantFP(Val, SDLoc(Src), MVT::f16);
16798 }
16799
16800 return SDValue();
16801}
16802
16803SDValue SITargetLowering::performFPRoundCombine(SDNode *N,
16804 DAGCombinerInfo &DCI) const {
16805 assert(Subtarget->has16BitInsts() && !Subtarget->hasMed3_16() &&
16806 "combine only useful on gfx8");
16807
16808 SDValue TruncSrc = N->getOperand(0);
16809 EVT VT = N->getValueType(0);
16810 if (VT != MVT::f16)
16811 return SDValue();
16812
16813 if (TruncSrc.getOpcode() != AMDGPUISD::FMED3 ||
16814 TruncSrc.getValueType() != MVT::f32 || !TruncSrc.hasOneUse())
16815 return SDValue();
16816
16817 SelectionDAG &DAG = DCI.DAG;
16818 SDLoc SL(N);
16819
16820 // Optimize f16 fmed3 pattern performed on f32. On gfx8 there is no f16 fmed3,
16821 // and expanding it with min/max saves 1 instruction vs. casting to f32 and
16822 // casting back.
16823
16824 // fptrunc (f32 (fmed3 (fpext f16:a, fpext f16:b, fpext f16:c))) =>
16825 // fmin(fmax(a, b), fmax(fmin(a, b), c))
16826 SDValue A = strictFPExtFromF16(DAG, TruncSrc.getOperand(0));
16827 if (!A)
16828 return SDValue();
16829
16830 SDValue B = strictFPExtFromF16(DAG, TruncSrc.getOperand(1));
16831 if (!B)
16832 return SDValue();
16833
16834 SDValue C = strictFPExtFromF16(DAG, TruncSrc.getOperand(2));
16835 if (!C)
16836 return SDValue();
16837
16838 // This changes signaling nan behavior. If an input is a signaling nan, it
16839 // would have been quieted by the fpext originally. We don't care because
16840 // these are unconstrained ops. If we needed to insert quieting canonicalizes
16841 // we would be worse off than just doing the promotion.
16842 SDValue A1 = DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, A, B);
16843 SDValue B1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A, B);
16844 SDValue C1 = DAG.getNode(ISD::FMAXNUM_IEEE, SL, VT, A1, C);
16845 return DAG.getNode(ISD::FMINNUM_IEEE, SL, VT, B1, C1);
16846}
16847
/// Pick the fused multiply-add opcode for combining \p N0 and \p N1.
/// Returns ISD::FMAD or ISD::FMA, or 0 when neither may be used. The type is
/// taken from \p N0.
16848unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
16849 const SDNode *N0,
16850 const SDNode *N1) const {
16851 EVT VT = N0->getValueType(0);
16852
16853 // Only do this if we are not trying to support denormals. v_mad_f32 does not
16854 // support denormals ever.
      // NOTE(review): parts of this condition are not visible in this
      // listing; only the f32 and f16 (with hasMadF16) type checks are shown.
16855 if (((VT == MVT::f32 &&
16857 (VT == MVT::f16 && Subtarget->hasMadF16() &&
16860 return ISD::FMAD;
16861
      // FMA needs either fast FP-op fusion globally or the contract flag on
      // both nodes; the last part of this condition is not visible in this
      // listing.
16862 const TargetOptions &Options = DAG.getTarget().Options;
16863 if ((Options.AllowFPOpFusion == FPOpFusion::Fast ||
16864 (N0->getFlags().hasAllowContract() &&
16865 N1->getFlags().hasAllowContract())) &&
16867 return ISD::FMA;
16868 }
16869
      // 0 tells the caller that no fused opcode is available.
16870 return 0;
16871}
16872
16873// For a reassociatable opcode perform:
16874// op x, (op y, z) -> op (op x, z), y, if x and z are uniform
16875SDValue SITargetLowering::reassociateScalarOps(SDNode *N,
16876 SelectionDAG &DAG) const {
16877 EVT VT = N->getValueType(0);
16878 if (VT != MVT::i32 && VT != MVT::i64)
16879 return SDValue();
16880
16881 if (DAG.isBaseWithConstantOffset(SDValue(N, 0)))
16882 return SDValue();
16883
16884 unsigned Opc = N->getOpcode();
16885 SDValue Op0 = N->getOperand(0);
16886 SDValue Op1 = N->getOperand(1);
16887
16888 if (!(Op0->isDivergent() ^ Op1->isDivergent()))
16889 return SDValue();
16890
16891 if (Op0->isDivergent())
16892 std::swap(Op0, Op1);
16893
16894 if (Op1.getOpcode() != Opc || !Op1.hasOneUse())
16895 return SDValue();
16896
16897 SDValue Op2 = Op1.getOperand(1);
16898 Op1 = Op1.getOperand(0);
16899 if (!(Op1->isDivergent() ^ Op2->isDivergent()))
16900 return SDValue();
16901
16902 if (Op1->isDivergent())
16903 std::swap(Op1, Op2);
16904
16905 SDLoc SL(N);
16906 SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1);
16907 return DAG.getNode(Opc, SL, VT, Add1, Op2);
16908}
16909
/// Build a multiply-add node over \p N0, \p N1 and \p N2 with result types
/// (i64, i1) and return its first result truncated to \p VT.
/// NOTE(review): the line defining MadOpc is not visible in this listing;
/// presumably \p Signed picks the signed or unsigned opcode - confirm.
16910static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT,
16911 SDValue N0, SDValue N1, SDValue N2, bool Signed) {
16913 SDVTList VTs = DAG.getVTList(MVT::i64, MVT::i1);
16914 SDValue Mad = DAG.getNode(MadOpc, SL, VTs, N0, N1, N2);
16915 return DAG.getNode(ISD::TRUNCATE, SL, VT, Mad);
16916}
16917
16918// Fold
16919// y = lshr i64 x, 32
16920// res = add (mul i64 y, Const), x where "Const" is a 64-bit constant
16921// with Const.hi == -1
16922// To
16923// res = mad_u64_u32 y.lo, Const.lo, x.lo
//
// Why this holds modulo 2^64: with Const = (2^64 - 2^32) + Const.lo,
// y * Const = y * Const.lo - (y << 32), and x - (y << 32) = x.lo.
// Returns an empty SDValue when the pattern does not match.
16925 SDValue MulLHS, SDValue MulRHS,
16926 SDValue AddRHS) {
      // Canonicalize so that the shift, if there is one, is MulLHS.
16927 if (MulRHS.getOpcode() == ISD::SRL)
16928 std::swap(MulLHS, MulRHS);
16929
16930 if (MulLHS.getValueType() != MVT::i64 || MulLHS.getOpcode() != ISD::SRL)
16931 return SDValue();
16932
      // The shift must be by exactly 32 and must shift the same value that is
      // being added.
16933 ConstantSDNode *ShiftVal = dyn_cast<ConstantSDNode>(MulLHS.getOperand(1));
16934 if (!ShiftVal || ShiftVal->getAsZExtVal() != 32 ||
16935 MulLHS.getOperand(0) != AddRHS)
16936 return SDValue();
16937
      // The high 32 bits of the constant must be all ones.
16939 if (!Const || Hi_32(Const->getZExtValue()) != uint32_t(-1))
16940 return SDValue();
16941
16942 SDValue ConstMul =
16943 DAG.getConstant(Lo_32(Const->getZExtValue()), SL, MVT::i32);
      // Unsigned mad of y.lo and Const.lo, with x zero-extended from its low
      // 32 bits as the addend.
16944 return getMad64_32(DAG, SL, MVT::i64,
16945 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS), ConstMul,
16946 DAG.getZeroExtendInReg(AddRHS, SL, MVT::i32), false);
16947}
16948
16949// Fold (add (mul x, y), z) --> (mad_[iu]64_[iu]32 x, y, z) plus high
16950// multiplies, if any.
16951//
16952// Full 64-bit multiplies that feed into an addition are lowered here instead
16953// of using the generic expansion. The generic expansion ends up with
16954// a tree of ADD nodes that prevents us from using the "add" part of the
16955// MAD instruction. The expansion produced here results in a chain of ADDs
16956// instead of a tree.
16957SDValue SITargetLowering::tryFoldToMad64_32(SDNode *N,
16958 DAGCombinerInfo &DCI) const {
16959 assert(N->isAnyAdd());
16960
16961 SelectionDAG &DAG = DCI.DAG;
16962 EVT VT = N->getValueType(0);
16963 SDLoc SL(N);
16964 SDValue LHS = N->getOperand(0);
16965 SDValue RHS = N->getOperand(1);
16966
16967 if (VT.isVector())
16968 return SDValue();
16969
16970 // S_MUL_HI_[IU]32 was added in gfx9, which allows us to keep the overall
16971 // result in scalar registers for uniform values.
16972 if (!N->isDivergent() && Subtarget->hasSMulHi())
16973 return SDValue();
16974
16975 unsigned NumBits = VT.getScalarSizeInBits();
16976 if (NumBits <= 32 || NumBits > 64)
16977 return SDValue();
16978
16979 if (LHS.getOpcode() != ISD::MUL) {
16980 assert(RHS.getOpcode() == ISD::MUL);
16981 std::swap(LHS, RHS);
16982 }
16983
16984 // Avoid the fold if it would unduly increase the number of multiplies due to
16985 // multiple uses, except on hardware with full-rate multiply-add (which is
16986 // part of full-rate 64-bit ops).
16987 if (!Subtarget->hasFullRate64Ops()) {
16988 unsigned NumUsers = 0;
16989 for (SDNode *User : LHS->users()) {
16990 // There is a use that does not feed into addition, so the multiply can't
16991 // be removed. We prefer MUL + ADD + ADDC over MAD + MUL.
16992 if (!User->isAnyAdd())
16993 return SDValue();
16994
16995 // We prefer 2xMAD over MUL + 2xADD + 2xADDC (code density), and prefer
16996 // MUL + 3xADD + 3xADDC over 3xMAD.
16997 ++NumUsers;
16998 if (NumUsers >= 3)
16999 return SDValue();
17000 }
17001 }
17002
17003 SDValue MulLHS = LHS.getOperand(0);
17004 SDValue MulRHS = LHS.getOperand(1);
17005 SDValue AddRHS = RHS;
17006
17007 if (SDValue FoldedMAD = tryFoldMADwithSRL(DAG, SL, MulLHS, MulRHS, AddRHS))
17008 return FoldedMAD;
17009
17010 // Always check whether operands are small unsigned values, since that
17011 // knowledge is useful in more cases. Check for small signed values only if
17012 // doing so can unlock a shorter code sequence.
17013 bool MulLHSUnsigned32 = numBitsUnsigned(MulLHS, DAG) <= 32;
17014 bool MulRHSUnsigned32 = numBitsUnsigned(MulRHS, DAG) <= 32;
17015
17016 bool MulSignedLo = false;
17017 if (!MulLHSUnsigned32 || !MulRHSUnsigned32) {
17018 MulSignedLo =
17019 numBitsSigned(MulLHS, DAG) <= 32 && numBitsSigned(MulRHS, DAG) <= 32;
17020 }
17021
17022 // The operands and final result all have the same number of bits. If
17023 // operands need to be extended, they can be extended with garbage. The
17024 // resulting garbage in the high bits of the mad_[iu]64_[iu]32 result is
17025 // truncated away in the end.
17026 if (VT != MVT::i64) {
17027 MulLHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulLHS);
17028 MulRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, MulRHS);
17029 AddRHS = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i64, AddRHS);
17030 }
17031
17032 // The basic code generated is conceptually straightforward. Pseudo code:
17033 //
17034 // accum = mad_64_32 lhs.lo, rhs.lo, accum
17035 // accum.hi = add (mul lhs.hi, rhs.lo), accum.hi
17036 // accum.hi = add (mul lhs.lo, rhs.hi), accum.hi
17037 //
17038 // The second and third lines are optional, depending on whether the factors
17039 // are {sign,zero}-extended or not.
17040 //
17041 // The actual DAG is noisier than the pseudo code, but only due to
17042 // instructions that disassemble values into low and high parts, and
17043 // assemble the final result.
17044 SDValue One = DAG.getConstant(1, SL, MVT::i32);
17045
17046 auto MulLHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulLHS);
17047 auto MulRHSLo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, MulRHS);
17048 SDValue Accum =
17049 getMad64_32(DAG, SL, MVT::i64, MulLHSLo, MulRHSLo, AddRHS, MulSignedLo);
17050
17051 if (!MulSignedLo && (!MulLHSUnsigned32 || !MulRHSUnsigned32)) {
17052 auto [AccumLo, AccumHi] = DAG.SplitScalar(Accum, SL, MVT::i32, MVT::i32);
17053
17054 if (!MulLHSUnsigned32) {
17055 auto MulLHSHi =
17056 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulLHS, One);
17057 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSHi, MulRHSLo);
17058 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
17059 }
17060
17061 if (!MulRHSUnsigned32) {
17062 auto MulRHSHi =
17063 DAG.getNode(ISD::EXTRACT_ELEMENT, SL, MVT::i32, MulRHS, One);
17064 SDValue MulHi = DAG.getNode(ISD::MUL, SL, MVT::i32, MulLHSLo, MulRHSHi);
17065 AccumHi = DAG.getNode(ISD::ADD, SL, MVT::i32, MulHi, AccumHi);
17066 }
17067
17068 Accum = DAG.getBuildVector(MVT::v2i32, SL, {AccumLo, AccumHi});
17069 Accum = DAG.getBitcast(MVT::i64, Accum);
17070 }
17071
17072 if (VT != MVT::i64)
17073 Accum = DAG.getNode(ISD::TRUNCATE, SL, VT, Accum);
17074 return Accum;
17075}
17076
17077SDValue
17078SITargetLowering::foldAddSub64WithZeroLowBitsTo32(SDNode *N,
17079 DAGCombinerInfo &DCI) const {
17080 SDValue RHS = N->getOperand(1);
17081 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
17082 if (!CRHS)
17083 return SDValue();
17084
17085 // TODO: Worth using computeKnownBits? Maybe expensive since it's so
17086 // common.
17087 uint64_t Val = CRHS->getZExtValue();
17088 if (countr_zero(Val) >= 32) {
17089 SelectionDAG &DAG = DCI.DAG;
17090 SDLoc SL(N);
17091 SDValue LHS = N->getOperand(0);
17092
17093 // Avoid carry machinery if we know the low half of the add does not
17094 // contribute to the final result.
17095 //
17096 // add i64:x, K if computeTrailingZeros(K) >= 32
17097 // => build_pair (add x.hi, K.hi), x.lo
17098
17099 // Breaking the 64-bit add here with this strange constant is unlikely
17100 // to interfere with addressing mode patterns.
17101
17102 SDValue Hi = getHiHalf64(LHS, DAG);
17103 SDValue ConstHi32 = DAG.getConstant(Hi_32(Val), SL, MVT::i32);
17104 unsigned Opcode = N->getOpcode();
17105 if (Opcode == ISD::PTRADD)
17106 Opcode = ISD::ADD;
17107 SDValue AddHi =
17108 DAG.getNode(Opcode, SL, MVT::i32, Hi, ConstHi32, N->getFlags());
17109
17110 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
17111 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, Lo, AddHi);
17112 }
17113
17114 return SDValue();
17115}
17116
17117// Collect the ultimate src of each of the mul node's operands, and confirm
17118// each operand is 8 bytes.
17119static std::optional<ByteProvider<SDValue>>
17120handleMulOperand(const SDValue &MulOperand) {
17121 auto Byte0 = calculateByteProvider(MulOperand, 0, 0);
17122 if (!Byte0 || Byte0->isConstantZero()) {
17123 return std::nullopt;
17124 }
17125 auto Byte1 = calculateByteProvider(MulOperand, 1, 0);
17126 if (Byte1 && !Byte1->isConstantZero()) {
17127 return std::nullopt;
17128 }
17129 return Byte0;
17130}
17131
// Merge two byte-select masks (used in this file as AMDGPUISD::PERM masks)
// into one.
//
// In each of the four byte lanes, the bits covered by 0x0c are kept only when
// they are set in both masks, while all remaining bits are OR-ed together.
// The caller must guarantee that, in every lane, at least one of the two masks
// has a bit from 0x0c set; this is checked by assertion.
static unsigned addPermMasks(unsigned First, unsigned Second) {
  constexpr unsigned ZeroSelectors = 0x0c0c0c0c;

  // Bits from 0x0c present in either mask, lane by lane.
  const unsigned EitherCs = (First | Second) & ZeroSelectors;
  assert(EitherCs & 0x000000FF);
  assert(EitherCs & 0x0000FF00);
  assert(EitherCs & 0x00FF0000);
  assert(EitherCs & 0xFF000000);
  (void)EitherCs;

  const unsigned SharedCs = First & Second & ZeroSelectors;
  const unsigned MergedSelectors = (First | Second) & ~ZeroSelectors;
  return MergedSelectors | SharedCs;
}
17145
// One source operand collected while matching a dot4 candidate.
// NOTE(review): listing lines 17147 and 17149 are absent from this extract.
// Other code in this file reads `SrcOp` and `DWordOffset` members of DotSrc,
// presumably declared on those lines -- confirm against upstream.
17146struct DotSrc {
// Byte-select mask; this file passes it as the constant operand of
// AMDGPUISD::PERM and merges masks with addPermMasks.
17148 int64_t PermMask;
17150};
17151
// placeSources: record the two byte providers of one multiply of a dot4
// candidate chain in the Src0s / Src1s lists.
// NOTE(review): the start of the signature (listing lines 17152-17154) is
// absent from this extract; the body uses Src0, Src1 and Src0s, presumably
// declared there -- confirm against upstream.
//
// Step is the position of the multiply in the chain. The selector
// (SrcOffset % 4) is written into byte (3 - Step) of a mask whose other bytes
// are 0x0c, and SrcOffset / 4 is stored as the dword offset of the entry.
17155 SmallVectorImpl<DotSrc> &Src1s, int Step) {
17156
17157 assert(Src0.Src.has_value() && Src1.Src.has_value());
17158 // Src0s and Src1s are empty, just place arbitrarily.
17159 if (Step == 0) {
17160 Src0s.push_back({*Src0.Src, ((Src0.SrcOffset % 4) << 24) + 0x0c0c0c,
17161 Src0.SrcOffset / 4});
17162 Src1s.push_back({*Src1.Src, ((Src1.SrcOffset % 4) << 24) + 0x0c0c0c,
17163 Src1.SrcOffset / 4});
17164 return;
17165 }
17166
// Try the pair in both orders: (Src0, Src1) first, then (Src1, Src0).
17167 for (int BPI = 0; BPI < 2; BPI++) {
17168 std::pair<ByteProvider<SDValue>, ByteProvider<SDValue>> BPP = {Src0, Src1};
17169 if (BPI == 1) {
17170 BPP = {Src1, Src0};
17171 }
17172 unsigned ZeroMask = 0x0c0c0c0c;
17173 unsigned FMask = 0xFF << (8 * (3 - Step));
17174
17175 unsigned FirstMask =
17176 (BPP.first.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
17177 unsigned SecondMask =
17178 (BPP.second.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask);
17179 // Attempt to find Src vector which contains our SDValue, if so, add our
17180 // perm mask to the existing one. If we are unable to find a match for the
17181 // first SDValue, attempt to find match for the second.
17182 int FirstGroup = -1;
17183 for (int I = 0; I < 2; I++) {
17184 SmallVectorImpl<DotSrc> &Srcs = I == 0 ? Src0s : Src1s;
17185 auto MatchesFirst = [&BPP](DotSrc &IterElt) {
17186 return IterElt.SrcOp == *BPP.first.Src &&
17187 (IterElt.DWordOffset == (BPP.first.SrcOffset / 4));
17188 };
17189
17190 auto *Match = llvm::find_if(Srcs, MatchesFirst);
17191 if (Match != Srcs.end()) {
17192 Match->PermMask = addPermMasks(FirstMask, Match->PermMask);
17193 FirstGroup = I;
17194 break;
17195 }
17196 }
// The first provider was merged into list FirstGroup; the second provider
// goes into the other list, merged into a matching entry or appended.
17197 if (FirstGroup != -1) {
17198 SmallVectorImpl<DotSrc> &Srcs = FirstGroup == 1 ? Src0s : Src1s;
17199 auto MatchesSecond = [&BPP](DotSrc &IterElt) {
17200 return IterElt.SrcOp == *BPP.second.Src &&
17201 (IterElt.DWordOffset == (BPP.second.SrcOffset / 4));
17202 };
17203 auto *Match = llvm::find_if(Srcs, MatchesSecond);
17204 if (Match != Srcs.end()) {
17205 Match->PermMask = addPermMasks(SecondMask, Match->PermMask);
17206 } else
17207 Srcs.push_back({*BPP.second.Src, SecondMask, BPP.second.SrcOffset / 4});
17208 return;
17209 }
17210 }
17211
17212 // If we have made it here, then we could not find a match in Src0s or Src1s
17213 // for either Src0 or Src1, so just place them arbitrarily.
17214
17215 unsigned ZeroMask = 0x0c0c0c0c;
17216 unsigned FMask = 0xFF << (8 * (3 - Step));
17217
17218 Src0s.push_back(
17219 {*Src0.Src,
17220 ((Src0.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17221 Src0.SrcOffset / 4});
17222 Src1s.push_back(
17223 {*Src1.Src,
17224 ((Src1.SrcOffset % 4) << (8 * (3 - Step)) | (ZeroMask & ~FMask)),
17225 Src1.SrcOffset / 4});
17226}
17227
// resolveSources: build one i32 value out of the entries collected in Srcs.
// NOTE(review): the first signature line (listing line 17228) and line 17248
// are absent from this extract; the body uses DAG, SL and a container named
// Perms, presumably declared there -- confirm against upstream.
//
// With a single entry, the result is that entry's dword (when the mask is
// 0x3020100) or a PERM of the dword with itself. With more entries, they are
// combined pairwise with PERM, a trailing unpaired entry gets its own PERM,
// and two resulting perms are OR-ed together.
// IsSigned and IsAny are not referenced in the visible body.
17229 SmallVectorImpl<DotSrc> &Srcs, bool IsSigned,
17230 bool IsAny) {
17231
17232 // If we just have one source, just permute it accordingly.
17233 if (Srcs.size() == 1) {
17234 auto *Elt = Srcs.begin();
17235 auto EltOp = getDWordFromOffset(DAG, SL, Elt->SrcOp, Elt->DWordOffset);
17236
17237 // v_perm will produce the original value
17238 if (Elt->PermMask == 0x3020100)
17239 return EltOp;
17240
17241 return DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17242 DAG.getConstant(Elt->PermMask, SL, MVT::i32));
17243 }
17244
17245 auto *FirstElt = Srcs.begin();
17246 auto *SecondElt = std::next(FirstElt);
17247
17249
17250 // If we have multiple sources in the chain, combine them via perms (using
17251 // calculated perm mask) and Ors.
17252 while (true) {
17253 auto FirstMask = FirstElt->PermMask;
17254 auto SecondMask = SecondElt->PermMask;
17255
// Set bit 2 in every non-0x0c selector of the first mask while keeping its
// 0x0c bytes; presumably this redirects those selectors to the first PERM
// operand -- confirm against the v_perm_b32 selector encoding.
17256 unsigned FirstCs = FirstMask & 0x0c0c0c0c;
17257 unsigned FirstPlusFour = FirstMask | 0x04040404;
17258 // 0x0c + 0x04 = 0x10, so anding with 0x0F will produced 0x00 for any
17259 // original 0x0C.
17260 FirstMask = (FirstPlusFour & 0x0F0F0F0F) | FirstCs;
17261
17262 auto PermMask = addPermMasks(FirstMask, SecondMask);
17263 auto FirstVal =
17264 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17265 auto SecondVal =
17266 getDWordFromOffset(DAG, SL, SecondElt->SrcOp, SecondElt->DWordOffset);
17267
17268 Perms.push_back(DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, FirstVal,
17269 SecondVal,
17270 DAG.getConstant(PermMask, SL, MVT::i32)));
17271
17272 FirstElt = std::next(SecondElt);
17273 if (FirstElt == Srcs.end())
17274 break;
17275
17276 SecondElt = std::next(FirstElt);
17277 // If we only have a FirstElt, then just combine that into the cumulative
17278 // source node.
17279 if (SecondElt == Srcs.end()) {
17280 auto EltOp =
17281 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17282
17283 Perms.push_back(
17284 DAG.getNode(AMDGPUISD::PERM, SL, MVT::i32, EltOp, EltOp,
17285 DAG.getConstant(FirstElt->PermMask, SL, MVT::i32)));
17286 break;
17287 }
17288 }
17289
// At most two perms are expected here, i.e. at most four entries in Srcs.
17290 assert(Perms.size() == 1 || Perms.size() == 2);
17291 return Perms.size() == 2
17292 ? DAG.getNode(ISD::OR, SL, MVT::i32, Perms[0], Perms[1])
17293 : Perms[0];
17294}
17295
17296static void fixMasks(SmallVectorImpl<DotSrc> &Srcs, unsigned ChainLength) {
17297 for (auto &[EntryVal, EntryMask, EntryOffset] : Srcs) {
17298 EntryMask = EntryMask >> ((4 - ChainLength) * 8);
17299 auto ZeroMask = ChainLength == 2 ? 0x0c0c0000 : 0x0c000000;
17300 EntryMask += ZeroMask;
17301 }
17302}
17303
17304static bool isMul(const SDValue Op) {
17305 auto Opcode = Op.getOpcode();
17306
17307 return (Opcode == ISD::MUL || Opcode == AMDGPUISD::MUL_U24 ||
17308 Opcode == AMDGPUISD::MUL_I24);
17309}
17310
// Decide which signedness a dot4 must use for one multiply whose operands are
// S0Op and S1Op.
// NOTE(review): listing line 17312 (function name and leading parameters) is
// absent from this extract; callers in this file invoke
// checkDot4MulSignedness with six arguments ending in (S0Op, S1Op, DAG), so
// this is presumably that function -- confirm against upstream.
//
// Returns false (unsigned) when both operands are 8 bits wide or both have a
// known-zero top bit, true (signed) when both have a known-one top bit or
// when no operand is known unsigned and at least one top bit is unknown, and
// std::nullopt when the operands cannot share one signedness.
17311static std::optional<bool>
17313 ByteProvider<SDValue> &Src1, const SDValue &S0Op,
17314 const SDValue &S1Op, const SelectionDAG &DAG) {
17315 // If we both ops are i8s (pre legalize-dag), then the signedness semantics
17316 // of the dot4 is irrelevant.
17317 if (S0Op.getValueSizeInBits() == 8 && S1Op.getValueSizeInBits() == 8)
17318 return false;
17319
// "Unsigned" / "Signed" below mean: at least one leading bit is known to be
// zero / one according to computeKnownBits.
17320 auto Known0 = DAG.computeKnownBits(S0Op, 0);
17321 bool S0IsUnsigned = Known0.countMinLeadingZeros() > 0;
17322 bool S0IsSigned = Known0.countMinLeadingOnes() > 0;
17323 auto Known1 = DAG.computeKnownBits(S1Op, 0);
17324 bool S1IsUnsigned = Known1.countMinLeadingZeros() > 0;
17325 bool S1IsSigned = Known1.countMinLeadingOnes() > 0;
17326
17327 assert(!(S0IsUnsigned && S0IsSigned));
17328 assert(!(S1IsUnsigned && S1IsSigned));
17329
17330 // There are 9 possible permutations of
17331 // {S0IsUnsigned, S0IsSigned, S1IsUnsigned, S1IsSigned}
17332
17333 // In two permutations, the sign bits are known to be the same for both Ops,
17334 // so simply return Signed / Unsigned corresponding to the MSB
17335
17336 if ((S0IsUnsigned && S1IsUnsigned) || (S0IsSigned && S1IsSigned))
17337 return S0IsSigned;
17338
17339 // In another two permutations, the sign bits are known to be opposite. In
17340 // this case return std::nullopt to indicate a bad match.
17341
17342 if ((S0IsUnsigned && S1IsSigned) || (S0IsSigned && S1IsUnsigned))
17343 return std::nullopt;
17344
17345 // In the remaining five permutations, we don't know the value of the sign
17346 // bit for at least one Op. Since we have a valid ByteProvider, we know that
17347 // the upper bits must be extension bits. Thus, the only ways for the sign
17348 // bit to be unknown is if it was sign extended from unknown value, or if it
17349 // was any extended. In either case, it is correct to use the signed
17350 // version of the signedness semantics of dot4
17351
17352 // In two of such permutations, we known the sign bit is set for
17353 // one op, and the other is unknown. It is okay to used signed version of
17354 // dot4.
17355 if ((S0IsSigned && !(S1IsSigned || S1IsUnsigned)) ||
17356 ((S1IsSigned && !(S0IsSigned || S0IsUnsigned))))
17357 return true;
17358
17359 // In one such permutation, we don't know either of the sign bits. It is okay
17360 // to used the signed version of dot4.
17361 if ((!(S1IsSigned || S1IsUnsigned) && !(S0IsSigned || S0IsUnsigned)))
17362 return true;
17363
17364 // In two of such permutations, we known the sign bit is unset for
17365 // one op, and the other is unknown. Return std::nullopt to indicate a
17366 // bad match.
17367 if ((S0IsUnsigned && !(S1IsSigned || S1IsUnsigned)) ||
17368 ((S1IsUnsigned && !(S0IsSigned || S0IsUnsigned))))
17369 return std::nullopt;
17370
17371 llvm_unreachable("Fully covered condition");
17372}
17373
// Target DAG combine for an add node N. The folds are attempted in this
// order, and the first one that produces a value is returned:
//   1. tryFoldToMad64_32, when an operand is ISD::MUL and the subtarget has
//      mad64_32;
//   2. reassociateScalarOps;
//   3. foldAddSub64WithZeroLowBitsTo32, for i64 results;
//   4. matching a chain of up to four multiply-adds into an
//      amdgcn_sdot4 / amdgcn_udot4 intrinsic call;
//   5. for i32 results after DAG legalization, folding an extended boolean or
//      a uaddo_carry operand into a carry operation.
// Returns an empty SDValue when nothing applies. Note that once the dot4
// preconditions hold, a chain shorter than two multiplies returns an empty
// SDValue without attempting fold 5.
// NOTE(review): listing lines 17402-17404, 17546-17547 and 17564 are absent
// from this extract; see the notes at the affected places.
17374SDValue SITargetLowering::performAddCombine(SDNode *N,
17375 DAGCombinerInfo &DCI) const {
17376 SelectionDAG &DAG = DCI.DAG;
17377 EVT VT = N->getValueType(0);
17378 SDLoc SL(N);
17379 SDValue LHS = N->getOperand(0);
17380 SDValue RHS = N->getOperand(1);
17381
17382 if (LHS.getOpcode() == ISD::MUL || RHS.getOpcode() == ISD::MUL) {
17383 if (Subtarget->hasMad64_32()) {
17384 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17385 return Folded;
17386 }
17387 }
17388
17389 if (SDValue V = reassociateScalarOps(N, DAG)) {
17390 return V;
17391 }
17392
17393 if (VT == MVT::i64) {
17394 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17395 return Folded;
17396 }
17397
17398 if ((isMul(LHS) || isMul(RHS)) && Subtarget->hasDot7Insts() &&
17399 (Subtarget->hasDot1Insts() || Subtarget->hasDot8Insts())) {
17400 SDValue TempNode(N, 0);
17401 std::optional<bool> IsSigned;
// NOTE(review): Src0s, Src1s and Src2s used below are presumably declared on
// the absent listing lines 17402-17404 -- confirm against upstream.
17405
17406 // Match the v_dot4 tree, while collecting src nodes.
17407 int ChainLength = 0;
17408 for (int I = 0; I < 4; I++) {
17409 auto MulIdx = isMul(LHS) ? 0 : isMul(RHS) ? 1 : -1;
17410 if (MulIdx == -1)
17411 break;
17412 auto Src0 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(0));
17413 if (!Src0)
17414 break;
17415 auto Src1 = handleMulOperand(TempNode->getOperand(MulIdx)->getOperand(1));
17416 if (!Src1)
17417 break;
17418
17419 auto IterIsSigned = checkDot4MulSignedness(
17420 TempNode->getOperand(MulIdx), *Src0, *Src1,
17421 TempNode->getOperand(MulIdx)->getOperand(0),
17422 TempNode->getOperand(MulIdx)->getOperand(1), DAG);
17423 if (!IterIsSigned)
17424 break;
// The first multiply fixes the signedness; every later multiply must agree.
17425 if (!IsSigned)
17426 IsSigned = *IterIsSigned;
17427 if (*IterIsSigned != *IsSigned)
17428 break;
17429 placeSources(*Src0, *Src1, Src0s, Src1s, I);
17430 auto AddIdx = 1 - MulIdx;
17431 // Allow the special case where add (add (mul24, 0), mul24) became ->
17432 // add (mul24, mul24).
17433 if (I == 2 && isMul(TempNode->getOperand(AddIdx))) {
17434 Src2s.push_back(TempNode->getOperand(AddIdx));
17435 auto Src0 =
17436 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(0));
17437 if (!Src0)
17438 break;
17439 auto Src1 =
17440 handleMulOperand(TempNode->getOperand(AddIdx)->getOperand(1));
17441 if (!Src1)
17442 break;
17443 auto IterIsSigned = checkDot4MulSignedness(
17444 TempNode->getOperand(AddIdx), *Src0, *Src1,
17445 TempNode->getOperand(AddIdx)->getOperand(0),
17446 TempNode->getOperand(AddIdx)->getOperand(1), DAG);
17447 if (!IterIsSigned)
17448 break;
17449 assert(IsSigned);
17450 if (*IterIsSigned != *IsSigned)
17451 break;
17452 placeSources(*Src0, *Src1, Src0s, Src1s, I + 1);
17453 Src2s.push_back(DAG.getConstant(0, SL, MVT::i32));
17454 ChainLength = I + 2;
17455 break;
17456 }
17457
// Step to the non-multiply operand of the add; it is both the accumulator
// candidate for a chain ending here and the next node to inspect.
17458 TempNode = TempNode->getOperand(AddIdx);
17459 Src2s.push_back(TempNode);
17460 ChainLength = I + 1;
17461 if (TempNode->getNumOperands() < 2)
17462 break;
17463 LHS = TempNode->getOperand(0);
17464 RHS = TempNode->getOperand(1);
17465 }
17466
17467 if (ChainLength < 2)
17468 return SDValue();
17469
17470 // Masks were constructed with assumption that we would find a chain of
17471 // length 4. If not, then we need to 0 out the MSB bits (via perm mask of
17472 // 0x0c) so they do not affect dot calculation.
17473 if (ChainLength < 4) {
17474 fixMasks(Src0s, ChainLength);
17475 fixMasks(Src1s, ChainLength);
17476 }
17477
17478 SDValue Src0, Src1;
17479
17480 // If we are just using a single source for both, and have permuted the
17481 // bytes consistently, we can just use the sources without permuting
17482 // (commutation).
17483 bool UseOriginalSrc = false;
17484 if (ChainLength == 4 && Src0s.size() == 1 && Src1s.size() == 1 &&
17485 Src0s.begin()->PermMask == Src1s.begin()->PermMask &&
17486 Src0s.begin()->SrcOp.getValueSizeInBits() >= 32 &&
17487 Src1s.begin()->SrcOp.getValueSizeInBits() >= 32) {
17488 SmallVector<unsigned, 4> SrcBytes;
17489 auto Src0Mask = Src0s.begin()->PermMask;
17490 SrcBytes.push_back(Src0Mask & 0xFF000000);
17491 bool UniqueEntries = true;
17492 for (auto I = 1; I < 4; I++) {
17493 auto NextByte = Src0Mask & (0xFF << ((3 - I) * 8));
17494
17495 if (is_contained(SrcBytes, NextByte)) {
17496 UniqueEntries = false;
17497 break;
17498 }
17499 SrcBytes.push_back(NextByte);
17500 }
17501
17502 if (UniqueEntries) {
17503 UseOriginalSrc = true;
17504
17505 auto *FirstElt = Src0s.begin();
17506 auto FirstEltOp =
17507 getDWordFromOffset(DAG, SL, FirstElt->SrcOp, FirstElt->DWordOffset);
17508
17509 auto *SecondElt = Src1s.begin();
17510 auto SecondEltOp = getDWordFromOffset(DAG, SL, SecondElt->SrcOp,
17511 SecondElt->DWordOffset);
17512
17513 Src0 = DAG.getBitcastedAnyExtOrTrunc(FirstEltOp, SL,
17514 MVT::getIntegerVT(32));
17515 Src1 = DAG.getBitcastedAnyExtOrTrunc(SecondEltOp, SL,
17516 MVT::getIntegerVT(32));
17517 }
17518 }
17519
17520 if (!UseOriginalSrc) {
17521 Src0 = resolveSources(DAG, SL, Src0s, false, true);
17522 Src1 = resolveSources(DAG, SL, Src1s, false, true);
17523 }
17524
// The accumulator is the Src2s entry recorded for the last matched multiply,
// extended or truncated to i32 with the signedness of the chain.
17525 assert(IsSigned);
17526 SDValue Src2 =
17527 DAG.getExtOrTrunc(*IsSigned, Src2s[ChainLength - 1], SL, MVT::i32);
17528
17529 SDValue IID = DAG.getTargetConstant(*IsSigned ? Intrinsic::amdgcn_sdot4
17530 : Intrinsic::amdgcn_udot4,
17531 SL, MVT::i64);
17532
17533 assert(!VT.isVector());
17534 auto Dot = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32, IID, Src0,
17535 Src1, Src2, DAG.getTargetConstant(0, SL, MVT::i1));
17536
17537 return DAG.getExtOrTrunc(*IsSigned, Dot, SL, VT);
17538 }
17539
17540 if (VT != MVT::i32 || !DCI.isAfterLegalizeDAG())
17541 return SDValue();
17542
17543 // add x, zext (setcc) => uaddo_carry x, 0, setcc
17544 // add x, sext (setcc) => usubo_carry x, 0, setcc
17545 unsigned Opc = LHS.getOpcode();
// NOTE(review): the condition guarding the swap below is on the absent
// listing lines 17546-17547 -- confirm against upstream.
17548 std::swap(RHS, LHS);
17549
17550 Opc = RHS.getOpcode();
17551 switch (Opc) {
17552 default:
17553 break;
17554 case ISD::ZERO_EXTEND:
17555 case ISD::SIGN_EXTEND:
17556 case ISD::ANY_EXTEND: {
17557 auto Cond = RHS.getOperand(0);
17558 // If this won't be a real VOPC output, we would still need to insert an
17559 // extra instruction anyway.
17560 if (!isBoolSGPR(Cond))
17561 break;
17562 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17563 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
// NOTE(review): listing line 17564 is absent; per the fold comment above it
// presumably reassigns Opc to the carry opcode -- confirm against upstream.
17565 return DAG.getNode(Opc, SL, VTList, Args);
17566 }
17567 case ISD::UADDO_CARRY: {
17568 // add x, (uaddo_carry y, 0, cc) => uaddo_carry x, y, cc
17569 if (!isNullConstant(RHS.getOperand(1)))
17570 break;
17571 SDValue Args[] = {LHS, RHS.getOperand(0), RHS.getOperand(2)};
17572 return DAG.getNode(ISD::UADDO_CARRY, SDLoc(N), RHS->getVTList(), Args);
17573 }
17574 }
17575 return SDValue();
17576}
17577
// Target DAG combine for a PTRADD node N (operands N0 and N1). Attempts, in
// order:
//   1. (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k));
//   2. tryFoldToMad64_32 when N1 is ISD::MUL and the subtarget has mad64_32;
//   3. foldAddSub64WithZeroLowBitsTo32 for i64 results;
//   4. reassociating (ptradd x, (add y, z)) so that the uniform one of y and z
//      is added to a uniform x first.
// Returns an empty SDValue when nothing applies.
// NOTE(review): listing line 17601 is absent from this extract; see the note
// at the affected statement.
17578SDValue SITargetLowering::performPtrAddCombine(SDNode *N,
17579 DAGCombinerInfo &DCI) const {
17580 SelectionDAG &DAG = DCI.DAG;
17581 SDLoc DL(N);
17582 EVT VT = N->getValueType(0);
17583 SDValue N0 = N->getOperand(0);
17584 SDValue N1 = N->getOperand(1);
17585
17586 // The following folds transform PTRADDs into regular arithmetic in cases
17587 // where the PTRADD wouldn't be folded as an immediate offset into memory
17588 // instructions anyway. They are target-specific in that other targets might
17589 // prefer to not lose information about the pointer arithmetic.
17590
17591 // Fold (ptradd x, shl(0 - v, k)) -> sub(x, shl(v, k)).
17592 // Adapted from DAGCombiner::visitADDLikeCommutative.
17593 SDValue V, K;
17594 if (sd_match(N1, m_Shl(m_Neg(m_Value(V)), m_Value(K)))) {
17595 SDNodeFlags ShlFlags = N1->getFlags();
17596 // If the original shl is NUW and NSW, the first k+1 bits of 0-v are all 0,
17597 // so v is either 0 or the first k+1 bits of v are all 1 -> NSW can be
17598 // preserved.
// NOTE(review): the true arm of this conditional is on the absent listing
// line 17601; per the comment above it presumably yields the NSW flag --
// confirm against upstream.
17599 SDNodeFlags NewShlFlags =
17600 ShlFlags.hasNoUnsignedWrap() && ShlFlags.hasNoSignedWrap()
17602 : SDNodeFlags();
17603 SDValue Inner = DAG.getNode(ISD::SHL, DL, VT, V, K, NewShlFlags);
17604 DCI.AddToWorklist(Inner.getNode());
17605 return DAG.getNode(ISD::SUB, DL, VT, N0, Inner);
17606 }
17607
17608 // Fold into Mad64 if the right-hand side is a MUL. Analogous to a fold in
17609 // performAddCombine.
17610 if (N1.getOpcode() == ISD::MUL) {
17611 if (Subtarget->hasMad64_32()) {
17612 if (SDValue Folded = tryFoldToMad64_32(N, DCI))
17613 return Folded;
17614 }
17615 }
17616
17617 // If the 32 low bits of the constant are all zero, there is nothing to fold
17618 // into an immediate offset, so it's better to eliminate the unnecessary
17619 // addition for the lower 32 bits than to preserve the PTRADD.
17620 // Analogous to a fold in performAddCombine.
17621 if (VT == MVT::i64) {
17622 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17623 return Folded;
17624 }
17625
// The remaining fold only handles an ADD offset with a single use.
17626 if (N1.getOpcode() != ISD::ADD || !N1.hasOneUse())
17627 return SDValue();
17628
17629 SDValue X = N0;
17630 SDValue Y = N1.getOperand(0);
17631 SDValue Z = N1.getOperand(1);
17632 bool YIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Y);
17633 bool ZIsConstant = DAG.isConstantIntBuildVectorOrConstantInt(Z);
17634
17635 if (!YIsConstant && !ZIsConstant && !X->isDivergent() &&
17636 Y->isDivergent() != Z->isDivergent()) {
17637 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, y), z) if x and
17638 // y are uniform and z isn't.
17639 // Reassociate (ptradd x, (add y, z)) -> (ptradd (ptradd x, z), y) if x and
17640 // z are uniform and y isn't.
17641 // The goal is to push uniform operands up in the computation, so that they
17642 // can be handled with scalar operations. We can't use reassociateScalarOps
17643 // for this since it requires two identical commutative operations to
17644 // reassociate.
// After the swap, Y is the uniform operand and Z the divergent one.
17645 if (Y->isDivergent())
17646 std::swap(Y, Z);
17647 // If both additions in the original were NUW, reassociation preserves that.
17648 SDNodeFlags ReassocFlags =
17649 (N->getFlags() & N1->getFlags()) & SDNodeFlags::NoUnsignedWrap;
17650 SDValue UniformInner = DAG.getMemBasePlusOffset(X, Y, DL, ReassocFlags);
17651 DCI.AddToWorklist(UniformInner.getNode());
17652 return DAG.getMemBasePlusOffset(UniformInner, Z, DL, ReassocFlags);
17653 }
17654
17655 return SDValue();
17656}
17657
17658static bool isCtlzOpc(unsigned Opc) {
17659 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON;
17660}
17661
// Target DAG combine for a sub node N. For i64 results only
// foldAddSub64WithZeroLowBitsTo32 is tried. For i32 results the following
// folds are attempted in order:
//   1. sub x, ext(setcc) -> carry operation on (x, 0, setcc);
//   2. sub (usubo_carry x, 0, cc), y -> usubo_carry x, y, cc;
//   3. sub (ctlz (xor x, (sra x, bitwidth - 1))), 1 -> ctls x.
// Returns an empty SDValue when nothing applies.
// NOTE(review): listing lines 17695 and 17720 are absent from this extract;
// see the notes at the affected places.
17662SDValue SITargetLowering::performSubCombine(SDNode *N,
17663 DAGCombinerInfo &DCI) const {
17664 SelectionDAG &DAG = DCI.DAG;
17665 EVT VT = N->getValueType(0);
17666
17667 if (VT == MVT::i64) {
17668 if (SDValue Folded = foldAddSub64WithZeroLowBitsTo32(N, DCI))
17669 return Folded;
17670 }
17671
17672 if (VT != MVT::i32)
17673 return SDValue();
17674
17675 SDLoc SL(N);
17676 SDValue LHS = N->getOperand(0);
17677 SDValue RHS = N->getOperand(1);
17678
17679 // sub x, zext (setcc) => usubo_carry x, 0, setcc
17680 // sub x, sext (setcc) => uaddo_carry x, 0, setcc
17681 unsigned Opc = RHS.getOpcode();
17682 switch (Opc) {
17683 default:
17684 break;
17685 case ISD::ZERO_EXTEND:
17686 case ISD::SIGN_EXTEND:
17687 case ISD::ANY_EXTEND: {
17688 auto Cond = RHS.getOperand(0);
17689 // If this won't be a real VOPC output, we would still need to insert an
17690 // extra instruction anyway.
17691 if (!isBoolSGPR(Cond))
17692 break;
17693 SDVTList VTList = DAG.getVTList(MVT::i32, MVT::i1);
17694 SDValue Args[] = {LHS, DAG.getConstant(0, SL, MVT::i32), Cond};
// NOTE(review): listing line 17695 is absent; per the fold comment above it
// presumably reassigns Opc to the carry opcode -- confirm against upstream.
17696 return DAG.getNode(Opc, SL, VTList, Args);
17697 }
17698 }
17699
17700 if (LHS.getOpcode() == ISD::USUBO_CARRY) {
17701 // sub (usubo_carry x, 0, cc), y => usubo_carry x, y, cc
17702 if (!isNullConstant(LHS.getOperand(1)))
17703 return SDValue();
17704 SDValue Args[] = {LHS.getOperand(0), RHS, LHS.getOperand(2)};
17705 return DAG.getNode(ISD::USUBO_CARRY, SDLoc(N), LHS->getVTList(), Args);
17706 }
17707
17708 // sub (ctlz (xor x, (sra x, 31))), 1 -> ctls x.
17709 if (isOneConstant(RHS) && isCtlzOpc(LHS.getOpcode())) {
17710 SDValue CtlzSrc = LHS.getOperand(0);
17711 // Check for xor x, (sra x, 31) pattern.
17712 if (CtlzSrc.getOpcode() == ISD::XOR) {
17713 SDValue X = CtlzSrc.getOperand(0);
17714 SDValue SignExt = CtlzSrc.getOperand(1);
17715 // Try both ordering of XOR operands.
17716 if (SignExt.getOpcode() != ISD::SRA)
17717 std::swap(X, SignExt);
17718 if (SignExt.getOpcode() == ISD::SRA && SignExt.getOperand(0) == X) {
// NOTE(review): the initializer of ShiftAmt is on the absent listing line
// 17720; presumably it extracts the constant shift amount of the SRA --
// confirm against upstream.
17719 ConstantSDNode *ShiftAmt =
17721 unsigned BitWidth = X.getValueType().getScalarSizeInBits();
17722 if (ShiftAmt && ShiftAmt->getZExtValue() == BitWidth - 1)
17723 return DAG.getNode(ISD::CTLS, SL, VT, X);
17724 }
17725 }
17726 }
17727
17728 return SDValue();
17729}
17730
17731SDValue SITargetLowering::performFAddCombine(SDNode *N,
17732 DAGCombinerInfo &DCI) const {
17733 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17734 return SDValue();
17735
17736 SelectionDAG &DAG = DCI.DAG;
17737 EVT VT = N->getValueType(0);
17738
17739 SDLoc SL(N);
17740 SDValue LHS = N->getOperand(0);
17741 SDValue RHS = N->getOperand(1);
17742
17743 // These should really be instruction patterns, but writing patterns with
17744 // source modifiers is a pain.
17745
17746 // fadd (fadd (a, a), b) -> mad 2.0, a, b
17747 if (LHS.getOpcode() == ISD::FADD) {
17748 SDValue A = LHS.getOperand(0);
17749 if (A == LHS.getOperand(1)) {
17750 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17751 if (FusedOp != 0) {
17752 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17753 return DAG.getNode(FusedOp, SL, VT, A, Two, RHS);
17754 }
17755 }
17756 }
17757
17758 // fadd (b, fadd (a, a)) -> mad 2.0, a, b
17759 if (RHS.getOpcode() == ISD::FADD) {
17760 SDValue A = RHS.getOperand(0);
17761 if (A == RHS.getOperand(1)) {
17762 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17763 if (FusedOp != 0) {
17764 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17765 return DAG.getNode(FusedOp, SL, VT, A, Two, LHS);
17766 }
17767 }
17768 }
17769
17770 return SDValue();
17771}
17772
17773SDValue SITargetLowering::performFSubCombine(SDNode *N,
17774 DAGCombinerInfo &DCI) const {
17775 if (DCI.getDAGCombineLevel() < AfterLegalizeDAG)
17776 return SDValue();
17777
17778 SelectionDAG &DAG = DCI.DAG;
17779 SDLoc SL(N);
17780 EVT VT = N->getValueType(0);
17781 assert(!VT.isVector());
17782
17783 // Try to get the fneg to fold into the source modifier. This undoes generic
17784 // DAG combines and folds them into the mad.
17785 //
17786 // Only do this if we are not trying to support denormals. v_mad_f32 does
17787 // not support denormals ever.
17788 SDValue LHS = N->getOperand(0);
17789 SDValue RHS = N->getOperand(1);
17790 if (LHS.getOpcode() == ISD::FADD) {
17791 // (fsub (fadd a, a), c) -> mad 2.0, a, (fneg c)
17792 SDValue A = LHS.getOperand(0);
17793 if (A == LHS.getOperand(1)) {
17794 unsigned FusedOp = getFusedOpcode(DAG, N, LHS.getNode());
17795 if (FusedOp != 0) {
17796 const SDValue Two = DAG.getConstantFP(2.0, SL, VT);
17797 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
17798
17799 return DAG.getNode(FusedOp, SL, VT, A, Two, NegRHS);
17800 }
17801 }
17802 }
17803
17804 if (RHS.getOpcode() == ISD::FADD) {
17805 // (fsub c, (fadd a, a)) -> mad -2.0, a, c
17806
17807 SDValue A = RHS.getOperand(0);
17808 if (A == RHS.getOperand(1)) {
17809 unsigned FusedOp = getFusedOpcode(DAG, N, RHS.getNode());
17810 if (FusedOp != 0) {
17811 const SDValue NegTwo = DAG.getConstantFP(-2.0, SL, VT);
17812 return DAG.getNode(FusedOp, SL, VT, A, NegTwo, LHS);
17813 }
17814 }
17815 }
17816
17817 return SDValue();
17818}
17819
17820SDValue SITargetLowering::performFDivCombine(SDNode *N,
17821 DAGCombinerInfo &DCI) const {
17822 SelectionDAG &DAG = DCI.DAG;
17823 SDLoc SL(N);
17824 EVT VT = N->getValueType(0);
17825
17826 if (VT != MVT::f16 && VT != MVT::bf16)
17827 return SDValue();
17828
17829 SDValue LHS = N->getOperand(0);
17830 SDValue RHS = N->getOperand(1);
17831
17832 SDNodeFlags Flags = N->getFlags();
17833 SDNodeFlags RHSFlags = RHS->getFlags();
17834 if (!Flags.hasAllowContract() || !RHSFlags.hasAllowContract() ||
17835 !RHS->hasOneUse())
17836 return SDValue();
17837
17838 if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
17839 bool IsNegative = false;
17840 if (CLHS->isOne() || (IsNegative = CLHS->isMinusOne())) {
17841 // fdiv contract 1.0, (sqrt contract x) -> rsq
17842 // fdiv contract -1.0, (sqrt contract x) -> fneg(rsq)
17843 if (RHS.getOpcode() == ISD::FSQRT) {
17844 // TODO: Or in RHS flags, somehow missing from SDNodeFlags
17845 SDValue SqrtOp = RHS.getOperand(0);
17846 SDValue Rsq;
17847 if (isOperationLegal(ISD::FSQRT, VT)) {
17848 // fsqrt legality correlates to rsq availability of the same type.
17849 Rsq = DAG.getNode(AMDGPUISD::RSQ, SL, VT, SqrtOp, Flags);
17850 } else if (VT == MVT::f16) {
17851 // Targets without 16-bit instructions (gfx6/gfx7) have no f16 rsq,
17852 // but v_rsq_f32 is more than accurate enough for f16. Unlike bf16,
17853 // every f16 value (including denormals) extends to a normal f32, and
17854 // an f16 rsq result is never denormal, so the f32 reciprocal square
17855 // root needs no denormal handling. Compute it in f32 and round back.
17856 SDValue Ext =
17857 DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, SqrtOp, Flags);
17858 SDValue F32Rsq =
17859 DAG.getNode(AMDGPUISD::RSQ, SL, MVT::f32, Ext, Flags);
17860 Rsq = DAG.getNode(ISD::FP_ROUND, SL, VT, F32Rsq,
17861 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
17862 } else {
17863 // bf16 shares f32's exponent range, so bf16 denormals would extend to
17864 // f32 denormals that v_rsq_f32 does not handle. Leave it expanded.
17865 return SDValue();
17866 }
17867 return IsNegative ? DAG.getNode(ISD::FNEG, SL, VT, Rsq, Flags) : Rsq;
17868 }
17869 }
17870 }
17871
17872 return SDValue();
17873}
17874
17875SDValue SITargetLowering::performFMulCombine(SDNode *N,
17876 DAGCombinerInfo &DCI) const {
17877 SelectionDAG &DAG = DCI.DAG;
17878 EVT VT = N->getValueType(0);
17879 EVT ScalarVT = VT.getScalarType();
17880 EVT IntVT = VT.changeElementType(*DAG.getContext(), MVT::i32);
17881
17882 if (!N->isDivergent() && getSubtarget()->hasSALUFloatInsts() &&
17883 (ScalarVT == MVT::f32 || ScalarVT == MVT::f16)) {
17884 // Prefer to use s_mul_f16/f32 instead of v_ldexp_f16/f32.
17885 return SDValue();
17886 }
17887
17888 SDValue LHS = N->getOperand(0);
17889 SDValue RHS = N->getOperand(1);
17890
17891 // It is cheaper to realize i32 inline constants as compared against
17892 // materializing f16 or f64 (or even non-inline f32) values,
17893 // possible via ldexp usage, as shown below :
17894 //
17895 // Given : A = 2^a & B = 2^b ; where a and b are integers.
17896 // fmul x, (select y, A, B) -> ldexp( x, (select i32 y, a, b) )
17897 // fmul x, (select y, -A, -B) -> ldexp( (fneg x), (select i32 y, a, b) )
17898 if ((ScalarVT == MVT::f64 || ScalarVT == MVT::f32 || ScalarVT == MVT::f16) &&
17899 (RHS.hasOneUse() && RHS.getOpcode() == ISD::SELECT)) {
17900 const ConstantFPSDNode *TrueNode = isConstOrConstSplatFP(RHS.getOperand(1));
17901 if (!TrueNode)
17902 return SDValue();
17903 const ConstantFPSDNode *FalseNode =
17904 isConstOrConstSplatFP(RHS.getOperand(2));
17905 if (!FalseNode)
17906 return SDValue();
17907
17908 if (TrueNode->isNegative() != FalseNode->isNegative())
17909 return SDValue();
17910
17911 // For f32, only non-inline constants should be transformed.
17912 const SIInstrInfo *TII = getSubtarget()->getInstrInfo();
17913 if (ScalarVT == MVT::f32 &&
17914 TII->isInlineConstant(TrueNode->getValueAPF()) &&
17915 TII->isInlineConstant(FalseNode->getValueAPF()))
17916 return SDValue();
17917
17918 int TrueNodeExpVal = TrueNode->getValueAPF().getExactLog2Abs();
17919 if (TrueNodeExpVal == INT_MIN)
17920 return SDValue();
17921 int FalseNodeExpVal = FalseNode->getValueAPF().getExactLog2Abs();
17922 if (FalseNodeExpVal == INT_MIN)
17923 return SDValue();
17924
17925 SDLoc SL(N);
17926 SDValue SelectNode =
17927 DAG.getNode(ISD::SELECT, SL, IntVT, RHS.getOperand(0),
17928 DAG.getSignedConstant(TrueNodeExpVal, SL, IntVT),
17929 DAG.getSignedConstant(FalseNodeExpVal, SL, IntVT));
17930
17931 LHS = TrueNode->isNegative()
17932 ? DAG.getNode(ISD::FNEG, SL, VT, LHS, LHS->getFlags())
17933 : LHS;
17934
17935 return DAG.getNode(ISD::FLDEXP, SL, VT, LHS, SelectNode, N->getFlags());
17936 }
17937
17938 return SDValue();
17939}
17940
// Target DAG combine for an f32 FMA on subtargets with dot10 instructions.
// Matches two nested FMAs whose multiplicands are fp-extended elements,
// taken at two different indices, of the same pair of v2f16 vectors, and
// replaces them with a single AMDGPUISD::FDOT2 that accumulates into the
// inner FMA's addend. Fusion is required to be permitted, either through
// FPOpFusion::Fast or through the allow-contract flag on both FMAs.
// Returns an empty SDValue when the pattern does not match.
// NOTE(review): listing lines 17970 and 17988 are absent from this extract;
// see the notes at the affected places.
17941SDValue SITargetLowering::performFMACombine(SDNode *N,
17942 DAGCombinerInfo &DCI) const {
17943 SelectionDAG &DAG = DCI.DAG;
17944 EVT VT = N->getValueType(0);
17945 SDLoc SL(N);
17946
17947 if (!Subtarget->hasDot10Insts() || VT != MVT::f32)
17948 return SDValue();
17949
17950 // FMA((F32)S0.x, (F32)S1. x, FMA((F32)S0.y, (F32)S1.y, (F32)z)) ->
17951 // FDOT2((V2F16)S0, (V2F16)S1, (F32)z))
17952 SDValue Op1 = N->getOperand(0);
17953 SDValue Op2 = N->getOperand(1);
17954 SDValue FMA = N->getOperand(2);
17955
17956 if (FMA.getOpcode() != ISD::FMA || Op1.getOpcode() != ISD::FP_EXTEND ||
17957 Op2.getOpcode() != ISD::FP_EXTEND)
17958 return SDValue();
17959
17960 // fdot2_f32_f16 always flushes fp32 denormal operand and output to zero,
17961 // regardless of the denorm mode setting. Therefore,
17962 // fp-contract is sufficient to allow generating fdot2.
17963 const TargetOptions &Options = DAG.getTarget().Options;
17964 if (Options.AllowFPOpFusion == FPOpFusion::Fast ||
17965 (N->getFlags().hasAllowContract() &&
17966 FMA->getFlags().hasAllowContract())) {
// Strip the FP_EXTENDs of the outer multiplicands.
17967 Op1 = Op1.getOperand(0);
17968 Op2 = Op2.getOperand(0);
// NOTE(review): the second half of this condition is on the absent listing
// line 17970; presumably it applies the same opcode check to Op2 -- confirm
// against upstream.
17969 if (Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17971 return SDValue();
17972
17973 SDValue Vec1 = Op1.getOperand(0);
17974 SDValue Idx1 = Op1.getOperand(1);
17975 SDValue Vec2 = Op2.getOperand(0);
17976
17977 SDValue FMAOp1 = FMA.getOperand(0);
17978 SDValue FMAOp2 = FMA.getOperand(1);
17979 SDValue FMAAcc = FMA.getOperand(2);
17980
17981 if (FMAOp1.getOpcode() != ISD::FP_EXTEND ||
17982 FMAOp2.getOpcode() != ISD::FP_EXTEND)
17983 return SDValue();
17984
17985 FMAOp1 = FMAOp1.getOperand(0);
17986 FMAOp2 = FMAOp2.getOperand(0);
// NOTE(review): the second half of this condition is on the absent listing
// line 17988; presumably it applies the same opcode check to FMAOp2 --
// confirm against upstream.
17987 if (FMAOp1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
17989 return SDValue();
17990
17991 SDValue Vec3 = FMAOp1.getOperand(0);
17992 SDValue Vec4 = FMAOp2.getOperand(0);
17993 SDValue Idx2 = FMAOp1.getOperand(1);
17994
// Each FMA must read both of its multiplicands at the same element index.
17995 if (Idx1 != Op2.getOperand(1) || Idx2 != FMAOp2.getOperand(1) ||
17996 // Idx1 and Idx2 cannot be the same.
17997 Idx1 == Idx2)
17998 return SDValue();
17999
18000 if (Vec1 == Vec2 || Vec3 == Vec4)
18001 return SDValue();
18002
18003 if (Vec1.getValueType() != MVT::v2f16 || Vec2.getValueType() != MVT::v2f16)
18004 return SDValue();
18005
// The inner FMA must use the same two vectors as the outer one, in either
// order.
18006 if ((Vec1 == Vec3 && Vec2 == Vec4) || (Vec1 == Vec4 && Vec2 == Vec3)) {
18007 return DAG.getNode(AMDGPUISD::FDOT2, SL, MVT::f32, Vec1, Vec2, FMAAcc,
18008 DAG.getTargetConstant(0, SL, MVT::i1));
18009 }
18010 }
18011 return SDValue();
18012}
18013
18014// Given a double-precision ordered or unordered comparison, return the
18015// condition code for an equivalent integral comparison of the operands' upper
18016// 32 bits, or `SETCC_INVALID` if not possible.
18017// For simplicity, no simplification occurs if the operands are not both known
18018// to have sign bit zero.
18019//
18020// EQ/NE:
18021// If LHS.lo32 == RHS.lo32:
18022// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
18023// If LHS.lo32 != RHS.lo32:
18024// setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
18025// The reduction is not possible if operands may be +0 and -0.
18026// For ordered eq / unordered ne, at most one operand may be NaN.
18027// For unordered eq / ordered ne, neither operand can be NaN.
18028//
18029// LT/GE:
18030// If LHS.lo32 >= RHS.lo32 (unsigned):
18031// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
18032// If LHS.lo32 < RHS.lo32 (unsigned):
18033// setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
18034// The reduction is only supported if both operands are nonnegative.
18035// For ordered lt / unordered ge, the RHS cannot be NaN.
18036// For unordered lt / ordered ge, neither operand can be NaN.
18037//
18038// LE/GT:
18039// If LHS.lo32 > RHS.lo32 (unsigned):
18040// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
18041// If LHS.lo32 <= RHS.lo32 (unsigned):
18042// setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
18043// The reduction is only supported if both operands are nonnegative.
18044// For unordered le / ordered gt, the LHS cannot be NaN.
18045// For ordered le / unordered gt, neither operand can be NaN.
//
// The returned code is the plain integer form (SETEQ/SETNE/SETLT/...); the
// caller applies it to the high 32-bit halves of LHS and RHS.
//
// NOTE(review): listing line 18046 (the first line of the signature) is
// missing from this listing. Judging by the call in performSetCCCombine it
// presumably declares `tryReduceF64CompareToHiHalf` returning ISD::CondCode
// with a leading `CC` parameter - confirm against upstream.
18047 const SDValue LHS,
18048 const SDValue RHS,
18049 const SelectionDAG &DAG) {
18050 EVT VT = LHS.getValueType();
18051 assert(VT == MVT::f64 && "Incorrect operand type!");
18052
18053 const KnownBits RHSBits = DAG.computeKnownBits(RHS);
18054 // Bail if RHS sign bit is not known to be zero.
18055 if (!RHSBits.Zero.isSignBitSet())
18056 return ISD::SETCC_INVALID;
18057
// Known bits of the low 32-bit half; used below to decide how the low halves
// compare without looking at the high halves.
18058 const KnownBits RHSKnownLo32 = RHSBits.trunc(32);
// NOTE(review): listing line 18060 (the initializer of RHSFPClass) is missing
// from this listing.
18059 const KnownFPClass RHSFPClass =
18061 const bool RHSMaybeNaN = !RHSFPClass.isKnownNeverNaN();
18062
18063 const KnownBits LHSBits = DAG.computeKnownBits(LHS);
18064 const KnownBits LHSKnownLo32 = LHSBits.trunc(32);
// NOTE(review): listing line 18066 (the initializer of LHSFPClass) is missing
// from this listing.
18065 const KnownFPClass LHSFPClass =
18067 const bool LHSMaybeNaN = !LHSFPClass.isKnownNeverNaN();
18068
18069 // Bail if LHS sign bit is not known to be zero.
18070 if (!LHSBits.Zero.isSignBitSet())
18071 return ISD::SETCC_INVALID;
18072
// Every `break` below leaves the switch and reaches the final
// `return ISD::SETCC_INVALID`, i.e. "no reduction possible".
18073 switch (CC) {
18074 default:
18075 break;
18076 case ISD::SETEQ:
18077 case ISD::SETOEQ:
18078 case ISD::SETUEQ:
18079 case ISD::SETONE:
18080 case ISD::SETUNE: {
18081 // OEQ should be false if either operand is NaN, so it suffices that at
18082 // least one operand is not NaN.
18083 if (CC == ISD::SETOEQ && LHSMaybeNaN && RHSMaybeNaN)
18084 break;
18085 // UEQ should be true if either operand is NaN, but this cannot be checked
18086 // on underlying bits.
18087 if (CC == ISD::SETUEQ && (LHSMaybeNaN || RHSMaybeNaN))
18088 break;
18089 // ONE should be false if either operand is NaN, but this cannot be
18090 // checked on underlying bits.
18091 if (CC == ISD::SETONE && (LHSMaybeNaN || RHSMaybeNaN))
18092 break;
18093 // UNE should be true if either operand is NaN, so it suffices that they
18094 // are not both NaN.
18095 if (CC == ISD::SETUNE && LHSMaybeNaN && RHSMaybeNaN)
18096 break;
18097
// std::nullopt means the known bits cannot decide whether the low halves are
// equal; in that case no reduction is attempted.
18098 const std::optional<bool> KnownEq =
18099 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
18100
18101 if (!KnownEq)
18102 break;
18103
// Low halves known equal: equality of the doubles is equality of the high
// halves.
18104 if (*KnownEq)
18105 return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
18106 ? ISD::SETEQ
18107 : ISD::SETNE;
18108
// Low halves known different: the result is constant.
// NOTE(review): listing line 18110 (the true-arm of this conditional) is
// missing; the header comment ("false/true") suggests it is the always-false
// condition code - confirm against upstream.
18109 return (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETUEQ)
18111 : ISD::SETTRUE;
18112 }
18113 case ISD::SETLT:
18114 case ISD::SETOLT:
18115 case ISD::SETULT:
18116 case ISD::SETGE:
18117 case ISD::SETOGE:
18118 case ISD::SETUGE: {
18119 // OLT should be false if either operand is NaN.
18120 // Since NaNs have maximum exponent and nonzero mantissa, false positives
18121 // are only possible if the RHS is NaN. (No issue with RHS == +inf since
18122 // the inequality is strict)
18123 if (CC == ISD::SETOLT && RHSMaybeNaN)
18124 break;
18125 // ULT should be true if either operand is NaN, but this cannot be ensured
18126 // with a truncated comparison.
18127 if (CC == ISD::SETULT && (LHSMaybeNaN || RHSMaybeNaN))
18128 break;
18129 // OGE should be false if either operand is NaN, but this cannot be
18130 // ensured with a truncated comparison.
18131 if (CC == ISD::SETOGE && (LHSMaybeNaN || RHSMaybeNaN))
18132 break;
18133 // UGE should be true if either operand is NaN.
18134 // False negatives are only possible if the RHS is NaN.
18135 // (No issue with RHS == +inf since the inequality is inclusive)
18136 if (CC == ISD::SETUGE && RHSMaybeNaN)
18137 break;
18138
18139 const std::optional<bool> KnownUge =
18140 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
18141
18142 if (!KnownUge)
18143 break;
18144
18145 if (*KnownUge) {
18146 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
18147 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
18148 ? ISD::SETLT
18149 : ISD::SETGE;
18150 }
18151 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
18152 return (CC == ISD::SETLT || CC == ISD::SETOLT || CC == ISD::SETULT)
18153 ? ISD::SETLE
18154 : ISD::SETGT;
18155 }
18156 case ISD::SETLE:
18157 case ISD::SETOLE:
18158 case ISD::SETULE:
18159 case ISD::SETGT:
18160 case ISD::SETOGT:
18161 case ISD::SETUGT: {
18162 // OLE should be false if either operand is NaN, but this cannot be
18163 // ensured with a truncated comparison.
18164 if (CC == ISD::SETOLE && (LHSMaybeNaN || RHSMaybeNaN))
18165 break;
18166 // ULE should be true if either operand is NaN.
18167 // False negatives are only possible if the LHS is NaN.
18168 // (No issue with LHS == +inf since the inequality is inclusive)
18169 if (CC == ISD::SETULE && LHSMaybeNaN)
18170 break;
18171 // OGT should be false if either operand is NaN.
18172 // False positives are only possible if the LHS is NaN.
18173 // (No issue with LHS == +inf since the inequality is strict)
18174 if (CC == ISD::SETOGT && LHSMaybeNaN)
18175 break;
18176 // UGT should be true if either operand is NaN, but this cannot be ensured
18177 // with a truncated comparison.
18178 if (CC == ISD::SETUGT && (LHSMaybeNaN || RHSMaybeNaN))
18179 break;
18180
18181 const std::optional<bool> KnownUle =
18182 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
18183
18184 if (!KnownUle)
18185 break;
18186
18187 if (*KnownUle) {
18188 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
18189 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
18190 ? ISD::SETLE
18191 : ISD::SETGT;
18192 }
18193 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
18194 return (CC == ISD::SETLE || CC == ISD::SETOLE || CC == ISD::SETULE)
18195 ? ISD::SETLT
18196 : ISD::SETGE;
18197 }
18198 }
18199
18200 return ISD::SETCC_INVALID;
18201}
18202
// DAG combine for SETCC nodes. The rewrites are tried in this order:
//  1. Compares of a sign-extended i1, or of a select between two constants,
//     against a constant are folded to the i1 condition or its negation.
//  2. i64 compares are narrowed to the high 32-bit halves when the known bits
//     of the low halves already decide how the low halves compare.
//  3. An i64 add/sub feeding an overflow-style compare is split into 32-bit
//     UADDO/USUBO plus a carry op, and the carry-out replaces the compare.
//  4. `fabs(x) ==/!= +inf` is turned into an FP_CLASS test.
//  5. f64 compares are narrowed via tryReduceF64CompareToHiHalf.
// Returns the replacement value, or an empty SDValue if nothing applied.
18203SDValue SITargetLowering::performSetCCCombine(SDNode *N,
18204 DAGCombinerInfo &DCI) const {
18205 SelectionDAG &DAG = DCI.DAG;
18206 SDLoc SL(N);
18207
18208 SDValue LHS = N->getOperand(0);
18209 SDValue RHS = N->getOperand(1);
18210 EVT VT = LHS.getValueType();
18211 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
18212
// Canonicalize an integer constant operand to the RHS, swapping the
// condition code accordingly.
// NOTE(review): listing line 18215 is missing inside the block below; the
// following `if (CRHS)` suggests it re-tests LHS for a constant and assigns
// the result to CRHS - confirm against upstream.
18213 auto *CRHS = dyn_cast<ConstantSDNode>(RHS);
18214 if (!CRHS) {
18216 if (CRHS) {
18217 std::swap(LHS, RHS);
18218 CC = getSetCCSwappedOperands(CC);
18219 }
18220 }
18221
18222 if (CRHS) {
18223 if (VT == MVT::i32 && LHS.getOpcode() == ISD::SIGN_EXTEND &&
18224 isBoolSGPR(LHS.getOperand(0))) {
18225 // setcc (sext from i1 cc), -1, ne|sgt|ult) => not cc => xor cc, -1
18226 // setcc (sext from i1 cc), -1, eq|sle|uge) => cc
18227 // setcc (sext from i1 cc), 0, eq|sge|ule) => not cc => xor cc, -1
18228 // setcc (sext from i1 cc), 0, ne|ugt|slt) => cc
18229 if ((CRHS->isAllOnes() &&
18230 (CC == ISD::SETNE || CC == ISD::SETGT || CC == ISD::SETULT)) ||
18231 (CRHS->isZero() &&
18232 (CC == ISD::SETEQ || CC == ISD::SETGE || CC == ISD::SETULE)))
18233 return DAG.getNode(ISD::XOR, SL, MVT::i1, LHS.getOperand(0),
18234 DAG.getAllOnesConstant(SL, MVT::i1));
18235 if ((CRHS->isAllOnes() &&
18236 (CC == ISD::SETEQ || CC == ISD::SETLE || CC == ISD::SETUGE)) ||
18237 (CRHS->isZero() &&
18238 (CC == ISD::SETNE || CC == ISD::SETUGT || CC == ISD::SETLT)))
18239 return LHS.getOperand(0);
18240 }
18241
18242 const APInt &CRHSVal = CRHS->getAPIntValue();
18243 if ((CC == ISD::SETEQ || CC == ISD::SETNE) &&
18244 LHS.getOpcode() == ISD::SELECT &&
18245 isa<ConstantSDNode>(LHS.getOperand(1)) &&
18246 isa<ConstantSDNode>(LHS.getOperand(2)) &&
18247 isBoolSGPR(LHS.getOperand(0))) {
18248 // Given CT != FT:
18249 // setcc (select cc, CT, CF), CF, eq => xor cc, -1
18250 // setcc (select cc, CT, CF), CF, ne => cc
18251 // setcc (select cc, CT, CF), CT, ne => xor cc, -1
18252 // setcc (select cc, CT, CF), CT, eq => cc
18253 const APInt &CT = LHS.getConstantOperandAPInt(1);
18254 const APInt &CF = LHS.getConstantOperandAPInt(2);
18255
18256 if (CT != CF) {
18257 if ((CF == CRHSVal && CC == ISD::SETEQ) ||
18258 (CT == CRHSVal && CC == ISD::SETNE))
18259 return DAG.getNOT(SL, LHS.getOperand(0), MVT::i1);
18260 if ((CF == CRHSVal && CC == ISD::SETNE) ||
18261 (CT == CRHSVal && CC == ISD::SETEQ))
18262 return LHS.getOperand(0);
18263 }
18264 }
18265 }
18266
18267 // Truncate 64-bit setcc to test only upper 32-bits of its operands in the
18268 // following cases where information about the lower 32-bits of its operands
18269 // is known:
18270 //
18271 // If LHS.lo32 == RHS.lo32:
18272 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, eq/ne
18273 // If LHS.lo32 != RHS.lo32:
18274 // setcc LHS, RHS, eq/ne => setcc LHS.hi32, RHS.hi32, false/true
18275 // If LHS.lo32 >= RHS.lo32 (unsigned):
18276 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]lt/ge
18277 // If LHS.lo32 > RHS.lo32 (unsigned):
18278 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]lt/ge
18279 // If LHS.lo32 <= RHS.lo32 (unsigned):
18280 // setcc LHS, RHS, [u]le/gt => LHS.hi32, RHS.hi32, [u]le/gt
18281 // If LHS.lo32 < RHS.lo32 (unsigned):
18282 // setcc LHS, RHS, [u]lt/ge => LHS.hi32, RHS.hi32, [u]le/gt
18283 if (VT == MVT::i64) {
18284 const KnownBits LHSKnownLo32 = DAG.computeKnownBits(LHS).trunc(32);
18285 const KnownBits RHSKnownLo32 = DAG.computeKnownBits(RHS).trunc(32);
18286
18287 // NewCC is valid iff we can truncate the setcc to only test the upper 32
18288 // bits
// NOTE(review): listing line 18289 is missing here; it presumably declares
// NewCC initialized to ISD::SETCC_INVALID (see the check after the switch) -
// confirm against upstream.
18290
18291 switch (CC) {
18292 default:
18293 break;
18294 case ISD::SETEQ: {
18295 const std::optional<bool> KnownEq =
18296 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
18297 if (KnownEq)
18298 NewCC = *KnownEq ? ISD::SETEQ : ISD::SETFALSE;
18299
18300 break;
18301 }
18302 case ISD::SETNE: {
18303 const std::optional<bool> KnownEq =
18304 KnownBits::eq(LHSKnownLo32, RHSKnownLo32);
18305 if (KnownEq)
18306 NewCC = *KnownEq ? ISD::SETNE : ISD::SETTRUE;
18307
18308 break;
18309 }
18310 case ISD::SETULT:
18311 case ISD::SETUGE:
18312 case ISD::SETLT:
18313 case ISD::SETGE: {
18314 const std::optional<bool> KnownUge =
18315 KnownBits::uge(LHSKnownLo32, RHSKnownLo32);
18316 if (KnownUge) {
18317 if (*KnownUge) {
18318 // LHS.lo32 uge RHS.lo32, so LHS >= RHS iff LHS.hi32 >= RHS.hi32
18319 NewCC = CC;
18320 } else {
18321 // LHS.lo32 ult RHS.lo32, so LHS >= RHS iff LHS.hi32 > RHS.hi32
18322 NewCC = CC == ISD::SETULT ? ISD::SETULE
18323 : CC == ISD::SETUGE ? ISD::SETUGT
18324 : CC == ISD::SETLT ? ISD::SETLE
18325 : ISD::SETGT;
18326 }
18327 }
18328 break;
18329 }
18330 case ISD::SETULE:
18331 case ISD::SETUGT:
18332 case ISD::SETLE:
18333 case ISD::SETGT: {
18334 const std::optional<bool> KnownUle =
18335 KnownBits::ule(LHSKnownLo32, RHSKnownLo32);
18336 if (KnownUle) {
18337 if (*KnownUle) {
18338 // LHS.lo32 ule RHS.lo32, so LHS <= RHS iff LHS.hi32 <= RHS.hi32
18339 NewCC = CC;
18340 } else {
18341 // LHS.lo32 ugt RHS.lo32, so LHS <= RHS iff LHS.hi32 < RHS.hi32
18342 NewCC = CC == ISD::SETULE ? ISD::SETULT
18343 : CC == ISD::SETUGT ? ISD::SETUGE
18344 : CC == ISD::SETLE ? ISD::SETLT
18345 : ISD::SETGE;
18346 }
18347 }
18348 break;
18349 }
18350 }
18351
18352 if (NewCC != ISD::SETCC_INVALID)
18353 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
18354 getHiHalf64(RHS, DAG), NewCC);
18355 }
18356
18357 // Eliminate setcc by using carryout from add/sub instruction
18358
18359 // LHS = ADD i64 RHS, Z LHSlo = UADDO i32 RHSlo, Zlo
18360 // setcc LHS ult RHS -> LHSHi = UADDO_CARRY i32 RHShi, Zhi
18361 // similarly for subtraction
18362
18363 // LHS = ADD i64 Y, 1 LHSlo = UADDO i32 Ylo, 1
18364 // setcc LHS eq 0 -> LHSHi = UADDO_CARRY i32 Yhi, 0
18365
// NOTE(review): listing lines 18367 and 18369 are missing from the condition
// below; they presumably hold the pattern matches on LHS for the SETULT (add)
// and SETUGT (sub) alternatives described above - confirm against upstream.
18366 if (VT == MVT::i64 && ((CC == ISD::SETULT &&
18368 (CC == ISD::SETUGT &&
18370 (CC == ISD::SETEQ && CRHS && CRHS->isZero() &&
18371 sd_match(LHS, m_Add(m_Value(), m_One()))))) {
18372 bool IsAdd = LHS.getOpcode() == ISD::ADD;
18373
18374 SDValue Op0 = LHS.getOperand(0);
18375 SDValue Op1 = LHS.getOperand(1);
18376
18377 SDValue Op0Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op0);
18378 SDValue Op1Lo = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Op1);
18379
18380 SDValue Op0Hi = getHiHalf64(Op0, DAG);
18381 SDValue Op1Hi = getHiHalf64(Op1, DAG);
18382
// Low half: produces the low 32 result bits plus a carry/borrow (value 1).
18383 SDValue NodeLo =
18384 DAG.getNode(IsAdd ? ISD::UADDO : ISD::USUBO, SL,
18385 DAG.getVTList(MVT::i32, MVT::i1), {Op0Lo, Op1Lo});
18386
// High half: consumes the low-half carry/borrow and produces its own.
18387 SDValue CarryInHi = NodeLo.getValue(1);
18388 SDValue NodeHi = DAG.getNode(IsAdd ? ISD::UADDO_CARRY : ISD::USUBO_CARRY,
18389 SL, DAG.getVTList(MVT::i32, MVT::i1),
18390 {Op0Hi, Op1Hi, CarryInHi});
18391
18392 SDValue ResultLo = NodeLo.getValue(0);
18393 SDValue ResultHi = NodeHi.getValue(0);
18394
18395 SDValue JoinedResult =
18396 DAG.getBuildVector(MVT::v2i32, SL, {ResultLo, ResultHi});
18397
// Replace the original 64-bit add/sub with the recombined halves, and use the
// high-half carry/borrow bit as the result of this setcc.
18398 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, JoinedResult);
18399 SDValue Overflow = NodeHi.getValue(1);
18400 DCI.CombineTo(LHS.getNode(), Result);
18401 return Overflow;
18402 }
18403
// Everything below applies to floating-point compares only (f32, f64, and
// f16 when the subtarget has 16-bit instructions).
18404 if (VT != MVT::f32 && VT != MVT::f64 &&
18405 (!Subtarget->has16BitInsts() || VT != MVT::f16))
18406 return SDValue();
18407
18408 // Match isinf/isfinite pattern
18409 // (fcmp oeq (fabs x), inf) -> (fp_class x, (p_infinity | n_infinity))
18410 // (fcmp one (fabs x), inf) -> (fp_class x,
18411 // (p_normal | n_normal | p_subnormal | n_subnormal | p_zero | n_zero)
18412 if ((CC == ISD::SETOEQ || CC == ISD::SETONE) &&
18413 LHS.getOpcode() == ISD::FABS) {
// Note: this CRHS shadows the integer-constant CRHS declared above.
18414 const ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
18415 if (!CRHS)
18416 return SDValue();
18417
18418 const APFloat &APF = CRHS->getValueAPF();
18419 if (APF.isInfinity() && !APF.isNegative()) {
// NOTE(review): listing lines 18421 and 18423-18425 (the initializers of the
// two masks) are missing; per the comment above they presumably combine the
// infinity class bits and the finite class bits respectively - confirm
// against upstream.
18420 const unsigned IsInfMask =
18422 const unsigned IsFiniteMask =
18426 unsigned Mask = CC == ISD::SETOEQ ? IsInfMask : IsFiniteMask;
18427 return DAG.getNode(AMDGPUISD::FP_CLASS, SL, MVT::i1, LHS.getOperand(0),
18428 DAG.getConstant(Mask, SL, MVT::i32));
18429 }
18430 }
18431
// Try to decide an f64 compare from the high 32-bit halves alone.
18432 if (VT == MVT::f64) {
18433 ISD::CondCode HiHalfCC = tryReduceF64CompareToHiHalf(CC, LHS, RHS, DAG);
18434 if (HiHalfCC != ISD::SETCC_INVALID)
18435 return DAG.getSetCC(SL, N->getValueType(0), getHiHalf64(LHS, DAG),
18436 getHiHalf64(RHS, DAG), HiHalfCC);
18437 }
18438
18439 return SDValue();
18440}
18441
18442SDValue
18443SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
18444 DAGCombinerInfo &DCI) const {
18445 SelectionDAG &DAG = DCI.DAG;
18446 SDLoc SL(N);
18447 unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
18448
18449 SDValue Src = N->getOperand(0);
18450 SDValue Shift = N->getOperand(0);
18451
18452 // TODO: Extend type shouldn't matter (assuming legal types).
18453 if (Shift.getOpcode() == ISD::ZERO_EXTEND)
18454 Shift = Shift.getOperand(0);
18455
18456 if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
18457 // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
18458 // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
18459 // cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
18460 // cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
18461 // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
18462 if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
18463 SDValue Shifted = DAG.getZExtOrTrunc(
18464 Shift.getOperand(0), SDLoc(Shift.getOperand(0)), MVT::i32);
18465
18466 unsigned ShiftOffset = 8 * Offset;
18467 if (Shift.getOpcode() == ISD::SHL)
18468 ShiftOffset -= C->getZExtValue();
18469 else
18470 ShiftOffset += C->getZExtValue();
18471
18472 if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
18473 return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
18474 MVT::f32, Shifted);
18475 }
18476 }
18477 }
18478
18479 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18480 APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
18481 if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
18482 // We simplified Src. If this node is not dead, visit it again so it is
18483 // folded properly.
18484 if (N->getOpcode() != ISD::DELETED_NODE)
18485 DCI.AddToWorklist(N);
18486 return SDValue(N, 0);
18487 }
18488
18489 // Handle (or x, (srl y, 8)) pattern when known bits are zero.
18490 if (SDValue DemandedSrc =
18491 TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
18492 return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
18493
18494 return SDValue();
18495}
18496
18497SDValue SITargetLowering::performClampCombine(SDNode *N,
18498 DAGCombinerInfo &DCI) const {
18499 ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
18500 if (!CSrc)
18501 return SDValue();
18502
18503 const MachineFunction &MF = DCI.DAG.getMachineFunction();
18504 const APFloat &F = CSrc->getValueAPF();
18505 APFloat Zero = APFloat::getZero(F.getSemantics());
18506 if (F < Zero ||
18507 (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
18508 return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
18509 }
18510
18511 APFloat One(F.getSemantics(), "1.0");
18512 if (F > One)
18513 return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
18514
18515 return SDValue(CSrc, 0);
18516}
18517
18518SDValue SITargetLowering::performSelectCombine(SDNode *N,
18519 DAGCombinerInfo &DCI) const {
18520
18521 // Try to fold CMP + SELECT patterns with shared constants (both FP and
18522 // integer).
18523 // Detect when CMP and SELECT use the same constant and fold them to avoid
18524 // loading the constant twice. Specifically handles patterns like:
18525 // %cmp = icmp eq i32 %val, 4242
18526 // %sel = select i1 %cmp, i32 4242, i32 %other
18527 // It can be optimized to reuse %val instead of 4242 in select.
18528 SDValue Cond = N->getOperand(0);
18529 SDValue TrueVal = N->getOperand(1);
18530 SDValue FalseVal = N->getOperand(2);
18531
18532 // Check if condition is a comparison.
18533 if (Cond.getOpcode() != ISD::SETCC)
18534 return SDValue();
18535
18536 SDValue LHS = Cond.getOperand(0);
18537 SDValue RHS = Cond.getOperand(1);
18538 ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
18539
18540 bool isFloatingPoint = LHS.getValueType().isFloatingPoint();
18541 bool isInteger = LHS.getValueType().isInteger();
18542
18543 // Handle simple floating-point and integer types only.
18544 if (!isFloatingPoint && !isInteger)
18545 return SDValue();
18546
18547 bool isEquality = CC == (isFloatingPoint ? ISD::SETOEQ : ISD::SETEQ);
18548 bool isNonEquality = CC == (isFloatingPoint ? ISD::SETONE : ISD::SETNE);
18549 if (!isEquality && !isNonEquality)
18550 return SDValue();
18551
18552 SDValue ArgVal, ConstVal;
18553 if ((isFloatingPoint && isa<ConstantFPSDNode>(RHS)) ||
18554 (isInteger && isa<ConstantSDNode>(RHS))) {
18555 ConstVal = RHS;
18556 ArgVal = LHS;
18557 } else if ((isFloatingPoint && isa<ConstantFPSDNode>(LHS)) ||
18558 (isInteger && isa<ConstantSDNode>(LHS))) {
18559 ConstVal = LHS;
18560 ArgVal = RHS;
18561 } else {
18562 return SDValue();
18563 }
18564
18565 // Skip optimization for inlinable immediates.
18566 if (isFloatingPoint) {
18567 const APFloat &Val = cast<ConstantFPSDNode>(ConstVal)->getValueAPF();
18568 if (!Val.isNormal() || Subtarget->getInstrInfo()->isInlineConstant(Val))
18569 return SDValue();
18570 } else {
18571 const std::optional<int64_t> Val =
18572 cast<ConstantSDNode>(ConstVal)->getAPIntValue().trySExtValue();
18573 if (Val && AMDGPU::isInlinableIntLiteral(*Val))
18574 return SDValue();
18575 }
18576
18577 // For equality and non-equality comparisons, patterns:
18578 // select (setcc x, const), const, y -> select (setcc x, const), x, y
18579 // select (setccinv x, const), y, const -> select (setccinv x, const), y, x
18580 if (!(isEquality && TrueVal == ConstVal) &&
18581 !(isNonEquality && FalseVal == ConstVal))
18582 return SDValue();
18583
18584 SDValue SelectLHS = (isEquality && TrueVal == ConstVal) ? ArgVal : TrueVal;
18585 SDValue SelectRHS =
18586 (isNonEquality && FalseVal == ConstVal) ? ArgVal : FalseVal;
18587 return DCI.DAG.getNode(ISD::SELECT, SDLoc(N), N->getValueType(0), Cond,
18588 SelectLHS, SelectRHS);
18589}
18590
// Top-level target DAG combine dispatcher. A first switch promotes a fixed
// set of uniform integer ops to i32 (this runs at every optimization level);
// the second switch, skipped at CodeGenOptLevel::None, forwards each opcode
// to its dedicated perform*Combine helper.
//
// NOTE(review): listing line 18591 (the first line of the signature) is
// missing from this listing; this is presumably
// `SDValue SITargetLowering::PerformDAGCombine(SDNode *N,` - confirm against
// upstream.
18592 DAGCombinerInfo &DCI) const {
18593 switch (N->getOpcode()) {
18594 case ISD::ABS:
18595 if (SDValue Res = promoteUniformUnaryOpToI32(SDValue(N, 0), DCI))
18596 return Res;
18597 break;
18598 case ISD::ADD:
18599 case ISD::SUB:
18600 case ISD::SHL:
18601 case ISD::SRL:
18602 case ISD::SRA:
18603 case ISD::AND:
18604 case ISD::OR:
18605 case ISD::XOR:
18606 case ISD::MUL:
18607 case ISD::SETCC:
18608 case ISD::SELECT:
18609 case ISD::SMIN:
18610 case ISD::SMAX:
18611 case ISD::UMIN:
18612 case ISD::UMAX:
18613 case ISD::USUBSAT:
18614 if (auto Res = promoteUniformOpToI32(SDValue(N, 0), DCI))
18615 return Res;
18616 break;
18617 default:
18618 break;
18619 }
18620
// No further combines when optimization is disabled.
18621 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
18622 return SDValue();
18623
18624 switch (N->getOpcode()) {
18625 case ISD::ADD:
18626 return performAddCombine(N, DCI);
18627 case ISD::PTRADD:
18628 return performPtrAddCombine(N, DCI);
18629 case ISD::SUB:
18630 return performSubCombine(N, DCI);
18631 case ISD::FADD:
18632 return performFAddCombine(N, DCI);
18633 case ISD::FSUB:
18634 return performFSubCombine(N, DCI);
18635 case ISD::FDIV:
18636 return performFDivCombine(N, DCI);
18637 case ISD::FMUL:
18638 return performFMulCombine(N, DCI);
18639 case ISD::SETCC:
18640 return performSetCCCombine(N, DCI);
18641 case ISD::SELECT:
// Unlike most cases, a failed select combine breaks out of the switch instead
// of returning directly.
18642 if (auto Res = performSelectCombine(N, DCI))
18643 return Res;
18644 break;
18645 case ISD::FMAXNUM:
18646 case ISD::FMINNUM:
18647 case ISD::FMAXNUM_IEEE:
18648 case ISD::FMINNUM_IEEE:
18649 case ISD::FMAXIMUM:
18650 case ISD::FMINIMUM:
18651 case ISD::FMAXIMUMNUM:
18652 case ISD::FMINIMUMNUM:
18653 case ISD::SMAX:
18654 case ISD::SMIN:
18655 case ISD::UMAX:
18656 case ISD::UMIN:
18657 case AMDGPUISD::FMIN_LEGACY:
18658 case AMDGPUISD::FMAX_LEGACY:
18659 return performMinMaxCombine(N, DCI);
18660 case ISD::FMA:
18661 return performFMACombine(N, DCI);
18662 case ISD::AND:
18663 return performAndCombine(N, DCI);
18664 case ISD::OR:
18665 return performOrCombine(N, DCI);
18666 case ISD::FSHR: {
// NOTE(review): listing line 18667 is missing here; it presumably declares
// the `TII` instruction-info pointer used below - confirm against upstream.
// The V_PERM_B32 match is only attempted for divergent i32 funnel shifts when
// that pseudo has an MC opcode.
18668 if (N->getValueType(0) == MVT::i32 && N->isDivergent() &&
18669 TII->pseudoToMCOpcode(AMDGPU::V_PERM_B32_e64) != -1) {
18670 return matchPERM(N, DCI);
18671 }
18672 break;
18673 }
18674 case ISD::XOR:
18675 return performXorCombine(N, DCI);
18676 case ISD::ANY_EXTEND:
18677 case ISD::ZERO_EXTEND:
18678 return performZeroOrAnyExtendCombine(N, DCI);
// NOTE(review): listing line 18679 (a case label, presumably
// ISD::SIGN_EXTEND_INREG given the helper called) is missing here.
18680 return performSignExtendInRegCombine(N, DCI);
18681 case AMDGPUISD::FP_CLASS:
18682 return performClassCombine(N, DCI);
18683 case ISD::FCANONICALIZE:
18684 return performFCanonicalizeCombine(N, DCI);
18685 case AMDGPUISD::RCP:
18686 return performRcpCombine(N, DCI);
18687 case ISD::FLDEXP:
18688 case AMDGPUISD::FRACT:
18689 case AMDGPUISD::RSQ:
18690 case AMDGPUISD::RCP_LEGACY:
18691 case AMDGPUISD::RCP_IFLAG:
18692 case AMDGPUISD::RSQ_CLAMP: {
18693 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
// An undef source is propagated as the result of the operation.
18694 SDValue Src = N->getOperand(0);
18695 if (Src.isUndef())
18696 return Src;
18697 break;
18698 }
18699 case ISD::SINT_TO_FP:
18700 case ISD::UINT_TO_FP:
18701 return performUCharToFloatCombine(N, DCI);
18702 case ISD::FCOPYSIGN:
18703 return performFCopySignCombine(N, DCI);
18704 case AMDGPUISD::CVT_F32_UBYTE0:
18705 case AMDGPUISD::CVT_F32_UBYTE1:
18706 case AMDGPUISD::CVT_F32_UBYTE2:
18707 case AMDGPUISD::CVT_F32_UBYTE3:
18708 return performCvtF32UByteNCombine(N, DCI);
18709 case AMDGPUISD::FMED3:
18710 return performFMed3Combine(N, DCI);
18711 case AMDGPUISD::CVT_PKRTZ_F16_F32:
18712 return performCvtPkRTZCombine(N, DCI);
18713 case AMDGPUISD::CLAMP:
18714 return performClampCombine(N, DCI);
18715 case ISD::SCALAR_TO_VECTOR: {
18716 SelectionDAG &DAG = DCI.DAG;
18717 EVT VT = N->getValueType(0);
18718
18719 // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
18720 if (VT == MVT::v2i16 || VT == MVT::v2f16 || VT == MVT::v2bf16) {
18721 SDLoc SL(N);
18722 SDValue Src = N->getOperand(0);
18723 EVT EltVT = Src.getValueType();
// Non-i16 scalars are first bitcast to i16 so they can be any-extended.
18724 if (EltVT != MVT::i16)
18725 Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
18726
18727 SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
18728 return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
18729 }
18730
18731 break;
18732 }
// NOTE(review): listing lines 18733 and 18735 (case labels, presumably
// ISD::EXTRACT_VECTOR_ELT and ISD::INSERT_VECTOR_ELT given the helpers
// called) are missing around the next two returns.
18734 return performExtractVectorEltCombine(N, DCI);
18736 return performInsertVectorEltCombine(N, DCI);
18737 case ISD::FP_ROUND:
18738 return performFPRoundCombine(N, DCI);
18739 case ISD::LOAD: {
// Loads that are not widened fall through to the generic memory-node
// handling in the default case.
18740 if (SDValue Widened = widenLoad(cast<LoadSDNode>(N), DCI))
18741 return Widened;
18742 [[fallthrough]];
18743 }
18744 default: {
18745 if (!DCI.isBeforeLegalize()) {
18746 if (MemSDNode *MemNode = dyn_cast<MemSDNode>(N))
18747 return performMemSDNodeCombine(MemNode, DCI);
18748 }
18749
18750 break;
18751 }
18752 }
18753
// NOTE(review): listing line 18754 (the final return statement, reached by
// every `break` above) is missing from this listing; presumably it defers to
// the base-class combine - confirm against upstream.
18755}
18756
18757/// Helper function for adjustWritemask
18758static unsigned SubIdx2Lane(unsigned Idx) {
18759 switch (Idx) {
18760 default:
18761 return ~0u;
18762 case AMDGPU::sub0:
18763 return 0;
18764 case AMDGPU::sub1:
18765 return 1;
18766 case AMDGPU::sub2:
18767 return 2;
18768 case AMDGPU::sub3:
18769 return 3;
18770 case AMDGPU::sub4:
18771 return 4; // Possible with TFE/LWE
18772 }
18773}
18774
18775/// Adjust the writemask of MIMG, VIMAGE or VSAMPLE instructions
///
/// Looks at the EXTRACT_SUBREG users of the node's first result to find which
/// components are actually read, and if that is fewer than the current dmask
/// enables, rebuilds the node with a narrower dmask and result type and
/// renumbers the users' sub-register indices.
///
/// Returns \p Node unchanged when nothing can or needs to be done, and
/// nullptr after the node has been replaced.
18776SDNode *SITargetLowering::adjustWritemask(MachineSDNode *&Node,
18777 SelectionDAG &DAG) const {
18778 unsigned Opcode = Node->getMachineOpcode();
18779
18780 // Subtract 1 because the vdata output is not a MachineSDNode operand.
18781 int D16Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::d16) - 1;
18782 if (D16Idx >= 0 && Node->getConstantOperandVal(D16Idx))
18783 return Node; // not implemented for D16
18784
// Users[i] is the EXTRACT_SUBREG reading lane i of the result (at most five
// lanes: four components plus the optional TFE/LWE lane).
18785 SDNode *Users[5] = {nullptr};
18786 unsigned Lane = 0;
18787 unsigned DmaskIdx =
18788 AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::dmask) - 1;
18789 unsigned OldDmask = Node->getConstantOperandVal(DmaskIdx);
18790 unsigned NewDmask = 0;
// The tfe/lwe indices are stored as unsigned; the int casts below test
// whether the operand exists (index >= 0) before reading it.
18791 unsigned TFEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::tfe) - 1;
18792 unsigned LWEIdx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::lwe) - 1;
18793 bool UsesTFC = (int(TFEIdx) >= 0 && Node->getConstantOperandVal(TFEIdx)) ||
18794 (int(LWEIdx) >= 0 && Node->getConstantOperandVal(LWEIdx));
18795 unsigned TFCLane = 0;
18796 bool HasChain = Node->getNumValues() > 1;
18797
18798 if (OldDmask == 0) {
18799 // These are folded out, but on the chance it happens don't assert.
18800 return Node;
18801 }
18802
18803 unsigned OldBitsSet = llvm::popcount(OldDmask);
18804 // Work out which is the TFE/LWE lane if that is enabled.
18805 if (UsesTFC) {
18806 TFCLane = OldBitsSet;
18807 }
18808
18809 // Try to figure out the used register components
18810 for (SDUse &Use : Node->uses()) {
18811
18812 // Don't look at users of the chain.
18813 if (Use.getResNo() != 0)
18814 continue;
18815
18816 SDNode *User = Use.getUser();
18817
18818 // Abort if we can't understand the usage
18819 if (!User->isMachineOpcode() ||
18820 User->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
18821 return Node;
18822
18823 // Lane means which subreg of %vgpra_vgprb_vgprc_vgprd is used.
18824 // Note that subregs are packed, i.e. Lane==0 is the first bit set
18825 // in OldDmask, so it can be any of X,Y,Z,W; Lane==1 is the second bit
18826 // set, etc.
18827 Lane = SubIdx2Lane(User->getConstantOperandVal(1));
18828 if (Lane == ~0u)
18829 return Node;
18830
18831 // Check if the use is for the TFE/LWE generated result at VGPRn+1.
18832 if (UsesTFC && Lane == TFCLane) {
18833 Users[Lane] = User;
18834 } else {
18835 // Set which texture component corresponds to the lane.
// Walk the set bits of OldDmask from the lowest; after the loop Comp is the
// bit index of the (Lane+1)-th set bit, or of the last set bit if the mask
// runs out first. OldDmask != 0 here, so Comp is always assigned.
18836 unsigned Comp;
18837 for (unsigned i = 0, Dmask = OldDmask; (i <= Lane) && (Dmask != 0); i++) {
18838 Comp = llvm::countr_zero(Dmask);
18839 Dmask &= ~(1 << Comp);
18840 }
18841
18842 // Abort if we have more than one user per component.
18843 if (Users[Lane])
18844 return Node;
18845
18846 Users[Lane] = User;
18847 NewDmask |= 1 << Comp;
18848 }
18849 }
18850
18851 // Don't allow 0 dmask, as hardware assumes one channel enabled.
18852 bool NoChannels = !NewDmask;
18853 if (NoChannels) {
18854 if (!UsesTFC) {
18855 // No uses of the result and not using TFC. Then do nothing.
18856 return Node;
18857 }
18858 // If the original dmask has one channel - then nothing to do
18859 if (OldBitsSet == 1)
18860 return Node;
18861 // Use an arbitrary dmask - required for the instruction to work
18862 NewDmask = 1;
18863 }
18864 // Abort if there's no change
18865 if (NewDmask == OldDmask)
18866 return Node;
18867
18868 unsigned BitsSet = llvm::popcount(NewDmask);
18869
18870 // Check for TFE or LWE - increase the number of channels by one to account
18871 // for the extra return value
18872 // This will need adjustment for D16 if this is also included in
18873 // adjustWriteMask (this function) but at present D16 are excluded.
18874 unsigned NewChannels = BitsSet + UsesTFC;
18875
18876 int NewOpcode =
18877 AMDGPU::getMaskedMIMGOp(Node->getMachineOpcode(), NewChannels);
18878 assert(NewOpcode != -1 &&
18879 NewOpcode != static_cast<int>(Node->getMachineOpcode()) &&
18880 "failed to find equivalent MIMG op");
18881
18882 // Adjust the writemask in the node
// NOTE(review): listing line 18883 is missing here; it presumably declares
// the `Ops` operand vector filled below - confirm against upstream.
// The operand list is copied with only the dmask operand replaced.
18884 llvm::append_range(Ops, Node->ops().take_front(DmaskIdx));
18885 Ops.push_back(DAG.getTargetConstant(NewDmask, SDLoc(Node), MVT::i32));
18886 llvm::append_range(Ops, Node->ops().drop_front(DmaskIdx + 1));
18887
18888 MVT SVT = Node->getValueType(0).getVectorElementType().getSimpleVT();
18889
// Result type: a scalar for one channel; 3 and 5 channels are rounded up to
// 4- and 8-element vectors.
18890 MVT ResultVT = NewChannels == 1
18891 ? SVT
18892 : MVT::getVectorVT(SVT, NewChannels == 3 ? 4
18893 : NewChannels == 5 ? 8
18894 : NewChannels);
18895 SDVTList NewVTList =
18896 HasChain ? DAG.getVTList(ResultVT, MVT::Other) : DAG.getVTList(ResultVT);
18897
18898 MachineSDNode *NewNode =
18899 DAG.getMachineNode(NewOpcode, SDLoc(Node), NewVTList, Ops);
18900
18901 if (HasChain) {
18902 // Update chain.
18903 DAG.setNodeMemRefs(NewNode, Node->memoperands());
18904 DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), SDValue(NewNode, 1));
18905 }
18906
// Single channel: the result is a scalar, so the one EXTRACT_SUBREG user is
// replaced by a plain COPY. Lane still holds the lane of the last user
// visited in the loop above.
18907 if (NewChannels == 1) {
18908 assert(Node->hasNUsesOfValue(1, 0));
18909 SDNode *Copy =
18910 DAG.getMachineNode(TargetOpcode::COPY, SDLoc(Node),
18911 Users[Lane]->getValueType(0), SDValue(NewNode, 0));
18912 DAG.ReplaceAllUsesWith(Users[Lane], Copy);
18913 return nullptr;
18914 }
18915
18916 // Update the users of the node with the new indices
// Idx is the next sub-register index to hand out; it only advances (in the
// switch at the end of the body) for lanes that are not skipped by the
// `continue` below, which keeps the new indices packed.
18917 for (unsigned i = 0, Idx = AMDGPU::sub0; i < 5; ++i) {
18918 SDNode *User = Users[i];
18919 if (!User) {
18920 // Handle the special case of NoChannels. We set NewDmask to 1 above, but
18921 // Users[0] is still nullptr because channel 0 doesn't really have a use.
18922 if (i || !NoChannels)
18923 continue;
18924 } else {
18925 SDValue Op = DAG.getTargetConstant(Idx, SDLoc(User), MVT::i32);
18926 SDNode *NewUser = DAG.UpdateNodeOperands(User, SDValue(NewNode, 0), Op);
// If updating the operands yielded a different node, redirect the users of
// the old one and delete it.
18927 if (NewUser != User) {
18928 DAG.ReplaceAllUsesWith(SDValue(User, 0), SDValue(NewUser, 0));
18929 DAG.RemoveDeadNode(User);
18930 }
18931 }
18932
18933 switch (Idx) {
18934 default:
18935 break;
18936 case AMDGPU::sub0:
18937 Idx = AMDGPU::sub1;
18938 break;
18939 case AMDGPU::sub1:
18940 Idx = AMDGPU::sub2;
18941 break;
18942 case AMDGPU::sub2:
18943 Idx = AMDGPU::sub3;
18944 break;
18945 case AMDGPU::sub3:
18946 Idx = AMDGPU::sub4;
18947 break;
18948 }
18949 }
18950
18951 DAG.RemoveDeadNode(Node);
18952 return nullptr;
18953}
18954
18956 if (Op.getOpcode() == ISD::AssertZext)
18957 Op = Op.getOperand(0);
18958
18959 return isa<FrameIndexSDNode>(Op);
18960}
18961
// NOTE(review): embedded listing line 18966 (the declarator that names this
// member function and its `Node` parameter) is absent from this extract, as
// are listing lines 18976 and 18992; restore them from upstream.
18962 /// Legalize target independent instructions (e.g. INSERT_SUBREG)
18963 /// with frame index operands.
18964 /// LLVM assumes that inputs to these instructions are registers.
18965 SDNode *
18967 SelectionDAG &DAG) const {
18968 if (Node->getOpcode() == ISD::CopyToReg) {
18969 RegisterSDNode *DestReg = cast<RegisterSDNode>(Node->getOperand(1));
18970 SDValue SrcVal = Node->getOperand(2);
18971
18972 // Insert a copy to a VReg_1 virtual register so LowerI1Copies doesn't have
18973 // to try understanding copies to physical registers.
18974 if (SrcVal.getValueType() == MVT::i1 && DestReg->getReg().isPhysical()) {
18975 SDLoc SL(Node);
// NOTE(review): `MRI` is used below without a visible declaration (listing
// line 18976 is missing).
18977 SDValue VReg = DAG.getRegister(
18978 MRI.createVirtualRegister(&AMDGPU::VReg_1RegClass), MVT::i1);
18979
18980 SDNode *Glued = Node->getGluedNode();
// First copy: source value -> new VReg_1 register, on the original chain and
// forwarding the incoming glue (last value of the glued node) when present.
18981 SDValue ToVReg = DAG.getCopyToReg(
18982 Node->getOperand(0), SL, VReg, SrcVal,
18983 SDValue(Glued, Glued ? Glued->getNumValues() - 1 : 0));
// Second copy: VReg_1 register -> original physical destination, chained and
// glued to the first copy.
18984 SDValue ToResultReg = DAG.getCopyToReg(ToVReg, SL, SDValue(DestReg, 0),
18985 VReg, ToVReg.getValue(1));
18986 DAG.ReplaceAllUsesWith(Node, ToResultReg.getNode());
18987 DAG.RemoveDeadNode(Node);
18988 return ToResultReg.getNode();
18989 }
18990 }
18991
// NOTE(review): `Ops` is used below without a visible declaration (listing
// line 18992 is missing).
// Rebuild the operand list, materializing every frame-index operand through
// an S_MOV_B32 machine node and passing all other operands through unchanged.
18993 for (unsigned i = 0; i < Node->getNumOperands(); ++i) {
18994 if (!isFrameIndexOp(Node->getOperand(i))) {
18995 Ops.push_back(Node->getOperand(i));
18996 continue;
18997 }
18998
18999 SDLoc DL(Node);
19000 Ops.push_back(SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL,
19001 Node->getOperand(i).getValueType(),
19002 Node->getOperand(i)),
19003 0));
19004 }
19005
19006 return DAG.UpdateNodeOperands(Node, Ops);
19007 }
19008
// NOTE(review): listing lines 19011 (declarator), 19013, 19023, 19046 and
// 19070 are absent from this extract. `TII`, `MRI` and `Ops` are used below
// without visible declarations, and the INSERT_SUBREG / REG_SEQUENCE branch
// has lost its first statement. Restore from upstream.
19009 /// Fold the instructions after selecting them.
19010 /// Returns null if users were already updated.
19012 SelectionDAG &DAG) const {
19014 unsigned Opcode = Node->getMachineOpcode();
19015
// Non-storing, non-gather4 image instructions that carry a dmask operand are
// handed to adjustWritemask().
19016 if (TII->isImage(Opcode) && !TII->get(Opcode).mayStore() &&
19017 !TII->isGather4(Opcode) &&
19018 AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::dmask)) {
19019 return adjustWritemask(Node, DAG);
19020 }
19021
19022 if (Opcode == AMDGPU::INSERT_SUBREG || Opcode == AMDGPU::REG_SEQUENCE) {
19024 return Node;
19025 }
19026
19027 switch (Opcode) {
19028 case AMDGPU::V_DIV_SCALE_F32_e64:
19029 case AMDGPU::V_DIV_SCALE_F64_e64: {
19030 // Satisfy the operand register constraint when one of the inputs is
19031 // undefined. Ordinarily each undef value will have its own implicit_def of
19032 // a vreg, so force these to use a single register.
// The sources are read from operands 1, 3 and 5 of the node.
19033 SDValue Src0 = Node->getOperand(1);
19034 SDValue Src1 = Node->getOperand(3);
19035 SDValue Src2 = Node->getOperand(5);
19036
// Nothing to do when src0 is a defined machine node that already matches
// src1 or src2.
19037 if ((Src0.isMachineOpcode() &&
19038 Src0.getMachineOpcode() != AMDGPU::IMPLICIT_DEF) &&
19039 (Src0 == Src1 || Src0 == Src2))
19040 break;
19041
19042 MVT VT = Src0.getValueType().getSimpleVT();
19043 const TargetRegisterClass *RC =
19044 getRegClassFor(VT, Src0.getNode()->isDivergent());
19045
19047 SDValue UndefReg = DAG.getRegister(MRI.createVirtualRegister(RC), VT);
19048
// Copy src0 into the fresh register; the glue result of this copy is
// appended to the rebuilt node's operands below.
19049 SDValue ImpDef = DAG.getCopyToReg(DAG.getEntryNode(), SDLoc(Node), UndefReg,
19050 Src0, SDValue());
19051
19052 // src0 must be the same register as src1 or src2, even if the value is
19053 // undefined, so make sure we don't violate this constraint.
19054 if (Src0.isMachineOpcode() &&
19055 Src0.getMachineOpcode() == AMDGPU::IMPLICIT_DEF) {
19056 if (Src1.isMachineOpcode() &&
19057 Src1.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
19058 Src0 = Src1;
19059 else if (Src2.isMachineOpcode() &&
19060 Src2.getMachineOpcode() != AMDGPU::IMPLICIT_DEF)
19061 Src0 = Src2;
19062 else {
19063 assert(Src1.getMachineOpcode() == AMDGPU::IMPLICIT_DEF);
19064 Src0 = UndefReg;
19065 Src1 = UndefReg;
19066 }
19067 } else
19068 break;
19069
19071 Ops[1] = Src0;
19072 Ops[3] = Src1;
19073 Ops[5] = Src2;
19074 Ops.push_back(ImpDef.getValue(1));
19075 return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
19076 }
19077 default:
19078 break;
19079 }
19080
19081 return Node;
19082 }
19083
// NOTE(review): listing lines 19088-19089 (the signature, which must bind
// `MI` and `TII`) and 19172 (the final operand and terminator of the
// INSERT_SUBREG builder chain) are absent from this extract; restore them
// from upstream.
19084 // Any MIMG instructions that use tfe or lwe require an initialization of the
19085 // result register that will be written in the case of a memory access failure.
19086 // The required code is also added to tie this init code to the result of the
19087 // img instruction.
19090 const SIRegisterInfo &TRI = TII->getRegisterInfo();
19091 MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
19092 MachineBasicBlock &MBB = *MI.getParent();
19093
19094 int DstIdx =
19095 AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdata);
// Number of leading result dwords that may need initializing; computed
// separately for the image and MUBUF cases below.
19096 unsigned InitIdx = 0;
19097
19098 if (TII->isImage(MI)) {
19099 MachineOperand *TFE = TII->getNamedOperand(MI, AMDGPU::OpName::tfe);
19100 MachineOperand *LWE = TII->getNamedOperand(MI, AMDGPU::OpName::lwe);
19101 MachineOperand *D16 = TII->getNamedOperand(MI, AMDGPU::OpName::d16);
19102
19103 if (!TFE && !LWE) // intersect_ray
19104 return;
19105
19106 unsigned TFEVal = TFE ? TFE->getImm() : 0;
19107 unsigned LWEVal = LWE ? LWE->getImm() : 0;
19108 unsigned D16Val = D16 ? D16->getImm() : 0;
19109
19110 if (!TFEVal && !LWEVal)
19111 return;
19112
19113 // At least one of TFE or LWE is non-zero
19114 // We have to insert a suitable initialization of the result value and
19115 // tie this to the dest of the image instruction.
19116
19117 // Calculate which dword we have to initialize to 0.
19118 MachineOperand *MO_Dmask = TII->getNamedOperand(MI, AMDGPU::OpName::dmask);
19119
19120 // check that dmask operand is found.
19121 assert(MO_Dmask && "Expected dmask operand in instruction");
19122
19123 unsigned dmask = MO_Dmask->getImm();
19124 // Determine the number of active lanes taking into account the
19125 // Gather4 special case
19126 unsigned ActiveLanes = TII->isGather4(MI) ? 4 : llvm::popcount(dmask);
19127
19128 bool Packed = !Subtarget->hasUnpackedD16VMem();
19129
// Packed D16 holds two lanes per dword, so round the lane count up to dwords;
// in both forms one extra dword is added on top of the data dwords.
19130 InitIdx = D16Val && Packed ? ((ActiveLanes + 1) >> 1) + 1 : ActiveLanes + 1;
19131
19132 // Abandon attempt if the dst size isn't large enough
19133 // - this is in fact an error but this is picked up elsewhere and
19134 // reported correctly.
19135 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
19136
19137 uint32_t DstSize = TRI.getRegSizeInBits(*DstRC) / 32;
19138 if (DstSize < InitIdx)
19139 return;
19140 } else if (TII->isMUBUF(MI) && AMDGPU::getMUBUFTfe(MI.getOpcode())) {
// MUBUF with TFE: cover every dword of the destination register class.
19141 const TargetRegisterClass *DstRC = TII->getRegClass(MI.getDesc(), DstIdx);
19142 InitIdx = TRI.getRegSizeInBits(*DstRC) / 32;
19143 } else {
19144 return;
19145 }
19146
19147 const DebugLoc &DL = MI.getDebugLoc();
19148
19149 // Create a register for the initialization value.
19150 Register PrevDst = MRI.cloneVirtualRegister(MI.getOperand(DstIdx).getReg());
19151 unsigned NewDst = 0; // Final initialized value will be in here
19152
19153 // If PRTStrictNull feature is enabled (the default) then initialize
19154 // all the result registers to 0, otherwise just the error indication
19155 // register (VGPRn+1)
19156 unsigned SizeLeft = Subtarget->usePRTStrictNull() ? InitIdx : 1;
19157 unsigned CurrIdx = Subtarget->usePRTStrictNull() ? 0 : (InitIdx - 1);
19158
// Start from an IMPLICIT_DEF and insert one zeroed dword per iteration, each
// INSERT_SUBREG consuming the previous partial value.
19159 BuildMI(MBB, MI, DL, TII->get(AMDGPU::IMPLICIT_DEF), PrevDst);
19160 for (; SizeLeft; SizeLeft--, CurrIdx++) {
19161 NewDst = MRI.createVirtualRegister(TII->getOpRegClass(MI, DstIdx));
19162 // Initialize dword
19163 Register SubReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
19164 // clang-format off
19165 BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), SubReg)
19166 .addImm(0);
19167 // clang-format on
19168 // Insert into the super-reg
19169 BuildMI(MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
19170 .addReg(PrevDst)
19171 .addReg(SubReg)
// NOTE(review): the sub-register index operand for CurrIdx and the closing
// `;` (listing line 19172) are missing here.
19173
19174 PrevDst = NewDst;
19175 }
19176
19177 // Add as an implicit operand
19178 MI.addOperand(MachineOperand::CreateReg(NewDst, false, true));
19179
19180 // Tie the just added implicit operand to the dst
19181 MI.tieOperands(DstIdx, MI.getNumOperands() - 1);
19182 }
19183
// NOTE(review): listing lines 19186 (declarator binding `MI`) and 19188
// (presumably the `TII` declaration) are absent from this extract.
// NOTE(review): the header comment below talks about the writemask, but the
// visible body only legalizes VOP3 / MAI operands and enforces vaddr register
// class alignment on image instructions — confirm against upstream and update.
19184 /// Assign the register class depending on the number of
19185 /// bits set in the writemask
19187 SDNode *Node) const {
19189
19190 MachineFunction *MF = MI.getMF();
19191 MachineRegisterInfo &MRI = MF->getRegInfo();
19192
19193 if (TII->isVOP3(MI.getOpcode())) {
19194 // Make sure constant bus requirements are respected.
19195 TII->legalizeOperandsVOP3(MRI, MI);
19196
19197 if (TII->isMAI(MI)) {
19198 // The ordinary src0, src1, src2 were legalized above.
19199 //
19200 // We have to also legalize the appended v_mfma_ld_scale_b32 operands,
19201 // as a separate instruction.
19202 int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
19203 AMDGPU::OpName::scale_src0);
19204 if (Src0Idx != -1) {
19205 int Src1Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
19206 AMDGPU::OpName::scale_src1);
// If both scale operands use the constant bus, legalize scale_src1 with a
// move.
19207 if (TII->usesConstantBus(MRI, MI, Src0Idx) &&
19208 TII->usesConstantBus(MRI, MI, Src1Idx))
19209 TII->legalizeOpWithMove(MI, Src1Idx);
19210 }
19211 }
19212
19213 return;
19214 }
19215
19216 if (TII->isImage(MI))
19217 TII->enforceOperandRCAlignment(MI, AMDGPU::OpName::vaddr);
19218 }
19219
19221 uint64_t Val) {
19222 SDValue K = DAG.getTargetConstant(Val, DL, MVT::i32);
19223 return SDValue(DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, K), 0);
19224}
19225
// NOTE(review): listing lines 19226 (declarator, including the `DAG`
// parameter) and 19229 (presumably the `TII` declaration) are absent from this
// extract.
// Builds a 128-bit REG_SEQUENCE: `Ptr` goes in sub0_sub1, and sub2_sub3 holds
// the constant pair {0, high 32 bits of TII->getDefaultRsrcDataFormat()}.
19227 const SDLoc &DL,
19228 SDValue Ptr) const {
19230
19231 // Build the half of the subregister with the constants before building the
19232 // full 128-bit register. If we are building multiple resource descriptors,
19233 // this will allow CSEing of the 2-component register.
19234 const SDValue Ops0[] = {
19235 DAG.getTargetConstant(AMDGPU::SGPR_64RegClassID, DL, MVT::i32),
19236 buildSMovImm32(DAG, DL, 0),
19237 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
19238 buildSMovImm32(DAG, DL, TII->getDefaultRsrcDataFormat() >> 32),
19239 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32)};
19240
19241 SDValue SubRegHi = SDValue(
19242 DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v2i32, Ops0), 0);
19243
19244 // Combine the constants and the pointer.
19245 const SDValue Ops1[] = {
19246 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32), Ptr,
19247 DAG.getTargetConstant(AMDGPU::sub0_sub1, DL, MVT::i32), SubRegHi,
19248 DAG.getTargetConstant(AMDGPU::sub2_sub3, DL, MVT::i32)};
19249
19250 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops1);
19251 }
19252
// NOTE(review): listing line 19257 (the declarator, including the `DAG` and
// `DL` parameters) is absent from this extract.
19253 /// Return a resource descriptor with the 'Add TID' bit enabled
19254 /// The TID (Thread ID) is multiplied by the stride value (bits [61:48]
19255 /// of the resource descriptor) to create an offset, which is added to
19256 /// the resource pointer.
19258 SDValue Ptr, uint32_t RsrcDword1,
19259 uint64_t RsrcDword2And3) const {
// Split the 64-bit pointer into its low and high dwords.
19260 SDValue PtrLo = DAG.getTargetExtractSubreg(AMDGPU::sub0, DL, MVT::i32, Ptr);
19261 SDValue PtrHi = DAG.getTargetExtractSubreg(AMDGPU::sub1, DL, MVT::i32, Ptr);
// OR the caller-supplied dword-1 bits into the high pointer half; skipped
// entirely when RsrcDword1 is zero.
19262 if (RsrcDword1) {
19263 PtrHi =
19264 SDValue(DAG.getMachineNode(AMDGPU::S_OR_B32, DL, MVT::i32, PtrHi,
19265 DAG.getConstant(RsrcDword1, DL, MVT::i32)),
19266 0);
19267 }
19268
// Dwords 2 and 3 are the low and high halves of RsrcDword2And3.
19269 SDValue DataLo =
19270 buildSMovImm32(DAG, DL, RsrcDword2And3 & UINT64_C(0xFFFFFFFF));
19271 SDValue DataHi = buildSMovImm32(DAG, DL, RsrcDword2And3 >> 32);
19272
// Assemble the four dwords into one SGPR_128 REG_SEQUENCE (sub0..sub3).
19273 const SDValue Ops[] = {
19274 DAG.getTargetConstant(AMDGPU::SGPR_128RegClassID, DL, MVT::i32),
19275 PtrLo,
19276 DAG.getTargetConstant(AMDGPU::sub0, DL, MVT::i32),
19277 PtrHi,
19278 DAG.getTargetConstant(AMDGPU::sub1, DL, MVT::i32),
19279 DataLo,
19280 DAG.getTargetConstant(AMDGPU::sub2, DL, MVT::i32),
19281 DataHi,
19282 DAG.getTargetConstant(AMDGPU::sub3, DL, MVT::i32)};
19283
19284 return DAG.getMachineNode(AMDGPU::REG_SEQUENCE, DL, MVT::v4i32, Ops);
19285 }
19286
19287//===----------------------------------------------------------------------===//
19288// SI Inline Assembly Support
19289//===----------------------------------------------------------------------===//
19290
// Resolve an inline-asm register constraint to a (physical register, register
// class) pair. A first member of 0 with a non-null class means "any register
// of that class"; (0, nullptr) is returned on the rejection paths below.
// NOTE(review): listing lines 19292 (declarator, including the `TRI_`
// parameter), 19318 (the statement that assigns RC in the default scalar
// case) and 19400 (the leading `if` of the VGPR/SGPR/AGPR chain) are absent
// from this extract; restore them from upstream.
19291 std::pair<unsigned, const TargetRegisterClass *>
19293 StringRef Constraint,
19294 MVT VT) const {
19295 const SIRegisterInfo *TRI = static_cast<const SIRegisterInfo *>(TRI_);
19296
19297 const TargetRegisterClass *RC = nullptr;
19298 if (Constraint.size() == 1) {
19299 // Check if we cannot determine the bit size of the given value type. This
19300 // can happen, for example, in this situation where we have an empty struct
19301 // (size 0): `call void asm "", "v"({} poison)`.
19302 if (VT == MVT::Other)
19303 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19304 const unsigned BitWidth = VT.getSizeInBits();
19305 switch (Constraint[0]) {
19306 default:
19307 return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// 's' and 'r' share the same handling: scalar register classes.
19308 case 's':
19309 case 'r':
19310 switch (BitWidth) {
19311 case 16:
19312 RC = &AMDGPU::SReg_32RegClass;
19313 break;
19314 case 64:
19315 RC = &AMDGPU::SGPR_64RegClass;
19316 break;
19317 default:
19319 if (!RC)
19320 return std::pair(0U, nullptr);
19321 break;
19322 }
19323 break;
// 'v': vector registers. 1-bit values are rejected.
19324 case 'v':
19325 switch (BitWidth) {
19326 case 1:
19327 return std::pair(0U, nullptr);
19328 case 16:
19329 RC = Subtarget->useRealTrue16Insts() ? &AMDGPU::VGPR_16RegClass
19330 : &AMDGPU::VGPR_32_Lo256RegClass;
19331 break;
19332 default:
19333 RC = Subtarget->has1024AddressableVGPRs()
19334 ? TRI->getAlignedLo256VGPRClassForBitWidth(BitWidth)
19335 : TRI->getVGPRClassForBitWidth(BitWidth);
19336 if (!RC)
19337 return std::pair(0U, nullptr);
19338 break;
19339 }
19340 break;
// 'a': AGPR classes, only when the subtarget has MAI instructions; otherwise
// RC stays null and the code falls through to the handling after the switch.
19341 case 'a':
19342 if (!Subtarget->hasMAIInsts())
19343 break;
19344 switch (BitWidth) {
19345 case 1:
19346 return std::pair(0U, nullptr);
19347 case 16:
19348 RC = &AMDGPU::AGPR_32RegClass;
19349 break;
19350 default:
19351 RC = TRI->getAGPRClassForBitWidth(BitWidth);
19352 if (!RC)
19353 return std::pair(0U, nullptr);
19354 break;
19355 }
19356 break;
19357 }
19358 } else if (Constraint == "VA" && Subtarget->hasGFX90AInsts()) {
// "VA": the combined vector super class (AV_*), only with GFX90A instructions.
19359 const unsigned BitWidth = VT.getSizeInBits();
19360 switch (BitWidth) {
19361 case 16:
19362 RC = &AMDGPU::AV_32RegClass;
19363 break;
19364 default:
19365 RC = TRI->getVectorSuperClassForBitWidth(BitWidth);
19366 if (!RC)
19367 return std::pair(0U, nullptr);
19368 break;
19369 }
19370 }
19371
19372 // We actually support i128, i16 and f16 as inline parameters
19373 // even if they are not reported as legal
19374 if (RC && (isTypeLegal(VT) || VT.SimpleTy == MVT::i128 ||
19375 VT.SimpleTy == MVT::i16 || VT.SimpleTy == MVT::f16))
19376 return std::pair(0U, RC);
19377
// Otherwise try to parse an explicit physical register or register range;
// Kind is '\0' when the constraint is not of that form.
19378 auto [Kind, Idx, NumRegs] = AMDGPU::parseAsmConstraintPhysReg(Constraint);
19379 if (Kind != '\0') {
19380 if (Kind == 'v') {
19381 RC = &AMDGPU::VGPR_32_Lo256RegClass;
19382 } else if (Kind == 's') {
19383 RC = &AMDGPU::SGPR_32RegClass;
19384 } else if (Kind == 'a') {
19385 RC = &AMDGPU::AGPR_32RegClass;
19386 }
19387
19388 if (RC) {
19389 if (NumRegs > 1) {
// Reject ranges whose first or last register lies outside the 32-bit class.
19390 if (Idx >= RC->getNumRegs() || Idx + NumRegs - 1 >= RC->getNumRegs())
19391 return std::pair(0U, nullptr);
19392
19393 uint32_t Width = NumRegs * 32;
19394 // Prohibit constraints for register ranges with a width that does not
19395 // match the required type.
19396 if (VT.SimpleTy != MVT::Other && Width != VT.getSizeInBits())
19397 return std::pair(0U, nullptr);
19398
19399 MCRegister Reg = RC->getRegister(Idx);
19401 RC = TRI->getVGPRClassForBitWidth(Width);
19402 else if (SIRegisterInfo::isSGPRClass(RC))
19403 RC = TRI->getSGPRClassForBitWidth(Width);
19404 else if (SIRegisterInfo::isAGPRClass(RC))
19405 RC = TRI->getAGPRClassForBitWidth(Width);
19406 if (RC) {
// Find the wide register of the new class whose sub0 is the first register
// of the requested range.
19407 Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0, RC);
19408 if (!Reg) {
19409 // The register class does not contain the requested register,
19410 // e.g., because it is an SGPR pair that would violate alignment
19411 // requirements.
19412 return std::pair(0U, nullptr);
19413 }
19414 return std::pair(Reg, RC);
19415 }
19416 }
19417
19418 // Reject types that do not fit a single 32-bit register: any scalar wider
19419 // than 32 bits, or a vector that is not exactly 32 bits.
19420 if (VT.SimpleTy != MVT::Other &&
19421 (VT.getSizeInBits() > 32 ||
19422 (VT.isVector() && VT.getSizeInBits() != 32)))
19423 return std::pair(0U, nullptr);
19424 if (RC && Idx < RC->getNumRegs())
19425 return std::pair(RC->getRegister(Idx), RC);
19426 return std::pair(0U, nullptr);
19427 }
19428 }
19429
// Fall back to the generic lookup; when it names a register, report that
// register's base class from TRI instead.
19430 auto Ret = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
19431 if (Ret.first)
19432 Ret.second = TRI->getPhysRegBaseClass(Ret.first);
19433
19434 return Ret;
19435 }
19436
19437static bool isImmConstraint(StringRef Constraint) {
19438 if (Constraint.size() == 1) {
19439 switch (Constraint[0]) {
19440 default:
19441 break;
19442 case 'I':
19443 case 'J':
19444 case 'A':
19445 case 'B':
19446 case 'C':
19447 return true;
19448 }
19449 } else if (Constraint == "DA" || Constraint == "DB") {
19450 return true;
19451 }
19452 return false;
19453}
19454
// NOTE(review): listing lines 19455-19456 (the signature binding
// `Constraint`) are absent from this extract.
// Classifies a constraint string: 's', 'v', 'a' and "VA" are register-class
// constraints, the immediate constraints accepted by isImmConstraint() are
// C_Other, and everything else is left to the generic implementation.
19457 if (Constraint.size() == 1) {
19458 switch (Constraint[0]) {
19459 default:
19460 break;
19461 case 's':
19462 case 'v':
19463 case 'a':
19464 return C_RegisterClass;
19465 }
19466 } else if (Constraint.size() == 2) {
19467 if (Constraint == "VA")
19468 return C_RegisterClass;
19469 }
19470 if (isImmConstraint(Constraint)) {
19471 return C_Other;
19472 }
19473 return TargetLowering::getConstraintType(Constraint);
19474 }
19475
19476static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
19478 Val = Val & maskTrailingOnes<uint64_t>(Size);
19479 }
19480 return Val;
19481}
19482
// NOTE(review): listing lines 19483 (declarator, including the `Op`
// parameter) and 19495 (the body of the else branch) are absent from this
// extract.
19484 StringRef Constraint,
19485 std::vector<SDValue> &Ops,
19486 SelectionDAG &DAG) const {
19487 if (isImmConstraint(Constraint)) {
19488 uint64_t Val;
// Emit the operand only if it is a constant that satisfies the constraint;
// otherwise nothing is appended to Ops.
19489 if (getAsmOperandConstVal(Op, Val) &&
19490 checkAsmConstraintVal(Op, Constraint, Val)) {
// Drop the bits above the operand's scalar width before emitting the i64
// target constant.
19491 Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
19492 Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
19493 }
19494 } else {
19496 }
19497 }
19498
// NOTE(review): listing lines 19499 (signature binding `Op` and `Val`),
// 19507, 19511 and 19515 (the three `if (... = dyn_cast<...>(Op)) {` headers
// that introduce `C` and `V`) are absent from this extract.
// Returns true and stores the sign-extended constant bits in Val on success.
19500 unsigned Size = Op.getScalarValueSizeInBits();
// Scalars wider than 64 bits cannot be returned through Val.
19501 if (Size > 64)
19502 return false;
19503
19504 if (Size == 16 && !Subtarget->has16BitInsts())
19505 return false;
19506
19508 Val = C->getSExtValue();
19509 return true;
19510 }
19512 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19513 return true;
19514 }
// Remaining case: only two-operand nodes with 16-bit elements, no undef
// operands, and a constant splat value are accepted.
19516 if (Size != 16 || Op.getNumOperands() != 2)
19517 return false;
19518 if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
19519 return false;
19520 if (ConstantSDNode *C = V->getConstantSplatNode()) {
19521 Val = C->getSExtValue();
19522 return true;
19523 }
19524 if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
19525 Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
19526 return true;
19527 }
19528 }
19529
19530 return false;
19531 }
19532
// NOTE(review): listing lines 19533 (declarator, including `Op` and
// `Constraint`), 19538 (the return for 'I') and 19547 (the second operand of
// the `||` for 'C') are absent from this extract.
// Any constraint not handled below reaches llvm_unreachable, so callers are
// expected to pass only constraints accepted by isImmConstraint().
19534 uint64_t Val) const {
19535 if (Constraint.size() == 1) {
19536 switch (Constraint[0]) {
19537 case 'I':
19539 case 'J':
19540 return isInt<16>(Val);
19541 case 'A':
19542 return checkAsmConstraintValA(Op, Val);
19543 case 'B':
19544 return isInt<32>(Val);
19545 case 'C':
19546 return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
19548 default:
19549 break;
19550 }
19551 } else if (Constraint.size() == 2) {
19552 if (Constraint == "DA") {
// "DA": each 32-bit half must independently pass the 'A' check.
19553 int64_t HiBits = static_cast<int32_t>(Val >> 32);
19554 int64_t LoBits = static_cast<int32_t>(Val);
19555 return checkAsmConstraintValA(Op, HiBits, 32) &&
19556 checkAsmConstraintValA(Op, LoBits, 32);
19557 }
19558 if (Constraint == "DB") {
19559 return true;
19560 }
19561 }
19562 llvm_unreachable("Invalid asm constraint");
19563 }
19564
// NOTE(review): listing line 19565 (declarator, including the `Op` and `Val`
// parameters) is absent from this extract.
// Checks whether Val is an inline literal for the operand's scalar size,
// capped at MaxSize bits.
19566 unsigned MaxSize) const {
19567 unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
19568 bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
19569 if (Size == 16) {
// 16-bit case: dispatch on the exact value type; types not listed are rejected.
19570 MVT VT = Op.getSimpleValueType();
19571 switch (VT.SimpleTy) {
19572 default:
19573 return false;
19574 case MVT::i16:
19575 return AMDGPU::isInlinableLiteralI16(Val, HasInv2Pi);
19576 case MVT::f16:
19577 return AMDGPU::isInlinableLiteralFP16(Val, HasInv2Pi);
19578 case MVT::bf16:
19579 return AMDGPU::isInlinableLiteralBF16(Val, HasInv2Pi);
19580 case MVT::v2i16:
19581 return AMDGPU::getInlineEncodingV2I16(Val).has_value();
19582 case MVT::v2f16:
19583 return AMDGPU::getInlineEncodingV2F16(Val).has_value();
19584 case MVT::v2bf16:
19585 return AMDGPU::getInlineEncodingV2BF16(Val).has_value();
19586 }
19587 }
// Sizes other than 16, 32 and 64 fall through to `return false`.
19588 if ((Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
19589 (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi)))
19590 return true;
19591 return false;
19592 }
19593
19594static int getAlignedAGPRClassID(unsigned UnalignedClassID) {
19595 switch (UnalignedClassID) {
19596 case AMDGPU::VReg_64RegClassID:
19597 return AMDGPU::VReg_64_Align2RegClassID;
19598 case AMDGPU::VReg_96RegClassID:
19599 return AMDGPU::VReg_96_Align2RegClassID;
19600 case AMDGPU::VReg_128RegClassID:
19601 return AMDGPU::VReg_128_Align2RegClassID;
19602 case AMDGPU::VReg_160RegClassID:
19603 return AMDGPU::VReg_160_Align2RegClassID;
19604 case AMDGPU::VReg_192RegClassID:
19605 return AMDGPU::VReg_192_Align2RegClassID;
19606 case AMDGPU::VReg_224RegClassID:
19607 return AMDGPU::VReg_224_Align2RegClassID;
19608 case AMDGPU::VReg_256RegClassID:
19609 return AMDGPU::VReg_256_Align2RegClassID;
19610 case AMDGPU::VReg_288RegClassID:
19611 return AMDGPU::VReg_288_Align2RegClassID;
19612 case AMDGPU::VReg_320RegClassID:
19613 return AMDGPU::VReg_320_Align2RegClassID;
19614 case AMDGPU::VReg_352RegClassID:
19615 return AMDGPU::VReg_352_Align2RegClassID;
19616 case AMDGPU::VReg_384RegClassID:
19617 return AMDGPU::VReg_384_Align2RegClassID;
19618 case AMDGPU::VReg_512RegClassID:
19619 return AMDGPU::VReg_512_Align2RegClassID;
19620 case AMDGPU::VReg_1024RegClassID:
19621 return AMDGPU::VReg_1024_Align2RegClassID;
19622 case AMDGPU::AReg_64RegClassID:
19623 return AMDGPU::AReg_64_Align2RegClassID;
19624 case AMDGPU::AReg_96RegClassID:
19625 return AMDGPU::AReg_96_Align2RegClassID;
19626 case AMDGPU::AReg_128RegClassID:
19627 return AMDGPU::AReg_128_Align2RegClassID;
19628 case AMDGPU::AReg_160RegClassID:
19629 return AMDGPU::AReg_160_Align2RegClassID;
19630 case AMDGPU::AReg_192RegClassID:
19631 return AMDGPU::AReg_192_Align2RegClassID;
19632 case AMDGPU::AReg_256RegClassID:
19633 return AMDGPU::AReg_256_Align2RegClassID;
19634 case AMDGPU::AReg_512RegClassID:
19635 return AMDGPU::AReg_512_Align2RegClassID;
19636 case AMDGPU::AReg_1024RegClassID:
19637 return AMDGPU::AReg_1024_Align2RegClassID;
19638 default:
19639 return -1;
19640 }
19641}
19642
// NOTE(review): listing lines 19646 (signature binding `MF`), 19648
// (presumably the `Info` declaration), 19655 (the body of the entry-function
// branch) and 19707 (the last statement of the function) are absent from this
// extract; restore them from upstream.
19643 // Figure out which registers should be reserved for stack access. Only after
19644 // the function is legalized do we know all of the non-spill stack objects or if
19645 // calls are present.
19647 MachineRegisterInfo &MRI = MF.getRegInfo();
19649 const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
19650 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
19651 const SIInstrInfo *TII = ST.getInstrInfo();
19652
19653 if (Info->isEntryFunction()) {
19654 // Callable functions have fixed registers used for stack access.
19656 }
19657
19658 // TODO: Move this logic to getReservedRegs()
19659 // Reserve the SGPR(s) to save/restore EXEC for WWM spill/copy handling.
// Wave32 uses the last SGPR within the function's SGPR budget; otherwise an
// aligned high 64-bit SGPR pair is used.
19660 unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
19661 Register SReg = ST.isWave32()
19662 ? AMDGPU::SGPR_32RegClass.getRegister(MaxNumSGPRs - 1)
19663 : TRI->getAlignedHighSGPRForRC(MF, /*Align=*/2,
19664 &AMDGPU::SGPR_64RegClass);
19665 Info->setSGPRForEXECCopy(SReg);
19666
19667 assert(!TRI->isSubRegister(Info->getScratchRSrcReg(),
19668 Info->getStackPtrOffsetReg()));
// Rewrite the placeholder SP / scratch-resource / FP registers to the ones
// recorded in the function info, skipping no-op self-replacements.
19669 if (Info->getStackPtrOffsetReg() != AMDGPU::SP_REG)
19670 MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg());
19671
19672 // We need to worry about replacing the default register with itself in case
19673 // of MIR testcases missing the MFI.
19674 if (Info->getScratchRSrcReg() != AMDGPU::PRIVATE_RSRC_REG)
19675 MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg());
19676
19677 if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
19678 MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
19679
19680 Info->limitOccupancy(MF);
19681
// On wave32, run fixImplicitOperands over every instruction in the function.
19682 if (ST.isWave32() && !MF.empty()) {
19683 for (auto &MBB : MF) {
19684 for (auto &MI : MBB) {
19685 TII->fixImplicitOperands(MI);
19686 }
19687 }
19688 }
19689
19690 // FIXME: This is a hack to fixup AGPR classes to use the properly aligned
19691 // classes if required. Ideally the register class constraints would differ
19692 // per-subtarget, but there's no easy way to achieve that right now. This is
19693 // not a problem for VGPRs because the correctly aligned VGPR class is implied
19694 // from using them as the register class for legal types.
19695 if (ST.needsAlignedVGPRs()) {
19696 for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
19697 const Register Reg = Register::index2VirtReg(I);
19698 const TargetRegisterClass *RC = MRI.getRegClassOrNull(Reg);
19699 if (!RC)
19700 continue;
// -1 means the class has no aligned counterpart in the mapping table.
19701 int NewClassID = getAlignedAGPRClassID(RC->getID());
19702 if (NewClassID != -1)
19703 MRI.setRegClass(Reg, TRI->getRegClass(NewClassID));
19704 }
19705 }
19706
19708 }
19709
// NOTE(review): listing lines 19710 (declarator, including `Op`), 19718 (the
// `case` label that opens the intrinsic block), 19724 (the initializer of
// `ST`) and 19737 (the head of the trailing call that takes the arguments on
// 19738) are absent from this extract.
19711 KnownBits &Known,
19712 const APInt &DemandedElts,
19713 const SelectionDAG &DAG,
19714 unsigned Depth) const {
19715 Known.resetAll();
19716 unsigned Opc = Op.getOpcode();
19717 switch (Opc) {
19719 unsigned IID = Op.getConstantOperandVal(0);
19720 switch (IID) {
19721 case Intrinsic::amdgcn_mbcnt_lo:
19722 case Intrinsic::amdgcn_mbcnt_hi: {
19723 const GCNSubtarget &ST =
19725 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19726 // most 31 + src1.
// Bound the count part first, then add the known bits of operand 2.
19727 Known.Zero.setBitsFrom(
19728 IID == Intrinsic::amdgcn_mbcnt_lo ? ST.getWavefrontSizeLog2() : 5);
19729 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
19730 Known = KnownBits::add(Known, Known2);
19731 return;
19732 }
19733 }
19734 break;
19735 }
19736 }
19738 Op, Known, DemandedElts, DAG, Depth);
19739 }
19740
// NOTE(review): listing lines 19741 (declarator) and 19743 (the first
// statement of the body) are absent from this extract.
19742 const int FI, KnownBits &Known, const MachineFunction &MF) const {
19744
19745 // Set the high bits to zero based on the maximum allowed scratch size per
19746 // wave. We can't use vaddr in MUBUF instructions if we don't know the address
19747 // calculation won't overflow, so assume the sign bit is never set.
19748 Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
19749 }
19750
19752 GISelValueTracking &VT, KnownBits &Known,
19753 unsigned Dim) {
19754 unsigned MaxValue =
19755 ST.getMaxWorkitemID(VT.getMachineFunction().getFunction(), Dim);
19756 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
19757}
19758
// NOTE(review): listing lines 19759 (declarator, including `MI` and `VT`) and
// 19762 (presumably the `MRI` declaration used below) are absent from this
// extract.
19760 KnownBits &Known, const APInt &DemandedElts,
19761 unsigned BFEWidth, bool SExt, unsigned Depth) {
19763 const MachineOperand &Src1 = MI.getOperand(2);
19764
// The packed offset/width operand must be a compile-time constant: either an
// immediate or a register that resolves to an integer constant.
19765 unsigned Src1Cst = 0;
19766 if (Src1.isImm()) {
19767 Src1Cst = Src1.getImm();
19768 } else if (Src1.isReg()) {
19769 auto Cst = getIConstantVRegValWithLookThrough(Src1.getReg(), MRI);
19770 if (!Cst)
19771 return;
19772 Src1Cst = Cst->Value.getZExtValue();
19773 } else {
19774 return;
19775 }
19776
19777 // Offset is at bits [4:0] for 32 bit, [5:0] for 64 bit.
19778 // Width is always [22:16].
// NOTE(review): the comment above describes a 7-bit width field, but the mask
// below keeps only 6 bits — confirm which is intended.
19779 const unsigned Offset =
19780 Src1Cst & maskTrailingOnes<unsigned>((BFEWidth == 32) ? 5 : 6);
19781 const unsigned Width = (Src1Cst >> 16) & maskTrailingOnes<unsigned>(6);
19782
19783 if (Width >= BFEWidth) // Ill-formed.
19784 return;
19785
19786 VT.computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
19787 Depth + 1);
19788
// Narrow to the extracted field, then extend back to the full result width.
19789 Known = Known.extractBits(Width, Offset);
19790
19791 if (SExt)
19792 Known = Known.sext(BFEWidth);
19793 else
19794 Known = Known.zext(BFEWidth);
19795 }
19796
// NOTE(review): listing line 19797 (the declarator) is absent from this
// extract.
// NOTE(review): the result of MRI.getVRegDef(R) is dereferenced without a
// null check — presumably callers guarantee R has a definition; confirm.
19798 GISelValueTracking &VT, Register R, KnownBits &Known,
19799 const APInt &DemandedElts, const MachineRegisterInfo &MRI,
19800 unsigned Depth) const {
19801 Known.resetAll();
19802 const MachineInstr *MI = MRI.getVRegDef(R);
19803 switch (MI->getOpcode()) {
// The four S_BFE variants differ only in result width and signedness.
19804 case AMDGPU::S_BFE_I32:
19805 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
19806 /*SExt=*/true, Depth);
19807 case AMDGPU::S_BFE_U32:
19808 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/32,
19809 /*SExt=*/false, Depth);
19810 case AMDGPU::S_BFE_I64:
19811 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
19812 /*SExt=*/true, Depth);
19813 case AMDGPU::S_BFE_U64:
19814 return knownBitsForSBFE(*MI, VT, Known, DemandedElts, /*Width=*/64,
19815 /*SExt=*/false, Depth);
19816 case AMDGPU::G_INTRINSIC:
19817 case AMDGPU::G_INTRINSIC_CONVERGENT: {
19818 Intrinsic::ID IID = cast<GIntrinsic>(MI)->getIntrinsicID();
19819 switch (IID) {
19820 case Intrinsic::amdgcn_workitem_id_x:
19821 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 0);
19822 break;
19823 case Intrinsic::amdgcn_workitem_id_y:
19824 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 1);
19825 break;
19826 case Intrinsic::amdgcn_workitem_id_z:
19827 knownBitsForWorkitemID(*getSubtarget(), VT, Known, 2);
19828 break;
19829 case Intrinsic::amdgcn_mbcnt_lo:
19830 case Intrinsic::amdgcn_mbcnt_hi: {
19831 // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
19832 // most 31 + src1.
19833 Known.Zero.setBitsFrom(IID == Intrinsic::amdgcn_mbcnt_lo
19834 ? getSubtarget()->getWavefrontSizeLog2()
19835 : 5);
// The addend is read from operand 3 of the intrinsic instruction.
19836 KnownBits Known2;
19837 VT.computeKnownBitsImpl(MI->getOperand(3).getReg(), Known2, DemandedElts,
19838 Depth + 1);
19839 Known = KnownBits::add(Known, Known2);
19840 break;
19841 }
19842 case Intrinsic::amdgcn_groupstaticsize: {
19843 // We can report everything over the maximum size as 0. We can't report
19844 // based on the actual size because we don't know if it's accurate or not
19845 // at any given point.
19846 Known.Zero.setHighBits(
19847 llvm::countl_zero(getSubtarget()->getAddressableLocalMemorySize()));
19848 break;
19849 }
19850 }
19851 break;
19852 }
// Unsigned byte / short buffer loads: the top 24 / 16 bits are marked zero.
19853 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
19854 Known.Zero.setHighBits(24);
19855 break;
19856 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
19857 Known.Zero.setHighBits(16);
19858 break;
19859 case AMDGPU::G_AMDGPU_COPY_SCC_VCC:
19860 // G_AMDGPU_COPY_SCC_VCC converts a uniform boolean in VCC to SGPR s32,
19861 // producing exactly 0 or 1.
19862 Known.Zero.setHighBits(Known.getBitWidth() - 1);
19863 break;
19864 case AMDGPU::G_AMDGPU_SMED3:
19865 case AMDGPU::G_AMDGPU_UMED3: {
19866 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
19867
// Give up as soon as any input is completely unknown; Known then stays in
// the reset state set at the top of the function.
19868 KnownBits Known2;
19869 VT.computeKnownBitsImpl(Src2, Known2, DemandedElts, Depth + 1);
19870 if (Known2.isUnknown())
19871 break;
19872
19873 KnownBits Known1;
19874 VT.computeKnownBitsImpl(Src1, Known1, DemandedElts, Depth + 1);
19875 if (Known1.isUnknown())
19876 break;
19877
19878 KnownBits Known0;
19879 VT.computeKnownBitsImpl(Src0, Known0, DemandedElts, Depth + 1);
19880 if (Known0.isUnknown())
19881 break;
19882
19883 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
// A bit is known in the result only if all three inputs agree on it.
19884 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
19885 Known.One = Known0.One & Known1.One & Known2.One;
19886 break;
19887 }
19888 }
19889 }
19890
19893 unsigned Depth) const {
19894 const MachineInstr *MI = MRI.getVRegDef(R);
19895 if (auto *GI = dyn_cast<GIntrinsic>(MI)) {
19896 // FIXME: Can this move to generic code? What about the case where the call
19897 // site specifies a lower alignment?
19898 Intrinsic::ID IID = GI->getIntrinsicID();
19900 AttributeList Attrs =
19901 Intrinsic::getAttributes(Ctx, IID, Intrinsic::getType(Ctx, IID));
19902 if (MaybeAlign RetAlign = Attrs.getRetAlignment())
19903 return *RetAlign;
19904 }
19905 return Align(1);
19906}
19907
19910 const Align CacheLineAlign = Align(64);
19911
19912 // GFX950: Prevent an 8-byte instruction at loop header from being split by
19913 // the 32-byte instruction fetch window boundary. This avoids a significant
19914 // fetch delay after backward branch. We use 32-byte alignment with max
19915 // padding of 4 bytes (one s_nop), see getMaxPermittedBytesForAlignment().
19916 if (ML && !DisableLoopAlignment &&
19917 getSubtarget()->hasLoopHeadInstSplitSensitivity()) {
19918 const MachineBasicBlock *Header = ML->getHeader();
19919 // Respect user-specified or previously set alignment.
19920 if (Header->getAlignment() != PrefAlign)
19921 return Header->getAlignment();
19922 if (needsFetchWindowAlignment(*Header))
19923 return Align(32);
19924 }
19925
19926 // Pre-GFX10 target did not benefit from loop alignment
19927 if (!ML || DisableLoopAlignment || !getSubtarget()->hasInstPrefetch() ||
19928 getSubtarget()->hasInstFwdPrefetchBug())
19929 return PrefAlign;
19930
19931 // On GFX10 I$ is 4 x 64 bytes cache lines.
19932 // By default prefetcher keeps one cache line behind and reads two ahead.
19933 // We can modify it with S_INST_PREFETCH for larger loops to have two lines
19934 // behind and one ahead.
19935 // Therefore we can benefit from aligning loop headers if loop fits 192 bytes.
19936 // If loop fits 64 bytes it always spans no more than two cache lines and
19937 // does not need an alignment.
19938 // Else if loop is less or equal 128 bytes we do not need to modify prefetch,
19939 // Else if loop is less or equal 192 bytes we need two lines behind.
19940
19942 const MachineBasicBlock *Header = ML->getHeader();
19943 if (Header->getAlignment() != PrefAlign)
19944 return Header->getAlignment(); // Already processed.
19945
19946 unsigned LoopSize = 0;
19947 for (const MachineBasicBlock *MBB : ML->blocks()) {
19948 // If inner loop block is aligned assume on average half of the alignment
19949 // size to be added as nops.
19950 if (MBB != Header)
19951 LoopSize += MBB->getAlignment().value() / 2;
19952
19953 for (const MachineInstr &MI : *MBB) {
19954 LoopSize += TII->getInstSizeInBytes(MI);
19955 if (LoopSize > 192)
19956 return PrefAlign;
19957 }
19958 }
19959
19960 if (LoopSize <= 64)
19961 return PrefAlign;
19962
19963 if (LoopSize <= 128)
19964 return CacheLineAlign;
19965
19966 // If any of parent loops is surrounded by prefetch instructions do not
19967 // insert new for inner loop, which would reset parent's settings.
19968 for (MachineLoop *P = ML->getParentLoop(); P; P = P->getParentLoop()) {
19969 if (MachineBasicBlock *Exit = P->getExitBlock()) {
19970 auto I = Exit->getFirstNonDebugInstr();
19971 if (I != Exit->end() && I->getOpcode() == AMDGPU::S_INST_PREFETCH)
19972 return CacheLineAlign;
19973 }
19974 }
19975
19976 MachineBasicBlock *Pre = ML->getLoopPreheader();
19977 MachineBasicBlock *Exit = ML->getExitBlock();
19978
19979 if (Pre && Exit) {
19980 auto PreTerm = Pre->getFirstTerminator();
19981 if (PreTerm == Pre->begin() ||
19982 std::prev(PreTerm)->getOpcode() != AMDGPU::S_INST_PREFETCH)
19983 BuildMI(*Pre, PreTerm, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19984 .addImm(1); // prefetch 2 lines behind PC
19985
19986 auto ExitHead = Exit->getFirstNonDebugInstr();
19987 if (ExitHead == Exit->end() ||
19988 ExitHead->getOpcode() != AMDGPU::S_INST_PREFETCH)
19989 BuildMI(*Exit, ExitHead, DebugLoc(), TII->get(AMDGPU::S_INST_PREFETCH))
19990 .addImm(2); // prefetch 1 line behind PC
19991 }
19992
19993 return CacheLineAlign;
19994}
19995
19997 MachineBasicBlock *MBB) const {
19998 // GFX950: Limit padding to 4 bytes (one s_nop) for blocks where an 8-byte
19999 // instruction could be split by the 32-byte fetch window boundary.
20000 // See getPrefLoopAlignment() for context.
20001 if (needsFetchWindowAlignment(*MBB))
20002 return 4;
20004}
20005
20006bool SITargetLowering::needsFetchWindowAlignment(
20007 const MachineBasicBlock &MBB) const {
20008 if (!getSubtarget()->hasLoopHeadInstSplitSensitivity())
20009 return false;
20011 for (const MachineInstr &MI : MBB) {
20012 if (MI.isMetaInstruction())
20013 continue;
20014 // Instructions larger than 4 bytes can be split by a 32-byte boundary.
20015 return TII->getInstSizeInBytes(MI) > 4;
20016 }
20017 return false;
20018}
20019
20020[[maybe_unused]]
20021static bool isCopyFromRegOfInlineAsm(const SDNode *N) {
20022 assert(N->getOpcode() == ISD::CopyFromReg);
20023 do {
20024 // Follow the chain until we find an INLINEASM node.
20025 N = N->getOperand(0).getNode();
20026 if (N->getOpcode() == ISD::INLINEASM || N->getOpcode() == ISD::INLINEASM_BR)
20027 return true;
20028 } while (N->getOpcode() == ISD::CopyFromReg);
20029 return false;
20030}
20031
20034 UniformityInfo *UA) const {
20035 switch (N->getOpcode()) {
20036 case ISD::CopyFromReg: {
20037 const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
20038 const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
20039 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
20040 Register Reg = R->getReg();
20041
20042 // FIXME: Why does this need to consider isLiveIn?
20043 if (Reg.isPhysical() || MRI.isLiveIn(Reg))
20044 return !TRI->isSGPRReg(MRI, Reg);
20045
20046 if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
20047 return UA->isDivergentAtDef(V);
20048
20050 return !TRI->isSGPRReg(MRI, Reg);
20051 }
20052 case ISD::LOAD: {
20053 const LoadSDNode *L = cast<LoadSDNode>(N);
20054 unsigned AS = L->getAddressSpace();
20055 // A flat load may access private memory.
20057 }
20058 case ISD::CALLSEQ_END:
20059 return true;
20061 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(0));
20063 return AMDGPU::isIntrinsicSourceOfDivergence(N->getConstantOperandVal(1));
20064 case AMDGPUISD::ATOMIC_CMP_SWAP:
20065 case AMDGPUISD::BUFFER_ATOMIC_SWAP:
20066 case AMDGPUISD::BUFFER_ATOMIC_ADD:
20067 case AMDGPUISD::BUFFER_ATOMIC_SUB:
20068 case AMDGPUISD::BUFFER_ATOMIC_SMIN:
20069 case AMDGPUISD::BUFFER_ATOMIC_UMIN:
20070 case AMDGPUISD::BUFFER_ATOMIC_SMAX:
20071 case AMDGPUISD::BUFFER_ATOMIC_UMAX:
20072 case AMDGPUISD::BUFFER_ATOMIC_AND:
20073 case AMDGPUISD::BUFFER_ATOMIC_OR:
20074 case AMDGPUISD::BUFFER_ATOMIC_XOR:
20075 case AMDGPUISD::BUFFER_ATOMIC_INC:
20076 case AMDGPUISD::BUFFER_ATOMIC_DEC:
20077 case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
20078 case AMDGPUISD::BUFFER_ATOMIC_FADD:
20079 case AMDGPUISD::BUFFER_ATOMIC_FMIN:
20080 case AMDGPUISD::BUFFER_ATOMIC_FMAX:
20081 // Target-specific read-modify-write atomics are sources of divergence.
20082 return true;
20083 default:
20084 if (auto *A = dyn_cast<AtomicSDNode>(N)) {
20085 // Generic read-modify-write atomics are sources of divergence.
20086 return A->readMem() && A->writeMem();
20087 }
20088 return false;
20089 }
20090}
20091
20093 EVT VT) const {
20094 switch (VT.getScalarType().getSimpleVT().SimpleTy) {
20095 case MVT::f32:
20097 case MVT::f64:
20098 case MVT::f16:
20100 default:
20101 return false;
20102 }
20103}
20104
20106 LLT Ty, const MachineFunction &MF) const {
20107 switch (Ty.getScalarSizeInBits()) {
20108 case 32:
20109 return !denormalModeIsFlushAllF32(MF);
20110 case 64:
20111 case 16:
20112 return !denormalModeIsFlushAllF64F16(MF);
20113 default:
20114 return false;
20115 }
20116}
20117
20119 const APInt &DemandedElts,
20120 const SelectionDAG &DAG,
20121 bool SNaN,
20122 unsigned Depth) const {
20123 if (Op.getOpcode() == AMDGPUISD::CLAMP) {
20124 const MachineFunction &MF = DAG.getMachineFunction();
20126
20127 if (Info->getMode().DX10Clamp)
20128 return true; // Clamped to 0.
20129 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
20130 }
20131
20133 DAG, SNaN, Depth);
20134}
20135
20136// On older subtargets, global FP atomic instructions have a hardcoded FP mode
20137// and do not support FP32 denormals, and only support v2f16/f64 denormals.
20139 if (RMW->hasMetadata("amdgpu.ignore.denormal.mode"))
20140 return true;
20141
20142 const fltSemantics &Flt = RMW->getType()->getScalarType()->getFltSemantics();
20143 auto DenormMode = RMW->getFunction()->getDenormalMode(Flt);
20144 if (DenormMode == DenormalMode::getPreserveSign())
20145 return true;
20146
20147 // TODO: Remove this.
20148 return RMW->getFunction()
20149 ->getFnAttribute("amdgpu-unsafe-fp-atomics")
20150 .getValueAsBool();
20151}
20152
20154 LLVMContext &Ctx = RMW->getContext();
20155 StringRef MemScope =
20156 Ctx.getSyncScopeName(RMW->getSyncScopeID()).value_or("system");
20157
20158 return OptimizationRemark(DEBUG_TYPE, "Passed", RMW)
20159 << "Hardware instruction generated for atomic "
20160 << RMW->getOperationName(RMW->getOperation())
20161 << " operation at memory scope " << MemScope;
20162}
20163
20164static bool isV2F16OrV2BF16(Type *Ty) {
20165 if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
20166 Type *EltTy = VT->getElementType();
20167 return VT->getNumElements() == 2 &&
20168 (EltTy->isHalfTy() || EltTy->isBFloatTy());
20169 }
20170
20171 return false;
20172}
20173
20174static bool isV2F16(Type *Ty) {
20176 return VT && VT->getNumElements() == 2 && VT->getElementType()->isHalfTy();
20177}
20178
20179static bool isV2BF16(Type *Ty) {
20181 return VT && VT->getNumElements() == 2 && VT->getElementType()->isBFloatTy();
20182}
20183
20184/// \return true if atomicrmw integer ops work for the type.
20185static bool isAtomicRMWLegalIntTy(Type *Ty) {
20186 if (auto *IT = dyn_cast<IntegerType>(Ty)) {
20187 unsigned BW = IT->getBitWidth();
20188 return BW == 32 || BW == 64;
20189 }
20190
20191 return false;
20192}
20193
20194/// \return true if this atomicrmw xchg type can be selected.
20195static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW) {
20196 Type *Ty = RMW->getType();
20197 if (isAtomicRMWLegalIntTy(Ty))
20198 return true;
20199
20200 if (PointerType *PT = dyn_cast<PointerType>(Ty)) {
20201 const DataLayout &DL = RMW->getFunction()->getParent()->getDataLayout();
20202 unsigned BW = DL.getPointerSizeInBits(PT->getAddressSpace());
20203 return BW == 32 || BW == 64;
20204 }
20205
20206 if (Ty->isFloatTy() || Ty->isDoubleTy())
20207 return true;
20208
20210 return VT->getNumElements() == 2 &&
20211 VT->getElementType()->getPrimitiveSizeInBits() == 16;
20212 }
20213
20214 return false;
20215}
20216
20217/// \returns true if it's valid to emit a native instruction for \p RMW, based
20218/// on the properties of the target memory.
20219static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
20220 const AtomicRMWInst *RMW,
20221 bool HasSystemScope) {
20222 // The remote/fine-grained access logic is different from the integer
20223 // atomics. Without AgentScopeFineGrainedRemoteMemoryAtomics support,
20224 // fine-grained access does not work, even for a device local allocation.
20225 //
20226 // With AgentScopeFineGrainedRemoteMemoryAtomics, system scoped device local
20227 // allocations work.
20228 if (HasSystemScope) {
20229 if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics() &&
20230 RMW->hasMetadata("amdgpu.no.remote.memory"))
20231 return true;
20232 if (Subtarget.hasEmulatedSystemScopeAtomics())
20233 return true;
20234 } else if (Subtarget.hasAgentScopeFineGrainedRemoteMemoryAtomics())
20235 return true;
20236
20237 return RMW->hasMetadata("amdgpu.no.fine.grained.memory");
20238}
20239
20240/// \return Action to perform on AtomicRMWInsts for integer operations.
20247
20248/// Return if a flat address space atomicrmw can access private memory.
20250 const MDNode *MD = I->getMetadata(LLVMContext::MD_noalias_addrspace);
20251 return !MD ||
20253}
20254
20257 // For GAS, lower to flat atomic.
20258 return STI.hasGloballyAddressableScratch()
20261}
20262
20265 unsigned AS = RMW->getPointerAddressSpace();
20266 if (AS == AMDGPUAS::PRIVATE_ADDRESS)
20268
20269 // 64-bit flat atomics that dynamically reside in private memory will silently
20270 // be dropped.
20271 //
20272 // Note that we will emit a new copy of the original atomic in the expansion,
20273 // which will be incrementally relegalized.
20274 const DataLayout &DL = RMW->getFunction()->getDataLayout();
20275 if (AS == AMDGPUAS::FLAT_ADDRESS &&
20276 DL.getTypeSizeInBits(RMW->getType()) == 64 &&
20279
20280 auto ReportUnsafeHWInst = [=](TargetLowering::AtomicExpansionKind Kind) {
20282 ORE.emit([=]() {
20283 return emitAtomicRMWLegalRemark(RMW) << " due to an unsafe request.";
20284 });
20285 return Kind;
20286 };
20287
20288 auto SSID = RMW->getSyncScopeID();
20289 bool HasSystemScope =
20290 SSID == SyncScope::System ||
20291 SSID == RMW->getContext().getOrInsertSyncScopeID("one-as");
20292
20293 auto Op = RMW->getOperation();
20294 switch (Op) {
20296 // PCIe supports add and xchg for system atomics.
20297 return isAtomicRMWLegalXChgTy(RMW)
20300 case AtomicRMWInst::Add:
20301 // PCIe supports add and xchg for system atomics.
20303 case AtomicRMWInst::Sub:
20304 case AtomicRMWInst::And:
20305 case AtomicRMWInst::Or:
20306 case AtomicRMWInst::Xor:
20307 case AtomicRMWInst::Max:
20308 case AtomicRMWInst::Min:
20315 if (Op == AtomicRMWInst::USubCond && !Subtarget->hasCondSubInsts())
20317 if (Op == AtomicRMWInst::USubSat && !Subtarget->hasSubClampInsts())
20320 auto *IT = dyn_cast<IntegerType>(RMW->getType());
20321 if (!IT || IT->getBitWidth() != 32)
20323 }
20324
20327 if (Subtarget->hasEmulatedSystemScopeAtomics())
20329
20330 // On most subtargets, for atomicrmw operations other than add/xchg,
20331 // whether or not the instructions will behave correctly depends on where
20332 // the address physically resides and what interconnect is used in the
20333 // system configuration. On some targets the instruction will nop,
20334 // and in others synchronization will only occur at degraded device scope.
20335 //
20336 // If the allocation is known local to the device, the instructions should
20337 // work correctly.
20338 if (RMW->hasMetadata("amdgpu.no.remote.memory"))
20340
20341 // If fine-grained remote memory works at device scope, we don't need to
20342 // do anything.
20343 if (!HasSystemScope &&
20344 Subtarget->hasAgentScopeFineGrainedRemoteMemoryAtomics())
20346
20347 // If we are targeting a remote allocated address, it depends what kind of
20348 // allocation the address belongs to.
20349 //
20350 // If the allocation is fine-grained (in host memory, or in PCIe peer
20351 // device memory), the operation will fail depending on the target.
20352 //
20353 // Note fine-grained host memory access does work on APUs or if XGMI is
20354 // used, but we do not know if we are targeting an APU or the system
20355 // configuration from the ISA version/target-cpu.
20356 if (RMW->hasMetadata("amdgpu.no.fine.grained.memory"))
20358
20361 // Atomic sub/or/xor do not work over PCI express, but atomic add
20362 // does. InstCombine transforms these with 0 to or, so undo that.
20363 if (const Constant *ConstVal = dyn_cast<Constant>(RMW->getValOperand());
20364 ConstVal && ConstVal->isNullValue())
20366 }
20367
20368 // If the allocation could be in remote, fine-grained memory, the rmw
20369 // instructions may fail. cmpxchg should work, so emit that. On some
20370 // system configurations, PCIe atomics aren't supported so cmpxchg won't
20371 // even work, so you're out of luck anyway.
20372
20373 // In summary:
20374 //
20375 // Cases that may fail:
20376 // - fine-grained pinned host memory
20377 // - fine-grained migratable host memory
20378 // - fine-grained PCIe peer device
20379 //
20380 // Cases that should work, but may be treated overly conservatively.
20381 // - fine-grained host memory on an APU
20382 // - fine-grained XGMI peer device
20384 }
20385
20387 }
20388 case AtomicRMWInst::FAdd: {
20389 Type *Ty = RMW->getType();
20390
20391 // TODO: Handle REGION_ADDRESS
20392 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20393 // DS F32 FP atomics do respect the denormal mode, but the rounding mode
20394 // is fixed to round-to-nearest-even.
20395 //
20396 // F64 / PK_F16 / PK_BF16 never flush and are also fixed to
20397 // round-to-nearest-even.
20398 //
20399 // We ignore the rounding mode problem, even in strictfp. The C++ standard
20400 // suggests it is OK if the floating-point mode may not match the calling
20401 // thread.
20402 if (Ty->isFloatTy()) {
20403 return Subtarget->hasLDSFPAtomicAddF32() ? AtomicExpansionKind::None
20405 }
20406
20407 if (Ty->isDoubleTy()) {
20408 // Ignores denormal mode, but we don't consider flushing mandatory.
20409 return Subtarget->hasLDSFPAtomicAddF64() ? AtomicExpansionKind::None
20411 }
20412
20413 if (Subtarget->hasAtomicDsPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20415
20417 }
20418
20419 // LDS atomics respect the denormal mode from the mode register.
20420 //
20421 // Traditionally f32 global/buffer memory atomics would unconditionally
20422 // flush denormals, but newer targets do not flush. f64/f16/bf16 cases never
20423 // flush.
20424 //
20425 // On targets with flat atomic fadd, denormals would flush depending on
20426 // whether the target address resides in LDS or global memory. We consider
20427 // this flat-maybe-flush as will-flush.
20428 if (Ty->isFloatTy() &&
20429 !Subtarget->hasMemoryAtomicFaddF32DenormalSupport() &&
20432
20433 // FIXME: These ReportUnsafeHWInsts are imprecise. Some of these cases are
20434 // safe. The message phrasing also should be better.
20435 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
20436 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20437 // gfx942, gfx12
20438 if (Subtarget->hasAtomicFlatPkAdd16Insts() && isV2F16OrV2BF16(Ty))
20439 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20440 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS)) {
20441 // gfx90a, gfx942, gfx12
20442 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20443 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20444
20445 // gfx942, gfx12
20446 if (Subtarget->hasAtomicGlobalPkAddBF16Inst() && isV2BF16(Ty))
20447 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20448 } else if (AS == AMDGPUAS::BUFFER_FAT_POINTER) {
20449 // gfx90a, gfx942, gfx12
20450 if (Subtarget->hasAtomicBufferGlobalPkAddF16Insts() && isV2F16(Ty))
20451 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20452
20453 // While gfx90a/gfx942 supports v2bf16 for global/flat, it does not for
20454 // buffer. gfx12 does have the buffer version.
20455 if (Subtarget->hasAtomicBufferPkAddBF16Inst() && isV2BF16(Ty))
20456 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20457 }
20458
20459 // global and flat atomic fadd f64: gfx90a, gfx942.
20460 if (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() && Ty->isDoubleTy())
20461 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20462
20463 if (AS != AMDGPUAS::FLAT_ADDRESS) {
20464 if (Ty->isFloatTy()) {
20465 // global/buffer atomic fadd f32 no-rtn: gfx908, gfx90a, gfx942,
20466 // gfx11+.
20467 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20468 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20469 // global/buffer atomic fadd f32 rtn: gfx90a, gfx942, gfx11+.
20470 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20471 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20472 } else {
20473 // gfx908
20474 if (RMW->use_empty() &&
20475 Subtarget->hasAtomicBufferGlobalPkAddF16NoRtnInsts() &&
20476 isV2F16(Ty))
20477 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20478 }
20479 }
20480
20481 // flat atomic fadd f32: gfx942, gfx11+.
20482 if (AS == AMDGPUAS::FLAT_ADDRESS && Ty->isFloatTy()) {
20483 if (Subtarget->hasFlatAtomicFaddF32Inst())
20484 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20485
20486 // If it is in flat address space, and the type is float, we will try to
20487 // expand it, if the target supports global and lds atomic fadd. The
20488 // reason we need that is, in the expansion, we emit the check of
20489 // address space. If it is in global address space, we emit the global
20490 // atomic fadd; if it is in shared address space, we emit the LDS atomic
20491 // fadd.
20492 if (Subtarget->hasLDSFPAtomicAddF32()) {
20493 if (RMW->use_empty() && Subtarget->hasAtomicFaddNoRtnInsts())
20495 if (!RMW->use_empty() && Subtarget->hasAtomicFaddRtnInsts())
20497 }
20498 }
20499 }
20500
20502 }
20504 case AtomicRMWInst::FMax: {
20505 Type *Ty = RMW->getType();
20506
20507 // LDS float and double fmin/fmax were always supported.
20508 if (AS == AMDGPUAS::LOCAL_ADDRESS) {
20509 return Ty->isFloatTy() || Ty->isDoubleTy() ? AtomicExpansionKind::None
20511 }
20512
20513 if (globalMemoryFPAtomicIsLegal(*Subtarget, RMW, HasSystemScope)) {
20514 // For flat and global cases:
20515 // float, double in gfx7. Manual claims denormal support.
20516 // Removed in gfx8.
20517 // float, double restored in gfx10.
20518 // double removed again in gfx11, so only f32 for gfx11/gfx12.
20519 //
20520 // For gfx9, gfx90a and gfx942 support f64 for global (same as fadd), but
20521 // no f32.
20522 if (AS == AMDGPUAS::FLAT_ADDRESS) {
20523 if (Subtarget->hasAtomicFMinFMaxF32FlatInsts() && Ty->isFloatTy())
20524 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20525 if (Subtarget->hasAtomicFMinFMaxF64FlatInsts() && Ty->isDoubleTy())
20526 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20527 } else if (AMDGPU::isExtendedGlobalAddrSpace(AS) ||
20529 if (Subtarget->hasAtomicFMinFMaxF32GlobalInsts() && Ty->isFloatTy())
20530 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20531 if (Subtarget->hasAtomicFMinFMaxF64GlobalInsts() && Ty->isDoubleTy())
20532 return ReportUnsafeHWInst(AtomicExpansionKind::None);
20533 }
20534 }
20535
20537 }
20540 default:
20542 }
20543
20544 llvm_unreachable("covered atomicrmw op switch");
20545}
20546
20553
20560
20563 const AtomicCmpXchgInst *CmpX) const {
20564 unsigned AddrSpace = CmpX->getPointerAddressSpace();
20565 if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
20567
20568 if (AddrSpace != AMDGPUAS::FLAT_ADDRESS || !flatInstrMayAccessPrivate(CmpX))
20570
20571 const DataLayout &DL = CmpX->getDataLayout();
20572
20573 Type *ValTy = CmpX->getNewValOperand()->getType();
20574
20575 // If a 64-bit flat atomic may alias private, we need to avoid using the
20576 // atomic in the private case.
20577 return DL.getTypeSizeInBits(ValTy) == 64 ? AtomicExpansionKind::CustomExpand
20579}
20580
20581const TargetRegisterClass *
20582SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
20584 const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
20585 if (RC == &AMDGPU::VReg_1RegClass && !isDivergent)
20586 return Subtarget->isWave64() ? &AMDGPU::SReg_64RegClass
20587 : &AMDGPU::SReg_32RegClass;
20588 if (!TRI->isSGPRClass(RC) && !isDivergent)
20589 return TRI->getEquivalentSGPRClass(RC);
20590 if (TRI->isSGPRClass(RC) && isDivergent) {
20591 if (Subtarget->hasGFX90AInsts())
20592 return TRI->getEquivalentAVClass(RC);
20593 return TRI->getEquivalentVGPRClass(RC);
20594 }
20595
20596 return RC;
20597}
20598
20599// FIXME: This is a workaround for DivergenceAnalysis not understanding always
20600// uniform values (as produced by the mask results of control flow intrinsics)
20601// used outside of divergent blocks. The phi users need to also be treated as
20602// always uniform.
20603//
20604// FIXME: DA is no longer in-use. Does this still apply to UniformityAnalysis?
20605static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
20606 unsigned WaveSize) {
20607 // FIXME: We assume we never cast the mask results of a control flow
20608 // intrinsic.
20609 // Early exit if the type won't be consistent as a compile time hack.
20610 IntegerType *IT = dyn_cast<IntegerType>(V->getType());
20611 if (!IT || IT->getBitWidth() != WaveSize)
20612 return false;
20613
20614 if (!isa<Instruction>(V))
20615 return false;
20616 if (!Visited.insert(V).second)
20617 return false;
20618 bool Result = false;
20619 for (const auto *U : V->users()) {
20621 if (V == U->getOperand(1)) {
20622 switch (Intrinsic->getIntrinsicID()) {
20623 default:
20624 Result = false;
20625 break;
20626 case Intrinsic::amdgcn_if_break:
20627 case Intrinsic::amdgcn_if:
20628 case Intrinsic::amdgcn_else:
20629 Result = true;
20630 break;
20631 }
20632 }
20633 if (V == U->getOperand(0)) {
20634 switch (Intrinsic->getIntrinsicID()) {
20635 default:
20636 Result = false;
20637 break;
20638 case Intrinsic::amdgcn_end_cf:
20639 case Intrinsic::amdgcn_loop:
20640 Result = true;
20641 break;
20642 }
20643 }
20644 } else {
20645 Result = hasCFUser(U, Visited, WaveSize);
20646 }
20647 if (Result)
20648 break;
20649 }
20650 return Result;
20651}
20652
20654 const Value *V) const {
20655 if (const CallInst *CI = dyn_cast<CallInst>(V)) {
20656 if (CI->isInlineAsm()) {
20657 // FIXME: This cannot give a correct answer. This should only trigger in
20658 // the case where inline asm returns mixed SGPR and VGPR results, used
20659 // outside the defining block. We don't have a specific result to
20660 // consider, so this assumes if any value is SGPR, the overall register
20661 // also needs to be SGPR.
20662 const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
20664 MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
20665 for (auto &TC : TargetConstraints) {
20666 if (TC.Type == InlineAsm::isOutput) {
20668 const TargetRegisterClass *RC =
20669 getRegForInlineAsmConstraint(SIRI, TC.ConstraintCode,
20670 TC.ConstraintVT)
20671 .second;
20672 if (RC && SIRI->isSGPRClass(RC))
20673 return true;
20674 }
20675 }
20676 }
20677 }
20679 return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
20680}
20681
20683 for (SDUse &Use : N->uses()) {
20685 if (getBasePtrIndex(M) == Use.getOperandNo())
20686 return true;
20687 }
20688 }
20689 return false;
20690}
20691
20693 SDValue N1) const {
20694 if (!N0.hasOneUse())
20695 return false;
20696 // Take care of the opportunity to keep N0 uniform
20697 if (N0->isDivergent() || !N1->isDivergent())
20698 return true;
20699 // Check if we have a good chance to form the memory access pattern with the
20700 // base and offset
20701 return (DAG.isBaseWithConstantOffset(N0) &&
20703}
20704
20706 Register N0, Register N1) const {
20707 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
20708}
20709
20712 // Propagate metadata set by AMDGPUAnnotateUniformValues to the MMO of a load.
20714 if (I.getMetadata("amdgpu.noclobber"))
20715 Flags |= MONoClobber;
20716 if (I.getMetadata("amdgpu.last.use"))
20717 Flags |= MOLastUse;
20718 return Flags;
20719}
20720
20722 Instruction *AI) const {
20723 // Given: atomicrmw fadd ptr %addr, float %val ordering
20724 //
20725 // With this expansion we produce the following code:
20726 // [...]
20727 // %is.shared = call i1 @llvm.amdgcn.is.shared(ptr %addr)
20728 // br i1 %is.shared, label %atomicrmw.shared, label %atomicrmw.check.private
20729 //
20730 // atomicrmw.shared:
20731 // %cast.shared = addrspacecast ptr %addr to ptr addrspace(3)
20732 // %loaded.shared = atomicrmw fadd ptr addrspace(3) %cast.shared,
20733 // float %val ordering
20734 // br label %atomicrmw.phi
20735 //
20736 // atomicrmw.check.private:
20737 // %is.private = call i1 @llvm.amdgcn.is.private(ptr %addr)
20738 // br i1 %is.private, label %atomicrmw.private, label %atomicrmw.global
20739 //
20740 // atomicrmw.private:
20741 // %cast.private = addrspacecast ptr %addr to ptr addrspace(5)
20742 // %loaded.private = load float, ptr addrspace(5) %cast.private
20743 // %val.new = fadd float %loaded.private, %val
20744 // store float %val.new, ptr addrspace(5) %cast.private
20745 // br label %atomicrmw.phi
20746 //
20747 // atomicrmw.global:
20748 // %cast.global = addrspacecast ptr %addr to ptr addrspace(1)
20749 // %loaded.global = atomicrmw fadd ptr addrspace(1) %cast.global,
20750 // float %val ordering
20751 // br label %atomicrmw.phi
20752 //
20753 // atomicrmw.phi:
20754 // %loaded.phi = phi float [ %loaded.shared, %atomicrmw.shared ],
20755 // [ %loaded.private, %atomicrmw.private ],
20756 // [ %loaded.global, %atomicrmw.global ]
20757 // br label %atomicrmw.end
20758 //
20759 // atomicrmw.end:
20760 // [...]
20761 //
20762 //
20763 // For 64-bit atomics which may reside in private memory, we perform a simpler
20764 // version that only inserts the private check, and uses the flat operation.
20765
20766 IRBuilder<> Builder(AI);
20767 LLVMContext &Ctx = Builder.getContext();
20768
20769 auto *RMW = dyn_cast<AtomicRMWInst>(AI);
20770 const unsigned PtrOpIdx = RMW ? AtomicRMWInst::getPointerOperandIndex()
20772 Value *Addr = AI->getOperand(PtrOpIdx);
20773
20774 /// TODO: Only need to check private, then emit flat-known-not private (no
20775 /// need for shared block, or cast to global).
20777
20778 Align Alignment;
20779 if (RMW)
20780 Alignment = RMW->getAlign();
20781 else if (CX)
20782 Alignment = CX->getAlign();
20783 else
20784 llvm_unreachable("unhandled atomic operation");
20785
20786 // FullFlatEmulation is true if we need to issue the private, shared, and
20787 // global cases.
20788 //
20789 // If this is false, we are only dealing with the flat-targeting-private case,
20790 // where we only insert a check for private and still use the flat instruction
20791 // for global and shared.
20792
20793 bool FullFlatEmulation =
20794 RMW && RMW->getOperation() == AtomicRMWInst::FAdd &&
20795 ((Subtarget->hasAtomicFaddInsts() && RMW->getType()->isFloatTy()) ||
20796 (Subtarget->hasFlatBufferGlobalAtomicFaddF64Inst() &&
20797 RMW->getType()->isDoubleTy()));
20798
20799 // If the return value isn't used, do not introduce a false use in the phi.
20800 bool ReturnValueIsUsed = !AI->use_empty();
20801
20802 BasicBlock *BB = Builder.GetInsertBlock();
20803 Function *F = BB->getParent();
20804 BasicBlock *ExitBB =
20805 BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
20806 BasicBlock *SharedBB = nullptr;
20807
20808 BasicBlock *CheckPrivateBB = BB;
20809 if (FullFlatEmulation) {
20810 SharedBB = BasicBlock::Create(Ctx, "atomicrmw.shared", F, ExitBB);
20811 CheckPrivateBB =
20812 BasicBlock::Create(Ctx, "atomicrmw.check.private", F, ExitBB);
20813 }
20814
20815 BasicBlock *PrivateBB =
20816 BasicBlock::Create(Ctx, "atomicrmw.private", F, ExitBB);
20817 BasicBlock *GlobalBB = BasicBlock::Create(Ctx, "atomicrmw.global", F, ExitBB);
20818 BasicBlock *PhiBB = BasicBlock::Create(Ctx, "atomicrmw.phi", F, ExitBB);
20819
20820 std::prev(BB->end())->eraseFromParent();
20821 Builder.SetInsertPoint(BB);
20822
20823 Value *LoadedShared = nullptr;
20824 if (FullFlatEmulation) {
20825 Value *IsShared = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_shared,
20826 {Addr}, nullptr, "is.shared");
20827 Builder.CreateCondBr(IsShared, SharedBB, CheckPrivateBB);
20828 Builder.SetInsertPoint(SharedBB);
20829 Value *CastToLocal = Builder.CreateAddrSpaceCast(
20831
20832 Instruction *Clone = AI->clone();
20833 Clone->insertInto(SharedBB, SharedBB->end());
20834 Clone->getOperandUse(PtrOpIdx).set(CastToLocal);
20835 LoadedShared = Clone;
20836
20837 Builder.CreateBr(PhiBB);
20838 Builder.SetInsertPoint(CheckPrivateBB);
20839 }
20840
20841 Value *IsPrivate = Builder.CreateIntrinsic(Intrinsic::amdgcn_is_private,
20842 {Addr}, nullptr, "is.private");
20843 Builder.CreateCondBr(IsPrivate, PrivateBB, GlobalBB);
20844
20845 Builder.SetInsertPoint(PrivateBB);
20846
20847 Value *CastToPrivate = Builder.CreateAddrSpaceCast(
20849
20850 Value *LoadedPrivate;
20851 if (RMW) {
20852 LoadedPrivate = Builder.CreateAlignedLoad(
20853 RMW->getType(), CastToPrivate, RMW->getAlign(), "loaded.private");
20854
20855 Value *NewVal = buildAtomicRMWValue(RMW->getOperation(), Builder,
20856 LoadedPrivate, RMW->getValOperand());
20857
20858 Builder.CreateAlignedStore(NewVal, CastToPrivate, RMW->getAlign());
20859 } else {
20860 auto [ResultLoad, Equal] =
20861 buildCmpXchgValue(Builder, CastToPrivate, CX->getCompareOperand(),
20862 CX->getNewValOperand(), CX->getAlign());
20863
20864 Value *Insert = Builder.CreateInsertValue(PoisonValue::get(CX->getType()),
20865 ResultLoad, 0);
20866 LoadedPrivate = Builder.CreateInsertValue(Insert, Equal, 1);
20867 }
20868
20869 Builder.CreateBr(PhiBB);
20870
20871 Builder.SetInsertPoint(GlobalBB);
20872
20873 // Continue using a flat instruction if we only emitted the check for private.
20874 Instruction *LoadedGlobal = AI;
20875 if (FullFlatEmulation) {
20876 Value *CastToGlobal = Builder.CreateAddrSpaceCast(
20878 AI->getOperandUse(PtrOpIdx).set(CastToGlobal);
20879 }
20880
20881 AI->removeFromParent();
20882 AI->insertInto(GlobalBB, GlobalBB->end());
20883
20884 // The new atomicrmw may go through another round of legalization later.
20885 if (!FullFlatEmulation) {
20886 // We inserted the runtime check already, make sure we do not try to
20887 // re-expand this.
20888 // TODO: Should union with any existing metadata.
20889 MDBuilder MDB(F->getContext());
20890 MDNode *RangeNotPrivate =
20893 LoadedGlobal->setMetadata(LLVMContext::MD_noalias_addrspace,
20894 RangeNotPrivate);
20895 }
20896
20897 Builder.CreateBr(PhiBB);
20898
20899 Builder.SetInsertPoint(PhiBB);
20900
20901 if (ReturnValueIsUsed) {
20902 PHINode *Loaded = Builder.CreatePHI(AI->getType(), 3);
20903 AI->replaceAllUsesWith(Loaded);
20904 if (FullFlatEmulation)
20905 Loaded->addIncoming(LoadedShared, SharedBB);
20906 Loaded->addIncoming(LoadedPrivate, PrivateBB);
20907 Loaded->addIncoming(LoadedGlobal, GlobalBB);
20908 Loaded->takeName(AI);
20909 }
20910
20911 Builder.CreateBr(ExitBB);
20912}
20913
20915 unsigned PtrOpIdx) {
 // Redirect operand PtrOpIdx of I through a pointer in the flat address
 // space: the current pointer operand is cast to a FLAT_ADDRESS pointer and
 // I is updated to use the cast result instead of the original pointer.
20916 Value *PtrOp = I->getOperand(PtrOpIdx);
20919
20920 Type *FlatPtr = PointerType::get(I->getContext(), AMDGPUAS::FLAT_ADDRESS);
 // The cast is created at I's own position so that it is available to I.
20921 Value *ASCast = CastInst::CreatePointerCast(PtrOp, FlatPtr, "scratch.ascast",
20922 I->getIterator());
20923 I->setOperand(PtrOpIdx, ASCast);
20924}
20925
20928
20931
20934 if (const auto *ConstVal = dyn_cast<Constant>(AI->getValOperand());
20935 ConstVal && ConstVal->isNullValue()) {
20936 // atomicrmw or %ptr, 0 -> atomicrmw add %ptr, 0
20938
20939 // We may still need the private-alias-flat handling below.
20940
20941 // TODO: Skip this for cases where we cannot access remote memory.
20942 }
20943 }
20944
20945 // The non-flat expansions should only perform the de-canonicalization of
20946 // identity values.
20948 return;
20949
20951}
20952
20959
20963
20965 "Expand Atomic Load only handles SCRATCH -> FLAT conversion");
20966}
20967
// A store whose pointer is in the private (scratch) address space has its
// pointer operand rewritten by convertScratchAtomicToFlatAtomic. Per the
// message below, no other conversion is handled here.
20969 if (SI->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
20970 return convertScratchAtomicToFlatAtomic(SI, SI->getPointerOperandIndex());
20971
20973 "Expand Atomic Store only handles SCRATCH -> FLAT conversion");
20974}
20975
/// Replace the atomicrmw \p AI with an atomic load from the same pointer.
///
/// The new load takes over AI's result type, alignment, atomic ordering, sync
/// scope, metadata and name. All uses of AI are redirected to the load and AI
/// is erased.
///
/// \returns the newly created load, or nullptr (without modifying AI) when
/// AI's ordering is release or stronger.
20976LoadInst *
20978 IRBuilder<> Builder(AI);
20979 auto Order = AI->getOrdering();
20980
20981 // The optimization removes the store aspect of the atomicrmw. Therefore, the
20982 // cache must be flushed if the atomic ordering had release semantics. What
20983 // is needed is not necessarily a fence; a release fence just happens to
20984 // perform that flush. Avoid replacing an atomicrmw with release semantics.
20985 if (isReleaseOrStronger(Order))
20986 return nullptr;
20987
 // Build a load of the same type, from the same address, with the same
 // alignment as the atomicrmw being replaced.
20988 LoadInst *LI = Builder.CreateAlignedLoad(
20989 AI->getType(), AI->getPointerOperand(), AI->getAlign());
 // Make the load atomic with AI's ordering and synchronization scope.
20990 LI->setAtomic(Order, AI->getSyncScopeID());
20991 LI->copyMetadata(*AI);
20992 LI->takeName(AI);
 // Hand AI's users over to the load before deleting AI.
20993 AI->replaceAllUsesWith(LI);
20994 AI->eraseFromParent();
20995 return LI;
20996}
static bool isMul(MachineInstr *MI)
return SDValue()
static unsigned getIntrinsicID(const SDNode *N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static constexpr std::pair< ImplicitArgumentMask, StringLiteral > ImplicitAttrs[]
static bool allUsesHaveSourceMods(MachineInstr &MI, MachineRegisterInfo &MRI, unsigned CostThreshold=4)
static bool isCtlzOpc(unsigned Opc)
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
static bool isNoUnsignedWrap(MachineInstr *Addr)
static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE, bool &IsTexFail)
static bool isAsyncLDSDMA(Intrinsic::ID Intr)
static void packImage16bitOpsToDwords(MachineIRBuilder &B, MachineInstr &MI, SmallVectorImpl< Register > &PackedAddrs, unsigned ArgOffset, const AMDGPU::ImageDimIntrinsicInfo *Intr, bool IsA16, bool IsG16)
Turn a set of s16 typed registers in AddrRegs into a dword sized vector with s16 typed elements.
constexpr LLT S32
static bool isKnownNonNull(Register Val, MachineRegisterInfo &MRI, const AMDGPUTargetMachine &TM, unsigned AddrSpace)
Return true if the value is a known valid address, such that a null check is not necessary.
Provides AMDGPU specific target descriptions.
The AMDGPU TargetMachine interface definition for hw codegen targets.
This file implements a class to represent arbitrary precision integral constant values and operations...
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
static cl::opt< ITMode > IT(cl::desc("IT block support"), cl::Hidden, cl::init(DefaultIT), cl::values(clEnumValN(DefaultIT, "arm-default-it", "Generate any type of IT block"), clEnumValN(RestrictedIT, "arm-restrict-it", "Disallow complex IT blocks")))
Function Alias Analysis Results
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
@ DEFAULT
Default weight is used in cases when there is no dedicated execution weight set.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< SDByteProvider > calculateByteProvider(SDValue Op, unsigned Index, unsigned Depth, std::optional< uint64_t > VectorIndex, unsigned StartingIndex=0)
dxil translate DXIL Translate Metadata
static bool isSigned(unsigned Opcode)
Utilities for dealing with flags related to floating point properties and mode controls.
AMD GCN specific subclass of TargetSubtarget.
Provides analysis for querying information about KnownBits during GISel passes.
#define DEBUG_TYPE
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
iv Induction Variable Users
Definition IVUsers.cpp:48
static constexpr Value * getValue(Ty &ValueOrUse)
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
Contains matchers for matching SSA Machine Instructions.
Machine Check Debug Module
static bool isUndef(const MachineInstr &MI)
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
static unsigned getAddressSpace(const Value *V, unsigned MaxLookup)
uint64_t IntrinsicInst * II
#define P(N)
static constexpr MCPhysReg SPReg
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
Contains matchers for matching SelectionDAG nodes and values.
static void r0(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:39
static void r3(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:57
static void r2(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:51
static void r1(uint32_t &A, uint32_t &B, uint32_t &C, uint32_t &D, uint32_t &E, int I, uint32_t *Buf)
Definition SHA1.cpp:45
#define FP_DENORM_FLUSH_NONE
Definition SIDefines.h:1490
#define FP_DENORM_FLUSH_IN_FLUSH_OUT
Definition SIDefines.h:1487
static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info)
static SDValue adjustLoadValueTypeImpl(SDValue Result, EVT LoadVT, const SDLoc &DL, SelectionDAG &DAG, bool Unpacked)
static MachineBasicBlock * emitIndirectSrc(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static bool denormalModeIsFlushAllF64F16(const MachineFunction &MF)
static bool isAtomicRMWLegalIntTy(Type *Ty)
static void knownBitsForWorkitemID(const GCNSubtarget &ST, GISelValueTracking &VT, KnownBits &Known, unsigned Dim)
static bool flatInstrMayAccessPrivate(const Instruction *I)
Return if a flat address space atomicrmw can access private memory.
static std::pair< unsigned, int > computeIndirectRegAndOffset(const SIRegisterInfo &TRI, const TargetRegisterClass *SuperRC, unsigned VecReg, int Offset)
static bool denormalModeIsFlushAllF32(const MachineFunction &MF)
static bool addresses16Bits(int Mask)
static MachineBasicBlock * expand64BitScalarArithmetic(MachineInstr &MI, MachineBasicBlock *BB)
static bool isClampZeroToOne(SDValue A, SDValue B)
static bool supportsMin3Max3(const GCNSubtarget &Subtarget, unsigned Opc, EVT VT)
static unsigned findFirstFreeSGPR(CCState &CCInfo)
static uint32_t getPermuteMask(SDValue V)
static SDValue lowerLaneOp(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static int getAlignedAGPRClassID(unsigned UnalignedClassID)
static void processPSInputArgs(SmallVectorImpl< ISD::InputArg > &Splits, CallingConv::ID CallConv, ArrayRef< ISD::InputArg > Ins, BitVector &Skipped, FunctionType *FType, SIMachineFunctionInfo *Info)
static SDValue selectSOffset(SDValue SOffset, SelectionDAG &DAG, const GCNSubtarget *Subtarget)
static SDValue getLoadExtOrTrunc(SelectionDAG &DAG, ISD::LoadExtType ExtType, SDValue Op, const SDLoc &SL, EVT VT)
static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget, const AtomicRMWInst *RMW, bool HasSystemScope)
static std::tuple< unsigned, unsigned > getDPPOpcForWaveReduction(unsigned Opc, const GCNSubtarget &ST)
static void fixMasks(SmallVectorImpl< DotSrc > &Srcs, unsigned ChainLength)
static bool is32bitWaveReduceOperation(unsigned Opc)
static TargetLowering::AtomicExpansionKind atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW)
static SDValue strictFPExtFromF16(SelectionDAG &DAG, SDValue Src)
Return the source of an fp_extend from f16 to f32, or a converted FP constant.
static bool isAtomicRMWLegalXChgTy(const AtomicRMWInst *RMW)
static bool bitOpWithConstantIsReducible(unsigned Opc, uint32_t Val)
static void convertScratchAtomicToFlatAtomic(Instruction *I, unsigned PtrOpIdx)
static bool isCopyFromRegOfInlineAsm(const SDNode *N)
static bool elementPairIsOddToEven(ArrayRef< int > Mask, int Elt)
static cl::opt< bool > DisableLoopAlignment("amdgpu-disable-loop-alignment", cl::desc("Do not align and prefetch loops"), cl::init(false))
static SDValue getDWordFromOffset(SelectionDAG &DAG, SDLoc SL, SDValue Src, unsigned DWordOffset)
static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII, MachineBasicBlock &MBB, MachineInstr &MI, unsigned InitResultReg, unsigned PhiReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static bool isFloatingPointWaveReduceOperation(unsigned Opc)
static bool isImmConstraint(StringRef Constraint)
static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT, SDValue Src, int ExtraElts)
static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static bool hasCFUser(const Value *V, SmallPtrSet< const Value *, 16 > &Visited, unsigned WaveSize)
static std::pair< Register, Register > ExtractSubRegs(MachineInstr &MI, MachineOperand &Op, const TargetRegisterClass *SrcRC, const GCNSubtarget &ST, MachineRegisterInfo &MRI)
static OptimizationRemark emitAtomicRMWLegalRemark(const AtomicRMWInst *RMW)
static unsigned SubIdx2Lane(unsigned Idx)
Helper function for adjustWritemask.
static TargetLowering::AtomicExpansionKind getPrivateAtomicExpansionKind(const GCNSubtarget &STI)
static bool addressMayBeAccessedAsPrivate(const MachineMemOperand *MMO, const SIMachineFunctionInfo &Info)
static MachineBasicBlock * lowerWaveReduce(MachineInstr &MI, MachineBasicBlock &BB, const GCNSubtarget &ST, unsigned Opc)
static bool elementPairIsContiguous(ArrayRef< int > Mask, int Elt)
static bool isV2BF16(Type *Ty)
static ArgDescriptor allocateSGPR32InputImpl(CCState &CCInfo, const TargetRegisterClass *RC, unsigned NumArgRegs)
static SDValue getMad64_32(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue N0, SDValue N1, SDValue N2, bool Signed)
static SDValue resolveSources(SelectionDAG &DAG, SDLoc SL, SmallVectorImpl< DotSrc > &Srcs, bool IsSigned, bool IsAny)
static bool hasNon16BitAccesses(uint64_t PermMask, SDValue &Op, SDValue &OtherOp)
static SDValue lowerWaveShuffle(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static void placeSources(ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, SmallVectorImpl< DotSrc > &Src0s, SmallVectorImpl< DotSrc > &Src1s, int Step)
static unsigned parseSyncscopeMDArg(const CallBase &CI, unsigned ArgIdx)
static EVT memVTFromLoadIntrReturn(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static MachineBasicBlock::iterator emitLoadM0FromVGPRLoop(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineBasicBlock &OrigBB, MachineBasicBlock &LoopBB, const DebugLoc &DL, const MachineOperand &Idx, unsigned InitReg, unsigned ResultReg, unsigned PhiReg, unsigned InitSaveExecReg, int Offset, bool UseGPRIdxMode, Register &SGPRIdxReg)
static SDValue matchPERM(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isFrameIndexOp(SDValue Op)
static ConstantFPSDNode * getSplatConstantFP(SDValue Op)
static void allocateSGPR32Input(CCState &CCInfo, ArgDescriptor &Arg)
static void knownBitsForSBFE(const MachineInstr &MI, GISelValueTracking &VT, KnownBits &Known, const APInt &DemandedElts, unsigned BFEWidth, bool SExt, unsigned Depth)
static bool isExtendedFrom16Bits(SDValue &Operand)
static std::optional< bool > checkDot4MulSignedness(const SDValue &N, ByteProvider< SDValue > &Src0, ByteProvider< SDValue > &Src1, const SDValue &S0Op, const SDValue &S1Op, const SelectionDAG &DAG)
static bool vectorEltWillFoldAway(SDValue Op)
static SDValue getSPDenormModeValue(uint32_t SPDenormMode, SelectionDAG &DAG, const SIMachineFunctionInfo *Info, const GCNSubtarget *ST)
static uint32_t getConstantPermuteMask(uint32_t C)
static AtomicOrdering parseAtomicOrderingCABIArg(const CallBase &CI, unsigned ArgIdx)
static MachineBasicBlock * emitIndirectDst(MachineInstr &MI, MachineBasicBlock &MBB, const GCNSubtarget &ST)
static void setM0ToIndexFromSGPR(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask=~0u, ArgDescriptor Arg=ArgDescriptor())
static std::pair< MachineBasicBlock *, MachineBasicBlock * > splitBlockForLoop(MachineInstr &MI, MachineBasicBlock &MBB, bool InstInLoop)
static unsigned getBasePtrIndex(const MemSDNode *N)
MemSDNode::getBasePtr() does not work for intrinsics, which needs to offset by the chain and intrinsi...
static void allocateFixedSGPRInputImpl(CCState &CCInfo, const TargetRegisterClass *RC, MCRegister Reg)
static SDValue constructRetValue(SelectionDAG &DAG, MachineSDNode *Result, ArrayRef< EVT > ResultTypes, bool IsTexFail, bool Unpacked, bool IsD16, int DMaskPop, int NumVDataDwords, bool IsAtomicPacked16Bit, const SDLoc &DL)
static std::optional< ByteProvider< SDValue > > handleMulOperand(const SDValue &MulOperand)
static ISD::CondCode tryReduceF64CompareToHiHalf(const ISD::CondCode CC, const SDValue LHS, const SDValue RHS, const SelectionDAG &DAG)
static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static Register getIndirectSGPRIdx(const SIInstrInfo *TII, MachineRegisterInfo &MRI, MachineInstr &MI, int Offset)
static SDValue emitNonHSAIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static EVT memVTFromLoadIntrData(const SITargetLowering &TLI, const DataLayout &DL, Type *Ty, unsigned MaxNumLanes)
static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc)
static unsigned getExtOpcodeForPromotedOp(SDValue Op)
static void expand64BitV_CNDMASK(MachineInstr &MI, MachineBasicBlock *BB)
static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N, SelectionDAG &DAG)
static SDValue buildSMovImm32(SelectionDAG &DAG, const SDLoc &DL, uint64_t Val)
static SDValue tryFoldMADwithSRL(SelectionDAG &DAG, const SDLoc &SL, SDValue MulLHS, SDValue MulRHS, SDValue AddRHS)
static unsigned getIntrMemWidth(unsigned IntrID)
static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL, ArrayRef< SDValue > Elts)
static SDNode * findUser(SDValue Value, unsigned Opcode)
Helper function for LowerBRCOND.
static unsigned addPermMasks(unsigned First, unsigned Second)
static uint64_t clearUnusedBits(uint64_t Val, unsigned Size)
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue C, SDValue GlueChain, SDNodeFlags Flags)
static bool isV2F16OrV2BF16(Type *Ty)
static bool atomicIgnoresDenormalModeOrFPModeIsFTZ(const AtomicRMWInst *RMW)
static SDValue emitRemovedIntrinsicError(SelectionDAG &DAG, const SDLoc &DL, EVT VT)
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL, EVT VT, SDValue A, SDValue B, SDValue GlueChain, SDNodeFlags Flags)
static SDValue buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV, const SDLoc &DL, int64_t Offset, EVT PtrVT, unsigned GAFlags=SIInstrInfo::MO_NONE)
static cl::opt< bool > UseDivergentRegisterIndexing("amdgpu-use-divergent-register-indexing", cl::Hidden, cl::desc("Use indirect register addressing for divergent indexes"), cl::init(false))
static const std::optional< ByteProvider< SDValue > > calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex=0, unsigned Depth=0)
static bool isV2F16(Type *Ty)
static void allocateSGPR64Input(CCState &CCInfo, ArgDescriptor &Arg)
static uint64_t getIdentityValueForWaveReduction(unsigned Opc)
SI DAG Lowering interface definition.
Interface definition for SIRegisterInfo.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
LLVM IR instance of the generic uniformity analysis.
static constexpr int Concat[]
Value * RHS
Value * LHS
The Input class is used to parse a yaml document into in-memory structs and vectors.
static std::optional< uint32_t > getLDSKernelIdMetadata(const Function &F)
void setDynLDSAlign(const Function &F, const GlobalVariable &GV)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
unsigned getWavefrontSize() const
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast between SrcAS and DestAS is a noop.
const std::array< unsigned, 3 > & getDims() const
static const LaneMaskConstants & get(const GCNSubtarget &ST)
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:345
static const fltSemantics & IEEEhalf()
Definition APFloat.h:295
static APFloat getQNaN(const fltSemantics &Sem, bool Negative=false, const APInt *payload=nullptr)
Factory for QNaN values.
Definition APFloat.h:1185
LLVM_ABI opStatus convert(const fltSemantics &ToSemantics, roundingMode RM, bool *losesInfo)
Definition APFloat.cpp:5901
LLVM_READONLY int getExactLog2Abs() const
Definition APFloat.h:1600
bool isNegative() const
Definition APFloat.h:1544
bool isNormal() const
Definition APFloat.h:1548
APInt bitcastToAPInt() const
Definition APFloat.h:1436
static APFloat getLargest(const fltSemantics &Sem, bool Negative=false)
Returns the largest finite number in the given semantics.
Definition APFloat.h:1203
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1163
static APFloat getZero(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Zero.
Definition APFloat.h:1144
bool isInfinity() const
Definition APFloat.h:1541
Class for arbitrary precision integers.
Definition APInt.h:78
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1408
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
bool isSignMask() const
Check if the APInt's value is returned by getSignMask.
Definition APInt.h:467
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
bool isOneBitSet(unsigned BitNo) const
Determine if this APInt Value only has the specified bit set.
Definition APInt.h:367
bool getBoolValue() const
Convert APInt to a boolean value.
Definition APInt.h:472
bool isSignBitSet() const
Determine if sign bit of this APInt is set.
Definition APInt.h:342
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool sge(const APInt &RHS) const
Signed greater or equal comparison.
Definition APInt.h:1244
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Check if an argument has a given attribute.
Definition Function.cpp:333
const Function * getParent() const
Definition Argument.h:44
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
An instruction that atomically checks whether a specified value is in a memory location,...
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
an instruction that atomically reads a memory location, combines it with another value,...
Align getAlign() const
Return the alignment of the memory that is being allocated by the instruction.
static unsigned getPointerOperandIndex()
BinOp
This enumeration lists the possible modifications atomicrmw can make.
@ Add
*p = old + v
@ FAdd
*p = old + v
@ USubCond
Subtract only if no unsigned overflow.
@ Min
*p = old <signed v ? old : v
@ Sub
*p = old - v
@ And
*p = old & v
@ Xor
*p = old ^ v
@ USubSat
*p = usub.sat(old, v) usub.sat matches the behavior of llvm.usub.sat.
@ FSub
*p = old - v
@ UIncWrap
Increment one up to a maximum value.
@ Max
*p = old >signed v ? old : v
@ UMin
*p = old <unsigned v ? old : v
@ FMin
*p = minnum(old, v) minnum matches the behavior of llvm.minnum.
@ UMax
*p = old >unsigned v ? old : v
@ FMax
*p = maxnum(old, v) maxnum matches the behavior of llvm.maxnum.
@ UDecWrap
Decrement one until a minimum value or zero.
@ Nand
*p = ~(old & v)
Value * getPointerOperand()
void setOperation(BinOp Operation)
BinOp getOperation() const
SyncScope::ID getSyncScopeID() const
Returns the synchronization scope ID of this rmw instruction.
static LLVM_ABI StringRef getOperationName(BinOp Op)
AtomicOrdering getOrdering() const
Returns the ordering constraint of this rmw instruction.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
bool isCompareAndSwap() const
Returns true if this SDNode represents cmpxchg atomic operation, false otherwise.
This class holds the attributes for a particular argument, parameter, function, or return value.
Definition Attributes.h:407
LLVM_ABI MemoryEffects getMemoryEffects() const
LLVM_ABI bool getValueAsBool() const
Return the attribute's value as a boolean.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator end()
Definition BasicBlock.h:474
LLVM_ABI BasicBlock * splitBasicBlock(iterator I, const Twine &BBName="")
Split the basic block into two basic blocks at the specified instruction.
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
A "pseudo-class" with methods for operating on BUILD_VECTORs.
Represents known origin of an individual byte in combine pattern.
static ByteProvider getConstantZero()
static ByteProvider getSrc(std::optional< ISelOp > Val, int64_t ByteOffset, int64_t VectorOffset)
std::optional< ISelOp > Src
CCState - This class holds information needed while lowering arguments and return values.
MachineFunction & getMachineFunction() const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
LLVM_ABI void AnalyzeCallResult(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeCallResult - Analyze the return values of a call, incorporating info about the passed values i...
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
int64_t AllocateStack(unsigned Size, Align Alignment)
AllocateStack - Allocate a chunk of stack space with the specified size and alignment.
LLVM_ABI void AnalyzeCallOperands(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeCallOperands - Analyze the outgoing arguments to a call, incorporating info about the passed v...
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
bool isAllocated(MCRegister Reg) const
isAllocated - Return true if the specified register (or an alias) is allocated.
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
int64_t getLocMemOffset() const
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
bool hasFnAttr(Attribute::AttrKind Kind) const
Determine whether this call has the given attribute.
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static LLVM_ABI CastInst * CreatePointerCast(Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Create a BitCast, AddrSpaceCast or a PtrToInt cast instruction.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_NE
not equal
Definition InstrTypes.h:762
bool isSigned() const
Definition InstrTypes.h:993
static bool isFPPredicate(Predicate P)
Definition InstrTypes.h:833
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:839
const APFloat & getValueAPF() const
bool isPosZero() const
Return true if the value is positive zero.
bool isOne() const
Returns true if this value is exactly +1.0.
bool isMinusOne() const
Returns true if this value is exactly -1.0.
bool isNegative() const
Return true if the value is negative.
bool isInfinity() const
Return true if the value is an infinity.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string and methods for querying it.
Definition DataLayout.h:64
LLVM_ABI Align getABITypeAlign(Type *Ty) const
Returns the minimum ABI-required alignment for the specified type.
bool isBigEndian() const
Definition DataLayout.h:218
A debug info location.
Definition DebugLoc.h:126
Diagnostic information for unsupported feature in backend.
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Register DemoteRegister
DemoteRegister - if CanLowerReturn is false, DemoteRegister is a vreg allocated to hold a pointer to ...
LLVM_ABI const Value * getValueFromVirtualReg(Register Vreg)
This method is called from TargetLoweringInfo::isSDNodeSourceOfDivergence to get the Value correspondi...
Class to represent function types.
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:357
iterator_range< arg_iterator > args()
Definition Function.h:866
Attribute getFnAttribute(Attribute::AttrKind Kind) const
Return the attribute for the given attribute kind.
Definition Function.cpp:758
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
Definition Function.cpp:799
Argument * getArg(unsigned i) const
Definition Function.h:860
const SIInstrInfo * getInstrInfo() const override
bool hasMadF16() const
unsigned getInstCacheLineSize() const
Instruction cache line size in bytes (64 for pre-GFX11, 128 for GFX11+).
const SIRegisterInfo * getRegisterInfo() const override
bool hasMin3Max3_16() const
unsigned getKnownHighZeroBitsForFrameIndex() const
Return the number of high bits known to be zero for a frame index.
bool supportsWaveWideBPermute() const
unsigned getMaxPrivateElementSize(bool ForBufferRSrc=false) const
bool isWave64() const
bool hasPrivateSegmentBuffer() const
const MachineFunction & getMachineFunction() const
void computeKnownBitsImpl(Register R, KnownBits &Known, const APInt &DemandedElts, unsigned Depth=0)
bool isDivergentAtDef(ConstValueRefT V) const
Whether V is divergent at its definition.
LLVM_ABI unsigned getAddressSpace() const
const GlobalValue * getGlobal() const
bool hasExternalLinkage() const
unsigned getAddressSpace() const
Module * getParent()
Get the module that this global value is contained inside of...
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this global belongs to.
Definition Globals.cpp:143
Type * getValueType() const
LLVM_ABI uint64_t getGlobalSize(const DataLayout &DL) const
Get the size of this global variable in bytes.
Definition Globals.cpp:578
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2848
LLVM_ABI Instruction * clone() const
Create a copy of 'this' instruction that is identical in all ways except the following:
LLVM_ABI void removeFromParent()
This method unlinks 'this' from the containing basic block, but does not delete it.
bool hasMetadata() const
Return true if this instruction has any metadata attached to it.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI const DataLayout & getDataLayout() const
Get the data layout of the module this instruction belongs to.
LLVM_ABI InstListType::iterator insertInto(BasicBlock *ParentBB, InstListType::iterator It)
Inserts an unlinked instruction into ParentBB at position It and returns the iterator of the inserted...
Class to represent integer types.
A wrapper class for inspecting calls to intrinsic functions.
constexpr unsigned getScalarSizeInBits() const
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
LLT changeElementSize(unsigned NewEltSize) const
If this type is a vector, return a vector with the same number of elements but the new element size.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void emitError(const Instruction *I, const Twine &ErrorStr)
emitError - Emit an error message to the currently installed error handler with optional location inf...
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
LLVM_ABI SyncScope::ID getOrInsertSyncScopeID(StringRef SSN)
getOrInsertSyncScopeID - Maps synchronization scope name to synchronization scope ID.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAtomic(AtomicOrdering Ordering, SyncScope::ID SSID=SyncScope::System)
Sets the ordering constraint and the synchronization scope ID of this load instruction.
static unsigned getPointerOperandIndex()
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
const SDValue & getOffset() const
ISD::LoadExtType getExtensionType() const
Return whether this is a plain node, or one of the varieties of value-extending loads.
Describe properties that are true of each instruction in the target description file.
Wrapper class representing physical registers. Should be passed by value.
Definition MCRegister.h:41
LLVM_ABI MDNode * createRange(const APInt &Lo, const APInt &Hi)
Return metadata describing the range [Lo, Hi).
Definition MDBuilder.cpp:96
Metadata node.
Definition Metadata.h:1069
const MDOperand & getOperand(unsigned I) const
Definition Metadata.h:1426
Helper class for constructing bundles of MachineInstrs.
MachineBasicBlock::instr_iterator begin() const
Return an iterator to the first bundled instruction.
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
bool bitsLE(MVT VT) const
Return true if this has no more bits than VT.
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isScalableVector() const
Return true if this is a vector value type where the runtime length is machine dependent.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
static MVT getVectorVT(MVT VT, unsigned NumElements)
static MVT getIntegerVT(unsigned BitWidth)
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
LLVM_ABI iterator getFirstTerminator()
Returns an iterator to the first terminator instruction of this basic block.
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitPoint.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
bool hasCalls() const
Return true if the current function has any function calls.
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
bool hasStackObjects() const
Return true if there are any stack objects in this function.
PseudoSourceValueManager & getPSVManager() const
const TargetSubtargetInfo & getSubtarget() const
getSubtarget - Return the subtarget for which this machine code is being compiled.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & setOperandDead(unsigned OpIdx) const
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
Representation of each machine instruction.
const MachineOperand & getOperand(unsigned i) const
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
unsigned getSubReg() const
int64_t getImm() const
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
bool isImm() const
isImm - Tests if this is a MO_Immediate operand.
static MachineOperand CreateImm(int64_t Val)
void setIsUndef(bool Val=true)
Register getReg() const
getReg - Returns the register number.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
const TargetRegisterClass * getRegClass(Register Reg) const
Return the register class of the specified virtual register.
LLVM_ABI void clearKillFlags(Register Reg) const
clearKillFlags - Iterate over all the uses of the given register and clear the kill flag from the Mac...
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI void setType(Register VReg, LLT Ty)
Set the low-level type of VReg to Ty.
LLVM_ABI void setRegClass(Register Reg, const TargetRegisterClass *RC)
setRegClass - Set the register class of the specified virtual register.
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
const TargetRegisterClass * getRegClassOrNull(Register Reg) const
Return the register class of Reg, or null if Reg has not been assigned a register class yet.
void setSimpleHint(Register VReg, Register PrefReg)
Specify the preferred (target independent) register allocation hint for the specified virtual registe...
LLVM_ABI Register cloneVirtualRegister(Register VReg, StringRef Name="")
Create and return a new virtual register in the function with the same attributes as the given regist...
unsigned getNumVirtRegs() const
getNumVirtRegs - Return the number of virtual registers created.
LLVM_ABI void replaceRegWith(Register FromReg, Register ToReg)
replaceRegWith - Replace all instances of FromReg with ToReg in the machine function.
An SDNode that represents everything that will be needed to construct a MachineInstr.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
bool onlyWritesMemory() const
Whether this function only (at most) writes memory.
Definition ModRef.h:252
bool doesNotAccessMemory() const
Whether this function accesses no memory.
Definition ModRef.h:246
bool onlyReadsMemory() const
Whether this function only (at most) reads memory.
Definition ModRef.h:249
const DataLayout & getDataLayout() const
Get the data layout for the module's target platform.
Definition Module.h:280
The optimization diagnostic interface.
LLVM_ABI void emit(DiagnosticInfoOptimizationBase &OptDiag)
Output the remark via the diagnostic handler and to the optimization record file.
Diagnostic information for applied optimization remarks.
static LLVM_ABI PointerType * get(Type *ElementType, unsigned AddressSpace)
This constructs a pointer to an object of the specified type in a numbered address space.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
LLVM_ABI const PseudoSourceValue * getConstantPool()
Return a pseudo source value referencing the constant pool.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
static Register index2VirtReg(unsigned Index)
Convert a 0-based index to a virtual register number.
Definition Register.h:72
constexpr bool isPhysical() const
Return true if the specified register number is in the physical register namespace.
Definition Register.h:83
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool isDivergent() const
bool hasOneUse() const
Return true if there is exactly one use of this node.
value_iterator value_end() const
SDNodeFlags getFlags() const
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
op_iterator op_end() const
bool isAnyAdd() const
Returns true if the node type is ADD or PTRADD.
value_iterator value_begin() const
op_iterator op_begin() const
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
bool isMachineOpcode() const
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
MVT getSimpleValueType() const
Return the simple ValueType of the referenced return value.
unsigned getMachineOpcode() const
unsigned getOpcode() const
static unsigned getMaxMUBUFImmOffset(const GCNSubtarget &ST)
static unsigned getDSShaderTypeValue(const MachineFunction &MF)
This class keeps track of the SPI_PS_INPUT_ADDR config register, which tells the hardware which inter...
AMDGPU::ClusterDimsAttr getClusterDims() const
SIModeRegisterDefaults getMode() const
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const
const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM)
static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs=1)
static LLVM_READONLY const TargetRegisterClass * getSGPRClassForBitWidth(unsigned BitWidth)
static bool isVGPRClass(const TargetRegisterClass *RC)
static bool isSGPRClass(const TargetRegisterClass *RC)
static bool isAGPRClass(const TargetRegisterClass *RC)
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
bool isTypeDesirableForOp(unsigned Op, EVT VT) const override
Return true if the target has native support for the specified value type and it is 'desirable' to us...
SDNode * PostISelFolding(MachineSDNode *N, SelectionDAG &DAG) const override
Fold the instructions after selecting them.
SDValue splitTernaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachineSDNode * wrapAddr64Rsrc(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr) const
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, EVT VT) const override
Return true if an FMA operation is faster than a pair of fmul and fadd instructions.
SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
bool requiresUniformRegister(MachineFunction &MF, const Value *V) const override
Allows target to decide about the register class of the specific value that is live outside the defin...
bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override
Returns true if N can be combined with another node to form an ISD::FMAD.
AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
void bundleInstWithWaitcnt(MachineInstr &MI) const
Insert MI into a BUNDLE with an S_WAITCNT 0 immediately following it.
SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override
Return the type to use for a scalar shift opcode, given the shifted amount type.
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
MVT getPointerTy(const DataLayout &DL, unsigned AS) const override
Map address space 7 to MVT::amdgpuBufferFatPointer because that's its in-memory representation.
bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const
void insertCopiesSplitCSR(MachineBasicBlock *Entry, const SmallVectorImpl< MachineBasicBlock * > &Exits) const override
Insert explicit copies in entry and exit blocks.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
Return the ValueType of the result of SETCC operations.
SDNode * legalizeTargetIndependentNode(SDNode *Node, SelectionDAG &DAG) const
Legalize target independent instructions (e.g.
bool allowsMisalignedMemoryAccessesImpl(unsigned Size, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override
Return the preferred vector type legalization action.
SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const
const GCNSubtarget * getSubtarget() const
bool enableAggressiveFMAFusion(EVT VT) const override
Return true if target always benefits from combining into FMA for a given value type.
bool shouldEmitGOTReloc(const GlobalValue *GV) const
void CollectTargetIntrinsicOperands(const CallInst &I, SmallVectorImpl< SDValue > &Ops, SelectionDAG &DAG) const override
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const
SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool isCanonicalized(SelectionDAG &DAG, SDValue Op, SDNodeFlags UserFlags={}, unsigned MaxDepth=5) const
void allocateSpecialInputSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocateLDSKernelId(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue LowerSTACKSAVE(SDValue Op, SelectionDAG &DAG) const
bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, SDValue N1) const override
void allocateHSAUserSGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
ArrayRef< MCPhysReg > getRoundingControlRegisters() const override
Returns a 0-terminated array of rounding control registers that can be attached to a strict FP call.
ConstraintType getConstraintType(StringRef Constraint) const override
Given a constraint, return the type of constraint it is for this target.
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool IsVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent) const override
Return the register class that should be used for the specified value type.
void AddMemOpInit(MachineInstr &MI) const
MachineMemOperand::Flags getTargetMMOFlags(const Instruction &I) const override
This callback is used to inspect load/store instructions and add target-specific MachineMemOperand fl...
bool isLegalGlobalAddressingMode(const AddrMode &AM) const
void computeKnownBitsForFrameIndex(int FrameIdx, KnownBits &Known, const MachineFunction &MF) const override
Determine which of the bits of FrameIndex FIOp are known to be 0.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Return true if it is beneficial to convert a load of a constant to just the constant itself.
Align getPrefLoopAlignment(MachineLoop *ML) const override
Return the preferred loop alignment.
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
void emitExpandAtomicStore(StoreInst *SI) const override
Perform an atomic store in a target-specific way.
AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
Align computeKnownAlignForTargetInstr(GISelValueTracking &Analysis, Register R, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine the known alignment for the pointer value R.
bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const
bool isShuffleMaskLegal(ArrayRef< int >, EVT) const override
Targets can use this to indicate that they only support some VECTOR_SHUFFLE operations,...
void emitExpandAtomicLoad(LoadInst *LI) const override
Perform an atomic load in a target-specific way.
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
Register getRegisterByName(const char *RegName, LLT VT, const MachineFunction &MF) const override
Return the register ID of the name passed in.
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
Lower the specified operand into the Ops vector.
LLT getPreferredShiftAmountTy(LLT Ty) const override
Return the preferred type to use for a shift opcode, given the shifted amount type is ShiftValueTy.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
Return true if the addressing mode represented by AM is legal for this target, for a load/store of th...
SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const
bool shouldPreservePtrArith(const Function &F, EVT PtrVT) const override
True if target has some particular form of dealing with pointer arithmetic semantics for pointers wit...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &, const CallBase &, MachineFunction &MF, unsigned IntrinsicID) const override
Given an intrinsic, checks if on the target the intrinsic will need to map to a MemIntrinsicNode (tou...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const
void allocateSpecialInputVGPRsFixed(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments in fixed registers.
LoadInst * lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const override
On some platforms, an AtomicRMW that never actually modifies the value (such as fetch_add of 0) can b...
MachineBasicBlock * emitGWSMemViolTestLoop(MachineInstr &MI, MachineBasicBlock *BB) const
bool getAddrModeArguments(const IntrinsicInst *I, SmallVectorImpl< Value * > &Ops, Type *&AccessTy) const override
CodeGenPrepare sinks address calculations into the same BB as Load/Store instructions reading the add...
bool checkAsmConstraintValA(SDValue Op, uint64_t Val, unsigned MaxSize=64) const
bool shouldEmitFixup(const GlobalValue *GV) const
MachineBasicBlock * splitKillBlock(MachineInstr &MI, MachineBasicBlock *BB) const
void emitExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) const override
Perform a cmpxchg expansion using a target-specific method.
bool canTransformPtrArithOutOfBounds(const Function &F, EVT PtrVT) const override
True if the target allows transformations of in-bounds pointer arithmetic that cause out-of-bounds in...
bool hasMemSDNodeUser(SDNode *N) const
bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, UniformityInfo *UA) const override
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
bool isEligibleForTailCallOptimization(SDValue Callee, CallingConv::ID CalleeCC, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SmallVectorImpl< ISD::InputArg > &Ins, SelectionDAG &DAG) const
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const
bool isLegalFlatAddressingMode(const AddrMode &AM, unsigned AddrSpace) const
SDValue LowerCallResult(SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals, bool isThisReturn, SDValue ThisVal) const
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::InputArg > &Ins, const SDLoc &DL, SelectionDAG &DAG, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower the incoming (formal) arguments, described by the Ins array,...
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode, EVT DestVT, EVT SrcVT) const override
Return true if an fpext operation input to an Opcode operation is free (for instance,...
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
Assign the register class depending on the number of bits set in the writemask.
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void allocateSpecialInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
Allocate implicit function VGPR arguments at the end of allocated user arguments.
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
static bool isNonGlobalAddrSpace(unsigned AS)
void emitExpandAtomicAddrSpacePredicate(Instruction *AI) const
MachineSDNode * buildRSRC(SelectionDAG &DAG, const SDLoc &DL, SDValue Ptr, uint32_t RsrcDword1, uint64_t RsrcDword2And3) const
Return a resource descriptor with the 'Add TID' bit enabled. The TID (Thread ID) is multiplied by the ...
unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const override
Certain targets require unusual breakdowns of certain types.
bool mayBeEmittedAsTailCall(const CallInst *) const override
Return true if the target may be able to emit the call instruction as a tail call.
void passSpecialInputs(CallLoweringInfo &CLI, CCState &CCInfo, const SIMachineFunctionInfo &Info, SmallVectorImpl< std::pair< unsigned, SDValue > > &RegsToPass, SmallVectorImpl< SDValue > &MemOpChains, SDValue Chain) const
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
bool checkAsmConstraintVal(SDValue Op, StringRef Constraint, uint64_t Val) const
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
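If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.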
void emitExpandAtomicRMW(AtomicRMWInst *AI) const override
Perform an atomicrmw expansion in a target-specific way.
static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem, bool IsDivergentIdx, const GCNSubtarget *Subtarget)
Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be expanded into a set of cmp...
bool shouldUseLDSConstAddress(const GlobalValue *GV) const
bool supportSplitCSR(MachineFunction *MF) const override
Return true if the target supports that a subset of CSRs for the given machine function is handled ex...
bool isExtractVecEltCheap(EVT VT, unsigned Index) const override
Return true if extraction of a scalar element from the given vector type at the given index is cheap.
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *IsFast=nullptr) const override
LLT handling variant.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for extracting this result type from this source type with ...
bool canMergeStoresTo(unsigned AS, EVT MemVT, const MachineFunction &MF) const override
Returns true if it's reasonable to merge stores to MemVT size.
SDValue lowerPREFETCH(SDValue Op, SelectionDAG &DAG) const
SITargetLowering(const TargetMachine &tm, const GCNSubtarget &STI)
void computeKnownBitsForTargetInstr(GISelValueTracking &Analysis, Register R, KnownBits &Known, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool isFreeAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override
Returns true if a cast from SrcAS to DestAS is "cheap", such that e.g.
bool shouldEmitPCReloc(const GlobalValue *GV) const
AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
void initializeSplitCSR(MachineBasicBlock *Entry) const override
Perform necessary initialization to handle a subset of CSRs explicitly via copies.
void allocateSpecialEntryInputVGPRs(CCState &CCInfo, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
void allocatePreloadKernArgSGPRs(CCState &CCInfo, SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< ISD::InputArg > &Ins, MachineFunction &MF, const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const
MachinePointerInfo getKernargSegmentPtrInfo(MachineFunction &MF) const
unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const override
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
MVT getPointerMemTy(const DataLayout &DL, unsigned AS) const override
Similarly, the in-memory representation of a p7 is {p8, i32}, aka v8i32 when padding is added.
void allocateSystemSGPRs(CCState &CCInfo, MachineFunction &MF, SIMachineFunctionInfo &Info, CallingConv::ID CallConv, bool IsShader) const
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, LLVMContext &Context, const Type *RetTy) const override
This hook should be implemented to check whether the return values described by the Outs array can fi...
unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const override
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
SDValue getExtractVectorElt(const SDLoc &DL, EVT VT, SDValue Vec, unsigned Idx)
Extract element at Idx from Vec.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr, unsigned SrcAS, unsigned DestAS)
Return an AddrSpaceCastSDNode.
bool isKnownNeverSNaN(SDValue Op, const APInt &DemandedElts, unsigned Depth=0) const
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getAtomicLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT MemVT, EVT VT, SDValue Chain, SDValue Ptr, MachineMemOperand *MMO)
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI bool isConstantIntBuildVectorOrConstantInt(SDValue N, bool AllowOpaques=true) const
Test whether the given value is a constant int or similar node.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI bool SignBitIsZeroFP(SDValue Op, unsigned Depth=0) const
Return true if the sign bit of Op is known to be zero, for a floating-point value.
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT, SDValue Chain, SDValue Ptr, SDValue Val, MachineMemOperand *MMO)
Gets a node for an atomic op, produces result (if relevant) and chain and takes 2 operands.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
LLVM_ABI SDValue getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src, SDValue Size, Align DstAlign, Align SrcAlign, bool isVol, bool AlwaysInline, const CallInst *CI, std::optional< bool > OverrideTailCall, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo, const AAMDNodes &AAInfo=AAMDNodes(), BatchAAResults *BatchAA=nullptr)
const TargetLowering & getTargetLoweringInfo() const
LLVM_ABI std::pair< EVT, EVT > GetSplitDestVTs(const EVT &VT) const
Compute the VTs needed for the low/hi parts of a type which is split (or expanded) into two not neces...
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcastedAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by first bitcasting (from potentia...
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getTokenFactor(const SDLoc &DL, SmallVectorImpl< SDValue > &Vals)
Creates a new TokenFactor containing Vals.
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getMemBasePlusOffset(SDValue Base, TypeSize Offset, const SDLoc &DL, const SDNodeFlags Flags=SDNodeFlags())
Returns sum of the base pointer and offset.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
LLVM_ABI void RemoveDeadNode(SDNode *N)
Remove the specified node from the system.
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
const TargetMachine & getTarget() const
LLVM_ABI SDValue getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either any-extending or truncat...
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI bool isBaseWithConstantOffset(SDValue Op) const
Return true if the specified operand is an ISD::ADD with a ConstantSDNode on the right-hand side,...
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
SDValue getSplatBuildVector(EVT VT, const SDLoc &DL, SDValue Op)
Return a splat ISD::BUILD_VECTOR node, consisting of Op splatted to all elements.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
size_type count(const T &V) const
count - Return 1 if the element is in the set, 0 otherwise.
Definition SmallSet.h:176
bool empty() const
Definition SmallSet.h:169
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
constexpr bool empty() const
Check if the string is empty.
Definition StringRef.h:141
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
A switch()-like statement whose cases are string literals.
StringSwitch & Case(StringLiteral S, T Value)
Information about stack frame layout on the target.
Align getStackAlign() const
getStackAlignment - This method returns the number of bytes to which the stack pointer must be aligne...
StackDirection getStackGrowthDirection() const
getStackGrowthDirection - Return the direction the stack grows
TargetInstrInfo - Interface to description of machine instruction set.
Type * Ty
Same as OrigTy, or partially legalized for soft float libcalls.
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
virtual unsigned getMaxPermittedBytesForAlignment(MachineBasicBlock *MBB) const
Return the maximum amount of bytes allowed to be emitted when padding for alignment.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
LegalizeTypeAction
This enum indicates whether types are legal for a target, and if not, what action should be used to...
void setHasExtractBitsInsn(bool hasExtractInsn=true)
Tells the code generator that the target has BitExtract instructions.
virtual TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const
Return the preferred vector type legalization action.
virtual unsigned getVectorTypeBreakdownForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT, unsigned &NumIntermediates, MVT &RegisterVT) const
Certain targets such as MIPS require that some types such as vectors are always broken down into scal...
Register getStackPointerRegisterToSaveRestore() const
If a physical register, this specifies the register that llvm.savestack/llvm.restorestack should save...
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
virtual Align getPrefLoopAlignment(MachineLoop *ML=nullptr) const
Return the preferred loop alignment.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
virtual MVT getPointerMemTy(const DataLayout &DL, uint32_t AS=0) const
Return the in-memory pointer type for the given address space, defaults to the pointer type from the ...
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
virtual void computeKnownBitsForFrameIndex(int FIOp, KnownBits &Known, const MachineFunction &MF) const
Determine which of the bits of FrameIndex FIOp are known to be 0.
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
std::vector< AsmOperandInfo > AsmOperandInfoVector
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
SDValue expandFMINIMUMNUM_FMAXIMUMNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminimumnum/fmaximumnum into multiple comparison with selects.
virtual bool isTypeDesirableForOp(unsigned, EVT VT) const
Return true if the target has native support for the specified value type and it is 'desirable' to us...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
virtual AsmOperandInfoVector ParseConstraints(const DataLayout &DL, const TargetRegisterInfo *TRI, const CallBase &Call) const
Split up the constraint string from the inline assembly value into the specific constraints and their...
SDValue expandRoundInexactToOdd(EVT ResultVT, SDValue Op, const SDLoc &DL, SelectionDAG &DAG) const
Truncate Op to ResultVT.
virtual void ComputeConstraintToUse(AsmOperandInfo &OpInfo, SDValue Op, SelectionDAG *DAG=nullptr) const
Determines the constraint code and constraint type to use for the specific AsmOperandInfo,...
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
SDValue expandFMINNUM_FMAXNUM(SDNode *N, SelectionDAG &DAG) const
Expand fminnum/fmaxnum into fminnum_ieee/fmaxnum_ieee with quieted inputs.
Primary interface to the complete machine description for the target machine.
CodeGenOptLevel getOptLevel() const
Returns the optimization level: None, Less, Default, or Aggressive.
const Triple & getTargetTriple() const
bool shouldAssumeDSOLocal(const GlobalValue *GV) const
TargetOptions Options
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
unsigned getNumRegs() const
Return the number of registers in this class.
unsigned getID() const
Return the register class ID number.
MCRegister getRegister(unsigned i) const
Return the specified register in the class.
iterator begin() const
begin/end - Return all of the registers in this class.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
OSType getOS() const
Get the parsed operating system type of this triple.
Definition Triple.h:434
Twine - A lightweight data structure for efficiently representing the concatenation of temporary valu...
Definition Twine.h:82
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
LLVM_ABI unsigned getPointerAddressSpace() const
Get the address space of this pointer or pointer vector type.
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
bool isFunctionTy() const
True if this is an instance of FunctionType.
Definition Type.h:273
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
LLVM_ABI const fltSemantics & getFltSemantics() const
Definition Type.cpp:106
bool isVoidTy() const
Return true if this is 'void'.
Definition Type.h:141
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:36
LLVM_ABI void set(Value *Val)
Definition Value.h:874
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
iterator_range< user_iterator > users()
Definition Value.h:426
bool use_empty() const
Definition Value.h:346
iterator_range< use_iterator > uses()
Definition Value.h:380
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isKnownEven() const
A return value of true indicates we know at compile time that the number of elements (vscale * Min) i...
Definition TypeSize.h:176
self_iterator getIterator()
Definition ilist_node.h:123
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ BUFFER_STRIDED_POINTER
Address space for 192-bit fat buffer pointers with an additional index.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ STREAMOUT_REGISTER
Internal address spaces. Can be freely renumbered.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ FLAT_ADDRESS
Address space for flat memory.
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
@ BUFFER_FAT_POINTER
Address space for 160-bit buffer fat pointers.
@ PRIVATE_ADDRESS
Address space for private memory.
@ BUFFER_RESOURCE
Address space for 128-bit buffer resources.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char NumVGPRs[]
Key for Kernel::CodeProps::Metadata::mNumVGPRs.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char SymbolName[]
Key for Kernel::Metadata::mSymbolName.
bool isInlinableLiteralBF16(int16_t Literal, bool HasInv2Pi)
LLVM_READONLY const MIMGG16MappingInfo * getMIMGG16MappingInfo(unsigned G)
bool isInlinableLiteralFP16(int16_t Literal, bool HasInv2Pi)
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding, unsigned VDataDwords, unsigned VAddrDwords)
LLVM_READNONE constexpr bool isShader(CallingConv::ID CC)
bool shouldEmitConstantsToTextSection(const Triple &TT)
bool isFlatGlobalAddrSpace(unsigned AS)
const uint64_t FltRoundToHWConversionTable
bool isGFX12Plus(const MCSubtargetInfo &STI)
unsigned getNSAMaxSize(const MCSubtargetInfo &STI, bool HasSampler)
constexpr int64_t getNullPointerValue(unsigned AS)
Get the null pointer value for the given address space.
bool isGFX11(const MCSubtargetInfo &STI)
bool isGFX13(const MCSubtargetInfo &STI)
bool hasValueInRangeLikeMetadata(const MDNode &MD, int64_t Val)
Checks if Val is inside MD, a !range-like metadata.
LLVM_READNONE bool isLegalDPALU_DPPControl(const MCSubtargetInfo &ST, unsigned DC)
LLVM_READNONE constexpr bool mayTailCallThisCC(CallingConv::ID CC)
Return true if we might ever do TCO for calls with this calling convention.
unsigned getAMDHSACodeObjectVersion(const Module &M)
LLVM_READONLY bool hasNamedOperand(uint64_t Opcode, OpName NamedIdx)
LLVM_READNONE constexpr bool isKernel(CallingConv::ID CC)
LLVM_READNONE constexpr bool isEntryFunctionCC(CallingConv::ID CC)
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool isCompute(CallingConv::ID CC)
bool isIntrinsicSourceOfDivergence(unsigned IntrID)
LLVM_READNONE bool isInlinableIntLiteral(int64_t Literal)
Is this literal inlinable, and not one of the values intended for floating point values.
bool getMUBUFTfe(unsigned Opc)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
LLVM_READONLY int32_t getGlobalSaddrOp(uint32_t Opcode)
LLVM_READONLY int32_t getVOPe64(uint32_t Opcode)
bool isGFX11Plus(const MCSubtargetInfo &STI)
std::optional< unsigned > getInlineEncodingV2F16(uint32_t Literal)
std::tuple< char, unsigned, unsigned > parseAsmConstraintPhysReg(StringRef Constraint)
Returns a valid charcode or 0 in the first entry if this is a valid physical register constraint.
bool isGFX10Plus(const MCSubtargetInfo &STI)
bool isValidWMMAScaleFmtCombination(unsigned AFmt, unsigned AScale, unsigned BFmt, unsigned BScale)
bool isUniformMMO(const MachineMemOperand *MMO)
std::optional< unsigned > getInlineEncodingV2I16(uint32_t Literal)
uint32_t decodeFltRoundToHWConversionTable(uint32_t FltRounds)
Read the hardware rounding mode equivalent of an AMDGPUFltRounds value.
bool isExtendedGlobalAddrSpace(unsigned AS)
LLVM_READONLY const MIMGDimInfo * getMIMGDimInfo(unsigned DimEnum)
std::optional< unsigned > getInlineEncodingV2BF16(uint32_t Literal)
LLVM_READONLY const MIMGBaseOpcodeInfo * getMIMGBaseOpcodeInfo(unsigned BaseOpcode)
LLVM_READNONE constexpr bool isChainCC(CallingConv::ID CC)
int getMaskedMIMGOp(unsigned Opc, unsigned NewChannels)
const ImageDimIntrinsicInfo * getImageDimIntrinsicInfo(unsigned Intr)
bool isInlinableLiteralI16(int32_t Literal, bool HasInv2Pi)
LLVM_READNONE constexpr bool canGuaranteeTCO(CallingConv::ID CC)
LLVM_READNONE constexpr bool isGraphics(CallingConv::ID CC)
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi)
Is this literal inlinable.
const RsrcIntrinsic * lookupRsrcIntrinsic(unsigned Intr)
const uint64_t FltRoundConversionTable
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ MaxID
The highest possible ID. Must be some 2^k - 1.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ PTRADD
PTRADD represents pointer arithmetic semantics, for targets that opt in using shouldPreservePtrArith(...
@ DELETED_NODE
DELETED_NODE - This is an illegal value that is used to catch errors.
Definition ISDOpcodes.h:45
@ SET_FPENV
Sets the current floating-point environment.
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:787
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:861
@ ATOMIC_LOAD_USUB_COND
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ GlobalAddress
Definition ISDOpcodes.h:88
@ ATOMIC_CMP_SWAP_WITH_SUCCESS
Val, Success, OUTCHAIN = ATOMIC_CMP_SWAP_WITH_SUCCESS(INCHAIN, ptr, cmp, swap) N.b.
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ ATOMIC_LOAD_USUB_SAT
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:796
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:983
@ CONVERGENCECTRL_GLUE
This does not correspond to any convergence control intrinsic.
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ READSTEADYCOUNTER
READSTEADYCOUNTER - This corresponds to the readsteadycounter intrinsic.
@ BR
Control flow instructions. These all have token chains.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:800
@ GET_ROUNDING
Returns current rounding mode: -1 = Undefined; 0 = Round to 0; 1 = Round to nearest, ties to even; 2 = Round to ...
Definition ISDOpcodes.h:978
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:819
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:896
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:813
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ ATOMIC_LOAD_UDEC_WRAP
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treats -0.0 as less than 0.0.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:795
@ ExternalSymbol
Definition ISDOpcodes.h:93
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:953
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:841
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ ATOMIC_LOAD_UINC_WRAP
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getDeclarationIfExists(const Module *M, ID id)
Look up the Function declaration of the intrinsic id in the Module M and return it if it exists.
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
LLVM_ABI AttributeList getAttributes(LLVMContext &C, ID id, FunctionType *FT)
Return the attributes for an intrinsic.
LLVM_ABI FunctionType * getType(LLVMContext &Context, ID id, ArrayRef< Type * > OverloadTys={})
Return the function type for an intrinsic.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
bool mi_match(Reg R, const MachineRegisterInfo &MRI, Pattern &&P)
GFCstOrSplatGFCstMatch m_GFCstOrSplat(std::optional< FPValueAndVReg > &FPValReg)
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
auto m_IntrinsicWOChain(const OpndPreds &...Opnds)
bool sd_match(SDNode *N, const SelectionDAG *DAG, Pattern &&P)
ConstantInt_match m_ConstInt()
Match any integer constants or splat of an integer constant.
Offsets
Offsets in bytes from the start of the input buffer.
@ System
Synchronized with respect to all concurrently executing threads.
Definition LLVMContext.h:58
initializer< Ty > init(const Ty &Val)
constexpr double inv_pi
@ User
could "use" a pointer
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
NodeAddr< NodeBase * > Node
Definition RDFGraph.h:381
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< SSAContext > UniformityInfo
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
@ Offset
Definition DWP.cpp:573
LLVM_ABI ISD::CondCode getICmpCondCode(ICmpInst::Predicate Pred)
getICmpCondCode - Return the ISD condition code corresponding to the given LLVM IR integer condition ...
Definition Analysis.cpp:237
LLVM_ABI void finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
finalizeBundle - Finalize a machine instruction bundle which includes a sequence of instructions star...
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
detail::zippy< detail::zip_first, T, U, Args... > zip_equal(T &&t, U &&u, Args &&...args)
zip iterator that assumes that all iteratees have the same length.
Definition STLExtras.h:840
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI std::pair< Value *, Value * > buildCmpXchgValue(IRBuilderBase &Builder, Value *Ptr, Value *Cmp, Value *Val, Align Alignment)
Emit IR to implement the given cmpxchg operation on values in registers, returning the new value.
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Undef
Value of the register doesn't matter.
@ Define
Register definition.
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Done
Definition Threading.h:60
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
constexpr int64_t minIntN(int64_t N)
Gets the minimum value for a N-bit signed integer.
Definition MathExtras.h:223
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr T alignDown(U Value, V Align, W Skew=0)
Returns the largest unsigned integer that is less than or equal to Value and congruent to Skew mod Align.
Definition MathExtras.h:546
MemoryEffectsBase< IRMemLocation > MemoryEffects
Summary of how a function affects memory in the program.
Definition ModRef.h:356
constexpr int popcount(T Value) noexcept
Count the number of set bits in a value.
Definition bit.h:156
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
constexpr bool isShiftedMask_64(uint64_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (64 bit ver...
Definition MathExtras.h:273
bool isReleaseOrStronger(AtomicOrdering AO)
constexpr T MinAlign(U A, V B)
A and B are either alignments or offsets.
Definition MathExtras.h:357
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
AtomicOrderingCABI
Atomic ordering for C11 / C++11's memory models.
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
bool isBoolSGPR(SDValue V)
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
LLVM_ABI ISD::CondCode getFCmpCondCode(FCmpInst::Predicate Pred)
getFCmpCondCode - Return the ISD condition code corresponding to the given LLVM IR floating-point con...
Definition Analysis.cpp:203
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
static const MachineMemOperand::Flags MOCooperative
Mark the MMO of cooperative load/store atomics.
Definition SIInstrInfo.h:52
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
LLVM_ABI Value * buildAtomicRMWValue(AtomicRMWInst::BinOp Op, IRBuilderBase &Builder, Value *Loaded, Value *Val)
Emit IR to implement the given atomicrmw operation on values in registers, returning the new value.
AtomicOrdering
Atomic ordering for LLVM's memory model.
constexpr T divideCeil(U Numerator, V Denominator)
Returns the integer ceil(Numerator / Denominator).
Definition MathExtras.h:394
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ AfterLegalizeVectorOps
Definition DAGCombine.h:18
@ AfterLegalizeTypes
Definition DAGCombine.h:17
@ Or
Bitwise or logical OR of integers.
@ Mul
Product of integers.
@ Add
Sum of integers.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr int64_t maxIntN(int64_t N)
Gets the maximum value for a N-bit signed integer.
Definition MathExtras.h:232
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI std::optional< ValueAndVReg > getIConstantVRegValWithLookThrough(Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs=true)
If VReg is defined by a statically evaluable chain of instructions rooted on a G_CONSTANT returns its...
Definition Utils.cpp:436
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
static const MachineMemOperand::Flags MOLastUse
Mark the MMO of a load as the last use.
Definition SIInstrInfo.h:48
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Next
Definition InstrProf.h:147
constexpr T maskTrailingOnes(unsigned N)
Create a bitmask with the N right-most bits set to 1, and all other bits set to 0.
Definition MathExtras.h:77
constexpr RegState getUndefRegState(bool B)
@ Custom
The result value requires a custom uniformity check.
Definition Uniformity.h:31
LLVM_ABI Printable printReg(Register Reg, const TargetRegisterInfo *TRI=nullptr, unsigned SubIdx=0, const MachineRegisterInfo *MRI=nullptr)
Prints virtual and physical registers with or without a TRI instance.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
int64_t DWordOffset
int64_t PermMask
std::tuple< const ArgDescriptor *, const TargetRegisterClass *, LLT > getPreloadedValue(PreloadedValue Value) const
static const AMDGPUFunctionArgInfo FixedABIFunctionInfo
static constexpr uint64_t encode(Fields... Values)
static std::tuple< typename Fields::ValueType... > decode(uint64_t Encoded)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static ArgDescriptor createStack(unsigned Offset, unsigned Mask=~0u)
MCRegister getRegister() const
static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask)
static ArgDescriptor createRegister(Register Reg, unsigned Mask=~0u)
Helper struct shared between Function Specialization and SCCP Solver.
Definition SCCPSolver.h:42
Represent subnormal handling kind for floating point instruction inputs and outputs.
@ Dynamic
Denormals have unknown treatment.
static constexpr DenormalMode getPreserveSign()
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:418
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:323
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
ElementCount getVectorElementCount() const
Definition ValueTypes.h:373
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:266
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition ValueTypes.h:501
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:435
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
bool bitsEq(EVT VT) const
Return true if this has the same number of bits as VT.
Definition ValueTypes.h:279
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type, which is chosen by the caller.
Definition ValueTypes.h:121
bool isVectorOf(EVT EltVT) const
Return true if this is a vector with matching element type.
Definition ValueTypes.h:181
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
unsigned getPointerAddrSpace() const
unsigned getByValSize() const
InputArg - This struct carries flags and type information about a single incoming (formal) argument or incoming (from the perspective of the caller) return value virtual register.
MVT VT
Legalized type of this argument part.
unsigned getOrigArgIndex() const
OutputArg - This struct carries flags and a value for a single outgoing (actual) argument or outgoing (from the perspective of the caller) return value virtual register.
static LLVM_ABI std::optional< bool > eq(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_EQ result.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:165
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
bool isNonZero() const
Returns true if this value is known to be non-zero.
Definition KnownBits.h:109
KnownBits extractBits(unsigned NumBits, unsigned BitPosition) const
Return a subset of the known bits from [bitPosition,bitPosition+numBits).
Definition KnownBits.h:239
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
static LLVM_ABI std::optional< bool > ule(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_ULE result.
static LLVM_ABI std::optional< bool > uge(const KnownBits &LHS, const KnownBits &RHS)
Determine if these known bits always give the same ICMP_UGE result.
bool isKnownNeverNaN() const
Return true if it's known this can never be a nan.
static LLVM_ABI KnownFPClass bitcast(const fltSemantics &FltSemantics, const KnownBits &Bits)
Report known values for a bitcast into a float with provided semantics.
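This class contains a discriminated union of information about pointers in memory operands, relating them back to LLVM IR or to virtual locations (such as frame indices) that are exposed during codegen.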
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoUnsignedWrap() const
bool hasAllowContract() const
bool hasNoSignedWrap() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
unsigned int NumVTs
DenormalMode FP64FP16Denormals
If this is set, neither input nor output denormals are flushed for both f64 and f16/v2f16 instructions.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
std::optional< unsigned > fallbackAddressSpace
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
SmallVector< ISD::OutputArg, 32 > Outs