LLVM 23.0.0git
AMDGPUISelLowering.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
51
55
57 // In order for this to be a signed 24-bit value, bit 23 must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather than generating calls to memset, memcpy or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
172 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::v2f32, MVT::i64);
173
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
176
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
179
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
182
184 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
185
187 AddPromotedToType(ISD::ATOMIC_STORE, MVT::v2f32, MVT::i64);
188
189 // There are no 64-bit extloads. These should be done as a 32-bit extload and
190 // an extension to 64-bit.
191 for (MVT VT : MVT::integer_valuetypes())
193 Expand);
194
195 for (MVT VT : MVT::integer_valuetypes()) {
196 if (VT == MVT::i64)
197 continue;
198
199 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
200 setLoadExtAction(Op, VT, MVT::i1, Promote);
201 setLoadExtAction(Op, VT, MVT::i8, Legal);
202 setLoadExtAction(Op, VT, MVT::i16, Legal);
203 setLoadExtAction(Op, VT, MVT::i32, Expand);
204 }
205 }
206
208 for (auto MemVT :
209 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
211 Expand);
212
213 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
227
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
234
235 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
241 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
242 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
243 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
244 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
245 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
246 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
247
249 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
250
252 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
253
255 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
256
258 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
259
261 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
262
264 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
265
267 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
268
270 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
271
273 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
274
276 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
277
279 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
280
282 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
283
285 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
286
288 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
289
291 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
292
294 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
295
297 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
298
300 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
301
303 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
304
306 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
307
309 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
310
312 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
313
315 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
316
318 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
319
321 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
322
324 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
325
327 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
328
329 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
330 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
331 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
332 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
333
334 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
335 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
336 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
337 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
338
339 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
340 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
341 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
342 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
343 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
344 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
345 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
346 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
347 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
348 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
349 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
350 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
351 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
352 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
353 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
354
355 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
356 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
357 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
358
359 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
360 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
361 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
362
363 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
364
365 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
366 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
367 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
368 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
369 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
370 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
371 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
372
373 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
374 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
375 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
376 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
377 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
378
379 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
380 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
381 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
382
383 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
384 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
385 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
386
387 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
388 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
389 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
390
391 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
392 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
393 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
394
395 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
396 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
397 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
398 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
399 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
400 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
401 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
402
403 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
404 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
405
407
408 // For R600, this is totally unsupported, just custom lower to produce an
409 // error.
411
412 // Library functions. These default to Expand, but we have instructions
413 // for them.
416 {MVT::f16, MVT::f32}, Legal);
418
420 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
422 {MVT::f16, MVT::f32, MVT::f64}, Expand);
423
426 Custom);
428
429 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
430
431 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
432
433 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
434 Expand);
435
436 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
437 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
439
441 Custom);
442
443 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
444
445 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
446 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
447 // default unless marked custom/legal.
449 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
450 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
451 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
452 MVT::v16f64},
453 Custom);
454
455 // Expand to fneg + fadd.
457
459 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
460 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
461 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
462 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
463 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
464 Custom);
465
468 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
469 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
470 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
471 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
472 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
473 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
474 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
475 Custom);
476
478 Expand);
479 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
480
481 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
482 for (MVT VT : ScalarIntVTs) {
483 // These should use [SU]DIVREM, so set them to expand
485 Expand);
486
487 // GPU does not have divrem function for signed or unsigned.
489
490 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
492
494
496 Expand);
497 }
498
499 // The hardware supports 32-bit FSHR, but not FSHL.
501
502 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
503
505
510 MVT::i64, Custom);
512
514 Legal);
515
518 MVT::i64, Custom);
519
520 for (auto VT : {MVT::i8, MVT::i16})
522
523 static const MVT::SimpleValueType VectorIntTypes[] = {
524 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
525 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
526
527 for (MVT VT : VectorIntTypes) {
528 // Expand the following operations for the current type by default.
529 // clang-format off
549 VT, Expand);
550 // clang-format on
551 }
552
553 static const MVT::SimpleValueType FloatVectorTypes[] = {
554 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
555 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
556
557 for (MVT VT : FloatVectorTypes) {
570 VT, Expand);
571 }
572
573 // This causes using an unrolled select operation rather than expansion with
574 // bit operations. This is in general better, but the alternative using BFI
575 // instructions may be better if the select sources are SGPRs.
577 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
578
580 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
581
583 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
584
586 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
587
589 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
590
592 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
593
595 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
596
598 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
599
601 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
602
604 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
605
607 setJumpIsExpensive(true);
608
611
613
614 // We want to find all load dependencies for long chains of stores to enable
615 // merging into very wide vectors. The problem is with vectors with > 4
616 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
617 // vectors are a legal type, even though we have to split the loads
618 // usually. When we can more precisely specify load legality per address
619 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
620 // smarter so that they can figure out what to do in 2 iterations without all
621 // N > 4 stores on the same chain.
623
624 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
625 // about these during lowering.
626 MaxStoresPerMemcpy = 0xffffffff;
627 MaxStoresPerMemmove = 0xffffffff;
628 MaxStoresPerMemset = 0xffffffff;
629
630 // The expansion for 64-bit division is enormous.
632 addBypassSlowDiv(64, 32);
633
644
648}
649
651 const auto Flags = Op.getNode()->getFlags();
652 if (Flags.hasNoSignedZeros())
653 return true;
654
655 return false;
656}
657
658//===----------------------------------------------------------------------===//
659// Target Information
660//===----------------------------------------------------------------------===//
661
663static bool fnegFoldsIntoOpcode(unsigned Opc) {
664 switch (Opc) {
665 case ISD::FADD:
666 case ISD::FSUB:
667 case ISD::FMUL:
668 case ISD::FMA:
669 case ISD::FMAD:
670 case ISD::FMINNUM:
671 case ISD::FMAXNUM:
674 case ISD::FMINIMUM:
675 case ISD::FMAXIMUM:
676 case ISD::FMINIMUMNUM:
677 case ISD::FMAXIMUMNUM:
678 case ISD::SELECT:
679 case ISD::FSIN:
680 case ISD::FTRUNC:
681 case ISD::FRINT:
682 case ISD::FNEARBYINT:
683 case ISD::FROUNDEVEN:
685 case AMDGPUISD::RCP:
686 case AMDGPUISD::RCP_LEGACY:
687 case AMDGPUISD::RCP_IFLAG:
688 case AMDGPUISD::SIN_HW:
689 case AMDGPUISD::FMUL_LEGACY:
690 case AMDGPUISD::FMIN_LEGACY:
691 case AMDGPUISD::FMAX_LEGACY:
692 case AMDGPUISD::FMED3:
693 // TODO: handle llvm.amdgcn.fma.legacy
694 return true;
695 case ISD::BITCAST:
696 llvm_unreachable("bitcast is special cased");
697 default:
698 return false;
699 }
700}
701
702static bool fnegFoldsIntoOp(const SDNode *N) {
703 unsigned Opc = N->getOpcode();
704 if (Opc == ISD::BITCAST) {
705 // TODO: Is there a benefit to checking the conditions performFNegCombine
706 // does? We don't for the other cases.
707 SDValue BCSrc = N->getOperand(0);
708 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
709 return BCSrc.getNumOperands() == 2 &&
710 BCSrc.getOperand(1).getValueSizeInBits() == 32;
711 }
712
713 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
714 }
715
716 return fnegFoldsIntoOpcode(Opc);
717}
718
719/// \p returns true if the operation will definitely need to use a 64-bit
720/// encoding, and thus will use a VOP3 encoding regardless of the source
721/// modifiers.
723static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
724 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
725 VT == MVT::f64;
726}
727
728/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
729/// type for ISD::SELECT.
731static bool selectSupportsSourceMods(const SDNode *N) {
732 // TODO: Only applies if select will be vector
733 return N->getValueType(0) == MVT::f32;
734}
735
736// Most FP instructions support source modifiers, but this could be refined
737// slightly.
739static bool hasSourceMods(const SDNode *N) {
740 if (isa<MemSDNode>(N))
741 return false;
742
743 switch (N->getOpcode()) {
744 case ISD::CopyToReg:
745 case ISD::FDIV:
746 case ISD::FREM:
747 case ISD::INLINEASM:
749 case AMDGPUISD::DIV_SCALE:
751
752 // TODO: Should really be looking at the users of the bitcast. These are
753 // problematic because bitcasts are used to legalize all stores to integer
754 // types.
755 case ISD::BITCAST:
756 return false;
758 switch (N->getConstantOperandVal(0)) {
759 case Intrinsic::amdgcn_interp_p1:
760 case Intrinsic::amdgcn_interp_p2:
761 case Intrinsic::amdgcn_interp_mov:
762 case Intrinsic::amdgcn_interp_p1_f16:
763 case Intrinsic::amdgcn_interp_p2_f16:
764 return false;
765 default:
766 return true;
767 }
768 }
769 case ISD::SELECT:
771 default:
772 return true;
773 }
774}
775
777 unsigned CostThreshold) {
778 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
779 // it is truly free to use a source modifier in all cases. If there are
780 // multiple users but for each one will necessitate using VOP3, there will be
781 // a code size increase. Try to avoid increasing code size unless we know it
782 // will save on the instruction count.
783 unsigned NumMayIncreaseSize = 0;
784 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
785
786 assert(!N->use_empty());
787
788 // XXX - Should this limit number of uses to check?
789 for (const SDNode *U : N->users()) {
790 if (!hasSourceMods(U))
791 return false;
792
793 if (!opMustUseVOP3Encoding(U, VT)) {
794 if (++NumMayIncreaseSize > CostThreshold)
795 return false;
796 }
797 }
798
799 return true;
800}
801
803 ISD::NodeType ExtendKind) const {
804 assert(!VT.isVector() && "only scalar expected");
805
806 // Round to the next multiple of 32-bits.
807 unsigned Size = VT.getSizeInBits();
808 if (Size <= 32)
809 return MVT::i32;
810 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
811}
812
814 return 32;
815}
816
818 return true;
819}
820
821// The backend supports 32 and 64 bit floating point immediates.
822// FIXME: Why are we reporting vectors of FP immediates as legal?
824 bool ForCodeSize) const {
825 return isTypeLegal(VT.getScalarType());
826}
827
828// We don't want to shrink f64 / f32 constants.
830 EVT ScalarVT = VT.getScalarType();
831 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
832}
833
835 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
836 std::optional<unsigned> ByteOffset) const {
837 // TODO: This may be worth removing. Check regression tests for diffs.
838 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
839 return false;
840
841 unsigned NewSize = NewVT.getStoreSizeInBits();
842
843 // If we are reducing to a 32-bit load or a smaller multi-dword load,
844 // this is always better.
845 if (NewSize >= 32)
846 return true;
847
848 EVT OldVT = N->getValueType(0);
849 unsigned OldSize = OldVT.getStoreSizeInBits();
850
852 unsigned AS = MN->getAddressSpace();
853 // Do not shrink an aligned scalar load to sub-dword.
854 // Scalar engine cannot do sub-dword loads.
855 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
856 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
860 MN->isInvariant())) &&
862 return false;
863
864 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
865 // extloads, so doing one requires using a buffer_load. In cases where we
866 // still couldn't use a scalar load, using the wider load shouldn't really
867 // hurt anything.
868
869 // If the old size already had to be an extload, there's no harm in continuing
870 // to reduce the width.
871 return (OldSize < 32);
872}
873
875 const SelectionDAG &DAG,
876 const MachineMemOperand &MMO) const {
877
878 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
879
880 if (LoadTy.getScalarType() == MVT::i32)
881 return false;
882
883 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
884 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
885
886 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
887 return false;
888
889 unsigned Fast = 0;
891 CastTy, MMO, &Fast) &&
892 Fast;
893}
894
895// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
896// profitable with the expansion for 64-bit since it's generally good to
897// speculate things.
899 return true;
900}
901
903 return true;
904}
905
907 switch (N->getOpcode()) {
908 case ISD::EntryToken:
909 case ISD::TokenFactor:
910 return true;
912 unsigned IntrID = N->getConstantOperandVal(0);
914 }
916 unsigned IntrID = N->getConstantOperandVal(1);
918 }
919 case ISD::LOAD:
920 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
922 return true;
923 return false;
924 case AMDGPUISD::SETCC: // ballot-style instruction
925 return true;
926 }
927 return false;
928}
929
931 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
932 NegatibleCost &Cost, unsigned Depth) const {
933
934 switch (Op.getOpcode()) {
935 case ISD::FMA:
936 case ISD::FMAD: {
937 // Negating a fma is not free if it has users without source mods.
938 if (!allUsesHaveSourceMods(Op.getNode()))
939 return SDValue();
940 break;
941 }
942 case AMDGPUISD::RCP: {
943 SDValue Src = Op.getOperand(0);
944 EVT VT = Op.getValueType();
945 SDLoc SL(Op);
946
947 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
948 ForCodeSize, Cost, Depth + 1);
949 if (NegSrc)
950 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
951 return SDValue();
952 }
953 default:
954 break;
955 }
956
957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
958 ForCodeSize, Cost, Depth);
959}
960
961//===---------------------------------------------------------------------===//
962// Target Properties
963//===---------------------------------------------------------------------===//
964
967
968 // Packed operations do not have a fabs modifier.
969 // Report this based on the end legalized type.
970 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
971}
972
975 // Report this based on the end legalized type.
976 VT = VT.getScalarType();
977 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
978}
979
981 unsigned NumElem,
982 unsigned AS) const {
983 return true;
984}
985
987 // There are few operations which truly have vector input operands. Any vector
988 // operation is going to involve operations on each component, and a
989 // build_vector will be a copy per element, so it always makes sense to use a
990 // build_vector input in place of the extracted element to avoid a copy into a
991 // super register.
992 //
993 // We should probably only do this if all users are extracts only, but this
994 // should be the common case.
995 return true;
996}
997
999 // Truncate is just accessing a subregister.
1000
1001 unsigned SrcSize = Source.getSizeInBits();
1002 unsigned DestSize = Dest.getSizeInBits();
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0 ;
1005}
1006
1008 // Truncate is just accessing a subregister.
1009
1010 unsigned SrcSize = Source->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (DestSize== 16 && Subtarget->has16BitInsts())
1014 return SrcSize >= 32;
1015
1016 return DestSize < SrcSize && DestSize % 32 == 0;
1017}
1018
1020 unsigned SrcSize = Src->getScalarSizeInBits();
1021 unsigned DestSize = Dest->getScalarSizeInBits();
1022
1023 if (SrcSize == 16 && Subtarget->has16BitInsts())
1024 return DestSize >= 32;
1025
1026 return SrcSize == 32 && DestSize == 64;
1027}
1028
1030 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1031 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1032 // this will enable reducing 64-bit operations to 32-bit, which is always
1033 // good.
1034
1035 if (Src == MVT::i16)
1036 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1037
1038 return Src == MVT::i32 && Dest == MVT::i64;
1039}
1040
1042 EVT DestVT) const {
1043 switch (N->getOpcode()) {
1044 case ISD::ABS:
1045 case ISD::ADD:
1046 case ISD::SUB:
1047 case ISD::SHL:
1048 case ISD::SRL:
1049 case ISD::SRA:
1050 case ISD::AND:
1051 case ISD::OR:
1052 case ISD::XOR:
1053 case ISD::MUL:
1054 case ISD::SETCC:
1055 case ISD::SELECT:
1056 case ISD::SMIN:
1057 case ISD::SMAX:
1058 case ISD::UMIN:
1059 case ISD::UMAX:
1060 case ISD::USUBSAT:
1061 if (isTypeLegal(MVT::i16) &&
1062 (!DestVT.isVector() ||
1063 !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P
1064 // Don't narrow back down to i16 if promoted to i32 already.
1065 if (!N->isDivergent() && DestVT.isInteger() &&
1066 DestVT.getScalarSizeInBits() > 1 &&
1067 DestVT.getScalarSizeInBits() <= 16 &&
1068 SrcVT.getScalarSizeInBits() > 16) {
1069 return false;
1070 }
1071 }
1072 return true;
1073 default:
1074 break;
1075 }
1076
1077 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1078 // limited number of native 64-bit operations. Shrinking an operation to fit
1079 // in a single 32-bit register should always be helpful. As currently used,
1080 // this is much less general than the name suggests, and is only used in
1081 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1082 // not profitable, and may actually be harmful.
1083 if (isa<LoadSDNode>(N))
1084 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1085
1086 return true;
1087}
1088
1090 const SDNode* N, CombineLevel Level) const {
1091 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1092 N->getOpcode() == ISD::SRL) &&
1093 "Expected shift op");
1094
1095 SDValue ShiftLHS = N->getOperand(0);
1096 if (!ShiftLHS->hasOneUse())
1097 return false;
1098
1099 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1100 !ShiftLHS.getOperand(0)->hasOneUse())
1101 return false;
1102
1103 // Always commute pre-type legalization and right shifts.
1104 // We're looking for shl(or(x,y),z) patterns.
1106 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1107 return true;
1108
1109 // If the only user is an i32 right-shift, then don't destroy a BFE pattern.
1110 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1111 (N->user_begin()->getOpcode() == ISD::SRA ||
1112 N->user_begin()->getOpcode() == ISD::SRL))
1113 return false;
1114
1115 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1116 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1117 if (LHS.getOpcode() != ISD::SHL)
1118 return false;
1119 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1120 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1121 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1122 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1123 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1124 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1125 };
1126 SDValue LHS = N->getOperand(0).getOperand(0);
1127 SDValue RHS = N->getOperand(0).getOperand(1);
1128 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1129}
1130
1131//===---------------------------------------------------------------------===//
1132// TargetLowering Callbacks
1133//===---------------------------------------------------------------------===//
1134
1136 bool IsVarArg) {
// Picks the argument-assignment function for a call based on the calling
// convention CC. A convention with no matching case is rejected with a fatal
// usage error.
1137 switch (CC) {
1145 return CC_AMDGPU;
1148 return CC_AMDGPU_CS_CHAIN;
// The C, Fast and Cold conventions all share the function convention.
1149 case CallingConv::C:
1150 case CallingConv::Fast:
1151 case CallingConv::Cold:
1152 return CC_AMDGPU_Func;
1155 return CC_SI_Gfx;
1158 default:
1159 reportFatalUsageError("unsupported calling convention for call");
1160 }
1161}
1162
1164 bool IsVarArg) {
// Picks the return-value assignment function based on the calling convention
// CC. Kernels must never reach this function; a convention with no matching
// case is rejected with a fatal usage error.
1165 switch (CC) {
1168 llvm_unreachable("kernels should not be handled here");
1178 return RetCC_SI_Shader;
1181 return RetCC_SI_Gfx;
// The C, Fast and Cold conventions all share the function return convention.
1182 case CallingConv::C:
1183 case CallingConv::Fast:
1184 case CallingConv::Cold:
1185 return RetCC_AMDGPU_Func;
1186 default:
1187 reportFatalUsageError("unsupported calling convention");
1188 }
1189}
1190
1191/// The SelectionDAGBuilder will automatically promote function arguments
1192/// with illegal types. However, this does not work for the AMDGPU targets
1193/// since the function arguments are stored in memory as these illegal types.
1194/// In order to handle this properly we need to get the original types sizes
1195/// from the LLVM IR Function and fix up the ISD::InputArg values before
1196/// passing them to AnalyzeFormalArguments()
1197
1198/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1199/// input values across multiple registers. Each item in the Ins array
1200/// represents a single value that will be stored in registers. Ins[x].VT is
1201/// the value type of the value that will be stored in the register, so
1202/// whatever SDNode we lower the argument to needs to be this type.
1203///
1204/// In order to correctly lower the arguments we need to know the size of each
1205/// argument. Since Ins[x].VT gives us the size of the register that will
1206/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1207/// for the original function argument so that we can deduce the correct memory
1208/// type to use for Ins[x]. In most cases the correct memory type will be
1209/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1210/// we have a kernel argument of type v8i8, this argument will be split into
1211/// 8 parts and each part will be represented by its own item in the Ins array.
1212/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1213/// the argument before it was split. From this, we deduce that the memory type
1214/// for each individual part is i8. We pass the memory type as LocVT to the
1215/// calling convention analysis function and the register type (Ins[x].VT) as
1216/// the ValVT.
1218 CCState &State,
1219 const SmallVectorImpl<ISD::InputArg> &Ins) const {
// Walks the IR arguments of the current function, recomputes the in-memory
// offset and memory type of every register-sized part, and records each part
// in State as a custom memory location. Locations are numbered with a running
// index (InIndex) in the order they are produced.
1220 const MachineFunction &MF = State.getMachineFunction();
1221 const Function &Fn = MF.getFunction();
1222 LLVMContext &Ctx = Fn.getContext();
1223 const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
1225
1226 Align MaxAlign = Align(1);
1227 uint64_t ExplicitArgOffset = 0;
1228 const DataLayout &DL = Fn.getDataLayout();
1229
1230 unsigned InIndex = 0;
1231
1232 for (const Argument &Arg : Fn.args()) {
// For byref arguments the pointee type and the parameter alignment describe
// the memory layout; otherwise the argument's own type is used.
1233 const bool IsByRef = Arg.hasByRefAttr();
1234 Type *BaseArgTy = Arg.getType();
1235 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1236 Align Alignment = DL.getValueOrABITypeAlignment(
1237 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1238 MaxAlign = std::max(Alignment, MaxAlign);
1239 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1240
// ArgOffset is the aligned running offset plus ExplicitOffset; the running
// offset then advances past this argument's allocation size.
1241 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1242 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1243
1244 // We're basically throwing away everything passed into us and starting over
1245 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1246 // to us as computed in Ins.
1247 //
1248 // We also need to figure out what type legalization is trying to do to get
1249 // the correct memory offsets.
1250
1251 SmallVector<EVT, 16> ValueVTs;
1253 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1254 &Offsets, ArgOffset);
1255
1256 for (unsigned Value = 0, NumValues = ValueVTs.size();
1257 Value != NumValues; ++Value) {
1258 uint64_t BasePartOffset = Offsets[Value];
1259
1260 EVT ArgVT = ValueVTs[Value];
1261 EVT MemVT = ArgVT;
1262 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1263 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1264
// Deduce the per-register memory type (MemVT) from how ArgVT maps onto
// NumRegs registers of type RegisterVT.
1265 if (NumRegs == 1) {
1266 // This argument is not split, so the IR type is the memory type.
1267 if (ArgVT.isExtended()) {
1268 // We have an extended type, like i24, so we should just use the
1269 // register type.
1270 MemVT = RegisterVT;
1271 } else {
1272 MemVT = ArgVT;
1273 }
1274 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1275 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1276 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1277 // We have a vector value which has been split into a vector with
1278 // the same scalar type, but fewer elements. This should handle
1279 // all the floating-point vector types.
1280 MemVT = RegisterVT;
1281 } else if (ArgVT.isVector() &&
1282 ArgVT.getVectorNumElements() == NumRegs) {
1283 // This arg has been split so that each element is stored in a separate
1284 // register.
1285 MemVT = ArgVT.getScalarType();
1286 } else if (ArgVT.isExtended()) {
1287 // We have an extended type, like i65.
1288 MemVT = RegisterVT;
1289 } else {
// Remaining case: divide the store size evenly across the registers.
1290 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1291 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1292 if (RegisterVT.isInteger()) {
1293 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1294 } else if (RegisterVT.isVector()) {
1295 assert(!RegisterVT.getScalarType().isFloatingPoint());
1296 unsigned NumElements = RegisterVT.getVectorNumElements();
1297 assert(MemoryBits % NumElements == 0);
1298 // This vector type has been split into another vector type with
1299 // a different elements size.
1300 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1301 MemoryBits / NumElements);
1302 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1303 } else {
1304 llvm_unreachable("cannot deduce memory type.");
1305 }
1306 }
1307
1308 // Convert one element vectors to scalar.
1309 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1310 MemVT = MemVT.getScalarType();
1311
1312 // Round up vec3/vec5 argument.
1313 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1314 MemVT = MemVT.getPow2VectorType(State.getContext());
1315 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1316 MemVT = MemVT.getRoundIntegerType(State.getContext());
1317 }
1318
// Emit one location per register; consecutive parts are spaced by MemVT's
// store size starting at BasePartOffset.
1319 unsigned PartOffset = 0;
1320 for (unsigned i = 0; i != NumRegs; ++i) {
1321 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1322 BasePartOffset + PartOffset,
1323 MemVT.getSimpleVT(),
1325 PartOffset += MemVT.getStoreSize();
1326 }
1327 }
1328 }
1329}
1330
1332 SDValue Chain, CallingConv::ID CallConv,
1333 bool isVarArg,
1335 const SmallVectorImpl<SDValue> &OutVals,
1336 const SDLoc &DL, SelectionDAG &DAG) const {
// Lowers a return to an AMDGPUISD::ENDPGM node on the incoming chain. The
// outgoing values are not used here.
1337 // FIXME: Fails for r600 tests
1338 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1339 // "wave terminate should not have return values");
1340 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1341}
1342
1343//===---------------------------------------------------------------------===//
1344// Target specific lowering
1345//===---------------------------------------------------------------------===//
1346
1347/// Selects the correct CCAssignFn for a given CallingConvention value.
1352
1357
1359 SelectionDAG &DAG,
1360 MachineFrameInfo &MFI,
1361 int ClobberedFI) const {
// Returns a TokenFactor joining Chain with the output chains of every load
// that (a) is a user of the DAG entry node, (b) reads from a negative-index
// frame object, and (c) overlaps the byte range of frame object ClobberedFI.
1362 SmallVector<SDValue, 8> ArgChains;
// Inclusive byte range [FirstByte, LastByte] of the clobbered object.
1363 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1364 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1365
1366 // Include the original chain at the beginning of the list. When this is
1367 // used by target LowerCall hooks, this helps legalize find the
1368 // CALLSEQ_BEGIN node.
1369 ArgChains.push_back(Chain);
1370
1371 // Add a chain value for each stack argument load that overlaps the
// clobbered object.
1372 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1373 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1374 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1375 if (FI->getIndex() < 0) {
1376 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1377 int64_t InLastByte = InFirstByte;
1378 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1379
// The ranges overlap when either one's first byte lies inside the other.
1380 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1381 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1382 ArgChains.push_back(SDValue(L, 1));
1383 }
1384 }
1385 }
1386 }
1387
1388 // Build a tokenfactor for all the chains.
1389 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1390}
1391
1394 StringRef Reason) const {
// Emits an "unsupported" diagnostic whose message is Reason followed by the
// callee name, supplies poison values for the call results, and returns a
// chain so that lowering can continue.
1395 SDValue Callee = CLI.Callee;
1396 SelectionDAG &DAG = CLI.DAG;
1397
1398 const Function &Fn = DAG.getMachineFunction().getFunction();
1399
// Fallback name used when the callee is neither of the node kinds below.
1400 StringRef FuncName("<unknown>");
1401
1403 FuncName = G->getSymbol();
1404 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1405 FuncName = G->getGlobal()->getName();
1406
1407 DAG.getContext()->diagnose(
1408 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1409
// For non-tail calls, provide one poison value per expected result.
1410 if (!CLI.IsTailCall) {
1411 for (ISD::InputArg &Arg : CLI.Ins)
1412 InVals.push_back(DAG.getPOISON(Arg.VT));
1413 }
1414
1415 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1416 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1417 return CLI.Chain;
1418
// Otherwise wrap the chain in an empty CALLSEQ_START / CALLSEQ_END pair.
1419 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1420 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1421}
1422
1424 SmallVectorImpl<SDValue> &InVals) const {
// Every call reaching this function is treated as unsupported and is routed
// to lowerUnhandledCall with a fixed message prefix.
1425 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1426}
1427
1429 SelectionDAG &DAG) const {
// Reports "unsupported dynamic alloca" and returns placeholder results so
// that lowering can continue.
1430 const Function &Fn = DAG.getMachineFunction().getFunction();
1431
1433 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
// Results: a zero constant of the node's value type, then operand 0 passed
// through unchanged.
1434 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1435 return DAG.getMergeValues(Ops, SDLoc());
1436}
1437
1439 SelectionDAG &DAG) const {
// Dispatches custom lowering of Op to the matching helper by opcode. An
// opcode with no case prints the node to errs() and hits llvm_unreachable.
1440 switch (Op.getOpcode()) {
1441 default:
1442 Op->print(errs(), &DAG);
1443 llvm_unreachable("Custom lowering code for this "
1444 "instruction is not implemented yet!");
1445 break;
1447 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1449 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1450 case ISD::SDIVREM:
1451 return LowerSDIVREM(Op, DAG);
1452 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1453 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1454 case ISD::FRINT: return LowerFRINT(Op, DAG);
1455 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1456 case ISD::FROUNDEVEN:
1457 return LowerFROUNDEVEN(Op, DAG);
1458 case ISD::FROUND: return LowerFROUND(Op, DAG);
1459 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1460 case ISD::FLOG2:
1461 return LowerFLOG2(Op, DAG);
// FLOG and FLOG10 share one lowering routine.
1462 case ISD::FLOG:
1463 case ISD::FLOG10:
1464 return LowerFLOGCommon(Op, DAG);
// FEXP and FEXP10 share one lowering routine.
1465 case ISD::FEXP:
1466 case ISD::FEXP10:
1467 return lowerFEXP(Op, DAG);
1468 case ISD::FEXP2:
1469 return lowerFEXP2(Op, DAG);
1470 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1471 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1472 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1473 case ISD::FP_TO_SINT:
1474 case ISD::FP_TO_UINT:
1475 return LowerFP_TO_INT(Op, DAG);
1478 return LowerFP_TO_INT_SAT(Op, DAG);
1479 case ISD::CTTZ:
1481 case ISD::CTLZ:
1483 return LowerCTLZ_CTTZ(Op, DAG);
1484 case ISD::CTLS:
1485 return LowerCTLS(Op, DAG);
1487 }
1488 return Op;
1489}
1490
1493 SelectionDAG &DAG) const {
// Produces replacement results for node N by opcode. A lowered value is
// appended to Results only when the helper returns a non-null SDValue;
// otherwise Results is left untouched.
1494 switch (N->getOpcode()) {
1496 // Different parts of legalization seem to interpret which type of
1497 // sign_extend_inreg is the one to check for custom lowering. The extended
1498 // from type is what really matters, but some places check for custom
1499 // lowering of the result type. This results in trying to use
1500 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1501 // nothing here and let the illegal result integer be handled normally.
1502 return;
1503 case ISD::FLOG2:
1504 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1505 Results.push_back(Lowered);
1506 return;
1507 case ISD::FLOG:
1508 case ISD::FLOG10:
1509 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1510 Results.push_back(Lowered);
1511 return;
1512 case ISD::FEXP2:
1513 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1514 Results.push_back(Lowered);
1515 return;
1516 case ISD::FEXP:
1517 case ISD::FEXP10:
1518 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1519 Results.push_back(Lowered);
1520 return;
1521 case ISD::CTLZ:
1523 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1524 Results.push_back(Lowered);
1525 return;
// Any other opcode produces no replacement.
1526 default:
1527 return;
1528 }
1529}
1530
1532 SelectionDAG &DAG) const {
// Rewrites the block-address node BA as a target block address that keeps the
// same block address, offset and target flags, typed as Op's value type.
1534 SDLoc SL(Op);
1535 EVT VT = Op.getValueType();
1536 return DAG.getTargetBlockAddress(BA->getBlockAddress(), VT, BA->getOffset(),
1537 BA->getTargetFlags());
1538}
1539
1541 SDValue Op,
1542 SelectionDAG &DAG) const {
// Lowers a global address node to a constant when an address is available:
// either an address that is already assigned (non-entry functions) or a
// freshly allocated LDS offset (LOCAL/REGION address spaces). Returns a null
// SDValue when neither path applies.
1543
1544 const DataLayout &DL = DAG.getDataLayout();
1546 const GlobalValue *GV = G->getGlobal();
1547
1548 if (!MFI->isModuleEntryFunction()) {
1549 bool IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1550 std::optional<uint32_t> Address =
1552 if (!Address && IsNamedBarrier)
1553 llvm_unreachable("named barrier should have an assigned address");
1554 if (Address) {
1555 if (IsNamedBarrier) {
// The barrier count is the global's size divided by 16.
1556 unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
1557 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1558 }
1559 // A constant byte offset (e.g. from a GEP into an array of named
1560 // barriers) folds directly into the fixed LDS address.
1561 return DAG.getConstant(*Address + G->getOffset(), SDLoc(Op),
1562 Op.getValueType());
1563 }
1564 }
1565
1566 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1567 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1568 if (!MFI->isModuleEntryFunction() &&
1569 GV->getName() != "llvm.amdgcn.module.lds" &&
// NOTE: this SDLoc named DL shadows the DataLayout DL for the rest of this
// inner scope only.
1571 SDLoc DL(Op);
1572 const Function &Fn = DAG.getMachineFunction().getFunction();
1574 Fn, "local memory global used by non-kernel function",
1575 DL.getDebugLoc(), DS_Warning));
1576
1577 // We currently don't have a way to correctly allocate LDS objects that
1578 // aren't directly associated with a kernel. We do force inlining of
1579 // functions that use local objects. However, if these dead functions are
1580 // not eliminated, we don't want a compile time error. Just emit a warning
1581 // and a trap, since there should be no callable path here.
1582 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1583 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1584 Trap, DAG.getRoot());
1585 DAG.setRoot(OutputChain);
1586 return DAG.getPOISON(Op.getValueType());
1587 }
1588
1589 // TODO: We could emit code to handle the initialization somewhere.
1590 // We ignore the initializer for now and legalize it to allow selection.
1591 // The initializer will anyway get errored out during assembly emission.
// DL here is the DataLayout declared at the top of the function.
1592 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1593 // A constant byte offset (e.g. from a GEP into an array of named barriers)
1594 // folds directly into the allocated LDS address.
1595 return DAG.getConstant(Offset + G->getOffset(), SDLoc(Op),
1596 Op.getValueType());
1597 }
1598 return SDValue();
1599}
1600
1602 SelectionDAG &DAG) const {
// Lowers a vector concatenation to a build_vector of the operands' pieces,
// collected in Args.
1604 SDLoc SL(Op);
1605
1606 EVT VT = Op.getValueType();
// Sub-32-bit elements: when operand 0's total size is a non-zero multiple of
// 32 bits, reinterpret every operand as i32 (or a vector of i32), gather the
// i32 pieces, and bitcast the combined i32 vector back to VT.
1607 if (VT.getVectorElementType().getSizeInBits() < 32) {
1608 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1609 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1610 unsigned NewNumElt = OpBitSize / 32;
1611 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1613 MVT::i32, NewNumElt);
1614 for (const SDUse &U : Op->ops()) {
1615 SDValue In = U.get();
1616 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1617 if (NewNumElt > 1)
1618 DAG.ExtractVectorElements(NewIn, Args);
1619 else
1620 Args.push_back(NewIn);
1621 }
1622
1623 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1624 NewNumElt * Op.getNumOperands());
1625 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1626 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1627 }
1628 }
1629
// General case: extract the elements of every operand and rebuild.
1630 for (const SDUse &U : Op->ops())
1631 DAG.ExtractVectorElements(U.get(), Args);
1632
1633 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1634}
1635
1637 SelectionDAG &DAG) const {
// Lowers a subvector extraction to a build_vector of extracted elements.
// Operand 1 is the constant start index into the source vector (operand 0).
1638 SDLoc SL(Op);
1640 unsigned Start = Op.getConstantOperandVal(1);
1641 EVT VT = Op.getValueType();
1642 EVT SrcVT = Op.getOperand(0).getValueType();
1643
// 16-bit elements with an even start index: view the source as i32 elements
// so that two 16-bit elements are moved per extracted element.
1644 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1645 unsigned NumElt = VT.getVectorNumElements();
1646 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1647 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1648
1649 // Extract 32-bit registers at a time.
1650 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1651 EVT NewVT = NumElt == 2
1652 ? MVT::i32
1653 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1654 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1655
1656 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
// A two-element result is a single i32, so no build_vector is needed.
1657 if (NumElt == 2)
1658 Tmp = Args[0];
1659 else
1660 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1661
1662 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1663 }
1664
1665 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1667
1668 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1669}
1670
// Returns the operand of Val when Val is an FNEG node; otherwise returns Val
// unchanged.
1671// TODO: Handle fabs too
1673 if (Val.getOpcode() == ISD::FNEG)
1674 return Val.getOperand(0);
1675
1676 return Val;
1677}
1678
// Looks through at most one FNEG, then one FABS, then one FCOPYSIGN (checked
// in that order, each taking operand 0) and returns the value reached.
1680 if (Val.getOpcode() == ISD::FNEG)
1681 Val = Val.getOperand(0);
1682 if (Val.getOpcode() == ISD::FABS)
1683 Val = Val.getOperand(0);
1684 if (Val.getOpcode() == ISD::FCOPYSIGN)
1685 Val = Val.getOperand(0);
1686 return Val;
1687}
1688
1690 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1691 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
// Turns a compare-and-select into an FMIN_LEGACY / FMAX_LEGACY node when the
// condition code is an ordering comparison. The operand order of the new node
// depends on the condition code and on whether LHS is the "true" value.
// Returns a null SDValue when no node is formed.
1692 SelectionDAG &DAG = DCI.DAG;
1693 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1694 switch (CCOpcode) {
// Equality, constant and ordered/unordered tests are not handled.
1695 case ISD::SETOEQ:
1696 case ISD::SETONE:
1697 case ISD::SETUNE:
1698 case ISD::SETNE:
1699 case ISD::SETUEQ:
1700 case ISD::SETEQ:
1701 case ISD::SETFALSE:
1702 case ISD::SETFALSE2:
1703 case ISD::SETTRUE:
1704 case ISD::SETTRUE2:
1705 case ISD::SETUO:
1706 case ISD::SETO:
1707 break;
// Unordered less-than / less-or-equal.
1708 case ISD::SETULE:
1709 case ISD::SETULT: {
1710 if (LHS == True)
1711 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1712 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1713 }
1714 case ISD::SETOLE:
1715 case ISD::SETOLT:
1716 case ISD::SETLE:
1717 case ISD::SETLT: {
1718 // Ordered. Assume ordered for undefined.
1719
1720 // Only do this after legalization to avoid interfering with other combines
1721 // which might occur.
1723 !DCI.isCalledByLegalizer())
1724 return SDValue();
1725
1726 // We need to permute the operands to get the correct NaN behavior. The
1727 // selected operand is the second one based on the failing compare with NaN,
1728 // so permute it based on the compare type the hardware uses.
1729 if (LHS == True)
1730 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1731 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1732 }
// Unordered greater-than / greater-or-equal.
1733 case ISD::SETUGE:
1734 case ISD::SETUGT: {
1735 if (LHS == True)
1736 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1737 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1738 }
// Ordered (or unspecified) greater-than / greater-or-equal.
1739 case ISD::SETGT:
1740 case ISD::SETGE:
1741 case ISD::SETOGE:
1742 case ISD::SETOGT: {
1744 !DCI.isCalledByLegalizer())
1745 return SDValue();
1746
1747 if (LHS == True)
1748 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1749 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1750 }
1751 case ISD::SETCC_INVALID:
1752 llvm_unreachable("Invalid setcc condcode!");
1753 }
1754 return SDValue();
1755}
1756
1757/// Generate Min/Max node
/// Matches directly when the compared values are exactly the selected values
/// (in either order). Otherwise tries one fneg-folded form and wraps the
/// result in FNEG. Returns a null SDValue when nothing matches.
1759 SDValue LHS, SDValue RHS,
1760 SDValue True, SDValue False,
1761 SDValue CC,
1762 DAGCombinerInfo &DCI) const {
1763 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1764 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1765
1766 SelectionDAG &DAG = DCI.DAG;
1767
1768 // If we can't directly match this, try to see if we can fold an fneg to
1769 // match.
1770
1773 SDValue NegTrue = peekFNeg(True);
1774
1775 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1776 // fmin/fmax.
1777 //
1778 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1779 // -> fneg (fmin_legacy lhs, K)
1780 //
1781 // TODO: Use getNegatedExpression
1782 if (LHS == NegTrue && CFalse && CRHS) {
// Only applies when the false constant is exactly the negation of the
// compared constant.
1783 APFloat NegRHS = neg(CRHS->getValueAPF());
1784 if (NegRHS == CFalse->getValueAPF()) {
1785 SDValue Combined =
1786 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1787 if (Combined)
1788 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1789 return SDValue();
1790 }
1791 }
1792
1793 return SDValue();
1794}
1795
// Bitcasts Op to v2i32 and returns its two i32 elements as the pair
// (element 0, element 1), named Lo and Hi.
1796std::pair<SDValue, SDValue>
1798 SDLoc SL(Op);
1799
1800 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1801
1802 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1803 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1804
1805 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1806 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1807
1808 return std::pair(Lo, Hi);
1809}
1810
// Bitcasts Op to v2i32 and returns element 0 as an i32.
1812 SDLoc SL(Op);
1813
1814 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1815 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1816 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1817}
1818
// Bitcasts Op to v2i32 and returns element 1 as an i32.
1820 SDLoc SL(Op);
1821
1822 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1823 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1824 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1825}
1826
1827// Split a vector type into two parts. The first part is a power of two vector.
1828// The second part is whatever is left over, and is a scalar if it would
1829// otherwise be a 1-vector.
1830std::pair<EVT, EVT>
1832 EVT LoVT, HiVT;
1833 EVT EltVT = VT.getVectorElementType();
1834 unsigned NumElts = VT.getVectorNumElements();
// The low part takes ceil(NumElts / 2) elements rounded up to a power of two
// (for example 3 -> 2, 5 -> 4, 8 -> 4).
1835 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1836 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1837 HiVT = NumElts - LoNumElts == 1
1838 ? EltVT
1839 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1840 return std::pair(LoVT, HiVT);
1841}
1842
1843// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1844// scalar.
// Returns the pair {Lo, Hi}. The high part starts at element index
// LoVT.getVectorNumElements() of N.
1845std::pair<SDValue, SDValue>
1847 const EVT &LoVT, const EVT &HiVT,
1848 SelectionDAG &DAG) const {
1849 EVT VT = N.getValueType();
1851 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1852 VT.getVectorNumElements() &&
1853 "More vector elements requested than available!");
1855 DAG.getVectorIdxConstant(0, DL));
1856
1857 unsigned LoNumElts = LoVT.getVectorNumElements();
1858
1859 if (HiVT.isVector()) {
1860 unsigned HiNumElts = HiVT.getVectorNumElements();
1861 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1862 // Avoid creating an extract_subvector with an index that isn't a multiple
1863 // of the result type.
1865 DAG.getConstant(LoNumElts, DL, MVT::i32));
1866 return {Lo, Hi};
1867 }
1868
// Otherwise assemble the high part element by element.
1870 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1871 /*Count=*/HiNumElts);
1872 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1873 return {Lo, Hi};
1874 }
1875
// Scalar high part.
1877 DAG.getVectorIdxConstant(LoNumElts, DL));
1878 return {Lo, Hi};
1879}
1880
1882 SelectionDAG &DAG) const {
// Splits a vector load into a low and a high load, then rejoins the two
// values. Returns the merged pair {joined value, TokenFactor of both load
// chains}.
1884 EVT VT = Op.getValueType();
1885 SDLoc SL(Op);
1886
1887
1888 // If this is a 2 element vector, we really want to scalarize and not create
1889 // weird 1 element vectors.
1890 if (VT.getVectorNumElements() == 2) {
1891 SDValue Ops[2];
1892 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1893 return DAG.getMergeValues(Ops, SL);
1894 }
1895
1896 SDValue BasePtr = Load->getBasePtr();
1897 EVT MemVT = Load->getMemoryVT();
1898
1899 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1900
1901 EVT LoVT, HiVT;
1902 EVT LoMemVT, HiMemVT;
1903 SDValue Lo, Hi;
1904
// Split the result type and the memory type in the same way.
1905 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1906 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1907 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1908
// The high load starts Size bytes after the base, so its alignment is the
// common alignment of the base alignment and that byte offset.
1909 unsigned Size = LoMemVT.getStoreSize();
1910 Align BaseAlign = Load->getAlign();
1911 Align HiAlign = commonAlignment(BaseAlign, Size);
1912
1913 SDValue LoLoad = DAG.getExtLoad(
1914 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1915 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1916 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1917 SDValue HiLoad = DAG.getExtLoad(
1918 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1919 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1920 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1921
1922 SDValue Join;
1923 if (LoVT == HiVT) {
1924 // This is the case that the vector is power of two so was evenly split.
1925 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1926 } else {
// Uneven split: insert the low part at index 0 of a poison vector, then add
// the high part.
1927 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1928 DAG.getVectorIdxConstant(0, SL));
1929 Join = DAG.getNode(
1931 VT, Join, HiLoad,
1933 }
1934
1935 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1936 LoLoad.getValue(1), HiLoad.getValue(1))};
1937
1938 return DAG.getMergeValues(Ops, SL);
1939}
1940
1942 SelectionDAG &DAG) const {
// Handles a three-element vector load by loading a wider vector and
// extracting the leading subvector of type VT. Every other case is delegated
// to SplitVectorLoad. Returns the merged pair {value, chain}.
1944 EVT VT = Op.getValueType();
1945 SDValue BasePtr = Load->getBasePtr();
1946 EVT MemVT = Load->getMemoryVT();
1947 SDLoc SL(Op);
1948 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1949 Align BaseAlign = Load->getAlign();
1950 unsigned NumElements = MemVT.getVectorNumElements();
1951
1952 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1953 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1954 if (NumElements != 3 ||
1955 (BaseAlign < Align(8) &&
1956 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1957 return SplitVectorLoad(Op, DAG);
1958
1959 assert(NumElements == 3);
1960
1961 EVT WideVT =
1963 EVT WideMemVT =
1965 SDValue WideLoad = DAG.getExtLoad(
1966 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1967 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
// Keep only the leading VT-sized subvector; the chain comes from the wide
// load.
1968 return DAG.getMergeValues(
1969 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1970 DAG.getVectorIdxConstant(0, SL)),
1971 WideLoad.getValue(1)},
1972 SL);
1973}
1974
1976 SelectionDAG &DAG) const {
// Splits a vector store into a low and a high truncating store issued on the
// same incoming chain, and returns a TokenFactor of the two stores.
1978 SDValue Val = Store->getValue();
1979 EVT VT = Val.getValueType();
1980
1981 // If this is a 2 element vector, we really want to scalarize and not create
1982 // weird 1 element vectors.
1983 if (VT.getVectorNumElements() == 2)
1984 return scalarizeVectorStore(Store, DAG);
1985
1986 EVT MemVT = Store->getMemoryVT();
1987 SDValue Chain = Store->getChain();
1988 SDValue BasePtr = Store->getBasePtr();
1989 SDLoc SL(Op);
1990
1991 EVT LoVT, HiVT;
1992 EVT LoMemVT, HiMemVT;
1993 SDValue Lo, Hi;
1994
// Split the value type and the memory type in the same way.
1995 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1996 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1997 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1998
// The high part is stored right after the low part's store size.
1999 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
2000
2001 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
2002 Align BaseAlign = Store->getAlign();
2003 unsigned Size = LoMemVT.getStoreSize();
2004 Align HiAlign = commonAlignment(BaseAlign, Size);
2005
2006 SDValue LoStore =
2007 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
2008 Store->getMemOperand()->getFlags(), Store->getAAInfo());
2009 SDValue HiStore = DAG.getTruncStore(
2010 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
2011 Store->getMemOperand()->getFlags(), Store->getAAInfo());
2012
2013 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
2014}
2015
2016// This is a shortcut for integer division because we have fast i32<->f32
2017// conversions, and fast f32 reciprocal instructions. The fractional part of a
2018// float is enough to accurately represent up to a 24-bit integer.
// Returns merged {Div, Rem} values, or a null SDValue when the operands are
// not known to be narrow enough for this path.
2020 bool Sign) const {
2021 SDLoc DL(Op);
2022 EVT VT = Op.getValueType();
2023 assert(VT == MVT::i32 && "LowerDIVREM24 expects an i32");
2024
2025 SDValue LHS = Op.getOperand(0);
2026 SDValue RHS = Op.getOperand(1);
2027 MVT IntVT = MVT::i32;
2028 MVT FltVT = MVT::f32;
2029
2030 unsigned LHSSignBits;
2031 unsigned RHSSignBits;
2032 if (Sign) {
// Signed: both operands need at least 9 known sign bits.
2033 LHSSignBits = DAG.ComputeNumSignBits(LHS);
2034 RHSSignBits = DAG.ComputeNumSignBits(RHS);
2035 if (LHSSignBits < 9 || RHSSignBits < 9)
2036 return SDValue();
2037 } else {
// Unsigned: both operands' maximum possible value must fit in 24 bits.
2038 KnownBits LHSKnown = DAG.computeKnownBits(LHS);
2039 KnownBits RHSKnown = DAG.computeKnownBits(RHS);
2040 APInt U24Max = APInt::getLowBitsSet(32, 24);
2041 if (LHSKnown.getMaxValue().ugt(U24Max) ||
2042 RHSKnown.getMaxValue().ugt(U24Max))
2043 return SDValue();
2044 LHSSignBits = LHSKnown.countMinLeadingZeros();
2045 RHSSignBits = RHSKnown.countMinLeadingZeros();
2046 }
2047
// DivBits is the operand width that remains after dropping the redundant
// leading bits; the signed case keeps one extra bit.
2048 unsigned BitSize = VT.getSizeInBits();
2049 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2050 unsigned DivBits = BitSize - SignBits;
2051 if (Sign)
2052 ++DivBits;
2053
2056
// jq is the correction added to the truncated quotient: 1 for unsigned. For
// signed it is (LHS ^ RHS) >> (BitSize - 2), OR'ed with 1.
2057 SDValue jq = DAG.getConstant(1, DL, IntVT);
2058
2059 if (Sign) {
2060 // char|short jq = ia ^ ib;
2061 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2062
2063 // jq = jq >> (bitsize - 2)
2064 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2065 DAG.getConstant(BitSize - 2, DL, VT));
2066
2067 // jq = jq | 0x1
2068 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2069 }
2070
2071 // int ia = (int)LHS;
2072 SDValue ia = LHS;
2073
2074 // int ib = (int)RHS;
2075 SDValue ib = RHS;
2076
2077 // float fa = (float)ia;
2078 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2079
2080 // float fb = (float)ib;
2081 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2082
// fq = fa * rcp(fb)
2083 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2084 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2085
2086 // fq = trunc(fq);
2087 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2088
2089 // float fqneg = -fq;
2090 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2091
2093
2094 bool UseFmadFtz = false;
2095 if (Subtarget->isGCN()) {
2097 UseFmadFtz =
2099 }
2100
2101 // float fr = mad(fqneg, fb, fa);
// FMA is used when the subtarget has no f32 mad/mac instructions.
2102 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2103 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2105 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2106
2107 // int iq = (int)fq;
2108 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2109
2110 // fr = fabs(fr);
2111 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2112
2113 // fb = fabs(fb);
2114 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2115
2116 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2117
2118 // int cv = fr >= fb;
2119 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2120
2121 // jq = (cv ? jq : 0);
2122 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2123
2124 // dst = iq + jq;
2125 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2126
2127 // Rem needs compensation, it's easier to recompute it
// Rem = LHS - Div * RHS
2128 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2129 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2130
2131 // Truncate to number of bits this divide really is.
// Signed results are sign-extended in-register from DivBits; unsigned results
// are masked to the low DivBits bits.
2132 if (Sign) {
2133 SDValue InRegSize
2134 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2135 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2136 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2137 } else {
2138 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2139 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2140 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2141 }
2142
2143 return DAG.getMergeValues({ Div, Rem }, DL);
2144}
2145
// Expands a 64-bit unsigned divide/remainder (the assert below names this
// routine LowerUDIVREM64) and appends the quotient followed by the remainder
// to Results. Three strategies are tried in order:
//   1. both operands have known-zero high halves -> one 32-bit UDIVREM;
//   2. i64 is a legal type -> f32 reciprocal estimate refined by two rounds
//      of integer Newton-Raphson, then up to two correction steps;
//   3. otherwise -> speculative high half plus a bit-by-bit long division.
// NOTE(review): listing lines 2146 and 2148 (parts of the signature) are
// missing from this extract.
2147 SelectionDAG &DAG,
2149 SDLoc DL(Op);
2150 EVT VT = Op.getValueType();
2151
2152 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2153
2154 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2155
2156 SDValue One = DAG.getConstant(1, DL, HalfVT);
2157 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2158
2159 //HiLo split
2160 SDValue LHS_Lo, LHS_Hi;
2161 SDValue LHS = Op.getOperand(0);
2162 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2163
2164 SDValue RHS_Lo, RHS_Hi;
2165 SDValue RHS = Op.getOperand(1);
2166 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2167
// Strategy 1: the upper 32 bits of both operands are known to be zero, so
// the low halves can be divided directly and zero-extended back to i64.
2168 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2169 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2170
2171 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2172 LHS_Lo, RHS_Lo);
2173
2174 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2175 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2176
2177 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2178 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2179 return;
2180 }
2181
// Strategy 2: taken whenever i64 is a legal type.
2182 if (isTypeLegal(MVT::i64)) {
2183 // The algorithm here is based on ideas from "Software Integer Division",
2184 // Tom Rodeheffer, August 2008.
2185
2188
2189 // Compute denominator reciprocal.
// NOTE(review): listing lines 2192-2193 (the middle of this conditional
// expression) are missing from this extract.
2190 unsigned FMAD =
2191 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2194 : (unsigned)AMDGPUISD::FMAD_FTZ;
2195
// The f32 bit patterns used below decode as:
//   0x4f800000 =  2^32, 0x5f7ffffc = just below 2^64,
//   0x2f800000 =  2^-32, 0xcf800000 = -2^32.
// Mad1 is therefore RHS_Hi * 2^32 + RHS_Lo as an f32, Mul1 its scaled
// reciprocal, and Trunc/Mad2 the high/low 32-bit pieces of that estimate.
2196 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2197 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2198 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2199 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2200 Cvt_Lo);
2201 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2202 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2203 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2204 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2205 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2206 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2207 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2208 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2209 Mul1);
2210 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2211 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2212 SDValue Rcp64 = DAG.getBitcast(VT,
2213 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2214
2215 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2216 SDValue One64 = DAG.getConstant(1, DL, VT);
2217 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2218 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2219
2220 // First round of UNR (Unsigned integer Newton-Raphson).
// Each round computes Rcp += mulhu(Rcp, -RHS * Rcp); the 64-bit add is built
// from two 32-bit add-with-carry nodes chained through the carry output.
2221 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2222 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2223 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2224 SDValue Mulhi1_Lo, Mulhi1_Hi;
2225 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2226 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2227 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2228 Mulhi1_Lo, Zero1);
2229 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2230 Mulhi1_Hi, Add1_Lo.getValue(1));
2231 SDValue Add1 = DAG.getBitcast(VT,
2232 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2233
2234 // Second round of UNR.
2235 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2236 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2237 SDValue Mulhi2_Lo, Mulhi2_Hi;
2238 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2239 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2240 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2241 Mulhi2_Lo, Zero1);
2242 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2243 Mulhi2_Hi, Add2_Lo.getValue(1));
2244 SDValue Add2 = DAG.getBitcast(VT,
2245 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2246
// Quotient estimate: high 64 bits of LHS * refined reciprocal.
2247 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2248
2249 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2250
// Remainder estimate: Sub1 = LHS - RHS * Mulhi3, computed in 32-bit halves.
2251 SDValue Mul3_Lo, Mul3_Hi;
2252 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2253 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2254 Mul3_Lo, Zero1);
2255 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2256 Mul3_Hi, Sub1_Lo.getValue(1));
2257 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2258 SDValue Sub1 = DAG.getBitcast(VT,
2259 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2260
// C3 is all-ones when the 64-bit value Sub1 >= RHS (compare the high halves,
// falling back to the low halves when the high halves are equal), else zero.
2261 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2262 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2263 ISD::SETUGE);
2264 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2265 ISD::SETUGE);
2266 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2267
2268 // TODO: Here and below portions of the code can be enclosed into if/endif.
2269 // Currently control flow is unconditional and we have 4 selects after
2270 // potential endif to substitute PHIs.
2271
2272 // if C3 != 0 ...
// First correction candidate: Sub2 = Sub1 - RHS, Add3 = Mulhi3 + 1.
2273 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2274 RHS_Lo, Zero1);
2275 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2276 RHS_Hi, Sub1_Lo.getValue(1));
2277 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2278 Zero, Sub2_Lo.getValue(1));
2279 SDValue Sub2 = DAG.getBitcast(VT,
2280 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2281
2282 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2283
// C6 is the same ">= RHS" test applied to Sub2.
2284 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2285 ISD::SETUGE);
2286 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2287 ISD::SETUGE);
2288 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2289
2290 // if (C6 != 0)
// Second correction candidate: Sub3 = Sub2 - RHS, Add4 = Add3 + 1.
2291 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2292
2293 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2294 RHS_Lo, Zero1);
2295 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2296 RHS_Hi, Sub2_Lo.getValue(1));
2297 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2298 Zero, Sub3_Lo.getValue(1));
2299 SDValue Sub3 = DAG.getBitcast(VT,
2300 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2301
2302 // endif C6
2303 // endif C3
2304
// Pick the uncorrected, once-corrected or twice-corrected quotient/remainder
// according to C3 and C6.
2305 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2306 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2307
2308 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2309 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2310
2311 Results.push_back(Div);
2312 Results.push_back(Rem);
2313
2314 return;
2315 }
2316
2317 // r600 expansion.
2318 // Get Speculative values
// When RHS_Hi == 0 the high quotient half is LHS_Hi / RHS_Lo and the running
// remainder starts at LHS_Hi % RHS_Lo; otherwise the high quotient half is
// zero and the running remainder starts at LHS_Hi.
2319 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2320 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2321
2322 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2323 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2324 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2325
2326 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2327 SDValue DIV_Lo = Zero;
2328
2329 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2330
// Long division over the bits of LHS_Lo, most significant bit first: one
// quotient bit of DIV_Lo is produced per iteration.
2331 for (unsigned i = 0; i < halfBitWidth; ++i) {
2332 const unsigned bitPos = halfBitWidth - i - 1;
2333 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2334 // Get value of high bit
2335 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2336 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2337 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2338
2339 // Shift
2340 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2341 // Add LHS high bit
2342 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2343
// Set this quotient bit only when the running remainder is >= RHS.
2344 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2345 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2346
2347 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2348
2349 // Update REM
2350 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2351 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2352 }
2353
2354 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2355 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2356 Results.push_back(DIV);
2357 Results.push_back(REM);
2358}
2359
// Lowers an unsigned divide/remainder node and returns the merged
// {quotient, remainder} pair.
//   * i64 is delegated to LowerUDIVREM64.
//   * i32 first tries LowerDIVREM24(Op, DAG, false) and returns its result
//     when one is produced.
//   * Otherwise: reciprocal estimate (URECIP), one Newton-Raphson round, then
//     two conditional "+1 / -Y" refinements of quotient and remainder.
// NOTE(review): the first signature line (listing line 2360) and the
// declaration of Results (listing line 2366) are missing from this extract.
2361 SelectionDAG &DAG) const {
2362 SDLoc DL(Op);
2363 EVT VT = Op.getValueType();
2364
2365 if (VT == MVT::i64) {
2367 LowerUDIVREM64(Op, DAG, Results);
2368 return DAG.getMergeValues(Results, DL);
2369 }
2370
2371 if (VT == MVT::i32) {
2372 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2373 return Res;
2374 }
2375
2376 SDValue X = Op.getOperand(0);
2377 SDValue Y = Op.getOperand(1);
2378
2379 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2380 // algorithm used here.
2381
2382 // Initial estimate of inv(y).
2383 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2384
2385 // One round of UNR.
// Z += mulhu(Z, -Y * Z)
2386 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2387 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2388 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2389 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2390
2391 // Quotient/remainder estimate.
// Q = mulhu(X, Z), R = X - Q * Y
2392 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2393 SDValue R =
2394 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2395
2396 // First quotient/remainder refinement.
// If R >= Y, bump the quotient by one and subtract Y from the remainder.
2397 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2398 SDValue One = DAG.getConstant(1, DL, VT);
2399 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2400 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2401 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2402 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2403 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2404
2405 // Second quotient/remainder refinement.
2406 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2407 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2408 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2409 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2410 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2411
2412 return DAG.getMergeValues({Q, R}, DL);
2413}
2414
// Lowers a signed divide/remainder node and returns the merged
// {quotient, remainder} pair.
//   * i32 first tries LowerDIVREM24(Op, DAG, true).
//   * i64 whose operands have enough known sign bits is narrowed to a 32-bit
//     SDIVREM and the results are sign-extended.
//   * Otherwise both operands are made non-negative, an unsigned UDIVREM of
//     the same type is emitted, and the signs are re-applied afterwards.
// NOTE(review): the first signature line (listing line 2415) is missing from
// this extract.
2416 SelectionDAG &DAG) const {
2417 SDLoc DL(Op);
2418 EVT VT = Op.getValueType();
2419
2420 SDValue LHS = Op.getOperand(0);
2421 SDValue RHS = Op.getOperand(1);
2422
2423 SDValue Zero = DAG.getConstant(0, DL, VT);
2424 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2425
2426 if (VT == MVT::i32) {
2427 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2428 return Res;
2429 }
2430
2431 // LHS must have > 33 sign-bits to ensure that LHS != -2147483648
2432 // Otherwise 32-bit division cannot be used safely.
2433 // -2147483648/1 and -2147483648/-1 are not equal,
2434 // but they produce the same lower 32-bit result.
2435 if (VT == MVT::i64 && DAG.ComputeNumSignBits(LHS) > 33 &&
2436 DAG.ComputeNumSignBits(RHS) > 32) {
2437 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2438
2439 //HiLo split
2440 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2441 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2442 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2443 LHS_Lo, RHS_Lo);
2444 SDValue Res[2] = {
2445 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2446 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2447 };
2448 return DAG.getMergeValues(Res, DL);
2449 }
2450
// LHSign / RHSign are all-ones when the operand is negative, zero otherwise.
// The quotient is negated when the operand signs differ (DSign).
2451 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2452 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2453 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2454 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2455
// (x + sign) ^ sign negates x when sign is all-ones and leaves it unchanged
// when sign is zero.
2456 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2457 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2458
2459 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2460 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2461
2462 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2463 SDValue Rem = Div.getValue(1);
2464
// (r ^ sign) - sign conditionally negates the unsigned results.
2465 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2466 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2467
2468 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2469 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2470
2471 SDValue Res[2] = {
2472 Div,
2473 Rem
2474 };
2475 return DAG.getMergeValues(Res, DL);
2476}
2477
// Builds an f64 ceil out of FTRUNC: truncate, then add 1.0 when the source is
// positive and was not already integral.
// NOTE(review): the signature (listing line 2478) is missing from this
// extract; presumably this is the f64 FCEIL lowering.
2479 SDLoc SL(Op);
2480 SDValue Src = Op.getOperand(0);
2481
2482 // result = trunc(src)
2483 // if (src > 0.0 && src != result)
2484 // result += 1.0
2485
2486 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2487
2488 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2489 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2490
2491 EVT SetCCVT =
2492 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2493
// Despite its name, Lt0 holds "src > 0.0" (ordered greater-than).
2494 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2495 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2496 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2497
// Select the adjustment (1.0 or 0.0) rather than the final result, so that a
// single FADD produces the answer.
2498 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2499 // TODO: Should this propagate fast-math-flags?
2500 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2501}
2502
// Returns the unbiased exponent of an f64 as an i32, given the high 32 bits of
// the value in Hi: an unsigned bitfield extract takes ExpBits (11) bits
// starting at bit FractBits - 32 (20), and the bias 1023 is then subtracted.
// NOTE(review): the first signature line (listing line 2503) is missing from
// this extract.
2504 SelectionDAG &DAG) {
2505 const unsigned FractBits = 52;
2506 const unsigned ExpBits = 11;
2507
2508 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2509 Hi,
2510 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2511 DAG.getConstant(ExpBits, SL, MVT::i32));
2512 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2513 DAG.getConstant(1023, SL, MVT::i32));
2514
2515 return Exp;
2516}
2517
// Integer-only expansion that clears the fractional mantissa bits of an f64:
//   * exponent < 0  -> only the sign bit is kept (signed zero),
//   * exponent > 51 -> the input is returned unchanged,
//   * otherwise     -> the mantissa bits below the binary point are masked
//                      off.
// NOTE(review): the signature (listing line 2518) is missing from this
// extract; presumably this is the f64 FTRUNC lowering.
2519 SDLoc SL(Op);
2520 SDValue Src = Op.getOperand(0);
2521
2522 assert(Op.getValueType() == MVT::f64);
2523
2524 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2525
2526 // Extract the upper half, since this is where we will find the sign and
2527 // exponent.
2528 SDValue Hi = getHiHalf64(Src, DAG);
2529
2530 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2531
2532 const unsigned FractBits = 52;
2533
2534 // Extract the sign bit.
2535 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2536 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2537
2538 // Extend back to 64-bits.
2539 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2540 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2541
2542 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2543 const SDValue FractMask
2544 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2545
// Shifting the 52-bit mantissa mask right by the exponent leaves exactly the
// bits that encode the fraction; inverting it gives the bits to keep.
2546 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2547 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2548 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2549
2550 EVT SetCCVT =
2551 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2552
2553 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2554
2555 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2556 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2557
// The masked value Tmp0 is only used when 0 <= Exp <= 51; the two selects
// override it for the out-of-range exponents.
2558 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2559 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2560
2561 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2562}
2563
// f64 round-to-integer expansion using the add/subtract-2^52 trick: adding and
// then subtracting copysign(2^52, src) leaves a value with no fractional bits
// (2^52 is the magnitude at which f64 spacing becomes 1.0). Inputs whose
// magnitude exceeds 0x1.fffffffffffffp+51 are returned unchanged.
// NOTE(review): the first signature line (listing line 2564) is missing from
// this extract; presumably this is the f64 FROUNDEVEN lowering.
2565 SelectionDAG &DAG) const {
2566 SDLoc SL(Op);
2567 SDValue Src = Op.getOperand(0);
2568
2569 assert(Op.getValueType() == MVT::f64);
2570
2571 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2572 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2573 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2574
2575 // TODO: Should this propagate fast-math-flags?
2576
2577 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2578 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2579
2580 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2581
// C2 is the largest f64 below 2^52; anything larger in magnitude is passed
// through unchanged by the select below.
2582 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2583 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2584
2585 EVT SetCCVT =
2586 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2587 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2588
2589 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2590}
2591
// Re-emits the operation as ISD::FROUNDEVEN on the same operand and type.
// NOTE(review): the first signature line (listing line 2592) is missing from
// this extract; the comment below suggests this is the FNEARBYINT lowering.
2593 SelectionDAG &DAG) const {
2594 // FNEARBYINT and FRINT are the same, except in their handling of FP
2595 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2596 // rint, so just treat them as equivalent.
2597 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2598 Op.getOperand(0));
2599}
2600
// Re-emits the operation as ISD::FROUNDEVEN on the same operand and type.
// NOTE(review): the signature (listing line 2601) is missing from this
// extract; presumably this is the FRINT lowering.
2602 auto VT = Op.getValueType();
2603 auto Arg = Op.getOperand(0u);
2604 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2605}
2606
2607// XXX - May require not supporting f32 denormals?
2608
2609// Don't handle v2f16. The extra instructions to scalarize and repack around the
2610// compare and vselect end up producing worse code than scalarizing the whole
2611// operation.
// Rounds half away from zero using FTRUNC:
//   t = trunc(x); result = t + copysign(|x - t| >= 0.5 ? 1.0 : 0.0, x)
// NOTE(review): the signature (listing line 2612) is missing from this
// extract; presumably this is the FROUND lowering.
2613 SDLoc SL(Op);
2614 SDValue X = Op.getOperand(0);
2615 EVT VT = Op.getValueType();
2616
2617 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2618
2619 // TODO: Should this propagate fast-math-flags?
2620
2621 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2622
2623 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2624
2625 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2626 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2627
2628 EVT SetCCVT =
2629 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2630
2631 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2632 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2633 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2634
// Give the 1.0/0.0 adjustment the sign of the input so the result moves away
// from zero.
2635 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2636 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2637}
2638
// Builds an f64 floor out of FTRUNC: truncate, then add -1.0 when the source
// is negative and was not already integral.
// NOTE(review): the signature (listing line 2639) is missing from this
// extract; presumably this is the f64 FFLOOR lowering.
2640 SDLoc SL(Op);
2641 SDValue Src = Op.getOperand(0);
2642
2643 // result = trunc(src);
2644 // if (src < 0.0 && src != result)
2645 // result += -1.0.
2646
2647 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2648
2649 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2650 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2651
2652 EVT SetCCVT =
2653 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2654
2655 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2656 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2657 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2658
// Select the adjustment (-1.0 or 0.0) rather than the final result, so that a
// single FADD produces the answer.
2659 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2660 // TODO: Should this propagate fast-math-flags?
2661 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2662}
2663
2664/// Return true if it's known that \p Src can never be an f32 denormal value.
// Purely structural check on the opcode that produced Src; no value analysis
// is done. An FP_EXTEND qualifies only when its operand type is f16.
// NOTE(review): the signature (listing line 2665) and the case label that
// opens the intrinsic sub-switch (listing line 2675) are missing from this
// extract.
2666 switch (Src.getOpcode()) {
2667 case ISD::FP_EXTEND:
2668 return Src.getOperand(0).getValueType() == MVT::f16;
2669 case ISD::FP16_TO_FP:
2670 case ISD::FFREXP:
2671 case ISD::FSQRT:
2672 case AMDGPUISD::LOG:
2673 case AMDGPUISD::EXP:
2674 return true;
// Operand 0 holds the intrinsic ID for the node handled here.
2676 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2677 switch (IntrinsicID) {
2678 case Intrinsic::amdgcn_frexp_mant:
2679 case Intrinsic::amdgcn_log:
2680 case Intrinsic::amdgcn_log_clamp:
2681 case Intrinsic::amdgcn_exp2:
2682 case Intrinsic::amdgcn_sqrt:
2683 return true;
2684 default:
2685 return false;
2686 }
2687 }
2688 default:
2689 return false;
2690 }
2691
// Every switch path above returns, so this point is never reached.
2692 llvm_unreachable("covered opcode switch");
2693}
2694
// Returns true when the node flags carry the approximate-functions
// fast-math flag; only Flags is consulted.
// NOTE(review): the first signature line (listing line 2695) is missing from
// this extract.
2696 SDNodeFlags Flags) {
2697 return Flags.hasApproximateFuncs();
2698}
2699
2708
// Emits and returns the comparison "Src < smallest normalized value" (ordered
// less-than) for Src's floating-point type. Flags is not used in the body.
// NOTE(review): the first signature line (listing line 2709) is missing from
// this extract.
2710 SDValue Src,
2711 SDNodeFlags Flags) const {
2712 SDLoc SL(Src);
2713 EVT VT = Src.getValueType();
2714 const fltSemantics &Semantics = VT.getFltSemantics();
2715 SDValue SmallestNormal =
2716 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2717
2718 // Want to scale denormals up, but negatives and 0 work just as well on the
2719 // scaled path.
2720 SDValue IsLtSmallestNormal = DAG.getSetCC(
2721 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2722 SmallestNormal, ISD::SETOLT);
2723
2724 return IsLtSmallestNormal;
2725}
2726
// Emits and returns the comparison "fabs(Src) < +infinity" (ordered
// less-than) for Src's floating-point type. Being an ordered compare, it is
// false for NaN as well as for +/-infinity.
// NOTE(review): the first signature line (listing line 2727) is missing from
// this extract.
2728 SDNodeFlags Flags) const {
2729 SDLoc SL(Src);
2730 EVT VT = Src.getValueType();
2731 const fltSemantics &Semantics = VT.getFltSemantics();
2732 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2733
2734 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2735 SDValue IsFinite = DAG.getSetCC(
2736 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2737 Inf, ISD::SETOLT);
2738 return IsFinite;
2739}
2740
2741/// If denormal handling is required return the scaled input to FLOG2, and the
2742/// check for denormal range. Otherwise, return null values.
// Returns {Src * (Src < smallest f32 normal ? 2^32 : 1.0), that comparison},
// or a pair of null SDValues when needsDenormHandlingF32 reports that no
// scaling is required. The expansion is always built in f32.
// NOTE(review): the signature line carrying the function name (listing line
// 2744) is missing from this extract.
2743std::pair<SDValue, SDValue>
2745 SDValue Src, SDNodeFlags Flags) const {
2746 if (!needsDenormHandlingF32(DAG, Src, Flags))
2747 return {};
2748
2749 MVT VT = MVT::f32;
2750 const fltSemantics &Semantics = APFloat::IEEEsingle();
2751 SDValue SmallestNormal =
2752 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2753
2754 SDValue IsLtSmallestNormal = DAG.getSetCC(
2755 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2756 SmallestNormal, ISD::SETOLT);
2757
// Multiply by 2^32 only on the below-smallest-normal path; by 1.0 otherwise.
2758 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2759 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2760 SDValue ScaleFactor =
2761 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2762
2763 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2764 return {ScaledInput, IsLtSmallestNormal};
2765}
2766
// Lowers a log2 node to AMDGPUISD::LOG. f16 is computed in f32 and rounded
// back; other types go through the denormal scaling helper and, when scaling
// was applied, have 32.0 subtracted from the result on the scaled path.
// NOTE(review): the signature (listing line 2767) is missing from this
// extract.
2768 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2769 // If we have to handle denormals, scale up the input and adjust the result.
2770
2771 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2772 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2773
2774 SDLoc SL(Op);
2775 EVT VT = Op.getValueType();
2776 SDValue Src = Op.getOperand(0);
2777 SDNodeFlags Flags = Op->getFlags();
2778
2779 if (VT == MVT::f16) {
2780 // Nothing in half is a denormal when promoted to f32.
2781 assert(!isTypeLegal(VT));
2782 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2783 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2784 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2785 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2786 }
2787
// A null ScaledInput means no denormal handling is needed: emit the plain op.
2788 auto [ScaledInput, IsLtSmallestNormal] =
2789 getScaledLogInput(DAG, SL, Src, Flags);
2790 if (!ScaledInput)
2791 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2792
2793 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2794
// log2(x * 2^32) = log2(x) + 32, so undo the scaling with a conditional -32.
// NOTE(review): this SELECT is created without Flags, unlike the nodes around
// it.
2795 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2796 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2797 SDValue ResultOffset =
2798 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2799 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2800}
2801
// Emits X * Y + C as two separate nodes (FMUL followed by FADD), both carrying
// Flags. Flags defaults to an empty SDNodeFlags when the caller omits it.
2802 static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2803 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2804 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2805 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2806}
2807
// Shared lowering for ISD::FLOG and ISD::FLOG10 (asserted below).
//   * f16, or any type with the approximate-functions flag, is handed to
//     LowerFLOGUnsafe (after promotion to f32 when required).
//   * Otherwise the hardware log (AMDGPUISD::LOG) is multiplied by the base
//     conversion constant, which is split into a high and a low part to gain
//     precision; non-finite results of the hardware log are passed through,
//     and the denormal input scaling is undone at the end.
// NOTE(review): the first signature line (listing line 2808) is missing from
// this extract.
2809 SelectionDAG &DAG) const {
2810 SDValue X = Op.getOperand(0);
2811 EVT VT = Op.getValueType();
2812 SDNodeFlags Flags = Op->getFlags();
2813 SDLoc DL(Op);
2814 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2815 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2816
2817 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2818 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
2819 // depending on !fpmath metadata.
2820
2821 bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
2822 !isTypeLegal(MVT::f16));
2823
2824 if (PromoteToF32) {
2825 // Log and multiply in f32 is always good enough for f16.
2826 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2827 }
2828
2829 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2830 if (PromoteToF32) {
2831 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2832 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2833 }
2834
2835 return Lowered;
2836 }
2837
// NOTE(review): every f16 case has already returned from the block above, so
// the VT == MVT::f16 branch below appears unreachable.
2838 SDValue ScaledInput, IsScaled;
2839 if (VT == MVT::f16)
2840 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2841 else {
2842 std::tie(ScaledInput, IsScaled) = getScaledLogInput(DAG, DL, X, Flags);
2843 if (ScaledInput)
2844 X = ScaledInput;
2845 }
2846
2847 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2848
2849 SDValue R;
2850 if (Subtarget->hasFastFMAF32()) {
2851 // c+cc are ln(2)/ln(10) to more than 49 bits
2852 const float c_log10 = 0x1.344134p-2f;
2853 const float cc_log10 = 0x1.09f79ep-26f;
2854
2855 // c + cc is ln(2) to more than 49 bits
2856 const float c_log = 0x1.62e42ep-1f;
2857 const float cc_log = 0x1.efa39ep-25f;
2858
2859 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2860 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2861 // This adds correction terms for which contraction may lead to an increase
2862 // in the error of the approximation, so disable it.
2863 Flags.setAllowContract(false);
// R = Y*C; FMA0 = fma(Y, C, -R) recovers the rounding error of that product;
// FMA1 adds the Y*CC low-part contribution; the final sum is R + FMA1.
2864 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2865 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2866 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2867 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2868 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2869 } else {
2870 // ch+ct is ln(2)/ln(10) to more than 36 bits
2871 const float ch_log10 = 0x1.344000p-2f;
2872 const float ct_log10 = 0x1.3509f6p-18f;
2873
2874 // ch + ct is ln(2) to more than 36 bits
2875 const float ch_log = 0x1.62e000p-1f;
2876 const float ct_log = 0x1.0bfbe8p-15f;
2877
2878 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2879 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2880
// Split Y into a high part YH (low 12 bits of the encoding cleared by the
// 0xfffff000 mask) and the tail YT = Y - YH.
2881 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2882 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2883 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2884 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2885 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2886 // This adds correction terms for which contraction may lead to an increase
2887 // in the error of the approximation, so disable it.
2888 Flags.setAllowContract(false);
// Accumulate the four partial products smallest-first:
// YT*CT, then YH*CT, then YT*CH, then YH*CH.
2889 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2890 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2891 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
// NOTE(review): this last getMad call omits Flags, so its nodes are built with
// default (empty) SDNodeFlags.
2892 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2893 }
2894
2895 const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();
2896
2897 // TODO: Check if known finite from source value.
// When the hardware log Y is not finite, return Y itself instead of the
// corrected product.
2898 if (!IsFiniteOnly) {
2899 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2900 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2901 }
2902
// IsScaled is non-null only when getScaledLogInput scaled the input; the
// selected ShiftK constant is then subtracted on the scaled path.
2903 if (IsScaled) {
2904 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2905 SDValue ShiftK =
2906 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2907 SDValue Shift =
2908 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2909 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2910 }
2911
2912 return R;
2913}
2914
2918
2919// Do f32 fast math expansion for flog or flog10. This is accurate enough for a
2920// promoted f16 operation.
// The result is log2(Src) * Log2BaseInverted. For f32 inputs that need
// denormal handling, the log is taken of the scaled input and
// -32 * Log2BaseInverted is added back on the scaled path.
// NOTE(review): the first signature line (listing line 2921) and the
// initializer of Log2BaseInverted (listing line 2929) are missing from this
// extract.
2922 SelectionDAG &DAG, bool IsLog10,
2923 SDNodeFlags Flags) const {
2924 EVT VT = Src.getValueType();
// f32 uses the target log node directly; other types use generic FLOG2.
2925 unsigned LogOp =
2926 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2927
2928 double Log2BaseInverted =
2930
2931 if (VT == MVT::f32) {
2932 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2933 if (ScaledInput) {
2934 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2935 SDValue ScaledResultOffset =
2936 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2937
2938 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2939
2940 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2941 ScaledResultOffset, Zero, Flags);
2942
2943 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2944
// Fuse the multiply and the offset add when f32 FMA is fast; otherwise emit
// them as separate FMUL and FADD nodes.
2945 if (Subtarget->hasFastFMAF32())
2946 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2947 Flags);
2948 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
// NOTE(review): this FADD is created without Flags, unlike the FMUL above.
2949 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2950 }
2951 }
2952
// No scaling needed (or not f32): plain log2 followed by the base conversion.
2953 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2954 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2955
2956 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2957 Flags);
2958}
2959
// Expands an f64 exponential node. The opcode of Op picks the range reduction:
// ISD::FEXP2, ISD::FEXP10, or (the final else) ISD::FEXP. Every variant
// produces an integral part DN and a reduced argument T. A shared chain of FMA
// nodes then evaluates a polynomial in T, and FLDEXP scales the result by DN
// converted to i32. Finally, inputs that compare greater than 1024.0 are
// replaced by +infinity (unless the no-infs flag is set) and inputs that
// compare less than -1075.0 are replaced by 0.0.
2960// This expansion gives a result slightly better than 1ulp.
2962 SelectionDAG &DAG) const {
2963 SDLoc DL(Op);
2964 SDValue X = Op.getOperand(0);
2965
2966 // TODO: Check if reassoc is safe. There is an output change in exp2 and
2967 // exp10, which slightly increases ulp.
// Every node built below that takes flags uses this masked copy.
2968 SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;
2969
// DN: integral part, F: fractional remainder (exp2/exp10 only), T: polynomial
// argument.
2970 SDValue DN, F, T;
2971
2972 if (Op.getOpcode() == ISD::FEXP2) {
2973 // dn = rint(x)
2974 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, X, Flags);
2975 // f = x - dn
2976 F = DAG.getNode(ISD::FSUB, DL, MVT::f64, X, DN, Flags);
2977 // t = f*C1 + f*C2
2978 SDValue C1 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
2979 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
2980 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C2, Flags);
2981 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C1, Mul2, Flags);
2982 } else if (Op.getOpcode() == ISD::FEXP10) {
2983 // dn = rint(x * C1)
2984 SDValue C1 = DAG.getConstantFP(0x1.a934f0979a371p+1, DL, MVT::f64);
2985 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2986 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
2987
2988 // f = FMA(-dn, C2, FMA(-dn, C3, x))
2989 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
2990 SDValue C2 = DAG.getConstantFP(-0x1.9dc1da994fd21p-59, DL, MVT::f64);
2991 SDValue C3 = DAG.getConstantFP(0x1.34413509f79ffp-2, DL, MVT::f64);
2992 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
2993 F = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
2994
2995 // t = FMA(f, C4, f*C5)
2996 SDValue C4 = DAG.getConstantFP(0x1.26bb1bbb55516p+1, DL, MVT::f64);
2997 SDValue C5 = DAG.getConstantFP(-0x1.f48ad494ea3e9p-53, DL, MVT::f64);
2998 SDValue MulF = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C5, Flags);
2999 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C4, MulF, Flags);
3000 } else { // ISD::FEXP
3001 // dn = rint(x * C1)
3002 SDValue C1 = DAG.getConstantFP(0x1.71547652b82fep+0, DL, MVT::f64);
3003 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
3004 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
3005
3006 // t = FMA(-dn, C2, FMA(-dn, C3, x))
3007 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
3008 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
3009 SDValue C3 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
3010 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
3011 T = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
3012 }
3013
3014 // Polynomial expansion for p
// Horner form: each step below is P = T * P + next coefficient.
3015 SDValue P = DAG.getConstantFP(0x1.ade156a5dcb37p-26, DL, MVT::f64);
3016 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3017 DAG.getConstantFP(0x1.28af3fca7ab0cp-22, DL, MVT::f64),
3018 Flags);
3019 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3020 DAG.getConstantFP(0x1.71dee623fde64p-19, DL, MVT::f64),
3021 Flags);
3022 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3023 DAG.getConstantFP(0x1.a01997c89e6b0p-16, DL, MVT::f64),
3024 Flags);
3025 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3026 DAG.getConstantFP(0x1.a01a014761f6ep-13, DL, MVT::f64),
3027 Flags);
3028 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3029 DAG.getConstantFP(0x1.6c16c1852b7b0p-10, DL, MVT::f64),
3030 Flags);
3031 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3032 DAG.getConstantFP(0x1.1111111122322p-7, DL, MVT::f64), Flags);
3033 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3034 DAG.getConstantFP(0x1.55555555502a1p-5, DL, MVT::f64), Flags);
3035 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3036 DAG.getConstantFP(0x1.5555555555511p-3, DL, MVT::f64), Flags);
3037 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3038 DAG.getConstantFP(0x1.000000000000bp-1, DL, MVT::f64), Flags);
3039
3040 SDValue One = DAG.getConstantFP(1.0, DL, MVT::f64);
3041
// The last two Horner steps both add the constant 1.0.
3042 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3043 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3044
3045 // z = ldexp(p, (int)dn)
// The float-to-int conversion node is built without fast-math flags.
3046 SDValue DNInt = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::i32, DN);
3047 SDValue Z = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, P, DNInt, Flags);
3048
3049 // Overflow/underflow guards
// SETULE is an unordered-or-less-equal compare, so a NaN input keeps Z.
3050 SDValue CondHi = DAG.getSetCC(
3051 DL, MVT::i1, X, DAG.getConstantFP(1024.0, DL, MVT::f64), ISD::SETULE);
3052
// The +infinity replacement is only emitted when infinities may occur.
3053 if (!Flags.hasNoInfs()) {
3054 SDValue PInf = DAG.getConstantFP(std::numeric_limits<double>::infinity(),
3055 DL, MVT::f64);
3056 Z = DAG.getSelect(DL, MVT::f64, CondHi, Z, PInf, Flags);
3057 }
3058
// SETUGE is likewise true for unordered operands; only inputs that compare
// strictly below -1075.0 are flushed to zero.
3059 SDValue CondLo = DAG.getSetCC(
3060 DL, MVT::i1, X, DAG.getConstantFP(-1075.0, DL, MVT::f64), ISD::SETUGE);
3061 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
3062 Z = DAG.getSelect(DL, MVT::f64, CondLo, Z, Zero, Flags);
3063
3064 return Z;
3065}
3066
3068 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3069 // If we have to handle denormals, scale up the input and adjust the result.
3070
3071 EVT VT = Op.getValueType();
3072 if (VT == MVT::f64)
3073 return lowerFEXPF64(Op, DAG);
3074
3075 SDLoc SL(Op);
3076 SDValue Src = Op.getOperand(0);
3077 SDNodeFlags Flags = Op->getFlags();
3078
3079 if (VT == MVT::f16) {
3080 // Nothing in half is a denormal when promoted to f32.
3081 assert(!isTypeLegal(MVT::f16));
3082 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
3083 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
3084 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
3085 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3086 }
3087
3088 assert(VT == MVT::f32);
3089
3090 if (!needsDenormHandlingF32(DAG, Src, Flags))
3091 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
3092
3093 // bool needs_scaling = x < -0x1.f80000p+6f;
3094 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3095
3096 // -nextafter(128.0, -1)
3097 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
3098
3099 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3100
3101 SDValue NeedsScaling =
3102 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
3103
3104 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3105 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3106
3107 SDValue AddOffset =
3108 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
3109
3110 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
3111 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
3112
3113 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
3114 SDValue One = DAG.getConstantFP(1.0, SL, VT);
3115 SDValue ResultScale =
3116 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
3117
3118 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
3119}
3120
3122 SelectionDAG &DAG,
3123 SDNodeFlags Flags,
3124 bool IsExp10) const {
3125 // exp(x) -> exp2(M_LOG2E_F * x);
3126 // exp10(x) -> exp2(log2(10) * x);
3127 EVT VT = X.getValueType();
3128 SDValue Const =
3129 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
3130
3131 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
3132 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
3133 : (unsigned)ISD::FEXP2,
3134 SL, VT, Mul, Flags);
3135}
3136
3138 SelectionDAG &DAG,
3139 SDNodeFlags Flags) const {
3140 EVT VT = X.getValueType();
3141 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
3142 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
3143
3144 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3145
3146 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
3147 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3148
3149 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3150
3151 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3152
3153 SDValue AdjustedX =
3154 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3155
3156 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
3157 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3158
3159 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3160
3161 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3162 SDValue AdjustedResult =
3163 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3164
3165 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3166 Flags);
3167}
3168
3169/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3170/// handled correctly.
3172 SelectionDAG &DAG,
3173 SDNodeFlags Flags) const {
3174 const EVT VT = X.getValueType();
3175
3176 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3177 : static_cast<unsigned>(ISD::FEXP2);
3178
3179 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3180 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3181 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3182 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3183
3184 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3185 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3186 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3187 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3188 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3189 }
3190
3191 // bool s = x < -0x1.2f7030p+5f;
3192 // x += s ? 0x1.0p+5f : 0.0f;
3193 // exp10 = exp2(x * 0x1.a92000p+1f) *
3194 // exp2(x * 0x1.4f0978p-11f) *
3195 // (s ? 0x1.9f623ep-107f : 1.0f);
3196
3197 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3198
3199 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3200 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3201
3202 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3203 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3204 SDValue AdjustedX =
3205 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3206
3207 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3208 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3209
3210 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3211 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3212 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3213 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3214
3215 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3216
3217 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3218 SDValue AdjustedResult =
3219 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3220
3221 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3222 Flags);
3223}
3224
// Lowers an exp / exp10 node; IsExp10 is true when the opcode is ISD::FEXP10.
// Dispatch, in order:
//  - f64 is forwarded to lowerFEXPF64;
//  - when allowApproxFunc() holds, the "unsafe" exp / exp10 lowerings are used;
//  - scalar f16 is computed in f32 and rounded back (vector f16 returns an
//    empty SDValue);
//  - f32 falls through to the expansion below, which ends with explicit
//    underflow (-> 0.0) and, unless no-infs is set, overflow selects.
3226 EVT VT = Op.getValueType();
3227
3228 if (VT == MVT::f64)
3229 return lowerFEXPF64(Op, DAG);
3230
3231 SDLoc SL(Op);
3232 SDValue X = Op.getOperand(0);
3233 SDNodeFlags Flags = Op->getFlags();
3234 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3235
3236 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3237 // library behavior. Also, is known-not-daz source sufficient?
3238 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3239 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3240 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3241 }
3242
3243 if (VT.getScalarType() == MVT::f16) {
3244 if (VT.isVector())
3245 return SDValue();
3246
3247 // Nothing in half is a denormal when promoted to f32.
3248 //
3249 // exp(f16 x) ->
3250 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3251 //
3252 // exp10(f16 x) ->
3253 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3254 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3255 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3256 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3257 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3258 }
3259
3260 assert(VT == MVT::f32);
3261
3262 // Algorithm:
3263 //
3264 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3265 //
3266 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3267 // n = 64*m + j, 0 <= j < 64
3268 //
3269 // e^x = 2^((64*m + j + f)/64)
3270 // = (2^m) * (2^(j/64)) * 2^(f/64)
3271 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3272 //
3273 // f = x*(64/ln(2)) - n
3274 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3275 //
3276 // e^x = (2^m) * (2^(j/64)) * e^r
3277 //
3278 // (2^(j/64)) is precomputed
3279 //
3280 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3281 // e^r = 1 + q
3282 //
3283 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3284 //
3285 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
// NOTE(review): the code below builds a high/low product (PH, PL), rounds PH,
// and feeds the remainder to the target exp2 node followed by FLDEXP; it does
// not use a precomputed 2^(j/64) table as the comment above describes.
// A copy of Flags with contraction disabled, used for the PH - E subtraction.
3286 SDNodeFlags FlagsNoContract = Flags;
3287 FlagsNoContract.setAllowContract(false);
3288
// PH / PL: high and low parts of X multiplied by the log2 constant for the
// selected base.
3289 SDValue PH, PL;
3290 if (Subtarget->hasFastFMAF32()) {
3291 const float c_exp = numbers::log2ef;
3292 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3293 const float c_exp10 = 0x1.a934f0p+1f;
3294 const float cc_exp10 = 0x1.2f346ep-24f;
3295
3296 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3297 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3298
// PL = fma(X, CC, fma(X, C, -PH)): the rounding error of X*C plus the
// contribution of the low constant part.
3299 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3300 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3301 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3302 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3303 } else {
3304 const float ch_exp = 0x1.714000p+0f;
3305 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3306
3307 const float ch_exp10 = 0x1.a92000p+1f;
3308 const float cl_exp10 = 0x1.4f0978p-11f;
3309
3310 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3311 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3312
// Split X into XH (low 12 bits of the encoding cleared) and XL = X - XH.
3313 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3314 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3315 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3316 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3317 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3318
3319 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3320
3321 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3322 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3323 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3324 }
3325
// E is PH rounded to an integral value; it becomes the FLDEXP exponent.
3326 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3327
3328 // It is unsafe to contract this fsub into the PH multiply.
3329 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3330
3331 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3332 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3333 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3334
3335 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3336
// Inputs that compare (ordered) below this constant produce 0.0.
3337 SDValue UnderflowCheckConst =
3338 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3339
3340 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3341 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3342 SDValue Underflow =
3343 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3344
3345 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3346
// The overflow select is only emitted when infinities may occur.
3347 if (!Flags.hasNoInfs()) {
3348 SDValue OverflowCheckConst =
3349 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3350 SDValue Overflow =
3351 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
// NOTE(review): the initializer of Inf is absent from this listing (it skips
// from 3352 to 3354); restore it from the original source before building.
3352 SDValue Inf =
3354 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3355 }
3356
3357 return R;
3358}
3359
3360static bool isCtlzOpc(unsigned Opc) {
3361 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_POISON;
3362}
3363
3364static bool isCttzOpc(unsigned Opc) {
3365 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_POISON;
3366}
3367
// Widens an i8 / i16 bit-count operation to i32 and truncates the result back.
// Any other result type returns an empty SDValue.
3369 SelectionDAG &DAG) const {
3370 auto SL = SDLoc(Op);
3371 auto Opc = Op.getOpcode();
3372 auto Arg = Op.getOperand(0u);
3373 auto ResultVT = Op.getValueType();
3374
3375 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3376 return {};
3377
// NOTE(review): this listing skips line 3378 here; check the original source
// for a statement that is not reproduced.
3379 assert(ResultVT == Arg.getValueType());
3380
// NumExtBits is the number of bits added by widening to i32 (32 - NumBits).
3381 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3382 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3383 SDValue NewOp;
3384
3385 if (Opc == ISD::CTLZ_ZERO_POISON) {
// The value is shifted into the top bits of the i32 before counting, so no
// correction of the count is applied afterwards.
3386 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3387 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3388 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3389 } else {
// Zero-extend, count at i32, then subtract the extra bits introduced by the
// extension.
3390 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3391 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3392 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3393 }
3394
3395 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3396}
3397
3399 SDLoc SL(Op);
3400 SDValue Src = Op.getOperand(0);
3401
3402 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3403 bool Ctlz = isCtlzOpc(Op.getOpcode());
3404 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3405
3406 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_POISON ||
3407 Op.getOpcode() == ISD::CTTZ_ZERO_POISON;
3408 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3409
3410 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3411 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3412 // (cttz hi:lo) -> (umin (ffbl src), 32)
3413 // (ctlz_zero_poison src) -> (ffbh src)
3414 // (cttz_zero_poison src) -> (ffbl src)
3415
3416 // 64-bit scalar version produce 32-bit result
3417 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3418 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3419 // (ctlz_zero_poison src) -> (S_FLBIT_I32_B64 src)
3420 // (cttz_zero_poison src) -> (S_FF1_I32_B64 src)
3421 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3422 if (!ZeroUndef) {
3423 const SDValue ConstVal = DAG.getConstant(
3424 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3425 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3426 }
3427 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3428 }
3429
3430 SDValue Lo, Hi;
3431 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3432
3433 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3434 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3435
3436 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3437 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3438 // (ctlz_zero_poison hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3439 // (cttz_zero_poison hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3440
3441 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3442 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3443 if (Ctlz)
3444 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3445 else
3446 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3447
3448 SDValue NewOpr;
3449 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3450 if (!ZeroUndef) {
3451 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3452 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3453 }
3454
3455 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3456}
3457
3459 SDLoc SL(Op);
3460 SDValue Src = Op.getOperand(0);
3461 assert(Src.getValueType() == MVT::i32 && "LowerCTLS only supports i32");
3462 SDValue Ffbh = DAG.getNode(
3463 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3464 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Src);
3465 SDValue Clamped = DAG.getNode(ISD::UMIN, SL, MVT::i32, Ffbh,
3466 DAG.getConstant(32, SL, MVT::i32));
3467 return DAG.getNode(ISD::ADD, SL, MVT::i32, Clamped,
3468 DAG.getAllOnesConstant(SL, MVT::i32));
3469}
3470
3472 EVT FP16Ty) const {
3473 assert(FP16Ty == MVT::f16 || FP16Ty == MVT::bf16);
3474 SDLoc SL(Op);
3475 SDValue Src = Op.getOperand(0);
3476 SDValue ToF32 = DAG.getNode(Op.getOpcode(), SL, MVT::f32, Src);
3477 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3478 return DAG.getNode(ISD::FP_ROUND, SL, FP16Ty, ToF32, FPRoundFlag);
3479}
3480
3482 bool Signed) const {
3483 // The regular method converting a 64-bit integer to float roughly consists of
3484 // 2 steps: normalization and rounding. In fact, after normalization, the
3485 // conversion from a 64-bit integer to a float is essentially the same as the
3486 // one from a 32-bit integer. The only difference is that it has more
3487 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3488 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3489 // converted into the correct float number. The basic steps for the unsigned
3490 // conversion are illustrated in the following pseudo code:
3491 //
3492 // f32 uitofp(i64 u) {
3493 // i32 hi, lo = split(u);
3494 // // Only count the leading zeros in hi as we have native support of the
3495 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3496 // // reduced to a 32-bit one automatically.
3497 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3498 // u <<= shamt;
3499 // hi, lo = split(u);
3500 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3501 // // convert it as a 32-bit integer and scale the result back.
3502 // return uitofp(hi) * 2^(32 - shamt);
3503 // }
3504 //
3505 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3506 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3507 // converted instead followed by negation based its sign bit.
3508
3509 SDLoc SL(Op);
3510 SDValue Src = Op.getOperand(0);
3511
3512 SDValue Lo, Hi;
3513 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3514 SDValue Sign;
3515 SDValue ShAmt;
3516 if (Signed && Subtarget->isGCN()) {
3517 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3518 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3519 // account. That is, the maximal shift is
3520 // - 32 if Lo and Hi have opposite signs;
3521 // - 33 if Lo and Hi have the same sign.
3522 //
3523 // Or, MaxShAmt = 33 + OppositeSign, where
3524 //
3525 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3526 // - -1 if Lo and Hi have opposite signs; and
3527 // - 0 otherwise.
3528 //
3529 // All in all, ShAmt is calculated as
3530 //
3531 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3532 //
3533 // or
3534 //
3535 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3536 //
3537 // to reduce the critical path.
3538 SDValue OppositeSign = DAG.getNode(
3539 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3540 DAG.getConstant(31, SL, MVT::i32));
3541 SDValue MaxShAmt =
3542 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3543 OppositeSign);
3544 // Count the leading sign bits.
3545 ShAmt = DAG.getNode(
3546 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3547 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Hi);
3548 // Different from unsigned conversion, the shift should be one bit less to
3549 // preserve the sign bit.
3550 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3551 DAG.getConstant(1, SL, MVT::i32));
3552 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3553 } else {
3554 if (Signed) {
3555 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3556 // absolute value first.
3557 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3558 DAG.getConstant(63, SL, MVT::i64));
3559 SDValue Abs =
3560 DAG.getNode(ISD::XOR, SL, MVT::i64,
3561 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3562 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3563 }
3564 // Count the leading zeros.
3565 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3566 // The shift amount for signed integers is [0, 32].
3567 }
3568 // Normalize the given 64-bit integer.
3569 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3570 // Split it again.
3571 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3572 // Calculate the adjust bit for rounding.
3573 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3574 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3575 DAG.getConstant(1, SL, MVT::i32), Lo);
3576 // Get the 32-bit normalized integer.
3577 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3578 // Convert the normalized 32-bit integer into f32.
3579
3580 bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
3581 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3582 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3583
3584 // Finally, need to scale back the converted floating number as the original
3585 // 64-bit integer is converted as a 32-bit one.
3586 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3587 ShAmt);
3588 // On GCN, use LDEXP directly.
3589 if (UseLDEXP)
3590 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3591
3592 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3593 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3594 // exponent is enough to avoid overflowing into the sign bit.
3595 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3596 DAG.getConstant(23, SL, MVT::i32));
3597 SDValue IVal =
3598 DAG.getNode(ISD::ADD, SL, MVT::i32,
3599 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3600 if (Signed) {
3601 // Set the sign bit.
3602 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3603 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3604 DAG.getConstant(31, SL, MVT::i32));
3605 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3606 }
3607 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3608}
3609
// Converts a 64-bit integer to f64 by converting the two 32-bit halves
// separately: the result is ldexp(CvtHi, 32) + uint_to_fp(Lo).
3611 bool Signed) const {
3612 SDLoc SL(Op);
3613 SDValue Src = Op.getOperand(0);
3614
3615 SDValue Lo, Hi;
3616 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3617
// NOTE(review): the line that declares CvtHi and starts this call is absent
// from this listing (it skips line 3618); only the trailing arguments remain.
// Presumably the Signed parameter selects the conversion opcode there -- confirm
// against the original source.
3619 SL, MVT::f64, Hi);
3620
// The low half is always converted as unsigned.
3621 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3622
// Scale the converted high half by 2^32.
3623 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3624 DAG.getConstant(32, SL, MVT::i32));
3625 // TODO: Should this propagate fast-math-flags?
3626 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3627}
3628
3630 SelectionDAG &DAG) const {
3631 // TODO: Factor out code common with LowerSINT_TO_FP.
3632 EVT DestVT = Op.getValueType();
3633 SDValue Src = Op.getOperand(0);
3634 EVT SrcVT = Src.getValueType();
3635
3636 if (SrcVT == MVT::i16) {
3637 if (DestVT == MVT::f16)
3638 return Op;
3639 SDLoc DL(Op);
3640
3641 // Promote src to i32
3642 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3643 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3644 }
3645
3646 if (DestVT == MVT::bf16 || DestVT == MVT::f16)
3647 return LowerINT_TO_FP16(Op, DAG, DestVT);
3648
3649 if (SrcVT != MVT::i64)
3650 return Op;
3651
3652 if (DestVT == MVT::f32)
3653 return LowerINT_TO_FP32(Op, DAG, false);
3654
3655 assert(DestVT == MVT::f64);
3656 return LowerINT_TO_FP64(Op, DAG, false);
3657}
3658
3660 SelectionDAG &DAG) const {
3661 EVT DestVT = Op.getValueType();
3662
3663 SDValue Src = Op.getOperand(0);
3664 EVT SrcVT = Src.getValueType();
3665
3666 if (SrcVT == MVT::i16) {
3667 if (DestVT == MVT::f16)
3668 return Op;
3669
3670 SDLoc DL(Op);
3671 // Promote src to i32
3672 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3673 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3674 }
3675
3676 if (DestVT == MVT::bf16 || DestVT == MVT::f16)
3677 return LowerINT_TO_FP16(Op, DAG, DestVT);
3678
3679 if (SrcVT != MVT::i64)
3680 return Op;
3681
3682 // TODO: Factor out code common with LowerUINT_TO_FP.
3683
3684 if (DestVT == MVT::f32)
3685 return LowerINT_TO_FP32(Op, DAG, true);
3686
3687 assert(DestVT == MVT::f64);
3688 return LowerINT_TO_FP64(Op, DAG, true);
3689}
3690
// Converts an f32 / f64 value to a 64-bit integer assembled from two 32-bit
// halves. For a signed f32 source the conversion is done on the absolute value
// and the integer result is negated afterwards when the input was negative.
3692 bool Signed) const {
3693 SDLoc SL(Op);
3694
3695 SDValue Src = Op.getOperand(0);
3696 EVT SrcVT = Src.getValueType();
3697
3698 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3699
3700 // The basic idea of converting a floating point number into a pair of 32-bit
3701 // integers is illustrated as follows:
3702 //
3703 // tf := trunc(val);
3704 // hif := floor(tf * 2^-32);
3705 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3706 // hi := fptoi(hif);
3707 // lo := fptoi(lof);
3708 //
3709 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3710 SDValue Sign;
3711 if (Signed && SrcVT == MVT::f32) {
3712 // However, a 32-bit floating point number has only 23 bits mantissa and
3713 // it's not enough to hold all the significant bits of `lof` if val is
3714 // negative. To avoid the loss of precision, We need to take the absolute
3715 // value after truncating and flip the result back based on the original
3716 // signedness.
// Sign becomes all ones when the sign bit of Trunc is set and all zeros
// otherwise (arithmetic shift right by 31 of the bit pattern).
3717 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3718 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3719 DAG.getConstant(31, SL, MVT::i32));
3720 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3721 }
3722
// K0 = 2^-32 and K1 = -2^32 in the source type, given as raw bit patterns.
3723 SDValue K0, K1;
3724 if (SrcVT == MVT::f64) {
3725 K0 = DAG.getConstantFP(
3726 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3727 SrcVT);
3728 K1 = DAG.getConstantFP(
3729 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3730 SrcVT);
3731 } else {
3732 K0 = DAG.getConstantFP(
3733 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3734 K1 = DAG.getConstantFP(
3735 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3736 }
3737 // TODO: Should this propagate fast-math-flags?
3738 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3739
// hif = floor(tf * 2^-32)
3740 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3741
// lof = fma(hif, -2^32, tf)
3742 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3743
// NOTE(review): the second arm of this conditional opcode is absent from this
// listing (it skips line 3745); restore it from the original source.
3744 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3746 SL, MVT::i32, FloorMul);
3747 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3748
// Assemble the i64 from the {Lo, Hi} pair.
3749 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3750 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3751
3752 if (Signed && SrcVT == MVT::f32) {
3753 assert(Sign);
3754 // Flip the result based on the signedness, which is either all 0s or 1s.
3755 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3756 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3757 // r := xor(r, sign) - sign;
3758 Result =
3759 DAG.getNode(ISD::SUB, SL, MVT::i64,
3760 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3761 }
3762
3763 return Result;
3764}
3765
3767 SDLoc DL(Op);
3768 SDValue N0 = Op.getOperand(0);
3769
3770 // Convert to target node to get known bits
3771 if (N0.getValueType() == MVT::f32)
3772 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3773
3774 if (Op->getFlags().hasApproximateFuncs()) {
3775 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3776 return SDValue();
3777 }
3778
3779 return LowerF64ToF16Safe(N0, DL, DAG);
3780}
3781
3782// return node in i32
3784 SelectionDAG &DAG) const {
3785 assert(Src.getSimpleValueType() == MVT::f64);
3786
3787 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3788 // TODO: We can generate better code for True16.
3789 const unsigned ExpMask = 0x7ff;
3790 const unsigned ExpBiasf64 = 1023;
3791 const unsigned ExpBiasf16 = 15;
3792 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3793 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3794 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3795 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3796 DAG.getConstant(32, DL, MVT::i64));
3797 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3798 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3799 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3800 DAG.getConstant(20, DL, MVT::i64));
3801 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3802 DAG.getConstant(ExpMask, DL, MVT::i32));
3803 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3804 // add the f16 bias (15) to get the biased exponent for the f16 format.
3805 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3806 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3807
3808 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3809 DAG.getConstant(8, DL, MVT::i32));
3810 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3811 DAG.getConstant(0xffe, DL, MVT::i32));
3812
3813 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3814 DAG.getConstant(0x1ff, DL, MVT::i32));
3815 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3816
3817 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3818 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3819
3820 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3821 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3822 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3823 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3824
3825 // N = M | (E << 12);
3826 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3827 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3828 DAG.getConstant(12, DL, MVT::i32)));
3829
3830 // B = clamp(1-E, 0, 13);
3831 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3832 One, E);
3833 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3834 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3835 DAG.getConstant(13, DL, MVT::i32));
3836
3837 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3838 DAG.getConstant(0x1000, DL, MVT::i32));
3839
3840 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3841 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3842 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3843 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3844
3845 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3846 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3847 DAG.getConstant(0x7, DL, MVT::i32));
3848 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3849 DAG.getConstant(2, DL, MVT::i32));
3850 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3851 One, Zero, ISD::SETEQ);
3852 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3853 One, Zero, ISD::SETGT);
3854 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3855 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3856
3857 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3858 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3859 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3860 I, V, ISD::SETEQ);
3861
3862 // Extract the sign bit.
3863 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3864 DAG.getConstant(16, DL, MVT::i32));
3865 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3866 DAG.getConstant(0x8000, DL, MVT::i32));
3867
3868 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3869}
3870
// NOTE(review): the first line of this definition (listing line 3871: return
// type, function name and the `Op` parameter) is missing from this extract.
//
// Custom lowering for a floating-point to integer conversion node \p Op.
// The cases are tried in this order:
//   - f16 -> i16: returned unchanged.
//   - bf16 source, or f16 -> i32: the source is first extended to f32 and the
//     same opcode is re-emitted on the extended value.
//   - f32/f64 -> i16: converted to i32, then truncated to i16.
//   - any other destination that is not i64: returned unchanged.
//   - i64 from f16 (or from an f32 produced by FP16_TO_FP): converted to i32
//     and then extended to i64.
//   - i64 from f32/f64: forwarded to LowerFP_TO_INT64, passing whether the
//     opcode is ISD::FP_TO_SINT as the third argument.
// Every remaining combination yields an empty SDValue.
3872 SelectionDAG &DAG) const {
3873 SDValue Src = Op.getOperand(0);
3874 unsigned OpOpcode = Op.getOpcode();
3875 EVT SrcVT = Src.getValueType();
3876 EVT DestVT = Op.getValueType();
3877
3878 // Will be selected natively
3879 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3880 return Op;
3881
3882 if (SrcVT == MVT::bf16 || (SrcVT == MVT::f16 && DestVT == MVT::i32)) {
3883 SDLoc DL(Op);
3884 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3885 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3886 }
3887
3888 // Promote i16 to i32
3889 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3890 SDLoc DL(Op);
3891
3892 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3893 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3894 }
3895
3896 if (DestVT != MVT::i64)
3897 return Op;
3898
// Half-precision sources are converted at i32 width and widened afterwards.
3899 if (SrcVT == MVT::f16 ||
3900 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3901 SDLoc DL(Op);
3902
3903 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3904 unsigned Ext =
// NOTE(review): listing line 3905 (the initializer expression of `Ext`, i.e.
// the choice of extension opcode) is missing from this extract.
3906 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3907 }
3908
3909 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3910 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3911
3912 return SDValue();
3913}
3914
// NOTE(review): the first line of this definition (listing line 3915: return
// type, function name and the `Op` parameter) is missing from this extract.
//
// Custom lowering for a saturating floating-point to integer conversion.
// Operand 0 of \p Op is the source value and operand 1 is a VTSDNode holding
// the saturation type. The lowering proceeds as follows:
//   1. Saturation width equal to the result width, for a fixed list of
//      type pairs: the node is returned unchanged.
//   2. Any other vector result: empty SDValue.
//   3. Saturation narrower than the result (and at most 32 bits): convert with
//      saturation at i16 or i32, clamp with integer min/max to the requested
//      width, then extend or truncate to the result type.
//   4. i64 result from f16/bf16 (or f32 produced by FP16_TO_FP): re-emit the
//      node saturating at i32.
//   5. i32 result from f16/bf16: extend the source to f32 and re-emit.
//   6. Result narrower than 32 bits: re-emit with an i16/i32 result type and
//      truncate.
// Anything else yields an empty SDValue.
3916 SelectionDAG &DAG) const {
3917 SDValue Src = Op.getOperand(0);
3918 unsigned OpOpcode = Op.getOpcode();
3919 EVT SrcVT = Src.getValueType();
3920 EVT DstVT = Op.getValueType();
3921 SDValue SatVTOp = Op.getNode()->getOperand(1);
3922 EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT();
3923 SDLoc DL(Op);
3924
3925 uint64_t DstWidth = DstVT.getScalarSizeInBits();
3926 uint64_t SatWidth = SatVT.getScalarSizeInBits();
3927 assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");
3928
3929 // Scalar cases will be selected natively to v_cvt_/s_cvt_ instructions.
3930 // v2f32 -> v2i16 will be selected natively to v_cvt_pk_[iu]16_f32.
// NOTE(review): this check returns i16 <- f32 unchanged, while the comment on
// the DstWidth < 32 promotion further down says that step covers i16.f32.
// With SatWidth == DstWidth == 16 and an f32 source the promotion is never
// reached. Confirm which of the two is intended.
3931 if (SatWidth == DstWidth) {
3932 if ((DstVT == MVT::i32 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) ||
3933 (DstVT == MVT::i16 && (SrcVT == MVT::f16 || SrcVT == MVT::f32)) ||
3934 (DstVT == MVT::v2i16 && SrcVT == MVT::v2f32))
3935 return Op;
3936 }
3937
3938 // Vectors can only be selected natively.
3939 if (DstVT.isVector())
3940 return SDValue();
3941
3942 // Perform all saturation at selected width (i16 or i32) and truncate
3943 if (SatWidth < DstWidth && SatWidth <= 32) {
3944 // For f16 conversion with sub-i16 saturation perform saturation
3945 // at i16, if available in the target. This removes the need for extra f16
3946 // to f32 conversion. For all the others use i32.
3947 MVT ResultVT =
3948 Subtarget->has16BitInsts() && SrcVT == MVT::f16 && SatWidth < 16
3949 ? MVT::i16
3950 : MVT::i32;
3951
3952 const SDValue ResultVTOp = DAG.getValueType(ResultVT);
3953 const uint64_t ResultWidth = ResultVT.getScalarSizeInBits();
3954
3955 // First, convert input float into selected integer (i16 or i32)
3956 SDValue FpToInt = DAG.getNode(OpOpcode, DL, ResultVT, Src, ResultVTOp);
3957 SDValue IntSatVal;
3958
3959 // Then, clamp at the saturation width using either i16 or i32 instructions
3960 if (OpOpcode == ISD::FP_TO_SINT_SAT) {
// The constants are named after the operation that consumes them: MinConst
// is the operand of SMIN (the upper bound, signed max of SatWidth) and
// MaxConst is the operand of SMAX (the lower bound, signed min of SatWidth).
3961 SDValue MinConst = DAG.getConstant(
3962 APInt::getSignedMaxValue(SatWidth).sext(ResultWidth), DL, ResultVT);
3963 SDValue MaxConst = DAG.getConstant(
3964 APInt::getSignedMinValue(SatWidth).sext(ResultWidth), DL, ResultVT);
3965 SDValue MinVal = DAG.getNode(ISD::SMIN, DL, ResultVT, FpToInt, MinConst);
3966 IntSatVal = DAG.getNode(ISD::SMAX, DL, ResultVT, MinVal, MaxConst);
3967 } else {
// Unsigned case: only an upper bound is applied here.
3968 SDValue MinConst = DAG.getConstant(
3969 APInt::getMaxValue(SatWidth).zext(ResultWidth), DL, ResultVT);
3970 IntSatVal = DAG.getNode(ISD::UMIN, DL, ResultVT, FpToInt, MinConst);
3971 }
3972
3973 // Finally, after saturating at i16 or i32 fit into the destination type
3974 return DAG.getExtOrTrunc(OpOpcode == ISD::FP_TO_SINT_SAT, IntSatVal, DL,
3975 DstVT);
3976 }
3977
3978 // SatWidth == DstWidth or SatWidth > 32
3979
3980 // Saturate at i32 for i64 dst and f16/bf16 src (will invoke f16 promotion
3981 // below)
3982 if (DstVT == MVT::i64 &&
3983 (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
3984 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
3985 const SDValue Int32VTOp = DAG.getValueType(MVT::i32);
3986 return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VTOp);
3987 }
3988
3989 // Promote f16/bf16 src to f32 for i32 conversion
3990 if (DstVT == MVT::i32 && (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
3991 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3992 return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp);
3993 }
3994
3995 // For DstWidth < 16, promote i1 and i8 dst to i16 (if legal) with sub-i16
3996 // saturation. For DstWidth == 16, promote i16 dst to i32 with sub-i32
3997 // saturation; this covers i16.f32 and i16.f64
3998 if (DstWidth < 32) {
3999 // Note: this triggers SatWidth < DstWidth above to generate saturated
4000 // truncate by requesting MVT::i16/i32 destination with SatWidth < 16/32.
4001 MVT PromoteVT =
4002 (DstWidth < 16 && Subtarget->has16BitInsts()) ? MVT::i16 : MVT::i32;
4003 SDValue FpToInt = DAG.getNode(OpOpcode, DL, PromoteVT, Src, SatVTOp);
4004 return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt);
4005 }
4006
4007 // TODO: can we implement i64 dst for f32/f64?
4008
4009 return SDValue();
4010}
4011
// NOTE(review): the first line of this definition (listing line 4012) is
// missing from this extract.
//
// Lowers a vector SIGN_EXTEND_INREG by scalarizing it: every element of the
// source vector is extracted, a scalar SIGN_EXTEND_INREG (using the scalar
// type of the VT operand) is emitted for each one, and the results are
// reassembled with a build_vector of the original vector type.
// \p Op must have a vector result type (asserted).
4013 SelectionDAG &DAG) const {
4014 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
4015 MVT VT = Op.getSimpleValueType();
4016 MVT ScalarVT = VT.getScalarType();
4017
4018 assert(VT.isVector());
4019
4020 SDValue Src = Op.getOperand(0);
4021 SDLoc DL(Op);
4022
4023 // TODO: Don't scalarize on Evergreen?
4024 unsigned NElts = VT.getVectorNumElements();
// NOTE(review): listing line 4025 (the declaration of `Args`, the container
// that receives the extracted elements) is missing from this extract.
4026 DAG.ExtractVectorElements(Src, Args, 0, NElts);
4027
4028 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
4029 for (unsigned I = 0; I < NElts; ++I)
4030 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
4031
4032 return DAG.getBuildVector(VT, DL, Args);
4033}
4034
4035//===----------------------------------------------------------------------===//
4036// Custom DAG optimizations
4037//===----------------------------------------------------------------------===//
4038
4039static bool isU24(SDValue Op, SelectionDAG &DAG) {
4040 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
4041}
4042
// Predicate used to decide whether \p Op may be treated as a signed 24-bit
// value. The visible part of the condition requires the value type to be at
// least 24 bits wide; narrower types are rejected here.
4043static bool isI24(SDValue Op, SelectionDAG &DAG) {
4044 EVT VT = Op.getValueType();
4045 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
4046 // as unsigned 24-bit values.
// NOTE(review): listing line 4047 (the right-hand side of the `&&`, which
// completes the return expression) is missing from this extract.
4048}
4049
// NOTE(review): the signature of this definition (listing lines 4050-4051,
// which declare `Node24` and `DCI`) is missing from this extract.
//
// Simplifies the operands of a 24-bit multiply, of which only the low 24 bits
// of each operand are demanded. \p Node24 is either an INTRINSIC_WO_CHAIN
// node for one of the amdgcn mul/mulhi i24/u24 intrinsics (operands at index
// 1 and 2) or a node whose multiplicands are operands 0 and 1.
//
// Returns:
//   - a new node (the intrinsic form is rewritten to the matching AMDGPUISD
//     opcode) when SimplifyMultipleUseDemandedBits simplified an operand;
//   - SDValue(Node24, 0) when SimplifyDemandedBits changed an operand in place;
//   - an empty SDValue when nothing could be simplified.
4052 SelectionDAG &DAG = DCI.DAG;
4053 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
4054 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
4055
// For the intrinsic form operand 0 is the intrinsic ID, so the multiplicands
// are shifted by one position.
4056 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
4057 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
4058 unsigned NewOpcode = Node24->getOpcode();
4059 if (IsIntrin) {
4060 unsigned IID = Node24->getConstantOperandVal(0);
4061 switch (IID) {
4062 case Intrinsic::amdgcn_mul_i24:
4063 NewOpcode = AMDGPUISD::MUL_I24;
4064 break;
4065 case Intrinsic::amdgcn_mul_u24:
4066 NewOpcode = AMDGPUISD::MUL_U24;
4067 break;
4068 case Intrinsic::amdgcn_mulhi_i24:
4069 NewOpcode = AMDGPUISD::MULHI_I24;
4070 break;
4071 case Intrinsic::amdgcn_mulhi_u24:
4072 NewOpcode = AMDGPUISD::MULHI_U24;
4073 break;
4074 default:
4075 llvm_unreachable("Expected 24-bit mul intrinsic");
4076 }
4077 }
4078
// Only the low 24 bits of each operand are demanded.
4079 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
4080
4081 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
4082 // the operands to have other uses, but will only perform simplifications that
4083 // involve bypassing some nodes for this user.
4084 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
4085 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
4086 if (DemandedLHS || DemandedRHS)
4087 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
4088 DemandedLHS ? DemandedLHS : LHS,
4089 DemandedRHS ? DemandedRHS : RHS);
4090
4091 // Now try SimplifyDemandedBits which can simplify the nodes used by our
4092 // operands if this node is the only user.
4093 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
4094 return SDValue(Node24, 0);
4095 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
4096 return SDValue(Node24, 0);
4097
4098 return SDValue();
4099}
4100
// Constant-folds the extraction of a \p Width bit field starting at bit
// \p Offset from the 32-bit value \p Src0 and returns it as an i32 constant.
//
// When Offset + Width < 32 the field is first shifted to the top of a 32-bit
// word and then shifted back down in IntTy, so the kind of right shift
// (and therefore how the upper bits are filled) follows the signedness of
// IntTy. A signed IntTy produces the constant via getSignedConstant, an
// unsigned one via getConstant. Otherwise the result is simply
// Src0 >> Offset.
4101template <typename IntTy>
// NOTE(review): listing line 4102 (return type, function name and the
// `DAG`, `Src0` and `Offset` parameters) is missing from this extract.
4103 uint32_t Width, const SDLoc &DL) {
4104 if (Width + Offset < 32) {
4105 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
4106 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
4107 if constexpr (std::is_signed_v<IntTy>) {
4108 return DAG.getSignedConstant(Result, DL, MVT::i32);
4109 } else {
4110 return DAG.getConstant(Result, DL, MVT::i32);
4111 }
4112 }
4113
4114 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
4115}
4116
4117static bool hasVolatileUser(SDNode *Val) {
4118 for (SDNode *U : Val->users()) {
4119 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
4120 if (M->isVolatile())
4121 return true;
4122 }
4123 }
4124
4125 return false;
4126}
4127
// NOTE(review): the signature of this definition (listing line 4128, which
// declares the `VT` parameter) is missing from this extract.
//
// Decides whether a memory access of type VT should be rewritten to use an
// equivalent, friendlier memory type. Returns false when:
//   - VT already has i32 elements or is a legal type;
//   - VT is not byte sized;
//   - VT is a scalar of 1, 2 or 4 bytes;
//   - the store size is 3 bytes, or larger than 4 bytes but not a multiple
//     of 4.
// Returns true in every other case.
4129 // i32 vectors are the canonical memory type.
4130 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
4131 return false;
4132
4133 if (!VT.isByteSized())
4134 return false;
4135
4136 unsigned Size = VT.getStoreSize();
4137
4138 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
4139 return false;
4140
4141 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
4142 return false;
4143
4144 return true;
4145}
4146
4147// Replace load of an illegal type with a bitcast from a load of a friendlier
4148// type.
//
// NOTE(review): the first line of this definition (listing line 4149, which
// declares the node parameter `N`) is missing from this extract.
//
// Runs only before legalization and only on simple, normal loads that have no
// volatile user. An under-aligned load of a legal type is expanded early
// when the target does not allow the misaligned access (vector loads are
// split, scalar loads go through expandUnalignedLoad), and is left alone when
// the access is allowed but not fast. Otherwise, if shouldCombineMemoryType
// accepts the memory type, the load is re-issued with the type returned by
// getEquivalentMemType and the result is bitcast back to the original type.
4150 DAGCombinerInfo &DCI) const {
4151 if (!DCI.isBeforeLegalize())
4152 return SDValue();
4153
// NOTE(review): listing line 4154 (the definition of `LN`, the load node
// examined below) is missing from this extract.
4155 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
4156 return SDValue();
4157
4158 SDLoc SL(N);
4159 SelectionDAG &DAG = DCI.DAG;
4160 EVT VT = LN->getMemoryVT();
4161
4162 unsigned Size = VT.getStoreSize();
4163 Align Alignment = LN->getAlign();
4164 if (Alignment < Size && isTypeLegal(VT)) {
4165 unsigned IsFast;
4166 unsigned AS = LN->getAddressSpace();
4167
4168 // Expand unaligned loads earlier than legalization. Due to visitation order
4169 // problems during legalization, the emitted instructions to pack and unpack
4170 // the bytes again are not eliminated in the case of an unaligned copy.
// NOTE(review): listing line 4171 (the start of the `if` whose call arguments
// continue on the next line) is missing from this extract.
4172 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
4173 if (VT.isVector())
4174 return SplitVectorLoad(SDValue(LN, 0), DAG);
4175
4176 SDValue Ops[2];
4177 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
4178
4179 return DAG.getMergeValues(Ops, SDLoc(N));
4180 }
4181
4182 if (!IsFast)
4183 return SDValue();
4184 }
4185
4186 if (!shouldCombineMemoryType(VT))
4187 return SDValue();
4188
4189 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4190
// Same chain, pointer and memory operand as the original load; only the
// loaded type changes.
4191 SDValue NewLoad
4192 = DAG.getLoad(NewVT, SL, LN->getChain(),
4193 LN->getBasePtr(), LN->getMemOperand());
4194
// Replace both results of N: the value (via bitcast) and the chain.
4195 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
4196 DCI.CombineTo(N, BC, NewLoad.getValue(1));
4197 return SDValue(N, 0);
4198}
4199
4200// Replace store of an illegal type with a store of a bitcast to a friendlier
4201// type.
//
// NOTE(review): the first line of this definition (listing line 4202, which
// declares the node parameter `N`) is missing from this extract.
//
// Runs only before legalization and only on simple, normal stores. An
// under-aligned store of a legal type is expanded early when the target does
// not allow the misaligned access (vector stores are split, scalar stores go
// through expandUnalignedStore), and is left alone when the access is allowed
// but not fast. Otherwise, if shouldCombineMemoryType accepts the memory
// type, the stored value is bitcast to the type returned by
// getEquivalentMemType and stored with the original chain, pointer and
// memory operand.
4203 DAGCombinerInfo &DCI) const {
4204 if (!DCI.isBeforeLegalize())
4205 return SDValue();
4206
// NOTE(review): listing line 4207 (the definition of `SN`, the store node
// examined below) is missing from this extract.
4208 if (!SN->isSimple() || !ISD::isNormalStore(SN))
4209 return SDValue();
4210
4211 EVT VT = SN->getMemoryVT();
4212 unsigned Size = VT.getStoreSize();
4213
4214 SDLoc SL(N);
4215 SelectionDAG &DAG = DCI.DAG;
4216 Align Alignment = SN->getAlign();
4217 if (Alignment < Size && isTypeLegal(VT)) {
4218 unsigned IsFast;
4219 unsigned AS = SN->getAddressSpace();
4220
4221 // Expand unaligned stores earlier than legalization. Due to visitation
4222 // order problems during legalization, the emitted instructions to pack and
4223 // unpack the bytes again are not eliminated in the case of an unaligned
4224 // copy.
// NOTE(review): listing line 4225 (the start of the `if` whose call arguments
// continue on the next line) is missing from this extract.
4226 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
4227 if (VT.isVector())
4228 return SplitVectorStore(SDValue(SN, 0), DAG);
4229
4230 return expandUnalignedStore(SN, DAG);
4231 }
4232
4233 if (!IsFast)
4234 return SDValue();
4235 }
4236
4237 if (!shouldCombineMemoryType(VT))
4238 return SDValue();
4239
4240 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
4241 SDValue Val = SN->getValue();
4242
4243 // DCI.AddToWorklist(Val.getNode());
4244
// The use count is sampled before the bitcast below adds another user of Val.
4245 bool OtherUses = !Val.hasOneUse();
4246 SDValue CastVal = DAG.getBitcast(NewVT, Val);
4247 if (OtherUses) {
// When Val had users besides this store, all uses of Val are replaced with
// a bitcast of CastVal back to the original type.
4248 SDValue CastBack = DAG.getBitcast(VT, CastVal);
4249 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
4250 }
4251
4252 return DAG.getStore(SN->getChain(), SL, CastVal,
4253 SN->getBasePtr(), SN->getMemOperand());
4254}
4255
4256// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4257// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4258// issues.
//
// NOTE(review): the first line of this definition (listing line 4259, which
// declares the node parameter `N`) is missing from this extract.
//
// Moves the assertion node \p N (operand 1 is a VTSDNode with the asserted
// type) above a TRUNCATE feeding it: the assertion is re-created with N's own
// opcode on the untruncated source and the truncate is applied afterwards.
// This is only done when the source type is at least as wide as the asserted
// type. Returns an empty SDValue when the pattern does not match.
4260 DAGCombinerInfo &DCI) const {
4261 SelectionDAG &DAG = DCI.DAG;
4262 SDValue N0 = N->getOperand(0);
4263
4264 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4265 // (vt2 (truncate (assertzext vt0:x, vt1)))
4266 if (N0.getOpcode() == ISD::TRUNCATE) {
4267 SDValue N1 = N->getOperand(1);
4268 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4269 SDLoc SL(N);
4270
4271 SDValue Src = N0.getOperand(0);
4272 EVT SrcVT = Src.getValueType();
4273 if (SrcVT.bitsGE(ExtVT)) {
4274 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4275 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4276 }
4277 }
4278
4279 return SDValue();
4280}
4281
// NOTE(review): the first line of this definition (listing line 4282: return
// type and function name) is missing from this extract.
//
// Combine for intrinsic nodes whose operand 0 is the intrinsic ID:
//   - the four 24-bit multiply intrinsics are forwarded to simplifyMul24;
//   - for the listed unary intrinsics an undef source operand is propagated
//     as the result;
//   - for amdgcn_frexp_exp the sign operations found by peekFPSignOps on the
//     source are skipped by updating the node's operand in place.
// Every other intrinsic, and every non-matching case, yields an empty SDValue.
4283 SDNode *N, DAGCombinerInfo &DCI) const {
4284 unsigned IID = N->getConstantOperandVal(0);
4285 switch (IID) {
4286 case Intrinsic::amdgcn_mul_i24:
4287 case Intrinsic::amdgcn_mul_u24:
4288 case Intrinsic::amdgcn_mulhi_i24:
4289 case Intrinsic::amdgcn_mulhi_u24:
4290 return simplifyMul24(N, DCI);
4291 case Intrinsic::amdgcn_fract:
4292 case Intrinsic::amdgcn_rsq:
4293 case Intrinsic::amdgcn_rcp_legacy:
4294 case Intrinsic::amdgcn_rsq_legacy:
4295 case Intrinsic::amdgcn_rsq_clamp:
4296 case Intrinsic::amdgcn_tanh:
4297 case Intrinsic::amdgcn_prng_b32: {
4298 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4299 SDValue Src = N->getOperand(1);
4300 return Src.isUndef() ? Src : SDValue();
4301 }
4302 case Intrinsic::amdgcn_frexp_exp: {
4303 // frexp_exp (fneg x) -> frexp_exp x
4304 // frexp_exp (fabs x) -> frexp_exp x
4305 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4306 SDValue Src = N->getOperand(1);
4307 SDValue PeekSign = peekFPSignOps(Src);
// Nothing was peeled off, so there is nothing to fold.
4308 if (PeekSign == Src)
4309 return SDValue();
4310 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4311 0);
4312 }
4313 default:
4314 return SDValue();
4315 }
4316}
4317
4318/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4319/// binary operation \p Opc to it with the corresponding constant operands.
///
/// \p ValLo is applied to the low component and \p ValHi to the high one. The
/// two i32 results are packed into a v2i32 build_vector (low element first)
/// and returned bitcast to i64.
// NOTE(review): the first line of this definition (listing line 4320: return
// type and function name) is missing from this extract.
4321 DAGCombinerInfo &DCI, const SDLoc &SL,
4322 unsigned Opc, SDValue LHS,
4323 uint32_t ValLo, uint32_t ValHi) const {
4324 SelectionDAG &DAG = DCI.DAG;
4325 SDValue Lo, Hi;
4326 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4327
4328 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4329 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4330
// The names say "And", but the opcode actually used is whatever Opc is.
4331 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4332 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4333
4334 // Re-visit the ands. It's possible we eliminated one of them and it could
4335 // simplify the vector.
4336 DCI.AddToWorklist(Lo.getNode());
4337 DCI.AddToWorklist(Hi.getNode());
4338
4339 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4340 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4341}
4342
// NOTE(review): the first line of this definition (listing line 4343, which
// declares the node parameter `N`) is missing from this extract.
//
// Combine for a left shift (the nodes built here use ISD::SHL).
//
// With a constant shift amount:
//   - a shift by zero returns the shifted value itself;
//   - i32 (shl (ext i16:x), 16) becomes a v2i16 build_vector {0, x} bitcast
//     to i32, when that build_vector is legal;
//   - i64 (shl (ext x), C) is performed in x's own type and then extended,
//     when x has at least C known leading zero bits.
//
// For i64 element types, when the shift amount is known to be at least half
// the element width, the shift is split: the low half of the result is zero
// and the high half is the low half of the input shifted by the remaining
// amount. Returns an empty SDValue when none of the patterns apply.
4344 DAGCombinerInfo &DCI) const {
4345 EVT VT = N->getValueType(0);
4346 SDValue LHS = N->getOperand(0);
4347 SDValue RHS = N->getOperand(1);
// NOTE(review): listing line 4348 (the definition of `CRHS`, the constant
// form of the shift amount if there is one) is missing from this extract.
4349 SDLoc SL(N);
4350 SelectionDAG &DAG = DCI.DAG;
4351
// RHSVal is only assigned, and only read, when CRHS is non-null.
4352 unsigned RHSVal;
4353 if (CRHS) {
4354 RHSVal = CRHS->getZExtValue();
4355 if (!RHSVal)
4356 return LHS;
4357
4358 switch (LHS->getOpcode()) {
4359 default:
4360 break;
4361 case ISD::ZERO_EXTEND:
4362 case ISD::SIGN_EXTEND:
4363 case ISD::ANY_EXTEND: {
4364 SDValue X = LHS->getOperand(0);
4365
4366 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4367 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4368 // Prefer build_vector as the canonical form if packed types are legal.
4369 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4370 SDValue Vec = DAG.getBuildVector(
4371 MVT::v2i16, SL,
4372 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4373 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4374 }
4375
4376 // shl (ext x) => zext (shl x), if shift does not overflow int
4377 if (VT != MVT::i64)
4378 break;
4379 KnownBits Known = DAG.computeKnownBits(X);
4380 unsigned LZ = Known.countMinLeadingZeros();
// Fewer known leading zeros than the shift amount: bits could be shifted out
// of the narrow type, so the fold is skipped.
4381 if (LZ < RHSVal)
4382 break;
4383 EVT XVT = X.getValueType();
4384 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4385 return DAG.getZExtOrTrunc(Shl, SL, VT);
4386 }
4387 }
4388 }
4389
4390 if (VT.getScalarType() != MVT::i64)
4391 return SDValue();
4392
4393 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4394 // common case, splitting this into a move and a 32-bit shift is faster and
4395 // the same code size.
4396 KnownBits Known = DAG.computeKnownBits(RHS);
4397
4398 EVT ElementType = VT.getScalarType();
4399 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4400 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4401
// The split is only valid when the shift amount is known to be at least the
// half-width.
4402 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4403 return SDValue();
4404 SDValue ShiftAmt;
4405
4406 if (CRHS) {
4407 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4408 TargetType);
4409 } else {
4410 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4411 const SDValue ShiftMask =
4412 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4413 // This AND instruction will clamp out of bounds shift values.
4414 // It will also be removed during later instruction selection.
4415 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4416 }
4417
4418 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4419 SDValue NewShift =
4420 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4421
4422 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4423 SDValue Vec;
4424
4425 if (VT.isVector()) {
4426 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4427 unsigned NElts = TargetType.getVectorNumElements();
// NOTE(review): listing line 4428 (the declaration of `HiOps`) is missing
// from this extract.
4429 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4430
// Interleave: even slots keep the zero low halves, odd slots receive the
// shifted values as the high halves.
4431 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4432 for (unsigned I = 0; I != NElts; ++I)
4433 HiAndLoOps[2 * I + 1] = HiOps[I];
4434 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4435 } else {
4436 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4437 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4438 }
4439 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4440}
4441
// NOTE(review): the first line of this definition (listing line 4442, which
// declares the node parameter `N`) is missing from this extract.
//
// Combine for an arithmetic right shift (the nodes built here use ISD::SRA)
// on i64 element types. When the shift amount is known to be at least half
// the element width, the shift is split into operations on the high half of
// the input: the low half of the result is the high half shifted by the
// remaining amount, and the high half of the result is the sign fill.
// Returns an empty SDValue for other element types or when the shift amount
// is not known to be large enough.
4443 DAGCombinerInfo &DCI) const {
4444 SDValue RHS = N->getOperand(1);
// NOTE(review): listing line 4445 (the definition of `CRHS`, the constant
// form of the shift amount if there is one) is missing from this extract.
4446 EVT VT = N->getValueType(0);
4447 SDValue LHS = N->getOperand(0);
4448 SelectionDAG &DAG = DCI.DAG;
4449 SDLoc SL(N);
4450
4451 if (VT.getScalarType() != MVT::i64)
4452 return SDValue();
4453
4454 // For C >= 32
4455 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
4456
4457 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4458 // common case, splitting this into a move and a 32-bit shift is faster and
4459 // the same code size.
4460 KnownBits Known = DAG.computeKnownBits(RHS);
4461
4462 EVT ElementType = VT.getScalarType();
4463 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4464 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4465
4466 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4467 return SDValue();
4468
// Shift amount of (half-width - 1): used to produce the sign fill.
4469 SDValue ShiftFullAmt =
4470 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4471 SDValue ShiftAmt;
4472 if (CRHS) {
4473 unsigned RHSVal = CRHS->getZExtValue();
4474 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4475 TargetType);
4476 } else if (Known.getMinValue().getZExtValue() ==
4477 (ElementType.getSizeInBits() - 1)) {
// The amount is known to be at least (element width - 1), so the remaining
// half-width shift is the full sign-fill amount.
4478 ShiftAmt = ShiftFullAmt;
4479 } else {
4480 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4481 const SDValue ShiftMask =
4482 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4483 // This AND instruction will clamp out of bounds shift values.
4484 // It will also be removed during later instruction selection.
4485 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4486 }
4487
4488 EVT ConcatType;
4489 SDValue Hi;
4490 SDLoc LHSSL(LHS);
4491 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4492 if (VT.isVector()) {
4493 unsigned NElts = TargetType.getVectorNumElements();
4494 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4495 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4496 SmallVector<SDValue, 8> HiOps(NElts);
4497 SmallVector<SDValue, 16> HiAndLoOps;
4498
// The odd elements of the split vector are taken as the high halves.
4499 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4500 for (unsigned I = 0; I != NElts; ++I) {
4501 HiOps[I] = HiAndLoOps[2 * I + 1];
4502 }
4503 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4504 } else {
4505 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4506 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4507 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4508 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4509 }
4510
// NewShift becomes the low half of the result, HiShift the high half.
4511 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4512 SDValue NewShift, HiShift;
4513 if (KnownLHS.isNegative()) {
// Input known negative: the high half of the result is all ones.
4514 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4515 NewShift =
4516 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4517 } else if (CRHS &&
4518 CRHS->getZExtValue() == (ElementType.getSizeInBits() - 1)) {
// Constant shift by (element width - 1): both halves are the same value, so
// a single shift node serves as both.
4519 NewShift = HiShift =
4520 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4521 } else {
// Hi feeds two separate shifts here and is frozen first, presumably so that
// both uses observe the same value if Hi is undef or poison.
4522 Hi = DAG.getFreeze(Hi);
4523 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4524 NewShift =
4525 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4526 }
4527
4528 SDValue Vec;
4529 if (VT.isVector()) {
4530 unsigned NElts = TargetType.getVectorNumElements();
// NOTE(review): listing lines 4531-4532 (the declarations of `HiOps` and
// `LoOps`) are missing from this extract.
4533 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4534
4535 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4536 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4537 for (unsigned I = 0; I != NElts; ++I) {
4538 HiAndLoOps[2 * I + 1] = HiOps[I];
4539 HiAndLoOps[2 * I] = LoOps[I];
4540 }
4541 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4542 } else {
4543 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4544 }
4545 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4546}
4547
// NOTE(review): the first line of this definition (listing line 4548, which
// declares the node parameter `N`) is missing from this extract.
//
// Combine for a logical right shift (the nodes built here use ISD::SRL).
//
// With a constant shift amount, (srl (and x, Mask), C) is rewritten as
// (and (srl x, C), (srl Mask, C)) when Mask is a shifted mask whose lowest set
// bit is at position C.
//
// For i64 element types, when the shift amount is known to be at least half
// the element width, the shift is split: the low half of the result is the
// high half of the input shifted by the remaining amount, and the high half
// of the result is zero. Returns an empty SDValue when neither applies.
4549 DAGCombinerInfo &DCI) const {
4550 SDValue RHS = N->getOperand(1);
// NOTE(review): listing line 4551 (the definition of `CRHS`, the constant
// form of the shift amount if there is one) is missing from this extract.
4552 EVT VT = N->getValueType(0);
4553 SDValue LHS = N->getOperand(0);
4554 SelectionDAG &DAG = DCI.DAG;
4555 SDLoc SL(N);
// RHSVal is only assigned, and only read, when CRHS is non-null.
4556 unsigned RHSVal;
4557
4558 if (CRHS) {
4559 RHSVal = CRHS->getZExtValue();
4560
4561 // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
4562 // this improves the ability to match BFE patterns in isel.
4563 if (LHS.getOpcode() == ISD::AND) {
4564 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4565 unsigned MaskIdx, MaskLen;
4566 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4567 MaskIdx == RHSVal) {
4568 return DAG.getNode(ISD::AND, SL, VT,
4569 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4570 N->getOperand(1)),
4571 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4572 N->getOperand(1)));
4573 }
4574 }
4575 }
4576 }
4577
4578 if (VT.getScalarType() != MVT::i64)
4579 return SDValue();
4580
4581 // for C >= 32
4582 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4583
4584 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4585 // common case, splitting this into a move and a 32-bit shift is faster and
4586 // the same code size.
4587 KnownBits Known = DAG.computeKnownBits(RHS);
4588
4589 EVT ElementType = VT.getScalarType();
4590 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4591 EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);
4592
4593 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4594 return SDValue();
4595
4596 SDValue ShiftAmt;
4597 if (CRHS) {
4598 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4599 TargetType);
4600 } else {
4601 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4602 const SDValue ShiftMask =
4603 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4604 // This AND instruction will clamp out of bounds shift values.
4605 // It will also be removed during later instruction selection.
4606 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4607 }
4608
4609 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4610 EVT ConcatType;
4611 SDValue Hi;
4612 SDLoc LHSSL(LHS);
4613 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4614 if (VT.isVector()) {
4615 unsigned NElts = TargetType.getVectorNumElements();
4616 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4617 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4618 SmallVector<SDValue, 8> HiOps(NElts);
4619 SmallVector<SDValue, 16> HiAndLoOps;
4620
4621 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4622 for (unsigned I = 0; I != NElts; ++I)
4623 HiOps[I] = HiAndLoOps[2 * I + 1];
4624 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4625 } else {
4626 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4627 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4628 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4629 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4630 }
4631
4632 SDValue NewShift =
4633 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4634
4635 SDValue Vec;
4636 if (VT.isVector()) {
4637 unsigned NElts = TargetType.getVectorNumElements();
// NOTE(review): listing line 4638 (the declaration of `LoOps`) is missing
// from this extract.
4639 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4640
// Interleave: even slots receive the shifted values as the low halves, odd
// slots keep the zero high halves.
4641 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4642 for (unsigned I = 0; I != NElts; ++I)
4643 HiAndLoOps[2 * I] = LoOps[I];
4644 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4645 } else {
4646 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4647 }
4648 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4649}
4650
// NOTE(review): the first line of this definition (listing line 4651: return
// type and function name) is missing from this extract.
//
// Combine for a TRUNCATE node \p N. Three independent patterns are tried:
//   1. Scalar truncate of a bitcast build_vector: use element 0 directly when
//      the result is no wider than that element.
//   2. Scalar truncate of (srl (bitcast build_vector), K): when K lands on an
//      element boundary and the element has the same size as the result, use
//      that element directly.
//   3. Truncate to less than 32 bits of a shift that is wider than 32 bits:
//      perform the shift at 32 bits when the shift amount is known to be
//      small enough.
// Returns an empty SDValue when none of them match.
4652 SDNode *N, DAGCombinerInfo &DCI) const {
4653 SDLoc SL(N);
4654 SelectionDAG &DAG = DCI.DAG;
4655 EVT VT = N->getValueType(0);
4656 SDValue Src = N->getOperand(0);
4657
4658 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4659 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4660 SDValue Vec = Src.getOperand(0);
4661 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4662 SDValue Elt0 = Vec.getOperand(0);
4663 EVT EltVT = Elt0.getValueType();
4664 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
// TRUNCATE is emitted on an integer value, so a floating-point element is
// first bitcast to the integer type of the same width.
4665 if (EltVT.isFloatingPoint()) {
4666 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4667 EltVT.changeTypeToInteger(), Elt0);
4668 }
4669
4670 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4671 }
4672 }
4673 }
4674
4675 // Equivalent of above for accessing the high element of a vector as an
4676 // integer operation.
4677 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4678 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4679 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4680 SDValue BV = stripBitcast(Src.getOperand(0));
4681 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4682 EVT SrcEltVT = BV.getOperand(0).getValueType();
4683 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4684 unsigned BitIndex = K->getZExtValue();
4685 unsigned PartIndex = BitIndex / SrcEltSize;
4686
// The shift must land exactly on an element boundary and inside the vector.
4687 if (PartIndex * SrcEltSize == BitIndex &&
4688 PartIndex < BV.getNumOperands()) {
4689 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4690 SDValue SrcElt =
4691 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4692 BV.getOperand(PartIndex));
4693 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4694 }
4695 }
4696 }
4697 }
4698 }
4699
4700 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4701 //
4702 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4703 // i16 (trunc (srl (i32 (trunc x), K)))
4704 if (VT.getScalarSizeInBits() < 32) {
4705 EVT SrcVT = Src.getValueType();
4706 if (SrcVT.getScalarSizeInBits() > 32 &&
4707 (Src.getOpcode() == ISD::SRL ||
4708 Src.getOpcode() == ISD::SRA ||
4709 Src.getOpcode() == ISD::SHL)) {
4710 SDValue Amt = Src.getOperand(1);
4711 KnownBits Known = DAG.computeKnownBits(Amt);
4712
4713 // - For left shifts, do the transform as long as the shift
4714 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4715 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4716 // losing information stored in the high bits when truncating.
4717 const unsigned MaxCstSize =
4718 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4719 if (Known.getMaxValue().ule(MaxCstSize)) {
// MidVT is i32, or a vector of i32 with the same element count as VT.
4720 EVT MidVT = VT.isVector() ?
4721 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4722 VT.getVectorNumElements()) : MVT::i32;
4723
4724 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4725 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4726 Src.getOperand(0));
4727 DCI.AddToWorklist(Trunc.getNode());
4728
4729 if (Amt.getValueType() != NewShiftVT) {
4730 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4731 DCI.AddToWorklist(Amt.getNode());
4732 }
4733
4734 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4735 Trunc, Amt);
4736 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4737 }
4738 }
4739 }
4740
4741 return SDValue();
4742}
4743
4744// We need to specifically handle i64 mul here to avoid unnecessary conversion
4745// instructions. If we only match on the legalized i64 mul expansion,
4746// SimplifyDemandedBits will be unable to remove them because there will be
4747// multiple uses due to the separate mul + mulh[su].
4748static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4749 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4750 if (Size <= 32) {
4751 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4752 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4753 }
4754
4755 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4756 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4757
4758 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4759 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4760
4761 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4762}
4763
4764/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4765/// return SDValue().
4766static SDValue getAddOneOp(const SDNode *V) {
4767 if (V->getOpcode() != ISD::ADD)
4768 return SDValue();
4769
4770 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4771}
4772
4774 DAGCombinerInfo &DCI) const {
// DAG combine for an ISD::MUL node (asserted below).
// NOTE(review): the first line of this signature is missing from this listing;
// the combine dispatcher calls performMulCombine(N, DCI) for ISD::MUL, so that
// is presumably this function -- confirm against the real source.
//
// Only divergent, scalar multiplies of at most 64 bits are considered. Two
// rewrites are attempted, in this order:
//  1. mul x, (add y, 1) -> add (mul x, y), x
//  2. if both operands pass isU24 (or isI24) and the subtarget has the
//     matching instruction, the multiply becomes MUL_U24 / MUL_I24 (with a
//     MULHI_* high half above 32 bits, see getMul24), and the result is
//     sign-extended or truncated back to the original type.
// Returns an empty SDValue when nothing was changed.
4775 assert(N->getOpcode() == ISD::MUL);
4776 EVT VT = N->getValueType(0);
4777
4778 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4779 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4780 // unnecessarily). isDivergent() is used as an approximation of whether the
4781 // value is in an SGPR.
4782 if (!N->isDivergent())
4783 return SDValue();
4784
4785 unsigned Size = VT.getSizeInBits();
4786 if (VT.isVector() || Size > 64)
4787 return SDValue();
4788
4789 SelectionDAG &DAG = DCI.DAG;
4790 SDLoc DL(N);
4791
4792 SDValue N0 = N->getOperand(0);
4793 SDValue N1 = N->getOperand(1);
4794
4795 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4796 // matching.
4797
4798 // mul x, (add y, 1) -> add (mul x, y), x
// Returns y when V is (add y, 1) and either V has a single use or every user
// of V is itself a MUL; otherwise returns an empty SDValue.
4799 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4800 SDValue AddOp = getAddOneOp(V.getNode());
4801 if (!AddOp)
4802 return SDValue();
4803
4804 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4805 return U->getOpcode() == ISD::MUL;
4806 }))
4807 return AddOp;
4808
4809 return SDValue();
4810 };
4811
4812 // FIXME: The selection pattern is not properly checking for commuted
4813 // operands, so we have to place the mul in the LHS
4814 if (SDValue MulOper = IsFoldableAdd(N0)) {
4815 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4816 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4817 }
4818
4819 if (SDValue MulOper = IsFoldableAdd(N1)) {
4820 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4821 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4822 }
4823
4824 // There are i16 integer mul/mad.
// So when i16 is a legal type, multiplies of 16 bits or fewer are left alone.
4825 if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
4826 return SDValue();
4827
4828 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4829 // in the source into any_extends if the result of the mul is truncated. Since
4830 // we can assume the high bits are whatever we want, use the underlying value
4831 // to avoid the unknown high bits from interfering.
4832 if (N0.getOpcode() == ISD::ANY_EXTEND)
4833 N0 = N0.getOperand(0);
4834
4835 if (N1.getOpcode() == ISD::ANY_EXTEND)
4836 N1 = N1.getOperand(0);
4837
4838 SDValue Mul;
4839
// The unsigned form is tried first; the signed form is only used when the
// unsigned checks fail.
4840 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4841 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4842 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4843 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4844 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4845 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4846 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4847 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4848 } else {
4849 return SDValue();
4850 }
4851
4852 // We need to use sext even for MUL_U24, because MUL_U24 is used
4853 // for signed multiply of 8 and 16-bit types.
4854 return DAG.getSExtOrTrunc(Mul, DL, VT);
4855}
4856
4857SDValue
4859 DAGCombinerInfo &DCI) const {
// Combine for i32 SMUL_LOHI / UMUL_LOHI nodes: when both operands pass isI24
// (signed) or isU24 (unsigned) and the subtarget has the matching
// instruction, both results of N are replaced with a MUL_*24 (low half) and a
// MULHI_*24 (high half). Other value types are ignored.
// NOTE(review): the line naming this function is missing from this listing;
// the combine dispatcher calls performMulLoHiCombine(N, DCI) for these
// opcodes -- confirm against the real source.
4860 if (N->getValueType(0) != MVT::i32)
4861 return SDValue();
4862
4863 SelectionDAG &DAG = DCI.DAG;
4864 SDLoc DL(N);
4865
4866 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4867 SDValue N0 = N->getOperand(0);
4868 SDValue N1 = N->getOperand(1);
4869
4870 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4871 // in the source into any_extends if the result of the mul is truncated. Since
4872 // we can assume the high bits are whatever we want, use the underlying value
4873 // to avoid the unknown high bits from interfering.
4874 if (N0.getOpcode() == ISD::ANY_EXTEND)
4875 N0 = N0.getOperand(0);
4876 if (N1.getOpcode() == ISD::ANY_EXTEND)
4877 N1 = N1.getOperand(0);
4878
4879 // Try to use two fast 24-bit multiplies (one for each half of the result)
4880 // instead of one slow extending multiply.
// LoOpcode stays 0 when the operands do not qualify; that is the "no combine"
// signal tested below.
4881 unsigned LoOpcode = 0;
4882 unsigned HiOpcode = 0;
4883 if (Signed) {
4884 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4885 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4886 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4887 LoOpcode = AMDGPUISD::MUL_I24;
4888 HiOpcode = AMDGPUISD::MULHI_I24;
4889 }
4890 } else {
4891 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4892 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4893 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4894 LoOpcode = AMDGPUISD::MUL_U24;
4895 HiOpcode = AMDGPUISD::MULHI_U24;
4896 }
4897 }
4898 if (!LoOpcode)
4899 return SDValue();
4900
4901 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4902 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
// N produces two results, so both are replaced through CombineTo and N itself
// is returned rather than a single replacement value.
4903 DCI.CombineTo(N, Lo, Hi);
4904 return SDValue(N, 0);
4905}
4906
4908 DAGCombinerInfo &DCI) const {
// Combine that turns a scalar signed high-half multiply into
// AMDGPUISD::MULHI_I24 when both operands pass isI24 and the subtarget has
// the 24-bit signed multiply. The result is sign-extended or truncated back
// to the node's type. Returns an empty SDValue when it does not apply.
// NOTE(review): the first line of this signature is missing from this listing;
// the combine dispatcher calls performMulhsCombine(N, DCI) for ISD::MULHS --
// confirm against the real source.
4909 EVT VT = N->getValueType(0);
4910
4911 if (!Subtarget->hasMulI24() || VT.isVector())
4912 return SDValue();
4913
4914 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4915 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4916 // unnecessarily). isDivergent() is used as an approximation of whether the
4917 // value is in an SGPR.
4918 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4919 // valu op anyway)
4920 if (Subtarget->hasSMulHi() && !N->isDivergent())
4921 return SDValue();
4922
4923 SelectionDAG &DAG = DCI.DAG;
4924 SDLoc DL(N);
4925
4926 SDValue N0 = N->getOperand(0);
4927 SDValue N1 = N->getOperand(1);
4928
4929 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4930 return SDValue();
4931
4932 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4933 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4934
4935 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
// Queue the new node so the combiner visits it as well.
4936 DCI.AddToWorklist(Mulhi.getNode());
4937 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4938}
4939
4941 DAGCombinerInfo &DCI) const {
// Combine that turns a scalar unsigned high-half multiply of at most 32 bits
// into AMDGPUISD::MULHI_U24 when both operands pass isU24 and the subtarget
// has the 24-bit unsigned multiply. The result is zero-extended or truncated
// back to the node's type. Returns an empty SDValue when it does not apply.
// NOTE(review): the first line of this signature is missing from this listing;
// the combine dispatcher calls performMulhuCombine(N, DCI) for ISD::MULHU --
// confirm against the real source.
4942 EVT VT = N->getValueType(0);
4943
4944 if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
4945 return SDValue();
4946
4947 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4948 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4949 // unnecessarily). isDivergent() is used as an approximation of whether the
4950 // value is in an SGPR.
4951 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4952 // valu op anyway)
4953 if (!N->isDivergent() && Subtarget->hasSMulHi())
4954 return SDValue();
4955
4956 SelectionDAG &DAG = DCI.DAG;
4957 SDLoc DL(N);
4958
4959 SDValue N0 = N->getOperand(0);
4960 SDValue N1 = N->getOperand(1);
4961
4962 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4963 return SDValue();
4964
4965 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4966 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4967
4968 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
// Queue the new node so the combiner visits it as well.
4969 DCI.AddToWorklist(Mulhi.getNode());
4970 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4971}
4972
4973SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4974 SDValue Op,
4975 const SDLoc &DL,
4976 unsigned Opc) const {
4977 EVT VT = Op.getValueType();
4978 if (VT.bitsGT(MVT::i32))
4979 return SDValue();
4980
4981 if (VT != MVT::i32)
4982 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4983
4984 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4985 if (VT != MVT::i32)
4986 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4987
4988 return FFBX;
4989}
4990
4991// The native instructions return -1 on 0 input. Optimize out a select that
4992// produces -1 on 0.
4993//
4994// TODO: If zero is not undef, we could also do this if the output is compared
4995// against the bitwidth.
4996//
4997// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
//
// Cond is the setcc feeding the select, LHS / RHS are the select's true and
// false values. Only comparisons against the constant 0 are handled. Returns
// the FFBH_U32 / FFBL_B32 replacement built by getFFBX_U32, or an empty
// SDValue when the pattern does not match.
// NOTE(review): the first line of this signature is missing from this listing;
// the select combine calls performCtlz_CttzCombine(SL, Cond, True, False, DCI)
// -- confirm against the real source.
4999 SDValue LHS, SDValue RHS,
5000 DAGCombinerInfo &DCI) const {
5001 if (!isNullConstant(Cond.getOperand(1)))
5002 return SDValue();
5003
5004 SelectionDAG &DAG = DCI.DAG;
5005 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
5006 SDValue CmpLHS = Cond.getOperand(0);
5007
5008 // select (setcc x, 0, eq), -1, (ctlz_zero_poison x) -> ffbh_u32 x
5009 // select (setcc x, 0, eq), -1, (cttz_zero_poison x) -> ffbl_u32 x
// The counted value must be the same x that is compared against zero.
5010 if (CCOpcode == ISD::SETEQ &&
5011 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
5012 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
5013 unsigned Opc =
5014 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5015 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
5016 }
5017
5018 // select (setcc x, 0, ne), (ctlz_zero_poison x), -1 -> ffbh_u32 x
5019 // select (setcc x, 0, ne), (cttz_zero_poison x), -1 -> ffbl_u32 x
5020 if (CCOpcode == ISD::SETNE &&
5021 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
5022 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
5023 unsigned Opc =
5024 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
5025
5026 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
5027 }
5028
5029 return SDValue();
5030}
5031
// Rewrites select Cond, (Op x), (Op y) as Op (select Cond, x, y).
// N1 and N2 are the two select inputs; only their first operand is read, so
// the caller must pass nodes that are both instances of the unary opcode Op.
// The new inner select is queued on the combiner worklist.
// NOTE(review): the first line of this signature is missing from this listing;
// the caller below invokes it as distributeOpThroughSelect(DCI, Opcode, SL,
// Cond, LHS, RHS) -- confirm against the real source.
5033 unsigned Op,
5034 const SDLoc &SL,
5035 SDValue Cond,
5036 SDValue N1,
5037 SDValue N2) {
5038 SelectionDAG &DAG = DCI.DAG;
5039 EVT VT = N1.getValueType();
5040
5041 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
5042 N1.getOperand(0), N2.getOperand(0));
5043 DCI.AddToWorklist(NewSelect.getNode());
5044 return DAG.getNode(Op, SL, VT, NewSelect);
5045}
5046
5047// Pull a free FP operation out of a select so it may fold into uses.
5048//
5049// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
5050// select c, (fneg x), k -> fneg (select c, x, (fneg k))
5051//
5052// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
5053// select c, (fabs x), +k -> fabs (select c, x, k)
//
// N is the select node; returns the rewritten value or an empty SDValue when
// no rewrite is performed.
// NOTE(review): several lines of this function (the line naming it, the first
// guard inside the two-sided case, the declaration of CRHS and parts of two
// later conditions) are missing from this listing; the comments below only
// describe the code that is visible.
5054SDValue
5056 SDValue N) const {
5057 SelectionDAG &DAG = DCI.DAG;
5058 SDValue Cond = N.getOperand(0);
5059 SDValue LHS = N.getOperand(1);
5060 SDValue RHS = N.getOperand(2);
5061
5062 EVT VT = N.getValueType();
// Both inputs carry the same free operation: hoist it above the select.
5063 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
5064 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
// NOTE(review): the condition guarding this early return is missing here.
5066 return SDValue();
5067
5068 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
5069 SDLoc(N), Cond, LHS, RHS);
5070 }
5071
// Canonicalize so that the fneg / fabs, if any, is in LHS. Inv remembers the
// swap so the operands can be put back in their original order below.
5072 bool Inv = false;
5073 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
5074 std::swap(LHS, RHS);
5075 Inv = true;
5076 }
5077
5078 // TODO: Support vector constants.
// NOTE(review): the declaration of CRHS is missing here; from its uses below
// it is presumably RHS as a floating-point constant node, or null.
5080 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
5081 !selectSupportsSourceMods(N.getNode())) {
5082 SDLoc SL(N);
5083 // If one side is an fneg/fabs and the other is a constant, we can push the
5084 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
5085 SDValue NewLHS = LHS.getOperand(0);
5086 SDValue NewRHS = RHS;
5087
5088 // Careful: if the neg can be folded up, don't try to pull it back down.
5089 bool ShouldFoldNeg = true;
5090
5091 if (NewLHS.hasOneUse()) {
5092 unsigned Opc = NewLHS.getOpcode();
5093 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
5094 ShouldFoldNeg = false;
5095 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
5096 ShouldFoldNeg = false;
5097 }
5098
5099 if (ShouldFoldNeg) {
// fabs of the select result would change the sign of a negative constant.
5100 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
5101 return SDValue();
5102
5103 // We're going to be forced to use a source modifier anyway, there's no
5104 // point to pulling the negate out unless we can get a size reduction by
5105 // negating the constant.
5106 //
5107 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
5108 // about cheaper constants.
// NOTE(review): the second half of this condition is missing here.
5109 if (NewLHS.getOpcode() == ISD::FABS &&
5111 return SDValue();
5112
// NOTE(review): the condition guarding this early return is missing here.
5114 return SDValue();
5115
// Pulling an fneg out of the select means the constant must be negated to
// compensate.
5116 if (LHS.getOpcode() == ISD::FNEG)
5117 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5118
5119 if (Inv)
5120 std::swap(NewLHS, NewRHS);
5121
5122 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
5123 Cond, NewLHS, NewRHS);
5124 DCI.AddToWorklist(NewSelect.getNode());
5125 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
5126 }
5127 }
5128
5129 return SDValue();
5130}
5131
5133 DAGCombinerInfo &DCI) const {
// Combine for select nodes. In order, it tries:
//  1. foldFreeOpFromSelect on the select itself;
//  2. for a setcc condition with a single use: moving a constant true value
//     to the false input by inverting the comparison, then the legacy
//     fmin/fmax combine for f32 when the subtarget has those instructions;
//  3. the ctlz / cttz select fold, which is tried even when the condition
//     has other uses.
// NOTE(review): the first line of this signature is missing from this listing;
// the combine dispatcher calls performSelectCombine(N, DCI) for ISD::SELECT --
// confirm against the real source.
5134 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
5135 return Folded;
5136
5137 SDValue Cond = N->getOperand(0);
5138 if (Cond.getOpcode() != ISD::SETCC)
5139 return SDValue();
5140
5141 EVT VT = N->getValueType(0);
5142 SDValue LHS = Cond.getOperand(0);
5143 SDValue RHS = Cond.getOperand(1);
5144 SDValue CC = Cond.getOperand(2);
5145
5146 SDValue True = N->getOperand(1);
5147 SDValue False = N->getOperand(2);
5148
5149 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
5150 SelectionDAG &DAG = DCI.DAG;
5151 if (DAG.isConstantValueOfAnyType(True) &&
5152 !DAG.isConstantValueOfAnyType(False)) {
5153 // Swap cmp + select pair to move constant to false input.
5154 // This will allow using VOPC cndmasks more often.
5155 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
5156
5157 SDLoc SL(N);
5158 ISD::CondCode NewCC =
5159 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
5160
5161 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
5162 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
5163 }
5164
5165 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
// NOTE(review): the line declaring MinMax is missing here. Its value is
// returned as-is, so a failed fmin/fmax match returns without reaching the
// ctlz / cttz fold below.
5167 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
5168 // Revisit this node so we can catch min3/max3/med3 patterns.
5169 //DCI.AddToWorklist(MinMax.getNode());
5170 return MinMax;
5171 }
5172 }
5173
5174 // There's no reason to not do this if the condition has other uses.
5175 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
5176}
5177
5178static bool isInv2Pi(const APFloat &APF) {
5179 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5180 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5181 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5182
5183 return APF.bitwiseIsEqual(KF16) ||
5184 APF.bitwiseIsEqual(KF32) ||
5185 APF.bitwiseIsEqual(KF64);
5186}
5187
5188// The negated forms of 0 and 1.0 / (2.0 * pi) do not have inline immediates,
5189// so there is an additional cost to negate them.
//
// For a zero constant, and for the constant matched by isInv2Pi when the
// subtarget reports hasInv2PiInlineImm(), the result is Cheaper if the
// constant is currently negative and Expensive otherwise.
// NOTE(review): the signature and the final return statement of this function
// are missing from this listing; the result for all other constants is
// therefore not visible here. The function is presumably
// getConstantNegateCost -- confirm against the real source.
5192 if (C->isZero())
5193 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5194
5195 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
5196 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
5197
5199}
5200
5206
5212
5213static unsigned inverseMinMax(unsigned Opc) {
5214 switch (Opc) {
5215 case ISD::FMAXNUM:
5216 return ISD::FMINNUM;
5217 case ISD::FMINNUM:
5218 return ISD::FMAXNUM;
5219 case ISD::FMAXNUM_IEEE:
5220 return ISD::FMINNUM_IEEE;
5221 case ISD::FMINNUM_IEEE:
5222 return ISD::FMAXNUM_IEEE;
5223 case ISD::FMAXIMUM:
5224 return ISD::FMINIMUM;
5225 case ISD::FMINIMUM:
5226 return ISD::FMAXIMUM;
5227 case ISD::FMAXIMUMNUM:
5228 return ISD::FMINIMUMNUM;
5229 case ISD::FMINIMUMNUM:
5230 return ISD::FMAXIMUMNUM;
5231 case AMDGPUISD::FMAX_LEGACY:
5232 return AMDGPUISD::FMIN_LEGACY;
5233 case AMDGPUISD::FMIN_LEGACY:
5234 return AMDGPUISD::FMAX_LEGACY;
5235 default:
5236 llvm_unreachable("invalid min/max opcode");
5237 }
5238}
5239
5240/// \return true if it's profitable to try to push an fneg into its source
5241/// instruction.
// N is the fneg node and N0 its source operand, going by how the fneg combine
// calls shouldFoldFNegIntoSrc(N, N0).
// NOTE(review): the signature line and the second half of the condition in
// the multiple-use branch are missing from this listing.
5243 // If the input has multiple uses and we can either fold the negate down, or
5244 // the other uses cannot, give up. This both prevents unprofitable
5245 // transformations and infinite loops: we won't repeatedly try to fold around
5246 // a negate that has no 'good' form.
5247 if (N0.hasOneUse()) {
5248 // This may be able to fold into the source, but at a code size cost. Don't
5249 // fold if the fold into the user is free.
5250 if (allUsesHaveSourceMods(N, 0))
5251 return false;
5252 } else {
5253 if (fnegFoldsIntoOp(N0.getNode()) &&
5255 return false;
5256 }
5257
5258 return true;
5259}
5260
5262 DAGCombinerInfo &DCI) const {
// Combine for fneg nodes: tries to push the negation into the instruction
// that produces the operand (N0), dispatching on N0's opcode. Nothing is done
// unless shouldFoldFNegIntoSrc(N, N0) approves.
//
// In the arithmetic cases, when N0 has other users they are redirected to an
// fneg of the new, already negated node, so the original N0 is not kept
// alive next to it.
// NOTE(review): the first line of this signature is missing from this listing;
// the combine dispatcher calls performFNegCombine(N, DCI) for ISD::FNEG --
// confirm against the real source.
5263 SelectionDAG &DAG = DCI.DAG;
5264 SDValue N0 = N->getOperand(0);
5265 EVT VT = N->getValueType(0);
5266
5267 unsigned Opc = N0.getOpcode();
5268
5269 if (!shouldFoldFNegIntoSrc(N, N0))
5270 return SDValue();
5271
5272 SDLoc SL(N);
5273 switch (Opc) {
5274 case ISD::FADD: {
// Negating both addends is only done when signed zeros may be ignored.
5275 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5276 return SDValue();
5277
5278 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5279 SDValue LHS = N0.getOperand(0);
5280 SDValue RHS = N0.getOperand(1);
5281
// An operand that is already an fneg is stripped instead of being negated a
// second time.
5282 if (LHS.getOpcode() != ISD::FNEG)
5283 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5284 else
5285 LHS = LHS.getOperand(0);
5286
5287 if (RHS.getOpcode() != ISD::FNEG)
5288 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5289 else
5290 RHS = RHS.getOperand(0);
5291
5292 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5293 if (Res.getOpcode() != ISD::FADD)
5294 return SDValue(); // Op got folded away.
5295 if (!N0.hasOneUse())
5296 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5297 return Res;
5298 }
5299 case ISD::FMUL:
5300 case AMDGPUISD::FMUL_LEGACY: {
5301 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5302 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5303 SDValue LHS = N0.getOperand(0);
5304 SDValue RHS = N0.getOperand(1);
5305
// Only one factor changes sign: drop an existing fneg if there is one,
// otherwise negate the right-hand factor.
5306 if (LHS.getOpcode() == ISD::FNEG)
5307 LHS = LHS.getOperand(0);
5308 else if (RHS.getOpcode() == ISD::FNEG)
5309 RHS = RHS.getOperand(0);
5310 else
5311 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5312
5313 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5314 if (Res.getOpcode() != Opc)
5315 return SDValue(); // Op got folded away.
5316 if (!N0.hasOneUse())
5317 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5318 return Res;
5319 }
5320 case ISD::FMA:
5321 case ISD::FMAD: {
5322 // TODO: handle llvm.amdgcn.fma.legacy
5323 if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
5324 return SDValue();
5325
5326 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5327 SDValue LHS = N0.getOperand(0);
5328 SDValue MHS = N0.getOperand(1);
5329 SDValue RHS = N0.getOperand(2);
5330
// One multiplicand and the addend change sign.
5331 if (LHS.getOpcode() == ISD::FNEG)
5332 LHS = LHS.getOperand(0);
5333 else if (MHS.getOpcode() == ISD::FNEG)
5334 MHS = MHS.getOperand(0);
5335 else
5336 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5337
5338 if (RHS.getOpcode() != ISD::FNEG)
5339 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5340 else
5341 RHS = RHS.getOperand(0);
5342
// Unlike the fadd / fmul cases, N0's flags are not passed to the new node.
5343 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5344 if (Res.getOpcode() != Opc)
5345 return SDValue(); // Op got folded away.
5346 if (!N0.hasOneUse())
5347 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5348 return Res;
5349 }
5350 case ISD::FMAXNUM:
5351 case ISD::FMINNUM:
5352 case ISD::FMAXNUM_IEEE:
5353 case ISD::FMINNUM_IEEE:
5354 case ISD::FMINIMUM:
5355 case ISD::FMAXIMUM:
5356 case ISD::FMINIMUMNUM:
5357 case ISD::FMAXIMUMNUM:
5358 case AMDGPUISD::FMAX_LEGACY:
5359 case AMDGPUISD::FMIN_LEGACY: {
5360 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5361 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5362 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5363 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5364
5365 SDValue LHS = N0.getOperand(0);
5366 SDValue RHS = N0.getOperand(1);
5367
5368 // 0 doesn't have a negated inline immediate.
5369 // TODO: This constant check should be generalized to other operations.
// NOTE(review): the condition guarding this early return is missing here.
5371 return SDValue();
5372
5373 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5374 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5375 unsigned Opposite = inverseMinMax(Opc);
5376
5377 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5378 if (Res.getOpcode() != Opposite)
5379 return SDValue(); // Op got folded away.
5380 if (!N0.hasOneUse())
5381 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5382 return Res;
5383 }
5384 case AMDGPUISD::FMED3: {
5385 // med3 sorts a NaN input as smaller than everything regardless of its sign,
5386 // so negating all operands does not sign-flip the median when an input may
5387 // be NaN.
5388 if (!N0->getFlags().hasNoNaNs())
5389 return SDValue();
5390
// fneg (fmed3 a, b, c) -> fmed3 (fneg a), (fneg b), (fneg c)
5391 SDValue Ops[3];
5392 for (unsigned I = 0; I < 3; ++I)
5393 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5394
5395 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5396 if (Res.getOpcode() != AMDGPUISD::FMED3)
5397 return SDValue(); // Op got folded away.
5398
5399 if (!N0.hasOneUse()) {
5400 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5401 DAG.ReplaceAllUsesWith(N0, Neg);
5402
// Revisit the users that now consume the new fneg.
5403 for (SDNode *U : Neg->users())
5404 DCI.AddToWorklist(U);
5405 }
5406
5407 return Res;
5408 }
5409 case ISD::FP_EXTEND:
5410 case ISD::FTRUNC:
5411 case ISD::FRINT:
5412 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5413 case ISD::FROUNDEVEN:
5414 case ISD::FSIN:
5415 case ISD::FCANONICALIZE:
5416 case AMDGPUISD::RCP:
5417 case AMDGPUISD::RCP_LEGACY:
5418 case AMDGPUISD::RCP_IFLAG:
5419 case AMDGPUISD::SIN_HW: {
// Unary operations through which the fneg is moved onto the source operand.
5420 SDValue CvtSrc = N0.getOperand(0);
5421 if (CvtSrc.getOpcode() == ISD::FNEG) {
5422 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5423 // (fneg (rcp (fneg x))) -> (rcp x)
5424 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5425 }
5426
5427 if (!N0.hasOneUse())
5428 return SDValue();
5429
5430 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5431 // (fneg (rcp x)) -> (rcp (fneg x))
5432 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5433 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5434 }
5435 case ISD::FP_ROUND: {
// Same as the unary case above, but the second operand of fp_round is
// carried over unchanged.
5436 SDValue CvtSrc = N0.getOperand(0);
5437
5438 if (CvtSrc.getOpcode() == ISD::FNEG) {
5439 // (fneg (fp_round (fneg x))) -> (fp_round x)
5440 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5441 CvtSrc.getOperand(0), N0.getOperand(1));
5442 }
5443
5444 if (!N0.hasOneUse())
5445 return SDValue();
5446
5447 // (fneg (fp_round x)) -> (fp_round (fneg x))
5448 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5449 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5450 }
5451 case ISD::FP16_TO_FP: {
5452 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5453 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5454 // Put the fneg back as a legal source operation that can be matched later.
5455 SDLoc SL(N);
5456
5457 SDValue Src = N0.getOperand(0);
5458 EVT SrcVT = Src.getValueType();
5459
5460 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
// 0x8000 is bit 15, the sign bit of the 16-bit half-precision encoding.
5461 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5462 DAG.getConstant(0x8000, SL, SrcVT));
5463 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5464 }
5465 case ISD::SELECT: {
5466 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5467 // TODO: Invert conditions of foldFreeOpFromSelect
// Not implemented: the pattern above is only documented, nothing is folded.
5468 return SDValue();
5469 }
5470 case ISD::BITCAST: {
5471 SDLoc SL(N);
5472 SDValue BCSrc = N0.getOperand(0);
5473 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
// The last build_vector element is treated as the high half of the value.
5474 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5475 if (VT != MVT::f64 || HighBits.getValueType().getSizeInBits() != 32 ||
5476 !fnegFoldsIntoOp(HighBits.getNode()))
5477 return SDValue();
5478
5479 // f64 fneg only really needs to operate on the high half of the
5480 // register, so try to force it to an f32 operation to help make use of
5481 // source modifiers.
5482 //
5483 //
5484 // fneg (f64 (bitcast (build_vector x, y))) ->
5485 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5486 // (fneg (bitcast i32:y to f32)))
5487
5488 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5489 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5490 SDValue CastBack =
5491 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5492
// NOTE(review): the declaration of Ops is missing here; presumably it is a
// copy of BCSrc's operands, of which only the last one is replaced.
5494 Ops.back() = CastBack;
5495 DCI.AddToWorklist(NegHi.getNode());
5496 SDValue Build =
5497 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5498 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5499
5500 if (!N0.hasOneUse())
5501 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5502 return Result;
5503 }
5504
5505 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5506 BCSrc.hasOneUse()) {
5507 // fneg (f32 (bitcast (select cond, i32:lhs, i32:rhs))) ->
5508 // select cond, (fneg (bitcast i32:lhs to f32)), (fneg (bitcast i32:rhs to f32))
5509
5510 // TODO: Cast back result for multiple uses is beneficial in some cases.
5511
5512 SDValue LHS =
5513 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5514 SDValue RHS =
5515 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5516
5517 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5518 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5519
5520 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5521 NegRHS);
5522 }
5523
5524 return SDValue();
5525 }
5526 default:
5527 return SDValue();
5528 }
5529}
5530
5532 DAGCombinerInfo &DCI) const {
// Combine for fabs nodes. The only pattern handled is an fabs whose single-use
// operand is FP16_TO_FP: the fabs is replaced by an integer AND on the
// half-precision bits before the conversion.
// NOTE(review): the first line of this signature is missing from this listing;
// the combine dispatcher calls performFAbsCombine(N, DCI) for ISD::FABS --
// confirm against the real source.
5533 SelectionDAG &DAG = DCI.DAG;
5534 SDValue N0 = N->getOperand(0);
5535
5536 if (!N0.hasOneUse())
5537 return SDValue();
5538
5539 switch (N0.getOpcode()) {
5540 case ISD::FP16_TO_FP: {
5541 assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
5542 SDLoc SL(N);
5543 SDValue Src = N0.getOperand(0);
5544 EVT SrcVT = Src.getValueType();
5545
5546 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
// 0x7fff keeps the low 15 bits and clears bit 15, the half-precision sign bit.
5547 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5548 DAG.getConstant(0x7fff, SL, SrcVT));
5549 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5550 }
5551 default:
5552 return SDValue();
5553 }
5554}
5555
5557 DAGCombinerInfo &DCI) const {
// Constant-folds a reciprocal node: when operand 0 is a floating-point
// constant, the node is replaced by the constant 1.0 / value, computed with
// APFloat in the operand's own semantics. Non-constant operands are left
// alone.
// NOTE(review): the first line of this signature is missing from this listing;
// the combine dispatcher calls performRcpCombine(N, DCI) for the RCP nodes --
// confirm against the real source.
5558 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5559 if (!CFP)
5560 return SDValue();
5561
5562 // XXX - Should this flush denormals?
5563 const APFloat &Val = CFP->getValueAPF();
5564 APFloat One(Val.getSemantics(), "1.0");
5565 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5566}
5567
// Decides whether a 64-bit constant node may be kept as a single immediate.
// Returns false unless the subtarget is GCN, has the 64-bit VALU move, and the
// node is an integer or floating-point constant. It then returns true if the
// subtarget supports 64-bit literals, or if the constant's bit pattern fits in
// an unsigned 32-bit value or is an inline constant.
// NOTE(review): the signature and the declarations of SDConstant and
// SDFPConstant are missing from this listing; they are presumably the node
// cast to an integer / floating-point constant node (null when it is not
// one). Callers in this file use the name isInt64ImmLegal(C, DAG).
5569 if (!Subtarget->isGCN())
5570 return false;
5571
5574 auto &ST = DAG.getSubtarget<GCNSubtarget>();
5575 const auto *TII = ST.getInstrInfo();
5576
5577 if (!ST.hasVMovB64Inst() || (!SDConstant && !SDFPConstant))
5578 return false;
5579
5580 if (ST.has64BitLiterals())
5581 return true;
5582
5583 if (SDConstant) {
5584 const APInt &APVal = SDConstant->getAPIntValue();
5585 return isUInt<32>(APVal.getZExtValue()) || TII->isInlineConstant(APVal);
5586 }
5587
// Floating-point constants are judged by their raw bit pattern.
5588 APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt();
5589 return isUInt<32>(Val.getZExtValue()) || TII->isInlineConstant(Val);
5590}
5591
5593 DAGCombinerInfo &DCI) const {
5594 SelectionDAG &DAG = DCI.DAG;
5595 SDLoc DL(N);
5596
5597 switch(N->getOpcode()) {
5598 default:
5599 break;
5600 case ISD::BITCAST: {
5601 EVT DestVT = N->getValueType(0);
5602
5603 // Push casts through vector builds. This helps avoid emitting a large
5604 // number of copies when materializing floating point vector constants.
5605 //
5606 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5607 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5608 if (DestVT.isVector()) {
5609 SDValue Src = N->getOperand(0);
5610 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5613 EVT SrcVT = Src.getValueType();
5614 unsigned NElts = DestVT.getVectorNumElements();
5615
5616 if (SrcVT.getVectorNumElements() == NElts) {
5617 EVT DestEltVT = DestVT.getVectorElementType();
5618
5619 SmallVector<SDValue, 8> CastedElts;
5620 SDLoc SL(N);
5621 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5622 SDValue Elt = Src.getOperand(I);
5623 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5624 }
5625
5626 return DAG.getBuildVector(DestVT, SL, CastedElts);
5627 }
5628 }
5629 }
5630
5631 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5632 break;
5633
5634 // Fold bitcasts of constants.
5635 //
5636 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5637 // TODO: Generalize and move to DAGCombiner
5638 SDValue Src = N->getOperand(0);
5640 SDLoc SL(N);
5641 if (isInt64ImmLegal(C, DAG))
5642 break;
5643 uint64_t CVal = C->getZExtValue();
5644 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5645 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5646 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5647 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5648 }
5649
5651 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5652 SDLoc SL(N);
5653 if (isInt64ImmLegal(C, DAG))
5654 break;
5655 uint64_t CVal = Val.getZExtValue();
5656 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5657 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5658 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5659
5660 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5661 }
5662
5663 break;
5664 }
5665 case ISD::SHL:
5666 case ISD::SRA:
5667 case ISD::SRL: {
5668 // Range metadata can be invalidated when loads are converted to legal types
5669 // (e.g. v2i64 -> v4i32).
5670 // Try to convert vector shl/sra/srl before type legalization so that range
5671 // metadata can be utilized.
5672 if (!(N->getValueType(0).isVector() &&
5675 break;
5676 if (N->getOpcode() == ISD::SHL)
5677 return performShlCombine(N, DCI);
5678 if (N->getOpcode() == ISD::SRA)
5679 return performSraCombine(N, DCI);
5680 return performSrlCombine(N, DCI);
5681 }
5682 case ISD::TRUNCATE:
5683 return performTruncateCombine(N, DCI);
5684 case ISD::MUL:
5685 return performMulCombine(N, DCI);
5686 case AMDGPUISD::MUL_U24:
5687 case AMDGPUISD::MUL_I24: {
5688 if (SDValue Simplified = simplifyMul24(N, DCI))
5689 return Simplified;
5690 break;
5691 }
5692 case AMDGPUISD::MULHI_I24:
5693 case AMDGPUISD::MULHI_U24:
5694 return simplifyMul24(N, DCI);
5695 case ISD::SMUL_LOHI:
5696 case ISD::UMUL_LOHI:
5697 return performMulLoHiCombine(N, DCI);
5698 case ISD::MULHS:
5699 return performMulhsCombine(N, DCI);
5700 case ISD::MULHU:
5701 return performMulhuCombine(N, DCI);
5702 case ISD::SELECT:
5703 return performSelectCombine(N, DCI);
5704 case ISD::FNEG:
5705 return performFNegCombine(N, DCI);
5706 case ISD::FABS:
5707 return performFAbsCombine(N, DCI);
5708 case AMDGPUISD::BFE_I32:
5709 case AMDGPUISD::BFE_U32: {
5710 assert(!N->getValueType(0).isVector() &&
5711 "Vector handling of BFE not implemented");
5712 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5713 if (!Width)
5714 break;
5715
5716 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5717 if (WidthVal == 0)
5718 return DAG.getConstant(0, DL, MVT::i32);
5719
5721 if (!Offset)
5722 break;
5723
5724 SDValue BitsFrom = N->getOperand(0);
5725 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5726
5727 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5728
5729 if (OffsetVal == 0) {
5730 // This is already sign / zero extended, so try to fold away extra BFEs.
5731 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5732
5733 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5734 if (OpSignBits >= SignBits)
5735 return BitsFrom;
5736
5737 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5738 if (Signed) {
5739 // This is a sign_extend_inreg. Replace it to take advantage of existing
5740 // DAG Combines. If not eliminated, we will match back to BFE during
5741 // selection.
5742
5743 // TODO: The sext_inreg of extended types ends, although we can could
5744 // handle them in a single BFE.
5745 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5746 DAG.getValueType(SmallVT));
5747 }
5748
5749 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5750 }
5751
5752 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5753 if (Signed) {
5754 return constantFoldBFE<int32_t>(DAG,
5755 CVal->getSExtValue(),
5756 OffsetVal,
5757 WidthVal,
5758 DL);
5759 }
5760
5761 return constantFoldBFE<uint32_t>(DAG,
5762 CVal->getZExtValue(),
5763 OffsetVal,
5764 WidthVal,
5765 DL);
5766 }
5767
5768 if ((OffsetVal + WidthVal) >= 32 &&
5769 !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
5770 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5771 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5772 BitsFrom, ShiftVal);
5773 }
5774
5775 if (BitsFrom.hasOneUse()) {
5776 APInt Demanded = APInt::getBitsSet(32,
5777 OffsetVal,
5778 OffsetVal + WidthVal);
5779
5780 KnownBits Known;
5782 !DCI.isBeforeLegalizeOps());
5783 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5784 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5785 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5786 DCI.CommitTargetLoweringOpt(TLO);
5787 }
5788 }
5789
5790 break;
5791 }
5792 case ISD::LOAD:
5793 return performLoadCombine(N, DCI);
5794 case ISD::STORE:
5795 return performStoreCombine(N, DCI);
5796 case AMDGPUISD::RCP:
5797 case AMDGPUISD::RCP_IFLAG:
5798 return performRcpCombine(N, DCI);
5799 case ISD::AssertZext:
5800 case ISD::AssertSext:
5801 return performAssertSZExtCombine(N, DCI);
5803 return performIntrinsicWOChainCombine(N, DCI);
5804 case AMDGPUISD::FMAD_FTZ: {
5805 SDValue N0 = N->getOperand(0);
5806 SDValue N1 = N->getOperand(1);
5807 SDValue N2 = N->getOperand(2);
5808 EVT VT = N->getValueType(0);
5809
5810 // FMAD_FTZ is a FMAD + flush denormals to zero.
5811 // We flush the inputs, the intermediate step, and the output.
5815 if (N0CFP && N1CFP && N2CFP) {
5816 const auto FTZ = [](const APFloat &V) {
5817 if (V.isDenormal()) {
5818 APFloat Zero(V.getSemantics(), 0);
5819 return V.isNegative() ? -Zero : Zero;
5820 }
5821 return V;
5822 };
5823
5824 APFloat V0 = FTZ(N0CFP->getValueAPF());
5825 APFloat V1 = FTZ(N1CFP->getValueAPF());
5826 APFloat V2 = FTZ(N2CFP->getValueAPF());
5828 V0 = FTZ(V0);
5830 return DAG.getConstantFP(FTZ(V0), DL, VT);
5831 }
5832 break;
5833 }
5834 }
5835 return SDValue();
5836}
5837
5839 SDValue Op, const APInt &OriginalDemandedBits,
5840 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
5841 unsigned Depth) const {
5842 switch (Op.getOpcode()) {
5844 switch (Op.getConstantOperandVal(0)) {
5845 case Intrinsic::amdgcn_readfirstlane:
5846 case Intrinsic::amdgcn_readlane:
5847 case Intrinsic::amdgcn_set_inactive:
5848 case Intrinsic::amdgcn_wwm: {
5849 if (SimplifyDemandedBits(Op.getOperand(1), OriginalDemandedBits,
5850 OriginalDemandedElts, Known, TLO, Depth + 1))
5851 return true;
5852 break;
5853 }
5854 default:
5855 break;
5856 }
5857 break;
5858 }
5859 default:
5860 break;
5861 }
5862
5863 return false;
5864}
5865
5866//===----------------------------------------------------------------------===//
5867// Helper functions
5868//===----------------------------------------------------------------------===//
5869
5871 const TargetRegisterClass *RC,
5872 Register Reg, EVT VT,
5873 const SDLoc &SL,
5874 bool RawReg) const {
5876 MachineRegisterInfo &MRI = MF.getRegInfo();
5877 Register VReg;
5878
5879 if (!MRI.isLiveIn(Reg)) {
5880 VReg = MRI.createVirtualRegister(RC);
5881 MRI.addLiveIn(Reg, VReg);
5882 } else {
5883 VReg = MRI.getLiveInVirtReg(Reg);
5884 }
5885
5886 if (RawReg)
5887 return DAG.getRegister(VReg, VT);
5888
5889 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5890}
5891
5892// This may be called multiple times, and nothing prevents creating multiple
5893// objects at the same offset. See if we already defined this object.
5895 int64_t Offset) {
5896 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5897 if (MFI.getObjectOffset(I) == Offset) {
5898 assert(MFI.getObjectSize(I) == Size);
5899 return I;
5900 }
5901 }
5902
5903 return MFI.CreateFixedObject(Size, Offset, true);
5904}
5905
5907 EVT VT,
5908 const SDLoc &SL,
5909 int64_t Offset) const {
5911 MachineFrameInfo &MFI = MF.getFrameInfo();
5912 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5913
5914 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5915 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5916
5917 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5920}
5921
5923 const SDLoc &SL,
5924 SDValue Chain,
5925 SDValue ArgVal,
5926 int64_t Offset) const {
5930
5931 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5932 // Stores to the argument stack area are relative to the stack pointer.
5933 SDValue SP =
5934 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5935 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5936 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5938 return Store;
5939}
5940
5942 const TargetRegisterClass *RC,
5943 EVT VT, const SDLoc &SL,
5944 const ArgDescriptor &Arg) const {
5945 assert(Arg && "Attempting to load missing argument");
5946
5947 SDValue V = Arg.isRegister() ?
5948 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5949 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5950
5951 if (!Arg.isMasked())
5952 return V;
5953
5954 unsigned Mask = Arg.getMask();
5955 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5956 V = DAG.getNode(ISD::SRL, SL, VT, V,
5957 DAG.getShiftAmountConstant(Shift, VT, SL));
5958 return DAG.getNode(ISD::AND, SL, VT, V,
5959 DAG.getConstant(Mask >> Shift, SL, VT));
5960}
5961
5963 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5964 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5965 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5966 uint64_t ArgOffset =
5967 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5968 switch (Param) {
5969 case FIRST_IMPLICIT:
5970 return ArgOffset;
5971 case PRIVATE_BASE:
5973 case SHARED_BASE:
5974 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5975 case QUEUE_PTR:
5976 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5977 }
5978 llvm_unreachable("unexpected implicit parameter type");
5979}
5980
5987
5989 SelectionDAG &DAG, int Enabled,
5990 int &RefinementSteps,
5991 bool &UseOneConstNR,
5992 bool Reciprocal) const {
5993 EVT VT = Operand.getValueType();
5994
5995 if (VT == MVT::f32) {
5996 RefinementSteps = 0;
5997 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5998 }
5999
6000 // TODO: There is also f64 rsq instruction, but the documentation is less
6001 // clear on its precision.
6002
6003 return SDValue();
6004}
6005
6007 SelectionDAG &DAG, int Enabled,
6008 int &RefinementSteps) const {
6009 EVT VT = Operand.getValueType();
6010
6011 if (VT == MVT::f32) {
6012 // Reciprocal, < 1 ulp error.
6013 //
6014 // This reciprocal approximation converges to < 0.5 ulp error with one
6015 // newton rhapson performed with two fused multiple adds (FMAs).
6016
6017 RefinementSteps = 0;
6018 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
6019 }
6020
6021 // TODO: There is also f64 rcp instruction, but the documentation is less
6022 // clear on its precision.
6023
6024 return SDValue();
6025}
6026
6027static unsigned workitemIntrinsicDim(unsigned ID) {
6028 switch (ID) {
6029 case Intrinsic::amdgcn_workitem_id_x:
6030 return 0;
6031 case Intrinsic::amdgcn_workitem_id_y:
6032 return 1;
6033 case Intrinsic::amdgcn_workitem_id_z:
6034 return 2;
6035 default:
6036 llvm_unreachable("not a workitem intrinsic");
6037 }
6038}
6039
6041 const SDValue Op, KnownBits &Known,
6042 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
6043
6044 Known.resetAll(); // Don't know anything.
6045
6046 unsigned Opc = Op.getOpcode();
6047
6048 switch (Opc) {
6049 default:
6050 break;
6051 case AMDGPUISD::CARRY:
6052 case AMDGPUISD::BORROW: {
6053 Known.Zero = APInt::getHighBitsSet(32, 31);
6054 break;
6055 }
6056
6057 case AMDGPUISD::BFE_I32:
6058 case AMDGPUISD::BFE_U32: {
6059 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6060 if (!CWidth)
6061 return;
6062
6063 uint32_t Width = CWidth->getZExtValue() & 0x1f;
6064
6065 if (Opc == AMDGPUISD::BFE_U32)
6066 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
6067
6068 break;
6069 }
6070 case AMDGPUISD::FP_TO_FP16: {
6071 unsigned BitWidth = Known.getBitWidth();
6072
6073 // High bits are zero.
6075 break;
6076 }
6077 case AMDGPUISD::MUL_U24:
6078 case AMDGPUISD::MUL_I24: {
6079 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6080 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6081 unsigned BitWidth = Op.getScalarValueSizeInBits();
6082
6083 // Sign/Zero extend from 24 bits.
6084 if (Opc == AMDGPUISD::MUL_I24) {
6085 LHSKnown = LHSKnown.trunc(24).sext(BitWidth);
6086 RHSKnown = RHSKnown.trunc(24).sext(BitWidth);
6087 } else {
6088 LHSKnown = LHSKnown.trunc(24).zext(BitWidth);
6089 RHSKnown = RHSKnown.trunc(24).zext(BitWidth);
6090 }
6091
6092 // TODO: SelfMultiply can be poison, but not undef.
6093 bool SelfMultiply = Op.getOperand(0) == Op.getOperand(1);
6094 if (SelfMultiply)
6095 SelfMultiply &= DAG.isGuaranteedNotToBeUndefOrPoison(
6096 Op.getOperand(0), DemandedElts, UndefPoisonKind::UndefOrPoison,
6097 Depth + 1);
6098
6099 Known = KnownBits::mul(LHSKnown, RHSKnown, SelfMultiply);
6100 break;
6101 }
6102 case AMDGPUISD::PERM: {
6103 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6104 if (!CMask)
6105 return;
6106
6107 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6108 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6109 unsigned Sel = CMask->getZExtValue();
6110
6111 for (unsigned I = 0; I < 32; I += 8) {
6112 unsigned SelBits = Sel & 0xff;
6113 if (SelBits < 4) {
6114 SelBits *= 8;
6115 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6116 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6117 } else if (SelBits < 7) {
6118 SelBits = (SelBits & 3) * 8;
6119 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6120 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6121 } else if (SelBits == 0x0c) {
6122 Known.Zero |= 0xFFull << I;
6123 } else if (SelBits > 0x0c) {
6124 Known.One |= 0xFFull << I;
6125 }
6126 Sel >>= 8;
6127 }
6128 break;
6129 }
6130 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
6131 Known.Zero.setHighBits(24);
6132 break;
6133 }
6134 case AMDGPUISD::BUFFER_LOAD_USHORT: {
6135 Known.Zero.setHighBits(16);
6136 break;
6137 }
6138 case AMDGPUISD::LDS: {
6139 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
6140 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
6141
6142 Known.Zero.setHighBits(16);
6143 Known.Zero.setLowBits(Log2(Alignment));
6144 break;
6145 }
6146 case AMDGPUISD::SMIN3:
6147 case AMDGPUISD::SMAX3:
6148 case AMDGPUISD::SMED3:
6149 case AMDGPUISD::UMIN3:
6150 case AMDGPUISD::UMAX3:
6151 case AMDGPUISD::UMED3: {
6152 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
6153 if (Known2.isUnknown())
6154 break;
6155
6156 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6157 if (Known1.isUnknown())
6158 break;
6159
6160 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6161 if (Known0.isUnknown())
6162 break;
6163
6164 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6165 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6166 Known.One = Known0.One & Known1.One & Known2.One;
6167 break;
6168 }
6170 unsigned IID = Op.getConstantOperandVal(0);
6171 switch (IID) {
6172 case Intrinsic::amdgcn_workitem_id_x:
6173 case Intrinsic::amdgcn_workitem_id_y:
6174 case Intrinsic::amdgcn_workitem_id_z: {
6175 unsigned MaxValue = Subtarget->getMaxWorkitemID(
6177 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
6178 break;
6179 }
6180 default:
6181 break;
6182 }
6183 }
6184 }
6185}
6186
6188 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6189 unsigned Depth) const {
6190 switch (Op.getOpcode()) {
6191 case AMDGPUISD::BFE_I32: {
6192 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6193 if (!Width)
6194 return 1;
6195
6196 unsigned SignBits = 32 - (Width->getZExtValue() & 0x1f) + 1;
6197 if (!isNullConstant(Op.getOperand(1)))
6198 return SignBits;
6199
6200 // TODO: Could probably figure something out with non-0 offsets.
6201 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6202 return std::max(SignBits, Op0SignBits);
6203 }
6204
6205 case AMDGPUISD::BFE_U32: {
6206 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6207 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6208 }
6209
6210 case AMDGPUISD::CARRY:
6211 case AMDGPUISD::BORROW:
6212 return 31;
6213 case AMDGPUISD::BUFFER_LOAD_BYTE:
6214 return 25;
6215 case AMDGPUISD::BUFFER_LOAD_SHORT:
6216 return 17;
6217 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6218 return 24;
6219 case AMDGPUISD::BUFFER_LOAD_USHORT:
6220 return 16;
6221 case AMDGPUISD::FP_TO_FP16:
6222 return 16;
6223 case AMDGPUISD::SMIN3:
6224 case AMDGPUISD::SMAX3:
6225 case AMDGPUISD::SMED3:
6226 case AMDGPUISD::UMIN3:
6227 case AMDGPUISD::UMAX3:
6228 case AMDGPUISD::UMED3: {
6229 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6230 if (Tmp2 == 1)
6231 return 1; // Early out.
6232
6233 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6234 if (Tmp1 == 1)
6235 return 1; // Early out.
6236
6237 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6238 if (Tmp0 == 1)
6239 return 1; // Early out.
6240
6241 return std::min({Tmp0, Tmp1, Tmp2});
6242 }
6243 default:
6244 return 1;
6245 }
6246}
6247
6249 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6250 const MachineRegisterInfo &MRI, unsigned Depth) const {
6251 const MachineInstr *MI = MRI.getVRegDef(R);
6252 if (!MI)
6253 return 1;
6254
6255 // TODO: Check range metadata on MMO.
6256 switch (MI->getOpcode()) {
6257 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6258 return 25;
6259 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6260 return 17;
6261 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6262 return 24;
6263 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6264 return 16;
6265 case AMDGPU::G_AMDGPU_SMED3:
6266 case AMDGPU::G_AMDGPU_UMED3: {
6267 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6268 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6269 if (Tmp2 == 1)
6270 return 1;
6271 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6272 if (Tmp1 == 1)
6273 return 1;
6274 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6275 if (Tmp0 == 1)
6276 return 1;
6277 return std::min({Tmp0, Tmp1, Tmp2});
6278 }
6279 default:
6280 return 1;
6281 }
6282}
6283
6285 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6286 UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const {
6287 unsigned Opcode = Op.getOpcode();
6288 switch (Opcode) {
6289 case AMDGPUISD::BFE_I32:
6290 case AMDGPUISD::BFE_U32:
6291 return false;
6292 }
6294 Op, DemandedElts, DAG, Kind, ConsiderFlags, Depth);
6295}
6296
6298 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6299 unsigned Depth) const {
6300 unsigned Opcode = Op.getOpcode();
6301 switch (Opcode) {
6302 case AMDGPUISD::FMIN_LEGACY:
6303 case AMDGPUISD::FMAX_LEGACY: {
6304 if (SNaN)
6305 return true;
6306
6307 // TODO: Can check no nans on one of the operands for each one, but which
6308 // one?
6309 return false;
6310 }
6311 case AMDGPUISD::FMUL_LEGACY:
6312 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6313 if (SNaN)
6314 return true;
6315 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6316 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6317 }
6318 case AMDGPUISD::FMED3:
6319 case AMDGPUISD::FMIN3:
6320 case AMDGPUISD::FMAX3:
6321 case AMDGPUISD::FMINIMUM3:
6322 case AMDGPUISD::FMAXIMUM3:
6323 case AMDGPUISD::FMAD_FTZ: {
6324 if (SNaN)
6325 return true;
6326 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6327 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6328 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6329 }
6330 case AMDGPUISD::CVT_F32_UBYTE0:
6331 case AMDGPUISD::CVT_F32_UBYTE1:
6332 case AMDGPUISD::CVT_F32_UBYTE2:
6333 case AMDGPUISD::CVT_F32_UBYTE3:
6334 return true;
6335
6336 case AMDGPUISD::RCP:
6337 case AMDGPUISD::RSQ:
6338 case AMDGPUISD::RCP_LEGACY:
6339 case AMDGPUISD::RSQ_CLAMP: {
6340 if (SNaN)
6341 return true;
6342
6343 // TODO: Need is known positive check.
6344 return false;
6345 }
6346 case ISD::FLDEXP:
6347 case AMDGPUISD::FRACT: {
6348 if (SNaN)
6349 return true;
6350 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6351 }
6352 case AMDGPUISD::DIV_SCALE:
6353 case AMDGPUISD::DIV_FMAS:
6354 case AMDGPUISD::DIV_FIXUP:
6355 // TODO: Refine on operands.
6356 return SNaN;
6357 case AMDGPUISD::SIN_HW:
6358 case AMDGPUISD::COS_HW: {
6359 // TODO: Need check for infinity
6360 return SNaN;
6361 }
6363 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6364 // TODO: Handle more intrinsics
6365 switch (IntrinsicID) {
6366 case Intrinsic::amdgcn_cubeid:
6367 case Intrinsic::amdgcn_cvt_off_f32_i4:
6368 return true;
6369
6370 case Intrinsic::amdgcn_frexp_mant: {
6371 if (SNaN)
6372 return true;
6373 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6374 }
6375 case Intrinsic::amdgcn_cvt_pkrtz: {
6376 if (SNaN)
6377 return true;
6378 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6379 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6380 }
6381 case Intrinsic::amdgcn_rcp:
6382 case Intrinsic::amdgcn_rsq:
6383 case Intrinsic::amdgcn_rcp_legacy:
6384 case Intrinsic::amdgcn_rsq_legacy:
6385 case Intrinsic::amdgcn_rsq_clamp:
6386 case Intrinsic::amdgcn_tanh: {
6387 if (SNaN)
6388 return true;
6389
6390 // TODO: Need is known positive check.
6391 return false;
6392 }
6393 case Intrinsic::amdgcn_trig_preop:
6394 case Intrinsic::amdgcn_fdot2:
6395 // TODO: Refine on operand
6396 return SNaN;
6397 case Intrinsic::amdgcn_fma_legacy:
6398 if (SNaN)
6399 return true;
6400 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6401 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6402 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6403 default:
6404 return false;
6405 }
6406 }
6407 default:
6408 return false;
6409 }
6410}
6411
6413 Register N0, Register N1) const {
6414 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6415}
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:317
#define LLVM_READONLY
Definition Compiler.h:324
Provides analysis for querying information about KnownBits during GISel passes.
const HexagonInstrInfo * TII
static MaybeAlign getAlign(Value *Ptr)
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
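If SNaN is false, returns true if Op is known to never be any NaN; if SNaN is true, returns true if Op is known to never be a signaling NaN.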
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue lowerFEXPF64(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
bool isInt64ImmLegal(SDNode *Val, SelectionDAG &DAG) const
Check whether value Val can be supported by v_mov_b64, for the current target.
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue LowerCTLS(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP16(SDValue Op, SelectionDAG &DAG, EVT FP16Ty) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand of vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:297
static const fltSemantics & IEEEdouble()
Definition APFloat.h:298
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:345
static const fltSemantics & IEEEhalf()
Definition APFloat.h:295
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1509
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1246
const fltSemantics & getSemantics() const
Definition APFloat.h:1552
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1264
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1223
APInt bitcastToAPInt() const
Definition APFloat.h:1436
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1163
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1414
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
bool ugt(const APInt &RHS) const
Unsigned greater than comparison.
Definition APInt.h:1189
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1411
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
const BlockAddress * getBlockAddress() const
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:357
iterator_range< arg_iterator > args()
Definition Function.h:866
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
void addLiveIn(MCRegister Reg, Register vreg=Register())
addLiveIn - Add the specified register as a live-in.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isGuaranteedNotToBeUndefOrPoison(SDValue Op, UndefPoisonKind Kind=UndefPoisonKind::UndefOrPoison, unsigned Depth=0) const
Return true if this function can prove that Op is never poison and, Kind can be used to track poison ...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT, int64_t Offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:787
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:861
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:796
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:800
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ STRICT_FP16_TO_FP
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:819
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:896
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:813
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:795
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:953
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:573
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
LLVM_ABI void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
UndefPoisonKind
Enumeration to track whether we are interested in Undef, Poison, or both.
Definition UndefPoison.h:20
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1688
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:418
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:508
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:494
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:266
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:453
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition ValueTypes.h:501
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:435
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:442
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
bool bitsGE(EVT VT) const
Return true if this has no fewer bits than VT.
Definition ValueTypes.h:315
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:150
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:331
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:165
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:130
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:103
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...