1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
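// For example, a 32-bit vector such as v4i8 maps to i32, and a 128-bit vector
// such as v8i16 maps to v4i32.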
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
51
55
 57 // In order for this to be a signed 24-bit value, bit 23 must
58 // be a sign bit.
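 // (Callers typically compare the result against 24, e.g. when deciding
 // whether a value can feed a 24-bit multiply.)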
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
 67 // instructions, rather than generating calls to memset, memcpy, or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
172 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
173
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
176
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
179
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
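 // For example, an extload from i8 to i64 becomes an i8-to-i32 extload
 // followed by a 32-to-64-bit extension.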
185 for (MVT VT : MVT::integer_valuetypes())
187 Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(Op, VT, MVT::i1, Promote);
195 setLoadExtAction(Op, VT, MVT::i8, Legal);
196 setLoadExtAction(Op, VT, MVT::i16, Legal);
197 setLoadExtAction(Op, VT, MVT::i32, Expand);
198 }
199 }
200
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
205 Expand);
206
207 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
221
222 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
228
229 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
241
243 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
246 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
249 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
250
252 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253
255 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
256
258 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
259
261 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
262
264 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
265
267 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
268
270 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
271
273 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
274
276 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
277
279 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
280
282 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
283
285 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
286
288 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
289
291 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
292
294 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
295
297 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
298
300 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
301
303 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
304
306 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
307
309 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
310
312 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
313
315 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
316
318 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
319
321 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
322
323 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
325 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
326 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
327
328 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
330 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
331 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
332
333 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
334 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
335 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
336 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
337 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
338 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
342 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
343 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
344 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
345 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
346 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
347 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
348
349 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
350 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
351 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
352
353 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
354 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
355 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
356
357 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
358
359 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
360 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
361 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
362 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
363 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
364 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
365 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
366
367 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
368 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
369 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
370 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
371 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
372
373 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
374 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
375 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
376
377 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
378 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
379 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
380
381 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
382 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
383 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
384
385 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
386 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
387 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
388
389 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
390 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
391 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
393 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
395 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
396
397 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
398 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
399
401
402 // For R600, this is totally unsupported, just custom lower to produce an
403 // error.
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
410 {MVT::f16, MVT::f32}, Legal);
412
414 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
416 {MVT::f16, MVT::f32, MVT::f64}, Expand);
417
420 Custom);
421
422 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
423
424 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
425
426 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
427 Expand);
428
429 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
430
431 if (Subtarget->has16BitInsts()) {
432 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
434 } else {
435 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
437 }
438
440 Custom);
441
442 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
443 if (Subtarget->has16BitInsts()) {
445 }
446
447 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
448 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
449 // default unless marked custom/legal.
451 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
452 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
453 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
454 MVT::v16f64},
455 Custom);
456
457 if (isTypeLegal(MVT::f16))
459 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
460 Custom);
461
462 // Expand to fneg + fadd.
464
466 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
467 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
468 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
469 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
470 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
471 Custom);
472
475 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
476 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
477 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
478 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
479 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
480 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
481 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
482 Custom);
483
485 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
486
487 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
488 for (MVT VT : ScalarIntVTs) {
489 // These should use [SU]DIVREM, so set them to expand
491 Expand);
492
493 // GPU does not have divrem function for signed or unsigned.
495
496 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
498
500
501 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
503 }
504
505 // The hardware supports 32-bit FSHR, but not FSHL.
507
508 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
509
511
515 MVT::i64, Custom);
517
519 Legal);
520
523 MVT::i64, Custom);
524
525 for (auto VT : {MVT::i8, MVT::i16})
527
528 static const MVT::SimpleValueType VectorIntTypes[] = {
529 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
530 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
531
532 for (MVT VT : VectorIntTypes) {
533 // Expand the following operations for the current type by default.
546 VT, Expand);
547 }
548
549 static const MVT::SimpleValueType FloatVectorTypes[] = {
550 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
551 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
552
553 for (MVT VT : FloatVectorTypes) {
566 VT, Expand);
567 }
568
 569 // This causes an unrolled select operation to be used rather than expansion
 570 // with bit operations. This is in general better, but the alternative using BFI
571 // instructions may be better if the select sources are SGPRs.
573 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
574
576 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
577
579 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
580
582 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
583
585 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
586
588 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
589
591 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
592
594 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
595
597 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
598
600 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
601
603 setJumpIsExpensive(true);
604
607
609
610 // We want to find all load dependencies for long chains of stores to enable
611 // merging into very wide vectors. The problem is with vectors with > 4
612 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
613 // vectors are a legal type, even though we have to split the loads
614 // usually. When we can more precisely specify load legality per address
615 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
616 // smarter so that they can figure out what to do in 2 iterations without all
617 // N > 4 stores on the same chain.
619
620 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
621 // about these during lowering.
622 MaxStoresPerMemcpy = 0xffffffff;
623 MaxStoresPerMemmove = 0xffffffff;
624 MaxStoresPerMemset = 0xffffffff;
625
626 // The expansion for 64-bit division is enormous.
628 addBypassSlowDiv(64, 32);
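 // With this, a runtime check is emitted: if both 64-bit operands actually fit
 // in 32 bits, a 32-bit division is used instead of the full 64-bit expansion.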
629
640
644}
645
647 if (getTargetMachine().Options.NoSignedZerosFPMath)
648 return true;
649
650 const auto Flags = Op.getNode()->getFlags();
651 if (Flags.hasNoSignedZeros())
652 return true;
653
654 return false;
655}
656
657//===----------------------------------------------------------------------===//
658// Target Information
659//===----------------------------------------------------------------------===//
660
662static bool fnegFoldsIntoOpcode(unsigned Opc) {
663 switch (Opc) {
664 case ISD::FADD:
665 case ISD::FSUB:
666 case ISD::FMUL:
667 case ISD::FMA:
668 case ISD::FMAD:
669 case ISD::FMINNUM:
670 case ISD::FMAXNUM:
673 case ISD::FMINIMUM:
674 case ISD::FMAXIMUM:
675 case ISD::FMINIMUMNUM:
676 case ISD::FMAXIMUMNUM:
677 case ISD::SELECT:
678 case ISD::FSIN:
679 case ISD::FTRUNC:
680 case ISD::FRINT:
681 case ISD::FNEARBYINT:
682 case ISD::FROUNDEVEN:
684 case AMDGPUISD::RCP:
685 case AMDGPUISD::RCP_LEGACY:
686 case AMDGPUISD::RCP_IFLAG:
687 case AMDGPUISD::SIN_HW:
688 case AMDGPUISD::FMUL_LEGACY:
689 case AMDGPUISD::FMIN_LEGACY:
690 case AMDGPUISD::FMAX_LEGACY:
691 case AMDGPUISD::FMED3:
692 // TODO: handle llvm.amdgcn.fma.legacy
693 return true;
694 case ISD::BITCAST:
695 llvm_unreachable("bitcast is special cased");
696 default:
697 return false;
698 }
699}
700
701static bool fnegFoldsIntoOp(const SDNode *N) {
702 unsigned Opc = N->getOpcode();
703 if (Opc == ISD::BITCAST) {
704 // TODO: Is there a benefit to checking the conditions performFNegCombine
705 // does? We don't for the other cases.
706 SDValue BCSrc = N->getOperand(0);
707 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
708 return BCSrc.getNumOperands() == 2 &&
709 BCSrc.getOperand(1).getValueSizeInBits() == 32;
710 }
711
712 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
713 }
714
715 return fnegFoldsIntoOpcode(Opc);
716}
717
 718/// \returns true if the operation will definitely need to use a 64-bit
719/// encoding, and thus will use a VOP3 encoding regardless of the source
720/// modifiers.
722static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
723 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
724 VT == MVT::f64;
725}
726
727/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
 728/// type when used by ISD::SELECT.
730static bool selectSupportsSourceMods(const SDNode *N) {
731 // TODO: Only applies if select will be vector
732 return N->getValueType(0) == MVT::f32;
733}
734
735// Most FP instructions support source modifiers, but this could be refined
736// slightly.
738static bool hasSourceMods(const SDNode *N) {
739 if (isa<MemSDNode>(N))
740 return false;
741
742 switch (N->getOpcode()) {
743 case ISD::CopyToReg:
744 case ISD::FDIV:
745 case ISD::FREM:
746 case ISD::INLINEASM:
748 case AMDGPUISD::DIV_SCALE:
750
751 // TODO: Should really be looking at the users of the bitcast. These are
752 // problematic because bitcasts are used to legalize all stores to integer
753 // types.
754 case ISD::BITCAST:
755 return false;
757 switch (N->getConstantOperandVal(0)) {
758 case Intrinsic::amdgcn_interp_p1:
759 case Intrinsic::amdgcn_interp_p2:
760 case Intrinsic::amdgcn_interp_mov:
761 case Intrinsic::amdgcn_interp_p1_f16:
762 case Intrinsic::amdgcn_interp_p2_f16:
763 return false;
764 default:
765 return true;
766 }
767 }
768 case ISD::SELECT:
770 default:
771 return true;
772 }
773}
774
776 unsigned CostThreshold) {
777 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
778 // it is truly free to use a source modifier in all cases. If there are
 779 // multiple users, and using a modifier would force them into VOP3, there will be
780 // a code size increase. Try to avoid increasing code size unless we know it
781 // will save on the instruction count.
782 unsigned NumMayIncreaseSize = 0;
783 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
784
785 assert(!N->use_empty());
786
787 // XXX - Should this limit number of uses to check?
788 for (const SDNode *U : N->users()) {
789 if (!hasSourceMods(U))
790 return false;
791
792 if (!opMustUseVOP3Encoding(U, VT)) {
793 if (++NumMayIncreaseSize > CostThreshold)
794 return false;
795 }
796 }
797
798 return true;
799}
800
802 ISD::NodeType ExtendKind) const {
803 assert(!VT.isVector() && "only scalar expected");
804
 805 // Round to the next multiple of 32 bits.
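 // For example, i40 rounds up to i64 and i65 rounds up to i96.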
806 unsigned Size = VT.getSizeInBits();
807 if (Size <= 32)
808 return MVT::i32;
809 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
810}
811
813 return 32;
814}
815
817 return true;
818}
819
820// The backend supports 32 and 64 bit floating point immediates.
821// FIXME: Why are we reporting vectors of FP immediates as legal?
823 bool ForCodeSize) const {
824 EVT ScalarVT = VT.getScalarType();
825 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
826 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
827}
828
829// We don't want to shrink f64 / f32 constants.
831 EVT ScalarVT = VT.getScalarType();
832 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
833}
834
836 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
837 std::optional<unsigned> ByteOffset) const {
838 // TODO: This may be worth removing. Check regression tests for diffs.
839 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
840 return false;
841
842 unsigned NewSize = NewVT.getStoreSizeInBits();
843
844 // If we are reducing to a 32-bit load or a smaller multi-dword load,
845 // this is always better.
846 if (NewSize >= 32)
847 return true;
848
849 EVT OldVT = N->getValueType(0);
850 unsigned OldSize = OldVT.getStoreSizeInBits();
851
853 unsigned AS = MN->getAddressSpace();
854 // Do not shrink an aligned scalar load to sub-dword.
855 // Scalar engine cannot do sub-dword loads.
856 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
857 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
861 MN->isInvariant())) &&
863 return false;
864
865 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
866 // extloads, so doing one requires using a buffer_load. In cases where we
867 // still couldn't use a scalar load, using the wider load shouldn't really
868 // hurt anything.
869
870 // If the old size already had to be an extload, there's no harm in continuing
871 // to reduce the width.
872 return (OldSize < 32);
873}
874
876 const SelectionDAG &DAG,
877 const MachineMemOperand &MMO) const {
878
879 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
880
881 if (LoadTy.getScalarType() == MVT::i32)
882 return false;
883
884 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
885 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
886
887 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
888 return false;
889
890 unsigned Fast = 0;
892 CastTy, MMO, &Fast) &&
893 Fast;
894}
895
896// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
897// profitable with the expansion for 64-bit since it's generally good to
898// speculate things.
900 return true;
901}
902
904 return true;
905}
906
908 switch (N->getOpcode()) {
909 case ISD::EntryToken:
910 case ISD::TokenFactor:
911 return true;
913 unsigned IntrID = N->getConstantOperandVal(0);
915 }
917 unsigned IntrID = N->getConstantOperandVal(1);
919 }
920 case ISD::LOAD:
921 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
923 return true;
924 return false;
925 case AMDGPUISD::SETCC: // ballot-style instruction
926 return true;
927 }
928 return false;
929}
930
932 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
933 NegatibleCost &Cost, unsigned Depth) const {
934
935 switch (Op.getOpcode()) {
936 case ISD::FMA:
937 case ISD::FMAD: {
938 // Negating a fma is not free if it has users without source mods.
939 if (!allUsesHaveSourceMods(Op.getNode()))
940 return SDValue();
941 break;
942 }
943 case AMDGPUISD::RCP: {
944 SDValue Src = Op.getOperand(0);
945 EVT VT = Op.getValueType();
946 SDLoc SL(Op);
947
948 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
949 ForCodeSize, Cost, Depth + 1);
950 if (NegSrc)
951 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
952 return SDValue();
953 }
954 default:
955 break;
956 }
957
958 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
959 ForCodeSize, Cost, Depth);
960}
961
962//===---------------------------------------------------------------------===//
963// Target Properties
964//===---------------------------------------------------------------------===//
965
968
969 // Packed operations do not have a fabs modifier.
970 return VT == MVT::f32 || VT == MVT::f64 ||
971 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
972}
973
976 // Report this based on the end legalized type.
977 VT = VT.getScalarType();
978 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
979}
980
982 unsigned NumElem,
983 unsigned AS) const {
984 return true;
985}
986
988 // There are few operations which truly have vector input operands. Any vector
989 // operation is going to involve operations on each component, and a
990 // build_vector will be a copy per element, so it always makes sense to use a
991 // build_vector input in place of the extracted element to avoid a copy into a
992 // super register.
993 //
994 // We should probably only do this if all users are extracts only, but this
995 // should be the common case.
996 return true;
997}
998
1000 // Truncate is just accessing a subregister.
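 // For example, truncating i64 to i32 just reads the low 32-bit half of the
 // register pair.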
1001
1002 unsigned SrcSize = Source.getSizeInBits();
1003 unsigned DestSize = Dest.getSizeInBits();
1004
 1005 return DestSize < SrcSize && DestSize % 32 == 0;
1006}
1007
1009 // Truncate is just accessing a subregister.
1010
1011 unsigned SrcSize = Source->getScalarSizeInBits();
1012 unsigned DestSize = Dest->getScalarSizeInBits();
1013
 1014 if (DestSize == 16 && Subtarget->has16BitInsts())
1015 return SrcSize >= 32;
1016
1017 return DestSize < SrcSize && DestSize % 32 == 0;
1018}
1019
1021 unsigned SrcSize = Src->getScalarSizeInBits();
1022 unsigned DestSize = Dest->getScalarSizeInBits();
1023
1024 if (SrcSize == 16 && Subtarget->has16BitInsts())
1025 return DestSize >= 32;
1026
1027 return SrcSize == 32 && DestSize == 64;
1028}
1029
1031 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
 1032 // practical purposes, the extra mov 0 to load a 64-bit value is free. As used,
 1033 // this will enable reducing 64-bit operations to 32-bit, which is always
1034 // good.
1035
1036 if (Src == MVT::i16)
 1037 return Dest == MVT::i32 || Dest == MVT::i64;
1038
1039 return Src == MVT::i32 && Dest == MVT::i64;
1040}
1041
1043 EVT DestVT) const {
1044 switch (N->getOpcode()) {
1045 case ISD::ADD:
1046 case ISD::SUB:
1047 case ISD::SHL:
1048 case ISD::SRL:
1049 case ISD::SRA:
1050 case ISD::AND:
1051 case ISD::OR:
1052 case ISD::XOR:
1053 case ISD::MUL:
1054 case ISD::SETCC:
1055 case ISD::SELECT:
1056 case ISD::SMIN:
1057 case ISD::SMAX:
1058 case ISD::UMIN:
1059 case ISD::UMAX:
1060 if (Subtarget->has16BitInsts() &&
1061 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1062 // Don't narrow back down to i16 if promoted to i32 already.
1063 if (!N->isDivergent() && DestVT.isInteger() &&
1064 DestVT.getScalarSizeInBits() > 1 &&
1065 DestVT.getScalarSizeInBits() <= 16 &&
1066 SrcVT.getScalarSizeInBits() > 16) {
1067 return false;
1068 }
1069 }
1070 return true;
1071 default:
1072 break;
1073 }
1074
1075 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1076 // limited number of native 64-bit operations. Shrinking an operation to fit
1077 // in a single 32-bit register should always be helpful. As currently used,
1078 // this is much less general than the name suggests, and is only used in
1079 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1080 // not profitable, and may actually be harmful.
1081 if (isa<LoadSDNode>(N))
1082 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1083
1084 return true;
1085}
1086
1088 const SDNode* N, CombineLevel Level) const {
1089 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1090 N->getOpcode() == ISD::SRL) &&
1091 "Expected shift op");
1092
1093 SDValue ShiftLHS = N->getOperand(0);
1094 if (!ShiftLHS->hasOneUse())
1095 return false;
1096
1097 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1098 !ShiftLHS.getOperand(0)->hasOneUse())
1099 return false;
1100
1101 // Always commute pre-type legalization and right shifts.
1102 // We're looking for shl(or(x,y),z) patterns.
1104 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1105 return true;
1106
 1107 // If the only user is an i32 right-shift, don't destroy a BFE pattern.
1108 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1109 (N->user_begin()->getOpcode() == ISD::SRA ||
1110 N->user_begin()->getOpcode() == ISD::SRL))
1111 return false;
1112
1113 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1114 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1115 if (LHS.getOpcode() != ISD::SHL)
1116 return false;
1117 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1118 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1119 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1120 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1121 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1122 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1123 };
1124 SDValue LHS = N->getOperand(0).getOperand(0);
1125 SDValue RHS = N->getOperand(0).getOperand(1);
1126 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1127}
1128
1129//===---------------------------------------------------------------------===//
1130// TargetLowering Callbacks
1131//===---------------------------------------------------------------------===//
1132
1134 bool IsVarArg) {
1135 switch (CC) {
1143 return CC_AMDGPU;
1146 return CC_AMDGPU_CS_CHAIN;
1147 case CallingConv::C:
1148 case CallingConv::Fast:
1149 case CallingConv::Cold:
1150 return CC_AMDGPU_Func;
1153 return CC_SI_Gfx;
1156 default:
1157 reportFatalUsageError("unsupported calling convention for call");
1158 }
1159}
1160
1162 bool IsVarArg) {
1163 switch (CC) {
1166 llvm_unreachable("kernels should not be handled here");
1176 return RetCC_SI_Shader;
1179 return RetCC_SI_Gfx;
1180 case CallingConv::C:
1181 case CallingConv::Fast:
1182 case CallingConv::Cold:
1183 return RetCC_AMDGPU_Func;
1184 default:
1185 reportFatalUsageError("unsupported calling convention");
1186 }
1187}
1188
1189/// The SelectionDAGBuilder will automatically promote function arguments
1190/// with illegal types. However, this does not work for the AMDGPU targets
1191/// since the function arguments are stored in memory as these illegal types.
 1192/// In order to handle this properly we need to get the original type sizes
 1193/// from the LLVM IR Function and fix up the ISD::InputArg values before
1194/// passing them to AnalyzeFormalArguments()
1195
1196/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1197/// input values across multiple registers. Each item in the Ins array
1198/// represents a single value that will be stored in registers. Ins[x].VT is
1199/// the value type of the value that will be stored in the register, so
1200/// whatever SDNode we lower the argument to needs to be this type.
1201///
1202/// In order to correctly lower the arguments we need to know the size of each
1203/// argument. Since Ins[x].VT gives us the size of the register that will
1204/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1205/// for the original function argument so that we can deduce the correct memory
1206/// type to use for Ins[x]. In most cases the correct memory type will be
1207/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1208/// we have a kernel argument of type v8i8, this argument will be split into
1209/// 8 parts and each part will be represented by its own item in the Ins array.
1210/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1211/// the argument before it was split. From this, we deduce that the memory type
1212/// for each individual part is i8. We pass the memory type as LocVT to the
1213/// calling convention analysis function and the register type (Ins[x].VT) as
1214/// the ValVT.
1216 CCState &State,
1217 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1218 const MachineFunction &MF = State.getMachineFunction();
1219 const Function &Fn = MF.getFunction();
1220 LLVMContext &Ctx = Fn.getContext();
1221 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1222 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1224
1225 Align MaxAlign = Align(1);
1226 uint64_t ExplicitArgOffset = 0;
1227 const DataLayout &DL = Fn.getDataLayout();
1228
1229 unsigned InIndex = 0;
1230
1231 for (const Argument &Arg : Fn.args()) {
1232 const bool IsByRef = Arg.hasByRefAttr();
1233 Type *BaseArgTy = Arg.getType();
1234 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1235 Align Alignment = DL.getValueOrABITypeAlignment(
1236 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1237 MaxAlign = std::max(Alignment, MaxAlign);
1238 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1239
1240 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1241 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1242
1243 // We're basically throwing away everything passed into us and starting over
1244 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1245 // to us as computed in Ins.
1246 //
1247 // We also need to figure out what type legalization is trying to do to get
1248 // the correct memory offsets.
1249
1250 SmallVector<EVT, 16> ValueVTs;
1252 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1253 &Offsets, ArgOffset);
1254
1255 for (unsigned Value = 0, NumValues = ValueVTs.size();
1256 Value != NumValues; ++Value) {
1257 uint64_t BasePartOffset = Offsets[Value];
1258
1259 EVT ArgVT = ValueVTs[Value];
1260 EVT MemVT = ArgVT;
1261 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1262 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1263
1264 if (NumRegs == 1) {
1265 // This argument is not split, so the IR type is the memory type.
1266 if (ArgVT.isExtended()) {
1267 // We have an extended type, like i24, so we should just use the
1268 // register type.
1269 MemVT = RegisterVT;
1270 } else {
1271 MemVT = ArgVT;
1272 }
1273 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1274 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1275 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1276 // We have a vector value which has been split into a vector with
1277 // the same scalar type, but fewer elements. This should handle
1278 // all the floating-point vector types.
1279 MemVT = RegisterVT;
1280 } else if (ArgVT.isVector() &&
1281 ArgVT.getVectorNumElements() == NumRegs) {
1282 // This arg has been split so that each element is stored in a separate
1283 // register.
1284 MemVT = ArgVT.getScalarType();
1285 } else if (ArgVT.isExtended()) {
1286 // We have an extended type, like i65.
1287 MemVT = RegisterVT;
1288 } else {
1289 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1290 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1291 if (RegisterVT.isInteger()) {
1292 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1293 } else if (RegisterVT.isVector()) {
1294 assert(!RegisterVT.getScalarType().isFloatingPoint());
1295 unsigned NumElements = RegisterVT.getVectorNumElements();
1296 assert(MemoryBits % NumElements == 0);
1297 // This vector type has been split into another vector type with
1298 // a different elements size.
1299 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1300 MemoryBits / NumElements);
1301 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1302 } else {
1303 llvm_unreachable("cannot deduce memory type.");
1304 }
1305 }
1306
1307 // Convert one element vectors to scalar.
1308 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1309 MemVT = MemVT.getScalarType();
1310
1311 // Round up vec3/vec5 argument.
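 // For example, a v3i32 argument uses a v4i32 memory type.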
1312 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1313 MemVT = MemVT.getPow2VectorType(State.getContext());
1314 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1315 MemVT = MemVT.getRoundIntegerType(State.getContext());
1316 }
1317
1318 unsigned PartOffset = 0;
1319 for (unsigned i = 0; i != NumRegs; ++i) {
1320 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1321 BasePartOffset + PartOffset,
1322 MemVT.getSimpleVT(),
1324 PartOffset += MemVT.getStoreSize();
1325 }
1326 }
1327 }
1328}
1329
1331 SDValue Chain, CallingConv::ID CallConv,
1332 bool isVarArg,
1334 const SmallVectorImpl<SDValue> &OutVals,
1335 const SDLoc &DL, SelectionDAG &DAG) const {
1336 // FIXME: Fails for r600 tests
1337 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1338 // "wave terminate should not have return values");
1339 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1340}
1341
1342//===---------------------------------------------------------------------===//
1343// Target specific lowering
1344//===---------------------------------------------------------------------===//
1345
1346/// Selects the correct CCAssignFn for a given CallingConvention value.
1351
1356
1358 SelectionDAG &DAG,
1359 MachineFrameInfo &MFI,
1360 int ClobberedFI) const {
1361 SmallVector<SDValue, 8> ArgChains;
1362 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1363 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1364
1365 // Include the original chain at the beginning of the list. When this is
1366 // used by target LowerCall hooks, this helps legalize find the
1367 // CALLSEQ_BEGIN node.
1368 ArgChains.push_back(Chain);
1369
1370 // Add a chain value for each stack argument corresponding
1371 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1372 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1373 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1374 if (FI->getIndex() < 0) {
1375 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1376 int64_t InLastByte = InFirstByte;
1377 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1378
1379 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1380 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1381 ArgChains.push_back(SDValue(L, 1));
1382 }
1383 }
1384 }
1385 }
1386
1387 // Build a tokenfactor for all the chains.
1388 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1389}
1390
1393 StringRef Reason) const {
1394 SDValue Callee = CLI.Callee;
1395 SelectionDAG &DAG = CLI.DAG;
1396
1397 const Function &Fn = DAG.getMachineFunction().getFunction();
1398
1399 StringRef FuncName("<unknown>");
1400
1402 FuncName = G->getSymbol();
1403 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1404 FuncName = G->getGlobal()->getName();
1405
1406 DAG.getContext()->diagnose(
1407 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1408
1409 if (!CLI.IsTailCall) {
1410 for (ISD::InputArg &Arg : CLI.Ins)
1411 InVals.push_back(DAG.getPOISON(Arg.VT));
1412 }
1413
1414 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1415 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1416 return CLI.Chain;
1417
1418 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1419 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1420}
1421
1423 SmallVectorImpl<SDValue> &InVals) const {
1424 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1425}
1426
1428 SelectionDAG &DAG) const {
1429 const Function &Fn = DAG.getMachineFunction().getFunction();
1430
1432 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1433 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1434 return DAG.getMergeValues(Ops, SDLoc());
1435}
1436
1438 SelectionDAG &DAG) const {
1439 switch (Op.getOpcode()) {
1440 default:
1441 Op->print(errs(), &DAG);
1442 llvm_unreachable("Custom lowering code for this "
1443 "instruction is not implemented yet!");
1444 break;
1446 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1448 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1449 case ISD::SDIVREM:
1450 return LowerSDIVREM(Op, DAG);
1451 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1452 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1453 case ISD::FRINT: return LowerFRINT(Op, DAG);
1454 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1455 case ISD::FROUNDEVEN:
1456 return LowerFROUNDEVEN(Op, DAG);
1457 case ISD::FROUND: return LowerFROUND(Op, DAG);
1458 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1459 case ISD::FLOG2:
1460 return LowerFLOG2(Op, DAG);
1461 case ISD::FLOG:
1462 case ISD::FLOG10:
1463 return LowerFLOGCommon(Op, DAG);
1464 case ISD::FEXP:
1465 case ISD::FEXP10:
1466 return lowerFEXP(Op, DAG);
1467 case ISD::FEXP2:
1468 return lowerFEXP2(Op, DAG);
1469 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1470 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1471 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1472 case ISD::FP_TO_SINT:
1473 case ISD::FP_TO_UINT:
1474 return LowerFP_TO_INT(Op, DAG);
1475 case ISD::CTTZ:
1477 case ISD::CTLZ:
1479 return LowerCTLZ_CTTZ(Op, DAG);
1481 }
1482 return Op;
1483}
1484
1487 SelectionDAG &DAG) const {
1488 switch (N->getOpcode()) {
1490 // Different parts of legalization seem to interpret which type of
1491 // sign_extend_inreg is the one to check for custom lowering. The extended
1492 // from type is what really matters, but some places check for custom
1493 // lowering of the result type. This results in trying to use
1494 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1495 // nothing here and let the illegal result integer be handled normally.
1496 return;
1497 case ISD::FLOG2:
1498 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1499 Results.push_back(Lowered);
1500 return;
1501 case ISD::FLOG:
1502 case ISD::FLOG10:
1503 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1504 Results.push_back(Lowered);
1505 return;
1506 case ISD::FEXP2:
1507 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1508 Results.push_back(Lowered);
1509 return;
1510 case ISD::FEXP:
1511 case ISD::FEXP10:
1512 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1513 Results.push_back(Lowered);
1514 return;
1515 case ISD::CTLZ:
1517 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1518 Results.push_back(Lowered);
1519 return;
1520 default:
1521 return;
1522 }
1523}
1524
1526 SDValue Op,
1527 SelectionDAG &DAG) const {
1528
1529 const DataLayout &DL = DAG.getDataLayout();
1531 const GlobalValue *GV = G->getGlobal();
1532
1533 if (!MFI->isModuleEntryFunction()) {
1534 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1535 if (std::optional<uint32_t> Address =
1537 if (IsNamedBarrier) {
1538 unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1539 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1540 }
1541 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1542 } else if (IsNamedBarrier) {
1543 llvm_unreachable("named barrier should have an assigned address");
1544 }
1545 }
1546
1547 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1548 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1549 if (!MFI->isModuleEntryFunction() &&
1550 GV->getName() != "llvm.amdgcn.module.lds" &&
1552 SDLoc DL(Op);
1553 const Function &Fn = DAG.getMachineFunction().getFunction();
1555 Fn, "local memory global used by non-kernel function",
1556 DL.getDebugLoc(), DS_Warning));
1557
1558 // We currently don't have a way to correctly allocate LDS objects that
1559 // aren't directly associated with a kernel. We do force inlining of
1560 // functions that use local objects. However, if these dead functions are
1561 // not eliminated, we don't want a compile time error. Just emit a warning
1562 // and a trap, since there should be no callable path here.
1563 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1564 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1565 Trap, DAG.getRoot());
1566 DAG.setRoot(OutputChain);
1567 return DAG.getPOISON(Op.getValueType());
1568 }
1569
1570 // XXX: What does the value of G->getOffset() mean?
1571 assert(G->getOffset() == 0 &&
1572 "Do not know what to do with an non-zero offset");
1573
1574 // TODO: We could emit code to handle the initialization somewhere.
1575 // We ignore the initializer for now and legalize it to allow selection.
1576 // The initializer will anyway get errored out during assembly emission.
1577 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1578 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1579 }
1580 return SDValue();
1581}
1582
1584 SelectionDAG &DAG) const {
1586 SDLoc SL(Op);
1587
1588 EVT VT = Op.getValueType();
1589 if (VT.getVectorElementType().getSizeInBits() < 32) {
1590 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1591 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1592 unsigned NewNumElt = OpBitSize / 32;
1593 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1595 MVT::i32, NewNumElt);
1596 for (const SDUse &U : Op->ops()) {
1597 SDValue In = U.get();
1598 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1599 if (NewNumElt > 1)
1600 DAG.ExtractVectorElements(NewIn, Args);
1601 else
1602 Args.push_back(NewIn);
1603 }
1604
1605 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1606 NewNumElt * Op.getNumOperands());
1607 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1608 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1609 }
1610 }
1611
1612 for (const SDUse &U : Op->ops())
1613 DAG.ExtractVectorElements(U.get(), Args);
1614
1615 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1616}
1617
1619 SelectionDAG &DAG) const {
1620 SDLoc SL(Op);
1622 unsigned Start = Op.getConstantOperandVal(1);
1623 EVT VT = Op.getValueType();
1624 EVT SrcVT = Op.getOperand(0).getValueType();
1625
1626 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1627 unsigned NumElt = VT.getVectorNumElements();
1628 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1629 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1630
1631 // Extract 32-bit registers at a time.
1632 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1633 EVT NewVT = NumElt == 2
1634 ? MVT::i32
1635 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1636 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1637
1638 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1639 if (NumElt == 2)
1640 Tmp = Args[0];
1641 else
1642 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1643
1644 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1645 }
1646
1647 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1649
1650 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1651}
1652
1653// TODO: Handle fabs too
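// Strip a single outer fneg, if present, and return the underlying value.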
1655 if (Val.getOpcode() == ISD::FNEG)
1656 return Val.getOperand(0);
1657
1658 return Val;
1659}
1660
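// Look through an outer fneg, fabs, and/or fcopysign to find the underlying value.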
1662 if (Val.getOpcode() == ISD::FNEG)
1663 Val = Val.getOperand(0);
1664 if (Val.getOpcode() == ISD::FABS)
1665 Val = Val.getOperand(0);
1666 if (Val.getOpcode() == ISD::FCOPYSIGN)
1667 Val = Val.getOperand(0);
1668 return Val;
1669}
1670
1672 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1673 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1674 SelectionDAG &DAG = DCI.DAG;
1675 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1676 switch (CCOpcode) {
1677 case ISD::SETOEQ:
1678 case ISD::SETONE:
1679 case ISD::SETUNE:
1680 case ISD::SETNE:
1681 case ISD::SETUEQ:
1682 case ISD::SETEQ:
1683 case ISD::SETFALSE:
1684 case ISD::SETFALSE2:
1685 case ISD::SETTRUE:
1686 case ISD::SETTRUE2:
1687 case ISD::SETUO:
1688 case ISD::SETO:
1689 break;
1690 case ISD::SETULE:
1691 case ISD::SETULT: {
1692 if (LHS == True)
1693 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1694 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1695 }
1696 case ISD::SETOLE:
1697 case ISD::SETOLT:
1698 case ISD::SETLE:
1699 case ISD::SETLT: {
1700 // Ordered. Assume ordered for undefined.
1701
1702 // Only do this after legalization to avoid interfering with other combines
1703 // which might occur.
1705 !DCI.isCalledByLegalizer())
1706 return SDValue();
1707
1708 // We need to permute the operands to get the correct NaN behavior. The
1709 // selected operand is the second one based on the failing compare with NaN,
1710 // so permute it based on the compare type the hardware uses.
1711 if (LHS == True)
1712 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1713 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1714 }
1715 case ISD::SETUGE:
1716 case ISD::SETUGT: {
1717 if (LHS == True)
1718 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1719 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1720 }
1721 case ISD::SETGT:
1722 case ISD::SETGE:
1723 case ISD::SETOGE:
1724 case ISD::SETOGT: {
1726 !DCI.isCalledByLegalizer())
1727 return SDValue();
1728
1729 if (LHS == True)
1730 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1731 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1732 }
1733 case ISD::SETCC_INVALID:
1734 llvm_unreachable("Invalid setcc condcode!");
1735 }
1736 return SDValue();
1737}
1738
1739/// Generate Min/Max node
1741 SDValue LHS, SDValue RHS,
1742 SDValue True, SDValue False,
1743 SDValue CC,
1744 DAGCombinerInfo &DCI) const {
1745 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1746 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1747
1748 SelectionDAG &DAG = DCI.DAG;
1749
1750 // If we can't directly match this, try to see if we can fold an fneg to
1751 // match.
1752
1755 SDValue NegTrue = peekFNeg(True);
1756
1757 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1758 // fmin/fmax.
1759 //
1760 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1761 // -> fneg (fmin_legacy lhs, K)
1762 //
1763 // TODO: Use getNegatedExpression
1764 if (LHS == NegTrue && CFalse && CRHS) {
1765 APFloat NegRHS = neg(CRHS->getValueAPF());
1766 if (NegRHS == CFalse->getValueAPF()) {
1767 SDValue Combined =
1768 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1769 if (Combined)
1770 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1771 return SDValue();
1772 }
1773 }
1774
1775 return SDValue();
1776}
1777
1778std::pair<SDValue, SDValue>
1780 SDLoc SL(Op);
1781
1782 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1783
1784 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1785 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1786
1787 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1788 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1789
1790 return std::pair(Lo, Hi);
1791}
1792
1794 SDLoc SL(Op);
1795
1796 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1797 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1798 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1799}
1800
1802 SDLoc SL(Op);
1803
1804 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1805 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1806 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1807}
1808
1809// Split a vector type into two parts. The first part is a power of two vector.
1810// The second part is whatever is left over, and is a scalar if it would
1811// otherwise be a 1-vector.
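// For example, <3 x i32> splits into <2 x i32> and i32, while <6 x float>
// splits into <4 x float> and <2 x float>.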
1812std::pair<EVT, EVT>
1814 EVT LoVT, HiVT;
1815 EVT EltVT = VT.getVectorElementType();
1816 unsigned NumElts = VT.getVectorNumElements();
1817 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1818 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1819 HiVT = NumElts - LoNumElts == 1
1820 ? EltVT
1821 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1822 return std::pair(LoVT, HiVT);
1823}
1824
1825// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1826// scalar.
1827std::pair<SDValue, SDValue>
1829 const EVT &LoVT, const EVT &HiVT,
1830 SelectionDAG &DAG) const {
1831 EVT VT = N.getValueType();
1833 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1834 VT.getVectorNumElements() &&
1835 "More vector elements requested than available!");
1837 DAG.getVectorIdxConstant(0, DL));
1838
1839 unsigned LoNumElts = LoVT.getVectorNumElements();
1840
1841 if (HiVT.isVector()) {
1842 unsigned HiNumElts = HiVT.getVectorNumElements();
1843 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1844 // Avoid creating an extract_subvector with an index that isn't a multiple
 1845 // of the result type's element count.
1847 DAG.getConstant(LoNumElts, DL, MVT::i32));
1848 return {Lo, Hi};
1849 }
1850
1852 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1853 /*Count=*/HiNumElts);
1854 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1855 return {Lo, Hi};
1856 }
1857
1859 DAG.getVectorIdxConstant(LoNumElts, DL));
1860 return {Lo, Hi};
1861}
1862
1864 SelectionDAG &DAG) const {
1866 EVT VT = Op.getValueType();
1867 SDLoc SL(Op);
1868
1869
1870 // If this is a 2 element vector, we really want to scalarize and not create
1871 // weird 1 element vectors.
1872 if (VT.getVectorNumElements() == 2) {
1873 SDValue Ops[2];
1874 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1875 return DAG.getMergeValues(Ops, SL);
1876 }
1877
1878 SDValue BasePtr = Load->getBasePtr();
1879 EVT MemVT = Load->getMemoryVT();
1880
1881 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1882
1883 EVT LoVT, HiVT;
1884 EVT LoMemVT, HiMemVT;
1885 SDValue Lo, Hi;
1886
1887 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1888 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1889 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1890
1891 unsigned Size = LoMemVT.getStoreSize();
1892 Align BaseAlign = Load->getAlign();
1893 Align HiAlign = commonAlignment(BaseAlign, Size);
1894
1895 SDValue LoLoad = DAG.getExtLoad(
1896 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1897 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1898 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1899 SDValue HiLoad = DAG.getExtLoad(
1900 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1901 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1902 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1903
1904 SDValue Join;
1905 if (LoVT == HiVT) {
1906    // This is the case where the vector length is a power of two, so it was split evenly.
1907 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1908 } else {
1909 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1910 DAG.getVectorIdxConstant(0, SL));
1911 Join = DAG.getNode(
1913 VT, Join, HiLoad,
1915 }
1916
1917 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1918 LoLoad.getValue(1), HiLoad.getValue(1))};
1919
1920 return DAG.getMergeValues(Ops, SL);
1921}
1922
1924 SelectionDAG &DAG) const {
1926 EVT VT = Op.getValueType();
1927 SDValue BasePtr = Load->getBasePtr();
1928 EVT MemVT = Load->getMemoryVT();
1929 SDLoc SL(Op);
1930 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1931 Align BaseAlign = Load->getAlign();
1932 unsigned NumElements = MemVT.getVectorNumElements();
1933
1934 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1935 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1936 if (NumElements != 3 ||
1937 (BaseAlign < Align(8) &&
1938 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1939 return SplitVectorLoad(Op, DAG);
1940
1941 assert(NumElements == 3);
1942
1943 EVT WideVT =
1945 EVT WideMemVT =
1947 SDValue WideLoad = DAG.getExtLoad(
1948 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1949 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1950 return DAG.getMergeValues(
1951 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1952 DAG.getVectorIdxConstant(0, SL)),
1953 WideLoad.getValue(1)},
1954 SL);
1955}
1956
1958 SelectionDAG &DAG) const {
1960 SDValue Val = Store->getValue();
1961 EVT VT = Val.getValueType();
1962
1963  // If this is a 2-element vector, we really want to scalarize and not create
1964  // weird 1-element vectors.
1965 if (VT.getVectorNumElements() == 2)
1966 return scalarizeVectorStore(Store, DAG);
1967
1968 EVT MemVT = Store->getMemoryVT();
1969 SDValue Chain = Store->getChain();
1970 SDValue BasePtr = Store->getBasePtr();
1971 SDLoc SL(Op);
1972
1973 EVT LoVT, HiVT;
1974 EVT LoMemVT, HiMemVT;
1975 SDValue Lo, Hi;
1976
1977 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1978 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1979 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1980
1981 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1982
1983 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1984 Align BaseAlign = Store->getAlign();
1985 unsigned Size = LoMemVT.getStoreSize();
1986 Align HiAlign = commonAlignment(BaseAlign, Size);
1987
1988 SDValue LoStore =
1989 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1990 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1991 SDValue HiStore = DAG.getTruncStore(
1992 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
1993 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1994
1995 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1996}
1997
1998// This is a shortcut for integer division because we have fast i32<->f32
1999// conversions, and fast f32 reciprocal instructions. The fractional part of a
2000// float is enough to accurately represent up to a 24-bit signed integer.
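// A minimal host-side sketch of the signed case (illustrative only; it mirrors
// the commented pseudocode below rather than the exact DAG nodes, and assumes
// both operands already fit in 24 signed bits and that <math.h> is available):
//
//   void divrem24(int a, int b, int *q, int *r) {
//     float fa = (float)a, fb = (float)b;
//     float fq = truncf(fa * (1.0f / fb));      // rcp + fmul + ftrunc
//     int   iq = (int)fq;
//     float fr = fabsf(-fq * fb + fa);          // mad(fqneg, fb, fa), then fabs
//     int   jq = ((a ^ b) < 0) ? -1 : 1;        // the sra/or trick below
//     *q = iq + ((fr >= fabsf(fb)) ? jq : 0);   // off-by-one correction
//     *r = a - *q * b;                          // recompute the remainder
//   }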
2002 bool Sign) const {
2003 SDLoc DL(Op);
2004 EVT VT = Op.getValueType();
2005 SDValue LHS = Op.getOperand(0);
2006 SDValue RHS = Op.getOperand(1);
2007 MVT IntVT = MVT::i32;
2008 MVT FltVT = MVT::f32;
2009
2010 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2011 if (LHSSignBits < 9)
2012 return SDValue();
2013
2014 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2015 if (RHSSignBits < 9)
2016 return SDValue();
2017
2018 unsigned BitSize = VT.getSizeInBits();
2019 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2020 unsigned DivBits = BitSize - SignBits;
2021 if (Sign)
2022 ++DivBits;
2023
2026
2027 SDValue jq = DAG.getConstant(1, DL, IntVT);
2028
2029 if (Sign) {
2030 // char|short jq = ia ^ ib;
2031 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2032
2033 // jq = jq >> (bitsize - 2)
2034 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2035 DAG.getConstant(BitSize - 2, DL, VT));
2036
2037 // jq = jq | 0x1
2038 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2039 }
2040
2041 // int ia = (int)LHS;
2042 SDValue ia = LHS;
2043
2044  // int ib = (int)RHS;
2045 SDValue ib = RHS;
2046
2047 // float fa = (float)ia;
2048 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2049
2050 // float fb = (float)ib;
2051 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2052
2053 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2054 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2055
2056 // fq = trunc(fq);
2057 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2058
2059 // float fqneg = -fq;
2060 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2061
2063
2064 bool UseFmadFtz = false;
2065 if (Subtarget->isGCN()) {
2067 UseFmadFtz =
2069 }
2070
2071 // float fr = mad(fqneg, fb, fa);
2072 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2073 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2075 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2076
2077 // int iq = (int)fq;
2078 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2079
2080 // fr = fabs(fr);
2081 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2082
2083 // fb = fabs(fb);
2084 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2085
2086 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2087
2088 // int cv = fr >= fb;
2089 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2090
2091 // jq = (cv ? jq : 0);
2092 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2093
2094 // dst = iq + jq;
2095 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2096
2097  // Rem needs compensation; it's easier to recompute it.
2098 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2099 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2100
2101  // Truncate to the number of bits this divide really operates on.
2102 if (Sign) {
2103 SDValue InRegSize
2104 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2105 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2106 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2107 } else {
2108 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2109 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2110 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2111 }
2112
2113 return DAG.getMergeValues({ Div, Rem }, DL);
2114}
2115
2117 SelectionDAG &DAG,
2119 SDLoc DL(Op);
2120 EVT VT = Op.getValueType();
2121
2122 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2123
2124 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2125
2126 SDValue One = DAG.getConstant(1, DL, HalfVT);
2127 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2128
2129  // HiLo split
2130 SDValue LHS_Lo, LHS_Hi;
2131 SDValue LHS = Op.getOperand(0);
2132 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2133
2134 SDValue RHS_Lo, RHS_Hi;
2135 SDValue RHS = Op.getOperand(1);
2136 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2137
2138 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2139 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2140
2141 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2142 LHS_Lo, RHS_Lo);
2143
2144 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2145 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2146
2147 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2148 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2149 return;
2150 }
2151
2152 if (isTypeLegal(MVT::i64)) {
2153 // The algorithm here is based on ideas from "Software Integer Division",
2154 // Tom Rodeheffer, August 2008.
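    // Roughly, the steps below are (a sketch, not the exact node sequence; the
    // initial reciprocal seed comes from the f32 RCP sequence that follows, and
    // mulhu64 stands for the high 64 bits of a 64x64->128 multiply):
    //
    //   rcp += mulhu64(rcp, 0 - d * rcp);   // Newton-Raphson round 1
    //   rcp += mulhu64(rcp, 0 - d * rcp);   // Newton-Raphson round 2
    //   q = mulhu64(n, rcp);                // quotient estimate
    //   r = n - q * d;                      // remainder estimate
    //   while (r >= d) { ++q; r -= d; }     // at most two corrections are needed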
2155
2158
2159 // Compute denominator reciprocal.
2160 unsigned FMAD =
2161 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2164 : (unsigned)AMDGPUISD::FMAD_FTZ;
2165
2166 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2167 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2168 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2169 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2170 Cvt_Lo);
2171 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2172 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2173 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2174 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2175 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2176 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2177 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2178 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2179 Mul1);
2180 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2181 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2182 SDValue Rcp64 = DAG.getBitcast(VT,
2183 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2184
2185 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2186 SDValue One64 = DAG.getConstant(1, DL, VT);
2187 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2188 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2189
2190 // First round of UNR (Unsigned integer Newton-Raphson).
2191 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2192 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2193 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2194 SDValue Mulhi1_Lo, Mulhi1_Hi;
2195 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2196 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2197 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2198 Mulhi1_Lo, Zero1);
2199 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2200 Mulhi1_Hi, Add1_Lo.getValue(1));
2201 SDValue Add1 = DAG.getBitcast(VT,
2202 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2203
2204 // Second round of UNR.
2205 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2206 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2207 SDValue Mulhi2_Lo, Mulhi2_Hi;
2208 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2209 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2210 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2211 Mulhi2_Lo, Zero1);
2212 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2213 Mulhi2_Hi, Add2_Lo.getValue(1));
2214 SDValue Add2 = DAG.getBitcast(VT,
2215 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2216
2217 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2218
2219 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2220
2221 SDValue Mul3_Lo, Mul3_Hi;
2222 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2223 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2224 Mul3_Lo, Zero1);
2225 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2226 Mul3_Hi, Sub1_Lo.getValue(1));
2227 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2228 SDValue Sub1 = DAG.getBitcast(VT,
2229 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2230
2231 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2232 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2233 ISD::SETUGE);
2234 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2235 ISD::SETUGE);
2236 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2237
2238    // TODO: Here and below, portions of the code could be enclosed in if/endif.
2239    // Currently control flow is unconditional and we have 4 selects after the
2240    // potential endif to substitute for PHIs.
2241
2242 // if C3 != 0 ...
2243 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2244 RHS_Lo, Zero1);
2245 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2246 RHS_Hi, Sub1_Lo.getValue(1));
2247 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2248 Zero, Sub2_Lo.getValue(1));
2249 SDValue Sub2 = DAG.getBitcast(VT,
2250 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2251
2252 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2253
2254 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2255 ISD::SETUGE);
2256 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2257 ISD::SETUGE);
2258 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2259
2260 // if (C6 != 0)
2261 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2262
2263 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2264 RHS_Lo, Zero1);
2265 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2266 RHS_Hi, Sub2_Lo.getValue(1));
2267 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2268 Zero, Sub3_Lo.getValue(1));
2269 SDValue Sub3 = DAG.getBitcast(VT,
2270 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2271
2272 // endif C6
2273 // endif C3
2274
2275 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2276 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2277
2278 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2279 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2280
2281 Results.push_back(Div);
2282 Results.push_back(Rem);
2283
2284 return;
2285 }
2286
2287  // r600 expansion.
2288  // Get speculative values.
2289 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2290 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2291
2292 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2293 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2294 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2295
2296 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2297 SDValue DIV_Lo = Zero;
2298
2299 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2300
2301 for (unsigned i = 0; i < halfBitWidth; ++i) {
2302 const unsigned bitPos = halfBitWidth - i - 1;
2303 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2304 // Get value of high bit
2305 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2306 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2307 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2308
2309 // Shift
2310 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2311 // Add LHS high bit
2312 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2313
2314 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2315 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2316
2317 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2318
2319 // Update REM
2320 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2321 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2322 }
2323
2324 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2325 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2326 Results.push_back(DIV);
2327 Results.push_back(REM);
2328}
2329
2331 SelectionDAG &DAG) const {
2332 SDLoc DL(Op);
2333 EVT VT = Op.getValueType();
2334
2335 if (VT == MVT::i64) {
2337 LowerUDIVREM64(Op, DAG, Results);
2338 return DAG.getMergeValues(Results, DL);
2339 }
2340
2341 if (VT == MVT::i32) {
2342 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2343 return Res;
2344 }
2345
2346 SDValue X = Op.getOperand(0);
2347 SDValue Y = Op.getOperand(1);
2348
2349 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2350 // algorithm used here.
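  // A scalar sketch of the sequence built below (illustrative only; urecip(y)
  // stands for the hardware unsigned reciprocal, roughly floor(2^32 / y), and
  // mulhu is the high half of a 32x32->64 multiply):
  //
  //   uint32_t z = urecip(y);             // initial estimate of 2^32 / y
  //   z += mulhu(z, 0u - y * z);          // one Newton-Raphson refinement
  //   uint32_t q = mulhu(x, z);           // quotient estimate
  //   uint32_t r = x - q * y;             // remainder estimate
  //   if (r >= y) { ++q; r -= y; }        // first refinement
  //   if (r >= y) { ++q; r -= y; }        // second refinement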
2351
2352 // Initial estimate of inv(y).
2353 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2354
2355 // One round of UNR.
2356 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2357 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2358 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2359 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2360
2361 // Quotient/remainder estimate.
2362 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2363 SDValue R =
2364 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2365
2366 // First quotient/remainder refinement.
2367 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2368 SDValue One = DAG.getConstant(1, DL, VT);
2369 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2370 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2371 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2372 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2373 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2374
2375 // Second quotient/remainder refinement.
2376 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2377 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2378 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2379 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2380 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2381
2382 return DAG.getMergeValues({Q, R}, DL);
2383}
2384
2386 SelectionDAG &DAG) const {
2387 SDLoc DL(Op);
2388 EVT VT = Op.getValueType();
2389
2390 SDValue LHS = Op.getOperand(0);
2391 SDValue RHS = Op.getOperand(1);
2392
2393 SDValue Zero = DAG.getConstant(0, DL, VT);
2394 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2395
2396 if (VT == MVT::i32) {
2397 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2398 return Res;
2399 }
2400
2401 if (VT == MVT::i64 &&
2402 DAG.ComputeNumSignBits(LHS) > 32 &&
2403 DAG.ComputeNumSignBits(RHS) > 32) {
2404 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2405
2406  // HiLo split
2407 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2408 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2409 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2410 LHS_Lo, RHS_Lo);
2411 SDValue Res[2] = {
2412 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2413 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2414 };
2415 return DAG.getMergeValues(Res, DL);
2416 }
2417
2418 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2419 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2420 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2421 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2422
2423 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2424 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2425
2426 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2427 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2428
2429 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2430 SDValue Rem = Div.getValue(1);
2431
2432 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2433 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2434
2435 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2436 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2437
2438 SDValue Res[2] = {
2439 Div,
2440 Rem
2441 };
2442 return DAG.getMergeValues(Res, DL);
2443}
2444
2446 SDLoc SL(Op);
2447 SDValue Src = Op.getOperand(0);
2448
2449 // result = trunc(src)
2450 // if (src > 0.0 && src != result)
2451 // result += 1.0
2452
2453 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2454
2455 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2456 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2457
2458 EVT SetCCVT =
2459 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2460
2461 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2462 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2463 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2464
2465 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2466 // TODO: Should this propagate fast-math-flags?
2467 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2468}
2469
2471 SelectionDAG &DAG) {
2472 const unsigned FractBits = 52;
2473 const unsigned ExpBits = 11;
2474
2475 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2476 Hi,
2477 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2478 DAG.getConstant(ExpBits, SL, MVT::i32));
2479 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2480 DAG.getConstant(1023, SL, MVT::i32));
2481
2482 return Exp;
2483}
2484
2486 SDLoc SL(Op);
2487 SDValue Src = Op.getOperand(0);
2488
2489 assert(Op.getValueType() == MVT::f64);
2490
2491 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2492
2493 // Extract the upper half, since this is where we will find the sign and
2494 // exponent.
2495 SDValue Hi = getHiHalf64(Src, DAG);
2496
2497 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2498
2499 const unsigned FractBits = 52;
2500
2501 // Extract the sign bit.
2502 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2503 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2504
2505 // Extend back to 64-bits.
2506 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2507 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2508
2509 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2510 const SDValue FractMask
2511 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2512
2513 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2514 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2515 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2516
2517 EVT SetCCVT =
2518 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2519
2520 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2521
2522 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2523 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2524
2525 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2526 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2527
2528 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2529}
2530
2532 SelectionDAG &DAG) const {
2533 SDLoc SL(Op);
2534 SDValue Src = Op.getOperand(0);
2535
2536 assert(Op.getValueType() == MVT::f64);
2537
2538 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2539 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2540 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2541
2542 // TODO: Should this propagate fast-math-flags?
2543
2544 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2545 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2546
2547 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2548
2549 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2550 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2551
2552 EVT SetCCVT =
2553 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2554 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2555
2556 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2557}
2558
2560 SelectionDAG &DAG) const {
2561 // FNEARBYINT and FRINT are the same, except in their handling of FP
2562 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2563 // rint, so just treat them as equivalent.
2564 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2565 Op.getOperand(0));
2566}
2567
2569 auto VT = Op.getValueType();
2570 auto Arg = Op.getOperand(0u);
2571 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2572}
2573
2574// XXX - May require not supporting f32 denormals?
2575
2576// Don't handle v2f16. The extra instructions to scalarize and repack around the
2577// compare and vselect end up producing worse code than scalarizing the whole
2578// operation.
2580 SDLoc SL(Op);
2581 SDValue X = Op.getOperand(0);
2582 EVT VT = Op.getValueType();
2583
2584 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2585
2586 // TODO: Should this propagate fast-math-flags?
2587
2588 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2589
2590 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2591
2592 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2593 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2594
2595 EVT SetCCVT =
2596 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2597
2598 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2599 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2600 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2601
2602 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2603 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2604}
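// A host-side sketch of the lowering above (illustrative only; rounds half
// away from zero, assuming <math.h>):
//
//   float roundf_sketch(float x) {
//     float t = truncf(x);
//     float d = fabsf(x - t);
//     return t + copysignf(d >= 0.5f ? 1.0f : 0.0f, x);
//   }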
2605
2607 SDLoc SL(Op);
2608 SDValue Src = Op.getOperand(0);
2609
2610 // result = trunc(src);
2611 // if (src < 0.0 && src != result)
2612 // result += -1.0.
2613
2614 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2615
2616 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2617 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2618
2619 EVT SetCCVT =
2620 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2621
2622 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2623 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2624 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2625
2626 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2627 // TODO: Should this propagate fast-math-flags?
2628 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2629}
2630
2631/// Return true if it's known that \p Src can never be an f32 denormal value.
2633 switch (Src.getOpcode()) {
2634 case ISD::FP_EXTEND:
2635 return Src.getOperand(0).getValueType() == MVT::f16;
2636 case ISD::FP16_TO_FP:
2637 case ISD::FFREXP:
2638 case AMDGPUISD::LOG:
2639 case AMDGPUISD::EXP:
2640 return true;
2642 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2643 switch (IntrinsicID) {
2644 case Intrinsic::amdgcn_frexp_mant:
2645 case Intrinsic::amdgcn_log:
2646 case Intrinsic::amdgcn_log_clamp:
2647 case Intrinsic::amdgcn_exp2:
2648 return true;
2649 default:
2650 return false;
2651 }
2652 }
2653 default:
2654 return false;
2655 }
2656
2657 llvm_unreachable("covered opcode switch");
2658}
2659
2661 SDNodeFlags Flags) {
2662 return Flags.hasApproximateFuncs();
2663}
2664
2673
2675 SDValue Src,
2676 SDNodeFlags Flags) const {
2677 SDLoc SL(Src);
2678 EVT VT = Src.getValueType();
2679 const fltSemantics &Semantics = VT.getFltSemantics();
2680 SDValue SmallestNormal =
2681 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2682
2683 // Want to scale denormals up, but negatives and 0 work just as well on the
2684 // scaled path.
2685 SDValue IsLtSmallestNormal = DAG.getSetCC(
2686 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2687 SmallestNormal, ISD::SETOLT);
2688
2689 return IsLtSmallestNormal;
2690}
2691
2693 SDNodeFlags Flags) const {
2694 SDLoc SL(Src);
2695 EVT VT = Src.getValueType();
2696 const fltSemantics &Semantics = VT.getFltSemantics();
2697 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2698
2699 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2700 SDValue IsFinite = DAG.getSetCC(
2701 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2702 Inf, ISD::SETOLT);
2703 return IsFinite;
2704}
2705
2706/// If denormal handling is required, return the scaled input to FLOG2 and the
2707/// check for the denormal range. Otherwise, return null values.
2708std::pair<SDValue, SDValue>
2710 SDValue Src, SDNodeFlags Flags) const {
2711 if (!needsDenormHandlingF32(DAG, Src, Flags))
2712 return {};
2713
2714 MVT VT = MVT::f32;
2715 const fltSemantics &Semantics = APFloat::IEEEsingle();
2716 SDValue SmallestNormal =
2717 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2718
2719 SDValue IsLtSmallestNormal = DAG.getSetCC(
2720 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2721 SmallestNormal, ISD::SETOLT);
2722
2723 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2724 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2725 SDValue ScaleFactor =
2726 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2727
2728 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2729 return {ScaledInput, IsLtSmallestNormal};
2730}
2731
2733 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2734 // If we have to handle denormals, scale up the input and adjust the result.
2735
2736 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2737 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
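  // This works because log2(x * 2^32) == log2(x) + 32: the scaled input is
  // always a normal number, so the hardware log handles it, and subtracting 32
  // afterwards recovers log2 of the original denormal input.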
2738
2739 SDLoc SL(Op);
2740 EVT VT = Op.getValueType();
2741 SDValue Src = Op.getOperand(0);
2742 SDNodeFlags Flags = Op->getFlags();
2743
2744 if (VT == MVT::f16) {
2745 // Nothing in half is a denormal when promoted to f32.
2746 assert(!Subtarget->has16BitInsts());
2747 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2748 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2749 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2750 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2751 }
2752
2753 auto [ScaledInput, IsLtSmallestNormal] =
2754 getScaledLogInput(DAG, SL, Src, Flags);
2755 if (!ScaledInput)
2756 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2757
2758 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2759
2760 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2761 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2762 SDValue ResultOffset =
2763 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2764 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2765}
2766
2767static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2768 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2769 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2770 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2771}
2772
2774 SelectionDAG &DAG) const {
2775 SDValue X = Op.getOperand(0);
2776 EVT VT = Op.getValueType();
2777 SDNodeFlags Flags = Op->getFlags();
2778 SDLoc DL(Op);
2779 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2780 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2781
2782 const auto &Options = getTargetMachine().Options;
2783 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2784
2785 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2786 // Log and multiply in f32 is good enough for f16.
2787 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2788 }
2789
2790 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2791 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2792 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2793 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2794 }
2795
2796 return Lowered;
2797 }
2798
2799 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2800 if (ScaledInput)
2801 X = ScaledInput;
2802
2803 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2804
2805 SDValue R;
2806 if (Subtarget->hasFastFMAF32()) {
2807 // c+cc are ln(2)/ln(10) to more than 49 bits
2808 const float c_log10 = 0x1.344134p-2f;
2809 const float cc_log10 = 0x1.09f79ep-26f;
2810
2811 // c + cc is ln(2) to more than 49 bits
2812 const float c_log = 0x1.62e42ep-1f;
2813 const float cc_log = 0x1.efa39ep-25f;
2814
2815 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2816 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2817 // This adds correction terms for which contraction may lead to an increase
2818 // in the error of the approximation, so disable it.
2819 Flags.setAllowContract(false);
2820 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2821 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2822 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2823 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2824 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2825 } else {
2826 // ch+ct is ln(2)/ln(10) to more than 36 bits
2827 const float ch_log10 = 0x1.344000p-2f;
2828 const float ct_log10 = 0x1.3509f6p-18f;
2829
2830 // ch + ct is ln(2) to more than 36 bits
2831 const float ch_log = 0x1.62e000p-1f;
2832 const float ct_log = 0x1.0bfbe8p-15f;
2833
2834 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2835 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2836
2837 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2838 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2839 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2840 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2841 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2842 // This adds correction terms for which contraction may lead to an increase
2843 // in the error of the approximation, so disable it.
2844 Flags.setAllowContract(false);
2845 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2846 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2847 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2848 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2849 }
2850
2851 const bool IsFiniteOnly =
2852 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2853
2854 // TODO: Check if known finite from source value.
2855 if (!IsFiniteOnly) {
2856 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2857 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2858 }
2859
2860 if (IsScaled) {
2861 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2862 SDValue ShiftK =
2863 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2864 SDValue Shift =
2865 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2866 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2867 }
2868
2869 return R;
2870}
2871
2875
2876// Do f32 fast-math expansion for flog2 or flog10. This is accurate enough for a
2877// promoted f16 operation.
2879 SelectionDAG &DAG, bool IsLog10,
2880 SDNodeFlags Flags) const {
2881 EVT VT = Src.getValueType();
2882 unsigned LogOp =
2883 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2884
2885 double Log2BaseInverted =
2887
2888 if (VT == MVT::f32) {
2889 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2890 if (ScaledInput) {
2891 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2892 SDValue ScaledResultOffset =
2893 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2894
2895 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2896
2897 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2898 ScaledResultOffset, Zero, Flags);
2899
2900 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2901
2902 if (Subtarget->hasFastFMAF32())
2903 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2904 Flags);
2905 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2906 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2907 }
2908 }
2909
2910 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2911 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2912
2913 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2914 Flags);
2915}
2916
2918 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2919 // If we have to handle denormals, scale up the input and adjust the result.
2920
2921 SDLoc SL(Op);
2922 EVT VT = Op.getValueType();
2923 SDValue Src = Op.getOperand(0);
2924 SDNodeFlags Flags = Op->getFlags();
2925
2926 if (VT == MVT::f16) {
2927 // Nothing in half is a denormal when promoted to f32.
2928 assert(!Subtarget->has16BitInsts());
2929 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2930 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2931 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2932 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2933 }
2934
2935 assert(VT == MVT::f32);
2936
2937 if (!needsDenormHandlingF32(DAG, Src, Flags))
2938 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2939
2940 // bool needs_scaling = x < -0x1.f80000p+6f;
2941 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
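  // This works because 2^x == 2^(x + 64) * 2^-64: adding 64 keeps the hardware
  // op's result in the normal range for very negative x, and the final multiply
  // by 2^-64 restores the scale.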
2942
2943 // -nextafter(128.0, -1)
2944 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2945
2946 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2947
2948 SDValue NeedsScaling =
2949 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2950
2951 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2952 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2953
2954 SDValue AddOffset =
2955 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2956
2957 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2958 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2959
2960 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2961 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2962 SDValue ResultScale =
2963 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2964
2965 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2966}
2967
2969 SelectionDAG &DAG,
2970 SDNodeFlags Flags,
2971 bool IsExp10) const {
2972 // exp(x) -> exp2(M_LOG2E_F * x);
2973 // exp10(x) -> exp2(log2(10) * x);
2974 EVT VT = X.getValueType();
2975 SDValue Const =
2976 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
2977
2978 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
2979 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2980 : (unsigned)ISD::FEXP2,
2981 SL, VT, Mul, Flags);
2982}
2983
2985 SelectionDAG &DAG,
2986 SDNodeFlags Flags) const {
2987 EVT VT = X.getValueType();
2988 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
2989 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
2990
2991 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2992
2993 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2994 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2995
2996 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2997
2998 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2999
3000 SDValue AdjustedX =
3001 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3002
3003 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
3004 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3005
3006 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3007
3008 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3009 SDValue AdjustedResult =
3010 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3011
3012 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3013 Flags);
3014}
3015
3016/// Emit an approx-funcs-appropriate lowering for exp10. Inf/NaN should still be
3017/// handled correctly.
3019 SelectionDAG &DAG,
3020 SDNodeFlags Flags) const {
3021 const EVT VT = X.getValueType();
3022
3023 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3024 : static_cast<unsigned>(ISD::FEXP2);
3025
3026 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3027 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3028 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3029 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3030
3031 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3032 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3033 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3034 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3035 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3036 }
3037
3038 // bool s = x < -0x1.2f7030p+5f;
3039 // x += s ? 0x1.0p+5f : 0.0f;
3040 // exp10 = exp2(x * 0x1.a92000p+1f) *
3041 // exp2(x * 0x1.4f0978p-11f) *
3042 // (s ? 0x1.9f623ep-107f : 1.0f);
3043
3044 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3045
3046 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3047 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3048
3049 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3050 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3051 SDValue AdjustedX =
3052 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3053
3054 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3055 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3056
3057 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3058 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3059 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3060 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3061
3062 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3063
3064 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3065 SDValue AdjustedResult =
3066 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3067
3068 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3069 Flags);
3070}
3071
3073 EVT VT = Op.getValueType();
3074 SDLoc SL(Op);
3075 SDValue X = Op.getOperand(0);
3076 SDNodeFlags Flags = Op->getFlags();
3077 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3078
3079 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3080 // library behavior. Also, is known-not-daz source sufficient?
3081 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3082 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3083 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3084 }
3085
3086 if (VT.getScalarType() == MVT::f16) {
3087 if (VT.isVector())
3088 return SDValue();
3089
3090 // Nothing in half is a denormal when promoted to f32.
3091 //
3092 // exp(f16 x) ->
3093 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3094 //
3095 // exp10(f16 x) ->
3096 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3097 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3098 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3099 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3100 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3101 }
3102
3103 assert(VT == MVT::f32);
3104
3105 // Algorithm:
3106 //
3107 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3108 //
3109 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3110 // n = 64*m + j, 0 <= j < 64
3111 //
3112 // e^x = 2^((64*m + j + f)/64)
3113 // = (2^m) * (2^(j/64)) * 2^(f/64)
3114 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3115 //
3116 // f = x*(64/ln(2)) - n
3117 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3118 //
3119 // e^x = (2^m) * (2^(j/64)) * e^r
3120 //
3121 // (2^(j/64)) is precomputed
3122 //
3123 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3124 // e^r = 1 + q
3125 //
3126 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3127 //
3128 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
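  // What is actually emitted below is a simpler variant of this scheme (a
  // sketch, not the exact node sequence): compute p = x*log2(e) (or x*log2(10))
  // in extended precision as PH + PL, take E = roundeven(PH), evaluate the small
  // residual A = (PH - E) + PL with the hardware exp2, and scale by 2^E:
  //
  //   e^x ~= ldexp(v_exp_f32(A), (int)E)
  //
  // with explicit underflow/overflow clamps applied afterwards.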
3129 SDNodeFlags FlagsNoContract = Flags;
3130 FlagsNoContract.setAllowContract(false);
3131
3132 SDValue PH, PL;
3133 if (Subtarget->hasFastFMAF32()) {
3134 const float c_exp = numbers::log2ef;
3135 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3136 const float c_exp10 = 0x1.a934f0p+1f;
3137 const float cc_exp10 = 0x1.2f346ep-24f;
3138
3139 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3140 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3141
3142 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3143 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3144 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3145 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3146 } else {
3147 const float ch_exp = 0x1.714000p+0f;
3148 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3149
3150 const float ch_exp10 = 0x1.a92000p+1f;
3151 const float cl_exp10 = 0x1.4f0978p-11f;
3152
3153 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3154 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3155
3156 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3157 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3158 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3159 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3160 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3161
3162 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3163
3164 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3165 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3166 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3167 }
3168
3169 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3170
3171 // It is unsafe to contract this fsub into the PH multiply.
3172 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3173
3174 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3175 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3176 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3177
3178 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3179
3180 SDValue UnderflowCheckConst =
3181 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3182
3183 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3184 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3185 SDValue Underflow =
3186 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3187
3188 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3189
3190 if (!Flags.hasNoInfs()) {
3191 SDValue OverflowCheckConst =
3192 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3193 SDValue Overflow =
3194 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3195 SDValue Inf =
3197 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3198 }
3199
3200 return R;
3201}
3202
3203static bool isCtlzOpc(unsigned Opc) {
3204 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3205}
3206
3207static bool isCttzOpc(unsigned Opc) {
3208 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3209}
3210
3212 SelectionDAG &DAG) const {
3213 auto SL = SDLoc(Op);
3214 auto Opc = Op.getOpcode();
3215 auto Arg = Op.getOperand(0u);
3216 auto ResultVT = Op.getValueType();
3217
3218 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3219 return {};
3220
3222 assert(ResultVT == Arg.getValueType());
3223
3224 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3225 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3226 SDValue NewOp;
3227
3228 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3229 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3230 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3231 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3232 } else {
3233 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3234 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3235 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3236 }
3237
3238 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3239}
3240
3242 SDLoc SL(Op);
3243 SDValue Src = Op.getOperand(0);
3244
3245 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3246 bool Ctlz = isCtlzOpc(Op.getOpcode());
3247 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3248
3249 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3250 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3251 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3252
3253 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3254 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3255 // (cttz hi:lo) -> (umin (ffbl src), 32)
3256 // (ctlz_zero_undef src) -> (ffbh src)
3257 // (cttz_zero_undef src) -> (ffbl src)
3258
3259    // The 64-bit scalar version produces a 32-bit result:
3260 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3261 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3262 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3263 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3264 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3265 if (!ZeroUndef) {
3266 const SDValue ConstVal = DAG.getConstant(
3267 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3268 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3269 }
3270 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3271 }
3272
3273 SDValue Lo, Hi;
3274 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3275
3276 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3277 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3278
3279 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3280 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3281 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3282 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
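  // A scalar sketch of the non-zero-undef ctlz case (illustrative only; ffbh(x)
  // returns clz(x), or 0xffffffff when x == 0, matching the hardware semantics;
  // assumes a GCC/Clang-style __builtin_clz):
  //
  //   uint32_t ffbh(uint32_t x) { return x ? __builtin_clz(x) : ~0u; }
  //   uint32_t addsat(uint32_t a, uint32_t b) { uint32_t s = a + b; return s < a ? ~0u : s; }
  //   uint32_t umin(uint32_t a, uint32_t b) { return a < b ? a : b; }
  //   uint32_t ctlz64(uint32_t hi, uint32_t lo) {
  //     return umin(umin(ffbh(hi), addsat(ffbh(lo), 32)), 64);
  //   }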
3283
3284 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3285 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3286 if (Ctlz)
3287 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3288 else
3289 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3290
3291 SDValue NewOpr;
3292 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3293 if (!ZeroUndef) {
3294 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3295 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3296 }
3297
3298 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3299}
3300
3302 bool Signed) const {
3303  // The regular method of converting a 64-bit integer to a float roughly consists of
3304 // 2 steps: normalization and rounding. In fact, after normalization, the
3305 // conversion from a 64-bit integer to a float is essentially the same as the
3306 // one from a 32-bit integer. The only difference is that it has more
3307 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3308 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3309 // converted into the correct float number. The basic steps for the unsigned
3310 // conversion are illustrated in the following pseudo code:
3311 //
3312 // f32 uitofp(i64 u) {
3313 // i32 hi, lo = split(u);
3314 // // Only count the leading zeros in hi as we have native support of the
3315 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3316 // // reduced to a 32-bit one automatically.
3317 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3318 // u <<= shamt;
3319 // hi, lo = split(u);
3320 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3321 // // convert it as a 32-bit integer and scale the result back.
3322 // return uitofp(hi) * 2^(32 - shamt);
3323 // }
3324 //
3325 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3326  // sign bits instead. If 'ffbh_i32' is not available, the absolute value is
3327  // converted instead, followed by negation based on the original sign bit.
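  // A host-side sketch of the unsigned path (illustrative only; assumes
  // round-to-nearest-even, <math.h>, and a GCC/Clang-style __builtin_clz):
  //
  //   float u64_to_f32(uint64_t u) {
  //     if (u == 0) return 0.0f;
  //     unsigned shamt = (u >> 32) ? __builtin_clz((uint32_t)(u >> 32)) : 32;
  //     uint64_t n = u << shamt;
  //     uint32_t hi = (uint32_t)(n >> 32), lo = (uint32_t)n;
  //     hi |= (lo != 0);                          // sticky bit for rounding
  //     return ldexpf((float)hi, 32 - (int)shamt);
  //   }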
3328
3329 SDLoc SL(Op);
3330 SDValue Src = Op.getOperand(0);
3331
3332 SDValue Lo, Hi;
3333 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3334 SDValue Sign;
3335 SDValue ShAmt;
3336 if (Signed && Subtarget->isGCN()) {
3337 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3338 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3339 // account. That is, the maximal shift is
3340 // - 32 if Lo and Hi have opposite signs;
3341 // - 33 if Lo and Hi have the same sign.
3342 //
3343 // Or, MaxShAmt = 33 + OppositeSign, where
3344 //
3345 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3346 // - -1 if Lo and Hi have opposite signs; and
3347 // - 0 otherwise.
3348 //
3349 // All in all, ShAmt is calculated as
3350 //
3351 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3352 //
3353 // or
3354 //
3355 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3356 //
3357 // to reduce the critical path.
3358 SDValue OppositeSign = DAG.getNode(
3359 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3360 DAG.getConstant(31, SL, MVT::i32));
3361 SDValue MaxShAmt =
3362 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3363 OppositeSign);
3364 // Count the leading sign bits.
3365 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3366    // Unlike the unsigned conversion, the shift should be one bit less to
3367    // preserve the sign bit.
3368 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3369 DAG.getConstant(1, SL, MVT::i32));
3370 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3371 } else {
3372 if (Signed) {
3373 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3374 // absolute value first.
3375 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3376 DAG.getConstant(63, SL, MVT::i64));
3377 SDValue Abs =
3378 DAG.getNode(ISD::XOR, SL, MVT::i64,
3379 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3380 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3381 }
3382 // Count the leading zeros.
3383 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3384 // The shift amount for signed integers is [0, 32].
3385 }
3386 // Normalize the given 64-bit integer.
3387 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3388 // Split it again.
3389 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3390 // Calculate the adjust bit for rounding.
3391 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3392 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3393 DAG.getConstant(1, SL, MVT::i32), Lo);
3394 // Get the 32-bit normalized integer.
3395 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3396 // Convert the normalized 32-bit integer into f32.
3397 unsigned Opc =
3398 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3399 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3400
3401  // Finally, we need to scale the converted floating-point value back, since the
3402  // original 64-bit integer was converted as a 32-bit one.
3403 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3404 ShAmt);
3405 // On GCN, use LDEXP directly.
3406 if (Subtarget->isGCN())
3407 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3408
3409  // Otherwise, align 'ShAmt' to the exponent field and add it into the exponent
3410  // directly to emulate the multiplication by 2^ShAmt. The 8-bit exponent field
3411  // is wide enough to avoid overflowing into the sign bit.
3412 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3413 DAG.getConstant(23, SL, MVT::i32));
3414 SDValue IVal =
3415 DAG.getNode(ISD::ADD, SL, MVT::i32,
3416 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3417 if (Signed) {
3418 // Set the sign bit.
3419 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3420 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3421 DAG.getConstant(31, SL, MVT::i32));
3422 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3423 }
3424 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3425}
3426
3428 bool Signed) const {
3429 SDLoc SL(Op);
3430 SDValue Src = Op.getOperand(0);
3431
3432 SDValue Lo, Hi;
3433 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3434
3436 SL, MVT::f64, Hi);
3437
3438 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3439
3440 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3441 DAG.getConstant(32, SL, MVT::i32));
3442 // TODO: Should this propagate fast-math-flags?
3443 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3444}
3445
3447 SelectionDAG &DAG) const {
3448 // TODO: Factor out code common with LowerSINT_TO_FP.
3449 EVT DestVT = Op.getValueType();
3450 SDValue Src = Op.getOperand(0);
3451 EVT SrcVT = Src.getValueType();
3452
3453 if (SrcVT == MVT::i16) {
3454 if (DestVT == MVT::f16)
3455 return Op;
3456 SDLoc DL(Op);
3457
3458 // Promote src to i32
3459 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3460 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3461 }
3462
3463 if (DestVT == MVT::bf16) {
3464 SDLoc SL(Op);
3465 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3466 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3467 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3468 }
3469
3470 if (SrcVT != MVT::i64)
3471 return Op;
3472
3473 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3474 SDLoc DL(Op);
3475
3476 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3477 SDValue FPRoundFlag =
3478 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3479 SDValue FPRound =
3480 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3481
3482 return FPRound;
3483 }
3484
3485 if (DestVT == MVT::f32)
3486 return LowerINT_TO_FP32(Op, DAG, false);
3487
3488 assert(DestVT == MVT::f64);
3489 return LowerINT_TO_FP64(Op, DAG, false);
3490}
3491
3492 SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3493                                               SelectionDAG &DAG) const {
3494 EVT DestVT = Op.getValueType();
3495
3496 SDValue Src = Op.getOperand(0);
3497 EVT SrcVT = Src.getValueType();
3498
3499 if (SrcVT == MVT::i16) {
3500 if (DestVT == MVT::f16)
3501 return Op;
3502
3503 SDLoc DL(Op);
3504 // Promote src to i32
3505 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3506 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3507 }
3508
3509 if (DestVT == MVT::bf16) {
3510 SDLoc SL(Op);
3511 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3512 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3513 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3514 }
3515
3516 if (SrcVT != MVT::i64)
3517 return Op;
3518
3519 // TODO: Factor out code common with LowerUINT_TO_FP.
3520
3521 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3522 SDLoc DL(Op);
3523 SDValue Src = Op.getOperand(0);
3524
3525 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3526 SDValue FPRoundFlag =
3527 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3528 SDValue FPRound =
3529 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3530
3531 return FPRound;
3532 }
3533
3534 if (DestVT == MVT::f32)
3535 return LowerINT_TO_FP32(Op, DAG, true);
3536
3537 assert(DestVT == MVT::f64);
3538 return LowerINT_TO_FP64(Op, DAG, true);
3539}
3540
3541 SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3542                                                bool Signed) const {
3543 SDLoc SL(Op);
3544
3545 SDValue Src = Op.getOperand(0);
3546 EVT SrcVT = Src.getValueType();
3547
3548 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3549
3550 // The basic idea of converting a floating point number into a pair of 32-bit
3551 // integers is illustrated as follows:
3552 //
3553 // tf := trunc(val);
3554 // hif := floor(tf * 2^-32);
3555 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3556 // hi := fptoi(hif);
3557 // lo := fptoi(lof);
3558 //
3559 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3560 SDValue Sign;
3561 if (Signed && SrcVT == MVT::f32) {
3562     // However, a 32-bit floating-point number has only a 23-bit mantissa,
3563     // which is not enough to hold all the significant bits of `lof` if val is
3564     // negative. To avoid the loss of precision, we take the absolute value
3565     // after truncating and flip the result back based on the original
3566     // signedness.
3567 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3568 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3569 DAG.getConstant(31, SL, MVT::i32));
3570 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3571 }
3572
3573 SDValue K0, K1;
3574 if (SrcVT == MVT::f64) {
3575 K0 = DAG.getConstantFP(
3576 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3577 SrcVT);
3578 K1 = DAG.getConstantFP(
3579 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3580 SrcVT);
3581 } else {
3582 K0 = DAG.getConstantFP(
3583 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3584 K1 = DAG.getConstantFP(
3585 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3586 }
3587 // TODO: Should this propagate fast-math-flags?
3588 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3589
3590 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3591
3592 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3593
3594 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3595                                                          : ISD::FP_TO_UINT,
3596                            SL, MVT::i32, FloorMul);
3597 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3598
3599 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3600 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3601
3602 if (Signed && SrcVT == MVT::f32) {
3603 assert(Sign);
3604 // Flip the result based on the signedness, which is either all 0s or 1s.
3605 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3606 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3607 // r := xor(r, sign) - sign;
3608 Result =
3609 DAG.getNode(ISD::SUB, SL, MVT::i64,
3610 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3611 }
3612
3613 return Result;
3614}
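// A host-side sketch of the trunc / 2^-32 / floor / fma split above, covering
// only the unsigned f64 case. Illustrative: it assumes <cstdint>, <cmath>, and
// C++17 hex-float literals, and the name emulateF64ToU64 is invented here.
static uint64_t emulateF64ToU64(double Val) {
  double Tf = std::trunc(Val);              // tf  := trunc(val)
  double Hif = std::floor(Tf * 0x1p-32);    // hif := floor(tf * 2^-32)
  double Lof = std::fma(Hif, -0x1p+32, Tf); // lof := tf - hif * 2^32 (exact)
  uint32_t Hi = uint32_t(Hif);              // hi  := fptoui(hif)
  uint32_t Lo = uint32_t(Lof);              // lo  := fptoui(lof)
  return (uint64_t(Hi) << 32) | Lo;
}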
3615
3616 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3617   SDLoc DL(Op);
3618 SDValue N0 = Op.getOperand(0);
3619
3620 // Convert to target node to get known bits
3621 if (N0.getValueType() == MVT::f32)
3622 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3623
3624 if (Op->getFlags().hasApproximateFuncs()) {
3625 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3626 return SDValue();
3627 }
3628
3629 return LowerF64ToF16Safe(N0, DL, DAG);
3630}
3631
3632 // Returns the f16 result bits in an i32 node.
3633 SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3634                                                 SelectionDAG &DAG) const {
3635 assert(Src.getSimpleValueType() == MVT::f64);
3636
3637 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3638 // TODO: We can generate better code for True16.
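  // Worked example (illustrative only): for Src = 1.0, whose bits are
  // 0x3FF0000000000000, UH = 0x3FF00000, so E = 1023 - 1023 + 15 = 15, the
  // mantissa and sticky bits are all zero, N = 15 << 12 = 0xF000, V becomes
  // 0xF000 >> 2 = 0x3C00 with no rounding increment, the sign is 0, and the
  // result 0x3C00 is the f16 encoding of 1.0.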
3639 const unsigned ExpMask = 0x7ff;
3640 const unsigned ExpBiasf64 = 1023;
3641 const unsigned ExpBiasf16 = 15;
3642 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3643 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3644 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3645 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3646 DAG.getConstant(32, DL, MVT::i64));
3647 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3648 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3649 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3650 DAG.getConstant(20, DL, MVT::i64));
3651 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3652 DAG.getConstant(ExpMask, DL, MVT::i32));
3653 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3654 // add the f16 bias (15) to get the biased exponent for the f16 format.
3655 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3656 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3657
3658 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3659 DAG.getConstant(8, DL, MVT::i32));
3660 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3661 DAG.getConstant(0xffe, DL, MVT::i32));
3662
3663 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3664 DAG.getConstant(0x1ff, DL, MVT::i32));
3665 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3666
3667 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3668 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3669
3670 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3671 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3672 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3673 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3674
3675 // N = M | (E << 12);
3676 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3677 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3678 DAG.getConstant(12, DL, MVT::i32)));
3679
3680 // B = clamp(1-E, 0, 13);
3681 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3682 One, E);
3683 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3684 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3685 DAG.getConstant(13, DL, MVT::i32));
3686
3687 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3688 DAG.getConstant(0x1000, DL, MVT::i32));
3689
3690 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3691 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3692 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3693 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3694
3695 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3696 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3697 DAG.getConstant(0x7, DL, MVT::i32));
3698 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3699 DAG.getConstant(2, DL, MVT::i32));
3700 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3701 One, Zero, ISD::SETEQ);
3702 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3703 One, Zero, ISD::SETGT);
3704 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3705 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3706
3707 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3708 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3709 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3710 I, V, ISD::SETEQ);
3711
3712 // Extract the sign bit.
3713 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3714 DAG.getConstant(16, DL, MVT::i32));
3715 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3716 DAG.getConstant(0x8000, DL, MVT::i32));
3717
3718 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3719}
3720
3721 SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,
3722                                              SelectionDAG &DAG) const {
3723 SDValue Src = Op.getOperand(0);
3724 unsigned OpOpcode = Op.getOpcode();
3725 EVT SrcVT = Src.getValueType();
3726 EVT DestVT = Op.getValueType();
3727
3728 // Will be selected natively
3729 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3730 return Op;
3731
3732 if (SrcVT == MVT::bf16) {
3733 SDLoc DL(Op);
3734 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3735 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3736 }
3737
3738 // Promote i16 to i32
3739 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3740 SDLoc DL(Op);
3741
3742 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3743 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3744 }
3745
3746 if (DestVT != MVT::i64)
3747 return Op;
3748
3749 if (SrcVT == MVT::f16 ||
3750 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3751 SDLoc DL(Op);
3752
3753 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3754 unsigned Ext =
3755         OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3756     return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3757 }
3758
3759 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3760 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3761
3762 return SDValue();
3763}
3764
3765 SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3766                                                       SelectionDAG &DAG) const {
3767 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3768 MVT VT = Op.getSimpleValueType();
3769 MVT ScalarVT = VT.getScalarType();
3770
3771 assert(VT.isVector());
3772
3773 SDValue Src = Op.getOperand(0);
3774 SDLoc DL(Op);
3775
3776 // TODO: Don't scalarize on Evergreen?
3777 unsigned NElts = VT.getVectorNumElements();
3778   SmallVector<SDValue, 8> Args;
3779   DAG.ExtractVectorElements(Src, Args, 0, NElts);
3780
3781 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3782 for (unsigned I = 0; I < NElts; ++I)
3783 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3784
3785 return DAG.getBuildVector(VT, DL, Args);
3786}
3787
3788//===----------------------------------------------------------------------===//
3789// Custom DAG optimizations
3790//===----------------------------------------------------------------------===//
3791
3792static bool isU24(SDValue Op, SelectionDAG &DAG) {
3793 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3794}
3795
3796static bool isI24(SDValue Op, SelectionDAG &DAG) {
3797 EVT VT = Op.getValueType();
3798 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3799 // as unsigned 24-bit values.
3800          AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3801 }
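// For example (illustrative only): 0x00789abc fits in 23 unsigned / 24 signed
// bits and satisfies both isU24 and isI24, while 0x00800000 needs 24 unsigned
// bits but 25 signed bits (the sign bit must be representable), so it is U24
// but not I24.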
3802
3803 static SDValue simplifyMul24(SDNode *Node24,
3804                              TargetLowering::DAGCombinerInfo &DCI) {
3805   SelectionDAG &DAG = DCI.DAG;
3806 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3807 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3808
3809 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3810 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3811 unsigned NewOpcode = Node24->getOpcode();
3812 if (IsIntrin) {
3813 unsigned IID = Node24->getConstantOperandVal(0);
3814 switch (IID) {
3815 case Intrinsic::amdgcn_mul_i24:
3816 NewOpcode = AMDGPUISD::MUL_I24;
3817 break;
3818 case Intrinsic::amdgcn_mul_u24:
3819 NewOpcode = AMDGPUISD::MUL_U24;
3820 break;
3821 case Intrinsic::amdgcn_mulhi_i24:
3822 NewOpcode = AMDGPUISD::MULHI_I24;
3823 break;
3824 case Intrinsic::amdgcn_mulhi_u24:
3825 NewOpcode = AMDGPUISD::MULHI_U24;
3826 break;
3827 default:
3828 llvm_unreachable("Expected 24-bit mul intrinsic");
3829 }
3830 }
3831
3832 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3833
3834 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3835 // the operands to have other uses, but will only perform simplifications that
3836 // involve bypassing some nodes for this user.
3837 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3838 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3839 if (DemandedLHS || DemandedRHS)
3840 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3841 DemandedLHS ? DemandedLHS : LHS,
3842 DemandedRHS ? DemandedRHS : RHS);
3843
3844 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3845 // operands if this node is the only user.
3846 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3847 return SDValue(Node24, 0);
3848 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3849 return SDValue(Node24, 0);
3850
3851 return SDValue();
3852}
3853
3854template <typename IntTy>
3855 static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3856                                uint32_t Width, const SDLoc &DL) {
3857 if (Width + Offset < 32) {
3858 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3859 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3860 if constexpr (std::is_signed_v<IntTy>) {
3861 return DAG.getSignedConstant(Result, DL, MVT::i32);
3862 } else {
3863 return DAG.getConstant(Result, DL, MVT::i32);
3864 }
3865 }
3866
3867 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3868}
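// For example (illustrative only): Src0 = 0xABCD1234, Offset = 8, Width = 8
// gives Shl = 0x12340000, and shifting back down by 24 extracts 0x12, the byte
// at bit offset 8; the signed variant would sign-extend it if bit 15 of Src0
// were set.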
3869
3870static bool hasVolatileUser(SDNode *Val) {
3871 for (SDNode *U : Val->users()) {
3872 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3873 if (M->isVolatile())
3874 return true;
3875 }
3876 }
3877
3878 return false;
3879}
3880
3881 bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3882   // i32 vectors are the canonical memory type.
3883 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3884 return false;
3885
3886 if (!VT.isByteSized())
3887 return false;
3888
3889 unsigned Size = VT.getStoreSize();
3890
3891 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3892 return false;
3893
3894 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3895 return false;
3896
3897 return true;
3898}
3899
3900// Replace load of an illegal type with a bitcast from a load of a friendlier
3901// type.
3902 SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3903                                                  DAGCombinerInfo &DCI) const {
3904 if (!DCI.isBeforeLegalize())
3905 return SDValue();
3906
3907   LoadSDNode *LN = cast<LoadSDNode>(N);
3908   if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3909 return SDValue();
3910
3911 SDLoc SL(N);
3912 SelectionDAG &DAG = DCI.DAG;
3913 EVT VT = LN->getMemoryVT();
3914
3915 unsigned Size = VT.getStoreSize();
3916 Align Alignment = LN->getAlign();
3917 if (Alignment < Size && isTypeLegal(VT)) {
3918 unsigned IsFast;
3919 unsigned AS = LN->getAddressSpace();
3920
3921 // Expand unaligned loads earlier than legalization. Due to visitation order
3922 // problems during legalization, the emitted instructions to pack and unpack
3923 // the bytes again are not eliminated in the case of an unaligned copy.
3924     if (!allowsMisalignedMemoryAccesses(
3925             VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3926 if (VT.isVector())
3927 return SplitVectorLoad(SDValue(LN, 0), DAG);
3928
3929 SDValue Ops[2];
3930 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3931
3932 return DAG.getMergeValues(Ops, SDLoc(N));
3933 }
3934
3935 if (!IsFast)
3936 return SDValue();
3937 }
3938
3939 if (!shouldCombineMemoryType(VT))
3940 return SDValue();
3941
3942 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3943
3944 SDValue NewLoad
3945 = DAG.getLoad(NewVT, SL, LN->getChain(),
3946 LN->getBasePtr(), LN->getMemOperand());
3947
3948 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3949 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3950 return SDValue(N, 0);
3951}
3952
3953// Replace store of an illegal type with a store of a bitcast to a friendlier
3954// type.
3955 SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3956                                                   DAGCombinerInfo &DCI) const {
3957 if (!DCI.isBeforeLegalize())
3958 return SDValue();
3959
3960   StoreSDNode *SN = cast<StoreSDNode>(N);
3961   if (!SN->isSimple() || !ISD::isNormalStore(SN))
3962 return SDValue();
3963
3964 EVT VT = SN->getMemoryVT();
3965 unsigned Size = VT.getStoreSize();
3966
3967 SDLoc SL(N);
3968 SelectionDAG &DAG = DCI.DAG;
3969 Align Alignment = SN->getAlign();
3970 if (Alignment < Size && isTypeLegal(VT)) {
3971 unsigned IsFast;
3972 unsigned AS = SN->getAddressSpace();
3973
3974 // Expand unaligned stores earlier than legalization. Due to visitation
3975 // order problems during legalization, the emitted instructions to pack and
3976 // unpack the bytes again are not eliminated in the case of an unaligned
3977 // copy.
3978     if (!allowsMisalignedMemoryAccesses(
3979             VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3980 if (VT.isVector())
3981 return SplitVectorStore(SDValue(SN, 0), DAG);
3982
3983 return expandUnalignedStore(SN, DAG);
3984 }
3985
3986 if (!IsFast)
3987 return SDValue();
3988 }
3989
3990 if (!shouldCombineMemoryType(VT))
3991 return SDValue();
3992
3993 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3994 SDValue Val = SN->getValue();
3995
3996 //DCI.AddToWorklist(Val.getNode());
3997
3998 bool OtherUses = !Val.hasOneUse();
3999 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
4000 if (OtherUses) {
4001 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
4002 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
4003 }
4004
4005 return DAG.getStore(SN->getChain(), SL, CastVal,
4006 SN->getBasePtr(), SN->getMemOperand());
4007}
4008
4009// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
4010// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
4011// issues.
4012 SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
4013                                                         DAGCombinerInfo &DCI) const {
4014 SelectionDAG &DAG = DCI.DAG;
4015 SDValue N0 = N->getOperand(0);
4016
4017 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
4018 // (vt2 (truncate (assertzext vt0:x, vt1)))
4019 if (N0.getOpcode() == ISD::TRUNCATE) {
4020 SDValue N1 = N->getOperand(1);
4021 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
4022 SDLoc SL(N);
4023
4024 SDValue Src = N0.getOperand(0);
4025 EVT SrcVT = Src.getValueType();
4026 if (SrcVT.bitsGE(ExtVT)) {
4027 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4028 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4029 }
4030 }
4031
4032 return SDValue();
4033}
4034
4035 SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4036     SDNode *N, DAGCombinerInfo &DCI) const {
4037 unsigned IID = N->getConstantOperandVal(0);
4038 switch (IID) {
4039 case Intrinsic::amdgcn_mul_i24:
4040 case Intrinsic::amdgcn_mul_u24:
4041 case Intrinsic::amdgcn_mulhi_i24:
4042 case Intrinsic::amdgcn_mulhi_u24:
4043 return simplifyMul24(N, DCI);
4044 case Intrinsic::amdgcn_fract:
4045 case Intrinsic::amdgcn_rsq:
4046 case Intrinsic::amdgcn_rcp_legacy:
4047 case Intrinsic::amdgcn_rsq_legacy:
4048 case Intrinsic::amdgcn_rsq_clamp:
4049 case Intrinsic::amdgcn_tanh:
4050 case Intrinsic::amdgcn_prng_b32: {
4051 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4052 SDValue Src = N->getOperand(1);
4053 return Src.isUndef() ? Src : SDValue();
4054 }
4055 case Intrinsic::amdgcn_frexp_exp: {
4056 // frexp_exp (fneg x) -> frexp_exp x
4057 // frexp_exp (fabs x) -> frexp_exp x
4058 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4059 SDValue Src = N->getOperand(1);
4060 SDValue PeekSign = peekFPSignOps(Src);
4061 if (PeekSign == Src)
4062 return SDValue();
4063 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4064 0);
4065 }
4066 default:
4067 return SDValue();
4068 }
4069}
4070
4071/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4072/// binary operation \p Opc to it with the corresponding constant operands.
4073 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4074     DAGCombinerInfo &DCI, const SDLoc &SL,
4075 unsigned Opc, SDValue LHS,
4076 uint32_t ValLo, uint32_t ValHi) const {
4077 SelectionDAG &DAG = DCI.DAG;
4078 SDValue Lo, Hi;
4079 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4080
4081 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4082 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4083
4084 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4085 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4086
4087 // Re-visit the ands. It's possible we eliminated one of them and it could
4088 // simplify the vector.
4089 DCI.AddToWorklist(Lo.getNode());
4090 DCI.AddToWorklist(Hi.getNode());
4091
4092 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4093 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4094}
4095
4096 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4097                                                 DAGCombinerInfo &DCI) const {
4098 EVT VT = N->getValueType(0);
4099 SDValue LHS = N->getOperand(0);
4100 SDValue RHS = N->getOperand(1);
4101   ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4102   SDLoc SL(N);
4103 SelectionDAG &DAG = DCI.DAG;
4104
4105 unsigned RHSVal;
4106 if (CRHS) {
4107 RHSVal = CRHS->getZExtValue();
4108 if (!RHSVal)
4109 return LHS;
4110
4111 switch (LHS->getOpcode()) {
4112 default:
4113 break;
4114 case ISD::ZERO_EXTEND:
4115 case ISD::SIGN_EXTEND:
4116 case ISD::ANY_EXTEND: {
4117 SDValue X = LHS->getOperand(0);
4118
4119 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4120 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4121 // Prefer build_vector as the canonical form if packed types are legal.
4122 // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
4123 SDValue Vec = DAG.getBuildVector(
4124 MVT::v2i16, SL,
4125 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4126 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4127 }
4128
4129 // shl (ext x) => zext (shl x), if shift does not overflow int
4130 if (VT != MVT::i64)
4131 break;
4132 KnownBits Known = DAG.computeKnownBits(X);
4133 unsigned LZ = Known.countMinLeadingZeros();
4134 if (LZ < RHSVal)
4135 break;
4136 EVT XVT = X.getValueType();
4137 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4138 return DAG.getZExtOrTrunc(Shl, SL, VT);
4139 }
4140 }
4141 }
4142
4143 if (VT.getScalarType() != MVT::i64)
4144 return SDValue();
4145
4146 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4147 // common case, splitting this into a move and a 32-bit shift is faster and
4148 // the same code size.
4149 KnownBits Known = DAG.computeKnownBits(RHS);
4150
4151 EVT ElementType = VT.getScalarType();
4152 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4153 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4154 : TargetScalarType;
4155
4156 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4157 return SDValue();
4158 SDValue ShiftAmt;
4159
4160 if (CRHS) {
4161 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4162 TargetType);
4163 } else {
4164 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4165 const SDValue ShiftMask =
4166 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4167 // This AND instruction will clamp out of bounds shift values.
4168 // It will also be removed during later instruction selection.
4169 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4170 }
4171
4172 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4173 SDValue NewShift =
4174 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4175
4176 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4177 SDValue Vec;
4178
4179 if (VT.isVector()) {
4180 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4181 unsigned NElts = TargetType.getVectorNumElements();
4182     SmallVector<SDValue, 8> HiOps;
4183     SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4184
4185 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4186 for (unsigned I = 0; I != NElts; ++I)
4187 HiAndLoOps[2 * I + 1] = HiOps[I];
4188 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4189 } else {
4190 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4191 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4192 }
4193 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4194}
4195
4196 SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4197                                                 DAGCombinerInfo &DCI) const {
4198 SDValue RHS = N->getOperand(1);
4199   ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4200   EVT VT = N->getValueType(0);
4201 SDValue LHS = N->getOperand(0);
4202 SelectionDAG &DAG = DCI.DAG;
4203 SDLoc SL(N);
4204
4205 if (VT.getScalarType() != MVT::i64)
4206 return SDValue();
4207
4208 // For C >= 32
4209 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
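  // For example (illustrative only): x = 0xFFFFFFFF80000000 (-2^31) and C = 40
  // gives hi_32(x) = 0xFFFFFFFF, sra(hi, 8) = 0xFFFFFFFF and
  // sra(hi, 31) = 0xFFFFFFFF, so the rebuilt pair is -1, matching the full
  // 64-bit arithmetic shift (-2^31) >> 40.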
4210
4211 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4212 // common case, splitting this into a move and a 32-bit shift is faster and
4213 // the same code size.
4214 KnownBits Known = DAG.computeKnownBits(RHS);
4215
4216 EVT ElementType = VT.getScalarType();
4217 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4218 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4219 : TargetScalarType;
4220
4221 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4222 return SDValue();
4223
4224 SDValue ShiftFullAmt =
4225 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4226 SDValue ShiftAmt;
4227 if (CRHS) {
4228 unsigned RHSVal = CRHS->getZExtValue();
4229 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4230 TargetType);
4231 } else if (Known.getMinValue().getZExtValue() ==
4232 (ElementType.getSizeInBits() - 1)) {
4233 ShiftAmt = ShiftFullAmt;
4234 } else {
4235 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4236 const SDValue ShiftMask =
4237 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4238 // This AND instruction will clamp out of bounds shift values.
4239 // It will also be removed during later instruction selection.
4240 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4241 }
4242
4243 EVT ConcatType;
4244 SDValue Hi;
4245 SDLoc LHSSL(LHS);
4246 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4247 if (VT.isVector()) {
4248 unsigned NElts = TargetType.getVectorNumElements();
4249 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4250 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4251 SmallVector<SDValue, 8> HiOps(NElts);
4252 SmallVector<SDValue, 16> HiAndLoOps;
4253
4254 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4255 for (unsigned I = 0; I != NElts; ++I) {
4256 HiOps[I] = HiAndLoOps[2 * I + 1];
4257 }
4258 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4259 } else {
4260 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4261 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4262 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4263 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4264 }
4265
4266 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4267 SDValue HiShift;
4268 if (KnownLHS.isNegative()) {
4269 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4270 } else {
4271 Hi = DAG.getFreeze(Hi);
4272 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4273 }
4274 SDValue NewShift =
4275 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4276
4277 SDValue Vec;
4278 if (VT.isVector()) {
4279 unsigned NElts = TargetType.getVectorNumElements();
4280     SmallVector<SDValue, 8> HiOps;
4281     SmallVector<SDValue, 8> LoOps;
4282     SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4283
4284 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4285 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4286 for (unsigned I = 0; I != NElts; ++I) {
4287 HiAndLoOps[2 * I + 1] = HiOps[I];
4288 HiAndLoOps[2 * I] = LoOps[I];
4289 }
4290 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4291 } else {
4292 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4293 }
4294 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4295}
4296
4297 SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4298                                                 DAGCombinerInfo &DCI) const {
4299 SDValue RHS = N->getOperand(1);
4300   ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4301   EVT VT = N->getValueType(0);
4302 SDValue LHS = N->getOperand(0);
4303 SelectionDAG &DAG = DCI.DAG;
4304 SDLoc SL(N);
4305 unsigned RHSVal;
4306
4307 if (CRHS) {
4308 RHSVal = CRHS->getZExtValue();
4309
4310     // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4311 // this improves the ability to match BFE patterns in isel.
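    // For example (illustrative only), with c1 = 0xFF and c2 = 8:
    // (srl (and x, 0xFF00), 8) becomes (and (srl x, 8), 0xFF), exposing the
    // 8-bit field at offset 8 in the form the BFE patterns expect.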
4312 if (LHS.getOpcode() == ISD::AND) {
4313 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4314 unsigned MaskIdx, MaskLen;
4315 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4316 MaskIdx == RHSVal) {
4317 return DAG.getNode(ISD::AND, SL, VT,
4318 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4319 N->getOperand(1)),
4320 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4321 N->getOperand(1)));
4322 }
4323 }
4324 }
4325 }
4326
4327 if (VT.getScalarType() != MVT::i64)
4328 return SDValue();
4329
4330 // for C >= 32
4331 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
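  // For example (illustrative only): x = 0x0000001234567890 and C = 36 give
  // hi_32(x) = 0x00000012 and srl(hi, 4) = 0x00000001, so the rebuilt pair is
  // 0x0000000000000001, matching the full 64-bit x >> 36.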
4332
4333 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4334 // common case, splitting this into a move and a 32-bit shift is faster and
4335 // the same code size.
4336 KnownBits Known = DAG.computeKnownBits(RHS);
4337
4338 EVT ElementType = VT.getScalarType();
4339 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4340 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4341 : TargetScalarType;
4342
4343 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4344 return SDValue();
4345
4346 SDValue ShiftAmt;
4347 if (CRHS) {
4348 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4349 TargetType);
4350 } else {
4351 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4352 const SDValue ShiftMask =
4353 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4354 // This AND instruction will clamp out of bounds shift values.
4355 // It will also be removed during later instruction selection.
4356 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4357 }
4358
4359 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4360 EVT ConcatType;
4361 SDValue Hi;
4362 SDLoc LHSSL(LHS);
4363 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4364 if (VT.isVector()) {
4365 unsigned NElts = TargetType.getVectorNumElements();
4366 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4367 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4368 SmallVector<SDValue, 8> HiOps(NElts);
4369 SmallVector<SDValue, 16> HiAndLoOps;
4370
4371 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4372 for (unsigned I = 0; I != NElts; ++I)
4373 HiOps[I] = HiAndLoOps[2 * I + 1];
4374 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4375 } else {
4376 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4377 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4378 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4379 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4380 }
4381
4382 SDValue NewShift =
4383 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4384
4385 SDValue Vec;
4386 if (VT.isVector()) {
4387 unsigned NElts = TargetType.getVectorNumElements();
4388     SmallVector<SDValue, 8> LoOps;
4389     SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4390
4391 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4392 for (unsigned I = 0; I != NElts; ++I)
4393 HiAndLoOps[2 * I] = LoOps[I];
4394 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4395 } else {
4396 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4397 }
4398 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4399}
4400
4401 SDValue AMDGPUTargetLowering::performTruncateCombine(
4402     SDNode *N, DAGCombinerInfo &DCI) const {
4403 SDLoc SL(N);
4404 SelectionDAG &DAG = DCI.DAG;
4405 EVT VT = N->getValueType(0);
4406 SDValue Src = N->getOperand(0);
4407
4408 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4409 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4410 SDValue Vec = Src.getOperand(0);
4411 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4412 SDValue Elt0 = Vec.getOperand(0);
4413 EVT EltVT = Elt0.getValueType();
4414 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4415 if (EltVT.isFloatingPoint()) {
4416 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4417 EltVT.changeTypeToInteger(), Elt0);
4418 }
4419
4420 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4421 }
4422 }
4423 }
4424
4425 // Equivalent of above for accessing the high element of a vector as an
4426 // integer operation.
4427 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4428 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4429 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4430 SDValue BV = stripBitcast(Src.getOperand(0));
4431 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4432 EVT SrcEltVT = BV.getOperand(0).getValueType();
4433 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4434 unsigned BitIndex = K->getZExtValue();
4435 unsigned PartIndex = BitIndex / SrcEltSize;
4436
4437 if (PartIndex * SrcEltSize == BitIndex &&
4438 PartIndex < BV.getNumOperands()) {
4439 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4440 SDValue SrcElt =
4441 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4442 BV.getOperand(PartIndex));
4443 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4444 }
4445 }
4446 }
4447 }
4448 }
4449
4450 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4451 //
4452 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4453 // i16 (trunc (srl (i32 (trunc x), K)))
4454 if (VT.getScalarSizeInBits() < 32) {
4455 EVT SrcVT = Src.getValueType();
4456 if (SrcVT.getScalarSizeInBits() > 32 &&
4457 (Src.getOpcode() == ISD::SRL ||
4458 Src.getOpcode() == ISD::SRA ||
4459 Src.getOpcode() == ISD::SHL)) {
4460 SDValue Amt = Src.getOperand(1);
4461 KnownBits Known = DAG.computeKnownBits(Amt);
4462
4463 // - For left shifts, do the transform as long as the shift
4464 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4465 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4466 // losing information stored in the high bits when truncating.
4467 const unsigned MaxCstSize =
4468 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4469 if (Known.getMaxValue().ule(MaxCstSize)) {
4470 EVT MidVT = VT.isVector() ?
4471 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4472 VT.getVectorNumElements()) : MVT::i32;
4473
4474 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4475 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4476 Src.getOperand(0));
4477 DCI.AddToWorklist(Trunc.getNode());
4478
4479 if (Amt.getValueType() != NewShiftVT) {
4480 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4481 DCI.AddToWorklist(Amt.getNode());
4482 }
4483
4484 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4485 Trunc, Amt);
4486 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4487 }
4488 }
4489 }
4490
4491 return SDValue();
4492}
4493
4494// We need to specifically handle i64 mul here to avoid unnecessary conversion
4495// instructions. If we only match on the legalized i64 mul expansion,
4496// SimplifyDemandedBits will be unable to remove them because there will be
4497// multiple uses due to the separate mul + mulh[su].
4498static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4499 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4500 if (Size <= 32) {
4501 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4502 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4503 }
4504
4505 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4506 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4507
4508 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4509 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4510
4511 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4512}
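// A host-side sketch of the 64-bit mul24 split used above, assuming <cstdint>;
// the name emulateMulU24Pair is invented for this example, and the inputs are
// assumed to already fit in 24 bits, as the callers check with isU24.
static uint64_t emulateMulU24Pair(uint32_t N0, uint32_t N1) {
  uint64_t Full = uint64_t(N0 & 0xFFFFFF) * uint64_t(N1 & 0xFFFFFF);
  uint32_t MulLo = uint32_t(Full);        // what MUL_U24 produces
  uint32_t MulHi = uint32_t(Full >> 32);  // what MULHI_U24 produces
  return (uint64_t(MulHi) << 32) | MulLo; // BUILD_PAIR(MulLo, MulHi)
}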
4513
4514/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4515/// return SDValue().
4516static SDValue getAddOneOp(const SDNode *V) {
4517 if (V->getOpcode() != ISD::ADD)
4518 return SDValue();
4519
4520 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4521}
4522
4523 SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4524                                                 DAGCombinerInfo &DCI) const {
4525 assert(N->getOpcode() == ISD::MUL);
4526 EVT VT = N->getValueType(0);
4527
4528 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4529 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4530 // unnecessarily). isDivergent() is used as an approximation of whether the
4531 // value is in an SGPR.
4532 if (!N->isDivergent())
4533 return SDValue();
4534
4535 unsigned Size = VT.getSizeInBits();
4536 if (VT.isVector() || Size > 64)
4537 return SDValue();
4538
4539 SelectionDAG &DAG = DCI.DAG;
4540 SDLoc DL(N);
4541
4542 SDValue N0 = N->getOperand(0);
4543 SDValue N1 = N->getOperand(1);
4544
4545 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4546 // matching.
4547
4548 // mul x, (add y, 1) -> add (mul x, y), x
4549 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4550 SDValue AddOp = getAddOneOp(V.getNode());
4551 if (!AddOp)
4552 return SDValue();
4553
4554 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4555 return U->getOpcode() == ISD::MUL;
4556 }))
4557 return AddOp;
4558
4559 return SDValue();
4560 };
4561
4562 // FIXME: The selection pattern is not properly checking for commuted
4563 // operands, so we have to place the mul in the LHS
4564 if (SDValue MulOper = IsFoldableAdd(N0)) {
4565 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4566 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4567 }
4568
4569 if (SDValue MulOper = IsFoldableAdd(N1)) {
4570 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4571 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4572 }
4573
4574 // There are i16 integer mul/mad.
4575 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4576 return SDValue();
4577
4578 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4579 // in the source into any_extends if the result of the mul is truncated. Since
4580 // we can assume the high bits are whatever we want, use the underlying value
4581 // to avoid the unknown high bits from interfering.
4582 if (N0.getOpcode() == ISD::ANY_EXTEND)
4583 N0 = N0.getOperand(0);
4584
4585 if (N1.getOpcode() == ISD::ANY_EXTEND)
4586 N1 = N1.getOperand(0);
4587
4588 SDValue Mul;
4589
4590 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4591 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4592 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4593 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4594 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4595 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4596 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4597 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4598 } else {
4599 return SDValue();
4600 }
4601
4602 // We need to use sext even for MUL_U24, because MUL_U24 is used
4603 // for signed multiply of 8 and 16-bit types.
4604 return DAG.getSExtOrTrunc(Mul, DL, VT);
4605}
4606
4607SDValue
4608 AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4609                                             DAGCombinerInfo &DCI) const {
4610 if (N->getValueType(0) != MVT::i32)
4611 return SDValue();
4612
4613 SelectionDAG &DAG = DCI.DAG;
4614 SDLoc DL(N);
4615
4616 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4617 SDValue N0 = N->getOperand(0);
4618 SDValue N1 = N->getOperand(1);
4619
4620 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4621 // in the source into any_extends if the result of the mul is truncated. Since
4622 // we can assume the high bits are whatever we want, use the underlying value
4623 // to avoid the unknown high bits from interfering.
4624 if (N0.getOpcode() == ISD::ANY_EXTEND)
4625 N0 = N0.getOperand(0);
4626 if (N1.getOpcode() == ISD::ANY_EXTEND)
4627 N1 = N1.getOperand(0);
4628
4629 // Try to use two fast 24-bit multiplies (one for each half of the result)
4630 // instead of one slow extending multiply.
4631 unsigned LoOpcode = 0;
4632 unsigned HiOpcode = 0;
4633 if (Signed) {
4634 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4635 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4636 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4637 LoOpcode = AMDGPUISD::MUL_I24;
4638 HiOpcode = AMDGPUISD::MULHI_I24;
4639 }
4640 } else {
4641 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4642 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4643 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4644 LoOpcode = AMDGPUISD::MUL_U24;
4645 HiOpcode = AMDGPUISD::MULHI_U24;
4646 }
4647 }
4648 if (!LoOpcode)
4649 return SDValue();
4650
4651 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4652 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4653 DCI.CombineTo(N, Lo, Hi);
4654 return SDValue(N, 0);
4655}
4656
4657 SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4658                                                   DAGCombinerInfo &DCI) const {
4659 EVT VT = N->getValueType(0);
4660
4661 if (!Subtarget->hasMulI24() || VT.isVector())
4662 return SDValue();
4663
4664 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4665 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4666 // unnecessarily). isDivergent() is used as an approximation of whether the
4667 // value is in an SGPR.
4668 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4669 // valu op anyway)
4670 if (Subtarget->hasSMulHi() && !N->isDivergent())
4671 return SDValue();
4672
4673 SelectionDAG &DAG = DCI.DAG;
4674 SDLoc DL(N);
4675
4676 SDValue N0 = N->getOperand(0);
4677 SDValue N1 = N->getOperand(1);
4678
4679 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4680 return SDValue();
4681
4682 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4683 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4684
4685 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4686 DCI.AddToWorklist(Mulhi.getNode());
4687 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4688}
4689
4690 SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4691                                                   DAGCombinerInfo &DCI) const {
4692 EVT VT = N->getValueType(0);
4693
4694 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4695 return SDValue();
4696
4697 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4698 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4699 // unnecessarily). isDivergent() is used as an approximation of whether the
4700 // value is in an SGPR.
4701 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4702 // valu op anyway)
4703 if (Subtarget->hasSMulHi() && !N->isDivergent())
4704 return SDValue();
4705
4706 SelectionDAG &DAG = DCI.DAG;
4707 SDLoc DL(N);
4708
4709 SDValue N0 = N->getOperand(0);
4710 SDValue N1 = N->getOperand(1);
4711
4712 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4713 return SDValue();
4714
4715 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4716 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4717
4718 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4719 DCI.AddToWorklist(Mulhi.getNode());
4720 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4721}
4722
4723SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4724 SDValue Op,
4725 const SDLoc &DL,
4726 unsigned Opc) const {
4727 EVT VT = Op.getValueType();
4728 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4729 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4730 LegalVT != MVT::i16))
4731 return SDValue();
4732
4733 if (VT != MVT::i32)
4734 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4735
4736 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4737 if (VT != MVT::i32)
4738 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4739
4740 return FFBX;
4741}
4742
4743// The native instructions return -1 on 0 input. Optimize out a select that
4744// produces -1 on 0.
4745//
4746// TODO: If zero is not undef, we could also do this if the output is compared
4747// against the bitwidth.
4748//
4749// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4750 SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4751                                                  SDValue LHS, SDValue RHS,
4752 DAGCombinerInfo &DCI) const {
4753 if (!isNullConstant(Cond.getOperand(1)))
4754 return SDValue();
4755
4756 SelectionDAG &DAG = DCI.DAG;
4757 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4758 SDValue CmpLHS = Cond.getOperand(0);
4759
4760 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4761 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4762 if (CCOpcode == ISD::SETEQ &&
4763 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4764 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4765 unsigned Opc =
4766 isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4767 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4768 }
4769
4770 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4771 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4772 if (CCOpcode == ISD::SETNE &&
4773 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4774 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4775 unsigned Opc =
4776 isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
4777
4778 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4779 }
4780
4781 return SDValue();
4782}
4783
4784 static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4785                                                          unsigned Op,
4786 const SDLoc &SL,
4787 SDValue Cond,
4788 SDValue N1,
4789 SDValue N2) {
4790 SelectionDAG &DAG = DCI.DAG;
4791 EVT VT = N1.getValueType();
4792
4793 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4794 N1.getOperand(0), N2.getOperand(0));
4795 DCI.AddToWorklist(NewSelect.getNode());
4796 return DAG.getNode(Op, SL, VT, NewSelect);
4797}
4798
4799// Pull a free FP operation out of a select so it may fold into uses.
4800//
4801// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4802// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4803//
4804// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4805// select c, (fabs x), +k -> fabs (select c, x, k)
4806SDValue
4807 AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4808                                                  SDValue N) const {
4809 SelectionDAG &DAG = DCI.DAG;
4810 SDValue Cond = N.getOperand(0);
4811 SDValue LHS = N.getOperand(1);
4812 SDValue RHS = N.getOperand(2);
4813
4814 EVT VT = N.getValueType();
4815 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4816 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4818 return SDValue();
4819
4820 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4821 SDLoc(N), Cond, LHS, RHS);
4822 }
4823
4824 bool Inv = false;
4825 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4826 std::swap(LHS, RHS);
4827 Inv = true;
4828 }
4829
4830 // TODO: Support vector constants.
4831   ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4832   if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4833 !selectSupportsSourceMods(N.getNode())) {
4834 SDLoc SL(N);
4835 // If one side is an fneg/fabs and the other is a constant, we can push the
4836 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4837 SDValue NewLHS = LHS.getOperand(0);
4838 SDValue NewRHS = RHS;
4839
4840 // Careful: if the neg can be folded up, don't try to pull it back down.
4841 bool ShouldFoldNeg = true;
4842
4843 if (NewLHS.hasOneUse()) {
4844 unsigned Opc = NewLHS.getOpcode();
4845 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4846 ShouldFoldNeg = false;
4847 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4848 ShouldFoldNeg = false;
4849 }
4850
4851 if (ShouldFoldNeg) {
4852 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4853 return SDValue();
4854
4855 // We're going to be forced to use a source modifier anyway, there's no
4856 // point to pulling the negate out unless we can get a size reduction by
4857 // negating the constant.
4858 //
4859 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4860 // about cheaper constants.
4861 if (NewLHS.getOpcode() == ISD::FABS &&
4862           getConstantNegateCost(CRHS) != NegatibleCost::Cheaper)
4863         return SDValue();
4864
4866 return SDValue();
4867
4868 if (LHS.getOpcode() == ISD::FNEG)
4869 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4870
4871 if (Inv)
4872 std::swap(NewLHS, NewRHS);
4873
4874 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4875 Cond, NewLHS, NewRHS);
4876 DCI.AddToWorklist(NewSelect.getNode());
4877 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4878 }
4879 }
4880
4881 return SDValue();
4882}
4883
4884 SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4885                                              DAGCombinerInfo &DCI) const {
4886 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4887 return Folded;
4888
4889 SDValue Cond = N->getOperand(0);
4890 if (Cond.getOpcode() != ISD::SETCC)
4891 return SDValue();
4892
4893 EVT VT = N->getValueType(0);
4894 SDValue LHS = Cond.getOperand(0);
4895 SDValue RHS = Cond.getOperand(1);
4896 SDValue CC = Cond.getOperand(2);
4897
4898 SDValue True = N->getOperand(1);
4899 SDValue False = N->getOperand(2);
4900
4901 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4902 SelectionDAG &DAG = DCI.DAG;
4903 if (DAG.isConstantValueOfAnyType(True) &&
4904 !DAG.isConstantValueOfAnyType(False)) {
4905 // Swap cmp + select pair to move constant to false input.
4906 // This will allow using VOPC cndmasks more often.
4907 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4908
4909 SDLoc SL(N);
4910 ISD::CondCode NewCC =
4911 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4912
4913 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4914 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4915 }
4916
4917 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4918       SDValue MinMax
4919         = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4920 // Revisit this node so we can catch min3/max3/med3 patterns.
4921 //DCI.AddToWorklist(MinMax.getNode());
4922 return MinMax;
4923 }
4924 }
4925
4926 // There's no reason to not do this if the condition has other uses.
4927 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4928}
4929
4930static bool isInv2Pi(const APFloat &APF) {
4931 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4932 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4933 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4934
4935 return APF.bitwiseIsEqual(KF16) ||
4936 APF.bitwiseIsEqual(KF32) ||
4937 APF.bitwiseIsEqual(KF64);
4938}
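// (0x3e22f983 is the f32 bit pattern of 1/(2*pi) ~= 0.15915494; the f16 and
// f64 constants above encode the same value in their respective formats.)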
4939
4940 // 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4941// additional cost to negate them.
4942 TargetLowering::NegatibleCost
4943 AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4944   if (C->isZero())
4945 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4946
4947 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4948 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4949
4950   return NegatibleCost::Neutral;
4951 }
4952
4958
4964
4965static unsigned inverseMinMax(unsigned Opc) {
4966 switch (Opc) {
4967 case ISD::FMAXNUM:
4968 return ISD::FMINNUM;
4969 case ISD::FMINNUM:
4970 return ISD::FMAXNUM;
4971 case ISD::FMAXNUM_IEEE:
4972 return ISD::FMINNUM_IEEE;
4973 case ISD::FMINNUM_IEEE:
4974 return ISD::FMAXNUM_IEEE;
4975 case ISD::FMAXIMUM:
4976 return ISD::FMINIMUM;
4977 case ISD::FMINIMUM:
4978 return ISD::FMAXIMUM;
4979 case ISD::FMAXIMUMNUM:
4980 return ISD::FMINIMUMNUM;
4981 case ISD::FMINIMUMNUM:
4982 return ISD::FMAXIMUMNUM;
4983 case AMDGPUISD::FMAX_LEGACY:
4984 return AMDGPUISD::FMIN_LEGACY;
4985 case AMDGPUISD::FMIN_LEGACY:
4986 return AMDGPUISD::FMAX_LEGACY;
4987 default:
4988 llvm_unreachable("invalid min/max opcode");
4989 }
4990}
4991
4992/// \return true if it's profitable to try to push an fneg into its source
4993/// instruction.
4994 static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4995   // If the input has multiple uses and we can either fold the negate down, or
4996 // the other uses cannot, give up. This both prevents unprofitable
4997 // transformations and infinite loops: we won't repeatedly try to fold around
4998 // a negate that has no 'good' form.
4999 if (N0.hasOneUse()) {
5000 // This may be able to fold into the source, but at a code size cost. Don't
5001 // fold if the fold into the user is free.
5002 if (allUsesHaveSourceMods(N, 0))
5003 return false;
5004 } else {
5005 if (fnegFoldsIntoOp(N0.getNode()) &&
5007 return false;
5008 }
5009
5010 return true;
5011}
5012
5013 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
5014                                                  DAGCombinerInfo &DCI) const {
5015 SelectionDAG &DAG = DCI.DAG;
5016 SDValue N0 = N->getOperand(0);
5017 EVT VT = N->getValueType(0);
5018
5019 unsigned Opc = N0.getOpcode();
5020
5021 if (!shouldFoldFNegIntoSrc(N, N0))
5022 return SDValue();
5023
5024 SDLoc SL(N);
5025 switch (Opc) {
5026 case ISD::FADD: {
5027 if (!mayIgnoreSignedZero(N0))
5028 return SDValue();
5029
5030 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5031 SDValue LHS = N0.getOperand(0);
5032 SDValue RHS = N0.getOperand(1);
5033
5034 if (LHS.getOpcode() != ISD::FNEG)
5035 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5036 else
5037 LHS = LHS.getOperand(0);
5038
5039 if (RHS.getOpcode() != ISD::FNEG)
5040 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5041 else
5042 RHS = RHS.getOperand(0);
5043
5044 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5045 if (Res.getOpcode() != ISD::FADD)
5046 return SDValue(); // Op got folded away.
5047 if (!N0.hasOneUse())
5048 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5049 return Res;
5050 }
5051 case ISD::FMUL:
5052 case AMDGPUISD::FMUL_LEGACY: {
5053 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5054 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5055 SDValue LHS = N0.getOperand(0);
5056 SDValue RHS = N0.getOperand(1);
5057
5058 if (LHS.getOpcode() == ISD::FNEG)
5059 LHS = LHS.getOperand(0);
5060 else if (RHS.getOpcode() == ISD::FNEG)
5061 RHS = RHS.getOperand(0);
5062 else
5063 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5064
5065 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5066 if (Res.getOpcode() != Opc)
5067 return SDValue(); // Op got folded away.
5068 if (!N0.hasOneUse())
5069 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5070 return Res;
5071 }
5072 case ISD::FMA:
5073 case ISD::FMAD: {
5074 // TODO: handle llvm.amdgcn.fma.legacy
5075 if (!mayIgnoreSignedZero(N0))
5076 return SDValue();
5077
5078 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5079 SDValue LHS = N0.getOperand(0);
5080 SDValue MHS = N0.getOperand(1);
5081 SDValue RHS = N0.getOperand(2);
5082
5083 if (LHS.getOpcode() == ISD::FNEG)
5084 LHS = LHS.getOperand(0);
5085 else if (MHS.getOpcode() == ISD::FNEG)
5086 MHS = MHS.getOperand(0);
5087 else
5088 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5089
5090 if (RHS.getOpcode() != ISD::FNEG)
5091 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5092 else
5093 RHS = RHS.getOperand(0);
5094
5095 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5096 if (Res.getOpcode() != Opc)
5097 return SDValue(); // Op got folded away.
5098 if (!N0.hasOneUse())
5099 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5100 return Res;
5101 }
5102 case ISD::FMAXNUM:
5103 case ISD::FMINNUM:
5104 case ISD::FMAXNUM_IEEE:
5105 case ISD::FMINNUM_IEEE:
5106 case ISD::FMINIMUM:
5107 case ISD::FMAXIMUM:
5108 case ISD::FMINIMUMNUM:
5109 case ISD::FMAXIMUMNUM:
5110 case AMDGPUISD::FMAX_LEGACY:
5111 case AMDGPUISD::FMIN_LEGACY: {
5112 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5113 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5114 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5115 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5116
5117 SDValue LHS = N0.getOperand(0);
5118 SDValue RHS = N0.getOperand(1);
5119
5120 // 0 doesn't have a negated inline immediate.
5121 // TODO: This constant check should be generalized to other operations.
5122     if (isConstantCostlierToNegate(RHS))
5123       return SDValue();
5124
5125 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5126 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5127 unsigned Opposite = inverseMinMax(Opc);
5128
5129 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5130 if (Res.getOpcode() != Opposite)
5131 return SDValue(); // Op got folded away.
5132 if (!N0.hasOneUse())
5133 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5134 return Res;
5135 }
5136 case AMDGPUISD::FMED3: {
5137 SDValue Ops[3];
5138 for (unsigned I = 0; I < 3; ++I)
5139 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5140
5141 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5142 if (Res.getOpcode() != AMDGPUISD::FMED3)
5143 return SDValue(); // Op got folded away.
5144
5145 if (!N0.hasOneUse()) {
5146 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5147 DAG.ReplaceAllUsesWith(N0, Neg);
5148
5149 for (SDNode *U : Neg->users())
5150 DCI.AddToWorklist(U);
5151 }
5152
5153 return Res;
5154 }
5155 case ISD::FP_EXTEND:
5156 case ISD::FTRUNC:
5157 case ISD::FRINT:
5158 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5159 case ISD::FROUNDEVEN:
5160 case ISD::FSIN:
5161 case ISD::FCANONICALIZE:
5162 case AMDGPUISD::RCP:
5163 case AMDGPUISD::RCP_LEGACY:
5164 case AMDGPUISD::RCP_IFLAG:
5165 case AMDGPUISD::SIN_HW: {
5166 SDValue CvtSrc = N0.getOperand(0);
5167 if (CvtSrc.getOpcode() == ISD::FNEG) {
5168 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5169 // (fneg (rcp (fneg x))) -> (rcp x)
5170 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5171 }
5172
5173 if (!N0.hasOneUse())
5174 return SDValue();
5175
5176 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5177 // (fneg (rcp x)) -> (rcp (fneg x))
5178 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5179 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5180 }
5181 case ISD::FP_ROUND: {
5182 SDValue CvtSrc = N0.getOperand(0);
5183
5184 if (CvtSrc.getOpcode() == ISD::FNEG) {
5185 // (fneg (fp_round (fneg x))) -> (fp_round x)
5186 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5187 CvtSrc.getOperand(0), N0.getOperand(1));
5188 }
5189
5190 if (!N0.hasOneUse())
5191 return SDValue();
5192
5193 // (fneg (fp_round x)) -> (fp_round (fneg x))
5194 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5195 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5196 }
5197 case ISD::FP16_TO_FP: {
5198 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5199 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5200 // Put the fneg back as a legal source operation that can be matched later.
5201 SDLoc SL(N);
5202
5203 SDValue Src = N0.getOperand(0);
5204 EVT SrcVT = Src.getValueType();
5205
5206 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5207 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5208 DAG.getConstant(0x8000, SL, SrcVT));
5209 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5210 }
5211 case ISD::SELECT: {
5212 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5213 // TODO: Invert conditions of foldFreeOpFromSelect
5214 return SDValue();
5215 }
5216 case ISD::BITCAST: {
5217 SDLoc SL(N);
5218 SDValue BCSrc = N0.getOperand(0);
5219 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5220 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5221 if (HighBits.getValueType().getSizeInBits() != 32 ||
5222 !fnegFoldsIntoOp(HighBits.getNode()))
5223 return SDValue();
5224
5225 // f64 fneg only really needs to operate on the high half of the
5226 // register, so try to force it to an f32 operation to help make use of
5227 // source modifiers.
5228 //
5229 //
5230 // fneg (f64 (bitcast (build_vector x, y))) ->
5231 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5232 // (fneg (bitcast i32:y to f32)))
5233
5234 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5235 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5236 SDValue CastBack =
5237 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5238
5239 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5240 Ops.back() = CastBack;
5241 DCI.AddToWorklist(NegHi.getNode());
5242 SDValue Build =
5243 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5244 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5245
5246 if (!N0.hasOneUse())
5247 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5248 return Result;
5249 }
5250
5251 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5252 BCSrc.hasOneUse()) {
5253 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5254 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5255
5256 // TODO: Cast back result for multiple uses is beneficial in some cases.
5257
5258 SDValue LHS =
5259 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5260 SDValue RHS =
5261 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5262
5263 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5264 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5265
5266 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5267 NegRHS);
5268 }
5269
5270 return SDValue();
5271 }
5272 default:
5273 return SDValue();
5274 }
5275}
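The combine above pushes an fneg into the arithmetic that produced its operand, where AMDGPU can usually absorb the negation as a free source modifier. A minimal scalar sketch of the identities being relied on (illustrative only, not code from this file; the fadd form additionally requires that signed zeros may be ignored):
#include <cassert>

// (fneg (fmul x, y)) == (fmul x, (fneg y))
static float fnegThroughMul(float X, float Y) { return X * -Y; }

// (fneg (fadd x, y)) == (fadd (fneg x), (fneg y)), modulo signed zeros
static float fnegThroughAdd(float X, float Y) { return -X + -Y; }

int main() {
  assert(fnegThroughMul(3.0f, 2.0f) == -(3.0f * 2.0f));
  assert(fnegThroughAdd(3.0f, 2.0f) == -(3.0f + 2.0f));
  return 0;
}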
5276
5277 SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5278 DAGCombinerInfo &DCI) const {
5279 SelectionDAG &DAG = DCI.DAG;
5280 SDValue N0 = N->getOperand(0);
5281
5282 if (!N0.hasOneUse())
5283 return SDValue();
5284
5285 switch (N0.getOpcode()) {
5286 case ISD::FP16_TO_FP: {
5287 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5288 SDLoc SL(N);
5289 SDValue Src = N0.getOperand(0);
5290 EVT SrcVT = Src.getValueType();
5291
5292 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5293 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5294 DAG.getConstant(0x7fff, SL, SrcVT));
5295 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5296 }
5297 default:
5298 return SDValue();
5299 }
5300}
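Both fp16_to_fp folds above (fneg via XOR with 0x8000, fabs via AND with 0x7fff) rely on the IEEE half-precision sign bit being bit 15 of the integer-typed source. A scalar model of those bit tricks (illustrative sketch, not part of this file):
#include <cstdint>

static uint16_t fnegF16Bits(uint16_t Bits) { return Bits ^ 0x8000; } // flip the sign bit
static uint16_t fabsF16Bits(uint16_t Bits) { return Bits & 0x7fff; } // clear the sign bit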
5301
5302 SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5303 DAGCombinerInfo &DCI) const {
5304 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5305 if (!CFP)
5306 return SDValue();
5307
5308 // XXX - Should this flush denormals?
5309 const APFloat &Val = CFP->getValueAPF();
5310 APFloat One(Val.getSemantics(), "1.0");
5311 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5312}
5313
5314 SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5315 DAGCombinerInfo &DCI) const {
5316 SelectionDAG &DAG = DCI.DAG;
5317 SDLoc DL(N);
5318
5319 switch(N->getOpcode()) {
5320 default:
5321 break;
5322 case ISD::BITCAST: {
5323 EVT DestVT = N->getValueType(0);
5324
5325 // Push casts through vector builds. This helps avoid emitting a large
5326 // number of copies when materializing floating point vector constants.
5327 //
5328 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5329 // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5330 if (DestVT.isVector()) {
5331 SDValue Src = N->getOperand(0);
5332 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5333 (DCI.getDAGCombineLevel() < AfterLegalizeDAG ||
5334 isOperationLegal(ISD::BUILD_VECTOR, DestVT))) {
5335 EVT SrcVT = Src.getValueType();
5336 unsigned NElts = DestVT.getVectorNumElements();
5337
5338 if (SrcVT.getVectorNumElements() == NElts) {
5339 EVT DestEltVT = DestVT.getVectorElementType();
5340
5341 SmallVector<SDValue, 8> CastedElts;
5342 SDLoc SL(N);
5343 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5344 SDValue Elt = Src.getOperand(I);
5345 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5346 }
5347
5348 return DAG.getBuildVector(DestVT, SL, CastedElts);
5349 }
5350 }
5351 }
5352
5353 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5354 break;
5355
5356 // Fold bitcasts of constants.
5357 //
5358 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5359 // TODO: Generalize and move to DAGCombiner
5360 SDValue Src = N->getOperand(0);
5361 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5362 SDLoc SL(N);
5363 uint64_t CVal = C->getZExtValue();
5364 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5365 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5366 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5367 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5368 }
5369
5370 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5371 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5372 SDLoc SL(N);
5373 uint64_t CVal = Val.getZExtValue();
5374 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5375 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5376 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5377
5378 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5379 }
5380
5381 break;
5382 }
5383 case ISD::SHL:
5384 case ISD::SRA:
5385 case ISD::SRL: {
5386 // Range metadata can be invalidated when loads are converted to legal types
5387 // (e.g. v2i64 -> v4i32).
5388 // Try to convert vector shl/sra/srl before type legalization so that range
5389 // metadata can be utilized.
5390 if (!(N->getValueType(0).isVector() &&
5391 DCI.isBeforeLegalizeOps()) &&
5392 DCI.getDAGCombineLevel() < AfterLegalizeDAG)
5393 break;
5394 if (N->getOpcode() == ISD::SHL)
5395 return performShlCombine(N, DCI);
5396 if (N->getOpcode() == ISD::SRA)
5397 return performSraCombine(N, DCI);
5398 return performSrlCombine(N, DCI);
5399 }
5400 case ISD::TRUNCATE:
5401 return performTruncateCombine(N, DCI);
5402 case ISD::MUL:
5403 return performMulCombine(N, DCI);
5404 case AMDGPUISD::MUL_U24:
5405 case AMDGPUISD::MUL_I24: {
5406 if (SDValue Simplified = simplifyMul24(N, DCI))
5407 return Simplified;
5408 break;
5409 }
5410 case AMDGPUISD::MULHI_I24:
5411 case AMDGPUISD::MULHI_U24:
5412 return simplifyMul24(N, DCI);
5413 case ISD::SMUL_LOHI:
5414 case ISD::UMUL_LOHI:
5415 return performMulLoHiCombine(N, DCI);
5416 case ISD::MULHS:
5417 return performMulhsCombine(N, DCI);
5418 case ISD::MULHU:
5419 return performMulhuCombine(N, DCI);
5420 case ISD::SELECT:
5421 return performSelectCombine(N, DCI);
5422 case ISD::FNEG:
5423 return performFNegCombine(N, DCI);
5424 case ISD::FABS:
5425 return performFAbsCombine(N, DCI);
5426 case AMDGPUISD::BFE_I32:
5427 case AMDGPUISD::BFE_U32: {
5428 assert(!N->getValueType(0).isVector() &&
5429 "Vector handling of BFE not implemented");
5430 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5431 if (!Width)
5432 break;
5433
5434 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5435 if (WidthVal == 0)
5436 return DAG.getConstant(0, DL, MVT::i32);
5437
5438 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5439 if (!Offset)
5440 break;
5441
5442 SDValue BitsFrom = N->getOperand(0);
5443 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5444
5445 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5446
5447 if (OffsetVal == 0) {
5448 // This is already sign / zero extended, so try to fold away extra BFEs.
5449 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5450
5451 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5452 if (OpSignBits >= SignBits)
5453 return BitsFrom;
5454
5455 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5456 if (Signed) {
5457 // This is a sign_extend_inreg. Replace it to take advantage of existing
5458 // DAG Combines. If not eliminated, we will match back to BFE during
5459 // selection.
5460
5461 // TODO: The sext_inreg of extended types ends up here, although we could
5462 // handle them in a single BFE.
5463 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5464 DAG.getValueType(SmallVT));
5465 }
5466
5467 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5468 }
5469
5470 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5471 if (Signed) {
5472 return constantFoldBFE<int32_t>(DAG,
5473 CVal->getSExtValue(),
5474 OffsetVal,
5475 WidthVal,
5476 DL);
5477 }
5478
5479 return constantFoldBFE<uint32_t>(DAG,
5480 CVal->getZExtValue(),
5481 OffsetVal,
5482 WidthVal,
5483 DL);
5484 }
5485
5486 if ((OffsetVal + WidthVal) >= 32 &&
5487 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5488 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5489 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5490 BitsFrom, ShiftVal);
5491 }
5492
5493 if (BitsFrom.hasOneUse()) {
5494 APInt Demanded = APInt::getBitsSet(32,
5495 OffsetVal,
5496 OffsetVal + WidthVal);
5497
5498 KnownBits Known;
5499 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5500 !DCI.isBeforeLegalizeOps());
5501 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5502 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5503 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5504 DCI.CommitTargetLoweringOpt(TLO);
5505 }
5506 }
5507
5508 break;
5509 }
5510 case ISD::LOAD:
5511 return performLoadCombine(N, DCI);
5512 case ISD::STORE:
5513 return performStoreCombine(N, DCI);
5514 case AMDGPUISD::RCP:
5515 case AMDGPUISD::RCP_IFLAG:
5516 return performRcpCombine(N, DCI);
5517 case ISD::AssertZext:
5518 case ISD::AssertSext:
5519 return performAssertSZExtCombine(N, DCI);
5520 case ISD::INTRINSIC_WO_CHAIN:
5521 return performIntrinsicWOChainCombine(N, DCI);
5522 case AMDGPUISD::FMAD_FTZ: {
5523 SDValue N0 = N->getOperand(0);
5524 SDValue N1 = N->getOperand(1);
5525 SDValue N2 = N->getOperand(2);
5526 EVT VT = N->getValueType(0);
5527
5528 // FMAD_FTZ is an FMAD + flush denormals to zero.
5529 // We flush the inputs, the intermediate step, and the output.
5530 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5531 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5532 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5533 if (N0CFP && N1CFP && N2CFP) {
5534 const auto FTZ = [](const APFloat &V) {
5535 if (V.isDenormal()) {
5536 APFloat Zero(V.getSemantics(), 0);
5537 return V.isNegative() ? -Zero : Zero;
5538 }
5539 return V;
5540 };
5541
5542 APFloat V0 = FTZ(N0CFP->getValueAPF());
5543 APFloat V1 = FTZ(N1CFP->getValueAPF());
5544 APFloat V2 = FTZ(N2CFP->getValueAPF());
5545 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5546 V0 = FTZ(V0);
5547 V0.add(V2, APFloat::rmNearestTiesToEven);
5548 return DAG.getConstantFP(FTZ(V0), DL, VT);
5549 }
5550 break;
5551 }
5552 }
5553 return SDValue();
5554}
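The BFE_I32/BFE_U32 folds in PerformDAGCombine above treat the node as a 32-bit bitfield extract whose offset and width are taken modulo 32, where a zero width yields zero, and where a field that runs past bit 31 degenerates into a plain shift. A scalar reference model of that behaviour, mirroring the constant-folding path (illustrative only, not part of this file):
#include <cstdint>

static uint32_t bfeU32(uint32_t Src, uint32_t Offset, uint32_t Width) {
  Offset &= 0x1f;
  Width &= 0x1f;
  if (Width == 0)
    return 0;
  if (Offset + Width < 32)
    return (Src << (32 - Offset - Width)) >> (32 - Width);
  return Src >> Offset; // field runs off the top: plain logical shift
}

static int32_t bfeI32(int32_t Src, uint32_t Offset, uint32_t Width) {
  Offset &= 0x1f;
  Width &= 0x1f;
  if (Width == 0)
    return 0;
  if (Offset + Width < 32) // shift the field to the top, then sign-extend back down
    return (int32_t)((uint32_t)Src << (32 - Offset - Width)) >> (32 - Width);
  return Src >> Offset; // arithmetic shift keeps the sign
}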
5555
5556//===----------------------------------------------------------------------===//
5557// Helper functions
5558//===----------------------------------------------------------------------===//
5559
5560 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5561 const TargetRegisterClass *RC,
5562 Register Reg, EVT VT,
5563 const SDLoc &SL,
5564 bool RawReg) const {
5565 MachineFunction &MF = DAG.getMachineFunction();
5566 MachineRegisterInfo &MRI = MF.getRegInfo();
5567 Register VReg;
5568
5569 if (!MRI.isLiveIn(Reg)) {
5570 VReg = MRI.createVirtualRegister(RC);
5571 MRI.addLiveIn(Reg, VReg);
5572 } else {
5573 VReg = MRI.getLiveInVirtReg(Reg);
5574 }
5575
5576 if (RawReg)
5577 return DAG.getRegister(VReg, VT);
5578
5579 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5580}
5581
5582// This may be called multiple times, and nothing prevents creating multiple
5583// objects at the same offset. See if we already defined this object.
5584 static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5585 int64_t Offset) {
5586 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5587 if (MFI.getObjectOffset(I) == Offset) {
5588 assert(MFI.getObjectSize(I) == Size);
5589 return I;
5590 }
5591 }
5592
5593 return MFI.CreateFixedObject(Size, Offset, true);
5594}
5595
5596 SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5597 EVT VT,
5598 const SDLoc &SL,
5599 int64_t Offset) const {
5600 MachineFunction &MF = DAG.getMachineFunction();
5601 MachineFrameInfo &MFI = MF.getFrameInfo();
5602 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5603
5604 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5605 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5606
5607 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5610}
5611
5612 SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5613 const SDLoc &SL,
5614 SDValue Chain,
5615 SDValue ArgVal,
5616 int64_t Offset) const {
5617 MachineFunction &MF = DAG.getMachineFunction();
5618 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5619 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5620
5621 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5622 // Stores to the argument stack area are relative to the stack pointer.
5623 SDValue SP =
5624 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5625 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5626 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5627 MachineMemOperand::MODereferenceable);
5628 return Store;
5629}
5630
5631 SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5632 const TargetRegisterClass *RC,
5633 EVT VT, const SDLoc &SL,
5634 const ArgDescriptor &Arg) const {
5635 assert(Arg && "Attempting to load missing argument");
5636
5637 SDValue V = Arg.isRegister() ?
5638 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5639 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5640
5641 if (!Arg.isMasked())
5642 return V;
5643
5644 unsigned Mask = Arg.getMask();
5645 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5646 V = DAG.getNode(ISD::SRL, SL, VT, V,
5647 DAG.getShiftAmountConstant(Shift, VT, SL));
5648 return DAG.getNode(ISD::AND, SL, VT, V,
5649 DAG.getConstant(Mask >> Shift, SL, VT));
5650}
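When several small arguments are packed into one register, loadInputValue above recovers a field by shifting it down by the number of trailing zeros in its mask and then masking to the field width. A standalone sketch of that unpacking (illustrative only; the example mask is hypothetical and a non-zero mask is assumed):
#include <bit>
#include <cstdint>

static uint32_t unpackMaskedArg(uint32_t Reg, uint32_t Mask) {
  unsigned Shift = std::countr_zero(Mask); // position of the field's low bit
  return (Reg >> Shift) & (Mask >> Shift); // shift down, then mask to the field width
}
// e.g. unpackMaskedArg(Reg, 0x3ff00000) extracts bits [29:20] of Reg.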
5651
5652 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5653 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5654 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5655 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5656 uint64_t ArgOffset =
5657 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5658 switch (Param) {
5659 case FIRST_IMPLICIT:
5660 return ArgOffset;
5661 case PRIVATE_BASE:
5662 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5663 case SHARED_BASE:
5664 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5665 case QUEUE_PTR:
5666 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5667 }
5668 llvm_unreachable("unexpected implicit parameter type");
5669}
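The byte offset computed above rounds the explicit kernel-argument block up to the implicit-argument alignment, adds the target's explicit-argument offset, and then adds the per-parameter offset. A minimal sketch of that arithmetic (illustrative only; the helper and parameter names are not from this file):
#include <cstdint>

static uint64_t roundUpTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

static uint64_t implicitParamByteOffset(uint64_t ExplicitKernArgSize,
                                        uint64_t ExplicitArgOffset,
                                        uint64_t Alignment,
                                        uint64_t ParamOffset) {
  return roundUpTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset + ParamOffset;
}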
5670
5671 uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5672 const MachineFunction &MF, const ImplicitParameter Param) const {
5673 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5674 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5675 }
5676
5677 SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5678 SelectionDAG &DAG, int Enabled,
5679 int &RefinementSteps,
5680 bool &UseOneConstNR,
5681 bool Reciprocal) const {
5682 EVT VT = Operand.getValueType();
5683
5684 if (VT == MVT::f32) {
5685 RefinementSteps = 0;
5686 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5687 }
5688
5689 // TODO: There is also an f64 rsq instruction, but the documentation is less
5690 // clear on its precision.
5691
5692 return SDValue();
5693}
5694
5695 SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5696 SelectionDAG &DAG, int Enabled,
5697 int &RefinementSteps) const {
5698 EVT VT = Operand.getValueType();
5699
5700 if (VT == MVT::f32) {
5701 // Reciprocal, < 1 ulp error.
5702 //
5703 // This reciprocal approximation converges to < 0.5 ulp error with one
5704 // Newton-Raphson step performed with two fused multiply-adds (FMAs).
5705
5706 RefinementSteps = 0;
5707 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5708 }
5709
5710 // TODO: There is also an f64 rcp instruction, but the documentation is less
5711 // clear on its precision.
5712
5713 return SDValue();
5714}
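The comment above refers to one Newton-Raphson refinement step built from two fused multiply-adds; for f32 the hardware RCP is already accurate enough that zero refinement steps are requested. A sketch of what such a refinement step looks like (illustrative only, not code from this file):
#include <cmath>

// Refine an estimate X of 1/V: X' = X + X*(1 - V*X), written as two FMAs.
static float refineRecip(float V, float X) {
  float E = std::fmaf(-V, X, 1.0f); // residual 1 - V*X
  return std::fmaf(X, E, X);        // X + X*E
}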
5715
5716static unsigned workitemIntrinsicDim(unsigned ID) {
5717 switch (ID) {
5718 case Intrinsic::amdgcn_workitem_id_x:
5719 return 0;
5720 case Intrinsic::amdgcn_workitem_id_y:
5721 return 1;
5722 case Intrinsic::amdgcn_workitem_id_z:
5723 return 2;
5724 default:
5725 llvm_unreachable("not a workitem intrinsic");
5726 }
5727}
5728
5729 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5730 const SDValue Op, KnownBits &Known,
5731 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5732
5733 Known.resetAll(); // Don't know anything.
5734
5735 unsigned Opc = Op.getOpcode();
5736
5737 switch (Opc) {
5738 default:
5739 break;
5740 case AMDGPUISD::CARRY:
5741 case AMDGPUISD::BORROW: {
5742 Known.Zero = APInt::getHighBitsSet(32, 31);
5743 break;
5744 }
5745
5746 case AMDGPUISD::BFE_I32:
5747 case AMDGPUISD::BFE_U32: {
5748 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5749 if (!CWidth)
5750 return;
5751
5752 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5753
5754 if (Opc == AMDGPUISD::BFE_U32)
5755 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5756
5757 break;
5758 }
5759 case AMDGPUISD::FP_TO_FP16: {
5760 unsigned BitWidth = Known.getBitWidth();
5761
5762 // High bits are zero.
5763 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5764 break;
5765 }
5766 case AMDGPUISD::MUL_U24:
5767 case AMDGPUISD::MUL_I24: {
5768 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5769 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5770 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5771 RHSKnown.countMinTrailingZeros();
5772 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5773 // Skip extra check if all bits are known zeros.
5774 if (TrailZ >= 32)
5775 break;
5776
5777 // Truncate to 24 bits.
5778 LHSKnown = LHSKnown.trunc(24);
5779 RHSKnown = RHSKnown.trunc(24);
5780
5781 if (Opc == AMDGPUISD::MUL_I24) {
5782 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5783 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5784 unsigned MaxValBits = LHSValBits + RHSValBits;
5785 if (MaxValBits > 32)
5786 break;
5787 unsigned SignBits = 32 - MaxValBits + 1;
5788 bool LHSNegative = LHSKnown.isNegative();
5789 bool LHSNonNegative = LHSKnown.isNonNegative();
5790 bool LHSPositive = LHSKnown.isStrictlyPositive();
5791 bool RHSNegative = RHSKnown.isNegative();
5792 bool RHSNonNegative = RHSKnown.isNonNegative();
5793 bool RHSPositive = RHSKnown.isStrictlyPositive();
5794
5795 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5796 Known.Zero.setHighBits(SignBits);
5797 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5798 Known.One.setHighBits(SignBits);
5799 } else {
5800 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5801 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5802 unsigned MaxValBits = LHSValBits + RHSValBits;
5803 if (MaxValBits >= 32)
5804 break;
5805 Known.Zero.setBitsFrom(MaxValBits);
5806 }
5807 break;
5808 }
5809 case AMDGPUISD::PERM: {
5810 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5811 if (!CMask)
5812 return;
5813
5814 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5815 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5816 unsigned Sel = CMask->getZExtValue();
5817
5818 for (unsigned I = 0; I < 32; I += 8) {
5819 unsigned SelBits = Sel & 0xff;
5820 if (SelBits < 4) {
5821 SelBits *= 8;
5822 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5823 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5824 } else if (SelBits < 7) {
5825 SelBits = (SelBits & 3) * 8;
5826 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5827 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5828 } else if (SelBits == 0x0c) {
5829 Known.Zero |= 0xFFull << I;
5830 } else if (SelBits > 0x0c) {
5831 Known.One |= 0xFFull << I;
5832 }
5833 Sel >>= 8;
5834 }
5835 break;
5836 }
5837 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5838 Known.Zero.setHighBits(24);
5839 break;
5840 }
5841 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5842 Known.Zero.setHighBits(16);
5843 break;
5844 }
5845 case AMDGPUISD::LDS: {
5846 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5847 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5848
5849 Known.Zero.setHighBits(16);
5850 Known.Zero.setLowBits(Log2(Alignment));
5851 break;
5852 }
5853 case AMDGPUISD::SMIN3:
5854 case AMDGPUISD::SMAX3:
5855 case AMDGPUISD::SMED3:
5856 case AMDGPUISD::UMIN3:
5857 case AMDGPUISD::UMAX3:
5858 case AMDGPUISD::UMED3: {
5859 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5860 if (Known2.isUnknown())
5861 break;
5862
5863 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5864 if (Known1.isUnknown())
5865 break;
5866
5867 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5868 if (Known0.isUnknown())
5869 break;
5870
5871 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
5872 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
5873 Known.One = Known0.One & Known1.One & Known2.One;
5874 break;
5875 }
5876 case ISD::INTRINSIC_WO_CHAIN: {
5877 unsigned IID = Op.getConstantOperandVal(0);
5878 switch (IID) {
5879 case Intrinsic::amdgcn_workitem_id_x:
5880 case Intrinsic::amdgcn_workitem_id_y:
5881 case Intrinsic::amdgcn_workitem_id_z: {
5882 unsigned MaxValue = Subtarget->getMaxWorkitemID(
5883 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
5884 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
5885 break;
5886 }
5887 default:
5888 break;
5889 }
5890 }
5891 }
5892}
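The MUL_U24 handling above bounds the product's active bits by the sum of the operands' active bits: a value needing at most A bits multiplied by one needing at most B bits needs at most A+B bits. A quick numeric check (illustrative only):
#include <cassert>
#include <cstdint>

int main() {
  uint64_t X = 0x00ffffffu; // at most 24 significant bits
  uint64_t Y = 0x000000ffu; // at most 8 significant bits
  uint64_t Product = X * Y; // therefore at most 24 + 8 = 32 bits
  assert(Product <= 0xffffffffull);
  return 0;
}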
5893
5894 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
5895 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5896 unsigned Depth) const {
5897 switch (Op.getOpcode()) {
5898 case AMDGPUISD::BFE_I32: {
5899 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5900 if (!Width)
5901 return 1;
5902
5903 unsigned SignBits = 32 - Width->getZExtValue() + 1;
5904 if (!isNullConstant(Op.getOperand(1)))
5905 return SignBits;
5906
5907 // TODO: Could probably figure something out with non-0 offsets.
5908 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5909 return std::max(SignBits, Op0SignBits);
5910 }
5911
5912 case AMDGPUISD::BFE_U32: {
5913 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5914 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
5915 }
5916
5917 case AMDGPUISD::CARRY:
5918 case AMDGPUISD::BORROW:
5919 return 31;
5920 case AMDGPUISD::BUFFER_LOAD_BYTE:
5921 return 25;
5922 case AMDGPUISD::BUFFER_LOAD_SHORT:
5923 return 17;
5924 case AMDGPUISD::BUFFER_LOAD_UBYTE:
5925 return 24;
5926 case AMDGPUISD::BUFFER_LOAD_USHORT:
5927 return 16;
5928 case AMDGPUISD::FP_TO_FP16:
5929 return 16;
5930 case AMDGPUISD::SMIN3:
5931 case AMDGPUISD::SMAX3:
5932 case AMDGPUISD::SMED3:
5933 case AMDGPUISD::UMIN3:
5934 case AMDGPUISD::UMAX3:
5935 case AMDGPUISD::UMED3: {
5936 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
5937 if (Tmp2 == 1)
5938 return 1; // Early out.
5939
5940 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
5941 if (Tmp1 == 1)
5942 return 1; // Early out.
5943
5944 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
5945 if (Tmp0 == 1)
5946 return 1; // Early out.
5947
5948 return std::min({Tmp0, Tmp1, Tmp2});
5949 }
5950 default:
5951 return 1;
5952 }
5953}
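The buffer-load entries above follow from the load's extension: a sign-extended byte in a 32-bit register repeats its sign bit through the top 25 bit positions, and a sign-extended short through the top 17, which is where the BUFFER_LOAD_BYTE and BUFFER_LOAD_SHORT values come from. A small helper that counts sign bits the same way (illustrative only, not part of this file):
#include <cassert>
#include <cstdint>

// Number of leading bits that are copies of the sign bit, including the sign bit itself.
static int countSignBits(int32_t V) {
  int N = 1; // the sign bit always counts
  for (int I = 31; I > 0 && (((V >> I) & 1) == ((V >> (I - 1)) & 1)); --I)
    ++N;
  return N;
}

int main() {
  assert(countSignBits((int32_t)(int8_t)0x80) == 25);    // sign-extended byte
  assert(countSignBits((int32_t)(int16_t)0x8000) == 17); // sign-extended short
  return 0;
}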
5954
5955 unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
5956 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
5957 const MachineRegisterInfo &MRI, unsigned Depth) const {
5958 const MachineInstr *MI = MRI.getVRegDef(R);
5959 if (!MI)
5960 return 1;
5961
5962 // TODO: Check range metadata on MMO.
5963 switch (MI->getOpcode()) {
5964 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
5965 return 25;
5966 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
5967 return 17;
5968 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
5969 return 24;
5970 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
5971 return 16;
5972 case AMDGPU::G_AMDGPU_SMED3:
5973 case AMDGPU::G_AMDGPU_UMED3: {
5974 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
5975 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
5976 if (Tmp2 == 1)
5977 return 1;
5978 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
5979 if (Tmp1 == 1)
5980 return 1;
5981 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
5982 if (Tmp0 == 1)
5983 return 1;
5984 return std::min({Tmp0, Tmp1, Tmp2});
5985 }
5986 default:
5987 return 1;
5988 }
5989}
5990
5991 bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
5992 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
5993 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
5994 unsigned Opcode = Op.getOpcode();
5995 switch (Opcode) {
5996 case AMDGPUISD::BFE_I32:
5997 case AMDGPUISD::BFE_U32:
5998 return false;
5999 }
6000 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6001 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6002}
6003
6004 bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6005 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6006 unsigned Depth) const {
6007 unsigned Opcode = Op.getOpcode();
6008 switch (Opcode) {
6009 case AMDGPUISD::FMIN_LEGACY:
6010 case AMDGPUISD::FMAX_LEGACY: {
6011 if (SNaN)
6012 return true;
6013
6014 // TODO: Can check no nans on one of the operands for each one, but which
6015 // one?
6016 return false;
6017 }
6018 case AMDGPUISD::FMUL_LEGACY:
6019 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6020 if (SNaN)
6021 return true;
6022 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6023 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6024 }
6025 case AMDGPUISD::FMED3:
6026 case AMDGPUISD::FMIN3:
6027 case AMDGPUISD::FMAX3:
6028 case AMDGPUISD::FMINIMUM3:
6029 case AMDGPUISD::FMAXIMUM3:
6030 case AMDGPUISD::FMAD_FTZ: {
6031 if (SNaN)
6032 return true;
6033 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6034 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6035 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6036 }
6037 case AMDGPUISD::CVT_F32_UBYTE0:
6038 case AMDGPUISD::CVT_F32_UBYTE1:
6039 case AMDGPUISD::CVT_F32_UBYTE2:
6040 case AMDGPUISD::CVT_F32_UBYTE3:
6041 return true;
6042
6043 case AMDGPUISD::RCP:
6044 case AMDGPUISD::RSQ:
6045 case AMDGPUISD::RCP_LEGACY:
6046 case AMDGPUISD::RSQ_CLAMP: {
6047 if (SNaN)
6048 return true;
6049
6050 // TODO: Need an is-known-positive check.
6051 return false;
6052 }
6053 case ISD::FLDEXP:
6054 case AMDGPUISD::FRACT: {
6055 if (SNaN)
6056 return true;
6057 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6058 }
6059 case AMDGPUISD::DIV_SCALE:
6060 case AMDGPUISD::DIV_FMAS:
6061 case AMDGPUISD::DIV_FIXUP:
6062 // TODO: Refine on operands.
6063 return SNaN;
6064 case AMDGPUISD::SIN_HW:
6065 case AMDGPUISD::COS_HW: {
6066 // TODO: Need check for infinity
6067 return SNaN;
6068 }
6069 case ISD::INTRINSIC_WO_CHAIN: {
6070 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6071 // TODO: Handle more intrinsics
6072 switch (IntrinsicID) {
6073 case Intrinsic::amdgcn_cubeid:
6074 case Intrinsic::amdgcn_cvt_off_f32_i4:
6075 return true;
6076
6077 case Intrinsic::amdgcn_frexp_mant: {
6078 if (SNaN)
6079 return true;
6080 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6081 }
6082 case Intrinsic::amdgcn_cvt_pkrtz: {
6083 if (SNaN)
6084 return true;
6085 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6086 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6087 }
6088 case Intrinsic::amdgcn_rcp:
6089 case Intrinsic::amdgcn_rsq:
6090 case Intrinsic::amdgcn_rcp_legacy:
6091 case Intrinsic::amdgcn_rsq_legacy:
6092 case Intrinsic::amdgcn_rsq_clamp:
6093 case Intrinsic::amdgcn_tanh: {
6094 if (SNaN)
6095 return true;
6096
6097 // TODO: Need an is-known-positive check.
6098 return false;
6099 }
6100 case Intrinsic::amdgcn_trig_preop:
6101 case Intrinsic::amdgcn_fdot2:
6102 // TODO: Refine on operand
6103 return SNaN;
6104 case Intrinsic::amdgcn_fma_legacy:
6105 if (SNaN)
6106 return true;
6107 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6108 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6109 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6110 default:
6111 return false;
6112 }
6113 }
6114 default:
6115 return false;
6116 }
6117}
6118
6119 bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6120 Register N0, Register N1) const {
6121 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6122}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static const AMDGPUSubtarget & get(const MachineFunction &MF)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1396
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1163
const fltSemantics & getSemantics() const
Definition APFloat.h:1439
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1181
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1151
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1389
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:270
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:359
Type * getValueType() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic or volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
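As a hedged illustration of the constant-building helpers above (the opcode combination and the value 42 are arbitrary, not from this file):
  // Build (and (add X, 42), -1) purely as an example of node construction.
  SDValue buildExample(SelectionDAG &DAG, const SDLoc &DL, SDValue X) {
    EVT VT = X.getValueType();
    SDValue FortyTwo = DAG.getConstant(42, DL, VT);   // ConstantSDNode wrapper
    SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); // -1 in VT
    SDValue Add = DAG.getNode(ISD::ADD, DL, VT, X, FortyTwo);
    return DAG.getNode(ISD::AND, DL, VT, Add, AllOnes);
  }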
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node that starts a new call frame, in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
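A small sketch, assuming a caller that wants an index normalized to i32, of how getSExtOrTrunc is used:
  // Sign-extends when Idx is narrower than i32, truncates when wider,
  // and is a no-op when Idx is already i32.
  SDValue toI32Signed(SelectionDAG &DAG, const SDLoc &DL, SDValue Idx) {
    return DAG.getSExtOrTrunc(Idx, DL, MVT::i32);
  }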
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
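A sketch (the fold is hypothetical, not one performed by this file) of how computeKnownBits results are typically consulted during a combine:
  // Returns true if both ADD operands provably have a zero low bit, so the
  // sum's low bit is also zero and a following (and X, ~1) would be redundant.
  static bool addKeepsLowBitClear(SelectionDAG &DAG, SDValue Add) {
    KnownBits LHS = DAG.computeKnownBits(Add.getOperand(0));
    KnownBits RHS = DAG.computeKnownBits(Add.getOperand(1));
    return LHS.countMinTrailingZeros() >= 1 && RHS.countMinTrailingZeros() >= 1;
  }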
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
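For example, a hedged sketch of forming the address of the upper half of a 64-bit object with getObjectPtrOffset; the 4-byte offset is the assumption here.
  // Adds 4 bytes to BasePtr with the no-wrap flags appropriate for
  // addressing within a single object.
  SDValue highHalfPtr(SelectionDAG &DAG, const SDLoc &DL, SDValue BasePtr) {
    return DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(4));
  }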
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
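A minimal sketch of SplitScalar, assuming the common case of splitting an i64 value into two i32 halves:
  // Returns {low half, high half} of V as i32 values via EXTRACT_ELEMENT.
  std::pair<SDValue, SDValue> splitI64(SelectionDAG &DAG, const SDLoc &DL,
                                       SDValue V) {
    return DAG.SplitScalar(V, DL, MVT::i32, MVT::i32);
  }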
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e. a character array and a length, which need not be null terminated.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
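A hedged sketch of how a target constructor registers a divide bypass; the 64-to-32-bit pairing shown is an assumption for illustration.
  // In a TargetLowering constructor: if a 64-bit divide's operands fit in
  // 32 bits at run time, emit the faster 32-bit divide instead.
  addBypassSlowDiv(/*SlowBitWidth=*/64, /*FastBitWidth=*/32);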
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
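As a sketch of the legalization hooks above; the particular type pairs are hypothetical, not this file's configuration.
  // In a TargetLowering constructor: extending loads from i16 to i64 and
  // truncating stores from i64 to i16 must both be expanded.
  setLoadExtAction(ISD::EXTLOAD, MVT::i64, MVT::i16, Expand);
  setTruncStoreAction(MVT::i64, MVT::i16, Expand);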
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
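A sketch of falling back to the generic expansion helper from a hypothetical custom LOAD lowering, assuming the alignment check has already failed:
  // Value is the loaded result, Chain the new chain; both replace the
  // original load's results.
  SDValue lowerTooUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG,
                                const TargetLowering &TLI) {
    auto [Value, Chain] = TLI.expandUnalignedLoad(LD, DAG);
    return DAG.getMergeValues({Value, Chain}, SDLoc(LD));
  }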
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op: at this point only the DemandedBits bits of Op's result are ever used downstream, so try to simplify Op using that information.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetOptions Options
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:230
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows arbitrary numbers to be used as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:809
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:782
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:595
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:773
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to the "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:517
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:843
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:513
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:870
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:579
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:412
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
Definition ISDOpcodes.h:993
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:983
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:834
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:781
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns the platform-specific canonical encoding of a floating-point number.
Definition ISDOpcodes.h:536
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:543
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:786
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to the "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:703
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:764
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:644
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:609
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:571
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:840
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:801
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum or maximum on two values, following IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:878
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:726
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:968
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:795
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:916
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:738
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:560
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:949
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:987
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:846
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:529
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that are the same as FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:551
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1737
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the smallest power of two that is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of zero bits from the least significant bit up to the most significant, stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count the number of zero bits from the most significant bit down to the least significant, stopping at the first 1.
Definition bit.h:236
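Small worked examples of the bit utilities above, with arbitrary values:
  uint64_t P = llvm::PowerOf2Ceil(6); // 8: smallest power of two >= 6
  int TZ = llvm::countr_zero(8u);     // 3 trailing zero bits
  int LZ = llvm::countl_zero(8u);     // 28 leading zero bits in a 32-bit value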
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
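A one-line example of alignTo, with arbitrary values:
  uint64_t Padded = llvm::alignTo(/*Size=*/10, llvm::Align(4)); // yields 12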
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1551
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the vector type has a power-of-2 number of elements.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has at least as many bits as VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:129
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:269
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...