LLVM 23.0.0git
AMDGPUISelLowering.cpp
Go to the documentation of this file.
1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
26#include "llvm/IR/IntrinsicsAMDGPU.h"
30
31using namespace llvm;
32
33#include "AMDGPUGenCallingConv.inc"
34
36 "amdgpu-bypass-slow-div",
37 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
38 cl::init(true));
39
40// Find a larger type to do a load / store of a vector with.
42 unsigned StoreSize = VT.getStoreSizeInBits();
43 if (StoreSize <= 32)
44 return EVT::getIntegerVT(Ctx, StoreSize);
45
46 if (StoreSize % 32 == 0)
47 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
48
49 return VT;
50}
51
55
57 // In order for this to be a signed 24-bit value, bit 23, must
58 // be a sign bit.
59 return DAG.ComputeMaxSignificantBits(Op);
60}
61
63 const TargetSubtargetInfo &STI,
64 const AMDGPUSubtarget &AMDGPUSTI)
65 : TargetLowering(TM, STI), Subtarget(&AMDGPUSTI) {
66 // Always lower memset, memcpy, and memmove intrinsics to load/store
67 // instructions, rather then generating calls to memset, mempcy or memmove.
71
72 // Enable ganging up loads and stores in the memcpy DAG lowering.
74
75 // Lower floating point store/load to integer store/load to reduce the number
76 // of patterns in tablegen.
78 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
79
81 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
82
84 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
85
87 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
88
90 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
91
93 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
94
96 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
97
99 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
100
102 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
103
104 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
105 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
106
107 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
108 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
109
110 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
111 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
112
113 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
114 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
115
116 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
117 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
118
120 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
121
123 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
124
126 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
127
129 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
130
132 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
133
135 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
136
138 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
139
141 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
142
144 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
145
147 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
148
149 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
150 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
151
152 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
153 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
154
156 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
157
158 // TODO: Would be better to consume as directly legal
160 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
161
163 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
164
166 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
167
169 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
170
172 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
173
175 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
176
178 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
179
181 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
182
183 // There are no 64-bit extloads. These should be done as a 32-bit extload and
184 // an extension to 64-bit.
185 for (MVT VT : MVT::integer_valuetypes())
187 Expand);
188
189 for (MVT VT : MVT::integer_valuetypes()) {
190 if (VT == MVT::i64)
191 continue;
192
193 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
194 setLoadExtAction(Op, VT, MVT::i1, Promote);
195 setLoadExtAction(Op, VT, MVT::i8, Legal);
196 setLoadExtAction(Op, VT, MVT::i16, Legal);
197 setLoadExtAction(Op, VT, MVT::i32, Expand);
198 }
199 }
200
202 for (auto MemVT :
203 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
205 Expand);
206
207 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
219 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
220 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
221
222 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
226 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
227 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
228
229 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
239 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
240 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
241
243 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
244
246 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
247
249 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
250
252 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
253
255 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
256
258 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
259
261 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
262
264 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
265
267 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
268
270 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
271
273 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
274
276 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
277
279 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
280
282 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
283
285 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
286
288 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
289
291 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
292
294 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
295
297 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
298
300 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
301
303 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
304
306 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
307
309 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
310
312 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
313
315 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
316
318 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
319
321 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
322
323 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
325 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
326 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
327
328 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
330 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
331 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
332
333 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
334 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
335 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
336 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
337 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
338 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
339 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
340 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
341 setTruncStoreAction(MVT::v6f32, MVT::v6f16, Expand);
342 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
343 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
344 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
345 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
346 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
347 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
348
349 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
350 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
351 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
352
353 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
354 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
355 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
356
357 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
358
359 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
360 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
361 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
362 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
363 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
364 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
365 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
366
367 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
368 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
369 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
370 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
371 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
372
373 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
374 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
375 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
376
377 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
378 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
379 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
380
381 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
382 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
383 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
384
385 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
386 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
387 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
388
389 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
390 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
391 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
393 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
394 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
395 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
396
397 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
398 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
399
401
402 // For R600, this is totally unsupported, just custom lower to produce an
403 // error.
405
406 // Library functions. These default to Expand, but we have instructions
407 // for them.
410 {MVT::f16, MVT::f32}, Legal);
412
414 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
416 {MVT::f16, MVT::f32, MVT::f64}, Expand);
417
420 Custom);
422
423 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
424
425 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
426
427 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
428 Expand);
429
430 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
431 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
433
435 Custom);
436
437 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
438
439 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
440 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
441 // default unless marked custom/legal.
443 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
444 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
445 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
446 MVT::v16f64},
447 Custom);
448
449 // Expand to fneg + fadd.
451
453 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
454 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
455 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
456 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
457 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
458 Custom);
459
462 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
463 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
464 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
465 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
466 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
467 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
468 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
469 Custom);
470
472 Expand);
473 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
474
475 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
476 for (MVT VT : ScalarIntVTs) {
477 // These should use [SU]DIVREM, so set them to expand
479 Expand);
480
481 // GPU does not have divrem function for signed or unsigned.
483
484 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
486
488
489 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
491 }
492
493 // The hardware supports 32-bit FSHR, but not FSHL.
495
496 setOperationAction({ISD::ROTL, ISD::ROTR}, {MVT::i32, MVT::i64}, Expand);
497
499
504 MVT::i64, Custom);
506
508 Legal);
509
512 MVT::i64, Custom);
513
514 for (auto VT : {MVT::i8, MVT::i16})
516
517 static const MVT::SimpleValueType VectorIntTypes[] = {
518 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
519 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
520
521 for (MVT VT : VectorIntTypes) {
522 // Expand the following operations for the current type by default.
523 // clang-format off
543 VT, Expand);
544 // clang-format on
545 }
546
547 static const MVT::SimpleValueType FloatVectorTypes[] = {
548 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
549 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
550
551 for (MVT VT : FloatVectorTypes) {
564 VT, Expand);
565 }
566
567 // This causes using an unrolled select operation rather than expansion with
568 // bit operations. This is in general better, but the alternative using BFI
569 // instructions may be better if the select sources are SGPRs.
571 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
572
574 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
575
577 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
578
580 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
581
583 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
584
586 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
587
589 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
590
592 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
593
595 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
596
598 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
599
601 setJumpIsExpensive(true);
602
605
607
608 // We want to find all load dependencies for long chains of stores to enable
609 // merging into very wide vectors. The problem is with vectors with > 4
610 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
611 // vectors are a legal type, even though we have to split the loads
612 // usually. When we can more precisely specify load legality per address
613 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
614 // smarter so that they can figure out what to do in 2 iterations without all
615 // N > 4 stores on the same chain.
617
618 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
619 // about these during lowering.
620 MaxStoresPerMemcpy = 0xffffffff;
621 MaxStoresPerMemmove = 0xffffffff;
622 MaxStoresPerMemset = 0xffffffff;
623
624 // The expansion for 64-bit division is enormous.
626 addBypassSlowDiv(64, 32);
627
638
642}
643
645 const auto Flags = Op.getNode()->getFlags();
646 if (Flags.hasNoSignedZeros())
647 return true;
648
649 return false;
650}
651
652//===----------------------------------------------------------------------===//
653// Target Information
654//===----------------------------------------------------------------------===//
655
// Returns true for opcodes whose result-fneg can be absorbed into the
// operation itself via a source modifier: FP arithmetic (FADD/FSUB/FMUL/
// FMA/FMAD), the min/max families, SELECT, FP rounding ops, and the
// AMDGPU-specific RCP/SIN_HW/*_LEGACY/FMED3 nodes. BITCAST deliberately
// asserts: callers must route it through fnegFoldsIntoOp() instead.
// NOTE(review): this dump elides original lines 656, 666-667 and 678, so
// some case labels may be missing here — verify against the upstream file.
657static bool fnegFoldsIntoOpcode(unsigned Opc) {
 658 switch (Opc) {
 659 case ISD::FADD:
 660 case ISD::FSUB:
 661 case ISD::FMUL:
 662 case ISD::FMA:
 663 case ISD::FMAD:
 664 case ISD::FMINNUM:
 665 case ISD::FMAXNUM:
 668 case ISD::FMINIMUM:
 669 case ISD::FMAXIMUM:
 670 case ISD::FMINIMUMNUM:
 671 case ISD::FMAXIMUMNUM:
 672 case ISD::SELECT:
 673 case ISD::FSIN:
 674 case ISD::FTRUNC:
 675 case ISD::FRINT:
 676 case ISD::FNEARBYINT:
 677 case ISD::FROUNDEVEN:
 679 case AMDGPUISD::RCP:
 680 case AMDGPUISD::RCP_LEGACY:
 681 case AMDGPUISD::RCP_IFLAG:
 682 case AMDGPUISD::SIN_HW:
 683 case AMDGPUISD::FMUL_LEGACY:
 684 case AMDGPUISD::FMIN_LEGACY:
 685 case AMDGPUISD::FMAX_LEGACY:
 686 case AMDGPUISD::FMED3:
 687 // TODO: handle llvm.amdgcn.fma.legacy
 688 return true;
 689 case ISD::BITCAST:
 690 llvm_unreachable("bitcast is special cased");
 691 default:
 692 return false;
 693 }
 694}
695
696static bool fnegFoldsIntoOp(const SDNode *N) {
697 unsigned Opc = N->getOpcode();
698 if (Opc == ISD::BITCAST) {
699 // TODO: Is there a benefit to checking the conditions performFNegCombine
700 // does? We don't for the other cases.
701 SDValue BCSrc = N->getOperand(0);
702 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
703 return BCSrc.getNumOperands() == 2 &&
704 BCSrc.getOperand(1).getValueSizeInBits() == 32;
705 }
706
707 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
708 }
709
710 return fnegFoldsIntoOpcode(Opc);
711}
712
713/// \p returns true if the operation will definitely need to use a 64-bit
714/// encoding, and thus will use a VOP3 encoding regardless of the source
715/// modifiers.
717static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
718 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
719 VT == MVT::f64;
720}
721
722/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the
723/// type for ISD::SELECT.
725static bool selectSupportsSourceMods(const SDNode *N) {
726 // TODO: Only applies if select will be vector
727 return N->getValueType(0) == MVT::f32;
728}
729
730// Most FP instructions support source modifiers, but this could be refined
731// slightly.
// Returns true when node N can accept fabs/fneg source modifiers. Memory
// nodes, copies, FDIV/FREM, inline asm, DIV_SCALE, bitcasts, and the interp
// intrinsics cannot; everything else is assumed to support them.
// NOTE(review): this dump elides original lines 742, 744, 751 and 764 —
// the INTRINSIC_WO_CHAIN case label and a couple of returns are missing
// below; verify against the upstream file.
733static bool hasSourceMods(const SDNode *N) {
 734 if (isa<MemSDNode>(N))
 735 return false;
 736
 737 switch (N->getOpcode()) {
 738 case ISD::CopyToReg:
 739 case ISD::FDIV:
 740 case ISD::FREM:
 741 case ISD::INLINEASM:
 743 case AMDGPUISD::DIV_SCALE:
 745
 746 // TODO: Should really be looking at the users of the bitcast. These are
 747 // problematic because bitcasts are used to legalize all stores to integer
 748 // types.
 749 case ISD::BITCAST:
 750 return false;
 // NOTE(review): the enclosing `case ISD::INTRINSIC_WO_CHAIN:` label
 // (original line 751) is elided in this dump; the inner switch below
 // dispatches on the intrinsic ID in operand 0.
 752 switch (N->getConstantOperandVal(0)) {
 753 case Intrinsic::amdgcn_interp_p1:
 754 case Intrinsic::amdgcn_interp_p2:
 755 case Intrinsic::amdgcn_interp_mov:
 756 case Intrinsic::amdgcn_interp_p1_f16:
 757 case Intrinsic::amdgcn_interp_p2_f16:
 758 return false;
 759 default:
 760 return true;
 761 }
 762 }
 763 case ISD::SELECT:
 // NOTE(review): SELECT's return (original line 764, presumably
 // selectSupportsSourceMods(N)) is elided in this dump — confirm upstream.
 765 default:
 766 return true;
 767 }
 768}
769
771 unsigned CostThreshold) {
 // NOTE(review): the first signature line (original 770) is elided in this
 // dump; from the body, the function takes an SDNode *N plus the
 // CostThreshold above, and returns true when every user of N supports
 // source modifiers without exceeding the allowed code-size growth.
 772 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
 773 // it is truly free to use a source modifier in all cases. If there are
 774 // multiple users but for each one will necessitate using VOP3, there will be
 775 // a code size increase. Try to avoid increasing code size unless we know it
 776 // will save on the instruction count.
 777 unsigned NumMayIncreaseSize = 0;
 778 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
 779
 780 assert(!N->use_empty());
 781
 782 // XXX - Should this limit number of uses to check?
 783 for (const SDNode *U : N->users()) {
 // Any single user that cannot take source modifiers defeats the fold.
 784 if (!hasSourceMods(U))
 785 return false;
 786
 // Users not already forced into VOP3 would grow by one encoding each;
 // allow at most CostThreshold of them.
 787 if (!opMustUseVOP3Encoding(U, VT)) {
 788 if (++NumMayIncreaseSize > CostThreshold)
 789 return false;
 790 }
 791 }
 792
 793 return true;
 794}
795
797 ISD::NodeType ExtendKind) const {
798 assert(!VT.isVector() && "only scalar expected");
799
800 // Round to the next multiple of 32-bits.
801 unsigned Size = VT.getSizeInBits();
802 if (Size <= 32)
803 return MVT::i32;
804 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
805}
806
808 return 32;
809}
810
812 return true;
813}
814
815// The backend supports 32 and 64 bit floating point immediates.
816// FIXME: Why are we reporting vectors of FP immediates as legal?
818 bool ForCodeSize) const {
819 return isTypeLegal(VT.getScalarType());
820}
821
822// We don't want to shrink f64 / f32 constants.
824 EVT ScalarVT = VT.getScalarType();
825 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
826}
827
829 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
830 std::optional<unsigned> ByteOffset) const {
831 // TODO: This may be worth removing. Check regression tests for diffs.
832 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
833 return false;
834
835 unsigned NewSize = NewVT.getStoreSizeInBits();
836
837 // If we are reducing to a 32-bit load or a smaller multi-dword load,
838 // this is always better.
839 if (NewSize >= 32)
840 return true;
841
842 EVT OldVT = N->getValueType(0);
843 unsigned OldSize = OldVT.getStoreSizeInBits();
844
846 unsigned AS = MN->getAddressSpace();
847 // Do not shrink an aligned scalar load to sub-dword.
848 // Scalar engine cannot do sub-dword loads.
849 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
850 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
854 MN->isInvariant())) &&
856 return false;
857
858 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
859 // extloads, so doing one requires using a buffer_load. In cases where we
860 // still couldn't use a scalar load, using the wider load shouldn't really
861 // hurt anything.
862
863 // If the old size already had to be an extload, there's no harm in continuing
864 // to reduce the width.
865 return (OldSize < 32);
866}
867
869 const SelectionDAG &DAG,
870 const MachineMemOperand &MMO) const {
871
872 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
873
874 if (LoadTy.getScalarType() == MVT::i32)
875 return false;
876
877 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
878 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
879
880 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
881 return false;
882
883 unsigned Fast = 0;
885 CastTy, MMO, &Fast) &&
886 Fast;
887}
888
889// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
890// profitable with the expansion for 64-bit since it's generally good to
891// speculate things.
893 return true;
894}
895
897 return true;
898}
899
901 switch (N->getOpcode()) {
902 case ISD::EntryToken:
903 case ISD::TokenFactor:
904 return true;
906 unsigned IntrID = N->getConstantOperandVal(0);
908 }
910 unsigned IntrID = N->getConstantOperandVal(1);
912 }
913 case ISD::LOAD:
914 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
916 return true;
917 return false;
918 case AMDGPUISD::SETCC: // ballot-style instruction
919 return true;
920 }
921 return false;
922}
923
925 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
926 NegatibleCost &Cost, unsigned Depth) const {
927
928 switch (Op.getOpcode()) {
929 case ISD::FMA:
930 case ISD::FMAD: {
931 // Negating a fma is not free if it has users without source mods.
932 if (!allUsesHaveSourceMods(Op.getNode()))
933 return SDValue();
934 break;
935 }
936 case AMDGPUISD::RCP: {
937 SDValue Src = Op.getOperand(0);
938 EVT VT = Op.getValueType();
939 SDLoc SL(Op);
940
941 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
942 ForCodeSize, Cost, Depth + 1);
943 if (NegSrc)
944 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
945 return SDValue();
946 }
947 default:
948 break;
949 }
950
951 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
952 ForCodeSize, Cost, Depth);
953}
954
955//===---------------------------------------------------------------------===//
956// Target Properties
957//===---------------------------------------------------------------------===//
958
961
962 // Packed operations do not have a fabs modifier.
963 // Report this based on the end legalized type.
964 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
965}
966
969 // Report this based on the end legalized type.
970 VT = VT.getScalarType();
971 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
972}
973
975 unsigned NumElem,
976 unsigned AS) const {
977 return true;
978}
979
981 // There are few operations which truly have vector input operands. Any vector
982 // operation is going to involve operations on each component, and a
983 // build_vector will be a copy per element, so it always makes sense to use a
984 // build_vector input in place of the extracted element to avoid a copy into a
985 // super register.
986 //
987 // We should probably only do this if all users are extracts only, but this
988 // should be the common case.
989 return true;
990}
991
993 // Truncate is just accessing a subregister.
994
995 unsigned SrcSize = Source.getSizeInBits();
996 unsigned DestSize = Dest.getSizeInBits();
997
998 return DestSize < SrcSize && DestSize % 32 == 0 ;
999}
1000
1002 // Truncate is just accessing a subregister.
1003
1004 unsigned SrcSize = Source->getScalarSizeInBits();
1005 unsigned DestSize = Dest->getScalarSizeInBits();
1006
1007 if (DestSize== 16 && Subtarget->has16BitInsts())
1008 return SrcSize >= 32;
1009
1010 return DestSize < SrcSize && DestSize % 32 == 0;
1011}
1012
1014 unsigned SrcSize = Src->getScalarSizeInBits();
1015 unsigned DestSize = Dest->getScalarSizeInBits();
1016
1017 if (SrcSize == 16 && Subtarget->has16BitInsts())
1018 return DestSize >= 32;
1019
1020 return SrcSize == 32 && DestSize == 64;
1021}
1022
1024 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1025 // practical purposes, the extra mov 0 to load a 64-bit is free. As used,
1026 // this will enable reducing 64-bit operations the 32-bit, which is always
1027 // good.
1028
1029 if (Src == MVT::i16)
1030 return Dest == MVT::i32 ||Dest == MVT::i64 ;
1031
1032 return Src == MVT::i32 && Dest == MVT::i64;
1033}
1034
1036 EVT DestVT) const {
 // NOTE(review): the signature's first line (original 1035) is elided in
 // this dump; from the body, the function takes an SDNode *N plus source
 // and destination types (SrcVT, DestVT) and reports whether narrowing the
 // operation is profitable.
 1037 switch (N->getOpcode()) {
 1038 case ISD::ADD:
 1039 case ISD::SUB:
 1040 case ISD::SHL:
 1041 case ISD::SRL:
 1042 case ISD::SRA:
 1043 case ISD::AND:
 1044 case ISD::OR:
 1045 case ISD::XOR:
 1046 case ISD::MUL:
 1047 case ISD::SETCC:
 1048 case ISD::SELECT:
 1049 case ISD::SMIN:
 1050 case ISD::SMAX:
 1051 case ISD::UMIN:
 1052 case ISD::UMAX:
 // On subtargets with legal i16 (and no packed v2i16 ALU for the vector
 // case), refuse to narrow a uniform integer op back to <= 16 bits when
 // its source is wider than 16 bits, i.e. it was already promoted to i32.
 1053 if (isTypeLegal(MVT::i16) &&
 1054 (!DestVT.isVector() ||
 1055 !isOperationLegal(ISD::ADD, MVT::v2i16))) { // Check if VOP3P
 1056 // Don't narrow back down to i16 if promoted to i32 already.
 1057 if (!N->isDivergent() && DestVT.isInteger() &&
 1058 DestVT.getScalarSizeInBits() > 1 &&
 1059 DestVT.getScalarSizeInBits() <= 16 &&
 1060 SrcVT.getScalarSizeInBits() > 16) {
 1061 return false;
 1062 }
 1063 }
 1064 return true;
 1065 default:
 1066 break;
 1067 }
 1068
 1069 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
 1070 // limited number of native 64-bit operations. Shrinking an operation to fit
 1071 // in a single 32-bit register should always be helpful. As currently used,
 1072 // this is much less general than the name suggests, and is only used in
 1073 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
 1074 // not profitable, and may actually be harmful.
 1075 if (isa<LoadSDNode>(N))
 1076 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
 1077
 1078 return true;
 1079}
1080
1082 const SDNode* N, CombineLevel Level) const {
1083 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1084 N->getOpcode() == ISD::SRL) &&
1085 "Expected shift op");
1086
1087 SDValue ShiftLHS = N->getOperand(0);
1088 if (!ShiftLHS->hasOneUse())
1089 return false;
1090
1091 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1092 !ShiftLHS.getOperand(0)->hasOneUse())
1093 return false;
1094
1095 // Always commute pre-type legalization and right shifts.
1096 // We're looking for shl(or(x,y),z) patterns.
1098 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1099 return true;
1100
1101 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1102 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1103 (N->user_begin()->getOpcode() == ISD::SRA ||
1104 N->user_begin()->getOpcode() == ISD::SRL))
1105 return false;
1106
1107 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1108 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1109 if (LHS.getOpcode() != ISD::SHL)
1110 return false;
1111 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1112 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1113 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1114 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1115 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1116 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1117 };
1118 SDValue LHS = N->getOperand(0).getOperand(0);
1119 SDValue RHS = N->getOperand(0).getOperand(1);
1120 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1121}
1122
1123//===---------------------------------------------------------------------===//
1124// TargetLowering Callbacks
1125//===---------------------------------------------------------------------===//
1126
// Select the CCAssignFn used to assign argument locations for a call with
// calling convention CC. NOTE(review): several case labels of this switch are
// elided in this view — the visible return groups are: (elided conventions)
// -> CC_AMDGPU / CC_AMDGPU_CS_CHAIN, the C-family conventions ->
// CC_AMDGPU_Func, (elided) -> CC_SI_Gfx; anything unhandled is a fatal
// usage error.
1128 bool IsVarArg) {
1129 switch (CC) {
1137 return CC_AMDGPU;
1140 return CC_AMDGPU_CS_CHAIN;
1141 case CallingConv::C:
1142 case CallingConv::Fast:
1143 case CallingConv::Cold:
1144 return CC_AMDGPU_Func;
1147 return CC_SI_Gfx;
1150 default:
// Unsupported conventions are a hard error rather than silently miscompiled.
1151 reportFatalUsageError("unsupported calling convention for call");
1152 }
1153}
1154
// Select the CCAssignFn used to assign return-value locations for calling
// convention CC. Kernel conventions never return values through here
// (llvm_unreachable). NOTE(review): some case labels of this switch are
// elided in this view; the visible groups map (elided shader conventions) ->
// RetCC_SI_Shader, (elided Gfx) -> RetCC_SI_Gfx, and the C-family
// conventions -> RetCC_AMDGPU_Func.
1156 bool IsVarArg) {
1157 switch (CC) {
1160 llvm_unreachable("kernels should not be handled here");
1170 return RetCC_SI_Shader;
1173 return RetCC_SI_Gfx;
1174 case CallingConv::C:
1175 case CallingConv::Fast:
1176 case CallingConv::Cold:
1177 return RetCC_AMDGPU_Func;
1178 default:
1179 reportFatalUsageError("unsupported calling convention");
1180 }
1181}
1182
1183/// The SelectionDAGBuilder will automatically promote function arguments
1184/// with illegal types. However, this does not work for the AMDGPU targets
1185/// since the function arguments are stored in memory as these illegal types.
1186/// In order to handle this properly we need to get the original types sizes
1187 /// from the LLVM IR Function and fix up the ISD::InputArg values before
1188/// passing them to AnalyzeFormalArguments()
1189
1190/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1191/// input values across multiple registers. Each item in the Ins array
1192/// represents a single value that will be stored in registers. Ins[x].VT is
1193/// the value type of the value that will be stored in the register, so
1194/// whatever SDNode we lower the argument to needs to be this type.
1195///
1196/// In order to correctly lower the arguments we need to know the size of each
1197/// argument. Since Ins[x].VT gives us the size of the register that will
1198/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1199/// for the original function argument so that we can deduce the correct memory
1200/// type to use for Ins[x]. In most cases the correct memory type will be
1201/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1202/// we have a kernel argument of type v8i8, this argument will be split into
1203/// 8 parts and each part will be represented by its own item in the Ins array.
1204/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1205/// the argument before it was split. From this, we deduce that the memory type
1206/// for each individual part is i8. We pass the memory type as LocVT to the
1207/// calling convention analysis function and the register type (Ins[x].VT) as
1208/// the ValVT.
// Recompute in-memory kernel-argument offsets and memory types from the IR
// function signature (see the doc comment above), then record one
// CCValAssign::getCustomMem location per register-sized part. The PartOffset
// values precomputed in Ins are deliberately ignored.
1210 CCState &State,
1211 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1212 const MachineFunction &MF = State.getMachineFunction();
1213 const Function &Fn = MF.getFunction();
1214 LLVMContext &Ctx = Fn.getContext();
// Offset of the first explicit kernel argument in the kernarg segment.
1215 const unsigned ExplicitOffset = Subtarget->getExplicitKernelArgOffset();
1217
1218 Align MaxAlign = Align(1);
1219 uint64_t ExplicitArgOffset = 0;
1220 const DataLayout &DL = Fn.getDataLayout();
1221
// Running index into the flattened Ins/locations array.
1222 unsigned InIndex = 0;
1223
1224 for (const Argument &Arg : Fn.args()) {
// byref arguments are laid out using the pointee type, not the pointer type.
1225 const bool IsByRef = Arg.hasByRefAttr();
1226 Type *BaseArgTy = Arg.getType();
1227 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1228 Align Alignment = DL.getValueOrABITypeAlignment(
1229 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1230 MaxAlign = std::max(Alignment, MaxAlign);
1231 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1232
// ArgOffset is the absolute offset of this argument; ExplicitArgOffset then
// advances past it for the next iteration.
1233 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1234 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1235
1236 // We're basically throwing away everything passed into us and starting over
1237 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1238 // to us as computed in Ins.
1239 //
1240 // We also need to figure out what type legalization is trying to do to get
1241 // the correct memory offsets.
1242
1243 SmallVector<EVT, 16> ValueVTs;
1245 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1246 &Offsets, ArgOffset);
1247
1248 for (unsigned Value = 0, NumValues = ValueVTs.size();
1249 Value != NumValues; ++Value) {
1250 uint64_t BasePartOffset = Offsets[Value];
1251
1252 EVT ArgVT = ValueVTs[Value];
1253 EVT MemVT = ArgVT;
// Ask the target how legalization will split this value for CC purposes.
1254 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1255 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1256
1257 if (NumRegs == 1) {
1258 // This argument is not split, so the IR type is the memory type.
1259 if (ArgVT.isExtended()) {
1260 // We have an extended type, like i24, so we should just use the
1261 // register type.
1262 MemVT = RegisterVT;
1263 } else {
1264 MemVT = ArgVT;
1265 }
1266 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1267 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1268 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1269 // We have a vector value which has been split into a vector with
1270 // the same scalar type, but fewer elements. This should handle
1271 // all the floating-point vector types.
1272 MemVT = RegisterVT;
1273 } else if (ArgVT.isVector() &&
1274 ArgVT.getVectorNumElements() == NumRegs) {
1275 // This arg has been split so that each element is stored in a separate
1276 // register.
1277 MemVT = ArgVT.getScalarType();
1278 } else if (ArgVT.isExtended()) {
1279 // We have an extended type, like i65.
1280 MemVT = RegisterVT;
1281 } else {
// Fallback: derive a memory type by evenly dividing the store size across
// the registers the value was split into.
1282 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1283 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1284 if (RegisterVT.isInteger()) {
1285 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1286 } else if (RegisterVT.isVector()) {
1287 assert(!RegisterVT.getScalarType().isFloatingPoint());
1288 unsigned NumElements = RegisterVT.getVectorNumElements();
1289 assert(MemoryBits % NumElements == 0);
1290 // This vector type has been split into another vector type with
1291 // a different elements size.
1292 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1293 MemoryBits / NumElements);
1294 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1295 } else {
1296 llvm_unreachable("cannot deduce memory type.");
1297 }
1298 }
1299
1300 // Convert one element vectors to scalar.
1301 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1302 MemVT = MemVT.getScalarType();
1303
1304 // Round up vec3/vec5 argument.
1305 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1306 MemVT = MemVT.getPow2VectorType(State.getContext());
1307 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1308 MemVT = MemVT.getRoundIntegerType(State.getContext());
1309 }
1310
// Emit one custom-mem location per register part; each part's in-memory
// offset advances by MemVT's store size.
1311 unsigned PartOffset = 0;
1312 for (unsigned i = 0; i != NumRegs; ++i) {
1313 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1314 BasePartOffset + PartOffset,
1315 MemVT.getSimpleVT(),
1317 PartOffset += MemVT.getStoreSize();
1318 }
1319 }
1320 }
1321}
1322
// Lower a return by terminating the wave: emit an AMDGPUISD::ENDPGM node
// chained after Chain. Return values are intentionally not materialized here
// (see the disabled assert below — it currently fails for r600 tests).
1324 SDValue Chain, CallingConv::ID CallConv,
1325 bool isVarArg,
1327 const SmallVectorImpl<SDValue> &OutVals,
1328 const SDLoc &DL, SelectionDAG &DAG) const {
1329 // FIXME: Fails for r600 tests
1330 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1331 // "wave terminate should not have return values");
1332 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1333}
1334
1335//===---------------------------------------------------------------------===//
1336// Target specific lowering
1337//===---------------------------------------------------------------------===//
1338
1339/// Selects the correct CCAssignFn for a given CallingConvention value.
1344
1349
// Build a TokenFactor that makes the store to outgoing-argument stack slot
// ClobberedFI depend on every load from an incoming-argument slot (negative
// frame index) whose byte range overlaps it, so the store cannot be
// scheduled before those loads.
1351 SelectionDAG &DAG,
1352 MachineFrameInfo &MFI,
1353 int ClobberedFI) const {
1354 SmallVector<SDValue, 8> ArgChains;
// Byte range [FirstByte, LastByte] of the clobbered stack object.
1355 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1356 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1357
1358 // Include the original chain at the beginning of the list. When this is
1359 // used by target LowerCall hooks, this helps legalize find the
1360 // CALLSEQ_BEGIN node.
1361 ArgChains.push_back(Chain);
1362
1363 // Add a chain value for each stack argument corresponding
// Walk all users of the entry token: argument loads hang directly off it.
1364 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1365 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1366 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
// Negative frame indices are fixed objects (incoming arguments).
1367 if (FI->getIndex() < 0) {
1368 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1369 int64_t InLastByte = InFirstByte;
1370 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1371
// Interval-overlap test between the load's slot and the clobbered slot;
// value number 1 of a load is its output chain.
1372 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1373 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1374 ArgChains.push_back(SDValue(L, 1));
1375 }
1376 }
1377 }
1378 }
1379
1380 // Build a tokenfactor for all the chains.
1381 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1382}
1383
// Diagnose a call this target cannot lower: emit a DiagnosticInfoUnsupported
// naming the callee, fill InVals with poison values of the expected types so
// lowering can proceed, and return a trivial CALLSEQ_START/END pair (or just
// the incoming chain on r600, which lacks callseq pseudos).
1386 StringRef Reason) const {
1387 SDValue Callee = CLI.Callee;
1388 SelectionDAG &DAG = CLI.DAG;
1389
1390 const Function &Fn = DAG.getMachineFunction().getFunction();
1391
1392 StringRef FuncName("<unknown>");
1393
// Resolve a printable callee name from either an external symbol or a
// global address. NOTE(review): the first condition line is elided in this
// view; it presumably tests for ExternalSymbolSDNode — confirm in full source.
1395 FuncName = G->getSymbol();
1396 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1397 FuncName = G->getGlobal()->getName();
1398
1399 DAG.getContext()->diagnose(
1400 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1401
1402 if (!CLI.IsTailCall) {
// Tail calls produce no values in this function, so only non-tail calls
// need placeholder results.
1403 for (ISD::InputArg &Arg : CLI.Ins)
1404 InVals.push_back(DAG.getPOISON(Arg.VT));
1405 }
1406
1407 // FIXME: Hack because R600 doesn't handle callseq pseudos yet.
1408 if (getTargetMachine().getTargetTriple().getArch() == Triple::r600)
1409 return CLI.Chain;
1410
1411 SDValue Chain = DAG.getCALLSEQ_START(CLI.Chain, 0, 0, CLI.DL);
1412 return DAG.getCALLSEQ_END(Chain, 0, 0, /*InGlue=*/SDValue(), CLI.DL);
1413}
1414
// Default LowerCall: this base target supports no calls at all, so every
// call is reported as unsupported via lowerUnhandledCall.
1416 SmallVectorImpl<SDValue> &InVals) const {
1417 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1418}
1419
// Dynamic stack allocation is not supported: emit a diagnostic and return a
// {0, chain} pair so the DAG stays well-formed. NOTE(review): the diagnose()
// call's opening line is elided in this view.
1421 SelectionDAG &DAG) const {
1422 const Function &Fn = DAG.getMachineFunction().getFunction();
1423
1425 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc());
// Result 0 is the (bogus) pointer value 0; result 1 forwards the chain.
1426 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1427 return DAG.getMergeValues(Ops, SDLoc());
1428}
1429
1431 SelectionDAG &DAG) const {
1432 switch (Op.getOpcode()) {
1433 default:
1434 Op->print(errs(), &DAG);
1435 llvm_unreachable("Custom lowering code for this "
1436 "instruction is not implemented yet!");
1437 break;
1439 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1441 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1442 case ISD::SDIVREM:
1443 return LowerSDIVREM(Op, DAG);
1444 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1445 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1446 case ISD::FRINT: return LowerFRINT(Op, DAG);
1447 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1448 case ISD::FROUNDEVEN:
1449 return LowerFROUNDEVEN(Op, DAG);
1450 case ISD::FROUND: return LowerFROUND(Op, DAG);
1451 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1452 case ISD::FLOG2:
1453 return LowerFLOG2(Op, DAG);
1454 case ISD::FLOG:
1455 case ISD::FLOG10:
1456 return LowerFLOGCommon(Op, DAG);
1457 case ISD::FEXP:
1458 case ISD::FEXP10:
1459 return lowerFEXP(Op, DAG);
1460 case ISD::FEXP2:
1461 return lowerFEXP2(Op, DAG);
1462 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1463 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1464 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1465 case ISD::FP_TO_SINT:
1466 case ISD::FP_TO_UINT:
1467 return LowerFP_TO_INT(Op, DAG);
1470 return LowerFP_TO_INT_SAT(Op, DAG);
1471 case ISD::CTTZ:
1473 case ISD::CTLZ:
1475 return LowerCTLZ_CTTZ(Op, DAG);
1476 case ISD::CTLS:
1477 return LowerCTLS(Op, DAG);
1479 }
1480 return Op;
1481}
1482
// Replace illegal-typed results during type legalization. For each handled
// opcode, push the lowered value (if any) onto Results; for everything else
// (including the elided sign_extend_inreg case documented below) do nothing
// and let generic legalization proceed.
1485 SelectionDAG &DAG) const {
1486 switch (N->getOpcode()) {
1488 // Different parts of legalization seem to interpret which type of
1489 // sign_extend_inreg is the one to check for custom lowering. The extended
1490 // from type is what really matters, but some places check for custom
1491 // lowering of the result type. This results in trying to use
1492 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1493 // nothing here and let the illegal result integer be handled normally.
1494 return;
1495 case ISD::FLOG2:
1496 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1497 Results.push_back(Lowered);
1498 return;
1499 case ISD::FLOG:
1500 case ISD::FLOG10:
1501 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1502 Results.push_back(Lowered);
1503 return;
1504 case ISD::FEXP2:
1505 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1506 Results.push_back(Lowered);
1507 return;
1508 case ISD::FEXP:
1509 case ISD::FEXP10:
1510 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1511 Results.push_back(Lowered);
1512 return;
1513 case ISD::CTLZ:
1515 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1516 Results.push_back(Lowered);
1517 return;
1518 default:
1519 return;
1520 }
1521}
1522
// Lower a GlobalAddress node. For LDS/region globals this resolves to a
// constant offset: either an address already assigned by the LDS lowering
// pass (with named-barrier bookkeeping), or a freshly allocated LDS offset
// for kernel entry points. Non-kernel functions referencing LDS get a
// warning plus a trap, since such paths should be unreachable after forced
// inlining. Returns SDValue() for address spaces handled elsewhere.
1524 SDValue Op,
1525 SelectionDAG &DAG) const {
1526
1527 const DataLayout &DL = DAG.getDataLayout();
1529 const GlobalValue *GV = G->getGlobal();
1530
1531 if (!MFI->isModuleEntryFunction()) {
1532 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
// A pre-assigned absolute address (set by an earlier pass) wins outright.
1533 if (std::optional<uint32_t> Address =
1535 if (IsNamedBarrier) {
// Each named barrier occupies 16 bytes; record how many this global covers.
1536 unsigned BarCnt = cast<GlobalVariable>(GV)->getGlobalSize(DL) / 16;
1537 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1538 }
1539 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1540 } else if (IsNamedBarrier) {
1541 llvm_unreachable("named barrier should have an assigned address");
1542 }
1543 }
1544
1545 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1546 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1547 if (!MFI->isModuleEntryFunction() &&
1548 GV->getName() != "llvm.amdgcn.module.lds" &&
1550 SDLoc DL(Op);
1551 const Function &Fn = DAG.getMachineFunction().getFunction();
1553 Fn, "local memory global used by non-kernel function",
1554 DL.getDebugLoc(), DS_Warning);
1555
1556 // We currently don't have a way to correctly allocate LDS objects that
1557 // aren't directly associated with a kernel. We do force inlining of
1558 // functions that use local objects. However, if these dead functions are
1559 // not eliminated, we don't want a compile time error. Just emit a warning
1560 // and a trap, since there should be no callable path here.
1561 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1562 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1563 Trap, DAG.getRoot());
1564 DAG.setRoot(OutputChain);
1565 return DAG.getPOISON(Op.getValueType());
1566 }
1567
1568 // XXX: What does the value of G->getOffset() mean?
1569 assert(G->getOffset() == 0 &&
1570 "Do not know what to do with an non-zero offset");
1571
1572 // TODO: We could emit code to handle the initialization somewhere.
1573 // We ignore the initializer for now and legalize it to allow selection.
1574 // The initializer will anyway get errored out during assembly emission.
1575 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1576 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1577 }
// Other address spaces are not handled here.
1578 return SDValue();
1579}
1580
// Lower CONCAT_VECTORS. For sub-32-bit element vectors whose operands are a
// whole number of 32-bit words, bitcast each operand to i32 (or an i32
// vector), build one wide i32 vector, and bitcast back — avoiding illegal
// small-element build_vectors. Otherwise fall back to extracting every
// element and rebuilding.
1582 SelectionDAG &DAG) const {
1584 SDLoc SL(Op);
1585
1586 EVT VT = Op.getValueType();
1587 if (VT.getVectorElementType().getSizeInBits() < 32) {
1588 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1589 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
// Each operand becomes NewNumElt i32 words.
1590 unsigned NewNumElt = OpBitSize / 32;
1591 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1593 MVT::i32, NewNumElt);
1594 for (const SDUse &U : Op->ops()) {
1595 SDValue In = U.get();
1596 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
// Multi-word operands are flattened into scalars; single words are
// appended directly.
1597 if (NewNumElt > 1)
1598 DAG.ExtractVectorElements(NewIn, Args);
1599 else
1600 Args.push_back(NewIn);
1601 }
1602
1603 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1604 NewNumElt * Op.getNumOperands());
1605 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1606 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1607 }
1608 }
1609
// Generic path: concatenate by extracting all source elements.
1610 for (const SDUse &U : Op->ops())
1611 DAG.ExtractVectorElements(U.get(), Args);
1612
1613 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1614}
1615
// Lower EXTRACT_SUBVECTOR. For 16-bit element vectors with an even start
// index, operate on 32-bit words instead: bitcast the source to an i32
// vector, extract the covering words, and bitcast back. Otherwise extract
// the requested elements individually and rebuild the result vector.
1617 SelectionDAG &DAG) const {
1618 SDLoc SL(Op);
// Operand 1 is the constant start index of the subvector.
1620 unsigned Start = Op.getConstantOperandVal(1);
1621 EVT VT = Op.getValueType();
1622 EVT SrcVT = Op.getOperand(0).getValueType();
1623
1624 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1625 unsigned NumElt = VT.getVectorNumElements();
1626 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1627 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1628
1629 // Extract 32-bit registers at a time.
1630 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1631 EVT NewVT = NumElt == 2
1632 ? MVT::i32
1633 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1634 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1635
1636 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
// A two-element 16-bit result is a single i32 word, not a vector.
1637 if (NumElt == 2)
1638 Tmp = Args[0];
1639 else
1640 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1641
1642 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1643 }
1644
1645 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1647
1648 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1649}
1650
1651 // TODO: Handle fabs too
// Look through a single fneg: return its operand if Val is an FNEG,
// otherwise return Val unchanged.
1653 if (Val.getOpcode() == ISD::FNEG)
1654 return Val.getOperand(0);
1655
1656 return Val;
1657}
1658
// Strip floating-point sign-manipulating wrappers: peel at most one FNEG,
// then one FABS, then one FCOPYSIGN (taking the magnitude operand), in that
// fixed order, and return what remains.
1660 if (Val.getOpcode() == ISD::FNEG)
1661 Val = Val.getOperand(0);
1662 if (Val.getOpcode() == ISD::FABS)
1663 Val = Val.getOperand(0);
1664 if (Val.getOpcode() == ISD::FCOPYSIGN)
1665 Val = Val.getOperand(0);
1666 return Val;
1667}
1668
// Map a select_cc-style pattern (LHS cc RHS) ? True : False onto
// AMDGPUISD::FMIN_LEGACY / FMAX_LEGACY, choosing operand order to preserve
// the hardware's NaN behavior (the second operand is selected on a failing
// compare with NaN). Returns SDValue() when no profitable mapping exists.
1670 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1671 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1672 SelectionDAG &DAG = DCI.DAG;
1673 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1674 switch (CCOpcode) {
// Equality-style and ordered/unordered-test predicates have no min/max form.
1675 case ISD::SETOEQ:
1676 case ISD::SETONE:
1677 case ISD::SETUNE:
1678 case ISD::SETNE:
1679 case ISD::SETUEQ:
1680 case ISD::SETEQ:
1681 case ISD::SETFALSE:
1682 case ISD::SETFALSE2:
1683 case ISD::SETTRUE:
1684 case ISD::SETTRUE2:
1685 case ISD::SETUO:
1686 case ISD::SETO:
1687 break;
1688 case ISD::SETULE:
1689 case ISD::SETULT: {
1690 if (LHS == True)
1691 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1692 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1693 }
1694 case ISD::SETOLE:
1695 case ISD::SETOLT:
1696 case ISD::SETLE:
1697 case ISD::SETLT: {
1698 // Ordered. Assume ordered for undefined.
1699
1700 // Only do this after legalization to avoid interfering with other combines
1701 // which might occur.
// NOTE(review): the condition's first line is elided in this view —
// presumably a !DCI.isAfterLegalizeDAG() style check; confirm in full source.
1703 !DCI.isCalledByLegalizer())
1704 return SDValue();
1705
1706 // We need to permute the operands to get the correct NaN behavior. The
1707 // selected operand is the second one based on the failing compare with NaN,
1708 // so permute it based on the compare type the hardware uses.
1709 if (LHS == True)
1710 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1711 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1712 }
1713 case ISD::SETUGE:
1714 case ISD::SETUGT: {
1715 if (LHS == True)
1716 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1717 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1718 }
1719 case ISD::SETGT:
1720 case ISD::SETGE:
1721 case ISD::SETOGE:
1722 case ISD::SETOGT: {
// Same legalization-phase gate as the less-than cases above (first line of
// the condition elided in this view).
1724 !DCI.isCalledByLegalizer())
1725 return SDValue();
1726
1727 if (LHS == True)
1728 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1729 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1730 }
1731 case ISD::SETCC_INVALID:
1732 llvm_unreachable("Invalid setcc condcode!");
1733 }
1734 return SDValue();
1735}
1736
1737 /// Generate Min/Max node
// Entry point: try the direct (LHS,RHS)==(True,False) match first; failing
// that, try to undo a foldFreeOpFromSelect-style fneg so the pattern
// select (fcmp lhs, K), (fneg lhs), -K becomes fneg(fmin/fmax(lhs, K)).
1739 SDValue LHS, SDValue RHS,
1740 SDValue True, SDValue False,
1741 SDValue CC,
1742 DAGCombinerInfo &DCI) const {
1743 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1744 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1745
1746 SelectionDAG &DAG = DCI.DAG;
1747
1748 // If we can't directly match this, try to see if we can fold an fneg to
1749 // match.
1750
// NOTE(review): the declarations of CRHS/CFalse (presumably ConstantFPSDNode
// casts of RHS/False) are elided in this view — confirm in full source.
1753 SDValue NegTrue = peekFNeg(True);
1754
1755 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1756 // fmin/fmax.
1757 //
1758 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1759 // -> fneg (fmin_legacy lhs, K)
1760 //
1761 // TODO: Use getNegatedExpression
1762 if (LHS == NegTrue && CFalse && CRHS) {
1763 APFloat NegRHS = neg(CRHS->getValueAPF());
1764 if (NegRHS == CFalse->getValueAPF()) {
1765 SDValue Combined =
1766 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1767 if (Combined)
1768 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1769 return SDValue();
1770 }
1771 }
1772
1773 return SDValue();
1774}
1775
1776 std::pair<SDValue, SDValue>
// Split a 64-bit value into its (low, high) 32-bit halves by bitcasting to
// v2i32 and extracting elements 0 and 1.
1778 SDLoc SL(Op);
1779
1780 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1781
1782 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1783 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1784
1785 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1786 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1787
1788 return std::pair(Lo, Hi);
1789}
1790
// Extract the low 32-bit half of a 64-bit value (v2i32 element 0 after
// bitcast).
1792 SDLoc SL(Op);
1793
1794 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1795 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1796 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1797}
1798
// Extract the high 32-bit half of a 64-bit value (v2i32 element 1 after
// bitcast).
1800 SDLoc SL(Op);
1801
1802 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1803 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1804 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1805}
1806
1807 // Split a vector type into two parts. The first part is a power of two vector.
1808 // The second part is whatever is left over, and is a scalar if it would
1809 // otherwise be a 1-vector.
1810 std::pair<EVT, EVT>
// E.g. v5i32 -> (v4i32, i32); v6i32 -> (v4i32, v2i32); an even power-of-two
// count splits exactly in half.
1812 EVT LoVT, HiVT;
1813 EVT EltVT = VT.getVectorElementType();
1814 unsigned NumElts = VT.getVectorNumElements();
// Low part: smallest power of two >= half the elements.
1815 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1816 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1817 HiVT = NumElts - LoNumElts == 1
1818 ? EltVT
1819 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1820 return std::pair(LoVT, HiVT);
1821}
1822
1823 // Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1824 // scalar.
1825 std::pair<SDValue, SDValue>
1827 const EVT &LoVT, const EVT &HiVT,
1828 SelectionDAG &DAG) const {
1829 EVT VT = N.getValueType();
1831 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1832 VT.getVectorNumElements() &&
1833 "More vector elements requested than available!");
// Low part is always an extract_subvector starting at index 0 (the
// extract's opening line is elided in this view).
1835 DAG.getVectorIdxConstant(0, DL));
1836
1837 unsigned LoNumElts = LoVT.getVectorNumElements();
1838
1839 if (HiVT.isVector()) {
1840 unsigned HiNumElts = HiVT.getVectorNumElements();
1841 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1842 // Avoid creating an extract_subvector with an index that isn't a multiple
1843 // of the result type.
1845 DAG.getConstant(LoNumElts, DL, MVT::i32));
1846 return {Lo, Hi};
1847 }
1848
// Misaligned index: build the high part element by element instead.
1850 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1851 /*Count=*/HiNumElts);
1852 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1853 return {Lo, Hi};
1854 }
1855
// Scalar high part: a single extract_vector_elt at index LoNumElts (the
// extract's opening line is elided in this view).
1857 DAG.getVectorIdxConstant(LoNumElts, DL));
1858 return {Lo, Hi};
1859}
1860
// Split a vector load into two half-width loads (low part power-of-two,
// high part the remainder per getSplitDestVTs), then rejoin the pieces and
// token-factor the two chains. Two-element vectors are fully scalarized
// instead, to avoid 1-element vectors.
1862 SelectionDAG &DAG) const {
1864 EVT VT = Op.getValueType();
1865 SDLoc SL(Op);
1866
1867
1868 // If this is a 2 element vector, we really want to scalarize and not create
1869 // weird 1 element vectors.
1870 if (VT.getVectorNumElements() == 2) {
1871 SDValue Ops[2];
1872 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1873 return DAG.getMergeValues(Ops, SL);
1874 }
1875
1876 SDValue BasePtr = Load->getBasePtr();
1877 EVT MemVT = Load->getMemoryVT();
1878
1879 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1880
1881 EVT LoVT, HiVT;
1882 EVT LoMemVT, HiMemVT;
1883 SDValue Lo, Hi;
1884
1885 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1886 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1887 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1888
// The high load's alignment is whatever the base alignment guarantees at
// byte offset Size.
1889 unsigned Size = LoMemVT.getStoreSize();
1890 Align BaseAlign = Load->getAlign();
1891 Align HiAlign = commonAlignment(BaseAlign, Size);
1892
1893 SDValue LoLoad = DAG.getExtLoad(
1894 Load->getExtensionType(), SL, LoVT, Load->getChain(), BasePtr, SrcValue,
1895 LoMemVT, BaseAlign, Load->getMemOperand()->getFlags(), Load->getAAInfo());
1896 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1897 SDValue HiLoad = DAG.getExtLoad(
1898 Load->getExtensionType(), SL, HiVT, Load->getChain(), HiPtr,
1899 SrcValue.getWithOffset(LoMemVT.getStoreSize()), HiMemVT, HiAlign,
1900 Load->getMemOperand()->getFlags(), Load->getAAInfo());
1901
1902 SDValue Join;
1903 if (LoVT == HiVT) {
1904 // This is the case that the vector is power of two so was evenly split.
1905 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1906 } else {
// Uneven split: insert the low subvector into poison, then insert the high
// part (subvector or scalar element; the second insert's opcode line is
// elided in this view).
1907 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1908 DAG.getVectorIdxConstant(0, SL));
1909 Join = DAG.getNode(
1911 VT, Join, HiLoad,
1913 }
1914
1915 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1916 LoLoad.getValue(1), HiLoad.getValue(1))};
1917
1918 return DAG.getMergeValues(Ops, SL);
1919}
1920
// For a 3-element vector load that is at least 8-byte aligned or 16-byte
// fully dereferenceable, widen it to a 4-element load and extract the first
// three elements; otherwise delegate to SplitVectorLoad.
1922 SelectionDAG &DAG) const {
1924 EVT VT = Op.getValueType();
1925 SDValue BasePtr = Load->getBasePtr();
1926 EVT MemVT = Load->getMemoryVT();
1927 SDLoc SL(Op);
1928 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1929 Align BaseAlign = Load->getAlign();
1930 unsigned NumElements = MemVT.getVectorNumElements();
1931
1932 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1933 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1934 if (NumElements != 3 ||
1935 (BaseAlign < Align(8) &&
1936 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1937 return SplitVectorLoad(Op, DAG);
1938
1939 assert(NumElements == 3);
1940
// Widened vec4 value/memory types (the getVectorVT construction lines are
// elided in this view).
1941 EVT WideVT =
1943 EVT WideMemVT =
1945 SDValue WideLoad = DAG.getExtLoad(
1946 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1947 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
// Return {first three elements, chain}.
1948 return DAG.getMergeValues(
1949 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1950 DAG.getVectorIdxConstant(0, SL)),
1951 WideLoad.getValue(1)},
1952 SL);
1953}
1954
1956 SelectionDAG &DAG) const {
1958 SDValue Val = Store->getValue();
1959 EVT VT = Val.getValueType();
1960
1961 // If this is a 2 element vector, we really want to scalarize and not create
1962 // weird 1 element vectors.
1963 if (VT.getVectorNumElements() == 2)
1964 return scalarizeVectorStore(Store, DAG);
1965
1966 EVT MemVT = Store->getMemoryVT();
1967 SDValue Chain = Store->getChain();
1968 SDValue BasePtr = Store->getBasePtr();
1969 SDLoc SL(Op);
1970
1971 EVT LoVT, HiVT;
1972 EVT LoMemVT, HiMemVT;
1973 SDValue Lo, Hi;
1974
1975 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1976 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1977 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1978
1979 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1980
1981 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1982 Align BaseAlign = Store->getAlign();
1983 unsigned Size = LoMemVT.getStoreSize();
1984 Align HiAlign = commonAlignment(BaseAlign, Size);
1985
1986 SDValue LoStore =
1987 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1988 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1989 SDValue HiStore = DAG.getTruncStore(
1990 Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size), HiMemVT, HiAlign,
1991 Store->getMemOperand()->getFlags(), Store->getAAInfo());
1992
1993 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1994}
1995
1996 // This is a shortcut for integer division because we have fast i32<->f32
1997 // conversions, and fast f32 reciprocal instructions. The fractional part of a
1998 // float is enough to accurately represent up to a 24-bit signed integer.
// Returns merged {Div, Rem}, or SDValue() when the operands cannot be
// proven narrow enough (fewer than 9 sign bits on either side).
2000 bool Sign) const {
2001 SDLoc DL(Op);
2002 EVT VT = Op.getValueType();
2003 SDValue LHS = Op.getOperand(0);
2004 SDValue RHS = Op.getOperand(1);
2005 MVT IntVT = MVT::i32;
2006 MVT FltVT = MVT::f32;
2007
// Require >= 9 sign bits on both operands so the significant value fits in
// 24 bits (representable exactly in f32).
2008 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2009 if (LHSSignBits < 9)
2010 return SDValue();
2011
2012 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2013 if (RHSSignBits < 9)
2014 return SDValue();
2015
2016 unsigned BitSize = VT.getSizeInBits();
2017 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2018 unsigned DivBits = BitSize - SignBits;
2019 if (Sign)
2020 ++DivBits;
2021
// NOTE(review): the ToFp/ToInt conversion-opcode selection lines (2022-2023,
// presumably SINT_TO_FP/FP_TO_SINT vs UINT_TO_FP/FP_TO_UINT based on Sign)
// are elided in this view — confirm in full source.
2025 SDValue jq = DAG.getConstant(1, DL, IntVT);
2026
2027 if (Sign) {
2028 // char|short jq = ia ^ ib;
2029 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2030
2031 // jq = jq >> (bitsize - 2)
2032 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2033 DAG.getConstant(BitSize - 2, DL, VT));
2034
2035 // jq = jq | 0x1
2036 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2037 }
2038
2039 // int ia = (int)LHS;
2040 SDValue ia = LHS;
2041
2042 // int ib, (int)RHS;
2043 SDValue ib = RHS;
2044
2045 // float fa = (float)ia;
2046 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2047
2048 // float fb = (float)ib;
2049 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2050
// fq = fa * (1 / fb), an approximate quotient via the fast reciprocal.
2051 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2052 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2053
2054 // fq = trunc(fq);
2055 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2056
2057 // float fqneg = -fq;
2058 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2059
// Choose the FMA-like opcode: plain FMA when no mad/mac is available,
// otherwise FMAD or its flush-to-zero variant depending on subtarget
// denormal mode (the mode query lines are elided in this view).
2062 bool UseFmadFtz = false;
2063 if (Subtarget->isGCN()) {
2065 UseFmadFtz =
2067 }
2068
2069 // float fr = mad(fqneg, fb, fa);
2070 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2071 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2073 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2074
2075 // int iq = (int)fq;
2076 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2077
2078 // fr = fabs(fr);
2079 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2080
2081 // fb = fabs(fb);
2082 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2083
2084 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2085
2086 // int cv = fr >= fb;
// If the residual |fr| reaches |fb|, the truncated quotient was one off in
// the direction of jq; correct by adding jq (else 0).
2087 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2088
2089 // jq = (cv ? jq : 0);
2090 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2091
2092 // dst = iq + jq;
2093 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2094
2095 // Rem needs compensation, it's easier to recompute it
2096 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2097 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2098
2099 // Truncate to number of bits this divide really is.
2100 if (Sign) {
2101 SDValue InRegSize
2102 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2103 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2104 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2105 } else {
2106 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2107 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2108 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2109 }
2110
2111 return DAG.getMergeValues({ Div, Rem }, DL);
2112}
2113
// NOTE(review): the first signature line was lost in extraction; per the
// assert below this is AMDGPUTargetLowering::LowerUDIVREM64. It expands a
// 64-bit UDIVREM into 32-bit operations and appends {quotient, remainder}
// to Results.
2115 SelectionDAG &DAG,
2117 SDLoc DL(Op);
2118 EVT VT = Op.getValueType();
2119
2120 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2121
2122 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2123
2124 SDValue One = DAG.getConstant(1, DL, HalfVT);
2125 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2126
2127 //HiLo split
2128 SDValue LHS_Lo, LHS_Hi;
2129 SDValue LHS = Op.getOperand(0);
2130 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2131
2132 SDValue RHS_Lo, RHS_Hi;
2133 SDValue RHS = Op.getOperand(1);
2134 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2135
// Fast path: if both operands are known to fit in 32 bits, a single 32-bit
// UDIVREM suffices; zero-extend the halves back to 64 bits via v2i32.
2136 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2137 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2138
2139 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2140 LHS_Lo, RHS_Lo);
2141
2142 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2143 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2144
2145 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2146 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2147 return;
2148 }
2149
2150 if (isTypeLegal(MVT::i64)) {
2151 // The algorithm here is based on ideas from "Software Integer Division",
2152 // Tom Rodeheffer, August 2008.
2156
2157 // Compute denominator reciprocal.
// NOTE(review): the middle arms of this conditional (the FMAD vs FMAD_FTZ
// selection based on denormal mode, lines 2160-2161) were lost in
// extraction; only the first and last arms are visible here.
2158 unsigned FMAD =
2159 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2162 : (unsigned)AMDGPUISD::FMAD_FTZ;
2163
// Approximate 1/RHS in f32: convert both halves, combine as
// Hi * 2^32 + Lo, take the hardware reciprocal, then scale and split the
// result back into two 32-bit words (Rcp_Hi:Rcp_Lo) of a 64-bit estimate.
2164 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2165 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2166 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2167 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2168 Cvt_Lo);
2169 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2170 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2171 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2172 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2173 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2174 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2175 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2176 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2177 Mul1);
2178 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2179 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2180 SDValue Rcp64 = DAG.getBitcast(VT,
2181 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2182
2183 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2184 SDValue One64 = DAG.getConstant(1, DL, VT);
2185 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2186 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2187
2188 // First round of UNR (Unsigned integer Newton-Raphson).
2189 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2190 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2191 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2192 SDValue Mulhi1_Lo, Mulhi1_Hi;
2193 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2194 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2195 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2196 Mulhi1_Lo, Zero1);
2197 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2198 Mulhi1_Hi, Add1_Lo.getValue(1));
2199 SDValue Add1 = DAG.getBitcast(VT,
2200 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2201
2202 // Second round of UNR.
2203 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2204 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2205 SDValue Mulhi2_Lo, Mulhi2_Hi;
2206 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2207 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2208 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2209 Mulhi2_Lo, Zero1);
2210 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2211 Mulhi2_Hi, Add2_Lo.getValue(1));
2212 SDValue Add2 = DAG.getBitcast(VT,
2213 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2214
// Quotient estimate: LHS * ~(1/RHS); may be up to 2 short of the true
// quotient, corrected by the two conditional subtract/increment steps below.
2215 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2216
2217 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2218
2219 SDValue Mul3_Lo, Mul3_Hi;
2220 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2221 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2222 Mul3_Lo, Zero1);
2223 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2224 Mul3_Hi, Sub1_Lo.getValue(1));
2225 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2226 SDValue Sub1 = DAG.getBitcast(VT,
2227 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2228
2229 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2230 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2231 ISD::SETUGE);
2232 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2233 ISD::SETUGE);
2234 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2235
2236 // TODO: Here and below portions of the code can be enclosed into if/endif.
2237 // Currently control flow is unconditional and we have 4 selects after
2238 // potential endif to substitute PHIs.
2239
2240 // if C3 != 0 ...
2241 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2242 RHS_Lo, Zero1);
2243 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2244 RHS_Hi, Sub1_Lo.getValue(1));
2245 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2246 Zero, Sub2_Lo.getValue(1));
2247 SDValue Sub2 = DAG.getBitcast(VT,
2248 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2249
2250 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2251
2252 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2253 ISD::SETUGE);
2254 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2255 ISD::SETUGE);
2256 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2257
2258 // if (C6 != 0)
2259 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2260
2261 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2262 RHS_Lo, Zero1);
2263 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2264 RHS_Hi, Sub2_Lo.getValue(1));
2265 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2266 Zero, Sub3_Lo.getValue(1));
2267 SDValue Sub3 = DAG.getBitcast(VT,
2268 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2269
2270 // endif C6
2271 // endif C3
2272
2273 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2274 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2275
2276 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2277 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2278
2279 Results.push_back(Div);
2280 Results.push_back(Rem);
2281
2282 return;
2283 }
2284
// Fallback: classic restoring long division, one quotient bit per
// iteration over the low 32 bits.
2285 // r600 expansion.
2286 // Get Speculative values
2287 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2288 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2289
2290 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2291 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2292 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2293
2294 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2295 SDValue DIV_Lo = Zero;
2296
2297 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2298
2299 for (unsigned i = 0; i < halfBitWidth; ++i) {
2300 const unsigned bitPos = halfBitWidth - i - 1;
2301 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2302 // Get value of high bit
2303 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2304 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2305 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2306
2307 // Shift
2308 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2309 // Add LHS high bit
2310 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2311
2312 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2313 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2314
2315 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2316
2317 // Update REM
2318 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2319 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2320 }
2321
2322 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2323 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2324 Results.push_back(DIV);
2325 Results.push_back(REM);
2326}
2327
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::LowerUDIVREM. Lowers unsigned divide+remainder:
// i64 goes through LowerUDIVREM64, i32 may use the 24-bit fast path, and
// otherwise one Newton-Raphson round plus two refinement steps are emitted
// (see AMDGPUCodeGenPrepare::expandDivRem32 referenced below).
2329 SelectionDAG &DAG) const {
2330 SDLoc DL(Op);
2331 EVT VT = Op.getValueType();
2332
2333 if (VT == MVT::i64) {
// NOTE(review): the declaration of `Results` (original line 2334, a
// SmallVector) was lost in extraction.
2335 LowerUDIVREM64(Op, DAG, Results);
2336 return DAG.getMergeValues(Results, DL);
2337 }
2338
2339 if (VT == MVT::i32) {
2340 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2341 return Res;
2342 }
2343
2344 SDValue X = Op.getOperand(0);
2345 SDValue Y = Op.getOperand(1);
2346
2347 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2348 // algorithm used here.
2349
2350 // Initial estimate of inv(y).
2351 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2352
2353 // One round of UNR.
2354 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2355 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2356 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2357 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2358
2359 // Quotient/remainder estimate.
2360 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2361 SDValue R =
2362 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2363
// The estimate can be short by up to 2; each refinement conditionally
// bumps Q and subtracts Y from R while R >= Y.
2364 // First quotient/remainder refinement.
2365 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2366 SDValue One = DAG.getConstant(1, DL, VT);
2367 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2368 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2369 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2370 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2371 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2372
2373 // Second quotient/remainder refinement.
2374 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2375 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2376 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2377 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2378 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2379
2380 return DAG.getMergeValues({Q, R}, DL);
2381}
2382
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::LowerSDIVREM. Lowers signed divide+remainder by
// reducing to UDIVREM on magnitudes and patching the result signs, with
// fast paths for 24-bit i32 values and i64 operands that fit in 32 bits.
2384 SelectionDAG &DAG) const {
2385 SDLoc DL(Op);
2386 EVT VT = Op.getValueType();
2387
2388 SDValue LHS = Op.getOperand(0);
2389 SDValue RHS = Op.getOperand(1);
2390
2391 SDValue Zero = DAG.getConstant(0, DL, VT);
2392 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2393
2394 if (VT == MVT::i32) {
2395 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2396 return Res;
2397 }
2398
// If both i64 operands are sign-extended 32-bit values, a single 32-bit
// SDIVREM on the low halves gives the exact answer.
2399 if (VT == MVT::i64 &&
2400 DAG.ComputeNumSignBits(LHS) > 32 &&
2401 DAG.ComputeNumSignBits(RHS) > 32) {
2402 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2403
2404 //HiLo split
2405 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2406 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2407 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2408 LHS_Lo, RHS_Lo);
2409 SDValue Res[2] = {
2410 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2411 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2412 };
2413 return DAG.getMergeValues(Res, DL);
2414 }
2415
// General case: compute |LHS| and |RHS| branchlessly via the
// (x + sign) ^ sign identity, do an unsigned divrem, then restore signs
// the same way (quotient sign = XOR of operand signs, remainder sign =
// sign of LHS).
2416 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2417 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2418 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2419 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2420
2421 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2422 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2423
2424 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2425 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2426
2427 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2428 SDValue Rem = Div.getValue(1);
2429
2430 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2431 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2432
2433 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2434 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2435
2436 SDValue Res[2] = {
2437 Div,
2438 Rem
2439 };
2440 return DAG.getMergeValues(Res, DL);
2441}
2442
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::LowerFCEIL (f64). Implements ceil as
// trunc(src) + (src > 0 && src != trunc(src) ? 1.0 : 0.0).
2444 SDLoc SL(Op);
2445 SDValue Src = Op.getOperand(0);
2446
2447 // result = trunc(src)
2448 // if (src > 0.0 && src != result)
2449 // result += 1.0
2450
2451 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2452
2453 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2454 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2455
2456 EVT SetCCVT =
2457 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2458
// Note: despite the name, Lt0 is an ordered greater-than-zero test
// (SETOGT), matching the pseudo-code above.
2459 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2460 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2461 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2462
2463 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2464 // TODO: Should this propagate fast-math-flags?
2465 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2466}
2467
// NOTE(review): signature line lost in extraction — presumably
// static extractF64Exponent(Hi, SL, DAG). Given the high 32 bits of an f64,
// extracts the 11-bit biased exponent field (bits 20..30 of Hi, i.e. bits
// 52..62 of the double) with BFE_U32 and subtracts the IEEE-754 bias (1023)
// to return the unbiased exponent as an i32.
2469 SelectionDAG &DAG) {
2470 const unsigned FractBits = 52;
2471 const unsigned ExpBits = 11;
2472
2473 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2474 Hi,
2475 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2476 DAG.getConstant(ExpBits, SL, MVT::i32));
2477 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2478 DAG.getConstant(1023, SL, MVT::i32));
2479
2480 return Exp;
2481}
2482
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::LowerFTRUNC (f64). Truncates toward zero by
// masking off the fraction bits below the binary point, using the
// unbiased exponent to decide how many fraction bits are integral.
2484 SDLoc SL(Op);
2485 SDValue Src = Op.getOperand(0);
2486
2487 assert(Op.getValueType() == MVT::f64);
2488
2489 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2490
2491 // Extract the upper half, since this is where we will find the sign and
2492 // exponent.
2493 SDValue Hi = getHiHalf64(Src, DAG);
2494
2495 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2496
2497 const unsigned FractBits = 52;
2498
2499 // Extract the sign bit.
2500 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2501 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2502
2503 // Extend back to 64-bits.
2504 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2505 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2506
2507 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2508 const SDValue FractMask
2509 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2510
// Arithmetic-shift the all-ones fraction mask right by Exp: the surviving
// low bits are the non-integral fraction bits, which are cleared below.
2511 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2512 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2513 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2514
2515 EVT SetCCVT =
2516 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2517
2518 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2519
2520 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2521 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2522
// Exp < 0: |src| < 1, result is +/-0 (just the sign bit).
// Exp > 51: value already integral (or inf/nan), return src unchanged.
2523 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2524 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2525
2526 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2527}
2528
// NOTE(review): signature line lost in extraction — presumably the f64
// round-to-nearest-even lowering. Uses the classic 2^52 magic-number
// trick: adding and subtracting copysign(2^52, src) rounds to integer in
// the current (nearest-even) mode; values with |src| already >= 2^52-ulp
// are integral and are passed through unchanged.
2530 SelectionDAG &DAG) const {
2531 SDLoc SL(Op);
2532 SDValue Src = Op.getOperand(0);
2533
2534 assert(Op.getValueType() == MVT::f64);
2535
2536 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2537 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2538 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2539
2540 // TODO: Should this propagate fast-math-flags?
2541
2542 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2543 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2544
2545 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2546
// Largest double strictly below 2^52; beyond this every double is integral.
2547 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2548 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2549
2550 EVT SetCCVT =
2551 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2552 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2553
2554 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2555}
2556
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::LowerFNEARBYINT. Lowers to FROUNDEVEN, which is
// equivalent when FP exceptions are ignored (see comment below).
2558 SelectionDAG &DAG) const {
2559 // FNEARBYINT and FRINT are the same, except in their handling of FP
2560 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2561 // rint, so just treat them as equivalent.
2562 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2563 Op.getOperand(0));
2564}
2565
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::LowerFRINT. Rewrites the node as FROUNDEVEN
// (round-half-to-even), preserving the operand and type.
2567 auto VT = Op.getValueType();
2568 auto Arg = Op.getOperand(0u);
2569 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2570}
2571
2572// XXX - May require not supporting f32 denormals?
2573
2574// Don't handle v2f16. The extra instructions to scalarize and repack around the
2575// compare and vselect end up producing worse code than scalarizing the whole
2576// operation.
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::LowerFROUND. Implements round-half-away-from-zero:
// t = trunc(x); result = t + copysign(|x - t| >= 0.5 ? 1.0 : 0.0, x).
2578 SDLoc SL(Op);
2579 SDValue X = Op.getOperand(0);
2580 EVT VT = Op.getValueType();
2581
2582 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2583
2584 // TODO: Should this propagate fast-math-flags?
2585
2586 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2587
2588 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2589
2590 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2591 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2592
2593 EVT SetCCVT =
2594 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2595
2596 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2597 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2598 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2599
2600 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2601 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2602}
2603
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::LowerFFLOOR (f64). Implements floor as
// trunc(src) + (src < 0 && src != trunc(src) ? -1.0 : 0.0) —
// the mirror of the FCEIL lowering above.
2605 SDLoc SL(Op);
2606 SDValue Src = Op.getOperand(0);
2607
2608 // result = trunc(src);
2609 // if (src < 0.0 && src != result)
2610 // result += -1.0.
2611
2612 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2613
2614 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2615 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2616
2617 EVT SetCCVT =
2618 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2619
2620 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2621 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2622 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2623
2624 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2625 // TODO: Should this propagate fast-math-flags?
2626 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2627}
2628
2629 /// Return true if it's known that \p Src can never be an f32 denormal value.
// NOTE(review): the signature line was lost in extraction. The function
// whitelists opcodes/intrinsics whose f32 results can never be denormal
// (e.g. an extend from f16 cannot produce an f32 denormal); everything
// else conservatively returns false.
2631 switch (Src.getOpcode()) {
2632 case ISD::FP_EXTEND:
2633 return Src.getOperand(0).getValueType() == MVT::f16;
2634 case ISD::FP16_TO_FP:
2635 case ISD::FFREXP:
2636 case ISD::FSQRT:
2637 case AMDGPUISD::LOG:
2638 case AMDGPUISD::EXP:
2639 return true;
// NOTE(review): the enclosing `case` label for this braced scope
// (original line 2640, presumably an intrinsic node case) was lost in
// extraction.
2641 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2642 switch (IntrinsicID) {
2643 case Intrinsic::amdgcn_frexp_mant:
2644 case Intrinsic::amdgcn_log:
2645 case Intrinsic::amdgcn_log_clamp:
2646 case Intrinsic::amdgcn_exp2:
2647 case Intrinsic::amdgcn_sqrt:
2648 return true;
2649 default:
2650 return false;
2651 }
2652 }
2653 default:
2654 return false;
2655 }
2656
2657 llvm_unreachable("covered opcode switch");
2658}
2659
// NOTE(review): signature line lost in extraction. Helper that reports
// whether approximate-function lowering is permitted, based solely on the
// node's afn fast-math flag.
2661 SDNodeFlags Flags) {
2662 return Flags.hasApproximateFuncs();
2663}
2664
2673
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::getIsLtSmallestNormal. Emits a setcc that is true
// when Src is strictly below the smallest normalized value of its type
// (i.e. a positive denormal, zero, or any negative value — see comment
// below on why negatives are fine on the scaled path).
2675 SDValue Src,
2676 SDNodeFlags Flags) const {
2677 SDLoc SL(Src);
2678 EVT VT = Src.getValueType();
2679 const fltSemantics &Semantics = VT.getFltSemantics();
2680 SDValue SmallestNormal =
2681 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2682
2683 // Want to scale denormals up, but negatives and 0 work just as well on the
2684 // scaled path.
2685 SDValue IsLtSmallestNormal = DAG.getSetCC(
2686 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2687 SmallestNormal, ISD::SETOLT);
2688
2689 return IsLtSmallestNormal;
2690}
2691
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::getIsFinite. Emits a setcc that is true when Src
// is finite and not NaN: |Src| < +inf with an ordered compare (SETOLT is
// false for NaN inputs).
2693 SDNodeFlags Flags) const {
2694 SDLoc SL(Src);
2695 EVT VT = Src.getValueType();
2696 const fltSemantics &Semantics = VT.getFltSemantics();
2697 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2698
2699 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2700 SDValue IsFinite = DAG.getSetCC(
2701 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2702 Inf, ISD::SETOLT);
2703 return IsFinite;
2704}
2705
2706 /// If denormal handling is required return the scaled input to FLOG2, and the
2707 /// check for denormal range. Otherwise, return null values.
// NOTE(review): the second signature line was lost in extraction. The scale
// factor 2^32 lifts f32 denormals into the normal range; the caller
// compensates by subtracting 32 (times the log base factor) from the
// resulting logarithm.
2708 std::pair<SDValue, SDValue>
2710 SDValue Src, SDNodeFlags Flags) const {
2711 if (!needsDenormHandlingF32(DAG, Src, Flags))
2712 return {};
2713
2714 MVT VT = MVT::f32;
2715 const fltSemantics &Semantics = APFloat::IEEEsingle();
2716 SDValue SmallestNormal =
2717 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2718
2719 SDValue IsLtSmallestNormal = DAG.getSetCC(
2720 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2721 SmallestNormal, ISD::SETOLT);
2722
2723 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2724 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2725 SDValue ScaleFactor =
2726 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2727
2728 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2729 return {ScaledInput, IsLtSmallestNormal};
2730}
2731
// NOTE(review): the signature line was lost in extraction — presumably
// AMDGPUTargetLowering::LowerFLOG2. Lowers FLOG2 to the hardware LOG node,
// promoting f16 through f32 and pre-scaling denormal f32 inputs (the
// 2^32 scale is compensated by subtracting 32 from the result).
2733 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2734 // If we have to handle denormals, scale up the input and adjust the result.
2735
2736 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2737 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
2738
2739 SDLoc SL(Op);
2740 EVT VT = Op.getValueType();
2741 SDValue Src = Op.getOperand(0);
2742 SDNodeFlags Flags = Op->getFlags();
2743
2744 if (VT == MVT::f16) {
2745 // Nothing in half is a denormal when promoted to f32.
2746 assert(!isTypeLegal(VT));
2747 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2748 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2749 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2750 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2751 }
2752
2753 auto [ScaledInput, IsLtSmallestNormal] =
2754 getScaledLogInput(DAG, SL, Src, Flags);
2755 if (!ScaledInput)
2756 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2757
2758 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2759
2760 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2761 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2762 SDValue ResultOffset =
2763 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2764 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2765}
2766
2767static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2768 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2769 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2770 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2771}
2772
// NOTE(review): signature line lost in extraction — presumably
// AMDGPUTargetLowering::LowerFLOGCommon, shared lowering for FLOG (natural
// log) and FLOG10. Computes log2(x) via the hardware LOG node, then
// multiplies by ln(2) or ln(2)/ln(10) using an extended-precision constant
// (split into head + tail) to keep the result near 1 ulp.
2774 SelectionDAG &DAG) const {
2775 SDValue X = Op.getOperand(0);
2776 EVT VT = Op.getValueType();
2777 SDNodeFlags Flags = Op->getFlags();
2778 SDLoc DL(Op);
2779 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2780 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2781
2782 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2783 // TODO: The direct f16 path is 1.79 ulp for f16. This should be used
2784 // depending on !fpmath metadata.
2785
2786 bool PromoteToF32 = VT == MVT::f16 && (!Flags.hasApproximateFuncs() ||
2787 !isTypeLegal(MVT::f16));
2788
2789 if (PromoteToF32) {
2790 // Log and multiply in f32 is always good enough for f16.
2791 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2792 }
2793
2794 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2795 if (PromoteToF32) {
2796 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2797 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2798 }
2799
2800 return Lowered;
2801 }
2802
2803 SDValue ScaledInput, IsScaled;
2804 if (VT == MVT::f16)
2805 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2806 else {
2807 std::tie(ScaledInput, IsScaled) = getScaledLogInput(DAG, DL, X, Flags);
2808 if (ScaledInput)
2809 X = ScaledInput;
2810 }
2811
2812 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2813
2814 SDValue R;
2815 if (Subtarget->hasFastFMAF32()) {
2816 // c+cc are ln(2)/ln(10) to more than 49 bits
2817 const float c_log10 = 0x1.344134p-2f;
2818 const float cc_log10 = 0x1.09f79ep-26f;
2819
2820 // c + cc is ln(2) to more than 49 bits
2821 const float c_log = 0x1.62e42ep-1f;
2822 const float cc_log = 0x1.efa39ep-25f;
2823
2824 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2825 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2826 // This adds correction terms for which contraction may lead to an increase
2827 // in the error of the approximation, so disable it.
2828 Flags.setAllowContract(false);
// R = Y*C exactly compensated: FMA(Y, C, -R) recovers the rounding error
// of the product, and FMA(Y, CC, ...) folds in the low bits of the
// constant before the final add.
2829 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2830 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2831 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2832 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2833 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2834 } else {
2835 // ch+ct is ln(2)/ln(10) to more than 36 bits
2836 const float ch_log10 = 0x1.344000p-2f;
2837 const float ct_log10 = 0x1.3509f6p-18f;
2838
2839 // ch + ct is ln(2) to more than 36 bits
2840 const float ch_log = 0x1.62e000p-1f;
2841 const float ct_log = 0x1.0bfbe8p-15f;
2842
2843 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2844 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2845
// Split Y into a high part (mantissa top bits, exact when multiplied by
// the truncated constant CH) and a low tail YT = Y - YH.
2846 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2847 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2848 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2849 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2850 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2851 // This adds correction terms for which contraction may lead to an increase
2852 // in the error of the approximation, so disable it.
2853 Flags.setAllowContract(false);
2854 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2855 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2856 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
// NOTE(review): unlike the two calls above, Flags is not passed here, so
// this final mad uses default node flags — confirm whether intentional.
2857 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2858 }
2859
2860 const bool IsFiniteOnly = Flags.hasNoNaNs() && Flags.hasNoInfs();
2861
2862 // TODO: Check if known finite from source value.
// For +/-inf or NaN inputs, the corrected product would produce the wrong
// payload; pass the raw hardware log result through instead.
2863 if (!IsFiniteOnly) {
2864 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2865 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2866 }
2867
// Undo the 2^32 denormal pre-scale: subtract 32*log(2) in the requested
// base (the hex constants are 32*ln(2)/ln(10) and 32*ln(2)).
2868 if (IsScaled) {
2869 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2870 SDValue ShiftK =
2871 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2872 SDValue Shift =
2873 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2874 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2875 }
2876
2877 return R;
2878}
2879
2883
2884 // Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2885 // promote f16 operation.
// NOTE(review): the first signature line was lost in extraction —
// presumably AMDGPUTargetLowering::LowerFLOGUnsafe(Src, SL, DAG, ...).
// Computes log(x) as log2(x) * (1/log2(base)), with the same 2^32 denormal
// pre-scale/compensation as the precise path.
2887 SelectionDAG &DAG, bool IsLog10,
2888 SDNodeFlags Flags) const {
2889 EVT VT = Src.getValueType();
2890 unsigned LogOp =
2891 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2892
// NOTE(review): the initializer (original line 2894, selecting ln(2)/ln(10)
// vs ln(2) based on IsLog10) was lost in extraction.
2893 double Log2BaseInverted =
2895
2896 if (VT == MVT::f32) {
2897 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2898 if (ScaledInput) {
2899 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2900 SDValue ScaledResultOffset =
2901 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2902
2903 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2904
2905 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2906 ScaledResultOffset, Zero, Flags);
2907
2908 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2909
2910 if (Subtarget->hasFastFMAF32())
2911 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2912 Flags);
2913 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
// NOTE(review): Flags is not forwarded to this FADD, unlike the FMA branch
// above — confirm whether intentional.
2914 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2915 }
2916 }
2917
2918 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2919 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2920
2921 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2922 Flags);
2923}
2924
2925 // This expansion gives a result slightly better than 1ulp.
// NOTE(review): the signature line was lost in extraction; this is the f64
// expansion handling FEXP2, FEXP10, and FEXP. Strategy: split x into an
// integral part dn and a reduced argument, evaluate a polynomial, then
// scale by 2^dn with ldexp and clamp the overflow/underflow ranges.
2927 SelectionDAG &DAG) const {
2928 SDLoc DL(Op);
2929 SDValue X = Op.getOperand(0);
2930
2931 // TODO: Check if reassoc is safe. There is an output change in exp2 and
2932 // exp10, which slightly increases ulp.
2933 SDNodeFlags Flags = Op->getFlags() & ~SDNodeFlags::AllowReassociation;
2934
2935 SDValue DN, F, T;
2936
2937 if (Op.getOpcode() == ISD::FEXP2) {
2938 // dn = rint(x)
2939 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, X, Flags);
2940 // f = x - dn
2941 F = DAG.getNode(ISD::FSUB, DL, MVT::f64, X, DN, Flags);
2942 // t = f*C1 + f*C2
// C1 + C2 together form ln(2) in double-double precision.
2943 SDValue C1 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
2944 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
2945 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C2, Flags);
2946 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C1, Mul2, Flags);
2947 } else if (Op.getOpcode() == ISD::FEXP10) {
2948 // dn = rint(x * C1)
2949 SDValue C1 = DAG.getConstantFP(0x1.a934f0979a371p+1, DL, MVT::f64);
2950 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2951 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
2952
2953 // f = FMA(-dn, C2, FMA(-dn, C3, x))
2954 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
2955 SDValue C2 = DAG.getConstantFP(-0x1.9dc1da994fd21p-59, DL, MVT::f64);
2956 SDValue C3 = DAG.getConstantFP(0x1.34413509f79ffp-2, DL, MVT::f64);
2957 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
2958 F = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
2959
2960 // t = FMA(f, C4, f*C5)
2961 SDValue C4 = DAG.getConstantFP(0x1.26bb1bbb55516p+1, DL, MVT::f64);
2962 SDValue C5 = DAG.getConstantFP(-0x1.f48ad494ea3e9p-53, DL, MVT::f64);
2963 SDValue MulF = DAG.getNode(ISD::FMUL, DL, MVT::f64, F, C5, Flags);
2964 T = DAG.getNode(ISD::FMA, DL, MVT::f64, F, C4, MulF, Flags);
2965 } else { // ISD::FEXP
2966 // dn = rint(x * C1)
2967 SDValue C1 = DAG.getConstantFP(0x1.71547652b82fep+0, DL, MVT::f64);
2968 SDValue Mul = DAG.getNode(ISD::FMUL, DL, MVT::f64, X, C1, Flags);
2969 DN = DAG.getNode(ISD::FRINT, DL, MVT::f64, Mul, Flags);
2970
2971 // t = FMA(-dn, C2, FMA(-dn, C3, x))
2972 SDValue NegDN = DAG.getNode(ISD::FNEG, DL, MVT::f64, DN, Flags);
2973 SDValue C2 = DAG.getConstantFP(0x1.abc9e3b39803fp-56, DL, MVT::f64);
2974 SDValue C3 = DAG.getConstantFP(0x1.62e42fefa39efp-1, DL, MVT::f64);
2975 SDValue Inner = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C3, X, Flags);
2976 T = DAG.getNode(ISD::FMA, DL, MVT::f64, NegDN, C2, Inner, Flags);
2977 }
2978
// Horner evaluation of the Taylor-style polynomial for e^t, fused into a
// chain of FMAs; the two trailing FMAs with One fold in the 1 + t terms.
2979 // Polynomial expansion for p
2980 SDValue P = DAG.getConstantFP(0x1.ade156a5dcb37p-26, DL, MVT::f64);
2981 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2982 DAG.getConstantFP(0x1.28af3fca7ab0cp-22, DL, MVT::f64),
2983 Flags);
2984 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2985 DAG.getConstantFP(0x1.71dee623fde64p-19, DL, MVT::f64),
2986 Flags);
2987 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2988 DAG.getConstantFP(0x1.a01997c89e6b0p-16, DL, MVT::f64),
2989 Flags);
2990 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2991 DAG.getConstantFP(0x1.a01a014761f6ep-13, DL, MVT::f64),
2992 Flags);
2993 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2994 DAG.getConstantFP(0x1.6c16c1852b7b0p-10, DL, MVT::f64),
2995 Flags);
2996 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2997 DAG.getConstantFP(0x1.1111111122322p-7, DL, MVT::f64), Flags);
2998 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
2999 DAG.getConstantFP(0x1.55555555502a1p-5, DL, MVT::f64), Flags);
3000 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3001 DAG.getConstantFP(0x1.5555555555511p-3, DL, MVT::f64), Flags);
3002 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P,
3003 DAG.getConstantFP(0x1.000000000000bp-1, DL, MVT::f64), Flags);
3004
3005 SDValue One = DAG.getConstantFP(1.0, DL, MVT::f64);
3006
3007 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3008 P = DAG.getNode(ISD::FMA, DL, MVT::f64, T, P, One, Flags);
3009
3010 // z = ldexp(p, (int)dn)
3011 SDValue DNInt = DAG.getNode(ISD::FP_TO_SINT, DL, MVT::i32, DN);
3012 SDValue Z = DAG.getNode(ISD::FLDEXP, DL, MVT::f64, P, DNInt, Flags);
3013
// Unordered compares (SETULE/SETUGE) are also true for NaN inputs, so a
// NaN x falls through both guards and keeps the computed Z.
3014 // Overflow/underflow guards
3015 SDValue CondHi = DAG.getSetCC(
3016 DL, MVT::i1, X, DAG.getConstantFP(1024.0, DL, MVT::f64), ISD::SETULE);
3017
3018 if (!Flags.hasNoInfs()) {
3019 SDValue PInf = DAG.getConstantFP(std::numeric_limits<double>::infinity(),
3020 DL, MVT::f64);
3021 Z = DAG.getSelect(DL, MVT::f64, CondHi, Z, PInf, Flags);
3022 }
3023
3024 SDValue CondLo = DAG.getSetCC(
3025 DL, MVT::i1, X, DAG.getConstantFP(-1075.0, DL, MVT::f64), ISD::SETUGE);
3026 SDValue Zero = DAG.getConstantFP(0.0, DL, MVT::f64);
3027 Z = DAG.getSelect(DL, MVT::f64, CondLo, Z, Zero, Flags);
3028
3029 return Z;
3030}
3031
// NOTE(review): the defining signature line is not visible in this listing.
// From the body: lowers an exp2-style node. f64 defers to the full software
// expansion; f16 promotes to f32 and uses the native instruction; f32 uses
// v_exp_f32 directly, with input offset / result rescale when denormal
// inputs must be honored.
3033 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
3034 // If we have to handle denormals, scale up the input and adjust the result.
3035
3036 EVT VT = Op.getValueType();
3037 if (VT == MVT::f64)
3038 return lowerFEXPF64(Op, DAG);
3039
3040 SDLoc SL(Op);
3041 SDValue Src = Op.getOperand(0);
3042 SDNodeFlags Flags = Op->getFlags();
3043
3044 if (VT == MVT::f16) {
3045 // Nothing in half is a denormal when promoted to f32.
3046 assert(!isTypeLegal(MVT::f16));
// Promote to f32, evaluate natively, round back to half.
// (The local is named "Log" but holds the EXP result — naming artifact.)
3047 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
3048 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
3049 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
3050 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3051 }
3052
3053 assert(VT == MVT::f32);
3054
3055 if (!needsDenormHandlingF32(DAG, Src, Flags))
3056 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
3057
3058 // bool needs_scaling = x < -0x1.f80000p+6f;
3059 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
3060
3061 // -nextafter(128.0, -1)
3062 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
3063
3064 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3065
3066 SDValue NeedsScaling =
3067 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
3068
// Shift very negative inputs up by 64 so the hardware result stays normal,
// then multiply by 2^-64 afterwards to undo the bias.
3069 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3070 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3071
3072 SDValue AddOffset =
3073 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
3074
3075 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
3076 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
3077
3078 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
3079 SDValue One = DAG.getConstantFP(1.0, SL, VT);
3080 SDValue ResultScale =
3081 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
3082
3083 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
3084}
3085
// Shared fast-path body for exp/exp10: reduce to exp2 via a single multiply
// by log2(e) (exp) or log2(10) (exp10). No denormal or range handling here.
// For f32, uses the native AMDGPUISD::EXP node; otherwise emits ISD::FEXP2.
3087 SelectionDAG &DAG,
3088 SDNodeFlags Flags,
3089 bool IsExp10) const {
3090 // exp(x) -> exp2(M_LOG2E_F * x);
3091 // exp10(x) -> exp2(log2(10) * x);
3092 EVT VT = X.getValueType();
3093 SDValue Const =
3094 DAG.getConstantFP(IsExp10 ? 0x1.a934f0p+1f : numbers::log2e, SL, VT);
3095
3096 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Const, Flags);
3097 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
3098 : (unsigned)ISD::FEXP2,
3099 SL, VT, Mul, Flags);
3100}
3101
// Fast (approx-funcs) lowering of exp. When f32 denormal inputs must be
// handled, biases very negative inputs by +64 before the multiply by
// log2(e), then rescales the result; otherwise defers to the plain
// exp2-reduction in lowerFEXPUnsafeImpl.
3103 SelectionDAG &DAG,
3104 SDNodeFlags Flags) const {
3105 EVT VT = X.getValueType();
3106 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags))
3107 return lowerFEXPUnsafeImpl(X, SL, DAG, Flags, /*IsExp10=*/false);
3108
3109 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3110
// Threshold below which exp(x) would go denormal; such inputs are shifted
// up by 64 and the result multiplied by a compensating constant
// (0x1.969d48p-93f — presumably exp(-64) folded with 2^k; TODO confirm).
3111 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
3112 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3113
3114 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
3115
3116 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3117
3118 SDValue AdjustedX =
3119 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3120
3121 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
3122 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
3123
3124 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
3125
3126 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
3127 SDValue AdjustedResult =
3128 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
3129
3130 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
3131 Flags);
3132}
3133
3134/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
3135/// handled correctly.
///
/// Splits log2(10) into a high part (exactly representable product) and a
/// low correction part, computing exp10(x) = exp2(x*K0) * exp2(x*K1); with
/// an extra input-bias/result-rescale path when f32 denormals matter.
3137 SelectionDAG &DAG,
3138 SDNodeFlags Flags) const {
3139 const EVT VT = X.getValueType();
3140
3141 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
3142 : static_cast<unsigned>(ISD::FEXP2);
3143
3144 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3145 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3146 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3147 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3148
3149 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3150 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3151 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3152 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3153 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3154 }
3155
3156 // bool s = x < -0x1.2f7030p+5f;
3157 // x += s ? 0x1.0p+5f : 0.0f;
3158 // exp10 = exp2(x * 0x1.a92000p+1f) *
3159 // exp2(x * 0x1.4f0978p-11f) *
3160 // (s ? 0x1.9f623ep-107f : 1.0f);
3161
3162 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3163
3164 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3165 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3166
// Bias the input by +32 so the exp2 results stay normal, then compensate
// with the 2^-107-scale constant below.
3167 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3168 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3169 SDValue AdjustedX =
3170 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3171
3172 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3173 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3174
3175 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3176 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3177 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3178 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3179
3180 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3181
3182 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3183 SDValue AdjustedResult =
3184 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3185
3186 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3187 Flags);
3188}
3189
// Main lowering for ISD::FEXP / ISD::FEXP10 (signature line not visible in
// this listing). Dispatch: f64 -> software expansion; approx-funcs allowed
// -> unsafe fast paths; f16 -> promote to f32 and round; f32 -> accurate
// Cody-Waite-style argument reduction against 2^(n/64), evaluated via the
// native exp2, with explicit underflow/overflow clamps.
3191 EVT VT = Op.getValueType();
3192
3193 if (VT == MVT::f64)
3194 return lowerFEXPF64(Op, DAG);
3195
3196 SDLoc SL(Op);
3197 SDValue X = Op.getOperand(0);
3198 SDNodeFlags Flags = Op->getFlags();
3199 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3200
3201 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3202 // library behavior. Also, is known-not-daz source sufficient?
3203 if (allowApproxFunc(DAG, Flags)) { // TODO: Does this really require fast?
3204 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3205 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3206 }
3207
3208 if (VT.getScalarType() == MVT::f16) {
3209 if (VT.isVector())
3210 return SDValue();
3211
3212 // Nothing in half is a denormal when promoted to f32.
3213 //
3214 // exp(f16 x) ->
3215 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3216 //
3217 // exp10(f16 x) ->
3218 // fptrunc (v_exp_f32 (fmul (fpext x), log2(10)))
3219 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3220 SDValue Lowered = lowerFEXPUnsafeImpl(Ext, SL, DAG, Flags, IsExp10);
3221 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3222 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3223 }
3224
3225 assert(VT == MVT::f32);
3226
3227 // Algorithm:
3228 //
3229 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3230 //
3231 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3232 // n = 64*m + j, 0 <= j < 64
3233 //
3234 // e^x = 2^((64*m + j + f)/64)
3235 // = (2^m) * (2^(j/64)) * 2^(f/64)
3236 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3237 //
3238 // f = x*(64/ln(2)) - n
3239 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3240 //
3241 // e^x = (2^m) * (2^(j/64)) * e^r
3242 //
3243 // (2^(j/64)) is precomputed
3244 //
3245 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3246 // e^r = 1 + q
3247 //
3248 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3249 //
3250 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
3251 SDNodeFlags FlagsNoContract = Flags;
3252 FlagsNoContract.setAllowContract(false);
3253
// PH/PL form a double-float (hi/lo) product x*log2(e) (or x*log2(10)).
3254 SDValue PH, PL;
3255 if (Subtarget->hasFastFMAF32()) {
// With fast FMA the constant is split into c + cc (49 combined bits) and
// the low part is recovered with two FMAs.
3256 const float c_exp = numbers::log2ef;
3257 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3258 const float c_exp10 = 0x1.a934f0p+1f;
3259 const float cc_exp10 = 0x1.2f346ep-24f;
3260
3261 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3262 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3263
3264 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3265 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3266 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3267 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3268 } else {
// Without fast FMA, split x itself into a 12-bit-truncated high part and
// a remainder so each partial product is exact.
3269 const float ch_exp = 0x1.714000p+0f;
3270 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3271
3272 const float ch_exp10 = 0x1.a92000p+1f;
3273 const float cl_exp10 = 0x1.4f0978p-11f;
3274
3275 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3276 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3277
3278 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3279 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3280 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3281 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3282 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3283
3284 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3285
3286 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3287 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3288 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3289 }
3290
3291 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3292
3293 // It is unsafe to contract this fsub into the PH multiply.
3294 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3295
3296 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3297 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3298 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3299
// Recombine: 2^E * exp2(fractional part).
3300 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3301
// Below this input the true result underflows to zero; clamp explicitly.
3302 SDValue UnderflowCheckConst =
3303 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3304
3305 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3306 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3307 SDValue Underflow =
3308 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3309
3310 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3311
3312 if (!Flags.hasNoInfs()) {
// Above this input the true result overflows; clamp to +infinity.
3313 SDValue OverflowCheckConst =
3314 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3315 SDValue Overflow =
3316 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3317 SDValue Inf =
3319 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3320 }
3321
3322 return R;
3323}
3324
3325static bool isCtlzOpc(unsigned Opc) {
3326 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3327}
3328
3329static bool isCttzOpc(unsigned Opc) {
3330 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3331}
3332
// Widen i8/i16 ctlz/cttz-style results to an i32 operation (signature line
// not visible in this listing). CTLZ_ZERO_UNDEF pre-shifts the value into
// the top bits so the 32-bit count is already correct; the other variant
// zero-extends and subtracts the width difference. Returns {} for types
// this path does not handle.
3334 SelectionDAG &DAG) const {
3335 auto SL = SDLoc(Op);
3336 auto Opc = Op.getOpcode();
3337 auto Arg = Op.getOperand(0u);
3338 auto ResultVT = Op.getValueType();
3339
3340 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3341 return {};
3342
3344 assert(ResultVT == Arg.getValueType());
3345
3346 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3347 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3348 SDValue NewOp;
3349
3350 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
// Left-align the narrow value; leading zeros of the shifted i32 equal
// the leading zeros of the original narrow value.
3351 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3352 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3353 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3354 } else {
// Zero-extend, count in 32 bits, then remove the extra high-bit count.
3355 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3356 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3357 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3358 }
3359
3360 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3361}
3362
// Lower 32/64-bit ctlz/cttz to the target ffbh/ffbl nodes (signature line
// not visible in this listing). i32 and uniform i64 use a single count
// (clamped with umin unless the zero-undef form); divergent i64 splits
// into halves and combines the two 32-bit counts.
3364 SDLoc SL(Op);
3365 SDValue Src = Op.getOperand(0);
3366
3367 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3368 bool Ctlz = isCtlzOpc(Op.getOpcode());
3369 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3370
3371 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3372 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3373 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3374
3375 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3376 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3377 // (cttz hi:lo) -> (umin (ffbl src), 32)
3378 // (ctlz_zero_undef src) -> (ffbh src)
3379 // (cttz_zero_undef src) -> (ffbl src)
3380
3381 // 64-bit scalar version produce 32-bit result
3382 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3383 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3384 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3385 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3386 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3387 if (!ZeroUndef) {
// Hardware returns -1 (all ones) for a zero input; umin clamps that to
// the bit width so the zero case is well-defined.
3388 const SDValue ConstVal = DAG.getConstant(
3389 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3390 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3391 }
3392 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3393 }
3394
3395 SDValue Lo, Hi;
3396 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3397
3398 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3399 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3400
3401 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3402 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3403 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3404 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
3405
// uaddsat keeps the -1 "no bits" result saturated instead of wrapping.
3406 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3407 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3408 if (Ctlz)
3409 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3410 else
3411 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3412
3413 SDValue NewOpr;
3414 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3415 if (!ZeroUndef) {
3416 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3417 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3418 }
3419
3420 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3421}
3422
// Lower an i32 count-leading-sign-bits operation (signature line not
// visible in this listing; the assert names it LowerCTLS) using the
// amdgcn.sffbh intrinsic: clamp the "find first set/sign bit high" result
// to 32, then subtract one (add all-ones) to get the redundant-sign count.
3424 SDLoc SL(Op);
3425 SDValue Src = Op.getOperand(0);
3426 assert(Src.getValueType() == MVT::i32 && "LowerCTLS only supports i32");
3427 SDValue Ffbh = DAG.getNode(
3428 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3429 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Src);
3430 SDValue Clamped = DAG.getNode(ISD::UMIN, SL, MVT::i32, Ffbh,
3431 DAG.getConstant(32, SL, MVT::i32));
3432 return DAG.getNode(ISD::ADD, SL, MVT::i32, Clamped,
3433 DAG.getAllOnesConstant(SL, MVT::i32));
3434}
3435
// Convert an i64 to f32 (signature line not visible in this listing;
// \p Signed selects sitofp vs uitofp). See the pseudo code below:
// normalize the 64-bit value so the significant bits land in the high
// 32 bits, convert that as a 32-bit integer, then scale by 2^(32-shamt).
3437 bool Signed) const {
3438 // The regular method converting a 64-bit integer to float roughly consists of
3439 // 2 steps: normalization and rounding. In fact, after normalization, the
3440 // conversion from a 64-bit integer to a float is essentially the same as the
3441 // one from a 32-bit integer. The only difference is that it has more
3442 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3443 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3444 // converted into the correct float number. The basic steps for the unsigned
3445 // conversion are illustrated in the following pseudo code:
3446 //
3447 // f32 uitofp(i64 u) {
3448 // i32 hi, lo = split(u);
3449 // // Only count the leading zeros in hi as we have native support of the
3450 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3451 // // reduced to a 32-bit one automatically.
3452 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3453 // u <<= shamt;
3454 // hi, lo = split(u);
3455 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3456 // // convert it as a 32-bit integer and scale the result back.
3457 // return uitofp(hi) * 2^(32 - shamt);
3458 // }
3459 //
3460 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3461 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3462 // converted instead followed by negation based its sign bit.
3463
3464 SDLoc SL(Op);
3465 SDValue Src = Op.getOperand(0);
3466
3467 SDValue Lo, Hi;
3468 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3469 SDValue Sign;
3470 SDValue ShAmt;
3471 if (Signed && Subtarget->isGCN()) {
3472 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3473 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3474 // account. That is, the maximal shift is
3475 // - 32 if Lo and Hi have opposite signs;
3476 // - 33 if Lo and Hi have the same sign.
3477 //
3478 // Or, MaxShAmt = 33 + OppositeSign, where
3479 //
3480 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3481 // - -1 if Lo and Hi have opposite signs; and
3482 // - 0 otherwise.
3483 //
3484 // All in all, ShAmt is calculated as
3485 //
3486 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3487 //
3488 // or
3489 //
3490 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3491 //
3492 // to reduce the critical path.
3493 SDValue OppositeSign = DAG.getNode(
3494 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3495 DAG.getConstant(31, SL, MVT::i32));
3496 SDValue MaxShAmt =
3497 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3498 OppositeSign);
3499 // Count the leading sign bits.
3500 ShAmt = DAG.getNode(
3501 ISD::INTRINSIC_WO_CHAIN, SL, MVT::i32,
3502 DAG.getTargetConstant(Intrinsic::amdgcn_sffbh, SL, MVT::i32), Hi);
3503 // Different from unsigned conversion, the shift should be one bit less to
3504 // preserve the sign bit.
3505 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3506 DAG.getConstant(1, SL, MVT::i32));
3507 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3508 } else {
3509 if (Signed) {
3510 // Without 'ffbh_i32', only leading zeros could be counted. Take the
3511 // absolute value first.
3512 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3513 DAG.getConstant(63, SL, MVT::i64));
3514 SDValue Abs =
3515 DAG.getNode(ISD::XOR, SL, MVT::i64,
3516 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3517 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3518 }
3519 // Count the leading zeros.
3520 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3521 // The shift amount for signed integers is [0, 32].
3522 }
3523 // Normalize the given 64-bit integer.
3524 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3525 // Split it again.
3526 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3527 // Calculate the adjust bit for rounding.
3528 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3529 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3530 DAG.getConstant(1, SL, MVT::i32), Lo);
3531 // Get the 32-bit normalized integer.
3532 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3533 // Convert the normalized 32-bit integer into f32.
3534
3535 bool UseLDEXP = isOperationLegal(ISD::FLDEXP, MVT::f32);
3536 unsigned Opc = Signed && UseLDEXP ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3537 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3538
3539 // Finally, need to scale back the converted floating number as the original
3540 // 64-bit integer is converted as a 32-bit one.
3541 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3542 ShAmt);
3543 // On GCN, use LDEXP directly.
3544 if (UseLDEXP)
3545 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3546
3547 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3548 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3549 // exponent is enough to avoid overflowing into the sign bit.
3550 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3551 DAG.getConstant(23, SL, MVT::i32));
3552 SDValue IVal =
3553 DAG.getNode(ISD::ADD, SL, MVT::i32,
3554 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3555 if (Signed) {
3556 // Set the sign bit.
3557 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3558 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3559 DAG.getConstant(31, SL, MVT::i32));
3560 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3561 }
3562 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3563}
3564
// Convert an i64 to f64 (signature line not visible in this listing).
// Both halves fit losslessly in f64's 53-bit mantissa: convert Hi
// (CvtHi; its defining getNode line is elided in this listing —
// presumably sitofp/uitofp of Hi depending on 'Signed'), scale it by
// 2^32 with ldexp, and add the unsigned conversion of Lo.
3566 bool Signed) const {
3567 SDLoc SL(Op);
3568 SDValue Src = Op.getOperand(0);
3569
3570 SDValue Lo, Hi;
3571 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3572
3574 SL, MVT::f64, Hi);
3575
3576 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3577
3578 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3579 DAG.getConstant(32, SL, MVT::i32));
3580 // TODO: Should this propagate fast-math-flags?
3581 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3582}
3583
// Lower ISD::UINT_TO_FP (signature line not visible in this listing).
// i16 sources are zero-extended to i32; bf16 results go through f32 and
// round; i64->f16/f32/f64 dispatch to the dedicated helpers below.
3585 SelectionDAG &DAG) const {
3586 // TODO: Factor out code common with LowerSINT_TO_FP.
3587 EVT DestVT = Op.getValueType();
3588 SDValue Src = Op.getOperand(0);
3589 EVT SrcVT = Src.getValueType();
3590
3591 if (SrcVT == MVT::i16) {
3592 if (DestVT == MVT::f16)
3593 return Op;
3594 SDLoc DL(Op);
3595
3596 // Promote src to i32
3597 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3598 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3599 }
3600
3601 if (DestVT == MVT::bf16) {
// No direct int->bf16 conversion; go through f32 and round.
3602 SDLoc SL(Op);
3603 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3604 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3605 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3606 }
3607
3608 if (SrcVT != MVT::i64)
3609 return Op;
3610
3611 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3612 SDLoc DL(Op);
3613
3614 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3615 SDValue FPRoundFlag =
3616 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3617 SDValue FPRound =
3618 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3619
3620 return FPRound;
3621 }
3622
3623 if (DestVT == MVT::f32)
3624 return LowerINT_TO_FP32(Op, DAG, false);
3625
3626 assert(DestVT == MVT::f64);
3627 return LowerINT_TO_FP64(Op, DAG, false);
3628}
3629
// Lower ISD::SINT_TO_FP (signature line not visible in this listing).
// Mirrors LowerUINT_TO_FP but sign-extends i16 sources; i64 destinations
// dispatch to LowerINT_TO_FP32/64 with Signed=true.
3631 SelectionDAG &DAG) const {
3632 EVT DestVT = Op.getValueType();
3633
3634 SDValue Src = Op.getOperand(0);
3635 EVT SrcVT = Src.getValueType();
3636
3637 if (SrcVT == MVT::i16) {
3638 if (DestVT == MVT::f16)
3639 return Op;
3640
3641 SDLoc DL(Op);
3642 // Promote src to i32
3643 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3644 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3645 }
3646
3647 if (DestVT == MVT::bf16) {
// No direct int->bf16 conversion; go through f32 and round.
3648 SDLoc SL(Op);
3649 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3650 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3651 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3652 }
3653
3654 if (SrcVT != MVT::i64)
3655 return Op;
3656
3657 // TODO: Factor out code common with LowerUINT_TO_FP.
3658
3659 if (DestVT == MVT::f16 && isTypeLegal(MVT::f16)) {
3660 SDLoc DL(Op);
// Note: this local Src shadows the outer Src with the same value.
3661 SDValue Src = Op.getOperand(0);
3662
3663 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3664 SDValue FPRoundFlag =
3665 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3666 SDValue FPRound =
3667 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3668
3669 return FPRound;
3670 }
3671
3672 if (DestVT == MVT::f32)
3673 return LowerINT_TO_FP32(Op, DAG, true);
3674
3675 assert(DestVT == MVT::f64);
3676 return LowerINT_TO_FP64(Op, DAG, true);
3677}
3678
// Convert f32/f64 to i64 (signature line not visible in this listing;
// \p Signed selects fptosi vs fptoui). See the pseudo code below: split
// the truncated value into two 32-bit halves via multiply-by-2^-32 /
// floor / fma, convert each half, then (for signed f32) restore the sign.
3680 bool Signed) const {
3681 SDLoc SL(Op);
3682
3683 SDValue Src = Op.getOperand(0);
3684 EVT SrcVT = Src.getValueType();
3685
3686 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3687
3688 // The basic idea of converting a floating point number into a pair of 32-bit
3689 // integers is illustrated as follows:
3690 //
3691 // tf := trunc(val);
3692 // hif := floor(tf * 2^-32);
3693 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3694 // hi := fptoi(hif);
3695 // lo := fptoi(lof);
3696 //
3697 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3698 SDValue Sign;
3699 if (Signed && SrcVT == MVT::f32) {
3700 // However, a 32-bit floating point number has only 23 bits mantissa and
3701 // it's not enough to hold all the significant bits of `lof` if val is
3702 // negative. To avoid the loss of precision, We need to take the absolute
3703 // value after truncating and flip the result back based on the original
3704 // signedness.
3705 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3706 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3707 DAG.getConstant(31, SL, MVT::i32));
3708 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3709 }
3710
// K0 = 2^-32, K1 = -2^32, in the source float type.
3711 SDValue K0, K1;
3712 if (SrcVT == MVT::f64) {
3713 K0 = DAG.getConstantFP(
3714 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3715 SrcVT);
3716 K1 = DAG.getConstantFP(
3717 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3718 SrcVT);
3719 } else {
3720 K0 = DAG.getConstantFP(
3721 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3722 K1 = DAG.getConstantFP(
3723 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3724 }
3725 // TODO: Should this propagate fast-math-flags?
3726 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3727
3728 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3729
// Fma = Trunc - FloorMul * 2^32, i.e. the (always non-negative) low half.
3730 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3731
// The opcode-selection expression continues on line 3733 (elided in this
// listing) — the high half is converted signed only for signed f64.
3732 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3734 SL, MVT::i32, FloorMul);
3735 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3736
3737 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3738 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3739
3740 if (Signed && SrcVT == MVT::f32) {
3741 assert(Sign);
3742 // Flip the result based on the signedness, which is either all 0s or 1s.
3743 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3744 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3745 // r := xor(r, sign) - sign;
3746 Result =
3747 DAG.getNode(ISD::SUB, SL, MVT::i64,
3748 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3749 }
3750
3751 return Result;
3752}
3753
// Lower FP_TO_FP16 (signature line not visible in this listing): f32 uses
// the target node (exposes known-bits), approx-funcs defers to the generic
// expansion, and the remaining f64 case uses the bit-exact software path.
3755 SDLoc DL(Op);
3756 SDValue N0 = Op.getOperand(0);
3757
3758 // Convert to target node to get known bits
3759 if (N0.getValueType() == MVT::f32)
3760 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3761
3762 if (Op->getFlags().hasApproximateFuncs()) {
3763 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3764 return SDValue();
3765 }
3766
3767 return LowerF64ToF16Safe(N0, DL, DAG);
3768}
3769
3770// return node in i32
// Bit-exact software f64 -> f16 conversion with round-to-nearest-even,
// built from integer ops on the f64 bit pattern: rebias the exponent,
// collapse the 52-bit mantissa to 11 rounding-significant bits, handle
// the subnormal-result and overflow/NaN cases, then reattach the sign.
// (Signature line is split across the listing; result is the half pattern
// in the low 16 bits of an i32.)
3772 SelectionDAG &DAG) const {
3773 assert(Src.getSimpleValueType() == MVT::f64);
3774
3775 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3776 // TODO: We can generate better code for True16.
3777 const unsigned ExpMask = 0x7ff;
3778 const unsigned ExpBiasf64 = 1023;
3779 const unsigned ExpBiasf16 = 15;
3780 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3781 SDValue One = DAG.getConstant(1, DL, MVT::i32);
// UH = high 32 bits (sign, exponent, mantissa top); U = low 32 bits.
3782 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3783 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3784 DAG.getConstant(32, DL, MVT::i64));
3785 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3786 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3787 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3788 DAG.getConstant(20, DL, MVT::i64));
3789 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3790 DAG.getConstant(ExpMask, DL, MVT::i32));
3791 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3792 // add the f16 bias (15) to get the biased exponent for the f16 format.
3793 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3794 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3795
// M: mantissa bits positioned for the half format, with an extra
// round/guard bit kept in bit 0.
3796 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3797 DAG.getConstant(8, DL, MVT::i32));
3798 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3799 DAG.getConstant(0xffe, DL, MVT::i32));
3800
// Sticky bit: OR of every mantissa bit discarded below the kept ones.
3801 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3802 DAG.getConstant(0x1ff, DL, MVT::i32));
3803 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3804
3805 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3806 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3807
// I: the NaN/Inf result pattern (quiet-NaN payload bit if M != 0).
3808 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3809 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3810 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3811 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3812
3813 // N = M | (E << 12);
3814 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3815 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3816 DAG.getConstant(12, DL, MVT::i32)));
3817
// Subnormal path: shift the (implicit-one-restored) significand right by
// B and keep a sticky bit so rounding below stays correct.
3818 // B = clamp(1-E, 0, 13);
3819 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3820 One, E);
3821 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3822 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3823 DAG.getConstant(13, DL, MVT::i32));
3824
3825 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3826 DAG.getConstant(0x1000, DL, MVT::i32));
3827
3828 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3829 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3830 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3831 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3832
// Choose subnormal (D) vs normal (N) encoding, then round to nearest
// even using the two low bits plus sticky.
3833 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3834 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3835 DAG.getConstant(0x7, DL, MVT::i32));
3836 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3837 DAG.getConstant(2, DL, MVT::i32));
3838 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3839 One, Zero, ISD::SETEQ);
3840 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3841 One, Zero, ISD::SETGT);
3842 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3843 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3844
// E > 30: overflow to infinity; E == 1039 (f64 exponent all-ones after
// rebias): propagate Inf/NaN pattern I.
3845 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3846 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3847 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3848 I, V, ISD::SETEQ);
3849
3850 // Extract the sign bit.
3851 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3852 DAG.getConstant(16, DL, MVT::i32));
3853 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3854 DAG.getConstant(0x8000, DL, MVT::i32));
3855
3856 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3857}
3858
// Common lowering for FP_TO_SINT / FP_TO_UINT: pass through conversions the
// hardware selects natively, promote small integer results through i32, and
// route wide results to the i64 expansion.
// NOTE(review): the first signature line was lost in extraction; presumably
// `SDValue AMDGPUTargetLowering::LowerFP_TO_INT(SDValue Op,` — confirm
// against the original source.
                                             SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DestVT = Op.getValueType();

  // Will be selected natively
  if (SrcVT == MVT::f16 && DestVT == MVT::i16)
    return Op;

  // No direct bf16 conversion exists: extend to f32 and redo the same opcode
  // on the promoted source.
  if (SrcVT == MVT::bf16) {
    SDLoc DL(Op);
    SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
  }

  // Promote i16 to i32
  if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
    SDLoc DL(Op);

    // Convert at i32 width, then truncate the integer result down to i16.
    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
    return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
  }

  if (DestVT != MVT::i64)
    return Op;

  // An f16 value (or an f32 that is itself an extended f16) fits in an i32
  // conversion; do that and widen the integer result to i64.
  if (SrcVT == MVT::f16 ||
      (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
    SDLoc DL(Op);

    SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
    unsigned Ext =
    // NOTE(review): a source line is missing here in this extraction;
    // presumably it selects ISD::SIGN_EXTEND for FP_TO_SINT and
    // ISD::ZERO_EXTEND otherwise — confirm against the original source.
    return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
  }

  // Full-width f32/f64 -> i64 goes through the long expansion.
  if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
    return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);

  return SDValue();
}
3902
// Lowering for FP_TO_SINT_SAT / FP_TO_UINT_SAT: perform the conversion at a
// width the hardware supports (i16 or i32), clamp to the requested saturation
// width, then extend/truncate into the destination type.
// NOTE(review): the first signature line was lost in extraction; presumably
// `SDValue AMDGPUTargetLowering::LowerFP_TO_INT_SAT(SDValue Op,` — confirm
// against the original source.
                                                 SelectionDAG &DAG) const {
  SDValue Src = Op.getOperand(0);
  unsigned OpOpcode = Op.getOpcode();
  EVT SrcVT = Src.getValueType();
  EVT DstVT = Op.getValueType();
  // Operand 1 is a VTSDNode carrying the saturation width.
  SDValue SatVTOp = Op.getNode()->getOperand(1);
  EVT SatVT = cast<VTSDNode>(SatVTOp)->getVT();
  SDLoc DL(Op);

  uint64_t DstWidth = DstVT.getScalarSizeInBits();
  uint64_t SatWidth = SatVT.getScalarSizeInBits();
  assert(SatWidth <= DstWidth && "Saturation width cannot exceed result width");

  // Will be selected natively
  if (DstVT == MVT::i32 && SatWidth == DstWidth &&
      (SrcVT == MVT::f32 || SrcVT == MVT::f64))
    return Op;

  if (DstVT == MVT::i16 && SatWidth == DstWidth && SrcVT == MVT::f16)
    return Op;

  // Perform all saturation at selected width (i16 or i32) and truncate
  if (SatWidth < DstWidth && SatWidth <= 32) {
    // For f16 conversion with sub-i16 saturation perform saturation
    // at i16, if available in the target. This removes the need for extra f16
    // to f32 conversion. For all the others use i32.
    MVT ResultVT =
        Subtarget->has16BitInsts() && SrcVT == MVT::f16 && SatWidth < 16
            ? MVT::i16
            : MVT::i32;

    const SDValue ResultVTOp = DAG.getValueType(ResultVT);
    const uint64_t ResultWidth = ResultVT.getScalarSizeInBits();

    // First, convert input float into selected integer (i16 or i32)
    SDValue FpToInt = DAG.getNode(OpOpcode, DL, ResultVT, Src, ResultVTOp);
    SDValue IntSatVal;

    // Then, clamp at the saturation width using either i16 or i32 instructions
    if (Op.getOpcode() == ISD::FP_TO_SINT_SAT) {
      // Signed clamp: SMIN against the signed max of SatWidth, then SMAX
      // against the signed min, both sign-extended into the working width.
      SDValue MinConst = DAG.getConstant(
          APInt::getSignedMaxValue(SatWidth).sext(ResultWidth), DL, ResultVT);
      SDValue MaxConst = DAG.getConstant(
          APInt::getSignedMinValue(SatWidth).sext(ResultWidth), DL, ResultVT);
      SDValue MinVal = DAG.getNode(ISD::SMIN, DL, ResultVT, FpToInt, MinConst);
      IntSatVal = DAG.getNode(ISD::SMAX, DL, ResultVT, MinVal, MaxConst);
    } else {
      // Unsigned clamp: only the upper bound is needed.
      SDValue MinConst = DAG.getConstant(
          APInt::getMaxValue(SatWidth).zext(ResultWidth), DL, ResultVT);
      IntSatVal = DAG.getNode(ISD::UMIN, DL, ResultVT, FpToInt, MinConst);
    }

    // Finally, after saturating at i16 or i32 fit into the destination type
    return DAG.getExtOrTrunc(OpOpcode == ISD::FP_TO_SINT_SAT, IntSatVal, DL,
                             DstVT);
  }

  // SatWidth == DstWidth

  // Saturate at i32 for i64 dst and f16/bf16 src (will invoke f16 promotion
  // below)
  if (DstVT == MVT::i64 &&
      (SrcVT == MVT::f16 || SrcVT == MVT::bf16 ||
       (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP))) {
    const SDValue Int32VTOp = DAG.getValueType(MVT::i32);
    return DAG.getNode(OpOpcode, DL, DstVT, Src, Int32VTOp);
  }

  // Promote f16/bf16 src to f32 for i32 conversion
  if (DstVT == MVT::i32 && (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
    SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
    return DAG.getNode(Op.getOpcode(), DL, DstVT, PromotedSrc, SatVTOp);
  }

  // For DstWidth < 16, promote i1 and i8 dst to i16 (if legal) with sub-i16
  // saturation. For DstWidth == 16, promote i16 dst to i32 with sub-i32
  // saturation; this covers i16.f32 and i16.f64
  if (DstWidth < 32) {
    // Note: this triggers SatWidth < DstWidth above to generate saturated
    // truncate by requesting MVT::i16/i32 destination with SatWidth < 16/32.
    MVT PromoteVT =
        (DstWidth < 16 && Subtarget->has16BitInsts()) ? MVT::i16 : MVT::i32;
    SDValue FpToInt = DAG.getNode(OpOpcode, DL, PromoteVT, Src, SatVTOp);
    return DAG.getNode(ISD::TRUNCATE, DL, DstVT, FpToInt);
  }

  // TODO: can we implement i64 dst for f32/f64?

  return SDValue();
}
3994
// Lower a vector SIGN_EXTEND_INREG by scalarizing: extract each element,
// sign-extend it in-register at the scalar type, and rebuild the vector.
// NOTE(review): the first signature line was lost in extraction; presumably
// `SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,` —
// confirm against the original source.
                                                     SelectionDAG &DAG) const {
  // Operand 1 is a VTSDNode giving the width being sign-extended from.
  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
  MVT VT = Op.getSimpleValueType();
  MVT ScalarVT = VT.getScalarType();

  assert(VT.isVector());

  SDValue Src = Op.getOperand(0);
  SDLoc DL(Op);

  // TODO: Don't scalarize on Evergreen?
  unsigned NElts = VT.getVectorNumElements();
  // NOTE(review): the declaration of Args (presumably a
  // SmallVector<SDValue, 8>) was lost in extraction — confirm against the
  // original source.
  DAG.ExtractVectorElements(Src, Args, 0, NElts);

  SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
  for (unsigned I = 0; I < NElts; ++I)
    Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);

  return DAG.getBuildVector(VT, DL, Args);
}
4017
4018//===----------------------------------------------------------------------===//
4019// Custom DAG optimizations
4020//===----------------------------------------------------------------------===//
4021
4022static bool isU24(SDValue Op, SelectionDAG &DAG) {
4023 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
4024}
4025
/// True if \p Op is known to fit in a signed 24-bit integer.
static bool isI24(SDValue Op, SelectionDAG &DAG) {
  EVT VT = Op.getValueType();
  return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
                                     // as unsigned 24-bit values.
  // NOTE(review): a source line is missing here in this extraction;
  // presumably `AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;` —
  // confirm against the original source.
}
4032
// Simplify a 24-bit multiply node (or the corresponding amdgcn intrinsic) by
// demanding only the low 24 bits of each operand; returns the rewritten node
// or SDValue() if nothing changed.
// NOTE(review): the signature lines (`simplifyMul24(SDNode *Node24, ...`)
// were lost in extraction — confirm against the original source.
  SelectionDAG &DAG = DCI.DAG;
  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
  bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;

  // Intrinsic nodes carry the intrinsic ID in operand 0, so the multiply
  // operands shift over by one.
  SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
  SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
  unsigned NewOpcode = Node24->getOpcode();
  if (IsIntrin) {
    // Map each intrinsic to its target-specific node so the rebuilt node is
    // a plain MUL*_24 / MULHI*_24.
    unsigned IID = Node24->getConstantOperandVal(0);
    switch (IID) {
    case Intrinsic::amdgcn_mul_i24:
      NewOpcode = AMDGPUISD::MUL_I24;
      break;
    case Intrinsic::amdgcn_mul_u24:
      NewOpcode = AMDGPUISD::MUL_U24;
      break;
    case Intrinsic::amdgcn_mulhi_i24:
      NewOpcode = AMDGPUISD::MULHI_I24;
      break;
    case Intrinsic::amdgcn_mulhi_u24:
      NewOpcode = AMDGPUISD::MULHI_U24;
      break;
    default:
      llvm_unreachable("Expected 24-bit mul intrinsic");
    }
  }

  // Only the low 24 bits of each operand contribute to the result.
  APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);

  // First try to simplify using SimplifyMultipleUseDemandedBits which allows
  // the operands to have other uses, but will only perform simplifications that
  // involve bypassing some nodes for this user.
  SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
  SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
  if (DemandedLHS || DemandedRHS)
    return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
                       DemandedLHS ? DemandedLHS : LHS,
                       DemandedRHS ? DemandedRHS : RHS);

  // Now try SimplifyDemandedBits which can simplify the nodes used by our
  // operands if this node is the only user.
  if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
    return SDValue(Node24, 0);
  if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
    return SDValue(Node24, 0);

  return SDValue();
}
4083
// Constant-fold a bitfield extract of \p Width bits at \p Offset from the
// 32-bit value \p Src0, producing an i32 constant.  The signedness of IntTy
// selects arithmetic vs. logical extraction.
// NOTE(review): the first signature line (naming constantFoldBFE and its
// leading parameters, presumably `SelectionDAG &DAG, IntTy Src0,
// uint32_t Offset,`) was lost in extraction — confirm against the original.
template <typename IntTy>
                          uint32_t Width, const SDLoc &DL) {
  if (Width + Offset < 32) {
    // Shift the field to the top, then shift back down so the cast's
    // signedness performs the sign/zero extension of the field.
    uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
    IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
    if constexpr (std::is_signed_v<IntTy>) {
      return DAG.getSignedConstant(Result, DL, MVT::i32);
    } else {
      return DAG.getConstant(Result, DL, MVT::i32);
    }
  }

  // Field reaches bit 31: a plain shift extracts it (IntTy's shift semantics
  // supply the extension).
  return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
}
4099
4100static bool hasVolatileUser(SDNode *Val) {
4101 for (SDNode *U : Val->users()) {
4102 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
4103 if (M->isVolatile())
4104 return true;
4105 }
4106 }
4107
4108 return false;
4109}
4110
// Decide whether a load/store of \p VT should be rewritten to use an
// equivalent integer memory type (see getEquivalentMemType).  Returns false
// for types that are already canonical, illegal sizes, or small scalars.
// NOTE(review): the signature line (presumably
// `bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {`) was
// lost in extraction — confirm against the original source.
  // i32 vectors are the canonical memory type.
  if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
    return false;

  // Non-byte-sized types cannot be re-expressed as an integer memory type.
  if (!VT.isByteSized())
    return false;

  unsigned Size = VT.getStoreSize();

  // Small scalars already load/store directly.
  if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
    return false;

  // Odd sizes that do not decompose into dwords are left alone.
  if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
    return false;

  return true;
}
4129
// Replace load of an illegal type with a bitcast from a load of a friendlier
// type.
// NOTE(review): the signature line (presumably
// `SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,`) was lost in
// extraction — confirm against the original source.
                                                 DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // NOTE(review): the declaration of LN (presumably
  // `LoadSDNode *LN = cast<LoadSDNode>(N);`) was lost in extraction.
  // Only simple, non-extending, non-volatile-consumed loads are handled.
  if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
    return SDValue();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = LN->getMemoryVT();

  unsigned Size = VT.getStoreSize();
  Align Alignment = LN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = LN->getAddressSpace();

    // Expand unaligned loads earlier than legalization. Due to visitation order
    // problems during legalization, the emitted instructions to pack and unpack
    // the bytes again are not eliminated in the case of an unaligned copy.
    // NOTE(review): the line opening this condition (presumably
    // `if (!allowsMisalignedMemoryAccesses(`) was lost in extraction.
        VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorLoad(SDValue(LN, 0), DAG);

      SDValue Ops[2];
      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);

      return DAG.getMergeValues(Ops, SDLoc(N));
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  // Load as the equivalent integer type and bitcast back to the original.
  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);

  SDValue NewLoad
    = DAG.getLoad(NewVT, SL, LN->getChain(),
                  LN->getBasePtr(), LN->getMemOperand());

  SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
  // Preserve the chain result by combining both values of the old load.
  DCI.CombineTo(N, BC, NewLoad.getValue(1));
  return SDValue(N, 0);
}
4182
// Replace store of an illegal type with a store of a bitcast to a friendlier
// type.
// NOTE(review): the signature line (presumably
// `SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,`) was lost in
// extraction — confirm against the original source.
                                                  DAGCombinerInfo &DCI) const {
  if (!DCI.isBeforeLegalize())
    return SDValue();

  // NOTE(review): the declaration of SN (presumably
  // `StoreSDNode *SN = cast<StoreSDNode>(N);`) was lost in extraction.
  if (!SN->isSimple() || !ISD::isNormalStore(SN))
    return SDValue();

  EVT VT = SN->getMemoryVT();
  unsigned Size = VT.getStoreSize();

  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  Align Alignment = SN->getAlign();
  if (Alignment < Size && isTypeLegal(VT)) {
    unsigned IsFast;
    unsigned AS = SN->getAddressSpace();

    // Expand unaligned stores earlier than legalization. Due to visitation
    // order problems during legalization, the emitted instructions to pack and
    // unpack the bytes again are not eliminated in the case of an unaligned
    // copy.
    // NOTE(review): the line opening this condition (presumably
    // `if (!allowsMisalignedMemoryAccesses(`) was lost in extraction.
        VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
      if (VT.isVector())
        return SplitVectorStore(SDValue(SN, 0), DAG);

      return expandUnalignedStore(SN, DAG);
    }

    if (!IsFast)
      return SDValue();
  }

  if (!shouldCombineMemoryType(VT))
    return SDValue();

  // Store through the equivalent integer type instead.
  EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
  SDValue Val = SN->getValue();

  //DCI.AddToWorklist(Val.getNode());

  // If the stored value has other uses, give those uses the round-tripped
  // (bitcast back) value so everything stays consistent.
  bool OtherUses = !Val.hasOneUse();
  SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
  if (OtherUses) {
    SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
    DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
  }

  return DAG.getStore(SN->getChain(), SL, CastVal,
                      SN->getBasePtr(), SN->getMemOperand());
}
4238
// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
// issues.
// NOTE(review): the signature line (presumably
// `SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,`) was
// lost in extraction — confirm against the original source.
                                                  DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  // (vt2 (assertzext (truncate vt0:x), vt1)) ->
  // (vt2 (truncate (assertzext vt0:x, vt1)))
  if (N0.getOpcode() == ISD::TRUNCATE) {
    SDValue N1 = N->getOperand(1);
    EVT ExtVT = cast<VTSDNode>(N1)->getVT();
    SDLoc SL(N);

    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();
    // Only hoist the assert when the pre-truncate type is wide enough to
    // carry the asserted width.
    if (SrcVT.bitsGE(ExtVT)) {
      SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
      return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
    }
  }

  return SDValue();
}
4264
// DAG combine for INTRINSIC_WO_CHAIN nodes: forwards the 24-bit multiply
// intrinsics to simplifyMul24 and applies a few per-intrinsic folds.
// NOTE(review): the first signature line (presumably
// `SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(`) was lost
// in extraction — confirm against the original source.
    SDNode *N, DAGCombinerInfo &DCI) const {
  unsigned IID = N->getConstantOperandVal(0);
  switch (IID) {
  case Intrinsic::amdgcn_mul_i24:
  case Intrinsic::amdgcn_mul_u24:
  case Intrinsic::amdgcn_mulhi_i24:
  case Intrinsic::amdgcn_mulhi_u24:
    return simplifyMul24(N, DCI);
  case Intrinsic::amdgcn_fract:
  case Intrinsic::amdgcn_rsq:
  case Intrinsic::amdgcn_rcp_legacy:
  case Intrinsic::amdgcn_rsq_legacy:
  case Intrinsic::amdgcn_rsq_clamp:
  case Intrinsic::amdgcn_tanh:
  case Intrinsic::amdgcn_prng_b32: {
    // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
    // An undef argument folds the whole intrinsic to undef.
    SDValue Src = N->getOperand(1);
    return Src.isUndef() ? Src : SDValue();
  }
  case Intrinsic::amdgcn_frexp_exp: {
    // frexp_exp (fneg x) -> frexp_exp x
    // frexp_exp (fabs x) -> frexp_exp x
    // frexp_exp (fneg (fabs x)) -> frexp_exp x
    SDValue Src = N->getOperand(1);
    SDValue PeekSign = peekFPSignOps(Src);
    if (PeekSign == Src)
      return SDValue();
    return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
                   0);
  }
  default:
    return SDValue();
  }
}
4300
/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
/// binary operation \p Opc to it with the corresponding constant operands.
/// The halves are recombined through a v2i32 build_vector and bitcast to i64.
// NOTE(review): the first signature line (presumably
// `SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(`) was lost in
// extraction — confirm against the original source.
                                  DAGCombinerInfo &DCI, const SDLoc &SL,
                                  unsigned Opc, SDValue LHS,
                                  uint32_t ValLo, uint32_t ValHi) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Lo, Hi;
  std::tie(Lo, Hi) = split64BitValue(LHS, DAG);

  SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
  SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);

  SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
  SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);

  // Re-visit the ands. It's possible we eliminated one of them and it could
  // simplify the vector.
  DCI.AddToWorklist(Lo.getNode());
  DCI.AddToWorklist(Hi.getNode());

  SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
  return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
}
4325
// DAG combine for SHL: folds shifts of extended i16 values into packed
// build_vectors, hoists shifts below extensions, and narrows 64-bit shifts
// with a known amount >= 32 into a single 32-bit shift of the low half.
// NOTE(review): the signature line (presumably
// `SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,`) was lost in
// extraction — confirm against the original source.
                                                DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SDValue RHS = N->getOperand(1);
  // NOTE(review): the declaration of CRHS (presumably
  // `ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);`) was lost in
  // extraction — confirm against the original source.
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;

  unsigned RHSVal;
  if (CRHS) {
    RHSVal = CRHS->getZExtValue();
    // Shift by zero is the identity.
    if (!RHSVal)
      return LHS;

    switch (LHS->getOpcode()) {
    default:
      break;
    case ISD::ZERO_EXTEND:
    case ISD::SIGN_EXTEND:
    case ISD::ANY_EXTEND: {
      SDValue X = LHS->getOperand(0);

      if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
          isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
        // Prefer build_vector as the canonical form if packed types are legal.
        // (shl ([asz]ext i16:x), 16 -> build_vector 0, x
        SDValue Vec = DAG.getBuildVector(
            MVT::v2i16, SL,
            {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
        return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
      }

      // shl (ext x) => zext (shl x), if shift does not overflow int
      if (VT != MVT::i64)
        break;
      KnownBits Known = DAG.computeKnownBits(X);
      unsigned LZ = Known.countMinLeadingZeros();
      // The shifted-out bits must all be known zero for the narrow shift to
      // be equivalent.
      if (LZ < RHSVal)
        break;
      EVT XVT = X.getValueType();
      SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
      return DAG.getZExtOrTrunc(Shl, SL, VT);
    }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
  EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);

  // The narrowing only works when every possible shift amount is >= 32.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();
  SDValue ShiftAmt;

  if (CRHS) {
    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                               TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
    const SDValue ShiftMask =
        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
  }

  // Shift the low half left by (amount - 32); it becomes the high half of
  // the result, with zero in the low half.
  SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
  SDValue NewShift =
      DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());

  const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
  SDValue Vec;

  if (VT.isVector()) {
    EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
    unsigned NElts = TargetType.getVectorNumElements();
    // NOTE(review): the declaration of HiOps (presumably a
    // SmallVector<SDValue, 8>) was lost in extraction.
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    // Interleave: even lanes stay zero (low halves), odd lanes get the
    // shifted values (high halves).
    DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I + 1] = HiOps[I];
    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
  } else {
    EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
    Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
  }
  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
}
4424
// DAG combine for SRA: narrows a 64-bit arithmetic shift with a known amount
// >= 32 into a 32-bit shift of the high half, with the high result filled by
// the sign bits.
// NOTE(review): the signature line (presumably
// `SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,`) was lost in
// extraction — confirm against the original source.
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(1);
  // NOTE(review): the declaration of CRHS (presumably
  // `ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);`) was lost in
  // extraction — confirm against the original source.
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // For C >= 32
  // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
  EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);

  // The narrowing only works when every possible shift amount is >= 32.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftFullAmt =
      DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
  SDValue ShiftAmt;
  if (CRHS) {
    unsigned RHSVal = CRHS->getZExtValue();
    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                               TargetType);
  } else if (Known.getMinValue().getZExtValue() ==
             (ElementType.getSizeInBits() - 1)) {
    // Shift amount is provably 63: both halves become pure sign fill.
    ShiftAmt = ShiftFullAmt;
  } else {
    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
    const SDValue ShiftMask =
        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
  }

  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // Odd lanes of the split vector are the high halves.
    DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
    for (unsigned I = 0; I != NElts; ++I) {
      HiOps[I] = HiAndLoOps[2 * I + 1];
    }
    Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
  } else {
    const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
    ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
  }

  KnownBits KnownLHS = DAG.computeKnownBits(LHS);
  SDValue HiShift;
  if (KnownLHS.isNegative()) {
    // Known-negative input: sign fill is all ones, no shift needed.
    HiShift = DAG.getAllOnesConstant(SL, TargetType);
  } else {
    // Freeze to avoid the two uses of Hi observing different poison values.
    Hi = DAG.getFreeze(Hi);
    HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
  }
  SDValue NewShift =
      DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    // NOTE(review): the declarations of HiOps and LoOps (presumably
    // SmallVector<SDValue, 8>) were lost in extraction.
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);

    // Interleave low (shifted) and high (sign-fill) halves back together.
    DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
    DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
    for (unsigned I = 0; I != NElts; ++I) {
      HiAndLoOps[2 * I + 1] = HiOps[I];
      HiAndLoOps[2 * I] = LoOps[I];
    }
    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
  }
  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
}
4524
// DAG combine for SRL: folds (srl (and x, mask), c) into (and (srl x), ...)
// to expose BFE patterns, and narrows a 64-bit logical shift with a known
// amount >= 32 into a 32-bit shift of the high half with a zero high result.
// NOTE(review): the signature line (presumably
// `SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,`) was lost in
// extraction — confirm against the original source.
                                                DAGCombinerInfo &DCI) const {
  SDValue RHS = N->getOperand(1);
  // NOTE(review): the declaration of CRHS (presumably
  // `ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);`) was lost in
  // extraction — confirm against the original source.
  EVT VT = N->getValueType(0);
  SDValue LHS = N->getOperand(0);
  SelectionDAG &DAG = DCI.DAG;
  SDLoc SL(N);
  unsigned RHSVal;

  if (CRHS) {
    RHSVal = CRHS->getZExtValue();

    // fold (srl (and x, c1 << c2), c2) -> (and (srl(x, c2), c1)
    // this improves the ability to match BFE patterns in isel.
    if (LHS.getOpcode() == ISD::AND) {
      if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
        unsigned MaskIdx, MaskLen;
        // Only applies when the mask is a contiguous run starting exactly at
        // the shift amount.
        if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
            MaskIdx == RHSVal) {
          return DAG.getNode(ISD::AND, SL, VT,
                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
                                         N->getOperand(1)),
                             DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
                                         N->getOperand(1)));
        }
      }
    }
  }

  if (VT.getScalarType() != MVT::i64)
    return SDValue();

  // for C >= 32
  // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)

  // On some subtargets, 64-bit shift is a quarter rate instruction. In the
  // common case, splitting this into a move and a 32-bit shift is faster and
  // the same code size.
  KnownBits Known = DAG.computeKnownBits(RHS);

  EVT ElementType = VT.getScalarType();
  EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
  EVT TargetType = VT.changeElementType(*DAG.getContext(), TargetScalarType);

  // The narrowing only works when every possible shift amount is >= 32.
  if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
    return SDValue();

  SDValue ShiftAmt;
  if (CRHS) {
    ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
                               TargetType);
  } else {
    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
    const SDValue ShiftMask =
        DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
    // This AND instruction will clamp out of bounds shift values.
    // It will also be removed during later instruction selection.
    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
  }

  const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
  EVT ConcatType;
  SDValue Hi;
  SDLoc LHSSL(LHS);
  // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    SmallVector<SDValue, 8> HiOps(NElts);
    SmallVector<SDValue, 16> HiAndLoOps;

    // Odd lanes of the split vector are the high halves.
    DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
    for (unsigned I = 0; I != NElts; ++I)
      HiOps[I] = HiAndLoOps[2 * I + 1];
    Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
  } else {
    const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
    ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
    SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
    Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
  }

  SDValue NewShift =
      DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());

  SDValue Vec;
  if (VT.isVector()) {
    unsigned NElts = TargetType.getVectorNumElements();
    // NOTE(review): the declaration of LoOps (presumably a
    // SmallVector<SDValue, 8>) was lost in extraction.
    SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);

    // Even lanes get the shifted values; odd lanes stay zero.
    DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
    for (unsigned I = 0; I != NElts; ++I)
      HiAndLoOps[2 * I] = LoOps[I];
    Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
  } else {
    Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
  }
  return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
}
4627
// DAG combine for TRUNCATE: peeks through bitcasts of build_vectors to grab
// individual elements directly, and shrinks 64-bit shifts feeding a narrow
// truncate down to 32-bit shifts.
// NOTE(review): the first signature line (presumably
// `SDValue AMDGPUTargetLowering::performTruncateCombine(`) was lost in
// extraction — confirm against the original source.
    SDNode *N, DAGCombinerInfo &DCI) const {
  SDLoc SL(N);
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N->getValueType(0);
  SDValue Src = N->getOperand(0);

  // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
  if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
    SDValue Vec = Src.getOperand(0);
    if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue Elt0 = Vec.getOperand(0);
      EVT EltVT = Elt0.getValueType();
      // The truncate result must fit entirely inside element 0.
      if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
        if (EltVT.isFloatingPoint()) {
          Elt0 = DAG.getNode(ISD::BITCAST, SL,
                             EltVT.changeTypeToInteger(), Elt0);
        }

        return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
      }
    }
  }

  // Equivalent of above for accessing the high element of a vector as an
  // integer operation.
  // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
  if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
    if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
      SDValue BV = stripBitcast(Src.getOperand(0));
      if (BV.getOpcode() == ISD::BUILD_VECTOR) {
        EVT SrcEltVT = BV.getOperand(0).getValueType();
        unsigned SrcEltSize = SrcEltVT.getSizeInBits();
        unsigned BitIndex = K->getZExtValue();
        unsigned PartIndex = BitIndex / SrcEltSize;

        // The shift must land exactly on an element boundary within range.
        if (PartIndex * SrcEltSize == BitIndex &&
            PartIndex < BV.getNumOperands()) {
          if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
            SDValue SrcElt =
                DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
                            BV.getOperand(PartIndex));
            return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
          }
        }
      }
    }
  }

  // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
  //
  // i16 (trunc (srl i64:x, K)), K <= 16 ->
  // i16 (trunc (srl (i32 (trunc x), K)))
  if (VT.getScalarSizeInBits() < 32) {
    EVT SrcVT = Src.getValueType();
    if (SrcVT.getScalarSizeInBits() > 32 &&
        (Src.getOpcode() == ISD::SRL ||
         Src.getOpcode() == ISD::SRA ||
         Src.getOpcode() == ISD::SHL)) {
      SDValue Amt = Src.getOperand(1);
      KnownBits Known = DAG.computeKnownBits(Amt);

      // - For left shifts, do the transform as long as the shift
      //   amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
      // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
      //   losing information stored in the high bits when truncating.
      const unsigned MaxCstSize =
          (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
      if (Known.getMaxValue().ule(MaxCstSize)) {
        EVT MidVT = VT.isVector() ?
          EVT::getVectorVT(*DAG.getContext(), MVT::i32,
                           VT.getVectorNumElements()) : MVT::i32;

        EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
        SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
                                    Src.getOperand(0));
        DCI.AddToWorklist(Trunc.getNode());

        // The shift amount may need to be retyped for the narrower shift.
        if (Amt.getValueType() != NewShiftVT) {
          Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
          DCI.AddToWorklist(Amt.getNode());
        }

        SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
                                          Trunc, Amt);
        return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
      }
    }
  }

  return SDValue();
}
4720
4721// We need to specifically handle i64 mul here to avoid unnecessary conversion
4722// instructions. If we only match on the legalized i64 mul expansion,
4723// SimplifyDemandedBits will be unable to remove them because there will be
4724// multiple uses due to the separate mul + mulh[su].
4725static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4726 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4727 if (Size <= 32) {
4728 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4729 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4730 }
4731
4732 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4733 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4734
4735 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4736 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4737
4738 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4739}
4740
4741/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4742/// return SDValue().
4743static SDValue getAddOneOp(const SDNode *V) {
4744 if (V->getOpcode() != ISD::ADD)
4745 return SDValue();
4746
4747 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4748}
4749
                                             DAGCombinerInfo &DCI) const {
  assert(N->getOpcode() == ISD::MUL);
  EVT VT = N->getValueType(0);

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  if (!N->isDivergent())
    return SDValue();

  // Only scalar multiplies of at most 64 bits are handled below.
  unsigned Size = VT.getSizeInBits();
  if (VT.isVector() || Size > 64)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
  // matching.

  // mul x, (add y, 1) -> add (mul x, y), x
  // Returns y for a foldable (add y, 1): either the add has a single use, or
  // every user of the add is itself a mul (so no standalone add survives).
  auto IsFoldableAdd = [](SDValue V) -> SDValue {
    SDValue AddOp = getAddOneOp(V.getNode());
    if (!AddOp)
      return SDValue();

    if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
          return U->getOpcode() == ISD::MUL;
        }))
      return AddOp;

    return SDValue();
  };

  // FIXME: The selection pattern is not properly checking for commuted
  // operands, so we have to place the mul in the LHS
  if (SDValue MulOper = IsFoldableAdd(N0)) {
    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
  }

  if (SDValue MulOper = IsFoldableAdd(N1)) {
    SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
    return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
  }

  // There are i16 integer mul/mad.
  if (isTypeLegal(MVT::i16) && VT.getScalarType().bitsLE(MVT::i16))
    return SDValue();

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);

  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  SDValue Mul;

  // Prefer the unsigned 24-bit form; fall back to the signed one.
  if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
    N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
    Mul = getMul24(DAG, DL, N0, N1, Size, false);
  } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
    N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
    N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
    Mul = getMul24(DAG, DL, N0, N1, Size, true);
  } else {
    return SDValue();
  }

  // We need to use sext even for MUL_U24, because MUL_U24 is used
  // for signed multiply of 8 and 16-bit types.
  return DAG.getSExtOrTrunc(Mul, DL, VT);
}
4833
SDValue
                                                     DAGCombinerInfo &DCI) const {
  // Only the 32-bit-result form of [su]mul_lohi is handled here.
  if (N->getValueType(0) != MVT::i32)
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
  // in the source into any_extends if the result of the mul is truncated. Since
  // we can assume the high bits are whatever we want, use the underlying value
  // to avoid the unknown high bits from interfering.
  if (N0.getOpcode() == ISD::ANY_EXTEND)
    N0 = N0.getOperand(0);
  if (N1.getOpcode() == ISD::ANY_EXTEND)
    N1 = N1.getOperand(0);

  // Try to use two fast 24-bit multiplies (one for each half of the result)
  // instead of one slow extending multiply.
  unsigned LoOpcode = 0;
  unsigned HiOpcode = 0;
  if (Signed) {
    if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
      N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_I24;
      HiOpcode = AMDGPUISD::MULHI_I24;
    }
  } else {
    if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
      N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
      N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
      LoOpcode = AMDGPUISD::MUL_U24;
      HiOpcode = AMDGPUISD::MULHI_U24;
    }
  }
  // LoOpcode is still 0 if neither 24-bit form applied.
  if (!LoOpcode)
    return SDValue();

  SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
  SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
  DCI.CombineTo(N, Lo, Hi);
  return SDValue(N, 0);
}
4883
                                                  DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  if (!Subtarget->hasMulI24() || VT.isVector())
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (Subtarget->hasSMulHi() && !N->isDivergent())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Both operands must be provably representable in 24 bits (signed).
  if (!isI24(N0, DAG) || !isI24(N1, DAG))
    return SDValue();

  N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);

  // mulhs -> mulhi_i24 once the inputs are normalized to i32.
  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getSExtOrTrunc(Mulhi, DL, VT);
}
4916
                                                  DAGCombinerInfo &DCI) const {
  EVT VT = N->getValueType(0);

  // Only scalar types of at most 32 bits can use the 24-bit path.
  if (VT.isVector() || VT.getSizeInBits() > 32 || !Subtarget->hasMulU24())
    return SDValue();

  // Don't generate 24-bit multiplies on values that are in SGPRs, since
  // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
  // unnecessarily). isDivergent() is used as an approximation of whether the
  // value is in an SGPR.
  // This doesn't apply if no s_mul_hi is available (since we'll end up with a
  // valu op anyway)
  if (!N->isDivergent() && Subtarget->hasSMulHi())
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  SDValue N0 = N->getOperand(0);
  SDValue N1 = N->getOperand(1);

  // Both operands must be provably representable in 24 bits (unsigned).
  if (!isU24(N0, DAG) || !isU24(N1, DAG))
    return SDValue();

  N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
  N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);

  // mulhu -> mulhi_u24 once the inputs are normalized to i32.
  SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
  DCI.AddToWorklist(Mulhi.getNode());
  return DAG.getZExtOrTrunc(Mulhi, DL, VT);
}
4949
4950SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4951 SDValue Op,
4952 const SDLoc &DL,
4953 unsigned Opc) const {
4954 EVT VT = Op.getValueType();
4955 if (VT.bitsGT(MVT::i32))
4956 return SDValue();
4957
4958 if (VT != MVT::i32)
4959 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4960
4961 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4962 if (VT != MVT::i32)
4963 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4964
4965 return FFBX;
4966}
4967
// The native instructions return -1 on 0 input. Optimize out a select that
// produces -1 on 0.
//
// TODO: If zero is not undef, we could also do this if the output is compared
// against the bitwidth.
//
// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
                                                 SDValue LHS, SDValue RHS,
                                                 DAGCombinerInfo &DCI) const {
  // Only a comparison against zero is interesting here.
  if (!isNullConstant(Cond.getOperand(1)))
    return SDValue();

  SelectionDAG &DAG = DCI.DAG;
  ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
  SDValue CmpLHS = Cond.getOperand(0);

  // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
  // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
  if (CCOpcode == ISD::SETEQ &&
      (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
      RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
    unsigned Opc =
        isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
  // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
  if (CCOpcode == ISD::SETNE &&
      (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
      LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
    unsigned Opc =
        isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;

    return getFFBX_U32(DAG, CmpLHS, SL, Opc);
  }

  return SDValue();
}
5008
                                         unsigned Op,
                                         const SDLoc &SL,
                                         SDValue Cond,
                                         SDValue N1,
                                         SDValue N2) {
  SelectionDAG &DAG = DCI.DAG;
  EVT VT = N1.getValueType();

  // Select on the single source operands of N1/N2 first, then re-apply Op on
  // top of the new select: (op (select c, x, y)).
  SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
                                  N1.getOperand(0), N2.getOperand(0));
  DCI.AddToWorklist(NewSelect.getNode());
  return DAG.getNode(Op, SL, VT, NewSelect);
}
5023
// Pull a free FP operation out of a select so it may fold into uses.
//
// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
// select c, (fneg x), k -> fneg (select c, x, (fneg k))
//
// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
// select c, (fabs x), +k -> fabs (select c, x, k)
SDValue
                                                 SDValue N) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue Cond = N.getOperand(0);
  SDValue LHS = N.getOperand(1);
  SDValue RHS = N.getOperand(2);

  EVT VT = N.getValueType();
  // Both arms share the same free op (fneg/fneg or fabs/fabs): distribute it
  // through the select.
  if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
      (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
      return SDValue();

    return distributeOpThroughSelect(DCI, LHS.getOpcode(),
                                     SDLoc(N), Cond, LHS, RHS);
  }

  // Canonicalize the fneg/fabs operand into LHS; remember to swap back below.
  bool Inv = false;
  if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
    std::swap(LHS, RHS);
    Inv = true;
  }

  // TODO: Support vector constants.
  if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
      !selectSupportsSourceMods(N.getNode())) {
    SDLoc SL(N);
    // If one side is an fneg/fabs and the other is a constant, we can push the
    // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
    SDValue NewLHS = LHS.getOperand(0);
    SDValue NewRHS = RHS;

    // Careful: if the neg can be folded up, don't try to pull it back down.
    bool ShouldFoldNeg = true;

    if (NewLHS.hasOneUse()) {
      unsigned Opc = NewLHS.getOpcode();
      if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
        ShouldFoldNeg = false;
      if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
        ShouldFoldNeg = false;
    }

    if (ShouldFoldNeg) {
      if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
        return SDValue();

      // We're going to be forced to use a source modifier anyway, there's no
      // point to pulling the negate out unless we can get a size reduction by
      // negating the constant.
      //
      // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
      // about cheaper constants.
      if (NewLHS.getOpcode() == ISD::FABS &&
        return SDValue();

        return SDValue();

      if (LHS.getOpcode() == ISD::FNEG)
        NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

      if (Inv)
        std::swap(NewLHS, NewRHS);

      SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
                                      Cond, NewLHS, NewRHS);
      DCI.AddToWorklist(NewSelect.getNode());
      return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
    }
  }

  return SDValue();
}
5108
                                                   DAGCombinerInfo &DCI) const {
  // First try pulling a free fneg/fabs out of the select.
  if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
    return Folded;

  SDValue Cond = N->getOperand(0);
  if (Cond.getOpcode() != ISD::SETCC)
    return SDValue();

  EVT VT = N->getValueType(0);
  SDValue LHS = Cond.getOperand(0);
  SDValue RHS = Cond.getOperand(1);
  SDValue CC = Cond.getOperand(2);

  SDValue True = N->getOperand(1);
  SDValue False = N->getOperand(2);

  if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
    SelectionDAG &DAG = DCI.DAG;
    if (DAG.isConstantValueOfAnyType(True) &&
        !DAG.isConstantValueOfAnyType(False)) {
      // Swap cmp + select pair to move constant to false input.
      // This will allow using VOPC cndmasks more often.
      // select (setcc x, y), k, x -> select (setccinv x, y), x, k

      SDLoc SL(N);
      ISD::CondCode NewCC =
          getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());

      SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
      return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
    }

    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
          = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
      // Revisit this node so we can catch min3/max3/med3 patterns.
      //DCI.AddToWorklist(MinMax.getNode());
      return MinMax;
    }
  }

  // There's no reason to not do this if the condition has other uses.
  return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
}
5154
5155static bool isInv2Pi(const APFloat &APF) {
5156 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
5157 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
5158 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
5159
5160 return APF.bitwiseIsEqual(KF16) ||
5161 APF.bitwiseIsEqual(KF32) ||
5162 APF.bitwiseIsEqual(KF64);
5163}
5164
// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
// additional cost to negate them.
  // An already-negative value saves the negate (Cheaper); a positive one
  // would need a non-inline negated literal (Expensive).
  if (C->isZero())
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

  // 1/(2*pi) is only an inline immediate on subtargets with that feature.
  if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
    return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;

}
5177
5183
5189
5190static unsigned inverseMinMax(unsigned Opc) {
5191 switch (Opc) {
5192 case ISD::FMAXNUM:
5193 return ISD::FMINNUM;
5194 case ISD::FMINNUM:
5195 return ISD::FMAXNUM;
5196 case ISD::FMAXNUM_IEEE:
5197 return ISD::FMINNUM_IEEE;
5198 case ISD::FMINNUM_IEEE:
5199 return ISD::FMAXNUM_IEEE;
5200 case ISD::FMAXIMUM:
5201 return ISD::FMINIMUM;
5202 case ISD::FMINIMUM:
5203 return ISD::FMAXIMUM;
5204 case ISD::FMAXIMUMNUM:
5205 return ISD::FMINIMUMNUM;
5206 case ISD::FMINIMUMNUM:
5207 return ISD::FMAXIMUMNUM;
5208 case AMDGPUISD::FMAX_LEGACY:
5209 return AMDGPUISD::FMIN_LEGACY;
5210 case AMDGPUISD::FMIN_LEGACY:
5211 return AMDGPUISD::FMAX_LEGACY;
5212 default:
5213 llvm_unreachable("invalid min/max opcode");
5214 }
5215}
5216
/// \return true if it's profitable to try to push an fneg into its source
/// instruction.
  // If the input has multiple uses and we can either fold the negate down, or
  // the other uses cannot, give up. This both prevents unprofitable
  // transformations and infinite loops: we won't repeatedly try to fold around
  // a negate that has no 'good' form.
  if (N0.hasOneUse()) {
    // This may be able to fold into the source, but at a code size cost. Don't
    // fold if the fold into the user is free.
    if (allUsesHaveSourceMods(N, 0))
      return false;
  } else {
    if (fnegFoldsIntoOp(N0.getNode()) &&
      return false;
  }

  return true;
}
5237
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);
  EVT VT = N->getValueType(0);

  unsigned Opc = N0.getOpcode();

  if (!shouldFoldFNegIntoSrc(N, N0))
    return SDValue();

  SDLoc SL(N);
  // Try to push the fneg into the source operation, where it can cancel an
  // existing fneg or be absorbed as a source modifier.
  switch (Opc) {
  case ISD::FADD: {
    if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() != ISD::FNEG)
      LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    else
      LHS = LHS.getOperand(0);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != ISD::FADD)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMUL:
  case AMDGPUISD::FMUL_LEGACY: {
    // (fneg (fmul x, y)) -> (fmul x, (fneg y))
    // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (RHS.getOpcode() == ISD::FNEG)
      RHS = RHS.getOperand(0);
    else
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMA:
  case ISD::FMAD: {
    // TODO: handle llvm.amdgcn.fma.legacy
    if (!mayIgnoreSignedZero(N0) && !N->getFlags().hasNoSignedZeros())
      return SDValue();

    // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
    SDValue LHS = N0.getOperand(0);
    SDValue MHS = N0.getOperand(1);
    SDValue RHS = N0.getOperand(2);

    if (LHS.getOpcode() == ISD::FNEG)
      LHS = LHS.getOperand(0);
    else if (MHS.getOpcode() == ISD::FNEG)
      MHS = MHS.getOperand(0);
    else
      MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);

    if (RHS.getOpcode() != ISD::FNEG)
      RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    else
      RHS = RHS.getOperand(0);

    SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
    if (Res.getOpcode() != Opc)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case ISD::FMAXNUM:
  case ISD::FMINNUM:
  case ISD::FMAXNUM_IEEE:
  case ISD::FMINNUM_IEEE:
  case ISD::FMINIMUM:
  case ISD::FMAXIMUM:
  case ISD::FMINIMUMNUM:
  case ISD::FMAXIMUMNUM:
  case AMDGPUISD::FMAX_LEGACY:
  case AMDGPUISD::FMIN_LEGACY: {
    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)

    SDValue LHS = N0.getOperand(0);
    SDValue RHS = N0.getOperand(1);

    // 0 doesn't have a negated inline immediate.
    // TODO: This constant check should be generalized to other operations.
      return SDValue();

    SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
    SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
    unsigned Opposite = inverseMinMax(Opc);

    SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
    if (Res.getOpcode() != Opposite)
      return SDValue(); // Op got folded away.
    if (!N0.hasOneUse())
      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
    return Res;
  }
  case AMDGPUISD::FMED3: {
    // fneg (fmed3 x, y, z) -> fmed3 (fneg x), (fneg y), (fneg z)
    SDValue Ops[3];
    for (unsigned I = 0; I < 3; ++I)
      Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());

    SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
    if (Res.getOpcode() != AMDGPUISD::FMED3)
      return SDValue(); // Op got folded away.

    if (!N0.hasOneUse()) {
      SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
      DAG.ReplaceAllUsesWith(N0, Neg);

      for (SDNode *U : Neg->users())
        DCI.AddToWorklist(U);
    }

    return Res;
  }
  case ISD::FP_EXTEND:
  case ISD::FTRUNC:
  case ISD::FRINT:
  case ISD::FNEARBYINT: // XXX - Should fround be handled?
  case ISD::FROUNDEVEN:
  case ISD::FSIN:
  case ISD::FCANONICALIZE:
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_LEGACY:
  case AMDGPUISD::RCP_IFLAG:
  case AMDGPUISD::SIN_HW: {
    SDValue CvtSrc = N0.getOperand(0);
    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_extend (fneg x))) -> (fp_extend x)
      // (fneg (rcp (fneg x))) -> (rcp x)
      return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_extend x)) -> (fp_extend (fneg x))
    // (fneg (rcp x)) -> (rcp (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
  }
  case ISD::FP_ROUND: {
    SDValue CvtSrc = N0.getOperand(0);

    if (CvtSrc.getOpcode() == ISD::FNEG) {
      // (fneg (fp_round (fneg x))) -> (fp_round x)
      return DAG.getNode(ISD::FP_ROUND, SL, VT,
                         CvtSrc.getOperand(0), N0.getOperand(1));
    }

    if (!N0.hasOneUse())
      return SDValue();

    // (fneg (fp_round x)) -> (fp_round (fneg x))
    SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
    return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
  }
  case ISD::FP16_TO_FP: {
    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
    // f16, but legalization of f16 fneg ends up pulling it out of the source.
    // Put the fneg back as a legal source operation that can be matched later.
    SDLoc SL(N);

    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
                                  DAG.getConstant(0x8000, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
  }
  case ISD::SELECT: {
    // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
    // TODO: Invert conditions of foldFreeOpFromSelect
    return SDValue();
  }
  case ISD::BITCAST: {
    SDLoc SL(N);
    SDValue BCSrc = N0.getOperand(0);
    if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
      SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
      if (HighBits.getValueType().getSizeInBits() != 32 ||
          !fnegFoldsIntoOp(HighBits.getNode()))
        return SDValue();

      // f64 fneg only really needs to operate on the high half of the
      // register, so try to force it to an f32 operation to help make use of
      // source modifiers.
      //
      //
      // fneg (f64 (bitcast (build_vector x, y))) ->
      // f64 (bitcast (build_vector (bitcast i32:x to f32),
      //                            (fneg (bitcast i32:y to f32)))

      SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
      SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
      SDValue CastBack =
          DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);

      Ops.back() = CastBack;
      DCI.AddToWorklist(NegHi.getNode());
      SDValue Build =
          DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
      SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);

      if (!N0.hasOneUse())
        DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
      return Result;
    }

    if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
        BCSrc.hasOneUse()) {
      // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
      // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)

      // TODO: Cast back result for multiple uses is beneficial in some cases.

      SDValue LHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
      SDValue RHS =
          DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));

      SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
      SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);

      return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
                         NegRHS);
    }

    return SDValue();
  }
  default:
    return SDValue();
  }
}
5501
                                               DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDValue N0 = N->getOperand(0);

  // Only fold when the source has a single use; otherwise it would have to be
  // duplicated.
  if (!N0.hasOneUse())
    return SDValue();

  switch (N0.getOpcode()) {
  case ISD::FP16_TO_FP: {
    assert(!isTypeLegal(MVT::f16) && "should only see if f16 is illegal");
    SDLoc SL(N);
    SDValue Src = N0.getOperand(0);
    EVT SrcVT = Src.getValueType();

    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
                                  DAG.getConstant(0x7fff, SL, SrcVT));
    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
  }
  default:
    return SDValue();
  }
}
5526
                                              DAGCombinerInfo &DCI) const {
  // Constant fold: rcp(k) -> 1.0 / k.
  const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
  if (!CFP)
    return SDValue();

  // XXX - Should this flush denormals?
  const APFloat &Val = CFP->getValueAPF();
  APFloat One(Val.getSemantics(), "1.0");
  return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
}
5538
  if (!Subtarget->isGCN())
    return false;

  auto &ST = DAG.getSubtarget<GCNSubtarget>();
  const auto *TII = ST.getInstrInfo();

  // A 64-bit move is required to materialize the constant in one instruction.
  if (!ST.hasMovB64() || (!SDConstant && !SDFPConstant))
    return false;

  // Subtargets with 64-bit literal support can encode any value directly.
  if (ST.has64BitLiterals())
    return true;

  // Otherwise the value must fit in 32 bits or be an inline constant.
  if (SDConstant) {
    const APInt &APVal = SDConstant->getAPIntValue();
    return isUInt<32>(APVal.getZExtValue()) || TII->isInlineConstant(APVal);
  }

  // FP constant: apply the same encoding check to its bit pattern.
  APInt Val = SDFPConstant->getValueAPF().bitcastToAPInt();
  return isUInt<32>(Val.getZExtValue()) || TII->isInlineConstant(Val);
}
5562
                                                DAGCombinerInfo &DCI) const {
  SelectionDAG &DAG = DCI.DAG;
  SDLoc DL(N);

  // Dispatch on opcode; each case either returns a replacement value or
  // breaks out to return the empty SDValue (no combine performed).
  switch(N->getOpcode()) {
  default:
    break;
  case ISD::BITCAST: {
    EVT DestVT = N->getValueType(0);

    // Push casts through vector builds. This helps avoid emitting a large
    // number of copies when materializing floating point vector constants.
    //
    // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
    // vnt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
    if (DestVT.isVector()) {
      SDValue Src = N->getOperand(0);
      if (Src.getOpcode() == ISD::BUILD_VECTOR &&
        EVT SrcVT = Src.getValueType();
        unsigned NElts = DestVT.getVectorNumElements();

        if (SrcVT.getVectorNumElements() == NElts) {
          EVT DestEltVT = DestVT.getVectorElementType();

          SmallVector<SDValue, 8> CastedElts;
          SDLoc SL(N);
          for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
            SDValue Elt = Src.getOperand(I);
            CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
          }

          return DAG.getBuildVector(DestVT, SL, CastedElts);
        }
      }
    }

    if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
      break;

    // Fold bitcasts of constants.
    //
    // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
    // TODO: Generalize and move to DAGCombiner
    SDValue Src = N->getOperand(0);
      SDLoc SL(N);
      if (isInt64ImmLegal(C, DAG))
        break;
      uint64_t CVal = C->getZExtValue();
      SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                               DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                               DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
      return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
    }

      const APInt &Val = C->getValueAPF().bitcastToAPInt();
      SDLoc SL(N);
      if (isInt64ImmLegal(C, DAG))
        break;
      uint64_t CVal = Val.getZExtValue();
      SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
                                DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
                                DAG.getConstant(Hi_32(CVal), SL, MVT::i32));

      return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
    }

    break;
  }
  case ISD::SHL:
  case ISD::SRA:
  case ISD::SRL: {
    // Range metadata can be invalidated when loads are converted to legal types
    // (e.g. v2i64 -> v4i32).
    // Try to convert vector shl/sra/srl before type legalization so that range
    // metadata can be utilized.
    if (!(N->getValueType(0).isVector() &&
      break;
    if (N->getOpcode() == ISD::SHL)
      return performShlCombine(N, DCI);
    if (N->getOpcode() == ISD::SRA)
      return performSraCombine(N, DCI);
    return performSrlCombine(N, DCI);
  }
  case ISD::TRUNCATE:
    return performTruncateCombine(N, DCI);
  case ISD::MUL:
    return performMulCombine(N, DCI);
  case AMDGPUISD::MUL_U24:
  case AMDGPUISD::MUL_I24: {
    if (SDValue Simplified = simplifyMul24(N, DCI))
      return Simplified;
    break;
  }
  case AMDGPUISD::MULHI_I24:
  case AMDGPUISD::MULHI_U24:
    return simplifyMul24(N, DCI);
  case ISD::SMUL_LOHI:
  case ISD::UMUL_LOHI:
    return performMulLoHiCombine(N, DCI);
  case ISD::MULHS:
    return performMulhsCombine(N, DCI);
  case ISD::MULHU:
    return performMulhuCombine(N, DCI);
  case ISD::SELECT:
    return performSelectCombine(N, DCI);
  case ISD::FNEG:
    return performFNegCombine(N, DCI);
  case ISD::FABS:
    return performFAbsCombine(N, DCI);
  case AMDGPUISD::BFE_I32:
  case AMDGPUISD::BFE_U32: {
    assert(!N->getValueType(0).isVector() &&
           "Vector handling of BFE not implemented");
    ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
    if (!Width)
      break;

    // The hardware only reads the low 5 bits of the width/offset operands.
    uint32_t WidthVal = Width->getZExtValue() & 0x1f;
    if (WidthVal == 0)
      return DAG.getConstant(0, DL, MVT::i32);

    if (!Offset)
      break;

    SDValue BitsFrom = N->getOperand(0);
    uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;

    bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;

    if (OffsetVal == 0) {
      // This is already sign / zero extended, so try to fold away extra BFEs.
      unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);

      unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
      if (OpSignBits >= SignBits)
        return BitsFrom;

      EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
      if (Signed) {
        // This is a sign_extend_inreg. Replace it to take advantage of existing
        // DAG Combines. If not eliminated, we will match back to BFE during
        // selection.

        // TODO: The sext_inreg of extended types ends, although we could
        // handle them in a single BFE.
        return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
                           DAG.getValueType(SmallVT));
      }

      return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
    }

    // Constant-fold a BFE of a constant source.
    if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
      if (Signed) {
        return constantFoldBFE<int32_t>(DAG,
                                        CVal->getSExtValue(),
                                        OffsetVal,
                                        WidthVal,
                                        DL);
      }

      return constantFoldBFE<uint32_t>(DAG,
                                       CVal->getZExtValue(),
                                       OffsetVal,
                                       WidthVal,
                                       DL);
    }

    // An extract reaching the top bit is just an arithmetic/logical shift.
    if ((OffsetVal + WidthVal) >= 32 &&
        !(OffsetVal == 16 && WidthVal == 16 && Subtarget->hasSDWA())) {
      SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
      return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
                         BitsFrom, ShiftVal);
    }

    if (BitsFrom.hasOneUse()) {
      // Only the extracted bit range of the source is demanded.
      APInt Demanded = APInt::getBitsSet(32,
                                         OffsetVal,
                                         OffsetVal + WidthVal);

      KnownBits Known;
                            !DCI.isBeforeLegalizeOps());
      const TargetLowering &TLI = DAG.getTargetLoweringInfo();
      if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
          TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
        DCI.CommitTargetLoweringOpt(TLO);
      }
    }

    break;
  }
  case ISD::LOAD:
    return performLoadCombine(N, DCI);
  case ISD::STORE:
    return performStoreCombine(N, DCI);
  case AMDGPUISD::RCP:
  case AMDGPUISD::RCP_IFLAG:
    return performRcpCombine(N, DCI);
  case ISD::AssertZext:
  case ISD::AssertSext:
    return performAssertSZExtCombine(N, DCI);
    return performIntrinsicWOChainCombine(N, DCI);
  case AMDGPUISD::FMAD_FTZ: {
    SDValue N0 = N->getOperand(0);
    SDValue N1 = N->getOperand(1);
    SDValue N2 = N->getOperand(2);
    EVT VT = N->getValueType(0);

    // FMAD_FTZ is a FMAD + flush denormals to zero.
    // We flush the inputs, the intermediate step, and the output.
    if (N0CFP && N1CFP && N2CFP) {
      const auto FTZ = [](const APFloat &V) {
        if (V.isDenormal()) {
          APFloat Zero(V.getSemantics(), 0);
          return V.isNegative() ? -Zero : Zero;
        }
        return V;
      };

      APFloat V0 = FTZ(N0CFP->getValueAPF());
      APFloat V1 = FTZ(N1CFP->getValueAPF());
      APFloat V2 = FTZ(N2CFP->getValueAPF());
      V0 = FTZ(V0);
      return DAG.getConstantFP(FTZ(V0), DL, VT);
    }
    break;
  }
  }
  return SDValue();
}
5808
5809//===----------------------------------------------------------------------===//
5810// Helper functions
5811//===----------------------------------------------------------------------===//
5812
// Helper that returns an SDValue for the physical register Reg, creating (or
// reusing) a live-in virtual register of class RC for it in the current
// MachineFunction. If RawReg is true the bare register node is returned
// instead of a CopyFromReg from the entry node.
// NOTE(review): the opening signature line (5813) and line 5818 (presumably
// the MachineFunction &MF declaration that MF.getRegInfo() below requires)
// were dropped by extraction -- verify against upstream.
5814 const TargetRegisterClass *RC,
5815 Register Reg, EVT VT,
5816 const SDLoc &SL,
5817 bool RawReg) const {
5819 MachineRegisterInfo &MRI = MF.getRegInfo();
5820 Register VReg;
5821
5822 if (!MRI.isLiveIn(Reg)) {
// First use of this physical register: create a fresh vreg and record the
// live-in mapping.
5823 VReg = MRI.createVirtualRegister(RC);
5824 MRI.addLiveIn(Reg, VReg);
5825 } else {
// Already registered as a live-in; reuse the existing virtual register.
5826 VReg = MRI.getLiveInVirtReg(Reg);
5827 }
5828
5829 if (RawReg)
5830 return DAG.getRegister(VReg, VT);
5831
5832 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5833 }
5834
5835 // This may be called multiple times, and nothing prevents creating multiple
5836 // objects at the same offset. See if we already defined this object.
// Returns the frame index of an existing fixed object at Offset (asserting it
// has the expected Size), or creates a new fixed object there.
// NOTE(review): the opening signature line (5837) was dropped by extraction --
// verify against upstream.
5838 int64_t Offset) {
// Fixed objects occupy the negative frame-index range, hence the I < 0 scan.
5839 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5840 if (MFI.getObjectOffset(I) == Offset) {
5841 assert(MFI.getObjectSize(I) == Size);
5842 return I;
5843 }
5844 }
5845
5846 return MFI.CreateFixedObject(Size, Offset, true);
5847 }
5848
// Loads a value of type VT from a fixed stack object at Offset, creating the
// object if needed. Used for arguments passed on the stack rather than in
// registers.
// NOTE(review): the opening signature line (5849), line 5853 (presumably the
// MachineFunction &MF declaration used below), and lines 5861-5862 (the tail
// of the getLoad call, likely MMO flags) were dropped by extraction --
// verify against upstream.
5850 EVT VT,
5851 const SDLoc &SL,
5852 int64_t Offset) const {
5854 MachineFrameInfo &MFI = MF.getFrameInfo();
5855 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5856
5857 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5858 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5859
5860 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5863 }
5864
// Stores ArgVal at Offset in the outgoing-argument stack area. The address is
// computed relative to the stack pointer register.
// NOTE(review): the opening signature line (5865), lines 5870-5872 (presumably
// the declarations of Info and DstInfo used below), and line 5880 (the tail of
// the getStore call) were dropped by extraction -- verify against upstream.
5866 const SDLoc &SL,
5867 SDValue Chain,
5868 SDValue ArgVal,
5869 int64_t Offset) const {
5873
5874 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5875 // Stores to the argument stack area are relative to the stack pointer.
5876 SDValue SP =
5877 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5878 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5879 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5881 return Store;
5882 }
5883
// Materializes an input argument described by Arg: from a live-in register if
// Arg.isRegister(), otherwise from its stack slot. If the descriptor carries a
// mask, the relevant bit-field is shifted down and masked out of the raw
// value.
// NOTE(review): the opening signature line (5884) was dropped by extraction --
// verify against upstream.
5885 const TargetRegisterClass *RC,
5886 EVT VT, const SDLoc &SL,
5887 const ArgDescriptor &Arg) const {
5888 assert(Arg && "Attempting to load missing argument");
5889
5890 SDValue V = Arg.isRegister() ?
5891 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5892 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5893
5894 if (!Arg.isMasked())
5895 return V;
5896
// Extract the masked field: shift it to bit 0, then AND with the
// down-shifted mask.
5897 unsigned Mask = Arg.getMask();
5898 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5899 V = DAG.getNode(ISD::SRL, SL, VT, V,
5900 DAG.getShiftAmountConstant(Shift, VT, SL));
5901 return DAG.getNode(ISD::AND, SL, VT, V,
5902 DAG.getConstant(Mask >> Shift, SL, VT));
5903 }
5904
// Computes the byte offset of the given implicit kernel parameter: the
// explicit kernarg segment size is aligned up for the implicit-arg pointer,
// then the per-parameter offset is added.
// NOTE(review): the opening signature line (5905) and line 5915 (the return
// statement for the PRIVATE_BASE case -- as listed, PRIVATE_BASE would fall
// through to SHARED_BASE) were dropped by extraction -- verify against
// upstream.
5906 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5907 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5908 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5909 uint64_t ArgOffset =
5910 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5911 switch (Param) {
5912 case FIRST_IMPLICIT:
5913 return ArgOffset;
5914 case PRIVATE_BASE:
5916 case SHARED_BASE:
5917 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5918 case QUEUE_PTR:
5919 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5920 }
5921 llvm_unreachable("unexpected implicit parameter type");
5922 }
5923
5930
// Target hook for fast reciprocal-square-root estimates: for f32 the hardware
// RSQ node is emitted directly with no refinement steps; other types are not
// handled here.
// NOTE(review): the opening signature line (5931) was dropped by extraction --
// verify against upstream.
5932 SelectionDAG &DAG, int Enabled,
5933 int &RefinementSteps,
5934 bool &UseOneConstNR,
5935 bool Reciprocal) const {
5936 EVT VT = Operand.getValueType();
5937
5938 if (VT == MVT::f32) {
// Zero Newton-Raphson steps: the hardware estimate is used as-is.
5939 RefinementSteps = 0;
5940 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5941 }
5942
5943 // TODO: There is also f64 rsq instruction, but the documentation is less
5944 // clear on its precision.
5945
5946 return SDValue();
5947 }
5948
// Target hook for fast reciprocal estimates: for f32 the hardware RCP node is
// emitted directly with no refinement steps; other types are not handled
// here.
// NOTE(review): the opening signature line (5949) was dropped by extraction --
// verify against upstream.
5950 SelectionDAG &DAG, int Enabled,
5951 int &RefinementSteps) const {
5952 EVT VT = Operand.getValueType();
5953
5954 if (VT == MVT::f32) {
5955 // Reciprocal, < 1 ulp error.
5956 //
5957 // This reciprocal approximation converges to < 0.5 ulp error with one
5958 // newton rhapson performed with two fused multiple adds (FMAs).
5959
// Zero refinement steps requested: the hardware estimate is used as-is.
5960 RefinementSteps = 0;
5961 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5962 }
5963
5964 // TODO: There is also f64 rcp instruction, but the documentation is less
5965 // clear on its precision.
5966
5967 return SDValue();
5968 }
5969
5970static unsigned workitemIntrinsicDim(unsigned ID) {
5971 switch (ID) {
5972 case Intrinsic::amdgcn_workitem_id_x:
5973 return 0;
5974 case Intrinsic::amdgcn_workitem_id_y:
5975 return 1;
5976 case Intrinsic::amdgcn_workitem_id_z:
5977 return 2;
5978 default:
5979 llvm_unreachable("not a workitem intrinsic");
5980 }
5981}
5982
// Target hook: computes known zero/one bits for AMDGPU-specific SDNodes
// (carry/borrow, BFE, 24-bit multiplies, PERM, buffer loads, LDS addresses,
// 3-operand min/max/med, and workitem-id intrinsics).
// NOTE(review): the opening signature line (5983) and lines 6017, 6130 and
// 6137 were dropped by extraction -- in particular 6130 is presumably the
// `case ISD::INTRINSIC_WO_CHAIN:` label for the block starting at 6131.
// Verify against upstream.
5984 const SDValue Op, KnownBits &Known,
5985 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5986
5987 Known.resetAll(); // Don't know anything.
5988
5989 unsigned Opc = Op.getOpcode();
5990
5991 switch (Opc) {
5992 default:
5993 break;
5994 case AMDGPUISD::CARRY:
5995 case AMDGPUISD::BORROW: {
// Carry/borrow results are 0 or 1: everything above the low bit is zero.
5996 Known.Zero = APInt::getHighBitsSet(32, 31);
5997 break;
5998 }
5999
6000 case AMDGPUISD::BFE_I32:
6001 case AMDGPUISD::BFE_U32: {
6002 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6003 if (!CWidth)
6004 return;
6005
// Only the low 5 bits of the width operand are honored.
6006 uint32_t Width = CWidth->getZExtValue() & 0x1f;
6007
// Unsigned BFE zero-extends the field, so the high 32 - Width bits are zero.
6008 if (Opc == AMDGPUISD::BFE_U32)
6009 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
6010
6011 break;
6012 }
6013 case AMDGPUISD::FP_TO_FP16: {
6014 unsigned BitWidth = Known.getBitWidth();
6015
6016 // High bits are zero.
6018 break;
6019 }
6020 case AMDGPUISD::MUL_U24:
6021 case AMDGPUISD::MUL_I24: {
6022 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6023 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
// Trailing zeros of a product accumulate from both factors.
6024 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
6025 RHSKnown.countMinTrailingZeros();
6026 Known.Zero.setLowBits(std::min(TrailZ, 32u));
6027 // Skip extra check if all bits are known zeros.
6028 if (TrailZ >= 32)
6029 break;
6030
6031 // Truncate to 24 bits.
6032 LHSKnown = LHSKnown.trunc(24);
6033 RHSKnown = RHSKnown.trunc(24);
6034
6035 if (Opc == AMDGPUISD::MUL_I24) {
6036 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
6037 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
6038 unsigned MaxValBits = LHSValBits + RHSValBits;
6039 if (MaxValBits > 32)
6040 break;
// The product fits in MaxValBits, so the bits above are sign copies; pin
// them when the operands' signs are known.
6041 unsigned SignBits = 32 - MaxValBits + 1;
6042 bool LHSNegative = LHSKnown.isNegative();
6043 bool LHSNonNegative = LHSKnown.isNonNegative();
6044 bool LHSPositive = LHSKnown.isStrictlyPositive();
6045 bool RHSNegative = RHSKnown.isNegative();
6046 bool RHSNonNegative = RHSKnown.isNonNegative();
6047 bool RHSPositive = RHSKnown.isStrictlyPositive();
6048
6049 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
6050 Known.Zero.setHighBits(SignBits)
6051 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
6052 Known.One.setHighBits(SignBits);
6053 } else {
6054 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
6055 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
6056 unsigned MaxValBits = LHSValBits + RHSValBits;
6057 if (MaxValBits >= 32)
6058 break;
6059 Known.Zero.setBitsFrom(MaxValBits);
6060 }
6061 break;
6062 }
6063 case AMDGPUISD::PERM: {
6064 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6065 if (!CMask)
6066 return;
6067
6068 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6069 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6070 unsigned Sel = CMask->getZExtValue();
6071
// Each selector byte picks a source byte: 0-3 select from RHS, 4-6 from LHS,
// 0x0c yields a constant 0x00 byte, and values above 0x0c yield 0xff.
6072 for (unsigned I = 0; I < 32; I += 8) {
6073 unsigned SelBits = Sel & 0xff;
6074 if (SelBits < 4) {
6075 SelBits *= 8;
6076 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6077 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6078 } else if (SelBits < 7) {
6079 SelBits = (SelBits & 3) * 8;
6080 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
6081 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
6082 } else if (SelBits == 0x0c) {
6083 Known.Zero |= 0xFFull << I;
6084 } else if (SelBits > 0x0c) {
6085 Known.One |= 0xFFull << I;
6086 }
6087 Sel >>= 8;
6088 }
6089 break;
6090 }
6091 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
6092 Known.Zero.setHighBits(24);
6093 break;
6094 }
6095 case AMDGPUISD::BUFFER_LOAD_USHORT: {
6096 Known.Zero.setHighBits(16);
6097 break;
6098 }
6099 case AMDGPUISD::LDS: {
6100 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
6101 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
6102
// LDS addresses fit in 16 bits, and the global's alignment pins the low bits.
6103 Known.Zero.setHighBits(16);
6104 Known.Zero.setLowBits(Log2(Alignment));
6105 break;
6106 }
6107 case AMDGPUISD::SMIN3:
6108 case AMDGPUISD::SMAX3:
6109 case AMDGPUISD::SMED3:
6110 case AMDGPUISD::UMIN3:
6111 case AMDGPUISD::UMAX3:
6112 case AMDGPUISD::UMED3: {
// The result is always one of the three operands, so only bits known in all
// three can be relied upon. Operands are queried lazily, bailing out as soon
// as one contributes nothing.
6113 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
6114 if (Known2.isUnknown())
6115 break;
6116
6117 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6118 if (Known1.isUnknown())
6119 break;
6120
6121 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6122 if (Known0.isUnknown())
6123 break;
6124
6125 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6126 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6127 Known.One = Known0.One & Known1.One & Known2.One;
6128 break;
6129 }
6131 unsigned IID = Op.getConstantOperandVal(0);
6132 switch (IID) {
6133 case Intrinsic::amdgcn_workitem_id_x:
6134 case Intrinsic::amdgcn_workitem_id_y:
6135 case Intrinsic::amdgcn_workitem_id_z: {
// The workitem ID is bounded by the subtarget's maximum for its dimension,
// so the leading bits of that bound are known zero.
6136 unsigned MaxValue = Subtarget->getMaxWorkitemID(
6138 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
6139 break;
6140 }
6141 default:
6142 break;
6143 }
6144 }
6145 }
6146 }
6147
// Target hook: reports how many high bits of AMDGPU-specific nodes are known
// to be copies of the sign bit. Returning 1 is always conservatively correct.
// NOTE(review): the opening signature line (6148) was dropped by extraction --
// verify against upstream.
6149 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6150 unsigned Depth) const {
6151 switch (Op.getOpcode()) {
6152 case AMDGPUISD::BFE_I32: {
6153 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6154 if (!Width)
6155 return 1;
6156
// A signed field of W bits leaves 32 - W + 1 sign-extension bits.
6157 unsigned SignBits = 32 - (Width->getZExtValue() & 0x1f) + 1;
6158 if (!isNullConstant(Op.getOperand(1)))
6159 return SignBits;
6160
6161 // TODO: Could probably figure something out with non-0 offsets.
// Offset 0: the source's own sign bits may exceed the extraction's.
6162 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6163 return std::max(SignBits, Op0SignBits);
6164 }
6165
6166 case AMDGPUISD::BFE_U32: {
// Unsigned extraction of W bits zero-fills the top 32 - W bits.
6167 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6168 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6169 }
6170
6171 case AMDGPUISD::CARRY:
6172 case AMDGPUISD::BORROW:
6173 return 31;
6174 case AMDGPUISD::BUFFER_LOAD_BYTE:
6175 return 25;
6176 case AMDGPUISD::BUFFER_LOAD_SHORT:
6177 return 17;
6178 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6179 return 24;
6180 case AMDGPUISD::BUFFER_LOAD_USHORT:
6181 return 16;
6182 case AMDGPUISD::FP_TO_FP16:
6183 return 16;
6184 case AMDGPUISD::SMIN3:
6185 case AMDGPUISD::SMAX3:
6186 case AMDGPUISD::SMED3:
6187 case AMDGPUISD::UMIN3:
6188 case AMDGPUISD::UMAX3:
6189 case AMDGPUISD::UMED3: {
// The result is one of the three operands, so the guaranteed sign-bit count
// is the minimum across them; bail early once any operand reports 1.
6190 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6191 if (Tmp2 == 1)
6192 return 1; // Early out.
6193
6194 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6195 if (Tmp1 == 1)
6196 return 1; // Early out.
6197
6198 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6199 if (Tmp0 == 1)
6200 return 1; // Early out.
6201
6202 return std::min({Tmp0, Tmp1, Tmp2});
6203 }
6204 default:
6205 return 1;
6206 }
6207 }
6208
// GlobalISel counterpart of ComputeNumSignBitsForTargetNode: reports known
// sign bits for AMDGPU-specific generic machine instructions.
// NOTE(review): the opening signature line (6209) was dropped by extraction --
// verify against upstream.
6210 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6211 const MachineRegisterInfo &MRI, unsigned Depth) const {
6212 const MachineInstr *MI = MRI.getVRegDef(R);
6213 if (!MI)
6214 return 1;
6215
6216 // TODO: Check range metadata on MMO.
6217 switch (MI->getOpcode()) {
6218 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6219 return 25;
6220 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6221 return 17;
6222 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6223 return 24;
6224 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6225 return 16;
6226 case AMDGPU::G_AMDGPU_SMED3:
6227 case AMDGPU::G_AMDGPU_UMED3: {
// The median is one of the three sources, so take the minimum sign-bit
// count, bailing early once any source reports 1.
6228 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6229 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6230 if (Tmp2 == 1)
6231 return 1;
6232 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6233 if (Tmp1 == 1)
6234 return 1;
6235 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6236 if (Tmp0 == 1)
6237 return 1;
6238 return std::min({Tmp0, Tmp1, Tmp2});
6239 }
6240 default:
6241 return 1;
6242 }
6243 }
6244
// Target hook: BFE nodes never introduce undef/poison themselves; everything
// else defers to the generic TargetLowering implementation.
// NOTE(review): the opening signature line (6245) and line 6254 (presumably
// the `return TargetLowering::canCreateUndefOrPoisonForTargetNode(` head of
// the call completed on 6255) were dropped by extraction -- verify against
// upstream.
6246 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6247 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6248 unsigned Opcode = Op.getOpcode();
6249 switch (Opcode) {
6250 case AMDGPUISD::BFE_I32:
6251 case AMDGPUISD::BFE_U32:
6252 return false;
6253 }
6255 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6256 }
6257
// Target hook: answers whether an AMDGPU-specific FP node can never produce a
// NaN. When SNaN is true only signaling NaNs need to be ruled out, which many
// of these operations quiet, so they answer true immediately in that mode.
// NOTE(review): the opening signature line (6258) and line 6323 (presumably
// the `case ISD::INTRINSIC_WO_CHAIN: {` label for the block starting at 6324)
// were dropped by extraction -- verify against upstream.
6259 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6260 unsigned Depth) const {
6261 unsigned Opcode = Op.getOpcode();
6262 switch (Opcode) {
6263 case AMDGPUISD::FMIN_LEGACY:
6264 case AMDGPUISD::FMAX_LEGACY: {
6265 if (SNaN)
6266 return true;
6267
6268 // TODO: Can check no nans on one of the operands for each one, but which
6269 // one?
6270 return false;
6271 }
6272 case AMDGPUISD::FMUL_LEGACY:
6273 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6274 if (SNaN)
6275 return true;
// NaN-free only if both inputs are NaN-free.
6276 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6277 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6278 }
6279 case AMDGPUISD::FMED3:
6280 case AMDGPUISD::FMIN3:
6281 case AMDGPUISD::FMAX3:
6282 case AMDGPUISD::FMINIMUM3:
6283 case AMDGPUISD::FMAXIMUM3:
6284 case AMDGPUISD::FMAD_FTZ: {
6285 if (SNaN)
6286 return true;
// Three-operand forms: all three inputs must be NaN-free.
6287 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6288 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6289 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6290 }
6291 case AMDGPUISD::CVT_F32_UBYTE0:
6292 case AMDGPUISD::CVT_F32_UBYTE1:
6293 case AMDGPUISD::CVT_F32_UBYTE2:
// Byte-to-float conversions produce values in [0, 255]; never NaN.
6294 case AMDGPUISD::CVT_F32_UBYTE3:
6295 return true;
6296
6297 case AMDGPUISD::RCP:
6298 case AMDGPUISD::RSQ:
6299 case AMDGPUISD::RCP_LEGACY:
6300 case AMDGPUISD::RSQ_CLAMP: {
6301 if (SNaN)
6302 return true;
6303
6304 // TODO: Need is known positive check.
6305 return false;
6306 }
6307 case ISD::FLDEXP:
6308 case AMDGPUISD::FRACT: {
6309 if (SNaN)
6310 return true;
6311 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6312 }
6313 case AMDGPUISD::DIV_SCALE:
6314 case AMDGPUISD::DIV_FMAS:
6315 case AMDGPUISD::DIV_FIXUP:
6316 // TODO: Refine on operands.
6317 return SNaN;
6318 case AMDGPUISD::SIN_HW:
6319 case AMDGPUISD::COS_HW: {
6320 // TODO: Need check for infinity
6321 return SNaN;
6322 }
6324 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6325 // TODO: Handle more intrinsics
6326 switch (IntrinsicID) {
6327 case Intrinsic::amdgcn_cubeid:
6328 case Intrinsic::amdgcn_cvt_off_f32_i4:
6329 return true;
6330
6331 case Intrinsic::amdgcn_frexp_mant: {
6332 if (SNaN)
6333 return true;
6334 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6335 }
6336 case Intrinsic::amdgcn_cvt_pkrtz: {
6337 if (SNaN)
6338 return true;
6339 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6340 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6341 }
6342 case Intrinsic::amdgcn_rcp:
6343 case Intrinsic::amdgcn_rsq:
6344 case Intrinsic::amdgcn_rcp_legacy:
6345 case Intrinsic::amdgcn_rsq_legacy:
6346 case Intrinsic::amdgcn_rsq_clamp:
6347 case Intrinsic::amdgcn_tanh: {
6348 if (SNaN)
6349 return true;
6350
6351 // TODO: Need is known positive check.
6352 return false;
6353 }
6354 case Intrinsic::amdgcn_trig_preop:
6355 case Intrinsic::amdgcn_fdot2:
6356 // TODO: Refine on operand
6357 return SNaN;
6358 case Intrinsic::amdgcn_fma_legacy:
6359 if (SNaN)
6360 return true;
// Operands 1..3 are the FMA inputs (operand 0 is the intrinsic ID).
6361 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6362 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6363 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6364 default:
6365 return false;
6366 }
6367 }
6368 default:
6369 return false;
6370 }
6371 }
6372
// Target hook: reassociation is considered profitable only when the first
// operand has a single non-debug use.
// NOTE(review): the opening signature line (6373) was dropped by extraction --
// verify against upstream. N1 is unused here (see FIXME).
6374 Register N0, Register N1) const {
6375 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6376 }
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunctionInfo *MFI, SDValue Op, SelectionDAG &DAG) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXPUnsafeImpl(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags, bool IsExp10) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue lowerFEXPF64(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
AMDGPUTargetLowering(const TargetMachine &TM, const TargetSubtargetInfo &STI, const AMDGPUSubtarget &AMDGPUSTI)
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
bool isInt64ImmLegal(SDNode *Val, SelectionDAG &DAG) const
Check whether value Val can be supported by v_mov_b64, for the current target.
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue LowerCTLS(SDValue Op, SelectionDAG &DAG) const
Split a vector store into multiple scalar stores.
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand of vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1481
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1236
const fltSemantics & getSemantics() const
Definition APFloat.h:1524
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1254
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1213
APInt bitcastToAPInt() const
Definition APFloat.h:1408
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1153
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1406
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1400
static APInt getMaxValue(unsigned numBits)
Gets maximum unsigned value of APInt for specific bit width.
Definition APInt.h:207
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
static APInt getSignedMaxValue(unsigned numBits)
Gets maximum signed value of APInt for a specific bit width.
Definition APInt.h:210
static APInt getSignedMinValue(unsigned numBits)
Gets minimum signed value of APInt for a specific bit width.
Definition APInt.h:220
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1157
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1403
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:362
iterator_range< arg_iterator > args()
Definition Function.h:892
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These method get and set the calling convention of this functio...
Definition Function.h:272
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:358
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value,.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI bool hasOneNonDBGUse(Register RegNo) const
hasOneNonDBGUse - Return true if there is exactly one non-Debug use of the specified register.
LLVM_ABI MachineInstr * getVRegDef(Register Reg) const
getVRegDef - Return the machine instr that defines the specified virtual register or null if none is ...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
LLVM_ABI bool isLiveIn(Register Reg) const
LLVM_ABI Register getLiveInVirtReg(MCRegister PReg) const
getLiveInVirtReg - If PReg is a live-in physical register, return the corresponding live-in virtual r...
void addLiveIn(MCRegister Reg, Register vreg=Register())
addLiveIn - Add the specified register as a live-in.
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT, unsigned Opcode)
Convert Op, which must be of integer type, to the integer type VT, by either any/sign/zero-extending ...
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
const TargetSubtargetInfo & getSubtarget() const
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false, SDNodeFlags Flags={})
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts new call frame, in which InSize bytes are set up inside ...
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetSubtargetInfo - Generic base class for all target subtargets.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:819
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:788
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:779
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:294
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:522
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:853
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:880
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
Definition ISDOpcodes.h:993
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:844
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:787
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BRIND
BRIND - Indirect branch.
@ BR_JT
BR_JT - Jumptable branch.
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:541
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:548
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:796
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:792
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ STRICT_FP16_TO_FP
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:765
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ FMINNUM_IEEE
FMINNUM_IEEE/FMAXNUM_IEEE - Perform floating-point minimumNumber or maximumNumber on two values,...
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:850
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:811
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:888
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:978
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:805
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ INLINEASM_BR
INLINEASM_BR - Branching version of inline asm. Used by asm-goto.
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treat -0.0 as less than 0....
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:926
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:304
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:959
@ ADDRSPACECAST
ADDRSPACECAST - This operator converts between pointers of different address spaces.
Definition ISDOpcodes.h:997
@ INLINEASM
INLINEASM - Represents an inline asm block.
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:945
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:856
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ FMINIMUMNUM
FMINIMUMNUM/FMAXIMUMNUM - minimumnum/maximumnum that is same with FMINNUM_IEEE and FMAXNUM_IEEE besid...
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:532
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:261
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1636
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:403
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:493
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:129
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:292
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:479
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:381
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:251
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:393
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:438
bool isPow2VectorType() const
Returns true if the given vector is a power of 2.
Definition ValueTypes.h:486
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:420
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:324
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:389
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:427
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:331
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:300
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:336
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:150
EVT changeElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a type whose attributes match ourselves with the exception of the element type that i...
Definition ValueTypes.h:121
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:344
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:316
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:258
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:167
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:312
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:264
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:148
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:132
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:285
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input or output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...