1//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// \file
10/// This is the parent TargetLowering class for hardware code gen
11/// targets.
12//
13//===----------------------------------------------------------------------===//
14
15#include "AMDGPUISelLowering.h"
16#include "AMDGPU.h"
17#include "AMDGPUInstrInfo.h"
19#include "AMDGPUMemoryUtils.h"
25#include "llvm/IR/IntrinsicsAMDGPU.h"
29
30using namespace llvm;
31
32#include "AMDGPUGenCallingConv.inc"
33
35 "amdgpu-bypass-slow-div",
36 cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
37 cl::init(true));
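// As a rough usage sketch (assumed invocation, not from this file): the option
// above can be toggled when running the backend directly, e.g.
//   llc -mtriple=amdgcn -amdgpu-bypass-slow-div=0 ...
// or passed through a clang driver with -mllvm; only the option name itself
// comes from the declaration above.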
38
39// Find a larger type to do a load / store of a vector with.
41 unsigned StoreSize = VT.getStoreSizeInBits();
42 if (StoreSize <= 32)
43 return EVT::getIntegerVT(Ctx, StoreSize);
44
45 if (StoreSize % 32 == 0)
46 return EVT::getVectorVT(Ctx, MVT::i32, StoreSize / 32);
47
48 return VT;
49}
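// A few illustrative cases of the mapping above (assuming the usual
// EVT::getStoreSizeInBits semantics):
//   v2i8  (16 bits) -> i16    (fits in a single sub-dword integer)
//   v4i16 (64 bits) -> v2i32  (whole number of 32-bit dwords)
//   v3i16 (48 bits) -> v3i16  (not a dword multiple, left unchanged)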
50
54
56 // In order for this to be a signed 24-bit value, bit 23 must
57 // be a sign bit.
58 return DAG.ComputeMaxSignificantBits(Op);
59}
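// For example (illustrative): for Op = (sext i16 %x to i32) this returns at
// most 16, so callers such as the 24-bit multiply combines can treat the
// value as a legal signed 24-bit operand.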
60
62 const AMDGPUSubtarget &STI)
63 : TargetLowering(TM), Subtarget(&STI) {
64 // Always lower memset, memcpy, and memmove intrinsics to load/store
65 // instructions, rather than generating calls to memset, memcpy, or memmove.
69
70 // Enable ganging up loads and stores in the memcpy DAG lowering.
72
73 // Lower floating point store/load to integer store/load to reduce the number
74 // of patterns in tablegen.
75 setOperationAction(ISD::LOAD, MVT::f32, Promote);
76 AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
77
78 setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
79 AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
80
81 setOperationAction(ISD::LOAD, MVT::v3f32, Promote);
82 AddPromotedToType(ISD::LOAD, MVT::v3f32, MVT::v3i32);
83
84 setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
85 AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
86
87 setOperationAction(ISD::LOAD, MVT::v5f32, Promote);
88 AddPromotedToType(ISD::LOAD, MVT::v5f32, MVT::v5i32);
89
90 setOperationAction(ISD::LOAD, MVT::v6f32, Promote);
91 AddPromotedToType(ISD::LOAD, MVT::v6f32, MVT::v6i32);
92
93 setOperationAction(ISD::LOAD, MVT::v7f32, Promote);
94 AddPromotedToType(ISD::LOAD, MVT::v7f32, MVT::v7i32);
95
96 setOperationAction(ISD::LOAD, MVT::v8f32, Promote);
97 AddPromotedToType(ISD::LOAD, MVT::v8f32, MVT::v8i32);
98
99 setOperationAction(ISD::LOAD, MVT::v9f32, Promote);
100 AddPromotedToType(ISD::LOAD, MVT::v9f32, MVT::v9i32);
101
102 setOperationAction(ISD::LOAD, MVT::v10f32, Promote);
103 AddPromotedToType(ISD::LOAD, MVT::v10f32, MVT::v10i32);
104
105 setOperationAction(ISD::LOAD, MVT::v11f32, Promote);
106 AddPromotedToType(ISD::LOAD, MVT::v11f32, MVT::v11i32);
107
108 setOperationAction(ISD::LOAD, MVT::v12f32, Promote);
109 AddPromotedToType(ISD::LOAD, MVT::v12f32, MVT::v12i32);
110
111 setOperationAction(ISD::LOAD, MVT::v16f32, Promote);
112 AddPromotedToType(ISD::LOAD, MVT::v16f32, MVT::v16i32);
113
114 setOperationAction(ISD::LOAD, MVT::v32f32, Promote);
115 AddPromotedToType(ISD::LOAD, MVT::v32f32, MVT::v32i32);
116
117 setOperationAction(ISD::LOAD, MVT::i64, Promote);
118 AddPromotedToType(ISD::LOAD, MVT::i64, MVT::v2i32);
119
120 setOperationAction(ISD::LOAD, MVT::v2i64, Promote);
121 AddPromotedToType(ISD::LOAD, MVT::v2i64, MVT::v4i32);
122
123 setOperationAction(ISD::LOAD, MVT::f64, Promote);
124 AddPromotedToType(ISD::LOAD, MVT::f64, MVT::v2i32);
125
126 setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
127 AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
128
129 setOperationAction(ISD::LOAD, MVT::v3i64, Promote);
130 AddPromotedToType(ISD::LOAD, MVT::v3i64, MVT::v6i32);
131
132 setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
133 AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
134
135 setOperationAction(ISD::LOAD, MVT::v3f64, Promote);
136 AddPromotedToType(ISD::LOAD, MVT::v3f64, MVT::v6i32);
137
138 setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
139 AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
140
141 setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
142 AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
143
144 setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
145 AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
146
147 setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
148 AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
149
150 setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
151 AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
152
153 setOperationAction(ISD::LOAD, MVT::i128, Promote);
154 AddPromotedToType(ISD::LOAD, MVT::i128, MVT::v4i32);
155
156 // TODO: Would be better to handle these as directly legal instead of promoting.
157 setOperationAction(ISD::ATOMIC_LOAD, MVT::f32, Promote);
158 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f32, MVT::i32);
159
160 setOperationAction(ISD::ATOMIC_LOAD, MVT::f64, Promote);
161 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f64, MVT::i64);
162
163 setOperationAction(ISD::ATOMIC_LOAD, MVT::f16, Promote);
164 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::f16, MVT::i16);
165
166 setOperationAction(ISD::ATOMIC_LOAD, MVT::bf16, Promote);
167 AddPromotedToType(ISD::ATOMIC_LOAD, MVT::bf16, MVT::i16);
168
169 setOperationAction(ISD::ATOMIC_STORE, MVT::f32, Promote);
170 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f32, MVT::i32);
171
172 setOperationAction(ISD::ATOMIC_STORE, MVT::f64, Promote);
173 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f64, MVT::i64);
174
175 setOperationAction(ISD::ATOMIC_STORE, MVT::f16, Promote);
176 AddPromotedToType(ISD::ATOMIC_STORE, MVT::f16, MVT::i16);
177
178 setOperationAction(ISD::ATOMIC_STORE, MVT::bf16, Promote);
179 AddPromotedToType(ISD::ATOMIC_STORE, MVT::bf16, MVT::i16);
180
181 // There are no 64-bit extloads. These should be done as a 32-bit extload and
182 // an extension to 64-bit.
183 for (MVT VT : MVT::integer_valuetypes())
185 Expand);
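// Roughly, the expansion turns e.g.
//   (sextload i64, <ptr>, i16)
// into
//   (sign_extend i64, (sextload i32, <ptr>, i16))
// i.e. a 32-bit extending load followed by a 64-bit extension.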
186
187 for (MVT VT : MVT::integer_valuetypes()) {
188 if (VT == MVT::i64)
189 continue;
190
191 for (auto Op : {ISD::SEXTLOAD, ISD::ZEXTLOAD, ISD::EXTLOAD}) {
192 setLoadExtAction(Op, VT, MVT::i1, Promote);
193 setLoadExtAction(Op, VT, MVT::i8, Legal);
194 setLoadExtAction(Op, VT, MVT::i16, Legal);
195 setLoadExtAction(Op, VT, MVT::i32, Expand);
196 }
197 }
198
200 for (auto MemVT :
201 {MVT::v2i8, MVT::v4i8, MVT::v2i16, MVT::v3i16, MVT::v4i16})
203 Expand);
204
205 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
206 setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::bf16, Expand);
207 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2f16, Expand);
208 setLoadExtAction(ISD::EXTLOAD, MVT::v2f32, MVT::v2bf16, Expand);
209 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3f16, Expand);
210 setLoadExtAction(ISD::EXTLOAD, MVT::v3f32, MVT::v3bf16, Expand);
211 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4f16, Expand);
212 setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, MVT::v4bf16, Expand);
213 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8f16, Expand);
214 setLoadExtAction(ISD::EXTLOAD, MVT::v8f32, MVT::v8bf16, Expand);
215 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16f16, Expand);
216 setLoadExtAction(ISD::EXTLOAD, MVT::v16f32, MVT::v16bf16, Expand);
217 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32f16, Expand);
218 setLoadExtAction(ISD::EXTLOAD, MVT::v32f32, MVT::v32bf16, Expand);
219
220 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f32, Expand);
221 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
222 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f32, Expand);
223 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
224 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
225 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
226
227 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
228 setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::bf16, Expand);
229 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
230 setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2bf16, Expand);
231 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3f16, Expand);
232 setLoadExtAction(ISD::EXTLOAD, MVT::v3f64, MVT::v3bf16, Expand);
233 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
234 setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4bf16, Expand);
235 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
236 setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8bf16, Expand);
237 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
238 setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16bf16, Expand);
239
240 setOperationAction(ISD::STORE, MVT::f32, Promote);
241 AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
242
243 setOperationAction(ISD::STORE, MVT::v2f32, Promote);
244 AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
245
246 setOperationAction(ISD::STORE, MVT::v3f32, Promote);
247 AddPromotedToType(ISD::STORE, MVT::v3f32, MVT::v3i32);
248
249 setOperationAction(ISD::STORE, MVT::v4f32, Promote);
250 AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
251
252 setOperationAction(ISD::STORE, MVT::v5f32, Promote);
253 AddPromotedToType(ISD::STORE, MVT::v5f32, MVT::v5i32);
254
255 setOperationAction(ISD::STORE, MVT::v6f32, Promote);
256 AddPromotedToType(ISD::STORE, MVT::v6f32, MVT::v6i32);
257
258 setOperationAction(ISD::STORE, MVT::v7f32, Promote);
259 AddPromotedToType(ISD::STORE, MVT::v7f32, MVT::v7i32);
260
261 setOperationAction(ISD::STORE, MVT::v8f32, Promote);
262 AddPromotedToType(ISD::STORE, MVT::v8f32, MVT::v8i32);
263
264 setOperationAction(ISD::STORE, MVT::v9f32, Promote);
265 AddPromotedToType(ISD::STORE, MVT::v9f32, MVT::v9i32);
266
267 setOperationAction(ISD::STORE, MVT::v10f32, Promote);
268 AddPromotedToType(ISD::STORE, MVT::v10f32, MVT::v10i32);
269
270 setOperationAction(ISD::STORE, MVT::v11f32, Promote);
271 AddPromotedToType(ISD::STORE, MVT::v11f32, MVT::v11i32);
272
273 setOperationAction(ISD::STORE, MVT::v12f32, Promote);
274 AddPromotedToType(ISD::STORE, MVT::v12f32, MVT::v12i32);
275
276 setOperationAction(ISD::STORE, MVT::v16f32, Promote);
277 AddPromotedToType(ISD::STORE, MVT::v16f32, MVT::v16i32);
278
279 setOperationAction(ISD::STORE, MVT::v32f32, Promote);
280 AddPromotedToType(ISD::STORE, MVT::v32f32, MVT::v32i32);
281
282 setOperationAction(ISD::STORE, MVT::i64, Promote);
283 AddPromotedToType(ISD::STORE, MVT::i64, MVT::v2i32);
284
285 setOperationAction(ISD::STORE, MVT::v2i64, Promote);
286 AddPromotedToType(ISD::STORE, MVT::v2i64, MVT::v4i32);
287
288 setOperationAction(ISD::STORE, MVT::f64, Promote);
289 AddPromotedToType(ISD::STORE, MVT::f64, MVT::v2i32);
290
291 setOperationAction(ISD::STORE, MVT::v2f64, Promote);
292 AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
293
294 setOperationAction(ISD::STORE, MVT::v3i64, Promote);
295 AddPromotedToType(ISD::STORE, MVT::v3i64, MVT::v6i32);
296
297 setOperationAction(ISD::STORE, MVT::v3f64, Promote);
298 AddPromotedToType(ISD::STORE, MVT::v3f64, MVT::v6i32);
299
300 setOperationAction(ISD::STORE, MVT::v4i64, Promote);
301 AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
302
303 setOperationAction(ISD::STORE, MVT::v4f64, Promote);
304 AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
305
306 setOperationAction(ISD::STORE, MVT::v8i64, Promote);
307 AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
308
309 setOperationAction(ISD::STORE, MVT::v8f64, Promote);
310 AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
311
312 setOperationAction(ISD::STORE, MVT::v16i64, Promote);
313 AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
314
315 setOperationAction(ISD::STORE, MVT::v16f64, Promote);
316 AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
317
318 setOperationAction(ISD::STORE, MVT::i128, Promote);
319 AddPromotedToType(ISD::STORE, MVT::i128, MVT::v4i32);
320
321 setTruncStoreAction(MVT::i64, MVT::i1, Expand);
322 setTruncStoreAction(MVT::i64, MVT::i8, Expand);
323 setTruncStoreAction(MVT::i64, MVT::i16, Expand);
324 setTruncStoreAction(MVT::i64, MVT::i32, Expand);
325
326 setTruncStoreAction(MVT::v2i64, MVT::v2i1, Expand);
327 setTruncStoreAction(MVT::v2i64, MVT::v2i8, Expand);
328 setTruncStoreAction(MVT::v2i64, MVT::v2i16, Expand);
329 setTruncStoreAction(MVT::v2i64, MVT::v2i32, Expand);
330
331 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
332 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
333 setTruncStoreAction(MVT::v2f32, MVT::v2bf16, Expand);
334 setTruncStoreAction(MVT::v2f32, MVT::v2f16, Expand);
335 setTruncStoreAction(MVT::v3f32, MVT::v3bf16, Expand);
336 setTruncStoreAction(MVT::v3f32, MVT::v3f16, Expand);
337 setTruncStoreAction(MVT::v4f32, MVT::v4bf16, Expand);
338 setTruncStoreAction(MVT::v4f32, MVT::v4f16, Expand);
339 setTruncStoreAction(MVT::v8f32, MVT::v8bf16, Expand);
340 setTruncStoreAction(MVT::v8f32, MVT::v8f16, Expand);
341 setTruncStoreAction(MVT::v16f32, MVT::v16bf16, Expand);
342 setTruncStoreAction(MVT::v16f32, MVT::v16f16, Expand);
343 setTruncStoreAction(MVT::v32f32, MVT::v32bf16, Expand);
344 setTruncStoreAction(MVT::v32f32, MVT::v32f16, Expand);
345
346 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
347 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
348 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
349
350 setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
351 setTruncStoreAction(MVT::v2f64, MVT::v2bf16, Expand);
352 setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
353
354 setTruncStoreAction(MVT::v3i32, MVT::v3i8, Expand);
355
356 setTruncStoreAction(MVT::v3i64, MVT::v3i32, Expand);
357 setTruncStoreAction(MVT::v3i64, MVT::v3i16, Expand);
358 setTruncStoreAction(MVT::v3i64, MVT::v3i8, Expand);
359 setTruncStoreAction(MVT::v3i64, MVT::v3i1, Expand);
360 setTruncStoreAction(MVT::v3f64, MVT::v3f32, Expand);
361 setTruncStoreAction(MVT::v3f64, MVT::v3bf16, Expand);
362 setTruncStoreAction(MVT::v3f64, MVT::v3f16, Expand);
363
364 setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
365 setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
366 setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
367 setTruncStoreAction(MVT::v4f64, MVT::v4bf16, Expand);
368 setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
369
370 setTruncStoreAction(MVT::v5i32, MVT::v5i1, Expand);
371 setTruncStoreAction(MVT::v5i32, MVT::v5i8, Expand);
372 setTruncStoreAction(MVT::v5i32, MVT::v5i16, Expand);
373
374 setTruncStoreAction(MVT::v6i32, MVT::v6i1, Expand);
375 setTruncStoreAction(MVT::v6i32, MVT::v6i8, Expand);
376 setTruncStoreAction(MVT::v6i32, MVT::v6i16, Expand);
377
378 setTruncStoreAction(MVT::v7i32, MVT::v7i1, Expand);
379 setTruncStoreAction(MVT::v7i32, MVT::v7i8, Expand);
380 setTruncStoreAction(MVT::v7i32, MVT::v7i16, Expand);
381
382 setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
383 setTruncStoreAction(MVT::v8f64, MVT::v8bf16, Expand);
384 setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
385
386 setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
387 setTruncStoreAction(MVT::v16f64, MVT::v16bf16, Expand);
388 setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
389 setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
390 setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
392 setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
393
394 setOperationAction(ISD::Constant, {MVT::i32, MVT::i64}, Legal);
395 setOperationAction(ISD::ConstantFP, {MVT::f32, MVT::f64}, Legal);
396
397 setOperationAction({ISD::BR_JT, ISD::BRIND}, MVT::Other, Expand);
398
399 // For R600, this is totally unsupported, just custom lower to produce an
400 // error.
401 setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
402
403 // Library functions. These default to Expand, but we have instructions
404 // for them.
405 setOperationAction({ISD::FCEIL, ISD::FPOW, ISD::FABS, ISD::FFLOOR,
406 ISD::FROUNDEVEN, ISD::FTRUNC},
407 {MVT::f16, MVT::f32}, Legal);
408 setOperationAction({ISD::FMINNUM, ISD::FMAXNUM}, MVT::f32, Legal);
409
410 setOperationAction(ISD::FLOG2, MVT::f32, Custom);
411 setOperationAction(ISD::FROUND, {MVT::f32, MVT::f64}, Custom);
412 setOperationAction({ISD::LROUND, ISD::LLROUND},
413 {MVT::f16, MVT::f32, MVT::f64}, Expand);
414
416 {ISD::FLOG, ISD::FLOG10, ISD::FEXP, ISD::FEXP2, ISD::FEXP10}, MVT::f32,
417 Custom);
418
419 setOperationAction(ISD::FNEARBYINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
420
421 setOperationAction(ISD::FRINT, {MVT::f16, MVT::f32, MVT::f64}, Custom);
422
423 setOperationAction({ISD::LRINT, ISD::LLRINT}, {MVT::f16, MVT::f32, MVT::f64},
424 Expand);
425
426 setOperationAction(ISD::FREM, {MVT::f16, MVT::f32, MVT::f64}, Expand);
427
428 if (Subtarget->has16BitInsts()) {
429 setOperationAction(ISD::IS_FPCLASS, {MVT::f16, MVT::f32, MVT::f64}, Legal);
430 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Legal);
431 } else {
432 setOperationAction(ISD::IS_FPCLASS, {MVT::f32, MVT::f64}, Legal);
433 setOperationAction({ISD::FLOG2, ISD::FEXP2}, MVT::f16, Custom);
434 }
435
436 setOperationAction({ISD::FLOG10, ISD::FLOG, ISD::FEXP, ISD::FEXP10}, MVT::f16,
437 Custom);
438
439 setOperationAction(ISD::FCANONICALIZE, {MVT::f32, MVT::f64}, Legal);
440 if (Subtarget->has16BitInsts()) {
442 }
443
444 // FIXME: These IS_FPCLASS vector fp types are marked custom so it reaches
445 // scalarization code. Can be removed when IS_FPCLASS expand isn't called by
446 // default unless marked custom/legal.
448 {MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32,
449 MVT::v6f32, MVT::v7f32, MVT::v8f32, MVT::v16f32,
450 MVT::v2f64, MVT::v3f64, MVT::v4f64, MVT::v8f64,
451 MVT::v16f64},
452 Custom);
453
454 if (isTypeLegal(MVT::f16))
456 {MVT::v2f16, MVT::v3f16, MVT::v4f16, MVT::v16f16},
457 Custom);
458
459 // Expand to fneg + fadd.
461
463 {MVT::v3i32, MVT::v3f32, MVT::v4i32, MVT::v4f32,
464 MVT::v5i32, MVT::v5f32, MVT::v6i32, MVT::v6f32,
465 MVT::v7i32, MVT::v7f32, MVT::v8i32, MVT::v8f32,
466 MVT::v9i32, MVT::v9f32, MVT::v10i32, MVT::v10f32,
467 MVT::v11i32, MVT::v11f32, MVT::v12i32, MVT::v12f32},
468 Custom);
469
472 {MVT::v2f32, MVT::v2i32, MVT::v3f32, MVT::v3i32, MVT::v4f32,
473 MVT::v4i32, MVT::v5f32, MVT::v5i32, MVT::v6f32, MVT::v6i32,
474 MVT::v7f32, MVT::v7i32, MVT::v8f32, MVT::v8i32, MVT::v9f32,
475 MVT::v9i32, MVT::v10i32, MVT::v10f32, MVT::v11i32, MVT::v11f32,
476 MVT::v12i32, MVT::v12f32, MVT::v16i32, MVT::v32f32, MVT::v32i32,
477 MVT::v2f64, MVT::v2i64, MVT::v3f64, MVT::v3i64, MVT::v4f64,
478 MVT::v4i64, MVT::v8f64, MVT::v8i64, MVT::v16f64, MVT::v16i64},
479 Custom);
480
481 setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
482 setOperationAction(ISD::FP_TO_FP16, {MVT::f64, MVT::f32}, Custom);
483
484 const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
485 for (MVT VT : ScalarIntVTs) {
486 // These should use [SU]DIVREM, so set them to expand
488 Expand);
489
490 // GPU does not have divrem function for signed or unsigned.
492
493 // GPU does not have [S|U]MUL_LOHI functions as a single instruction.
495
497
498 // AMDGPU uses ADDC/SUBC/ADDE/SUBE
500 }
501
502 // The hardware supports 32-bit FSHR, but not FSHL.
504
505 // The hardware supports 32-bit ROTR, but not ROTL.
506 setOperationAction(ISD::ROTL, {MVT::i32, MVT::i64}, Expand);
508
510
514 MVT::i64, Custom);
516
518 Legal);
519
522 MVT::i64, Custom);
523
524 for (auto VT : {MVT::i8, MVT::i16})
526
527 static const MVT::SimpleValueType VectorIntTypes[] = {
528 MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32,
529 MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32};
530
531 for (MVT VT : VectorIntTypes) {
532 // Expand the following operations for the current type by default.
544 ISD::SETCC, ISD::ADDRSPACECAST},
545 VT, Expand);
546 }
547
548 static const MVT::SimpleValueType FloatVectorTypes[] = {
549 MVT::v2f32, MVT::v3f32, MVT::v4f32, MVT::v5f32, MVT::v6f32, MVT::v7f32,
550 MVT::v9f32, MVT::v10f32, MVT::v11f32, MVT::v12f32};
551
552 for (MVT VT : FloatVectorTypes) {
554 {ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM,
555 ISD::FADD, ISD::FCEIL, ISD::FCOS,
556 ISD::FDIV, ISD::FEXP2, ISD::FEXP,
557 ISD::FEXP10, ISD::FLOG2, ISD::FREM,
558 ISD::FLOG, ISD::FLOG10, ISD::FPOW,
559 ISD::FFLOOR, ISD::FTRUNC, ISD::FMUL,
560 ISD::FMA, ISD::FRINT, ISD::FNEARBYINT,
561 ISD::FSQRT, ISD::FSIN, ISD::FSUB,
562 ISD::FNEG, ISD::VSELECT, ISD::SELECT_CC,
564 ISD::FCANONICALIZE, ISD::FROUNDEVEN},
565 VT, Expand);
566 }
567
568 // This causes using an unrolled select operation rather than expansion with
569 // bit operations. This is in general better, but the alternative using BFI
570 // instructions may be better if the select sources are SGPRs.
572 AddPromotedToType(ISD::SELECT, MVT::v2f32, MVT::v2i32);
573
575 AddPromotedToType(ISD::SELECT, MVT::v3f32, MVT::v3i32);
576
578 AddPromotedToType(ISD::SELECT, MVT::v4f32, MVT::v4i32);
579
581 AddPromotedToType(ISD::SELECT, MVT::v5f32, MVT::v5i32);
582
584 AddPromotedToType(ISD::SELECT, MVT::v6f32, MVT::v6i32);
585
587 AddPromotedToType(ISD::SELECT, MVT::v7f32, MVT::v7i32);
588
590 AddPromotedToType(ISD::SELECT, MVT::v9f32, MVT::v9i32);
591
593 AddPromotedToType(ISD::SELECT, MVT::v10f32, MVT::v10i32);
594
596 AddPromotedToType(ISD::SELECT, MVT::v11f32, MVT::v11i32);
597
599 AddPromotedToType(ISD::SELECT, MVT::v12f32, MVT::v12i32);
600
602 setJumpIsExpensive(true);
603
606
608
609 // We want to find all load dependencies for long chains of stores to enable
610 // merging into very wide vectors. The problem is with vectors with > 4
611 // elements. MergeConsecutiveStores will attempt to merge these because x8/x16
612 // vectors are a legal type, even though we have to split the loads
613 // usually. When we can more precisely specify load legality per address
614 // space, we should be able to make FindBetterChain/MergeConsecutiveStores
615 // smarter so that they can figure out what to do in 2 iterations without all
616 // N > 4 stores on the same chain.
618
619 // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
620 // about these during lowering.
621 MaxStoresPerMemcpy = 0xffffffff;
622 MaxStoresPerMemmove = 0xffffffff;
623 MaxStoresPerMemset = 0xffffffff;
624
625 // The expansion for 64-bit division is enormous.
627 addBypassSlowDiv(64, 32);
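// Conceptually (a rough sketch of the run-time check the bypass inserts):
//   if (((Num | Den) >> 32) == 0)   // both operands fit in 32 bits
//     use a 32-bit udiv/urem on the low halves;
//   else
//     fall back to the full 64-bit expansion.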
628
629 setTargetDAGCombine({ISD::BITCAST, ISD::SHL,
635 ISD::STORE, ISD::FADD,
636 ISD::FSUB, ISD::FNEG,
637 ISD::FABS, ISD::AssertZext,
639
643}
644
646 if (getTargetMachine().Options.NoSignedZerosFPMath)
647 return true;
648
649 const auto Flags = Op.getNode()->getFlags();
650 if (Flags.hasNoSignedZeros())
651 return true;
652
653 return false;
654}
655
656//===----------------------------------------------------------------------===//
657// Target Information
658//===----------------------------------------------------------------------===//
659
661static bool fnegFoldsIntoOpcode(unsigned Opc) {
662 switch (Opc) {
663 case ISD::FADD:
664 case ISD::FSUB:
665 case ISD::FMUL:
666 case ISD::FMA:
667 case ISD::FMAD:
668 case ISD::FMINNUM:
669 case ISD::FMAXNUM:
670 case ISD::FMINNUM_IEEE:
671 case ISD::FMAXNUM_IEEE:
672 case ISD::FMINIMUM:
673 case ISD::FMAXIMUM:
674 case ISD::FMINIMUMNUM:
675 case ISD::FMAXIMUMNUM:
676 case ISD::SELECT:
677 case ISD::FSIN:
678 case ISD::FTRUNC:
679 case ISD::FRINT:
680 case ISD::FNEARBYINT:
681 case ISD::FROUNDEVEN:
683 case AMDGPUISD::RCP:
690 case AMDGPUISD::FMED3:
691 // TODO: handle llvm.amdgcn.fma.legacy
692 return true;
693 case ISD::BITCAST:
694 llvm_unreachable("bitcast is special cased");
695 default:
696 return false;
697 }
698}
699
700static bool fnegFoldsIntoOp(const SDNode *N) {
701 unsigned Opc = N->getOpcode();
702 if (Opc == ISD::BITCAST) {
703 // TODO: Is there a benefit to checking the conditions performFNegCombine
704 // does? We don't for the other cases.
705 SDValue BCSrc = N->getOperand(0);
706 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
707 return BCSrc.getNumOperands() == 2 &&
708 BCSrc.getOperand(1).getValueSizeInBits() == 32;
709 }
710
711 return BCSrc.getOpcode() == ISD::SELECT && BCSrc.getValueType() == MVT::f32;
712 }
713
714 return fnegFoldsIntoOpcode(Opc);
715}
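// A typical fold this enables (illustrative):
//   (fneg (fmul x, y)) --> (fmul (fneg x), y)
// where the inner fneg is then free as a VALU source modifier.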
716
717/// \returns true if the operation will definitely need to use a 64-bit
718/// encoding, and thus will use a VOP3 encoding regardless of the source
719/// modifiers.
721static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
722 return (N->getNumOperands() > 2 && N->getOpcode() != ISD::SELECT) ||
723 VT == MVT::f64;
724}
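// For instance, FMA/FMAD take three operands and f64 operations use the
// 64-bit encoding anyway, so applying a source modifier to them cannot grow
// the encoding; a two-operand f32 op, by contrast, may be forced from VOP2 up
// to VOP3 by a modifier.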
725
726/// Return true if v_cndmask_b32 will support fabs/fneg source modifiers for
727/// the value type used by ISD::SELECT.
729static bool selectSupportsSourceMods(const SDNode *N) {
730 // TODO: Only applies if select will be vector
731 return N->getValueType(0) == MVT::f32;
732}
733
734// Most FP instructions support source modifiers, but this could be refined
735// slightly.
737static bool hasSourceMods(const SDNode *N) {
738 if (isa<MemSDNode>(N))
739 return false;
740
741 switch (N->getOpcode()) {
742 case ISD::CopyToReg:
743 case ISD::FDIV:
744 case ISD::FREM:
745 case ISD::INLINEASM:
746 case ISD::INLINEASM_BR:
749
750 // TODO: Should really be looking at the users of the bitcast. These are
751 // problematic because bitcasts are used to legalize all stores to integer
752 // types.
753 case ISD::BITCAST:
754 return false;
756 switch (N->getConstantOperandVal(0)) {
757 case Intrinsic::amdgcn_interp_p1:
758 case Intrinsic::amdgcn_interp_p2:
759 case Intrinsic::amdgcn_interp_mov:
760 case Intrinsic::amdgcn_interp_p1_f16:
761 case Intrinsic::amdgcn_interp_p2_f16:
762 return false;
763 default:
764 return true;
765 }
766 }
767 case ISD::SELECT:
769 default:
770 return true;
771 }
772}
773
775 unsigned CostThreshold) {
776 // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus
777 // it is truly free to use a source modifier in all cases. If there are
778 // multiple users, and using a source modifier would force some of them into a VOP3 encoding, there will be
779 // a code size increase. Try to avoid increasing code size unless we know it
780 // will save on the instruction count.
781 unsigned NumMayIncreaseSize = 0;
782 MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
783
784 assert(!N->use_empty());
785
786 // XXX - Should this limit number of uses to check?
787 for (const SDNode *U : N->users()) {
788 if (!hasSourceMods(U))
789 return false;
790
791 if (!opMustUseVOP3Encoding(U, VT)) {
792 if (++NumMayIncreaseSize > CostThreshold)
793 return false;
794 }
795 }
796
797 return true;
798}
799
801 ISD::NodeType ExtendKind) const {
802 assert(!VT.isVector() && "only scalar expected");
803
804 // Round to the next multiple of 32-bits.
805 unsigned Size = VT.getSizeInBits();
806 if (Size <= 32)
807 return MVT::i32;
808 return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
809}
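// Illustrative results of the rounding above: i1..i32 -> i32, i33..i64 -> i64,
// i65 -> i96 (three 32-bit pieces).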
810
812 return 32;
813}
814
816 return true;
817}
818
819// The backend supports 32 and 64 bit floating point immediates.
820// FIXME: Why are we reporting vectors of FP immediates as legal?
822 bool ForCodeSize) const {
823 EVT ScalarVT = VT.getScalarType();
824 return (ScalarVT == MVT::f32 || ScalarVT == MVT::f64 ||
825 (ScalarVT == MVT::f16 && Subtarget->has16BitInsts()));
826}
827
828// We don't want to shrink f64 / f32 constants.
830 EVT ScalarVT = VT.getScalarType();
831 return (ScalarVT != MVT::f32 && ScalarVT != MVT::f64);
832}
833
835 SDNode *N, ISD::LoadExtType ExtTy, EVT NewVT,
836 std::optional<unsigned> ByteOffset) const {
837 // TODO: This may be worth removing. Check regression tests for diffs.
838 if (!TargetLoweringBase::shouldReduceLoadWidth(N, ExtTy, NewVT, ByteOffset))
839 return false;
840
841 unsigned NewSize = NewVT.getStoreSizeInBits();
842
843 // If we are reducing to a 32-bit load or a smaller multi-dword load,
844 // this is always better.
845 if (NewSize >= 32)
846 return true;
847
848 EVT OldVT = N->getValueType(0);
849 unsigned OldSize = OldVT.getStoreSizeInBits();
850
852 unsigned AS = MN->getAddressSpace();
853 // Do not shrink an aligned scalar load to sub-dword.
854 // Scalar engine cannot do sub-dword loads.
855 // TODO: Update this for GFX12 which does have scalar sub-dword loads.
856 if (OldSize >= 32 && NewSize < 32 && MN->getAlign() >= Align(4) &&
860 MN->isInvariant())) &&
862 return false;
863
864 // Don't produce extloads from sub 32-bit types. SI doesn't have scalar
865 // extloads, so doing one requires using a buffer_load. In cases where we
866 // still couldn't use a scalar load, using the wider load shouldn't really
867 // hurt anything.
868
869 // If the old size already had to be an extload, there's no harm in continuing
870 // to reduce the width.
871 return (OldSize < 32);
872}
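// For example (illustrative): shrinking a 64-bit load to 32 bits is accepted
// (NewSize >= 32), while shrinking an aligned 32-bit load from the constant
// address space down to 16 bits is rejected, since the scalar unit would then
// need a sub-dword access it does not have.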
873
875 const SelectionDAG &DAG,
876 const MachineMemOperand &MMO) const {
877
878 assert(LoadTy.getSizeInBits() == CastTy.getSizeInBits());
879
880 if (LoadTy.getScalarType() == MVT::i32)
881 return false;
882
883 unsigned LScalarSize = LoadTy.getScalarSizeInBits();
884 unsigned CastScalarSize = CastTy.getScalarSizeInBits();
885
886 if ((LScalarSize >= CastScalarSize) && (CastScalarSize < 32))
887 return false;
888
889 unsigned Fast = 0;
891 CastTy, MMO, &Fast) &&
892 Fast;
893}
894
895// SI+ has instructions for cttz / ctlz for 32-bit values. This is probably also
896// profitable for 64-bit, even with the expansion, since it's generally good to
897// speculate things.
899 return true;
900}
901
903 return true;
904}
905
907 switch (N->getOpcode()) {
908 case ISD::EntryToken:
909 case ISD::TokenFactor:
910 return true;
912 unsigned IntrID = N->getConstantOperandVal(0);
914 }
916 unsigned IntrID = N->getConstantOperandVal(1);
918 }
919 case ISD::LOAD:
920 if (cast<LoadSDNode>(N)->getMemOperand()->getAddrSpace() ==
922 return true;
923 return false;
924 case AMDGPUISD::SETCC: // ballot-style instruction
925 return true;
926 }
927 return false;
928}
929
931 SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
932 NegatibleCost &Cost, unsigned Depth) const {
933
934 switch (Op.getOpcode()) {
935 case ISD::FMA:
936 case ISD::FMAD: {
937 // Negating a fma is not free if it has users without source mods.
938 if (!allUsesHaveSourceMods(Op.getNode()))
939 return SDValue();
940 break;
941 }
942 case AMDGPUISD::RCP: {
943 SDValue Src = Op.getOperand(0);
944 EVT VT = Op.getValueType();
945 SDLoc SL(Op);
946
947 SDValue NegSrc = getNegatedExpression(Src, DAG, LegalOperations,
948 ForCodeSize, Cost, Depth + 1);
949 if (NegSrc)
950 return DAG.getNode(AMDGPUISD::RCP, SL, VT, NegSrc, Op->getFlags());
951 return SDValue();
952 }
953 default:
954 break;
955 }
956
957 return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
958 ForCodeSize, Cost, Depth);
959}
960
961//===---------------------------------------------------------------------===//
962// Target Properties
963//===---------------------------------------------------------------------===//
964
967
968 // Packed operations do not have a fabs modifier.
969 return VT == MVT::f32 || VT == MVT::f64 ||
970 (Subtarget->has16BitInsts() && (VT == MVT::f16 || VT == MVT::bf16));
971}
972
975 // Report this based on the end legalized type.
976 VT = VT.getScalarType();
977 return VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f16 || VT == MVT::bf16;
978}
979
981 unsigned NumElem,
982 unsigned AS) const {
983 return true;
984}
985
987 // There are few operations which truly have vector input operands. Any vector
988 // operation is going to involve operations on each component, and a
989 // build_vector will be a copy per element, so it always makes sense to use a
990 // build_vector input in place of the extracted element to avoid a copy into a
991 // super register.
992 //
993 // We should probably only do this if all users are extracts only, but this
994 // should be the common case.
995 return true;
996}
997
999 // Truncate is just accessing a subregister.
1000
1001 unsigned SrcSize = Source.getSizeInBits();
1002 unsigned DestSize = Dest.getSizeInBits();
1003
1004 return DestSize < SrcSize && DestSize % 32 == 0;
1005}
1006
1008 // Truncate is just accessing a subregister.
1009
1010 unsigned SrcSize = Source->getScalarSizeInBits();
1011 unsigned DestSize = Dest->getScalarSizeInBits();
1012
1013 if (DestSize == 16 && Subtarget->has16BitInsts())
1014 return SrcSize >= 32;
1015
1016 return DestSize < SrcSize && DestSize % 32 == 0;
1017}
1018
1020 unsigned SrcSize = Src->getScalarSizeInBits();
1021 unsigned DestSize = Dest->getScalarSizeInBits();
1022
1023 if (SrcSize == 16 && Subtarget->has16BitInsts())
1024 return DestSize >= 32;
1025
1026 return SrcSize == 32 && DestSize == 64;
1027}
1028
1030 // Any register load of a 64-bit value really requires 2 32-bit moves. For all
1031 // practical purposes, the extra mov 0 needed to form the 64-bit value is free.
1032 // As used, this will enable reducing 64-bit operations to 32-bit, which is always
1033 // good.
1034
1035 if (Src == MVT::i16)
1036 return Dest == MVT::i32 || Dest == MVT::i64;
1037
1038 return Src == MVT::i32 && Dest == MVT::i64;
1039}
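// E.g. zext i32 -> i64 is considered free here: the value already lives in a
// 32-bit register and the upper half is materialized with a single mov of 0.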
1040
1042 EVT DestVT) const {
1043 switch (N->getOpcode()) {
1044 case ISD::ADD:
1045 case ISD::SUB:
1046 case ISD::SHL:
1047 case ISD::SRL:
1048 case ISD::SRA:
1049 case ISD::AND:
1050 case ISD::OR:
1051 case ISD::XOR:
1052 case ISD::MUL:
1053 case ISD::SETCC:
1054 case ISD::SELECT:
1055 case ISD::SMIN:
1056 case ISD::SMAX:
1057 case ISD::UMIN:
1058 case ISD::UMAX:
1059 if (Subtarget->has16BitInsts() &&
1060 (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) {
1061 // Don't narrow back down to i16 if promoted to i32 already.
1062 if (!N->isDivergent() && DestVT.isInteger() &&
1063 DestVT.getScalarSizeInBits() > 1 &&
1064 DestVT.getScalarSizeInBits() <= 16 &&
1065 SrcVT.getScalarSizeInBits() > 16) {
1066 return false;
1067 }
1068 }
1069 return true;
1070 default:
1071 break;
1072 }
1073
1074 // There aren't really 64-bit registers, but pairs of 32-bit ones and only a
1075 // limited number of native 64-bit operations. Shrinking an operation to fit
1076 // in a single 32-bit register should always be helpful. As currently used,
1077 // this is much less general than the name suggests, and is only used in
1078 // places trying to reduce the sizes of loads. Shrinking loads to < 32-bits is
1079 // not profitable, and may actually be harmful.
1080 if (isa<LoadSDNode>(N))
1081 return SrcVT.getSizeInBits() > 32 && DestVT.getSizeInBits() == 32;
1082
1083 return true;
1084}
1085
1087 const SDNode* N, CombineLevel Level) const {
1088 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
1089 N->getOpcode() == ISD::SRL) &&
1090 "Expected shift op");
1091
1092 SDValue ShiftLHS = N->getOperand(0);
1093 if (!ShiftLHS->hasOneUse())
1094 return false;
1095
1096 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
1097 !ShiftLHS.getOperand(0)->hasOneUse())
1098 return false;
1099
1100 // Always commute pre-type legalization and right shifts.
1101 // We're looking for shl(or(x,y),z) patterns.
1103 N->getOpcode() != ISD::SHL || N->getOperand(0).getOpcode() != ISD::OR)
1104 return true;
1105
1106 // If only user is a i32 right-shift, then don't destroy a BFE pattern.
1107 if (N->getValueType(0) == MVT::i32 && N->hasOneUse() &&
1108 (N->user_begin()->getOpcode() == ISD::SRA ||
1109 N->user_begin()->getOpcode() == ISD::SRL))
1110 return false;
1111
1112 // Don't destroy or(shl(load_zext(),c), load_zext()) patterns.
1113 auto IsShiftAndLoad = [](SDValue LHS, SDValue RHS) {
1114 if (LHS.getOpcode() != ISD::SHL)
1115 return false;
1116 auto *RHSLd = dyn_cast<LoadSDNode>(RHS);
1117 auto *LHS0 = dyn_cast<LoadSDNode>(LHS.getOperand(0));
1118 auto *LHS1 = dyn_cast<ConstantSDNode>(LHS.getOperand(1));
1119 return LHS0 && LHS1 && RHSLd && LHS0->getExtensionType() == ISD::ZEXTLOAD &&
1120 LHS1->getAPIntValue() == LHS0->getMemoryVT().getScalarSizeInBits() &&
1121 RHSLd->getExtensionType() == ISD::ZEXTLOAD;
1122 };
1123 SDValue LHS = N->getOperand(0).getOperand(0);
1124 SDValue RHS = N->getOperand(0).getOperand(1);
1125 return !(IsShiftAndLoad(LHS, RHS) || IsShiftAndLoad(RHS, LHS));
1126}
1127
1128//===---------------------------------------------------------------------===//
1129// TargetLowering Callbacks
1130//===---------------------------------------------------------------------===//
1131
1133 bool IsVarArg) {
1134 switch (CC) {
1142 return CC_AMDGPU;
1145 return CC_AMDGPU_CS_CHAIN;
1146 case CallingConv::C:
1147 case CallingConv::Fast:
1148 case CallingConv::Cold:
1149 return CC_AMDGPU_Func;
1152 return CC_SI_Gfx;
1155 default:
1156 reportFatalUsageError("unsupported calling convention for call");
1157 }
1158}
1159
1161 bool IsVarArg) {
1162 switch (CC) {
1165 llvm_unreachable("kernels should not be handled here");
1175 return RetCC_SI_Shader;
1178 return RetCC_SI_Gfx;
1179 case CallingConv::C:
1180 case CallingConv::Fast:
1181 case CallingConv::Cold:
1182 return RetCC_AMDGPU_Func;
1183 default:
1184 reportFatalUsageError("unsupported calling convention");
1185 }
1186}
1187
1188/// The SelectionDAGBuilder will automatically promote function arguments
1189/// with illegal types. However, this does not work for the AMDGPU targets
1190/// since the function arguments are stored in memory as these illegal types.
1191/// In order to handle this properly we need to get the original types sizes
1192/// from the LLVM IR Function and fixup the ISD:InputArg values before
1193/// passing them to AnalyzeFormalArguments()
1194
1195/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
1196/// input values across multiple registers. Each item in the Ins array
1197/// represents a single value that will be stored in registers. Ins[x].VT is
1198/// the value type of the value that will be stored in the register, so
1199/// whatever SDNode we lower the argument to needs to be this type.
1200///
1201/// In order to correctly lower the arguments we need to know the size of each
1202/// argument. Since Ins[x].VT gives us the size of the register that will
1203/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
1204/// for the original function argument so that we can deduce the correct memory
1205/// type to use for Ins[x]. In most cases the correct memory type will be
1206/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
1207/// we have a kernel argument of type v8i8, this argument will be split into
1208/// 8 parts and each part will be represented by its own item in the Ins array.
1209/// For each part the Ins[x].ArgVT will be the v8i8, which is the full type of
1210/// the argument before it was split. From this, we deduce that the memory type
1211/// for each individual part is i8. We pass the memory type as LocVT to the
1212/// calling convention analysis function and the register type (Ins[x].VT) as
1213/// the ValVT.
1215 CCState &State,
1216 const SmallVectorImpl<ISD::InputArg> &Ins) const {
1217 const MachineFunction &MF = State.getMachineFunction();
1218 const Function &Fn = MF.getFunction();
1219 LLVMContext &Ctx = Fn.getParent()->getContext();
1220 const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
1221 const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset();
1223
1224 Align MaxAlign = Align(1);
1225 uint64_t ExplicitArgOffset = 0;
1226 const DataLayout &DL = Fn.getDataLayout();
1227
1228 unsigned InIndex = 0;
1229
1230 for (const Argument &Arg : Fn.args()) {
1231 const bool IsByRef = Arg.hasByRefAttr();
1232 Type *BaseArgTy = Arg.getType();
1233 Type *MemArgTy = IsByRef ? Arg.getParamByRefType() : BaseArgTy;
1234 Align Alignment = DL.getValueOrABITypeAlignment(
1235 IsByRef ? Arg.getParamAlign() : std::nullopt, MemArgTy);
1236 MaxAlign = std::max(Alignment, MaxAlign);
1237 uint64_t AllocSize = DL.getTypeAllocSize(MemArgTy);
1238
1239 uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
1240 ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
1241
1242 // We're basically throwing away everything passed into us and starting over
1243 // to get accurate in-memory offsets. The "PartOffset" is completely useless
1244 // to us as computed in Ins.
1245 //
1246 // We also need to figure out what type legalization is trying to do to get
1247 // the correct memory offsets.
1248
1249 SmallVector<EVT, 16> ValueVTs;
1251 ComputeValueVTs(*this, DL, BaseArgTy, ValueVTs, /*MemVTs=*/nullptr,
1252 &Offsets, ArgOffset);
1253
1254 for (unsigned Value = 0, NumValues = ValueVTs.size();
1255 Value != NumValues; ++Value) {
1256 uint64_t BasePartOffset = Offsets[Value];
1257
1258 EVT ArgVT = ValueVTs[Value];
1259 EVT MemVT = ArgVT;
1260 MVT RegisterVT = getRegisterTypeForCallingConv(Ctx, CC, ArgVT);
1261 unsigned NumRegs = getNumRegistersForCallingConv(Ctx, CC, ArgVT);
1262
1263 if (NumRegs == 1) {
1264 // This argument is not split, so the IR type is the memory type.
1265 if (ArgVT.isExtended()) {
1266 // We have an extended type, like i24, so we should just use the
1267 // register type.
1268 MemVT = RegisterVT;
1269 } else {
1270 MemVT = ArgVT;
1271 }
1272 } else if (ArgVT.isVector() && RegisterVT.isVector() &&
1273 ArgVT.getScalarType() == RegisterVT.getScalarType()) {
1274 assert(ArgVT.getVectorNumElements() > RegisterVT.getVectorNumElements());
1275 // We have a vector value which has been split into a vector with
1276 // the same scalar type, but fewer elements. This should handle
1277 // all the floating-point vector types.
1278 MemVT = RegisterVT;
1279 } else if (ArgVT.isVector() &&
1280 ArgVT.getVectorNumElements() == NumRegs) {
1281 // This arg has been split so that each element is stored in a separate
1282 // register.
1283 MemVT = ArgVT.getScalarType();
1284 } else if (ArgVT.isExtended()) {
1285 // We have an extended type, like i65.
1286 MemVT = RegisterVT;
1287 } else {
1288 unsigned MemoryBits = ArgVT.getStoreSizeInBits() / NumRegs;
1289 assert(ArgVT.getStoreSizeInBits() % NumRegs == 0);
1290 if (RegisterVT.isInteger()) {
1291 MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
1292 } else if (RegisterVT.isVector()) {
1293 assert(!RegisterVT.getScalarType().isFloatingPoint());
1294 unsigned NumElements = RegisterVT.getVectorNumElements();
1295 assert(MemoryBits % NumElements == 0);
1296 // This vector type has been split into another vector type with
1297 // a different element size.
1298 EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
1299 MemoryBits / NumElements);
1300 MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
1301 } else {
1302 llvm_unreachable("cannot deduce memory type.");
1303 }
1304 }
1305
1306 // Convert one element vectors to scalar.
1307 if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
1308 MemVT = MemVT.getScalarType();
1309
1310 // Round up vec3/vec5 argument.
1311 if (MemVT.isVector() && !MemVT.isPow2VectorType()) {
1312 MemVT = MemVT.getPow2VectorType(State.getContext());
1313 } else if (!MemVT.isSimple() && !MemVT.isVector()) {
1314 MemVT = MemVT.getRoundIntegerType(State.getContext());
1315 }
1316
1317 unsigned PartOffset = 0;
1318 for (unsigned i = 0; i != NumRegs; ++i) {
1319 State.addLoc(CCValAssign::getCustomMem(InIndex++, RegisterVT,
1320 BasePartOffset + PartOffset,
1321 MemVT.getSimpleVT(),
1323 PartOffset += MemVT.getStoreSize();
1324 }
1325 }
1326 }
1327}
1328
1330 SDValue Chain, CallingConv::ID CallConv,
1331 bool isVarArg,
1333 const SmallVectorImpl<SDValue> &OutVals,
1334 const SDLoc &DL, SelectionDAG &DAG) const {
1335 // FIXME: Fails for r600 tests
1336 //assert(!isVarArg && Outs.empty() && OutVals.empty() &&
1337 // "wave terminate should not have return values");
1338 return DAG.getNode(AMDGPUISD::ENDPGM, DL, MVT::Other, Chain);
1339}
1340
1341//===---------------------------------------------------------------------===//
1342// Target specific lowering
1343//===---------------------------------------------------------------------===//
1344
1345/// Selects the correct CCAssignFn for a given CallingConvention value.
1350
1355
1357 SelectionDAG &DAG,
1358 MachineFrameInfo &MFI,
1359 int ClobberedFI) const {
1360 SmallVector<SDValue, 8> ArgChains;
1361 int64_t FirstByte = MFI.getObjectOffset(ClobberedFI);
1362 int64_t LastByte = FirstByte + MFI.getObjectSize(ClobberedFI) - 1;
1363
1364 // Include the original chain at the beginning of the list. When this is
1365 // used by target LowerCall hooks, this helps legalize find the
1366 // CALLSEQ_BEGIN node.
1367 ArgChains.push_back(Chain);
1368
1369 // Add a chain value for each stack argument load that overlaps the clobbered object.
1370 for (SDNode *U : DAG.getEntryNode().getNode()->users()) {
1371 if (LoadSDNode *L = dyn_cast<LoadSDNode>(U)) {
1372 if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr())) {
1373 if (FI->getIndex() < 0) {
1374 int64_t InFirstByte = MFI.getObjectOffset(FI->getIndex());
1375 int64_t InLastByte = InFirstByte;
1376 InLastByte += MFI.getObjectSize(FI->getIndex()) - 1;
1377
1378 if ((InFirstByte <= FirstByte && FirstByte <= InLastByte) ||
1379 (FirstByte <= InFirstByte && InFirstByte <= LastByte))
1380 ArgChains.push_back(SDValue(L, 1));
1381 }
1382 }
1383 }
1384 }
1385
1386 // Build a tokenfactor for all the chains.
1387 return DAG.getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
1388}
1389
1392 StringRef Reason) const {
1393 SDValue Callee = CLI.Callee;
1394 SelectionDAG &DAG = CLI.DAG;
1395
1396 const Function &Fn = DAG.getMachineFunction().getFunction();
1397
1398 StringRef FuncName("<unknown>");
1399
1401 FuncName = G->getSymbol();
1402 else if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
1403 FuncName = G->getGlobal()->getName();
1404
1405 DAG.getContext()->diagnose(
1406 DiagnosticInfoUnsupported(Fn, Reason + FuncName, CLI.DL.getDebugLoc()));
1407
1408 if (!CLI.IsTailCall) {
1409 for (ISD::InputArg &Arg : CLI.Ins)
1410 InVals.push_back(DAG.getPOISON(Arg.VT));
1411 }
1412
1413 return DAG.getEntryNode();
1414}
1415
1417 SmallVectorImpl<SDValue> &InVals) const {
1418 return lowerUnhandledCall(CLI, InVals, "unsupported call to function ");
1419}
1420
1422 SelectionDAG &DAG) const {
1423 const Function &Fn = DAG.getMachineFunction().getFunction();
1424
1426 Fn, "unsupported dynamic alloca", SDLoc(Op).getDebugLoc()));
1427 auto Ops = {DAG.getConstant(0, SDLoc(), Op.getValueType()), Op.getOperand(0)};
1428 return DAG.getMergeValues(Ops, SDLoc());
1429}
1430
1432 SelectionDAG &DAG) const {
1433 switch (Op.getOpcode()) {
1434 default:
1435 Op->print(errs(), &DAG);
1436 llvm_unreachable("Custom lowering code for this "
1437 "instruction is not implemented yet!");
1438 break;
1440 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
1442 case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
1443 case ISD::SDIVREM:
1444 return LowerSDIVREM(Op, DAG);
1445 case ISD::FCEIL: return LowerFCEIL(Op, DAG);
1446 case ISD::FTRUNC: return LowerFTRUNC(Op, DAG);
1447 case ISD::FRINT: return LowerFRINT(Op, DAG);
1448 case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG);
1449 case ISD::FROUNDEVEN:
1450 return LowerFROUNDEVEN(Op, DAG);
1451 case ISD::FROUND: return LowerFROUND(Op, DAG);
1452 case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
1453 case ISD::FLOG2:
1454 return LowerFLOG2(Op, DAG);
1455 case ISD::FLOG:
1456 case ISD::FLOG10:
1457 return LowerFLOGCommon(Op, DAG);
1458 case ISD::FEXP:
1459 case ISD::FEXP10:
1460 return lowerFEXP(Op, DAG);
1461 case ISD::FEXP2:
1462 return lowerFEXP2(Op, DAG);
1463 case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
1464 case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
1465 case ISD::FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
1466 case ISD::FP_TO_SINT:
1467 case ISD::FP_TO_UINT:
1468 return LowerFP_TO_INT(Op, DAG);
1469 case ISD::CTTZ:
1471 case ISD::CTLZ:
1473 return LowerCTLZ_CTTZ(Op, DAG);
1474 case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
1475 }
1476 return Op;
1477}
1478
1481 SelectionDAG &DAG) const {
1482 switch (N->getOpcode()) {
1484 // Different parts of legalization seem to interpret which type of
1485 // sign_extend_inreg is the one to check for custom lowering. The extended
1486 // from type is what really matters, but some places check for custom
1487 // lowering of the result type. This results in trying to use
1488 // ReplaceNodeResults to sext_in_reg to an illegal type, so we'll just do
1489 // nothing here and let the illegal result integer be handled normally.
1490 return;
1491 case ISD::FLOG2:
1492 if (SDValue Lowered = LowerFLOG2(SDValue(N, 0), DAG))
1493 Results.push_back(Lowered);
1494 return;
1495 case ISD::FLOG:
1496 case ISD::FLOG10:
1497 if (SDValue Lowered = LowerFLOGCommon(SDValue(N, 0), DAG))
1498 Results.push_back(Lowered);
1499 return;
1500 case ISD::FEXP2:
1501 if (SDValue Lowered = lowerFEXP2(SDValue(N, 0), DAG))
1502 Results.push_back(Lowered);
1503 return;
1504 case ISD::FEXP:
1505 case ISD::FEXP10:
1506 if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG))
1507 Results.push_back(Lowered);
1508 return;
1509 case ISD::CTLZ:
1511 if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG))
1512 Results.push_back(Lowered);
1513 return;
1514 default:
1515 return;
1516 }
1517}
1518
1520 SDValue Op,
1521 SelectionDAG &DAG) const {
1522
1523 const DataLayout &DL = DAG.getDataLayout();
1525 const GlobalValue *GV = G->getGlobal();
1526
1527 if (!MFI->isModuleEntryFunction()) {
1528 auto IsNamedBarrier = AMDGPU::isNamedBarrier(*cast<GlobalVariable>(GV));
1529 if (std::optional<uint32_t> Address =
1531 if (IsNamedBarrier) {
1532 unsigned BarCnt = DL.getTypeAllocSize(GV->getValueType()) / 16;
1533 MFI->recordNumNamedBarriers(Address.value(), BarCnt);
1534 }
1535 return DAG.getConstant(*Address, SDLoc(Op), Op.getValueType());
1536 } else if (IsNamedBarrier) {
1537 llvm_unreachable("named barrier should have an assigned address");
1538 }
1539 }
1540
1541 if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
1542 G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
1543 if (!MFI->isModuleEntryFunction() &&
1544 GV->getName() != "llvm.amdgcn.module.lds" &&
1546 SDLoc DL(Op);
1547 const Function &Fn = DAG.getMachineFunction().getFunction();
1549 Fn, "local memory global used by non-kernel function",
1550 DL.getDebugLoc(), DS_Warning));
1551
1552 // We currently don't have a way to correctly allocate LDS objects that
1553 // aren't directly associated with a kernel. We do force inlining of
1554 // functions that use local objects. However, if these dead functions are
1555 // not eliminated, we don't want a compile time error. Just emit a warning
1556 // and a trap, since there should be no callable path here.
1557 SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
1558 SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
1559 Trap, DAG.getRoot());
1560 DAG.setRoot(OutputChain);
1561 return DAG.getPOISON(Op.getValueType());
1562 }
1563
1564 // XXX: What does the value of G->getOffset() mean?
1565 assert(G->getOffset() == 0 &&
1566 "Do not know what to do with an non-zero offset");
1567
1568 // TODO: We could emit code to handle the initialization somewhere.
1569 // We ignore the initializer for now and legalize it to allow selection.
1570 // The initializer will anyway get errored out during assembly emission.
1571 unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
1572 return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
1573 }
1574 return SDValue();
1575}
1576
1578 SelectionDAG &DAG) const {
1580 SDLoc SL(Op);
1581
1582 EVT VT = Op.getValueType();
1583 if (VT.getVectorElementType().getSizeInBits() < 32) {
1584 unsigned OpBitSize = Op.getOperand(0).getValueType().getSizeInBits();
1585 if (OpBitSize >= 32 && OpBitSize % 32 == 0) {
1586 unsigned NewNumElt = OpBitSize / 32;
1587 EVT NewEltVT = (NewNumElt == 1) ? MVT::i32
1589 MVT::i32, NewNumElt);
1590 for (const SDUse &U : Op->ops()) {
1591 SDValue In = U.get();
1592 SDValue NewIn = DAG.getNode(ISD::BITCAST, SL, NewEltVT, In);
1593 if (NewNumElt > 1)
1594 DAG.ExtractVectorElements(NewIn, Args);
1595 else
1596 Args.push_back(NewIn);
1597 }
1598
1599 EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
1600 NewNumElt * Op.getNumOperands());
1601 SDValue BV = DAG.getBuildVector(NewVT, SL, Args);
1602 return DAG.getNode(ISD::BITCAST, SL, VT, BV);
1603 }
1604 }
1605
1606 for (const SDUse &U : Op->ops())
1607 DAG.ExtractVectorElements(U.get(), Args);
1608
1609 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1610}
1611
1613 SelectionDAG &DAG) const {
1614 SDLoc SL(Op);
1616 unsigned Start = Op.getConstantOperandVal(1);
1617 EVT VT = Op.getValueType();
1618 EVT SrcVT = Op.getOperand(0).getValueType();
1619
1620 if (VT.getScalarSizeInBits() == 16 && Start % 2 == 0) {
1621 unsigned NumElt = VT.getVectorNumElements();
1622 unsigned NumSrcElt = SrcVT.getVectorNumElements();
1623 assert(NumElt % 2 == 0 && NumSrcElt % 2 == 0 && "expect legal types");
1624
1625 // Extract 32-bit registers at a time.
1626 EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumSrcElt / 2);
1627 EVT NewVT = NumElt == 2
1628 ? MVT::i32
1629 : EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElt / 2);
1630 SDValue Tmp = DAG.getNode(ISD::BITCAST, SL, NewSrcVT, Op.getOperand(0));
1631
1632 DAG.ExtractVectorElements(Tmp, Args, Start / 2, NumElt / 2);
1633 if (NumElt == 2)
1634 Tmp = Args[0];
1635 else
1636 Tmp = DAG.getBuildVector(NewVT, SL, Args);
1637
1638 return DAG.getNode(ISD::BITCAST, SL, VT, Tmp);
1639 }
1640
1641 DAG.ExtractVectorElements(Op.getOperand(0), Args, Start,
1643
1644 return DAG.getBuildVector(Op.getValueType(), SL, Args);
1645}
1646
1647// TODO: Handle fabs too
1649 if (Val.getOpcode() == ISD::FNEG)
1650 return Val.getOperand(0);
1651
1652 return Val;
1653}
1654
1656 if (Val.getOpcode() == ISD::FNEG)
1657 Val = Val.getOperand(0);
1658 if (Val.getOpcode() == ISD::FABS)
1659 Val = Val.getOperand(0);
1660 if (Val.getOpcode() == ISD::FCOPYSIGN)
1661 Val = Val.getOperand(0);
1662 return Val;
1663}
1664
1666 const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True,
1667 SDValue False, SDValue CC, DAGCombinerInfo &DCI) const {
1668 SelectionDAG &DAG = DCI.DAG;
1669 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
1670 switch (CCOpcode) {
1671 case ISD::SETOEQ:
1672 case ISD::SETONE:
1673 case ISD::SETUNE:
1674 case ISD::SETNE:
1675 case ISD::SETUEQ:
1676 case ISD::SETEQ:
1677 case ISD::SETFALSE:
1678 case ISD::SETFALSE2:
1679 case ISD::SETTRUE:
1680 case ISD::SETTRUE2:
1681 case ISD::SETUO:
1682 case ISD::SETO:
1683 break;
1684 case ISD::SETULE:
1685 case ISD::SETULT: {
1686 if (LHS == True)
1687 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1688 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1689 }
1690 case ISD::SETOLE:
1691 case ISD::SETOLT:
1692 case ISD::SETLE:
1693 case ISD::SETLT: {
1694 // Ordered. Assume ordered for undefined.
1695
1696 // Only do this after legalization to avoid interfering with other combines
1697 // which might occur.
1699 !DCI.isCalledByLegalizer())
1700 return SDValue();
1701
1702 // We need to permute the operands to get the correct NaN behavior. The
1703 // selected operand is the second one based on the failing compare with NaN,
1704 // so permute it based on the compare type the hardware uses.
1705 if (LHS == True)
1706 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1707 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1708 }
1709 case ISD::SETUGE:
1710 case ISD::SETUGT: {
1711 if (LHS == True)
1712 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, RHS, LHS);
1713 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, LHS, RHS);
1714 }
1715 case ISD::SETGT:
1716 case ISD::SETGE:
1717 case ISD::SETOGE:
1718 case ISD::SETOGT: {
1720 !DCI.isCalledByLegalizer())
1721 return SDValue();
1722
1723 if (LHS == True)
1724 return DAG.getNode(AMDGPUISD::FMAX_LEGACY, DL, VT, LHS, RHS);
1725 return DAG.getNode(AMDGPUISD::FMIN_LEGACY, DL, VT, RHS, LHS);
1726 }
1727 case ISD::SETCC_INVALID:
1728 llvm_unreachable("Invalid setcc condcode!");
1729 }
1730 return SDValue();
1731}
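// A representative case of the mapping above (illustrative):
//   (select (setolt x, y), x, y) --> (fmin_legacy x, y)
// with the operand order chosen so NaN inputs select the same value the
// original compare-and-select would have produced.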
1732
1733/// Generate Min/Max node
1735 SDValue LHS, SDValue RHS,
1736 SDValue True, SDValue False,
1737 SDValue CC,
1738 DAGCombinerInfo &DCI) const {
1739 if ((LHS == True && RHS == False) || (LHS == False && RHS == True))
1740 return combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, True, False, CC, DCI);
1741
1742 SelectionDAG &DAG = DCI.DAG;
1743
1744 // If we can't directly match this, try to see if we can fold an fneg to
1745 // match.
1746
1749 SDValue NegTrue = peekFNeg(True);
1750
1751 // Undo the combine foldFreeOpFromSelect does if it helps us match the
1752 // fmin/fmax.
1753 //
1754 // select (fcmp olt (lhs, K)), (fneg lhs), -K
1755 // -> fneg (fmin_legacy lhs, K)
1756 //
1757 // TODO: Use getNegatedExpression
1758 if (LHS == NegTrue && CFalse && CRHS) {
1759 APFloat NegRHS = neg(CRHS->getValueAPF());
1760 if (NegRHS == CFalse->getValueAPF()) {
1761 SDValue Combined =
1762 combineFMinMaxLegacyImpl(DL, VT, LHS, RHS, NegTrue, False, CC, DCI);
1763 if (Combined)
1764 return DAG.getNode(ISD::FNEG, DL, VT, Combined);
1765 return SDValue();
1766 }
1767 }
1768
1769 return SDValue();
1770}
1771
1772std::pair<SDValue, SDValue>
1774 SDLoc SL(Op);
1775
1776 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1777
1778 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1779 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1780
1781 SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1782 SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1783
1784 return std::pair(Lo, Hi);
1785}
1786
1788 SDLoc SL(Op);
1789
1790 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1791 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
1792 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, Zero);
1793}
1794
1796 SDLoc SL(Op);
1797
1798 SDValue Vec = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, Op);
1799 const SDValue One = DAG.getConstant(1, SL, MVT::i32);
1800 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, Vec, One);
1801}
1802
1803// Split a vector type into two parts. The first part is a power of two vector.
1804// The second part is whatever is left over, and is a scalar if it would
1805// otherwise be a 1-vector.
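// For example, v3i32 splits into (v2i32, i32) and v7i32 splits into
// (v4i32, v3i32), since LoNumElts below is PowerOf2Ceil((NumElts + 1) / 2).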
1806std::pair<EVT, EVT>
1808 EVT LoVT, HiVT;
1809 EVT EltVT = VT.getVectorElementType();
1810 unsigned NumElts = VT.getVectorNumElements();
1811 unsigned LoNumElts = PowerOf2Ceil((NumElts + 1) / 2);
1812 LoVT = EVT::getVectorVT(*DAG.getContext(), EltVT, LoNumElts);
1813 HiVT = NumElts - LoNumElts == 1
1814 ? EltVT
1815 : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts - LoNumElts);
1816 return std::pair(LoVT, HiVT);
1817}
1818
1819// Split a vector value into two parts of types LoVT and HiVT. HiVT could be
1820// scalar.
1821std::pair<SDValue, SDValue>
1823 const EVT &LoVT, const EVT &HiVT,
1824 SelectionDAG &DAG) const {
1825 EVT VT = N.getValueType();
1827 (HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
1828 VT.getVectorNumElements() &&
1829 "More vector elements requested than available!");
1831 DAG.getVectorIdxConstant(0, DL));
1832
1833 unsigned LoNumElts = LoVT.getVectorNumElements();
1834
1835 if (HiVT.isVector()) {
1836 unsigned HiNumElts = HiVT.getVectorNumElements();
1837 if ((VT.getVectorNumElements() % HiNumElts) == 0) {
1838 // Avoid creating an extract_subvector with an index that isn't a multiple
1839 // of the result type's element count.
1841 DAG.getConstant(LoNumElts, DL, MVT::i32));
1842 return {Lo, Hi};
1843 }
1844
1846 DAG.ExtractVectorElements(N, Elts, /*Start=*/LoNumElts,
1847 /*Count=*/HiNumElts);
1848 SDValue Hi = DAG.getBuildVector(HiVT, DL, Elts);
1849 return {Lo, Hi};
1850 }
1851
1853 DAG.getVectorIdxConstant(LoNumElts, DL));
1854 return {Lo, Hi};
1855}
1856
1858 SelectionDAG &DAG) const {
1860 EVT VT = Op.getValueType();
1861 SDLoc SL(Op);
1862
1863
1864 // If this is a 2 element vector, we really want to scalarize and not create
1865 // weird 1 element vectors.
1866 if (VT.getVectorNumElements() == 2) {
1867 SDValue Ops[2];
1868 std::tie(Ops[0], Ops[1]) = scalarizeVectorLoad(Load, DAG);
1869 return DAG.getMergeValues(Ops, SL);
1870 }
1871
1872 SDValue BasePtr = Load->getBasePtr();
1873 EVT MemVT = Load->getMemoryVT();
1874
1875 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1876
1877 EVT LoVT, HiVT;
1878 EVT LoMemVT, HiMemVT;
1879 SDValue Lo, Hi;
1880
1881 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1882 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1883 std::tie(Lo, Hi) = splitVector(Op, SL, LoVT, HiVT, DAG);
1884
1885 unsigned Size = LoMemVT.getStoreSize();
1886 Align BaseAlign = Load->getAlign();
1887 Align HiAlign = commonAlignment(BaseAlign, Size);
1888
1889 SDValue LoLoad = DAG.getExtLoad(Load->getExtensionType(), SL, LoVT,
1890 Load->getChain(), BasePtr, SrcValue, LoMemVT,
1891 BaseAlign, Load->getMemOperand()->getFlags());
1892 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, TypeSize::getFixed(Size));
1893 SDValue HiLoad =
1894 DAG.getExtLoad(Load->getExtensionType(), SL, HiVT, Load->getChain(),
1895 HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
1896 HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
1897
1898 SDValue Join;
1899 if (LoVT == HiVT) {
1900 // The vector length is a power of two, so it was split evenly.
1901 Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
1902 } else {
1903 Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getPOISON(VT), LoLoad,
1904 DAG.getVectorIdxConstant(0, SL));
1905 Join = DAG.getNode(
1907 VT, Join, HiLoad,
1909 }
1910
1911 SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
1912 LoLoad.getValue(1), HiLoad.getValue(1))};
1913
1914 return DAG.getMergeValues(Ops, SL);
1915}
1916
1918 SelectionDAG &DAG) const {
1920 EVT VT = Op.getValueType();
1921 SDValue BasePtr = Load->getBasePtr();
1922 EVT MemVT = Load->getMemoryVT();
1923 SDLoc SL(Op);
1924 const MachinePointerInfo &SrcValue = Load->getMemOperand()->getPointerInfo();
1925 Align BaseAlign = Load->getAlign();
1926 unsigned NumElements = MemVT.getVectorNumElements();
1927
1928 // Widen from vec3 to vec4 when the load is at least 8-byte aligned
1929 // or 16-byte fully dereferenceable. Otherwise, split the vector load.
1930 if (NumElements != 3 ||
1931 (BaseAlign < Align(8) &&
1932 !SrcValue.isDereferenceable(16, *DAG.getContext(), DAG.getDataLayout())))
1933 return SplitVectorLoad(Op, DAG);
1934
1935 assert(NumElements == 3);
1936
1937 EVT WideVT =
1939 EVT WideMemVT =
1941 SDValue WideLoad = DAG.getExtLoad(
1942 Load->getExtensionType(), SL, WideVT, Load->getChain(), BasePtr, SrcValue,
1943 WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
1944 return DAG.getMergeValues(
1945 {DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
1946 DAG.getVectorIdxConstant(0, SL)),
1947 WideLoad.getValue(1)},
1948 SL);
1949}
1950
1952 SelectionDAG &DAG) const {
1954 SDValue Val = Store->getValue();
1955 EVT VT = Val.getValueType();
1956
1957 // If this is a 2 element vector, we really want to scalarize and not create
1958 // weird 1 element vectors.
1959 if (VT.getVectorNumElements() == 2)
1960 return scalarizeVectorStore(Store, DAG);
1961
1962 EVT MemVT = Store->getMemoryVT();
1963 SDValue Chain = Store->getChain();
1964 SDValue BasePtr = Store->getBasePtr();
1965 SDLoc SL(Op);
1966
1967 EVT LoVT, HiVT;
1968 EVT LoMemVT, HiMemVT;
1969 SDValue Lo, Hi;
1970
1971 std::tie(LoVT, HiVT) = getSplitDestVTs(VT, DAG);
1972 std::tie(LoMemVT, HiMemVT) = getSplitDestVTs(MemVT, DAG);
1973 std::tie(Lo, Hi) = splitVector(Val, SL, LoVT, HiVT, DAG);
1974
1975 SDValue HiPtr = DAG.getObjectPtrOffset(SL, BasePtr, LoMemVT.getStoreSize());
1976
1977 const MachinePointerInfo &SrcValue = Store->getMemOperand()->getPointerInfo();
1978 Align BaseAlign = Store->getAlign();
1979 unsigned Size = LoMemVT.getStoreSize();
1980 Align HiAlign = commonAlignment(BaseAlign, Size);
1981
1982 SDValue LoStore =
1983 DAG.getTruncStore(Chain, SL, Lo, BasePtr, SrcValue, LoMemVT, BaseAlign,
1984 Store->getMemOperand()->getFlags());
1985 SDValue HiStore =
1986 DAG.getTruncStore(Chain, SL, Hi, HiPtr, SrcValue.getWithOffset(Size),
1987 HiMemVT, HiAlign, Store->getMemOperand()->getFlags());
1988
1989 return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoStore, HiStore);
1990}
1991
1992// This is a shortcut for integer division because we have fast i32<->f32
1993// conversions, and fast f32 reciprocal instructions. The fractional part of a
1994// float is enough to accurately represent up to a 24-bit signed integer.
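// For example, if both operands are known to have at least 9 sign bits, their
// values fit in 24 bits (23 magnitude bits plus the sign), which f32's 24-bit
// significand represents exactly, so the f32-based sequence below can recover
// the exact quotient.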
1996 bool Sign) const {
1997 SDLoc DL(Op);
1998 EVT VT = Op.getValueType();
1999 SDValue LHS = Op.getOperand(0);
2000 SDValue RHS = Op.getOperand(1);
2001 MVT IntVT = MVT::i32;
2002 MVT FltVT = MVT::f32;
2003
2004 unsigned LHSSignBits = DAG.ComputeNumSignBits(LHS);
2005 if (LHSSignBits < 9)
2006 return SDValue();
2007
2008 unsigned RHSSignBits = DAG.ComputeNumSignBits(RHS);
2009 if (RHSSignBits < 9)
2010 return SDValue();
2011
2012 unsigned BitSize = VT.getSizeInBits();
2013 unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
2014 unsigned DivBits = BitSize - SignBits;
2015 if (Sign)
2016 ++DivBits;
2017
2018  unsigned ToFp = Sign ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
2019  unsigned ToInt = Sign ? ISD::FP_TO_SINT : ISD::FP_TO_UINT;
2020
2021 SDValue jq = DAG.getConstant(1, DL, IntVT);
2022
2023 if (Sign) {
2024 // char|short jq = ia ^ ib;
2025 jq = DAG.getNode(ISD::XOR, DL, VT, LHS, RHS);
2026
2027 // jq = jq >> (bitsize - 2)
2028 jq = DAG.getNode(ISD::SRA, DL, VT, jq,
2029 DAG.getConstant(BitSize - 2, DL, VT));
2030
2031 // jq = jq | 0x1
2032 jq = DAG.getNode(ISD::OR, DL, VT, jq, DAG.getConstant(1, DL, VT));
2033 }
2034
2035 // int ia = (int)LHS;
2036 SDValue ia = LHS;
2037
2038 // int ib = (int)RHS;
2039 SDValue ib = RHS;
2040
2041 // float fa = (float)ia;
2042 SDValue fa = DAG.getNode(ToFp, DL, FltVT, ia);
2043
2044 // float fb = (float)ib;
2045 SDValue fb = DAG.getNode(ToFp, DL, FltVT, ib);
2046
2047 SDValue fq = DAG.getNode(ISD::FMUL, DL, FltVT,
2048 fa, DAG.getNode(AMDGPUISD::RCP, DL, FltVT, fb));
2049
2050 // fq = trunc(fq);
2051 fq = DAG.getNode(ISD::FTRUNC, DL, FltVT, fq);
2052
2053 // float fqneg = -fq;
2054 SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
2055
2057
2058 bool UseFmadFtz = false;
2059 if (Subtarget->isGCN()) {
2061 UseFmadFtz =
2063 }
2064
2065 // float fr = mad(fqneg, fb, fa);
2066 unsigned OpCode = !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2067 : UseFmadFtz ? (unsigned)AMDGPUISD::FMAD_FTZ
2068              : (unsigned)ISD::FMAD;
2069 SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
2070
2071 // int iq = (int)fq;
2072 SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
2073
2074 // fr = fabs(fr);
2075 fr = DAG.getNode(ISD::FABS, DL, FltVT, fr);
2076
2077 // fb = fabs(fb);
2078 fb = DAG.getNode(ISD::FABS, DL, FltVT, fb);
2079
2080 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2081
2082 // int cv = fr >= fb;
2083 SDValue cv = DAG.getSetCC(DL, SetCCVT, fr, fb, ISD::SETOGE);
2084
2085 // jq = (cv ? jq : 0);
2086 jq = DAG.getNode(ISD::SELECT, DL, VT, cv, jq, DAG.getConstant(0, DL, VT));
2087
2088 // dst = iq + jq;
2089 SDValue Div = DAG.getNode(ISD::ADD, DL, VT, iq, jq);
2090
2091 // Rem needs compensation; it's easier to recompute it.
2092 SDValue Rem = DAG.getNode(ISD::MUL, DL, VT, Div, RHS);
2093 Rem = DAG.getNode(ISD::SUB, DL, VT, LHS, Rem);
2094
2095 // Truncate to the number of bits this divide really is.
2096 if (Sign) {
2097 SDValue InRegSize
2098 = DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(), DivBits));
2099 Div = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Div, InRegSize);
2100 Rem = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Rem, InRegSize);
2101 } else {
2102 SDValue TruncMask = DAG.getConstant((UINT64_C(1) << DivBits) - 1, DL, VT);
2103 Div = DAG.getNode(ISD::AND, DL, VT, Div, TruncMask);
2104 Rem = DAG.getNode(ISD::AND, DL, VT, Rem, TruncMask);
2105 }
2106
2107 return DAG.getMergeValues({ Div, Rem }, DL);
2108}
2109
2111 SelectionDAG &DAG,
2113 SDLoc DL(Op);
2114 EVT VT = Op.getValueType();
2115
2116 assert(VT == MVT::i64 && "LowerUDIVREM64 expects an i64");
2117
2118 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2119
2120 SDValue One = DAG.getConstant(1, DL, HalfVT);
2121 SDValue Zero = DAG.getConstant(0, DL, HalfVT);
2122
2123 //HiLo split
2124 SDValue LHS_Lo, LHS_Hi;
2125 SDValue LHS = Op.getOperand(0);
2126 std::tie(LHS_Lo, LHS_Hi) = DAG.SplitScalar(LHS, DL, HalfVT, HalfVT);
2127
2128 SDValue RHS_Lo, RHS_Hi;
2129 SDValue RHS = Op.getOperand(1);
2130 std::tie(RHS_Lo, RHS_Hi) = DAG.SplitScalar(RHS, DL, HalfVT, HalfVT);
2131
2132 if (DAG.MaskedValueIsZero(RHS, APInt::getHighBitsSet(64, 32)) &&
2133 DAG.MaskedValueIsZero(LHS, APInt::getHighBitsSet(64, 32))) {
2134
2135 SDValue Res = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2136 LHS_Lo, RHS_Lo);
2137
2138 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(0), Zero});
2139 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {Res.getValue(1), Zero});
2140
2141 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV));
2142 Results.push_back(DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM));
2143 return;
2144 }
2145
2146 if (isTypeLegal(MVT::i64)) {
2147 // The algorithm here is based on ideas from "Software Integer Division",
2148 // Tom Rodeheffer, August 2008.
2149
2152
2153 // Compute denominator reciprocal.
2154 unsigned FMAD =
2155 !Subtarget->hasMadMacF32Insts() ? (unsigned)ISD::FMA
2159
2160 SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
2161 SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
2162 SDValue Mad1 = DAG.getNode(FMAD, DL, MVT::f32, Cvt_Hi,
2163 DAG.getConstantFP(APInt(32, 0x4f800000).bitsToFloat(), DL, MVT::f32),
2164 Cvt_Lo);
2165 SDValue Rcp = DAG.getNode(AMDGPUISD::RCP, DL, MVT::f32, Mad1);
2166 SDValue Mul1 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Rcp,
2167 DAG.getConstantFP(APInt(32, 0x5f7ffffc).bitsToFloat(), DL, MVT::f32));
2168 SDValue Mul2 = DAG.getNode(ISD::FMUL, DL, MVT::f32, Mul1,
2169 DAG.getConstantFP(APInt(32, 0x2f800000).bitsToFloat(), DL, MVT::f32));
2170 SDValue Trunc = DAG.getNode(ISD::FTRUNC, DL, MVT::f32, Mul2);
2171 SDValue Mad2 = DAG.getNode(FMAD, DL, MVT::f32, Trunc,
2172 DAG.getConstantFP(APInt(32, 0xcf800000).bitsToFloat(), DL, MVT::f32),
2173 Mul1);
2174 SDValue Rcp_Lo = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Mad2);
2175 SDValue Rcp_Hi = DAG.getNode(ISD::FP_TO_UINT, DL, HalfVT, Trunc);
2176 SDValue Rcp64 = DAG.getBitcast(VT,
2177 DAG.getBuildVector(MVT::v2i32, DL, {Rcp_Lo, Rcp_Hi}));
2178
2179 SDValue Zero64 = DAG.getConstant(0, DL, VT);
2180 SDValue One64 = DAG.getConstant(1, DL, VT);
2181 SDValue Zero1 = DAG.getConstant(0, DL, MVT::i1);
2182 SDVTList HalfCarryVT = DAG.getVTList(HalfVT, MVT::i1);
2183
2184 // First round of UNR (Unsigned integer Newton-Raphson).
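    // Sketch of one step: with Rcp64 ~= 2^64 / RHS, the error
    // e = 2^64 - RHS * Rcp64 is exactly (-RHS) * Rcp64 modulo 2^64 (Mullo1),
    // and the refined estimate is Rcp64 + mulhu(Rcp64, e), i.e.
    // Rcp64 * (2 - RHS * Rcp64 / 2^64).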
2185 SDValue Neg_RHS = DAG.getNode(ISD::SUB, DL, VT, Zero64, RHS);
2186 SDValue Mullo1 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Rcp64);
2187 SDValue Mulhi1 = DAG.getNode(ISD::MULHU, DL, VT, Rcp64, Mullo1);
2188 SDValue Mulhi1_Lo, Mulhi1_Hi;
2189 std::tie(Mulhi1_Lo, Mulhi1_Hi) =
2190 DAG.SplitScalar(Mulhi1, DL, HalfVT, HalfVT);
2191 SDValue Add1_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Lo,
2192 Mulhi1_Lo, Zero1);
2193 SDValue Add1_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Rcp_Hi,
2194 Mulhi1_Hi, Add1_Lo.getValue(1));
2195 SDValue Add1 = DAG.getBitcast(VT,
2196 DAG.getBuildVector(MVT::v2i32, DL, {Add1_Lo, Add1_Hi}));
2197
2198 // Second round of UNR.
2199 SDValue Mullo2 = DAG.getNode(ISD::MUL, DL, VT, Neg_RHS, Add1);
2200 SDValue Mulhi2 = DAG.getNode(ISD::MULHU, DL, VT, Add1, Mullo2);
2201 SDValue Mulhi2_Lo, Mulhi2_Hi;
2202 std::tie(Mulhi2_Lo, Mulhi2_Hi) =
2203 DAG.SplitScalar(Mulhi2, DL, HalfVT, HalfVT);
2204 SDValue Add2_Lo = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Lo,
2205 Mulhi2_Lo, Zero1);
2206 SDValue Add2_Hi = DAG.getNode(ISD::UADDO_CARRY, DL, HalfCarryVT, Add1_Hi,
2207 Mulhi2_Hi, Add2_Lo.getValue(1));
2208 SDValue Add2 = DAG.getBitcast(VT,
2209 DAG.getBuildVector(MVT::v2i32, DL, {Add2_Lo, Add2_Hi}));
2210
2211 SDValue Mulhi3 = DAG.getNode(ISD::MULHU, DL, VT, LHS, Add2);
2212
2213 SDValue Mul3 = DAG.getNode(ISD::MUL, DL, VT, RHS, Mulhi3);
2214
2215 SDValue Mul3_Lo, Mul3_Hi;
2216 std::tie(Mul3_Lo, Mul3_Hi) = DAG.SplitScalar(Mul3, DL, HalfVT, HalfVT);
2217 SDValue Sub1_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Lo,
2218 Mul3_Lo, Zero1);
2219 SDValue Sub1_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, LHS_Hi,
2220 Mul3_Hi, Sub1_Lo.getValue(1));
2221 SDValue Sub1_Mi = DAG.getNode(ISD::SUB, DL, HalfVT, LHS_Hi, Mul3_Hi);
2222 SDValue Sub1 = DAG.getBitcast(VT,
2223 DAG.getBuildVector(MVT::v2i32, DL, {Sub1_Lo, Sub1_Hi}));
2224
2225 SDValue MinusOne = DAG.getConstant(0xffffffffu, DL, HalfVT);
2226 SDValue C1 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, MinusOne, Zero,
2227 ISD::SETUGE);
2228 SDValue C2 = DAG.getSelectCC(DL, Sub1_Lo, RHS_Lo, MinusOne, Zero,
2229 ISD::SETUGE);
2230 SDValue C3 = DAG.getSelectCC(DL, Sub1_Hi, RHS_Hi, C2, C1, ISD::SETEQ);
2231
2232 // TODO: Here and below portions of the code can be enclosed into if/endif.
2233 // Currently control flow is unconditional and we have 4 selects after
2234 // potential endif to substitute PHIs.
2235
2236 // if C3 != 0 ...
2237 SDValue Sub2_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Lo,
2238 RHS_Lo, Zero1);
2239 SDValue Sub2_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub1_Mi,
2240 RHS_Hi, Sub1_Lo.getValue(1));
2241 SDValue Sub2_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2242 Zero, Sub2_Lo.getValue(1));
2243 SDValue Sub2 = DAG.getBitcast(VT,
2244 DAG.getBuildVector(MVT::v2i32, DL, {Sub2_Lo, Sub2_Hi}));
2245
2246 SDValue Add3 = DAG.getNode(ISD::ADD, DL, VT, Mulhi3, One64);
2247
2248 SDValue C4 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, MinusOne, Zero,
2249 ISD::SETUGE);
2250 SDValue C5 = DAG.getSelectCC(DL, Sub2_Lo, RHS_Lo, MinusOne, Zero,
2251 ISD::SETUGE);
2252 SDValue C6 = DAG.getSelectCC(DL, Sub2_Hi, RHS_Hi, C5, C4, ISD::SETEQ);
2253
2254 // if (C6 != 0)
2255 SDValue Add4 = DAG.getNode(ISD::ADD, DL, VT, Add3, One64);
2256
2257 SDValue Sub3_Lo = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Lo,
2258 RHS_Lo, Zero1);
2259 SDValue Sub3_Mi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub2_Mi,
2260 RHS_Hi, Sub2_Lo.getValue(1));
2261 SDValue Sub3_Hi = DAG.getNode(ISD::USUBO_CARRY, DL, HalfCarryVT, Sub3_Mi,
2262 Zero, Sub3_Lo.getValue(1));
2263 SDValue Sub3 = DAG.getBitcast(VT,
2264 DAG.getBuildVector(MVT::v2i32, DL, {Sub3_Lo, Sub3_Hi}));
2265
2266 // endif C6
2267 // endif C3
2268
2269 SDValue Sel1 = DAG.getSelectCC(DL, C6, Zero, Add4, Add3, ISD::SETNE);
2270 SDValue Div = DAG.getSelectCC(DL, C3, Zero, Sel1, Mulhi3, ISD::SETNE);
2271
2272 SDValue Sel2 = DAG.getSelectCC(DL, C6, Zero, Sub3, Sub2, ISD::SETNE);
2273 SDValue Rem = DAG.getSelectCC(DL, C3, Zero, Sel2, Sub1, ISD::SETNE);
2274
2275 Results.push_back(Div);
2276 Results.push_back(Rem);
2277
2278 return;
2279 }
2280
2281 // r600 expansion.
2282 // Get speculative values.
2283 SDValue DIV_Part = DAG.getNode(ISD::UDIV, DL, HalfVT, LHS_Hi, RHS_Lo);
2284 SDValue REM_Part = DAG.getNode(ISD::UREM, DL, HalfVT, LHS_Hi, RHS_Lo);
2285
2286 SDValue REM_Lo = DAG.getSelectCC(DL, RHS_Hi, Zero, REM_Part, LHS_Hi, ISD::SETEQ);
2287 SDValue REM = DAG.getBuildVector(MVT::v2i32, DL, {REM_Lo, Zero});
2288 REM = DAG.getNode(ISD::BITCAST, DL, MVT::i64, REM);
2289
2290 SDValue DIV_Hi = DAG.getSelectCC(DL, RHS_Hi, Zero, DIV_Part, Zero, ISD::SETEQ);
2291 SDValue DIV_Lo = Zero;
2292
2293 const unsigned halfBitWidth = HalfVT.getSizeInBits();
2294
2295 for (unsigned i = 0; i < halfBitWidth; ++i) {
2296 const unsigned bitPos = halfBitWidth - i - 1;
2297 SDValue POS = DAG.getConstant(bitPos, DL, HalfVT);
2298 // Get value of high bit
2299 SDValue HBit = DAG.getNode(ISD::SRL, DL, HalfVT, LHS_Lo, POS);
2300 HBit = DAG.getNode(ISD::AND, DL, HalfVT, HBit, One);
2301 HBit = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, HBit);
2302
2303 // Shift
2304 REM = DAG.getNode(ISD::SHL, DL, VT, REM, DAG.getConstant(1, DL, VT));
2305 // Add LHS high bit
2306 REM = DAG.getNode(ISD::OR, DL, VT, REM, HBit);
2307
2308 SDValue BIT = DAG.getConstant(1ULL << bitPos, DL, HalfVT);
2309 SDValue realBIT = DAG.getSelectCC(DL, REM, RHS, BIT, Zero, ISD::SETUGE);
2310
2311 DIV_Lo = DAG.getNode(ISD::OR, DL, HalfVT, DIV_Lo, realBIT);
2312
2313 // Update REM
2314 SDValue REM_sub = DAG.getNode(ISD::SUB, DL, VT, REM, RHS);
2315 REM = DAG.getSelectCC(DL, REM, RHS, REM_sub, REM, ISD::SETUGE);
2316 }
2317
2318 SDValue DIV = DAG.getBuildVector(MVT::v2i32, DL, {DIV_Lo, DIV_Hi});
2319 DIV = DAG.getNode(ISD::BITCAST, DL, MVT::i64, DIV);
2320 Results.push_back(DIV);
2321 Results.push_back(REM);
2322}
2323
2325 SelectionDAG &DAG) const {
2326 SDLoc DL(Op);
2327 EVT VT = Op.getValueType();
2328
2329 if (VT == MVT::i64) {
2331 LowerUDIVREM64(Op, DAG, Results);
2332 return DAG.getMergeValues(Results, DL);
2333 }
2334
2335 if (VT == MVT::i32) {
2336 if (SDValue Res = LowerDIVREM24(Op, DAG, false))
2337 return Res;
2338 }
2339
2340 SDValue X = Op.getOperand(0);
2341 SDValue Y = Op.getOperand(1);
2342
2343 // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
2344 // algorithm used here.
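  // Roughly: Z approximates 2^32 / Y, one Newton-Raphson step below sharpens
  // it, and the resulting quotient estimate Q = mulhu(X, Z) only slightly
  // undershoots the true quotient, so two conditional refinements suffice.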
2345
2346 // Initial estimate of inv(y).
2347 SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
2348
2349 // One round of UNR.
2350 SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
2351 SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
2352 Z = DAG.getNode(ISD::ADD, DL, VT, Z,
2353 DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
2354
2355 // Quotient/remainder estimate.
2356 SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
2357 SDValue R =
2358 DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
2359
2360 // First quotient/remainder refinement.
2361 EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2362 SDValue One = DAG.getConstant(1, DL, VT);
2363 SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2364 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2365 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2366 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2367 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2368
2369 // Second quotient/remainder refinement.
2370 Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
2371 Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2372 DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
2373 R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
2374 DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
2375
2376 return DAG.getMergeValues({Q, R}, DL);
2377}
2378
2380 SelectionDAG &DAG) const {
2381 SDLoc DL(Op);
2382 EVT VT = Op.getValueType();
2383
2384 SDValue LHS = Op.getOperand(0);
2385 SDValue RHS = Op.getOperand(1);
2386
2387 SDValue Zero = DAG.getConstant(0, DL, VT);
2388 SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
2389
2390 if (VT == MVT::i32) {
2391 if (SDValue Res = LowerDIVREM24(Op, DAG, true))
2392 return Res;
2393 }
2394
2395 if (VT == MVT::i64 &&
2396 DAG.ComputeNumSignBits(LHS) > 32 &&
2397 DAG.ComputeNumSignBits(RHS) > 32) {
2398 EVT HalfVT = VT.getHalfSizedIntegerVT(*DAG.getContext());
2399
2400 //HiLo split
2401 SDValue LHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, LHS, Zero);
2402 SDValue RHS_Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, DL, HalfVT, RHS, Zero);
2403 SDValue DIVREM = DAG.getNode(ISD::SDIVREM, DL, DAG.getVTList(HalfVT, HalfVT),
2404 LHS_Lo, RHS_Lo);
2405 SDValue Res[2] = {
2406 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(0)),
2407 DAG.getNode(ISD::SIGN_EXTEND, DL, VT, DIVREM.getValue(1))
2408 };
2409 return DAG.getMergeValues(Res, DL);
2410 }
2411
2412 SDValue LHSign = DAG.getSelectCC(DL, LHS, Zero, NegOne, Zero, ISD::SETLT);
2413 SDValue RHSign = DAG.getSelectCC(DL, RHS, Zero, NegOne, Zero, ISD::SETLT);
2414 SDValue DSign = DAG.getNode(ISD::XOR, DL, VT, LHSign, RHSign);
2415 SDValue RSign = LHSign; // Remainder sign is the same as LHS
2416
2417 LHS = DAG.getNode(ISD::ADD, DL, VT, LHS, LHSign);
2418 RHS = DAG.getNode(ISD::ADD, DL, VT, RHS, RHSign);
2419
2420 LHS = DAG.getNode(ISD::XOR, DL, VT, LHS, LHSign);
2421 RHS = DAG.getNode(ISD::XOR, DL, VT, RHS, RHSign);
2422
2423 SDValue Div = DAG.getNode(ISD::UDIVREM, DL, DAG.getVTList(VT, VT), LHS, RHS);
2424 SDValue Rem = Div.getValue(1);
2425
2426 Div = DAG.getNode(ISD::XOR, DL, VT, Div, DSign);
2427 Rem = DAG.getNode(ISD::XOR, DL, VT, Rem, RSign);
2428
2429 Div = DAG.getNode(ISD::SUB, DL, VT, Div, DSign);
2430 Rem = DAG.getNode(ISD::SUB, DL, VT, Rem, RSign);
2431
2432 SDValue Res[2] = {
2433 Div,
2434 Rem
2435 };
2436 return DAG.getMergeValues(Res, DL);
2437}
2438
2440 SDLoc SL(Op);
2441 SDValue Src = Op.getOperand(0);
2442
2443 // result = trunc(src)
2444 // if (src > 0.0 && src != result)
2445 // result += 1.0
2446
2447 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2448
2449 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2450 const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f64);
2451
2452 EVT SetCCVT =
2453 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2454
2455 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOGT);
2456 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2457 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2458
2459 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, One, Zero);
2460 // TODO: Should this propagate fast-math-flags?
2461 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2462}
2463
2465 SelectionDAG &DAG) {
2466 const unsigned FractBits = 52;
2467 const unsigned ExpBits = 11;
2468
2469 SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32,
2470 Hi,
2471 DAG.getConstant(FractBits - 32, SL, MVT::i32),
2472 DAG.getConstant(ExpBits, SL, MVT::i32));
2473 SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart,
2474 DAG.getConstant(1023, SL, MVT::i32));
2475
2476 return Exp;
2477}
2478
2480 SDLoc SL(Op);
2481 SDValue Src = Op.getOperand(0);
2482
2483 assert(Op.getValueType() == MVT::f64);
2484
2485 const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
2486
2487 // Extract the upper half, since this is where we will find the sign and
2488 // exponent.
2489 SDValue Hi = getHiHalf64(Src, DAG);
2490
2491 SDValue Exp = extractF64Exponent(Hi, SL, DAG);
2492
2493 const unsigned FractBits = 52;
2494
2495 // Extract the sign bit.
2496 const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, SL, MVT::i32);
2497 SDValue SignBit = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, SignBitMask);
2498
2499 // Extend back to 64-bits.
2500 SDValue SignBit64 = DAG.getBuildVector(MVT::v2i32, SL, {Zero, SignBit});
2501 SignBit64 = DAG.getNode(ISD::BITCAST, SL, MVT::i64, SignBit64);
2502
2503 SDValue BcInt = DAG.getNode(ISD::BITCAST, SL, MVT::i64, Src);
2504 const SDValue FractMask
2505 = DAG.getConstant((UINT64_C(1) << FractBits) - 1, SL, MVT::i64);
2506
2507 SDValue Shr = DAG.getNode(ISD::SRA, SL, MVT::i64, FractMask, Exp);
2508 SDValue Not = DAG.getNOT(SL, Shr, MVT::i64);
2509 SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, BcInt, Not);
2510
2511 EVT SetCCVT =
2512 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
2513
2514 const SDValue FiftyOne = DAG.getConstant(FractBits - 1, SL, MVT::i32);
2515
2516 SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
2517 SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
2518
2519 SDValue Tmp1 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpLt0, SignBit64, Tmp0);
2520 SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, ExpGt51, BcInt, Tmp1);
2521
2522 return DAG.getNode(ISD::BITCAST, SL, MVT::f64, Tmp2);
2523}
2524
2526 SelectionDAG &DAG) const {
2527 SDLoc SL(Op);
2528 SDValue Src = Op.getOperand(0);
2529
2530 assert(Op.getValueType() == MVT::f64);
2531
2532 APFloat C1Val(APFloat::IEEEdouble(), "0x1.0p+52");
2533 SDValue C1 = DAG.getConstantFP(C1Val, SL, MVT::f64);
2534 SDValue CopySign = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, C1, Src);
2535
2536 // TODO: Should this propagate fast-math-flags?
2537
2538 SDValue Tmp1 = DAG.getNode(ISD::FADD, SL, MVT::f64, Src, CopySign);
2539 SDValue Tmp2 = DAG.getNode(ISD::FSUB, SL, MVT::f64, Tmp1, CopySign);
2540
2541 SDValue Fabs = DAG.getNode(ISD::FABS, SL, MVT::f64, Src);
2542
2543 APFloat C2Val(APFloat::IEEEdouble(), "0x1.fffffffffffffp+51");
2544 SDValue C2 = DAG.getConstantFP(C2Val, SL, MVT::f64);
2545
2546 EVT SetCCVT =
2547 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2548 SDValue Cond = DAG.getSetCC(SL, SetCCVT, Fabs, C2, ISD::SETOGT);
2549
2550 return DAG.getSelect(SL, MVT::f64, Cond, Src, Tmp2);
2551}
2552
2554 SelectionDAG &DAG) const {
2555 // FNEARBYINT and FRINT are the same, except in their handling of FP
2556 // exceptions. Those aren't really meaningful for us, and OpenCL only has
2557 // rint, so just treat them as equivalent.
2558 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), Op.getValueType(),
2559 Op.getOperand(0));
2560}
2561
2563 auto VT = Op.getValueType();
2564 auto Arg = Op.getOperand(0u);
2565 return DAG.getNode(ISD::FROUNDEVEN, SDLoc(Op), VT, Arg);
2566}
2567
2568// XXX - May require not supporting f32 denormals?
2569
2570// Don't handle v2f16. The extra instructions to scalarize and repack around the
2571// compare and vselect end up producing worse code than scalarizing the whole
2572// operation.
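// For example, x = 2.5: T = 2.0 and |x - T| = 0.5 >= 0.5, so copysign(1.0, x)
// is added, giving 3.0 (round half away from zero). For x = -2.25: T = -2.0
// and |x - T| = 0.25 < 0.5, so copysign(0.0, x) = -0.0 is added, giving -2.0.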
2574 SDLoc SL(Op);
2575 SDValue X = Op.getOperand(0);
2576 EVT VT = Op.getValueType();
2577
2578 SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
2579
2580 // TODO: Should this propagate fast-math-flags?
2581
2582 SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
2583
2584 SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
2585
2586 const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2587 const SDValue One = DAG.getConstantFP(1.0, SL, VT);
2588
2589 EVT SetCCVT =
2590 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2591
2592 const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
2593 SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
2594 SDValue OneOrZeroFP = DAG.getNode(ISD::SELECT, SL, VT, Cmp, One, Zero);
2595
2596 SDValue SignedOffset = DAG.getNode(ISD::FCOPYSIGN, SL, VT, OneOrZeroFP, X);
2597 return DAG.getNode(ISD::FADD, SL, VT, T, SignedOffset);
2598}
2599
2601 SDLoc SL(Op);
2602 SDValue Src = Op.getOperand(0);
2603
2604 // result = trunc(src);
2605 // if (src < 0.0 && src != result)
2606 // result += -1.0.
2607
2608 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, MVT::f64, Src);
2609
2610 const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f64);
2611 const SDValue NegOne = DAG.getConstantFP(-1.0, SL, MVT::f64);
2612
2613 EVT SetCCVT =
2614 getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f64);
2615
2616 SDValue Lt0 = DAG.getSetCC(SL, SetCCVT, Src, Zero, ISD::SETOLT);
2617 SDValue NeTrunc = DAG.getSetCC(SL, SetCCVT, Src, Trunc, ISD::SETONE);
2618 SDValue And = DAG.getNode(ISD::AND, SL, SetCCVT, Lt0, NeTrunc);
2619
2620 SDValue Add = DAG.getNode(ISD::SELECT, SL, MVT::f64, And, NegOne, Zero);
2621 // TODO: Should this propagate fast-math-flags?
2622 return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add);
2623}
2624
2625/// Return true if it's known that \p Src can never be an f32 denormal value.
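/// An FP_EXTEND (or FP16_TO_FP) from f16 never is: the smallest nonzero f16
/// magnitude is 2^-24, well above the f32 denormal boundary of 2^-126, and the
/// mantissa results of FFREXP / amdgcn_frexp_mant are never in the denormal
/// range either.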
2627 switch (Src.getOpcode()) {
2628 case ISD::FP_EXTEND:
2629 return Src.getOperand(0).getValueType() == MVT::f16;
2630 case ISD::FP16_TO_FP:
2631 case ISD::FFREXP:
2632 return true;
2634 unsigned IntrinsicID = Src.getConstantOperandVal(0);
2635 switch (IntrinsicID) {
2636 case Intrinsic::amdgcn_frexp_mant:
2637 return true;
2638 default:
2639 return false;
2640 }
2641 }
2642 default:
2643 return false;
2644 }
2645
2646 llvm_unreachable("covered opcode switch");
2647}
2648
2650 SDNodeFlags Flags) {
2651 return Flags.hasApproximateFuncs();
2652}
2653
2662
2664 SDValue Src,
2665 SDNodeFlags Flags) const {
2666 SDLoc SL(Src);
2667 EVT VT = Src.getValueType();
2668 const fltSemantics &Semantics = VT.getFltSemantics();
2669 SDValue SmallestNormal =
2670 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2671
2672 // Want to scale denormals up, but negatives and 0 work just as well on the
2673 // scaled path.
2674 SDValue IsLtSmallestNormal = DAG.getSetCC(
2675 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2676 SmallestNormal, ISD::SETOLT);
2677
2678 return IsLtSmallestNormal;
2679}
2680
2682 SDNodeFlags Flags) const {
2683 SDLoc SL(Src);
2684 EVT VT = Src.getValueType();
2685 const fltSemantics &Semantics = VT.getFltSemantics();
2686 SDValue Inf = DAG.getConstantFP(APFloat::getInf(Semantics), SL, VT);
2687
2688 SDValue Fabs = DAG.getNode(ISD::FABS, SL, VT, Src, Flags);
2689 SDValue IsFinite = DAG.getSetCC(
2690 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Fabs,
2691 Inf, ISD::SETOLT);
2692 return IsFinite;
2693}
2694
2695/// If denormal handling is required return the scaled input to FLOG2, and the
2696/// check for denormal range. Otherwise, return null values.
2697std::pair<SDValue, SDValue>
2699 SDValue Src, SDNodeFlags Flags) const {
2700 if (!needsDenormHandlingF32(DAG, Src, Flags))
2701 return {};
2702
2703 MVT VT = MVT::f32;
2704 const fltSemantics &Semantics = APFloat::IEEEsingle();
2705 SDValue SmallestNormal =
2706 DAG.getConstantFP(APFloat::getSmallestNormalized(Semantics), SL, VT);
2707
2708 SDValue IsLtSmallestNormal = DAG.getSetCC(
2709 SL, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT), Src,
2710 SmallestNormal, ISD::SETOLT);
2711
2712 SDValue Scale32 = DAG.getConstantFP(0x1.0p+32, SL, VT);
2713 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2714 SDValue ScaleFactor =
2715 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, Scale32, One, Flags);
2716
2717 SDValue ScaledInput = DAG.getNode(ISD::FMUL, SL, VT, Src, ScaleFactor, Flags);
2718 return {ScaledInput, IsLtSmallestNormal};
2719}
2720
2722 // v_log_f32 is good enough for OpenCL, except it doesn't handle denormals.
2723 // If we have to handle denormals, scale up the input and adjust the result.
2724
2725 // scaled = x * (is_denormal ? 0x1.0p+32 : 1.0)
2726 // log2 = amdgpu_log2 - (is_denormal ? 32.0 : 0.0)
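  // For example, for x = 0x1.0p-130 (an f32 denormal) the scaled input is
  // 0x1.0p-98, the hardware log2 of that is -98.0, and subtracting 32.0
  // recovers the correct result of -130.0.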
2727
2728 SDLoc SL(Op);
2729 EVT VT = Op.getValueType();
2730 SDValue Src = Op.getOperand(0);
2731 SDNodeFlags Flags = Op->getFlags();
2732
2733 if (VT == MVT::f16) {
2734 // Nothing in half is a denormal when promoted to f32.
2735 assert(!Subtarget->has16BitInsts());
2736 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2737 SDValue Log = DAG.getNode(AMDGPUISD::LOG, SL, MVT::f32, Ext, Flags);
2738 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2739 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2740 }
2741
2742 auto [ScaledInput, IsLtSmallestNormal] =
2743 getScaledLogInput(DAG, SL, Src, Flags);
2744 if (!ScaledInput)
2745 return DAG.getNode(AMDGPUISD::LOG, SL, VT, Src, Flags);
2746
2747 SDValue Log2 = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2748
2749 SDValue ThirtyTwo = DAG.getConstantFP(32.0, SL, VT);
2750 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2751 SDValue ResultOffset =
2752 DAG.getNode(ISD::SELECT, SL, VT, IsLtSmallestNormal, ThirtyTwo, Zero);
2753 return DAG.getNode(ISD::FSUB, SL, VT, Log2, ResultOffset, Flags);
2754}
2755
2756static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X,
2757 SDValue Y, SDValue C, SDNodeFlags Flags = SDNodeFlags()) {
2758 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Y, Flags);
2759 return DAG.getNode(ISD::FADD, SL, VT, Mul, C, Flags);
2760}
2761
2763 SelectionDAG &DAG) const {
2764 SDValue X = Op.getOperand(0);
2765 EVT VT = Op.getValueType();
2766 SDNodeFlags Flags = Op->getFlags();
2767 SDLoc DL(Op);
2768
2769 const bool IsLog10 = Op.getOpcode() == ISD::FLOG10;
2770 assert(IsLog10 || Op.getOpcode() == ISD::FLOG);
2771
2772 const auto &Options = getTargetMachine().Options;
2773 if (VT == MVT::f16 || Flags.hasApproximateFuncs()) {
2774
2775 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2776 // Log and multiply in f32 is good enough for f16.
2777 X = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, X, Flags);
2778 }
2779
2780 SDValue Lowered = LowerFLOGUnsafe(X, DL, DAG, IsLog10, Flags);
2781 if (VT == MVT::f16 && !Subtarget->has16BitInsts()) {
2782 return DAG.getNode(ISD::FP_ROUND, DL, VT, Lowered,
2783 DAG.getTargetConstant(0, DL, MVT::i32), Flags);
2784 }
2785
2786 return Lowered;
2787 }
2788
2789 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, DL, X, Flags);
2790 if (ScaledInput)
2791 X = ScaledInput;
2792
2793 SDValue Y = DAG.getNode(AMDGPUISD::LOG, DL, VT, X, Flags);
2794
2795 SDValue R;
2796 if (Subtarget->hasFastFMAF32()) {
2797 // c+cc are ln(2)/ln(10) to more than 49 bits
2798 const float c_log10 = 0x1.344134p-2f;
2799 const float cc_log10 = 0x1.09f79ep-26f;
2800
2801 // c + cc is ln(2) to more than 49 bits
2802 const float c_log = 0x1.62e42ep-1f;
2803 const float cc_log = 0x1.efa39ep-25f;
2804
2805 SDValue C = DAG.getConstantFP(IsLog10 ? c_log10 : c_log, DL, VT);
2806 SDValue CC = DAG.getConstantFP(IsLog10 ? cc_log10 : cc_log, DL, VT);
2807
2808 R = DAG.getNode(ISD::FMUL, DL, VT, Y, C, Flags);
2809 SDValue NegR = DAG.getNode(ISD::FNEG, DL, VT, R, Flags);
2810 SDValue FMA0 = DAG.getNode(ISD::FMA, DL, VT, Y, C, NegR, Flags);
2811 SDValue FMA1 = DAG.getNode(ISD::FMA, DL, VT, Y, CC, FMA0, Flags);
2812 R = DAG.getNode(ISD::FADD, DL, VT, R, FMA1, Flags);
2813 } else {
2814 // ch+ct is ln(2)/ln(10) to more than 36 bits
2815 const float ch_log10 = 0x1.344000p-2f;
2816 const float ct_log10 = 0x1.3509f6p-18f;
2817
2818 // ch + ct is ln(2) to more than 36 bits
2819 const float ch_log = 0x1.62e000p-1f;
2820 const float ct_log = 0x1.0bfbe8p-15f;
2821
2822 SDValue CH = DAG.getConstantFP(IsLog10 ? ch_log10 : ch_log, DL, VT);
2823 SDValue CT = DAG.getConstantFP(IsLog10 ? ct_log10 : ct_log, DL, VT);
2824
2825 SDValue YAsInt = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Y);
2826 SDValue MaskConst = DAG.getConstant(0xfffff000, DL, MVT::i32);
2827 SDValue YHInt = DAG.getNode(ISD::AND, DL, MVT::i32, YAsInt, MaskConst);
2828 SDValue YH = DAG.getNode(ISD::BITCAST, DL, MVT::f32, YHInt);
2829 SDValue YT = DAG.getNode(ISD::FSUB, DL, VT, Y, YH, Flags);
2830
2831 SDValue YTCT = DAG.getNode(ISD::FMUL, DL, VT, YT, CT, Flags);
2832 SDValue Mad0 = getMad(DAG, DL, VT, YH, CT, YTCT, Flags);
2833 SDValue Mad1 = getMad(DAG, DL, VT, YT, CH, Mad0, Flags);
2834 R = getMad(DAG, DL, VT, YH, CH, Mad1);
2835 }
2836
2837 const bool IsFiniteOnly =
2838 (Flags.hasNoNaNs() || Options.NoNaNsFPMath) && Flags.hasNoInfs();
2839
2840 // TODO: Check if known finite from source value.
2841 if (!IsFiniteOnly) {
2842 SDValue IsFinite = getIsFinite(DAG, Y, Flags);
2843 R = DAG.getNode(ISD::SELECT, DL, VT, IsFinite, R, Y, Flags);
2844 }
2845
2846 if (IsScaled) {
2847 SDValue Zero = DAG.getConstantFP(0.0f, DL, VT);
2848 SDValue ShiftK =
2849 DAG.getConstantFP(IsLog10 ? 0x1.344136p+3f : 0x1.62e430p+4f, DL, VT);
2850 SDValue Shift =
2851 DAG.getNode(ISD::SELECT, DL, VT, IsScaled, ShiftK, Zero, Flags);
2852 R = DAG.getNode(ISD::FSUB, DL, VT, R, Shift, Flags);
2853 }
2854
2855 return R;
2856}
2857
2861
2862// Do f32 fast math expansion for flog2 or flog10. This is accurate enough for a
2863// promoted f16 operation.
2865 SelectionDAG &DAG, bool IsLog10,
2866 SDNodeFlags Flags) const {
2867 EVT VT = Src.getValueType();
2868 unsigned LogOp =
2869 VT == MVT::f32 ? (unsigned)AMDGPUISD::LOG : (unsigned)ISD::FLOG2;
2870
2871 double Log2BaseInverted =
2873
2874 if (VT == MVT::f32) {
2875 auto [ScaledInput, IsScaled] = getScaledLogInput(DAG, SL, Src, Flags);
2876 if (ScaledInput) {
2877 SDValue LogSrc = DAG.getNode(AMDGPUISD::LOG, SL, VT, ScaledInput, Flags);
2878 SDValue ScaledResultOffset =
2879 DAG.getConstantFP(-32.0 * Log2BaseInverted, SL, VT);
2880
2881 SDValue Zero = DAG.getConstantFP(0.0f, SL, VT);
2882
2883 SDValue ResultOffset = DAG.getNode(ISD::SELECT, SL, VT, IsScaled,
2884 ScaledResultOffset, Zero, Flags);
2885
2886 SDValue Log2Inv = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2887
2888 if (Subtarget->hasFastFMAF32())
2889 return DAG.getNode(ISD::FMA, SL, VT, LogSrc, Log2Inv, ResultOffset,
2890 Flags);
2891 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, LogSrc, Log2Inv, Flags);
2892 return DAG.getNode(ISD::FADD, SL, VT, Mul, ResultOffset);
2893 }
2894 }
2895
2896 SDValue Log2Operand = DAG.getNode(LogOp, SL, VT, Src, Flags);
2897 SDValue Log2BaseInvertedOperand = DAG.getConstantFP(Log2BaseInverted, SL, VT);
2898
2899 return DAG.getNode(ISD::FMUL, SL, VT, Log2Operand, Log2BaseInvertedOperand,
2900 Flags);
2901}
2902
2904 // v_exp_f32 is good enough for OpenCL, except it doesn't handle denormals.
2905 // If we have to handle denormals, scale up the input and adjust the result.
2906
2907 SDLoc SL(Op);
2908 EVT VT = Op.getValueType();
2909 SDValue Src = Op.getOperand(0);
2910 SDNodeFlags Flags = Op->getFlags();
2911
2912 if (VT == MVT::f16) {
2913 // Nothing in half is a denormal when promoted to f32.
2914 assert(!Subtarget->has16BitInsts());
2915 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, Src, Flags);
2916 SDValue Log = DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Ext, Flags);
2917 return DAG.getNode(ISD::FP_ROUND, SL, VT, Log,
2918 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
2919 }
2920
2921 assert(VT == MVT::f32);
2922
2923 if (!needsDenormHandlingF32(DAG, Src, Flags))
2924 return DAG.getNode(AMDGPUISD::EXP, SL, MVT::f32, Src, Flags);
2925
2926 // bool needs_scaling = x < -0x1.f80000p+6f;
2927 // v_exp_f32(x + (s ? 0x1.0p+6f : 0.0f)) * (s ? 0x1.0p-64f : 1.0f);
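  // The check scales up inputs for which exp2(x) would land in the f32
  // denormal range (results below 0x1.0p-126): 64.0 is added to the input,
  // multiplying the result by 2^64, and the final multiply by 0x1.0p-64f
  // undoes that scaling once the value is safely in the normal range.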
2928
2929 // -nextafter(128.0, -1)
2930 SDValue RangeCheckConst = DAG.getConstantFP(-0x1.f80000p+6f, SL, VT);
2931
2932 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2933
2934 SDValue NeedsScaling =
2935 DAG.getSetCC(SL, SetCCVT, Src, RangeCheckConst, ISD::SETOLT);
2936
2937 SDValue SixtyFour = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2938 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
2939
2940 SDValue AddOffset =
2941 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, SixtyFour, Zero);
2942
2943 SDValue AddInput = DAG.getNode(ISD::FADD, SL, VT, Src, AddOffset, Flags);
2944 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, AddInput, Flags);
2945
2946 SDValue TwoExpNeg64 = DAG.getConstantFP(0x1.0p-64f, SL, VT);
2947 SDValue One = DAG.getConstantFP(1.0, SL, VT);
2948 SDValue ResultScale =
2949 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, TwoExpNeg64, One);
2950
2951 return DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScale, Flags);
2952}
2953
2955 SelectionDAG &DAG,
2956 SDNodeFlags Flags) const {
2957 EVT VT = X.getValueType();
2958 const SDValue Log2E = DAG.getConstantFP(numbers::log2e, SL, VT);
2959
2960 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
2961 // exp2(M_LOG2E_F * f);
2962 SDValue Mul = DAG.getNode(ISD::FMUL, SL, VT, X, Log2E, Flags);
2963 return DAG.getNode(VT == MVT::f32 ? (unsigned)AMDGPUISD::EXP
2964 : (unsigned)ISD::FEXP2,
2965 SL, VT, Mul, Flags);
2966 }
2967
2968 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2969
2970 SDValue Threshold = DAG.getConstantFP(-0x1.5d58a0p+6f, SL, VT);
2971 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
2972
2973 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+6f, SL, VT);
2974
2975 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
2976
2977 SDValue AdjustedX =
2978 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
2979
2980 SDValue ExpInput = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, Log2E, Flags);
2981
2982 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, ExpInput, Flags);
2983
2984 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.969d48p-93f, SL, VT);
2985 SDValue AdjustedResult =
2986 DAG.getNode(ISD::FMUL, SL, VT, Exp2, ResultScaleFactor, Flags);
2987
2988 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, Exp2,
2989 Flags);
2990}
2991
2992/// Emit approx-funcs appropriate lowering for exp10. inf/nan should still be
2993/// handled correctly.
2995 SelectionDAG &DAG,
2996 SDNodeFlags Flags) const {
2997 const EVT VT = X.getValueType();
2998 const unsigned Exp2Op = VT == MVT::f32 ? static_cast<unsigned>(AMDGPUISD::EXP)
2999 : static_cast<unsigned>(ISD::FEXP2);
3000
3001 if (VT != MVT::f32 || !needsDenormHandlingF32(DAG, X, Flags)) {
3002 // exp2(x * 0x1.a92000p+1f) * exp2(x * 0x1.4f0978p-11f);
3003 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3004 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3005
3006 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, X, K0, Flags);
3007 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3008 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, X, K1, Flags);
3009 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3010 return DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1);
3011 }
3012
3013 // bool s = x < -0x1.2f7030p+5f;
3014 // x += s ? 0x1.0p+5f : 0.0f;
3015 // exp10 = exp2(x * 0x1.a92000p+1f) *
3016 // exp2(x * 0x1.4f0978p-11f) *
3017 // (s ? 0x1.9f623ep-107f : 1.0f);
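  // The threshold is roughly log10 of the smallest normal f32 (about -37.93),
  // the 0x1.0p+5f offset scales the result by 10^32, and 0x1.9f623ep-107f is
  // approximately 10^-32, undoing that scaling at the end.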
3018
3019 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3020
3021 SDValue Threshold = DAG.getConstantFP(-0x1.2f7030p+5f, SL, VT);
3022 SDValue NeedsScaling = DAG.getSetCC(SL, SetCCVT, X, Threshold, ISD::SETOLT);
3023
3024 SDValue ScaleOffset = DAG.getConstantFP(0x1.0p+5f, SL, VT);
3025 SDValue ScaledX = DAG.getNode(ISD::FADD, SL, VT, X, ScaleOffset, Flags);
3026 SDValue AdjustedX =
3027 DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, ScaledX, X);
3028
3029 SDValue K0 = DAG.getConstantFP(0x1.a92000p+1f, SL, VT);
3030 SDValue K1 = DAG.getConstantFP(0x1.4f0978p-11f, SL, VT);
3031
3032 SDValue Mul0 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K0, Flags);
3033 SDValue Exp2_0 = DAG.getNode(Exp2Op, SL, VT, Mul0, Flags);
3034 SDValue Mul1 = DAG.getNode(ISD::FMUL, SL, VT, AdjustedX, K1, Flags);
3035 SDValue Exp2_1 = DAG.getNode(Exp2Op, SL, VT, Mul1, Flags);
3036
3037 SDValue MulExps = DAG.getNode(ISD::FMUL, SL, VT, Exp2_0, Exp2_1, Flags);
3038
3039 SDValue ResultScaleFactor = DAG.getConstantFP(0x1.9f623ep-107f, SL, VT);
3040 SDValue AdjustedResult =
3041 DAG.getNode(ISD::FMUL, SL, VT, MulExps, ResultScaleFactor, Flags);
3042
3043 return DAG.getNode(ISD::SELECT, SL, VT, NeedsScaling, AdjustedResult, MulExps,
3044 Flags);
3045}
3046
3048 EVT VT = Op.getValueType();
3049 SDLoc SL(Op);
3050 SDValue X = Op.getOperand(0);
3051 SDNodeFlags Flags = Op->getFlags();
3052 const bool IsExp10 = Op.getOpcode() == ISD::FEXP10;
3053
3054 if (VT.getScalarType() == MVT::f16) {
3055 // v_exp_f16 (fmul x, log2e)
3056 if (allowApproxFunc(DAG, Flags)) // TODO: Does this really require fast?
3057 return lowerFEXPUnsafe(X, SL, DAG, Flags);
3058
3059 if (VT.isVector())
3060 return SDValue();
3061
3062 // exp(f16 x) ->
3063 // fptrunc (v_exp_f32 (fmul (fpext x), log2e))
3064
3065 // Nothing in half is a denormal when promoted to f32.
3066 SDValue Ext = DAG.getNode(ISD::FP_EXTEND, SL, MVT::f32, X, Flags);
3067 SDValue Lowered = lowerFEXPUnsafe(Ext, SL, DAG, Flags);
3068 return DAG.getNode(ISD::FP_ROUND, SL, VT, Lowered,
3069 DAG.getTargetConstant(0, SL, MVT::i32), Flags);
3070 }
3071
3072 assert(VT == MVT::f32);
3073
3074 // TODO: Interpret allowApproxFunc as ignoring DAZ. This is currently copying
3075 // library behavior. Also, is known-not-daz source sufficient?
3076 if (allowApproxFunc(DAG, Flags)) {
3077 return IsExp10 ? lowerFEXP10Unsafe(X, SL, DAG, Flags)
3078 : lowerFEXPUnsafe(X, SL, DAG, Flags);
3079 }
3080
3081 // Algorithm:
3082 //
3083 // e^x = 2^(x/ln(2)) = 2^(x*(64/ln(2))/64)
3084 //
3085 // x*(64/ln(2)) = n + f, |f| <= 0.5, n is integer
3086 // n = 64*m + j, 0 <= j < 64
3087 //
3088 // e^x = 2^((64*m + j + f)/64)
3089 // = (2^m) * (2^(j/64)) * 2^(f/64)
3090 // = (2^m) * (2^(j/64)) * e^(f*(ln(2)/64))
3091 //
3092 // f = x*(64/ln(2)) - n
3093 // r = f*(ln(2)/64) = x - n*(ln(2)/64)
3094 //
3095 // e^x = (2^m) * (2^(j/64)) * e^r
3096 //
3097 // (2^(j/64)) is precomputed
3098 //
3099 // e^r = 1 + r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3100 // e^r = 1 + q
3101 //
3102 // q = r + (r^2)/2! + (r^3)/3! + (r^4)/4! + (r^5)/5!
3103 //
3104 // e^x = (2^m) * ( (2^(j/64)) + q*(2^(j/64)) )
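  // In the code below, PH + PL is an extended-precision approximation of
  // x * log2(e) (or x * log2(10) for exp10), E = roundeven(PH) plays the role
  // of the integer part, exp2 is evaluated on (PH - E) + PL, and FLDEXP scales
  // the result by 2^E.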
3105 SDNodeFlags FlagsNoContract = Flags;
3106 FlagsNoContract.setAllowContract(false);
3107
3108 SDValue PH, PL;
3109 if (Subtarget->hasFastFMAF32()) {
3110 const float c_exp = numbers::log2ef;
3111 const float cc_exp = 0x1.4ae0bep-26f; // c+cc are 49 bits
3112 const float c_exp10 = 0x1.a934f0p+1f;
3113 const float cc_exp10 = 0x1.2f346ep-24f;
3114
3115 SDValue C = DAG.getConstantFP(IsExp10 ? c_exp10 : c_exp, SL, VT);
3116 SDValue CC = DAG.getConstantFP(IsExp10 ? cc_exp10 : cc_exp, SL, VT);
3117
3118 PH = DAG.getNode(ISD::FMUL, SL, VT, X, C, Flags);
3119 SDValue NegPH = DAG.getNode(ISD::FNEG, SL, VT, PH, Flags);
3120 SDValue FMA0 = DAG.getNode(ISD::FMA, SL, VT, X, C, NegPH, Flags);
3121 PL = DAG.getNode(ISD::FMA, SL, VT, X, CC, FMA0, Flags);
3122 } else {
3123 const float ch_exp = 0x1.714000p+0f;
3124 const float cl_exp = 0x1.47652ap-12f; // ch + cl are 36 bits
3125
3126 const float ch_exp10 = 0x1.a92000p+1f;
3127 const float cl_exp10 = 0x1.4f0978p-11f;
3128
3129 SDValue CH = DAG.getConstantFP(IsExp10 ? ch_exp10 : ch_exp, SL, VT);
3130 SDValue CL = DAG.getConstantFP(IsExp10 ? cl_exp10 : cl_exp, SL, VT);
3131
3132 SDValue XAsInt = DAG.getNode(ISD::BITCAST, SL, MVT::i32, X);
3133 SDValue MaskConst = DAG.getConstant(0xfffff000, SL, MVT::i32);
3134 SDValue XHAsInt = DAG.getNode(ISD::AND, SL, MVT::i32, XAsInt, MaskConst);
3135 SDValue XH = DAG.getNode(ISD::BITCAST, SL, VT, XHAsInt);
3136 SDValue XL = DAG.getNode(ISD::FSUB, SL, VT, X, XH, Flags);
3137
3138 PH = DAG.getNode(ISD::FMUL, SL, VT, XH, CH, Flags);
3139
3140 SDValue XLCL = DAG.getNode(ISD::FMUL, SL, VT, XL, CL, Flags);
3141 SDValue Mad0 = getMad(DAG, SL, VT, XL, CH, XLCL, Flags);
3142 PL = getMad(DAG, SL, VT, XH, CL, Mad0, Flags);
3143 }
3144
3145 SDValue E = DAG.getNode(ISD::FROUNDEVEN, SL, VT, PH, Flags);
3146
3147 // It is unsafe to contract this fsub into the PH multiply.
3148 SDValue PHSubE = DAG.getNode(ISD::FSUB, SL, VT, PH, E, FlagsNoContract);
3149
3150 SDValue A = DAG.getNode(ISD::FADD, SL, VT, PHSubE, PL, Flags);
3151 SDValue IntE = DAG.getNode(ISD::FP_TO_SINT, SL, MVT::i32, E);
3152 SDValue Exp2 = DAG.getNode(AMDGPUISD::EXP, SL, VT, A, Flags);
3153
3154 SDValue R = DAG.getNode(ISD::FLDEXP, SL, VT, Exp2, IntE, Flags);
3155
3156 SDValue UnderflowCheckConst =
3157 DAG.getConstantFP(IsExp10 ? -0x1.66d3e8p+5f : -0x1.9d1da0p+6f, SL, VT);
3158
3159 EVT SetCCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
3160 SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
3161 SDValue Underflow =
3162 DAG.getSetCC(SL, SetCCVT, X, UnderflowCheckConst, ISD::SETOLT);
3163
3164 R = DAG.getNode(ISD::SELECT, SL, VT, Underflow, Zero, R);
3165
3166 if (!Flags.hasNoInfs()) {
3167 SDValue OverflowCheckConst =
3168 DAG.getConstantFP(IsExp10 ? 0x1.344136p+5f : 0x1.62e430p+6f, SL, VT);
3169 SDValue Overflow =
3170 DAG.getSetCC(SL, SetCCVT, X, OverflowCheckConst, ISD::SETOGT);
3171 SDValue Inf =
3173 R = DAG.getNode(ISD::SELECT, SL, VT, Overflow, Inf, R);
3174 }
3175
3176 return R;
3177}
3178
3179static bool isCtlzOpc(unsigned Opc) {
3180 return Opc == ISD::CTLZ || Opc == ISD::CTLZ_ZERO_UNDEF;
3181}
3182
3183static bool isCttzOpc(unsigned Opc) {
3184 return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF;
3185}
3186
3188 SelectionDAG &DAG) const {
3189 auto SL = SDLoc(Op);
3190 auto Opc = Op.getOpcode();
3191 auto Arg = Op.getOperand(0u);
3192 auto ResultVT = Op.getValueType();
3193
3194 if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
3195 return {};
3196
3198 assert(ResultVT == Arg.getValueType());
3199
3200 const uint64_t NumBits = ResultVT.getFixedSizeInBits();
3201 SDValue NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
3202 SDValue NewOp;
3203
3204 if (Opc == ISD::CTLZ_ZERO_UNDEF) {
3205 NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
3206 NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
3207 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3208 } else {
3209 NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
3210 NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
3211 NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
3212 }
3213
3214 return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
3215}
3216
3218 SDLoc SL(Op);
3219 SDValue Src = Op.getOperand(0);
3220
3221 assert(isCtlzOpc(Op.getOpcode()) || isCttzOpc(Op.getOpcode()));
3222 bool Ctlz = isCtlzOpc(Op.getOpcode());
3223 unsigned NewOpc = Ctlz ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_B32;
3224
3225 bool ZeroUndef = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ||
3226 Op.getOpcode() == ISD::CTTZ_ZERO_UNDEF;
3227 bool Is64BitScalar = !Src->isDivergent() && Src.getValueType() == MVT::i64;
3228
3229 if (Src.getValueType() == MVT::i32 || Is64BitScalar) {
3230 // (ctlz hi:lo) -> (umin (ffbh src), 32)
3231 // (cttz hi:lo) -> (umin (ffbl src), 32)
3232 // (ctlz_zero_undef src) -> (ffbh src)
3233 // (cttz_zero_undef src) -> (ffbl src)
3234
3235 // The 64-bit scalar version produces a 32-bit result:
3236 // (ctlz hi:lo) -> (umin (S_FLBIT_I32_B64 src), 64)
3237 // (cttz hi:lo) -> (umin (S_FF1_I32_B64 src), 64)
3238 // (ctlz_zero_undef src) -> (S_FLBIT_I32_B64 src)
3239 // (cttz_zero_undef src) -> (S_FF1_I32_B64 src)
3240 SDValue NewOpr = DAG.getNode(NewOpc, SL, MVT::i32, Src);
3241 if (!ZeroUndef) {
3242 const SDValue ConstVal = DAG.getConstant(
3243 Op.getValueType().getScalarSizeInBits(), SL, MVT::i32);
3244 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, ConstVal);
3245 }
3246 return DAG.getNode(ISD::ZERO_EXTEND, SL, Src.getValueType(), NewOpr);
3247 }
3248
3249 SDValue Lo, Hi;
3250 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3251
3252 SDValue OprLo = DAG.getNode(NewOpc, SL, MVT::i32, Lo);
3253 SDValue OprHi = DAG.getNode(NewOpc, SL, MVT::i32, Hi);
3254
3255 // (ctlz hi:lo) -> (umin3 (ffbh hi), (uaddsat (ffbh lo), 32), 64)
3256 // (cttz hi:lo) -> (umin3 (uaddsat (ffbl hi), 32), (ffbl lo), 64)
3257 // (ctlz_zero_undef hi:lo) -> (umin (ffbh hi), (add (ffbh lo), 32))
3258 // (cttz_zero_undef hi:lo) -> (umin (add (ffbl hi), 32), (ffbl lo))
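  // For example, for ctlz of hi:lo = 0x00000001:0x00000000, ffbh(hi) = 31
  // while uaddsat(ffbh(lo), 32) saturates to at least 32, so the umin chain
  // (clamped at 64) correctly yields 31.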
3259
3260 unsigned AddOpc = ZeroUndef ? ISD::ADD : ISD::UADDSAT;
3261 const SDValue Const32 = DAG.getConstant(32, SL, MVT::i32);
3262 if (Ctlz)
3263 OprLo = DAG.getNode(AddOpc, SL, MVT::i32, OprLo, Const32);
3264 else
3265 OprHi = DAG.getNode(AddOpc, SL, MVT::i32, OprHi, Const32);
3266
3267 SDValue NewOpr;
3268 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, OprLo, OprHi);
3269 if (!ZeroUndef) {
3270 const SDValue Const64 = DAG.getConstant(64, SL, MVT::i32);
3271 NewOpr = DAG.getNode(ISD::UMIN, SL, MVT::i32, NewOpr, Const64);
3272 }
3273
3274 return DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i64, NewOpr);
3275}
3276
3278 bool Signed) const {
3279 // The regular method converting a 64-bit integer to float roughly consists of
3280 // 2 steps: normalization and rounding. In fact, after normalization, the
3281 // conversion from a 64-bit integer to a float is essentially the same as the
3282 // one from a 32-bit integer. The only difference is that it has more
3283 // trailing bits to be rounded. To leverage the native 32-bit conversion, a
3284 // 64-bit integer could be preprocessed and fit into a 32-bit integer then
3285 // converted into the correct float number. The basic steps for the unsigned
3286 // conversion are illustrated in the following pseudo code:
3287 //
3288 // f32 uitofp(i64 u) {
3289 // i32 hi, lo = split(u);
3290 // // Only count the leading zeros in hi as we have native support of the
3291 // // conversion from i32 to f32. If hi is all 0s, the conversion is
3292 // // reduced to a 32-bit one automatically.
3293 // i32 shamt = clz(hi); // Return 32 if hi is all 0s.
3294 // u <<= shamt;
3295 // hi, lo = split(u);
3296 // hi |= (lo != 0) ? 1 : 0; // Adjust rounding bit in hi based on lo.
3297 // // convert it as a 32-bit integer and scale the result back.
3298 // return uitofp(hi) * 2^(32 - shamt);
3299 // }
3300 //
3301 // The signed one follows the same principle but uses 'ffbh_i32' to count its
3302 // sign bits instead. If 'ffbh_i32' is not available, its absolute value is
3303 // converted instead, followed by negation based on its sign bit.
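  // For example, u = 2^32: hi = 1, lo = 0, shamt = clz(hi) = 31; after the
  // shift hi = 0x80000000 and lo = 0, so no rounding adjustment is applied;
  // uitofp(hi) = 0x1.0p+31, and scaling by 2^(32 - 31) gives exactly
  // 0x1.0p+32.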
3304
3305 SDLoc SL(Op);
3306 SDValue Src = Op.getOperand(0);
3307
3308 SDValue Lo, Hi;
3309 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3310 SDValue Sign;
3311 SDValue ShAmt;
3312 if (Signed && Subtarget->isGCN()) {
3313 // We also need to consider the sign bit in Lo if Hi has just sign bits,
3314 // i.e. Hi is 0 or -1. However, that only needs to take the MSB into
3315 // account. That is, the maximal shift is
3316 // - 32 if Lo and Hi have opposite signs;
3317 // - 33 if Lo and Hi have the same sign.
3318 //
3319 // Or, MaxShAmt = 33 + OppositeSign, where
3320 //
3321 // OppositeSign is defined as ((Lo ^ Hi) >> 31), which is
3322 // - -1 if Lo and Hi have opposite signs; and
3323 // - 0 otherwise.
3324 //
3325 // All in all, ShAmt is calculated as
3326 //
3327 // umin(sffbh(Hi), 33 + (Lo^Hi)>>31) - 1.
3328 //
3329 // or
3330 //
3331 // umin(sffbh(Hi) - 1, 32 + (Lo^Hi)>>31).
3332 //
3333 // to reduce the critical path.
3334 SDValue OppositeSign = DAG.getNode(
3335 ISD::SRA, SL, MVT::i32, DAG.getNode(ISD::XOR, SL, MVT::i32, Lo, Hi),
3336 DAG.getConstant(31, SL, MVT::i32));
3337 SDValue MaxShAmt =
3338 DAG.getNode(ISD::ADD, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3339 OppositeSign);
3340 // Count the leading sign bits.
3341 ShAmt = DAG.getNode(AMDGPUISD::FFBH_I32, SL, MVT::i32, Hi);
3342 // Different from unsigned conversion, the shift should be one bit less to
3343 // preserve the sign bit.
3344 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, ShAmt,
3345 DAG.getConstant(1, SL, MVT::i32));
3346 ShAmt = DAG.getNode(ISD::UMIN, SL, MVT::i32, ShAmt, MaxShAmt);
3347 } else {
3348 if (Signed) {
3349 // Without 'ffbh_i32', only leading zeros can be counted. Take the
3350 // absolute value first.
3351 Sign = DAG.getNode(ISD::SRA, SL, MVT::i64, Src,
3352 DAG.getConstant(63, SL, MVT::i64));
3353 SDValue Abs =
3354 DAG.getNode(ISD::XOR, SL, MVT::i64,
3355 DAG.getNode(ISD::ADD, SL, MVT::i64, Src, Sign), Sign);
3356 std::tie(Lo, Hi) = split64BitValue(Abs, DAG);
3357 }
3358 // Count the leading zeros.
3359 ShAmt = DAG.getNode(ISD::CTLZ, SL, MVT::i32, Hi);
3360 // The shift amount for signed integers is [0, 32].
3361 }
3362 // Normalize the given 64-bit integer.
3363 SDValue Norm = DAG.getNode(ISD::SHL, SL, MVT::i64, Src, ShAmt);
3364 // Split it again.
3365 std::tie(Lo, Hi) = split64BitValue(Norm, DAG);
3366 // Calculate the adjust bit for rounding.
3367 // (lo != 0) ? 1 : 0 => (lo >= 1) ? 1 : 0 => umin(1, lo)
3368 SDValue Adjust = DAG.getNode(ISD::UMIN, SL, MVT::i32,
3369 DAG.getConstant(1, SL, MVT::i32), Lo);
3370 // Get the 32-bit normalized integer.
3371 Norm = DAG.getNode(ISD::OR, SL, MVT::i32, Hi, Adjust);
3372 // Convert the normalized 32-bit integer into f32.
3373 unsigned Opc =
3374 (Signed && Subtarget->isGCN()) ? ISD::SINT_TO_FP : ISD::UINT_TO_FP;
3375 SDValue FVal = DAG.getNode(Opc, SL, MVT::f32, Norm);
3376
3377 // Finally, we need to scale back the converted floating-point number, as the
3378 // original 64-bit integer was converted as a 32-bit one.
3379 ShAmt = DAG.getNode(ISD::SUB, SL, MVT::i32, DAG.getConstant(32, SL, MVT::i32),
3380 ShAmt);
3381 // On GCN, use LDEXP directly.
3382 if (Subtarget->isGCN())
3383 return DAG.getNode(ISD::FLDEXP, SL, MVT::f32, FVal, ShAmt);
3384
3385 // Otherwise, align 'ShAmt' to the exponent part and add it into the exponent
3386 // part directly to emulate the multiplication of 2^ShAmt. That 8-bit
3387 // exponent is enough to avoid overflowing into the sign bit.
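// For example, adding (5 << 23) to the bits of 1.0f (0x3f800000) gives
// 0x42000000, which is 32.0f, i.e. 1.0f scaled by 2^5.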
3388 SDValue Exp = DAG.getNode(ISD::SHL, SL, MVT::i32, ShAmt,
3389 DAG.getConstant(23, SL, MVT::i32));
3390 SDValue IVal =
3391 DAG.getNode(ISD::ADD, SL, MVT::i32,
3392 DAG.getNode(ISD::BITCAST, SL, MVT::i32, FVal), Exp);
3393 if (Signed) {
3394 // Set the sign bit.
3395 Sign = DAG.getNode(ISD::SHL, SL, MVT::i32,
3396 DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Sign),
3397 DAG.getConstant(31, SL, MVT::i32));
3398 IVal = DAG.getNode(ISD::OR, SL, MVT::i32, IVal, Sign);
3399 }
3400 return DAG.getNode(ISD::BITCAST, SL, MVT::f32, IVal);
3401}
3402
3403SDValue AMDGPUTargetLowering::LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG,
3404 bool Signed) const {
3405 SDLoc SL(Op);
3406 SDValue Src = Op.getOperand(0);
3407
3408 SDValue Lo, Hi;
3409 std::tie(Lo, Hi) = split64BitValue(Src, DAG);
3410
3411 SDValue CvtHi = DAG.getNode(Signed ? ISD::SINT_TO_FP : ISD::UINT_TO_FP,
3412 SL, MVT::f64, Hi);
3413
3414 SDValue CvtLo = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f64, Lo);
3415
3416 SDValue LdExp = DAG.getNode(ISD::FLDEXP, SL, MVT::f64, CvtHi,
3417 DAG.getConstant(32, SL, MVT::i32));
3418 // TODO: Should this propagate fast-math-flags?
3419 return DAG.getNode(ISD::FADD, SL, MVT::f64, LdExp, CvtLo);
3420}
3421
3422SDValue AMDGPUTargetLowering::LowerUINT_TO_FP(SDValue Op,
3423 SelectionDAG &DAG) const {
3424 // TODO: Factor out code common with LowerSINT_TO_FP.
3425 EVT DestVT = Op.getValueType();
3426 SDValue Src = Op.getOperand(0);
3427 EVT SrcVT = Src.getValueType();
3428
3429 if (SrcVT == MVT::i16) {
3430 if (DestVT == MVT::f16)
3431 return Op;
3432 SDLoc DL(Op);
3433
3434 // Promote src to i32
3435 SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Src);
3436 return DAG.getNode(ISD::UINT_TO_FP, DL, DestVT, Ext);
3437 }
3438
3439 if (DestVT == MVT::bf16) {
3440 SDLoc SL(Op);
3441 SDValue ToF32 = DAG.getNode(ISD::UINT_TO_FP, SL, MVT::f32, Src);
3442 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3443 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3444 }
3445
3446 if (SrcVT != MVT::i64)
3447 return Op;
3448
3449 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3450 SDLoc DL(Op);
3451
3452 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3453 SDValue FPRoundFlag =
3454 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3455 SDValue FPRound =
3456 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3457
3458 return FPRound;
3459 }
3460
3461 if (DestVT == MVT::f32)
3462 return LowerINT_TO_FP32(Op, DAG, false);
3463
3464 assert(DestVT == MVT::f64);
3465 return LowerINT_TO_FP64(Op, DAG, false);
3466}
3467
3468SDValue AMDGPUTargetLowering::LowerSINT_TO_FP(SDValue Op,
3469 SelectionDAG &DAG) const {
3470 EVT DestVT = Op.getValueType();
3471
3472 SDValue Src = Op.getOperand(0);
3473 EVT SrcVT = Src.getValueType();
3474
3475 if (SrcVT == MVT::i16) {
3476 if (DestVT == MVT::f16)
3477 return Op;
3478
3479 SDLoc DL(Op);
3480 // Promote src to i32
3481 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i32, Src);
3482 return DAG.getNode(ISD::SINT_TO_FP, DL, DestVT, Ext);
3483 }
3484
3485 if (DestVT == MVT::bf16) {
3486 SDLoc SL(Op);
3487 SDValue ToF32 = DAG.getNode(ISD::SINT_TO_FP, SL, MVT::f32, Src);
3488 SDValue FPRoundFlag = DAG.getIntPtrConstant(0, SL, /*isTarget=*/true);
3489 return DAG.getNode(ISD::FP_ROUND, SL, MVT::bf16, ToF32, FPRoundFlag);
3490 }
3491
3492 if (SrcVT != MVT::i64)
3493 return Op;
3494
3495 // TODO: Factor out code common with LowerUINT_TO_FP.
3496
3497 if (Subtarget->has16BitInsts() && DestVT == MVT::f16) {
3498 SDLoc DL(Op);
3499 SDValue Src = Op.getOperand(0);
3500
3501 SDValue IntToFp32 = DAG.getNode(Op.getOpcode(), DL, MVT::f32, Src);
3502 SDValue FPRoundFlag =
3503 DAG.getIntPtrConstant(0, SDLoc(Op), /*isTarget=*/true);
3504 SDValue FPRound =
3505 DAG.getNode(ISD::FP_ROUND, DL, MVT::f16, IntToFp32, FPRoundFlag);
3506
3507 return FPRound;
3508 }
3509
3510 if (DestVT == MVT::f32)
3511 return LowerINT_TO_FP32(Op, DAG, true);
3512
3513 assert(DestVT == MVT::f64);
3514 return LowerINT_TO_FP64(Op, DAG, true);
3515}
3516
3517SDValue AMDGPUTargetLowering::LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG,
3518 bool Signed) const {
3519 SDLoc SL(Op);
3520
3521 SDValue Src = Op.getOperand(0);
3522 EVT SrcVT = Src.getValueType();
3523
3524 assert(SrcVT == MVT::f32 || SrcVT == MVT::f64);
3525
3526 // The basic idea of converting a floating point number into a pair of 32-bit
3527 // integers is illustrated as follows:
3528 //
3529 // tf := trunc(val);
3530 // hif := floor(tf * 2^-32);
3531 // lof := tf - hif * 2^32; // lof is always positive due to floor.
3532 // hi := fptoi(hif);
3533 // lo := fptoi(lof);
3534 //
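// For illustration, the unsigned f64 case as a plain C sketch (assuming
// <stdint.h> and <math.h>; the helper name is hypothetical):
//
//   uint64_t f64_to_u64(double val) {
//     double tf  = trunc(val);
//     double hif = floor(tf * 0x1p-32);        // tf * 2^-32
//     double lof = fma(hif, -0x1p32, tf);      // tf - hif * 2^32, always >= 0
//     uint32_t hi = (uint32_t)hif;
//     uint32_t lo = (uint32_t)lof;
//     return ((uint64_t)hi << 32) | lo;
//   }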
3535 SDValue Trunc = DAG.getNode(ISD::FTRUNC, SL, SrcVT, Src);
3536 SDValue Sign;
3537 if (Signed && SrcVT == MVT::f32) {
3538 // However, a 32-bit floating point number has only a 23-bit mantissa, which
3539 // is not enough to hold all the significant bits of `lof` if val is
3540 // negative. To avoid the loss of precision, we need to take the absolute
3541 // value after truncating and flip the result back based on the original
3542 // signedness.
3543 Sign = DAG.getNode(ISD::SRA, SL, MVT::i32,
3544 DAG.getNode(ISD::BITCAST, SL, MVT::i32, Trunc),
3545 DAG.getConstant(31, SL, MVT::i32));
3546 Trunc = DAG.getNode(ISD::FABS, SL, SrcVT, Trunc);
3547 }
3548
3549 SDValue K0, K1;
3550 if (SrcVT == MVT::f64) {
3551 K0 = DAG.getConstantFP(
3552 llvm::bit_cast<double>(UINT64_C(/*2^-32*/ 0x3df0000000000000)), SL,
3553 SrcVT);
3554 K1 = DAG.getConstantFP(
3555 llvm::bit_cast<double>(UINT64_C(/*-2^32*/ 0xc1f0000000000000)), SL,
3556 SrcVT);
3557 } else {
3558 K0 = DAG.getConstantFP(
3559 llvm::bit_cast<float>(UINT32_C(/*2^-32*/ 0x2f800000)), SL, SrcVT);
3560 K1 = DAG.getConstantFP(
3561 llvm::bit_cast<float>(UINT32_C(/*-2^32*/ 0xcf800000)), SL, SrcVT);
3562 }
3563 // TODO: Should this propagate fast-math-flags?
3564 SDValue Mul = DAG.getNode(ISD::FMUL, SL, SrcVT, Trunc, K0);
3565
3566 SDValue FloorMul = DAG.getNode(ISD::FFLOOR, SL, SrcVT, Mul);
3567
3568 SDValue Fma = DAG.getNode(ISD::FMA, SL, SrcVT, FloorMul, K1, Trunc);
3569
3570 SDValue Hi = DAG.getNode((Signed && SrcVT == MVT::f64) ? ISD::FP_TO_SINT
3571 : ISD::FP_TO_UINT,
3572 SL, MVT::i32, FloorMul);
3573 SDValue Lo = DAG.getNode(ISD::FP_TO_UINT, SL, MVT::i32, Fma);
3574
3575 SDValue Result = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3576 DAG.getBuildVector(MVT::v2i32, SL, {Lo, Hi}));
3577
3578 if (Signed && SrcVT == MVT::f32) {
3579 assert(Sign);
3580 // Flip the result based on the sign mask, which is either all 0s or all 1s.
3581 Sign = DAG.getNode(ISD::BITCAST, SL, MVT::i64,
3582 DAG.getBuildVector(MVT::v2i32, SL, {Sign, Sign}));
3583 // r := xor(r, sign) - sign;
3584 Result =
3585 DAG.getNode(ISD::SUB, SL, MVT::i64,
3586 DAG.getNode(ISD::XOR, SL, MVT::i64, Result, Sign), Sign);
3587 }
3588
3589 return Result;
3590}
3591
3592SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
3593 SDLoc DL(Op);
3594 SDValue N0 = Op.getOperand(0);
3595
3596 // Convert to target node to get known bits
3597 if (N0.getValueType() == MVT::f32)
3598 return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
3599
3600 if (Op->getFlags().hasApproximateFuncs()) {
3601 // There is a generic expand for FP_TO_FP16 with unsafe fast math.
3602 return SDValue();
3603 }
3604
3605 return LowerF64ToF16Safe(N0, DL, DAG);
3606}
3607
3608// Returns the result as an i32 node.
3609SDValue AMDGPUTargetLowering::LowerF64ToF16Safe(SDValue Src, const SDLoc &DL,
3610 SelectionDAG &DAG) const {
3611 assert(Src.getSimpleValueType() == MVT::f64);
3612
3613 // f64 -> f16 conversion using round-to-nearest-even rounding mode.
3614 // TODO: We can generate better code for True16.
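// For example, 1.0 is 0x3ff0000000000000 in f64: its exponent field is 1023,
// so rebiasing gives 1023 - 1023 + 15 = 15 and the f16 result is 0x3c00.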
3615 const unsigned ExpMask = 0x7ff;
3616 const unsigned ExpBiasf64 = 1023;
3617 const unsigned ExpBiasf16 = 15;
3618 SDValue Zero = DAG.getConstant(0, DL, MVT::i32);
3619 SDValue One = DAG.getConstant(1, DL, MVT::i32);
3620 SDValue U = DAG.getNode(ISD::BITCAST, DL, MVT::i64, Src);
3621 SDValue UH = DAG.getNode(ISD::SRL, DL, MVT::i64, U,
3622 DAG.getConstant(32, DL, MVT::i64));
3623 UH = DAG.getZExtOrTrunc(UH, DL, MVT::i32);
3624 U = DAG.getZExtOrTrunc(U, DL, MVT::i32);
3625 SDValue E = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3626 DAG.getConstant(20, DL, MVT::i64));
3627 E = DAG.getNode(ISD::AND, DL, MVT::i32, E,
3628 DAG.getConstant(ExpMask, DL, MVT::i32));
3629 // Subtract the fp64 exponent bias (1023) to get the real exponent and
3630 // add the f16 bias (15) to get the biased exponent for the f16 format.
3631 E = DAG.getNode(ISD::ADD, DL, MVT::i32, E,
3632 DAG.getConstant(-ExpBiasf64 + ExpBiasf16, DL, MVT::i32));
3633
3634 SDValue M = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3635 DAG.getConstant(8, DL, MVT::i32));
3636 M = DAG.getNode(ISD::AND, DL, MVT::i32, M,
3637 DAG.getConstant(0xffe, DL, MVT::i32));
3638
3639 SDValue MaskedSig = DAG.getNode(ISD::AND, DL, MVT::i32, UH,
3640 DAG.getConstant(0x1ff, DL, MVT::i32));
3641 MaskedSig = DAG.getNode(ISD::OR, DL, MVT::i32, MaskedSig, U);
3642
3643 SDValue Lo40Set = DAG.getSelectCC(DL, MaskedSig, Zero, Zero, One, ISD::SETEQ);
3644 M = DAG.getNode(ISD::OR, DL, MVT::i32, M, Lo40Set);
3645
3646 // (M != 0 ? 0x0200 : 0) | 0x7c00;
3647 SDValue I = DAG.getNode(ISD::OR, DL, MVT::i32,
3648 DAG.getSelectCC(DL, M, Zero, DAG.getConstant(0x0200, DL, MVT::i32),
3649 Zero, ISD::SETNE), DAG.getConstant(0x7c00, DL, MVT::i32));
3650
3651 // N = M | (E << 12);
3652 SDValue N = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3653 DAG.getNode(ISD::SHL, DL, MVT::i32, E,
3654 DAG.getConstant(12, DL, MVT::i32)));
3655
3656 // B = clamp(1-E, 0, 13);
3657 SDValue OneSubExp = DAG.getNode(ISD::SUB, DL, MVT::i32,
3658 One, E);
3659 SDValue B = DAG.getNode(ISD::SMAX, DL, MVT::i32, OneSubExp, Zero);
3660 B = DAG.getNode(ISD::SMIN, DL, MVT::i32, B,
3661 DAG.getConstant(13, DL, MVT::i32));
3662
3663 SDValue SigSetHigh = DAG.getNode(ISD::OR, DL, MVT::i32, M,
3664 DAG.getConstant(0x1000, DL, MVT::i32));
3665
3666 SDValue D = DAG.getNode(ISD::SRL, DL, MVT::i32, SigSetHigh, B);
3667 SDValue D0 = DAG.getNode(ISD::SHL, DL, MVT::i32, D, B);
3668 SDValue D1 = DAG.getSelectCC(DL, D0, SigSetHigh, One, Zero, ISD::SETNE);
3669 D = DAG.getNode(ISD::OR, DL, MVT::i32, D, D1);
3670
3671 SDValue V = DAG.getSelectCC(DL, E, One, D, N, ISD::SETLT);
3672 SDValue VLow3 = DAG.getNode(ISD::AND, DL, MVT::i32, V,
3673 DAG.getConstant(0x7, DL, MVT::i32));
3674 V = DAG.getNode(ISD::SRL, DL, MVT::i32, V,
3675 DAG.getConstant(2, DL, MVT::i32));
3676 SDValue V0 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(3, DL, MVT::i32),
3677 One, Zero, ISD::SETEQ);
3678 SDValue V1 = DAG.getSelectCC(DL, VLow3, DAG.getConstant(5, DL, MVT::i32),
3679 One, Zero, ISD::SETGT);
3680 V1 = DAG.getNode(ISD::OR, DL, MVT::i32, V0, V1);
3681 V = DAG.getNode(ISD::ADD, DL, MVT::i32, V, V1);
3682
3683 V = DAG.getSelectCC(DL, E, DAG.getConstant(30, DL, MVT::i32),
3684 DAG.getConstant(0x7c00, DL, MVT::i32), V, ISD::SETGT);
3685 V = DAG.getSelectCC(DL, E, DAG.getConstant(1039, DL, MVT::i32),
3686 I, V, ISD::SETEQ);
3687
3688 // Extract the sign bit.
3689 SDValue Sign = DAG.getNode(ISD::SRL, DL, MVT::i32, UH,
3690 DAG.getConstant(16, DL, MVT::i32));
3691 Sign = DAG.getNode(ISD::AND, DL, MVT::i32, Sign,
3692 DAG.getConstant(0x8000, DL, MVT::i32));
3693
3694 return DAG.getNode(ISD::OR, DL, MVT::i32, Sign, V);
3695}
3696
3697SDValue AMDGPUTargetLowering::LowerFP_TO_INT(const SDValue Op,
3698 SelectionDAG &DAG) const {
3699 SDValue Src = Op.getOperand(0);
3700 unsigned OpOpcode = Op.getOpcode();
3701 EVT SrcVT = Src.getValueType();
3702 EVT DestVT = Op.getValueType();
3703
3704 // Will be selected natively
3705 if (SrcVT == MVT::f16 && DestVT == MVT::i16)
3706 return Op;
3707
3708 if (SrcVT == MVT::bf16) {
3709 SDLoc DL(Op);
3710 SDValue PromotedSrc = DAG.getNode(ISD::FP_EXTEND, DL, MVT::f32, Src);
3711 return DAG.getNode(Op.getOpcode(), DL, DestVT, PromotedSrc);
3712 }
3713
3714 // Promote i16 to i32
3715 if (DestVT == MVT::i16 && (SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
3716 SDLoc DL(Op);
3717
3718 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3719 return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, FpToInt32);
3720 }
3721
3722 if (DestVT != MVT::i64)
3723 return Op;
3724
3725 if (SrcVT == MVT::f16 ||
3726 (SrcVT == MVT::f32 && Src.getOpcode() == ISD::FP16_TO_FP)) {
3727 SDLoc DL(Op);
3728
3729 SDValue FpToInt32 = DAG.getNode(OpOpcode, DL, MVT::i32, Src);
3730 unsigned Ext =
3731 OpOpcode == ISD::FP_TO_SINT ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
3732 return DAG.getNode(Ext, DL, MVT::i64, FpToInt32);
3733 }
3734
3735 if (SrcVT == MVT::f32 || SrcVT == MVT::f64)
3736 return LowerFP_TO_INT64(Op, DAG, OpOpcode == ISD::FP_TO_SINT);
3737
3738 return SDValue();
3739}
3740
3741SDValue AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
3742 SelectionDAG &DAG) const {
3743 EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
3744 MVT VT = Op.getSimpleValueType();
3745 MVT ScalarVT = VT.getScalarType();
3746
3747 assert(VT.isVector());
3748
3749 SDValue Src = Op.getOperand(0);
3750 SDLoc DL(Op);
3751
3752 // TODO: Don't scalarize on Evergreen?
3753 unsigned NElts = VT.getVectorNumElements();
3754 SmallVector<SDValue, 8> Args;
3755 DAG.ExtractVectorElements(Src, Args, 0, NElts);
3756
3757 SDValue VTOp = DAG.getValueType(ExtraVT.getScalarType());
3758 for (unsigned I = 0; I < NElts; ++I)
3759 Args[I] = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ScalarVT, Args[I], VTOp);
3760
3761 return DAG.getBuildVector(VT, DL, Args);
3762}
3763
3764//===----------------------------------------------------------------------===//
3765// Custom DAG optimizations
3766//===----------------------------------------------------------------------===//
3767
3768static bool isU24(SDValue Op, SelectionDAG &DAG) {
3769 return AMDGPUTargetLowering::numBitsUnsigned(Op, DAG) <= 24;
3770}
3771
3772static bool isI24(SDValue Op, SelectionDAG &DAG) {
3773 EVT VT = Op.getValueType();
3774 return VT.getSizeInBits() >= 24 && // Types less than 24-bit should be treated
3775 // as unsigned 24-bit values.
3776 AMDGPUTargetLowering::numBitsSigned(Op, DAG) <= 24;
3777}
3778
3779SDValue AMDGPUTargetLowering::simplifyMul24(SDNode *Node24,
3780 TargetLowering::DAGCombinerInfo &DCI) const {
3781 SelectionDAG &DAG = DCI.DAG;
3782 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
3783 bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
3784
3785 SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
3786 SDValue RHS = IsIntrin ? Node24->getOperand(2) : Node24->getOperand(1);
3787 unsigned NewOpcode = Node24->getOpcode();
3788 if (IsIntrin) {
3789 unsigned IID = Node24->getConstantOperandVal(0);
3790 switch (IID) {
3791 case Intrinsic::amdgcn_mul_i24:
3792 NewOpcode = AMDGPUISD::MUL_I24;
3793 break;
3794 case Intrinsic::amdgcn_mul_u24:
3795 NewOpcode = AMDGPUISD::MUL_U24;
3796 break;
3797 case Intrinsic::amdgcn_mulhi_i24:
3798 NewOpcode = AMDGPUISD::MULHI_I24;
3799 break;
3800 case Intrinsic::amdgcn_mulhi_u24:
3801 NewOpcode = AMDGPUISD::MULHI_U24;
3802 break;
3803 default:
3804 llvm_unreachable("Expected 24-bit mul intrinsic");
3805 }
3806 }
3807
3808 APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
3809
3810 // First try to simplify using SimplifyMultipleUseDemandedBits which allows
3811 // the operands to have other uses, but will only perform simplifications that
3812 // involve bypassing some nodes for this user.
3813 SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
3814 SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
3815 if (DemandedLHS || DemandedRHS)
3816 return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
3817 DemandedLHS ? DemandedLHS : LHS,
3818 DemandedRHS ? DemandedRHS : RHS);
3819
3820 // Now try SimplifyDemandedBits which can simplify the nodes used by our
3821 // operands if this node is the only user.
3822 if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
3823 return SDValue(Node24, 0);
3824 if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
3825 return SDValue(Node24, 0);
3826
3827 return SDValue();
3828}
3829
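// Constant fold a bit-field extract of Src0: for example, extracting 8 bits
// at offset 8 from 0xabcd1234 folds to the constant 0x12; when IntTy is
// signed, the extracted field is sign-extended instead.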
3830template <typename IntTy>
3831static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset,
3832 uint32_t Width, const SDLoc &DL) {
3833 if (Width + Offset < 32) {
3834 uint32_t Shl = static_cast<uint32_t>(Src0) << (32 - Offset - Width);
3835 IntTy Result = static_cast<IntTy>(Shl) >> (32 - Width);
3836 if constexpr (std::is_signed_v<IntTy>) {
3837 return DAG.getSignedConstant(Result, DL, MVT::i32);
3838 } else {
3839 return DAG.getConstant(Result, DL, MVT::i32);
3840 }
3841 }
3842
3843 return DAG.getConstant(Src0 >> Offset, DL, MVT::i32);
3844}
3845
3846static bool hasVolatileUser(SDNode *Val) {
3847 for (SDNode *U : Val->users()) {
3848 if (MemSDNode *M = dyn_cast<MemSDNode>(U)) {
3849 if (M->isVolatile())
3850 return true;
3851 }
3852 }
3853
3854 return false;
3855}
3856
3857bool AMDGPUTargetLowering::shouldCombineMemoryType(EVT VT) const {
3858 // i32 vectors are the canonical memory type.
3859 if (VT.getScalarType() == MVT::i32 || isTypeLegal(VT))
3860 return false;
3861
3862 if (!VT.isByteSized())
3863 return false;
3864
3865 unsigned Size = VT.getStoreSize();
3866
3867 if ((Size == 1 || Size == 2 || Size == 4) && !VT.isVector())
3868 return false;
3869
3870 if (Size == 3 || (Size > 4 && (Size % 4 != 0)))
3871 return false;
3872
3873 return true;
3874}
3875
3876// Replace load of an illegal type with a bitcast from a load of a friendlier
3877// type.
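// For example, a 32-bit (v4i8 (load ...)) becomes
// (v4i8 (bitcast (i32 (load ...)))), so the memory access itself uses the
// canonical 32-bit integer type.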
3878SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
3879 DAGCombinerInfo &DCI) const {
3880 if (!DCI.isBeforeLegalize())
3881 return SDValue();
3882
3883 LoadSDNode *LN = cast<LoadSDNode>(N);
3884 if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
3885 return SDValue();
3886
3887 SDLoc SL(N);
3888 SelectionDAG &DAG = DCI.DAG;
3889 EVT VT = LN->getMemoryVT();
3890
3891 unsigned Size = VT.getStoreSize();
3892 Align Alignment = LN->getAlign();
3893 if (Alignment < Size && isTypeLegal(VT)) {
3894 unsigned IsFast;
3895 unsigned AS = LN->getAddressSpace();
3896
3897 // Expand unaligned loads earlier than legalization. Due to visitation order
3898 // problems during legalization, the emitted instructions to pack and unpack
3899 // the bytes again are not eliminated in the case of an unaligned copy.
3900 if (!allowsMisalignedMemoryAccesses(
3901 VT, AS, Alignment, LN->getMemOperand()->getFlags(), &IsFast)) {
3902 if (VT.isVector())
3903 return SplitVectorLoad(SDValue(LN, 0), DAG);
3904
3905 SDValue Ops[2];
3906 std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(LN, DAG);
3907
3908 return DAG.getMergeValues(Ops, SDLoc(N));
3909 }
3910
3911 if (!IsFast)
3912 return SDValue();
3913 }
3914
3915 if (!shouldCombineMemoryType(VT))
3916 return SDValue();
3917
3918 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3919
3920 SDValue NewLoad
3921 = DAG.getLoad(NewVT, SL, LN->getChain(),
3922 LN->getBasePtr(), LN->getMemOperand());
3923
3924 SDValue BC = DAG.getNode(ISD::BITCAST, SL, VT, NewLoad);
3925 DCI.CombineTo(N, BC, NewLoad.getValue(1));
3926 return SDValue(N, 0);
3927}
3928
3929// Replace store of an illegal type with a store of a bitcast to a friendlier
3930// type.
3931SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
3932 DAGCombinerInfo &DCI) const {
3933 if (!DCI.isBeforeLegalize())
3934 return SDValue();
3935
3936 StoreSDNode *SN = cast<StoreSDNode>(N);
3937 if (!SN->isSimple() || !ISD::isNormalStore(SN))
3938 return SDValue();
3939
3940 EVT VT = SN->getMemoryVT();
3941 unsigned Size = VT.getStoreSize();
3942
3943 SDLoc SL(N);
3944 SelectionDAG &DAG = DCI.DAG;
3945 Align Alignment = SN->getAlign();
3946 if (Alignment < Size && isTypeLegal(VT)) {
3947 unsigned IsFast;
3948 unsigned AS = SN->getAddressSpace();
3949
3950 // Expand unaligned stores earlier than legalization. Due to visitation
3951 // order problems during legalization, the emitted instructions to pack and
3952 // unpack the bytes again are not eliminated in the case of an unaligned
3953 // copy.
3954 if (!allowsMisalignedMemoryAccesses(
3955 VT, AS, Alignment, SN->getMemOperand()->getFlags(), &IsFast)) {
3956 if (VT.isVector())
3957 return SplitVectorStore(SDValue(SN, 0), DAG);
3958
3959 return expandUnalignedStore(SN, DAG);
3960 }
3961
3962 if (!IsFast)
3963 return SDValue();
3964 }
3965
3966 if (!shouldCombineMemoryType(VT))
3967 return SDValue();
3968
3969 EVT NewVT = getEquivalentMemType(*DAG.getContext(), VT);
3970 SDValue Val = SN->getValue();
3971
3972 //DCI.AddToWorklist(Val.getNode());
3973
3974 bool OtherUses = !Val.hasOneUse();
3975 SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, NewVT, Val);
3976 if (OtherUses) {
3977 SDValue CastBack = DAG.getNode(ISD::BITCAST, SL, VT, CastVal);
3978 DAG.ReplaceAllUsesOfValueWith(Val, CastBack);
3979 }
3980
3981 return DAG.getStore(SN->getChain(), SL, CastVal,
3982 SN->getBasePtr(), SN->getMemOperand());
3983}
3984
3985// FIXME: This should go in generic DAG combiner with an isTruncateFree check,
3986// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU
3987// issues.
3988SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N,
3989 DAGCombinerInfo &DCI) const {
3990 SelectionDAG &DAG = DCI.DAG;
3991 SDValue N0 = N->getOperand(0);
3992
3993 // (vt2 (assertzext (truncate vt0:x), vt1)) ->
3994 // (vt2 (truncate (assertzext vt0:x, vt1)))
3995 if (N0.getOpcode() == ISD::TRUNCATE) {
3996 SDValue N1 = N->getOperand(1);
3997 EVT ExtVT = cast<VTSDNode>(N1)->getVT();
3998 SDLoc SL(N);
3999
4000 SDValue Src = N0.getOperand(0);
4001 EVT SrcVT = Src.getValueType();
4002 if (SrcVT.bitsGE(ExtVT)) {
4003 SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1);
4004 return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg);
4005 }
4006 }
4007
4008 return SDValue();
4009}
4010
4011SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
4012 SDNode *N, DAGCombinerInfo &DCI) const {
4013 unsigned IID = N->getConstantOperandVal(0);
4014 switch (IID) {
4015 case Intrinsic::amdgcn_mul_i24:
4016 case Intrinsic::amdgcn_mul_u24:
4017 case Intrinsic::amdgcn_mulhi_i24:
4018 case Intrinsic::amdgcn_mulhi_u24:
4019 return simplifyMul24(N, DCI);
4020 case Intrinsic::amdgcn_fract:
4021 case Intrinsic::amdgcn_rsq:
4022 case Intrinsic::amdgcn_rcp_legacy:
4023 case Intrinsic::amdgcn_rsq_legacy:
4024 case Intrinsic::amdgcn_rsq_clamp:
4025 case Intrinsic::amdgcn_tanh:
4026 case Intrinsic::amdgcn_prng_b32: {
4027 // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
4028 SDValue Src = N->getOperand(1);
4029 return Src.isUndef() ? Src : SDValue();
4030 }
4031 case Intrinsic::amdgcn_frexp_exp: {
4032 // frexp_exp (fneg x) -> frexp_exp x
4033 // frexp_exp (fabs x) -> frexp_exp x
4034 // frexp_exp (fneg (fabs x)) -> frexp_exp x
4035 SDValue Src = N->getOperand(1);
4036 SDValue PeekSign = peekFPSignOps(Src);
4037 if (PeekSign == Src)
4038 return SDValue();
4039 return SDValue(DCI.DAG.UpdateNodeOperands(N, N->getOperand(0), PeekSign),
4040 0);
4041 }
4042 default:
4043 return SDValue();
4044 }
4045}
4046
4047/// Split the 64-bit value \p LHS into two 32-bit components, and perform the
4048/// binary operation \p Opc to it with the corresponding constant operands.
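/// For example, an i64 AND with the constant 0x00000000ff000000 is split into
/// (and lo_32(LHS), 0xff000000) and (and hi_32(LHS), 0x0), which are then
/// recombined with a build_vector and a bitcast back to i64.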
4049SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
4050 DAGCombinerInfo &DCI, const SDLoc &SL,
4051 unsigned Opc, SDValue LHS,
4052 uint32_t ValLo, uint32_t ValHi) const {
4053 SelectionDAG &DAG = DCI.DAG;
4054 SDValue Lo, Hi;
4055 std::tie(Lo, Hi) = split64BitValue(LHS, DAG);
4056
4057 SDValue LoRHS = DAG.getConstant(ValLo, SL, MVT::i32);
4058 SDValue HiRHS = DAG.getConstant(ValHi, SL, MVT::i32);
4059
4060 SDValue LoAnd = DAG.getNode(Opc, SL, MVT::i32, Lo, LoRHS);
4061 SDValue HiAnd = DAG.getNode(Opc, SL, MVT::i32, Hi, HiRHS);
4062
4063 // Re-visit the ands. It's possible we eliminated one of them and it could
4064 // simplify the vector.
4065 DCI.AddToWorklist(Lo.getNode());
4066 DCI.AddToWorklist(Hi.getNode());
4067
4068 SDValue Vec = DAG.getBuildVector(MVT::v2i32, SL, {LoAnd, HiAnd});
4069 return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
4070}
4071
4072SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
4073 DAGCombinerInfo &DCI) const {
4074 EVT VT = N->getValueType(0);
4075 SDValue LHS = N->getOperand(0);
4076 SDValue RHS = N->getOperand(1);
4077 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4078 SDLoc SL(N);
4079 SelectionDAG &DAG = DCI.DAG;
4080
4081 unsigned RHSVal;
4082 if (CRHS) {
4083 RHSVal = CRHS->getZExtValue();
4084 if (!RHSVal)
4085 return LHS;
4086
4087 switch (LHS->getOpcode()) {
4088 default:
4089 break;
4090 case ISD::ZERO_EXTEND:
4091 case ISD::SIGN_EXTEND:
4092 case ISD::ANY_EXTEND: {
4093 SDValue X = LHS->getOperand(0);
4094
4095 if (VT == MVT::i32 && RHSVal == 16 && X.getValueType() == MVT::i16 &&
4096 isOperationLegal(ISD::BUILD_VECTOR, MVT::v2i16)) {
4097 // Prefer build_vector as the canonical form if packed types are legal.
4098 // (shl ([asz]ext i16:x), 16) -> build_vector 0, x
4099 SDValue Vec = DAG.getBuildVector(
4100 MVT::v2i16, SL,
4101 {DAG.getConstant(0, SL, MVT::i16), LHS->getOperand(0)});
4102 return DAG.getNode(ISD::BITCAST, SL, MVT::i32, Vec);
4103 }
4104
4105 // shl (ext x) => zext (shl x), if shift does not overflow int
4106 if (VT != MVT::i64)
4107 break;
4108 KnownBits Known = DAG.computeKnownBits(X);
4109 unsigned LZ = Known.countMinLeadingZeros();
4110 if (LZ < RHSVal)
4111 break;
4112 EVT XVT = X.getValueType();
4113 SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(CRHS, 0));
4114 return DAG.getZExtOrTrunc(Shl, SL, VT);
4115 }
4116 }
4117 }
4118
4119 if (VT.getScalarType() != MVT::i64)
4120 return SDValue();
4121
4122 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4123 // common case, splitting this into a move and a 32-bit shift is faster and
4124 // the same code size.
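// For example, (shl i64:x, 40) becomes
// (bitcast (build_vector (i32 0), (shl (i32 (trunc x)), 8))), leaving only a
// 32-bit shift for the high half.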
4125 KnownBits Known = DAG.computeKnownBits(RHS);
4126
4127 EVT ElementType = VT.getScalarType();
4128 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4129 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4130 : TargetScalarType;
4131
4132 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4133 return SDValue();
4134 SDValue ShiftAmt;
4135
4136 if (CRHS) {
4137 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4138 TargetType);
4139 } else {
4140 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4141 const SDValue ShiftMask =
4142 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4143 // This AND instruction will clamp out of bounds shift values.
4144 // It will also be removed during later instruction selection.
4145 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4146 }
4147
4148 SDValue Lo = DAG.getNode(ISD::TRUNCATE, SL, TargetType, LHS);
4149 SDValue NewShift =
4150 DAG.getNode(ISD::SHL, SL, TargetType, Lo, ShiftAmt, N->getFlags());
4151
4152 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4153 SDValue Vec;
4154
4155 if (VT.isVector()) {
4156 EVT ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4157 unsigned NElts = TargetType.getVectorNumElements();
4158 SmallVector<SDValue, 8> HiOps;
4159 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4160
4161 DAG.ExtractVectorElements(NewShift, HiOps, 0, NElts);
4162 for (unsigned I = 0; I != NElts; ++I)
4163 HiAndLoOps[2 * I + 1] = HiOps[I];
4164 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4165 } else {
4166 EVT ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4167 Vec = DAG.getBuildVector(ConcatType, SL, {Zero, NewShift});
4168 }
4169 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4170}
4171
4172SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
4173 DAGCombinerInfo &DCI) const {
4174 SDValue RHS = N->getOperand(1);
4175 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4176 EVT VT = N->getValueType(0);
4177 SDValue LHS = N->getOperand(0);
4178 SelectionDAG &DAG = DCI.DAG;
4179 SDLoc SL(N);
4180
4181 if (VT.getScalarType() != MVT::i64)
4182 return SDValue();
4183
4184 // For C >= 32
4185 // i64 (sra x, C) -> (build_pair (sra hi_32(x), C - 32), sra hi_32(x), 31))
4186
4187 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4188 // common case, splitting this into a move and a 32-bit shift is faster and
4189 // the same code size.
4190 KnownBits Known = DAG.computeKnownBits(RHS);
4191
4192 EVT ElementType = VT.getScalarType();
4193 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4194 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4195 : TargetScalarType;
4196
4197 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4198 return SDValue();
4199
4200 SDValue ShiftFullAmt =
4201 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4202 SDValue ShiftAmt;
4203 if (CRHS) {
4204 unsigned RHSVal = CRHS->getZExtValue();
4205 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4206 TargetType);
4207 } else if (Known.getMinValue().getZExtValue() ==
4208 (ElementType.getSizeInBits() - 1)) {
4209 ShiftAmt = ShiftFullAmt;
4210 } else {
4211 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4212 const SDValue ShiftMask =
4213 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4214 // This AND instruction will clamp out of bounds shift values.
4215 // It will also be removed during later instruction selection.
4216 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4217 }
4218
4219 EVT ConcatType;
4220 SDValue Hi;
4221 SDLoc LHSSL(LHS);
4222 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4223 if (VT.isVector()) {
4224 unsigned NElts = TargetType.getVectorNumElements();
4225 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4226 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4227 SmallVector<SDValue, 8> HiOps(NElts);
4228 SmallVector<SDValue, 16> HiAndLoOps;
4229
4230 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, 0, NElts * 2);
4231 for (unsigned I = 0; I != NElts; ++I) {
4232 HiOps[I] = HiAndLoOps[2 * I + 1];
4233 }
4234 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4235 } else {
4236 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4237 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4238 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4239 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4240 }
4241
4242 KnownBits KnownLHS = DAG.computeKnownBits(LHS);
4243 SDValue HiShift;
4244 if (KnownLHS.isNegative()) {
4245 HiShift = DAG.getAllOnesConstant(SL, TargetType);
4246 } else {
4247 Hi = DAG.getFreeze(Hi);
4248 HiShift = DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftFullAmt);
4249 }
4250 SDValue NewShift =
4251 DAG.getNode(ISD::SRA, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4252
4253 SDValue Vec;
4254 if (VT.isVector()) {
4255 unsigned NElts = TargetType.getVectorNumElements();
4256 SmallVector<SDValue, 8> HiOps;
4257 SmallVector<SDValue, 8> LoOps;
4258 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2);
4259
4260 DAG.ExtractVectorElements(HiShift, HiOps, 0, NElts);
4261 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4262 for (unsigned I = 0; I != NElts; ++I) {
4263 HiAndLoOps[2 * I + 1] = HiOps[I];
4264 HiAndLoOps[2 * I] = LoOps[I];
4265 }
4266 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4267 } else {
4268 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, HiShift});
4269 }
4270 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4271}
4272
4273SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
4274 DAGCombinerInfo &DCI) const {
4275 SDValue RHS = N->getOperand(1);
4276 ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
4277 EVT VT = N->getValueType(0);
4278 SDValue LHS = N->getOperand(0);
4279 SelectionDAG &DAG = DCI.DAG;
4280 SDLoc SL(N);
4281 unsigned RHSVal;
4282
4283 if (CRHS) {
4284 RHSVal = CRHS->getZExtValue();
4285
4286 // fold (srl (and x, c1 << c2), c2) -> (and (srl x, c2), c1)
4287 // this improves the ability to match BFE patterns in isel.
4288 if (LHS.getOpcode() == ISD::AND) {
4289 if (auto *Mask = dyn_cast<ConstantSDNode>(LHS.getOperand(1))) {
4290 unsigned MaskIdx, MaskLen;
4291 if (Mask->getAPIntValue().isShiftedMask(MaskIdx, MaskLen) &&
4292 MaskIdx == RHSVal) {
4293 return DAG.getNode(ISD::AND, SL, VT,
4294 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(0),
4295 N->getOperand(1)),
4296 DAG.getNode(ISD::SRL, SL, VT, LHS.getOperand(1),
4297 N->getOperand(1)));
4298 }
4299 }
4300 }
4301 }
4302
4303 if (VT.getScalarType() != MVT::i64)
4304 return SDValue();
4305
4306 // for C >= 32
4307 // i64 (srl x, C) -> (build_pair (srl hi_32(x), C - 32), 0)
4308
4309 // On some subtargets, 64-bit shift is a quarter rate instruction. In the
4310 // common case, splitting this into a move and a 32-bit shift is faster and
4311 // the same code size.
4312 KnownBits Known = DAG.computeKnownBits(RHS);
4313
4314 EVT ElementType = VT.getScalarType();
4315 EVT TargetScalarType = ElementType.getHalfSizedIntegerVT(*DAG.getContext());
4316 EVT TargetType = VT.isVector() ? VT.changeVectorElementType(TargetScalarType)
4317 : TargetScalarType;
4318
4319 if (Known.getMinValue().getZExtValue() < TargetScalarType.getSizeInBits())
4320 return SDValue();
4321
4322 SDValue ShiftAmt;
4323 if (CRHS) {
4324 ShiftAmt = DAG.getConstant(RHSVal - TargetScalarType.getSizeInBits(), SL,
4325 TargetType);
4326 } else {
4327 SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
4328 const SDValue ShiftMask =
4329 DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
4330 // This AND instruction will clamp out of bounds shift values.
4331 // It will also be removed during later instruction selection.
4332 ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
4333 }
4334
4335 const SDValue Zero = DAG.getConstant(0, SL, TargetScalarType);
4336 EVT ConcatType;
4337 SDValue Hi;
4338 SDLoc LHSSL(LHS);
4339 // Bitcast LHS into ConcatType so hi-half of source can be extracted into Hi
4340 if (VT.isVector()) {
4341 unsigned NElts = TargetType.getVectorNumElements();
4342 ConcatType = TargetType.getDoubleNumVectorElementsVT(*DAG.getContext());
4343 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4344 SmallVector<SDValue, 8> HiOps(NElts);
4345 SmallVector<SDValue, 16> HiAndLoOps;
4346
4347 DAG.ExtractVectorElements(SplitLHS, HiAndLoOps, /*Start=*/0, NElts * 2);
4348 for (unsigned I = 0; I != NElts; ++I)
4349 HiOps[I] = HiAndLoOps[2 * I + 1];
4350 Hi = DAG.getNode(ISD::BUILD_VECTOR, LHSSL, TargetType, HiOps);
4351 } else {
4352 const SDValue One = DAG.getConstant(1, LHSSL, TargetScalarType);
4353 ConcatType = EVT::getVectorVT(*DAG.getContext(), TargetType, 2);
4354 SDValue SplitLHS = DAG.getNode(ISD::BITCAST, LHSSL, ConcatType, LHS);
4355 Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, LHSSL, TargetType, SplitLHS, One);
4356 }
4357
4358 SDValue NewShift =
4359 DAG.getNode(ISD::SRL, SL, TargetType, Hi, ShiftAmt, N->getFlags());
4360
4361 SDValue Vec;
4362 if (VT.isVector()) {
4363 unsigned NElts = TargetType.getVectorNumElements();
4364 SmallVector<SDValue, 8> LoOps;
4365 SmallVector<SDValue, 16> HiAndLoOps(NElts * 2, Zero);
4366
4367 DAG.ExtractVectorElements(NewShift, LoOps, 0, NElts);
4368 for (unsigned I = 0; I != NElts; ++I)
4369 HiAndLoOps[2 * I] = LoOps[I];
4370 Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, ConcatType, HiAndLoOps);
4371 } else {
4372 Vec = DAG.getBuildVector(ConcatType, SL, {NewShift, Zero});
4373 }
4374 return DAG.getNode(ISD::BITCAST, SL, VT, Vec);
4375}
4376
4377SDValue AMDGPUTargetLowering::performTruncateCombine(
4378 SDNode *N, DAGCombinerInfo &DCI) const {
4379 SDLoc SL(N);
4380 SelectionDAG &DAG = DCI.DAG;
4381 EVT VT = N->getValueType(0);
4382 SDValue Src = N->getOperand(0);
4383
4384 // vt1 (truncate (bitcast (build_vector vt0:x, ...))) -> vt1 (bitcast vt0:x)
4385 if (Src.getOpcode() == ISD::BITCAST && !VT.isVector()) {
4386 SDValue Vec = Src.getOperand(0);
4387 if (Vec.getOpcode() == ISD::BUILD_VECTOR) {
4388 SDValue Elt0 = Vec.getOperand(0);
4389 EVT EltVT = Elt0.getValueType();
4390 if (VT.getFixedSizeInBits() <= EltVT.getFixedSizeInBits()) {
4391 if (EltVT.isFloatingPoint()) {
4392 Elt0 = DAG.getNode(ISD::BITCAST, SL,
4393 EltVT.changeTypeToInteger(), Elt0);
4394 }
4395
4396 return DAG.getNode(ISD::TRUNCATE, SL, VT, Elt0);
4397 }
4398 }
4399 }
4400
4401 // Equivalent of above for accessing the high element of a vector as an
4402 // integer operation.
4403 // trunc (srl (bitcast (build_vector x, y))), 16 -> trunc (bitcast y)
4404 if (Src.getOpcode() == ISD::SRL && !VT.isVector()) {
4405 if (auto *K = isConstOrConstSplat(Src.getOperand(1))) {
4406 SDValue BV = stripBitcast(Src.getOperand(0));
4407 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
4408 EVT SrcEltVT = BV.getOperand(0).getValueType();
4409 unsigned SrcEltSize = SrcEltVT.getSizeInBits();
4410 unsigned BitIndex = K->getZExtValue();
4411 unsigned PartIndex = BitIndex / SrcEltSize;
4412
4413 if (PartIndex * SrcEltSize == BitIndex &&
4414 PartIndex < BV.getNumOperands()) {
4415 if (SrcEltVT.getSizeInBits() == VT.getSizeInBits()) {
4416 SDValue SrcElt =
4417 DAG.getNode(ISD::BITCAST, SL, SrcEltVT.changeTypeToInteger(),
4418 BV.getOperand(PartIndex));
4419 return DAG.getNode(ISD::TRUNCATE, SL, VT, SrcElt);
4420 }
4421 }
4422 }
4423 }
4424 }
4425
4426 // Partially shrink 64-bit shifts to 32-bit if reduced to 16-bit.
4427 //
4428 // i16 (trunc (srl i64:x, K)), K <= 16 ->
4429 // i16 (trunc (srl (i32 (trunc x), K)))
4430 if (VT.getScalarSizeInBits() < 32) {
4431 EVT SrcVT = Src.getValueType();
4432 if (SrcVT.getScalarSizeInBits() > 32 &&
4433 (Src.getOpcode() == ISD::SRL ||
4434 Src.getOpcode() == ISD::SRA ||
4435 Src.getOpcode() == ISD::SHL)) {
4436 SDValue Amt = Src.getOperand(1);
4437 KnownBits Known = DAG.computeKnownBits(Amt);
4438
4439 // - For left shifts, do the transform as long as the shift
4440 // amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
4441 // - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
4442 // losing information stored in the high bits when truncating.
4443 const unsigned MaxCstSize =
4444 (Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
4445 if (Known.getMaxValue().ule(MaxCstSize)) {
4446 EVT MidVT = VT.isVector() ?
4447 EVT::getVectorVT(*DAG.getContext(), MVT::i32,
4448 VT.getVectorNumElements()) : MVT::i32;
4449
4450 EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
4451 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
4452 Src.getOperand(0));
4453 DCI.AddToWorklist(Trunc.getNode());
4454
4455 if (Amt.getValueType() != NewShiftVT) {
4456 Amt = DAG.getZExtOrTrunc(Amt, SL, NewShiftVT);
4457 DCI.AddToWorklist(Amt.getNode());
4458 }
4459
4460 SDValue ShrunkShift = DAG.getNode(Src.getOpcode(), SL, MidVT,
4461 Trunc, Amt);
4462 return DAG.getNode(ISD::TRUNCATE, SL, VT, ShrunkShift);
4463 }
4464 }
4465 }
4466
4467 return SDValue();
4468}
4469
4470// We need to specifically handle i64 mul here to avoid unnecessary conversion
4471// instructions. If we only match on the legalized i64 mul expansion,
4472// SimplifyDemandedBits will be unable to remove them because there will be
4473// multiple uses due to the separate mul + mulh[su].
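// For example, an i64 multiply of two 24-bit operands becomes
// (build_pair (mul_u24 x, y), (mulhi_u24 x, y)) rather than the generic i64
// expansion.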
4474static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL,
4475 SDValue N0, SDValue N1, unsigned Size, bool Signed) {
4476 if (Size <= 32) {
4477 unsigned MulOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4478 return DAG.getNode(MulOpc, SL, MVT::i32, N0, N1);
4479 }
4480
4481 unsigned MulLoOpc = Signed ? AMDGPUISD::MUL_I24 : AMDGPUISD::MUL_U24;
4482 unsigned MulHiOpc = Signed ? AMDGPUISD::MULHI_I24 : AMDGPUISD::MULHI_U24;
4483
4484 SDValue MulLo = DAG.getNode(MulLoOpc, SL, MVT::i32, N0, N1);
4485 SDValue MulHi = DAG.getNode(MulHiOpc, SL, MVT::i32, N0, N1);
4486
4487 return DAG.getNode(ISD::BUILD_PAIR, SL, MVT::i64, MulLo, MulHi);
4488}
4489
4490/// If \p V is an add of a constant 1, returns the other operand. Otherwise
4491/// return SDValue().
4492static SDValue getAddOneOp(const SDNode *V) {
4493 if (V->getOpcode() != ISD::ADD)
4494 return SDValue();
4495
4496 return isOneConstant(V->getOperand(1)) ? V->getOperand(0) : SDValue();
4497}
4498
4499SDValue AMDGPUTargetLowering::performMulCombine(SDNode *N,
4500 DAGCombinerInfo &DCI) const {
4501 assert(N->getOpcode() == ISD::MUL);
4502 EVT VT = N->getValueType(0);
4503
4504 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4505 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4506 // unnecessarily). isDivergent() is used as an approximation of whether the
4507 // value is in an SGPR.
4508 if (!N->isDivergent())
4509 return SDValue();
4510
4511 unsigned Size = VT.getSizeInBits();
4512 if (VT.isVector() || Size > 64)
4513 return SDValue();
4514
4515 SelectionDAG &DAG = DCI.DAG;
4516 SDLoc DL(N);
4517
4518 SDValue N0 = N->getOperand(0);
4519 SDValue N1 = N->getOperand(1);
4520
4521 // Undo InstCombine canonicalize X * (Y + 1) -> X * Y + X to enable mad
4522 // matching.
4523
4524 // mul x, (add y, 1) -> add (mul x, y), x
4525 auto IsFoldableAdd = [](SDValue V) -> SDValue {
4526 SDValue AddOp = getAddOneOp(V.getNode());
4527 if (!AddOp)
4528 return SDValue();
4529
4530 if (V.hasOneUse() || all_of(V->users(), [](const SDNode *U) -> bool {
4531 return U->getOpcode() == ISD::MUL;
4532 }))
4533 return AddOp;
4534
4535 return SDValue();
4536 };
4537
4538 // FIXME: The selection pattern is not properly checking for commuted
4539 // operands, so we have to place the mul in the LHS
4540 if (SDValue MulOper = IsFoldableAdd(N0)) {
4541 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N1, MulOper);
4542 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N1);
4543 }
4544
4545 if (SDValue MulOper = IsFoldableAdd(N1)) {
4546 SDValue MulVal = DAG.getNode(N->getOpcode(), DL, VT, N0, MulOper);
4547 return DAG.getNode(ISD::ADD, DL, VT, MulVal, N0);
4548 }
4549
4550 // There are i16 integer mul/mad.
4551 if (Subtarget->has16BitInsts() && VT.getScalarType().bitsLE(MVT::i16))
4552 return SDValue();
4553
4554 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4555 // in the source into any_extends if the result of the mul is truncated. Since
4556 // we can assume the high bits are whatever we want, use the underlying value
4557 // to avoid the unknown high bits from interfering.
4558 if (N0.getOpcode() == ISD::ANY_EXTEND)
4559 N0 = N0.getOperand(0);
4560
4561 if (N1.getOpcode() == ISD::ANY_EXTEND)
4562 N1 = N1.getOperand(0);
4563
4564 SDValue Mul;
4565
4566 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4567 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4568 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4569 Mul = getMul24(DAG, DL, N0, N1, Size, false);
4570 } else if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4571 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4572 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4573 Mul = getMul24(DAG, DL, N0, N1, Size, true);
4574 } else {
4575 return SDValue();
4576 }
4577
4578 // We need to use sext even for MUL_U24, because MUL_U24 is used
4579 // for signed multiply of 8 and 16-bit types.
4580 return DAG.getSExtOrTrunc(Mul, DL, VT);
4581}
4582
4583SDValue
4584AMDGPUTargetLowering::performMulLoHiCombine(SDNode *N,
4585 DAGCombinerInfo &DCI) const {
4586 if (N->getValueType(0) != MVT::i32)
4587 return SDValue();
4588
4589 SelectionDAG &DAG = DCI.DAG;
4590 SDLoc DL(N);
4591
4592 bool Signed = N->getOpcode() == ISD::SMUL_LOHI;
4593 SDValue N0 = N->getOperand(0);
4594 SDValue N1 = N->getOperand(1);
4595
4596 // SimplifyDemandedBits has the annoying habit of turning useful zero_extends
4597 // in the source into any_extends if the result of the mul is truncated. Since
4598 // we can assume the high bits are whatever we want, use the underlying value
4599 // to avoid the unknown high bits from interfering.
4600 if (N0.getOpcode() == ISD::ANY_EXTEND)
4601 N0 = N0.getOperand(0);
4602 if (N1.getOpcode() == ISD::ANY_EXTEND)
4603 N1 = N1.getOperand(0);
4604
4605 // Try to use two fast 24-bit multiplies (one for each half of the result)
4606 // instead of one slow extending multiply.
4607 unsigned LoOpcode = 0;
4608 unsigned HiOpcode = 0;
4609 if (Signed) {
4610 if (Subtarget->hasMulI24() && isI24(N0, DAG) && isI24(N1, DAG)) {
4611 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4612 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4613 LoOpcode = AMDGPUISD::MUL_I24;
4614 HiOpcode = AMDGPUISD::MULHI_I24;
4615 }
4616 } else {
4617 if (Subtarget->hasMulU24() && isU24(N0, DAG) && isU24(N1, DAG)) {
4618 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4619 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4620 LoOpcode = AMDGPUISD::MUL_U24;
4621 HiOpcode = AMDGPUISD::MULHI_U24;
4622 }
4623 }
4624 if (!LoOpcode)
4625 return SDValue();
4626
4627 SDValue Lo = DAG.getNode(LoOpcode, DL, MVT::i32, N0, N1);
4628 SDValue Hi = DAG.getNode(HiOpcode, DL, MVT::i32, N0, N1);
4629 DCI.CombineTo(N, Lo, Hi);
4630 return SDValue(N, 0);
4631}
4632
4633SDValue AMDGPUTargetLowering::performMulhsCombine(SDNode *N,
4634 DAGCombinerInfo &DCI) const {
4635 EVT VT = N->getValueType(0);
4636
4637 if (!Subtarget->hasMulI24() || VT.isVector())
4638 return SDValue();
4639
4640 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4641 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4642 // unnecessarily). isDivergent() is used as an approximation of whether the
4643 // value is in an SGPR.
4644 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4645 // valu op anyway)
4646 if (Subtarget->hasSMulHi() && !N->isDivergent())
4647 return SDValue();
4648
4649 SelectionDAG &DAG = DCI.DAG;
4650 SDLoc DL(N);
4651
4652 SDValue N0 = N->getOperand(0);
4653 SDValue N1 = N->getOperand(1);
4654
4655 if (!isI24(N0, DAG) || !isI24(N1, DAG))
4656 return SDValue();
4657
4658 N0 = DAG.getSExtOrTrunc(N0, DL, MVT::i32);
4659 N1 = DAG.getSExtOrTrunc(N1, DL, MVT::i32);
4660
4661 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_I24, DL, MVT::i32, N0, N1);
4662 DCI.AddToWorklist(Mulhi.getNode());
4663 return DAG.getSExtOrTrunc(Mulhi, DL, VT);
4664}
4665
4666SDValue AMDGPUTargetLowering::performMulhuCombine(SDNode *N,
4667 DAGCombinerInfo &DCI) const {
4668 EVT VT = N->getValueType(0);
4669
4670 if (!Subtarget->hasMulU24() || VT.isVector() || VT.getSizeInBits() > 32)
4671 return SDValue();
4672
4673 // Don't generate 24-bit multiplies on values that are in SGPRs, since
4674 // we only have a 32-bit scalar multiply (avoid values being moved to VGPRs
4675 // unnecessarily). isDivergent() is used as an approximation of whether the
4676 // value is in an SGPR.
4677 // This doesn't apply if no s_mul_hi is available (since we'll end up with a
4678 // valu op anyway)
4679 if (Subtarget->hasSMulHi() && !N->isDivergent())
4680 return SDValue();
4681
4682 SelectionDAG &DAG = DCI.DAG;
4683 SDLoc DL(N);
4684
4685 SDValue N0 = N->getOperand(0);
4686 SDValue N1 = N->getOperand(1);
4687
4688 if (!isU24(N0, DAG) || !isU24(N1, DAG))
4689 return SDValue();
4690
4691 N0 = DAG.getZExtOrTrunc(N0, DL, MVT::i32);
4692 N1 = DAG.getZExtOrTrunc(N1, DL, MVT::i32);
4693
4694 SDValue Mulhi = DAG.getNode(AMDGPUISD::MULHI_U24, DL, MVT::i32, N0, N1);
4695 DCI.AddToWorklist(Mulhi.getNode());
4696 return DAG.getZExtOrTrunc(Mulhi, DL, VT);
4697}
4698
4699SDValue AMDGPUTargetLowering::getFFBX_U32(SelectionDAG &DAG,
4700 SDValue Op,
4701 const SDLoc &DL,
4702 unsigned Opc) const {
4703 EVT VT = Op.getValueType();
4704 EVT LegalVT = getTypeToTransformTo(*DAG.getContext(), VT);
4705 if (LegalVT != MVT::i32 && (Subtarget->has16BitInsts() &&
4706 LegalVT != MVT::i16))
4707 return SDValue();
4708
4709 if (VT != MVT::i32)
4710 Op = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, Op);
4711
4712 SDValue FFBX = DAG.getNode(Opc, DL, MVT::i32, Op);
4713 if (VT != MVT::i32)
4714 FFBX = DAG.getNode(ISD::TRUNCATE, DL, VT, FFBX);
4715
4716 return FFBX;
4717}
4718
4719// The native instructions return -1 on 0 input. Optimize out a select that
4720// produces -1 on 0.
4721//
4722// TODO: If zero is not undef, we could also do this if the output is compared
4723// against the bitwidth.
4724//
4725// TODO: Should probably combine against FFBH_U32 instead of ctlz directly.
4726SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond,
4727 SDValue LHS, SDValue RHS,
4728 DAGCombinerInfo &DCI) const {
4729 if (!isNullConstant(Cond.getOperand(1)))
4730 return SDValue();
4731
4732 SelectionDAG &DAG = DCI.DAG;
4733 ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
4734 SDValue CmpLHS = Cond.getOperand(0);
4735
4736 // select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
4737 // select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
4738 if (CCOpcode == ISD::SETEQ &&
4739 (isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
4740 RHS.getOperand(0) == CmpLHS && isAllOnesConstant(LHS)) {
4741 unsigned Opc =
4742 isCtlzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4743 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4744 }
4745
4746 // select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
4747 // select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
4748 if (CCOpcode == ISD::SETNE &&
4749 (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
4750 LHS.getOperand(0) == CmpLHS && isAllOnesConstant(RHS)) {
4751 unsigned Opc =
4752 isCtlzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBH_U32 : AMDGPUISD::FFBL_U32;
4753
4754 return getFFBX_U32(DAG, CmpLHS, SL, Opc);
4755 }
4756
4757 return SDValue();
4758}
4759
4760static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI,
4761 unsigned Op,
4762 const SDLoc &SL,
4763 SDValue Cond,
4764 SDValue N1,
4765 SDValue N2) {
4766 SelectionDAG &DAG = DCI.DAG;
4767 EVT VT = N1.getValueType();
4768
4769 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT, Cond,
4770 N1.getOperand(0), N2.getOperand(0));
4771 DCI.AddToWorklist(NewSelect.getNode());
4772 return DAG.getNode(Op, SL, VT, NewSelect);
4773}
4774
4775// Pull a free FP operation out of a select so it may fold into uses.
4776//
4777// select c, (fneg x), (fneg y) -> fneg (select c, x, y)
4778// select c, (fneg x), k -> fneg (select c, x, (fneg k))
4779//
4780// select c, (fabs x), (fabs y) -> fabs (select c, x, y)
4781// select c, (fabs x), +k -> fabs (select c, x, k)
4782SDValue
4783AMDGPUTargetLowering::foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI,
4784 SDValue N) const {
4785 SelectionDAG &DAG = DCI.DAG;
4786 SDValue Cond = N.getOperand(0);
4787 SDValue LHS = N.getOperand(1);
4788 SDValue RHS = N.getOperand(2);
4789
4790 EVT VT = N.getValueType();
4791 if ((LHS.getOpcode() == ISD::FABS && RHS.getOpcode() == ISD::FABS) ||
4792 (LHS.getOpcode() == ISD::FNEG && RHS.getOpcode() == ISD::FNEG)) {
4793 if (!AMDGPUTargetLowering::allUsesHaveSourceMods(N.getNode()))
4794 return SDValue();
4795
4796 return distributeOpThroughSelect(DCI, LHS.getOpcode(),
4797 SDLoc(N), Cond, LHS, RHS);
4798 }
4799
4800 bool Inv = false;
4801 if (RHS.getOpcode() == ISD::FABS || RHS.getOpcode() == ISD::FNEG) {
4802 std::swap(LHS, RHS);
4803 Inv = true;
4804 }
4805
4806 // TODO: Support vector constants.
4807 ConstantFPSDNode *CRHS = dyn_cast<ConstantFPSDNode>(RHS);
4808 if ((LHS.getOpcode() == ISD::FNEG || LHS.getOpcode() == ISD::FABS) && CRHS &&
4809 !selectSupportsSourceMods(N.getNode())) {
4810 SDLoc SL(N);
4811 // If one side is an fneg/fabs and the other is a constant, we can push the
4812 // fneg/fabs down. If it's an fabs, the constant needs to be non-negative.
4813 SDValue NewLHS = LHS.getOperand(0);
4814 SDValue NewRHS = RHS;
4815
4816 // Careful: if the neg can be folded up, don't try to pull it back down.
4817 bool ShouldFoldNeg = true;
4818
4819 if (NewLHS.hasOneUse()) {
4820 unsigned Opc = NewLHS.getOpcode();
4821 if (LHS.getOpcode() == ISD::FNEG && fnegFoldsIntoOp(NewLHS.getNode()))
4822 ShouldFoldNeg = false;
4823 if (LHS.getOpcode() == ISD::FABS && Opc == ISD::FMUL)
4824 ShouldFoldNeg = false;
4825 }
4826
4827 if (ShouldFoldNeg) {
4828 if (LHS.getOpcode() == ISD::FABS && CRHS->isNegative())
4829 return SDValue();
4830
4831 // We're going to be forced to use a source modifier anyway, there's no
4832 // point to pulling the negate out unless we can get a size reduction by
4833 // negating the constant.
4834 //
4835 // TODO: Generalize to use getCheaperNegatedExpression which doesn't know
4836 // about cheaper constants.
4837 if (NewLHS.getOpcode() == ISD::FABS &&
4839 return SDValue();
4840
4842 return SDValue();
4843
4844 if (LHS.getOpcode() == ISD::FNEG)
4845 NewRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
4846
4847 if (Inv)
4848 std::swap(NewLHS, NewRHS);
4849
4850 SDValue NewSelect = DAG.getNode(ISD::SELECT, SL, VT,
4851 Cond, NewLHS, NewRHS);
4852 DCI.AddToWorklist(NewSelect.getNode());
4853 return DAG.getNode(LHS.getOpcode(), SL, VT, NewSelect);
4854 }
4855 }
4856
4857 return SDValue();
4858}
4859
4860SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
4861 DAGCombinerInfo &DCI) const {
4862 if (SDValue Folded = foldFreeOpFromSelect(DCI, SDValue(N, 0)))
4863 return Folded;
4864
4865 SDValue Cond = N->getOperand(0);
4866 if (Cond.getOpcode() != ISD::SETCC)
4867 return SDValue();
4868
4869 EVT VT = N->getValueType(0);
4870 SDValue LHS = Cond.getOperand(0);
4871 SDValue RHS = Cond.getOperand(1);
4872 SDValue CC = Cond.getOperand(2);
4873
4874 SDValue True = N->getOperand(1);
4875 SDValue False = N->getOperand(2);
4876
4877 if (Cond.hasOneUse()) { // TODO: Look for multiple select uses.
4878 SelectionDAG &DAG = DCI.DAG;
4879 if (DAG.isConstantValueOfAnyType(True) &&
4880 !DAG.isConstantValueOfAnyType(False)) {
4881 // Swap cmp + select pair to move constant to false input.
4882 // This will allow using VOPC cndmasks more often.
4883 // select (setcc x, y), k, x -> select (setccinv x, y), x, k
4884
4885 SDLoc SL(N);
4886 ISD::CondCode NewCC =
4887 getSetCCInverse(cast<CondCodeSDNode>(CC)->get(), LHS.getValueType());
4888
4889 SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
4890 return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
4891 }
4892
4893 if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
4894 SDValue MinMax
4895 = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
4896 // Revisit this node so we can catch min3/max3/med3 patterns.
4897 //DCI.AddToWorklist(MinMax.getNode());
4898 return MinMax;
4899 }
4900 }
4901
4902 // There's no reason to not do this if the condition has other uses.
4903 return performCtlz_CttzCombine(SDLoc(N), Cond, True, False, DCI);
4904}
4905
4906static bool isInv2Pi(const APFloat &APF) {
4907 static const APFloat KF16(APFloat::IEEEhalf(), APInt(16, 0x3118));
4908 static const APFloat KF32(APFloat::IEEEsingle(), APInt(32, 0x3e22f983));
4909 static const APFloat KF64(APFloat::IEEEdouble(), APInt(64, 0x3fc45f306dc9c882));
4910
4911 return APF.bitwiseIsEqual(KF16) ||
4912 APF.bitwiseIsEqual(KF32) ||
4913 APF.bitwiseIsEqual(KF64);
4914}
4915
4916// 0 and 1.0 / (0.5 * pi) do not have inline immediates, so there is an
4917// additional cost to negate them.
4918TargetLowering::NegatibleCost
4919AMDGPUTargetLowering::getConstantNegateCost(const ConstantFPSDNode *C) const {
4920 if (C->isZero())
4921 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4922
4923 if (Subtarget->hasInv2PiInlineImm() && isInv2Pi(C->getValueAPF()))
4924 return C->isNegative() ? NegatibleCost::Cheaper : NegatibleCost::Expensive;
4925
4926 return NegatibleCost::Neutral;
4927}
4928
4934
4940
4941static unsigned inverseMinMax(unsigned Opc) {
4942 switch (Opc) {
4943 case ISD::FMAXNUM:
4944 return ISD::FMINNUM;
4945 case ISD::FMINNUM:
4946 return ISD::FMAXNUM;
4947 case ISD::FMAXNUM_IEEE:
4948 return ISD::FMINNUM_IEEE;
4949 case ISD::FMINNUM_IEEE:
4950 return ISD::FMAXNUM_IEEE;
4951 case ISD::FMAXIMUM:
4952 return ISD::FMINIMUM;
4953 case ISD::FMINIMUM:
4954 return ISD::FMAXIMUM;
4955 case ISD::FMAXIMUMNUM:
4956 return ISD::FMINIMUMNUM;
4957 case ISD::FMINIMUMNUM:
4958 return ISD::FMAXIMUMNUM;
4959 case AMDGPUISD::FMAX_LEGACY:
4960 return AMDGPUISD::FMIN_LEGACY;
4961 case AMDGPUISD::FMIN_LEGACY:
4962 return AMDGPUISD::FMAX_LEGACY;
4963 default:
4964 llvm_unreachable("invalid min/max opcode");
4965 }
4966}
4967
4968/// \return true if it's profitable to try to push an fneg into its source
4969/// instruction.
4970static bool shouldFoldFNegIntoSrc(SDNode *N, SDValue N0) {
4971 // If the input has multiple uses and we can either fold the negate down, or
4972 // the other uses cannot, give up. This both prevents unprofitable
4973 // transformations and infinite loops: we won't repeatedly try to fold around
4974 // a negate that has no 'good' form.
4975 if (N0.hasOneUse()) {
4976 // This may be able to fold into the source, but at a code size cost. Don't
4977 // fold if the fold into the user is free.
4978 if (allUsesHaveSourceMods(N, 0))
4979 return false;
4980 } else {
4981 if (fnegFoldsIntoOp(N0.getNode()) &&
4982 (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
4983 return false;
4984 }
4985
4986 return true;
4987}
4988
4989SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
4990 DAGCombinerInfo &DCI) const {
4991 SelectionDAG &DAG = DCI.DAG;
4992 SDValue N0 = N->getOperand(0);
4993 EVT VT = N->getValueType(0);
4994
4995 unsigned Opc = N0.getOpcode();
4996
4997 if (!shouldFoldFNegIntoSrc(N, N0))
4998 return SDValue();
4999
5000 SDLoc SL(N);
5001 switch (Opc) {
5002 case ISD::FADD: {
5003 if (!mayIgnoreSignedZero(N0))
5004 return SDValue();
5005
5006 // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y))
5007 SDValue LHS = N0.getOperand(0);
5008 SDValue RHS = N0.getOperand(1);
5009
5010 if (LHS.getOpcode() != ISD::FNEG)
5011 LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5012 else
5013 LHS = LHS.getOperand(0);
5014
5015 if (RHS.getOpcode() != ISD::FNEG)
5016 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5017 else
5018 RHS = RHS.getOperand(0);
5019
5020 SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS, N0->getFlags());
5021 if (Res.getOpcode() != ISD::FADD)
5022 return SDValue(); // Op got folded away.
5023 if (!N0.hasOneUse())
5024 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5025 return Res;
5026 }
5027 case ISD::FMUL:
5028 case AMDGPUISD::FMUL_LEGACY: {
5029 // (fneg (fmul x, y)) -> (fmul x, (fneg y))
5030 // (fneg (fmul_legacy x, y)) -> (fmul_legacy x, (fneg y))
5031 SDValue LHS = N0.getOperand(0);
5032 SDValue RHS = N0.getOperand(1);
5033
5034 if (LHS.getOpcode() == ISD::FNEG)
5035 LHS = LHS.getOperand(0);
5036 else if (RHS.getOpcode() == ISD::FNEG)
5037 RHS = RHS.getOperand(0);
5038 else
5039 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5040
5041 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, RHS, N0->getFlags());
5042 if (Res.getOpcode() != Opc)
5043 return SDValue(); // Op got folded away.
5044 if (!N0.hasOneUse())
5045 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5046 return Res;
5047 }
5048 case ISD::FMA:
5049 case ISD::FMAD: {
5050 // TODO: handle llvm.amdgcn.fma.legacy
5051 if (!mayIgnoreSignedZero(N0))
5052 return SDValue();
5053
5054 // (fneg (fma x, y, z)) -> (fma x, (fneg y), (fneg z))
5055 SDValue LHS = N0.getOperand(0);
5056 SDValue MHS = N0.getOperand(1);
5057 SDValue RHS = N0.getOperand(2);
5058
5059 if (LHS.getOpcode() == ISD::FNEG)
5060 LHS = LHS.getOperand(0);
5061 else if (MHS.getOpcode() == ISD::FNEG)
5062 MHS = MHS.getOperand(0);
5063 else
5064 MHS = DAG.getNode(ISD::FNEG, SL, VT, MHS);
5065
5066 if (RHS.getOpcode() != ISD::FNEG)
5067 RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5068 else
5069 RHS = RHS.getOperand(0);
5070
5071 SDValue Res = DAG.getNode(Opc, SL, VT, LHS, MHS, RHS);
5072 if (Res.getOpcode() != Opc)
5073 return SDValue(); // Op got folded away.
5074 if (!N0.hasOneUse())
5075 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5076 return Res;
5077 }
5078 case ISD::FMAXNUM:
5079 case ISD::FMINNUM:
5080 case ISD::FMAXNUM_IEEE:
5081 case ISD::FMINNUM_IEEE:
5082 case ISD::FMINIMUM:
5083 case ISD::FMAXIMUM:
5084 case ISD::FMINIMUMNUM:
5085 case ISD::FMAXIMUMNUM:
5086 case AMDGPUISD::FMAX_LEGACY:
5087 case AMDGPUISD::FMIN_LEGACY: {
5088 // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
5089 // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
5090 // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
5091 // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
5092
5093 SDValue LHS = N0.getOperand(0);
5094 SDValue RHS = N0.getOperand(1);
5095
5096 // 0 doesn't have a negated inline immediate.
5097 // TODO: This constant check should be generalized to other operations.
5098 if (isConstantCostlierToNegate(RHS))
5099 return SDValue();
5100
5101 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
5102 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
5103 unsigned Opposite = inverseMinMax(Opc);
5104
5105 SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
5106 if (Res.getOpcode() != Opposite)
5107 return SDValue(); // Op got folded away.
5108 if (!N0.hasOneUse())
5109 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
5110 return Res;
5111 }
5112 case AMDGPUISD::FMED3: {
5113 SDValue Ops[3];
5114 for (unsigned I = 0; I < 3; ++I)
5115 Ops[I] = DAG.getNode(ISD::FNEG, SL, VT, N0->getOperand(I), N0->getFlags());
5116
5117 SDValue Res = DAG.getNode(AMDGPUISD::FMED3, SL, VT, Ops, N0->getFlags());
5118 if (Res.getOpcode() != AMDGPUISD::FMED3)
5119 return SDValue(); // Op got folded away.
5120
5121 if (!N0.hasOneUse()) {
5122 SDValue Neg = DAG.getNode(ISD::FNEG, SL, VT, Res);
5123 DAG.ReplaceAllUsesWith(N0, Neg);
5124
5125 for (SDNode *U : Neg->users())
5126 DCI.AddToWorklist(U);
5127 }
5128
5129 return Res;
5130 }
5131 case ISD::FP_EXTEND:
5132 case ISD::FTRUNC:
5133 case ISD::FRINT:
5134 case ISD::FNEARBYINT: // XXX - Should fround be handled?
5135 case ISD::FROUNDEVEN:
5136 case ISD::FSIN:
5137 case ISD::FCANONICALIZE:
5138 case AMDGPUISD::RCP:
5139 case AMDGPUISD::RCP_LEGACY:
5140 case AMDGPUISD::RCP_IFLAG:
5141 case AMDGPUISD::SIN_HW: {
5142 SDValue CvtSrc = N0.getOperand(0);
5143 if (CvtSrc.getOpcode() == ISD::FNEG) {
5144 // (fneg (fp_extend (fneg x))) -> (fp_extend x)
5145 // (fneg (rcp (fneg x))) -> (rcp x)
5146 return DAG.getNode(Opc, SL, VT, CvtSrc.getOperand(0));
5147 }
5148
5149 if (!N0.hasOneUse())
5150 return SDValue();
5151
5152 // (fneg (fp_extend x)) -> (fp_extend (fneg x))
5153 // (fneg (rcp x)) -> (rcp (fneg x))
5154 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5155 return DAG.getNode(Opc, SL, VT, Neg, N0->getFlags());
5156 }
5157 case ISD::FP_ROUND: {
5158 SDValue CvtSrc = N0.getOperand(0);
5159
5160 if (CvtSrc.getOpcode() == ISD::FNEG) {
5161 // (fneg (fp_round (fneg x))) -> (fp_round x)
5162 return DAG.getNode(ISD::FP_ROUND, SL, VT,
5163 CvtSrc.getOperand(0), N0.getOperand(1));
5164 }
5165
5166 if (!N0.hasOneUse())
5167 return SDValue();
5168
5169 // (fneg (fp_round x)) -> (fp_round (fneg x))
5170 SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
5171 return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
5172 }
5173 case ISD::FP16_TO_FP: {
5174 // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
5175 // f16, but legalization of f16 fneg ends up pulling it out of the source.
5176 // Put the fneg back as a legal source operation that can be matched later.
5177 SDLoc SL(N);
5178
5179 SDValue Src = N0.getOperand(0);
5180 EVT SrcVT = Src.getValueType();
5181
5182 // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
5183 SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
5184 DAG.getConstant(0x8000, SL, SrcVT));
5185 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
5186 }
5187 case ISD::SELECT: {
5188 // fneg (select c, a, b) -> select c, (fneg a), (fneg b)
5189 // TODO: Invert conditions of foldFreeOpFromSelect
5190 return SDValue();
5191 }
5192 case ISD::BITCAST: {
5193 SDLoc SL(N);
5194 SDValue BCSrc = N0.getOperand(0);
5195 if (BCSrc.getOpcode() == ISD::BUILD_VECTOR) {
5196 SDValue HighBits = BCSrc.getOperand(BCSrc.getNumOperands() - 1);
5197 if (HighBits.getValueType().getSizeInBits() != 32 ||
5198 !fnegFoldsIntoOp(HighBits.getNode()))
5199 return SDValue();
5200
5201 // f64 fneg only really needs to operate on the high half of the
5202 // register, so try to force it to an f32 operation to help make use of
5203 // source modifiers.
5204 //
5205 //
5206 // fneg (f64 (bitcast (build_vector x, y))) ->
5207 // f64 (bitcast (build_vector (bitcast i32:x to f32),
5208 // (fneg (bitcast i32:y to f32)))
5209
5210 SDValue CastHi = DAG.getNode(ISD::BITCAST, SL, MVT::f32, HighBits);
5211 SDValue NegHi = DAG.getNode(ISD::FNEG, SL, MVT::f32, CastHi);
5212 SDValue CastBack =
5213 DAG.getNode(ISD::BITCAST, SL, HighBits.getValueType(), NegHi);
5214
5215 SmallVector<SDValue, 8> Ops(BCSrc->ops());
5216 Ops.back() = CastBack;
5217 DCI.AddToWorklist(NegHi.getNode());
5218 SDValue Build =
5219 DAG.getNode(ISD::BUILD_VECTOR, SL, BCSrc.getValueType(), Ops);
5220 SDValue Result = DAG.getNode(ISD::BITCAST, SL, VT, Build);
5221
5222 if (!N0.hasOneUse())
5223 DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Result));
5224 return Result;
5225 }
5226
5227 if (BCSrc.getOpcode() == ISD::SELECT && VT == MVT::f32 &&
5228 BCSrc.hasOneUse()) {
5229 // fneg (bitcast (f32 (select cond, i32:lhs, i32:rhs))) ->
5230 // select cond, (bitcast i32:lhs to f32), (bitcast i32:rhs to f32)
5231
5232 // TODO: Cast back result for multiple uses is beneficial in some cases.
5233
5234 SDValue LHS =
5235 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(1));
5236 SDValue RHS =
5237 DAG.getNode(ISD::BITCAST, SL, MVT::f32, BCSrc.getOperand(2));
5238
5239 SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, LHS);
5240 SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, MVT::f32, RHS);
5241
5242 return DAG.getNode(ISD::SELECT, SL, MVT::f32, BCSrc.getOperand(0), NegLHS,
5243 NegRHS);
5244 }
5245
5246 return SDValue();
5247 }
5248 default:
5249 return SDValue();
5250 }
5251}
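// The payoff of pushing fneg toward the leaves is that the selector can
// absorb it into a VOP source modifier. A minimal sketch of the idea
// (assumed example, not taken from this file):
//
//   before:  t1 = fmul f32 x, y        after:  t2 = fneg f32 y
//            t3 = fneg f32 t1                  t3 = fmul f32 x, t2
//
// t2 can then be matched as a negated source operand, roughly
//   v_mul_f32 v0, vx, -vy
// so no separate negate instruction is emitted.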
5252
5253SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
5254 DAGCombinerInfo &DCI) const {
5255 SelectionDAG &DAG = DCI.DAG;
5256 SDValue N0 = N->getOperand(0);
5257
5258 if (!N0.hasOneUse())
5259 return SDValue();
5260
5261 switch (N0.getOpcode()) {
5262 case ISD::FP16_TO_FP: {
5263 assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
5264 SDLoc SL(N);
5265 SDValue Src = N0.getOperand(0);
5266 EVT SrcVT = Src.getValueType();
5267
5268 // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
5269 SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
5270 DAG.getConstant(0x7fff, SL, SrcVT));
5271 return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
5272 }
5273 default:
5274 return SDValue();
5275 }
5276}
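// Worked bit pattern for the combine above (illustrative): fabs of the f16
// value -1.0 (0xBC00) becomes (and 0xBC00, 0x7fff) = 0x3C00, which is +1.0,
// so the fabs survives as a legal integer op on the i16 source.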
5277
5278SDValue AMDGPUTargetLowering::performRcpCombine(SDNode *N,
5279 DAGCombinerInfo &DCI) const {
5280 const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
5281 if (!CFP)
5282 return SDValue();
5283
5284 // XXX - Should this flush denormals?
5285 const APFloat &Val = CFP->getValueAPF();
5286 APFloat One(Val.getSemantics(), "1.0");
5287 return DCI.DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
5288}
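// Illustrative constant fold: (AMDGPUISD::RCP (f32 2.0)) becomes the constant
// 0.5 via the APFloat division above; whether denormal results should be
// flushed is left as the open question noted in the XXX comment.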
5289
5290SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
5291 DAGCombinerInfo &DCI) const {
5292 SelectionDAG &DAG = DCI.DAG;
5293 SDLoc DL(N);
5294
5295 switch(N->getOpcode()) {
5296 default:
5297 break;
5298 case ISD::BITCAST: {
5299 EVT DestVT = N->getValueType(0);
5300
5301 // Push casts through vector builds. This helps avoid emitting a large
5302 // number of copies when materializing floating point vector constants.
5303 //
5304 // vNt1 bitcast (vNt0 (build_vector t0:x, t0:y)) =>
5305 // vNt1 = build_vector (t1 (bitcast t0:x)), (t1 (bitcast t0:y))
5306 if (DestVT.isVector()) {
5307 SDValue Src = N->getOperand(0);
5308 if (Src.getOpcode() == ISD::BUILD_VECTOR &&
5311 EVT SrcVT = Src.getValueType();
5312 unsigned NElts = DestVT.getVectorNumElements();
5313
5314 if (SrcVT.getVectorNumElements() == NElts) {
5315 EVT DestEltVT = DestVT.getVectorElementType();
5316
5317 SmallVector<SDValue, 8> CastedElts;
5318 SDLoc SL(N);
5319 for (unsigned I = 0, E = SrcVT.getVectorNumElements(); I != E; ++I) {
5320 SDValue Elt = Src.getOperand(I);
5321 CastedElts.push_back(DAG.getNode(ISD::BITCAST, DL, DestEltVT, Elt));
5322 }
5323
5324 return DAG.getBuildVector(DestVT, SL, CastedElts);
5325 }
5326 }
5327 }
5328
5329 if (DestVT.getSizeInBits() != 64 || !DestVT.isVector())
5330 break;
5331
5332 // Fold bitcasts of constants.
5333 //
5334 // v2i32 (bitcast i64:k) -> build_vector lo_32(k), hi_32(k)
5335 // TODO: Generalize and move to DAGCombiner
5336 SDValue Src = N->getOperand(0);
5337 if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Src)) {
5338 SDLoc SL(N);
5339 uint64_t CVal = C->getZExtValue();
5340 SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5341 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5342 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5343 return DAG.getNode(ISD::BITCAST, SL, DestVT, BV);
5344 }
5345
5346 if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Src)) {
5347 const APInt &Val = C->getValueAPF().bitcastToAPInt();
5348 SDLoc SL(N);
5349 uint64_t CVal = Val.getZExtValue();
5350 SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32,
5351 DAG.getConstant(Lo_32(CVal), SL, MVT::i32),
5352 DAG.getConstant(Hi_32(CVal), SL, MVT::i32));
5353
5354 return DAG.getNode(ISD::BITCAST, SL, DestVT, Vec);
5355 }
5356
5357 break;
5358 }
5359 case ISD::SHL:
5360 case ISD::SRA:
5361 case ISD::SRL: {
5362 // Range metadata can be invalidated when loads are converted to legal types
5363 // (e.g. v2i64 -> v4i32).
5364 // Try to convert vector shl/sra/srl before type legalization so that range
5365 // metadata can be utilized.
5366 if (!(N->getValueType(0).isVector() &&
5369 break;
5370 if (N->getOpcode() == ISD::SHL)
5371 return performShlCombine(N, DCI);
5372 if (N->getOpcode() == ISD::SRA)
5373 return performSraCombine(N, DCI);
5374 return performSrlCombine(N, DCI);
5375 }
5376 case ISD::TRUNCATE:
5377 return performTruncateCombine(N, DCI);
5378 case ISD::MUL:
5379 return performMulCombine(N, DCI);
5380 case AMDGPUISD::MUL_U24:
5381 case AMDGPUISD::MUL_I24: {
5382 if (SDValue Simplified = simplifyMul24(N, DCI))
5383 return Simplified;
5384 break;
5385 }
5386 case AMDGPUISD::MULHI_I24:
5387 case AMDGPUISD::MULHI_U24:
5388 return simplifyMul24(N, DCI);
5389 case ISD::SMUL_LOHI:
5390 case ISD::UMUL_LOHI:
5391 return performMulLoHiCombine(N, DCI);
5392 case ISD::MULHS:
5393 return performMulhsCombine(N, DCI);
5394 case ISD::MULHU:
5395 return performMulhuCombine(N, DCI);
5396 case ISD::SELECT:
5397 return performSelectCombine(N, DCI);
5398 case ISD::FNEG:
5399 return performFNegCombine(N, DCI);
5400 case ISD::FABS:
5401 return performFAbsCombine(N, DCI);
5402 case AMDGPUISD::BFE_I32:
5403 case AMDGPUISD::BFE_U32: {
5404 assert(!N->getValueType(0).isVector() &&
5405 "Vector handling of BFE not implemented");
5406 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(N->getOperand(2));
5407 if (!Width)
5408 break;
5409
5410 uint32_t WidthVal = Width->getZExtValue() & 0x1f;
5411 if (WidthVal == 0)
5412 return DAG.getConstant(0, DL, MVT::i32);
5413
5414 ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
5415 if (!Offset)
5416 break;
5417
5418 SDValue BitsFrom = N->getOperand(0);
5419 uint32_t OffsetVal = Offset->getZExtValue() & 0x1f;
5420
5421 bool Signed = N->getOpcode() == AMDGPUISD::BFE_I32;
5422
5423 if (OffsetVal == 0) {
5424 // This is already sign / zero extended, so try to fold away extra BFEs.
5425 unsigned SignBits = Signed ? (32 - WidthVal + 1) : (32 - WidthVal);
5426
5427 unsigned OpSignBits = DAG.ComputeNumSignBits(BitsFrom);
5428 if (OpSignBits >= SignBits)
5429 return BitsFrom;
5430
5431 EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), WidthVal);
5432 if (Signed) {
5433 // This is a sign_extend_inreg. Replace it to take advantage of existing
5434 // DAG Combines. If not eliminated, we will match back to BFE during
5435 // selection.
5436
5437 // TODO: The sext_inreg of extended types ends up as separate operations,
5438 // although we could handle them in a single BFE.
5439 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32, BitsFrom,
5440 DAG.getValueType(SmallVT));
5441 }
5442
5443 return DAG.getZeroExtendInReg(BitsFrom, DL, SmallVT);
5444 }
5445
5446 if (ConstantSDNode *CVal = dyn_cast<ConstantSDNode>(BitsFrom)) {
5447 if (Signed) {
5448 return constantFoldBFE<int32_t>(DAG,
5449 CVal->getSExtValue(),
5450 OffsetVal,
5451 WidthVal,
5452 DL);
5453 }
5454
5455 return constantFoldBFE<uint32_t>(DAG,
5456 CVal->getZExtValue(),
5457 OffsetVal,
5458 WidthVal,
5459 DL);
5460 }
5461
5462 if ((OffsetVal + WidthVal) >= 32 &&
5463 !(Subtarget->hasSDWA() && OffsetVal == 16 && WidthVal == 16)) {
5464 SDValue ShiftVal = DAG.getConstant(OffsetVal, DL, MVT::i32);
5465 return DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, MVT::i32,
5466 BitsFrom, ShiftVal);
5467 }
5468
5469 if (BitsFrom.hasOneUse()) {
5470 APInt Demanded = APInt::getBitsSet(32,
5471 OffsetVal,
5472 OffsetVal + WidthVal);
5473
5474 KnownBits Known;
5475 TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
5476 !DCI.isBeforeLegalizeOps());
5477 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5478 if (TLI.ShrinkDemandedConstant(BitsFrom, Demanded, TLO) ||
5479 TLI.SimplifyDemandedBits(BitsFrom, Demanded, Known, TLO)) {
5480 DCI.CommitTargetLoweringOpt(TLO);
5481 }
5482 }
5483
5484 break;
5485 }
5486 case ISD::LOAD:
5487 return performLoadCombine(N, DCI);
5488 case ISD::STORE:
5489 return performStoreCombine(N, DCI);
5490 case AMDGPUISD::RCP:
5491 case AMDGPUISD::RCP_IFLAG:
5492 return performRcpCombine(N, DCI);
5493 case ISD::AssertZext:
5494 case ISD::AssertSext:
5495 return performAssertSZExtCombine(N, DCI);
5496 case ISD::INTRINSIC_WO_CHAIN:
5497 return performIntrinsicWOChainCombine(N, DCI);
5498 case AMDGPUISD::FMAD_FTZ: {
5499 SDValue N0 = N->getOperand(0);
5500 SDValue N1 = N->getOperand(1);
5501 SDValue N2 = N->getOperand(2);
5502 EVT VT = N->getValueType(0);
5503
5504 // FMAD_FTZ is a FMAD + flush denormals to zero.
5505 // We flush the inputs, the intermediate step, and the output.
5506 ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
5507 ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
5508 ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
5509 if (N0CFP && N1CFP && N2CFP) {
5510 const auto FTZ = [](const APFloat &V) {
5511 if (V.isDenormal()) {
5512 APFloat Zero(V.getSemantics(), 0);
5513 return V.isNegative() ? -Zero : Zero;
5514 }
5515 return V;
5516 };
5517
5518 APFloat V0 = FTZ(N0CFP->getValueAPF());
5519 APFloat V1 = FTZ(N1CFP->getValueAPF());
5520 APFloat V2 = FTZ(N2CFP->getValueAPF());
5521 V0.multiply(V1, APFloat::rmNearestTiesToEven);
5522 V0 = FTZ(V0);
5523 V0.add(V2, APFloat::rmNearestTiesToEven);
5524 return DAG.getConstantFP(FTZ(V0), DL, VT);
5525 }
5526 break;
5527 }
5528 }
5529 return SDValue();
5530}
5531
5532//===----------------------------------------------------------------------===//
5533// Helper functions
5534//===----------------------------------------------------------------------===//
5535
5536SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
5537 const TargetRegisterClass *RC,
5538 Register Reg, EVT VT,
5539 const SDLoc &SL,
5540 bool RawReg) const {
5541 MachineFunction &MF = DAG.getMachineFunction();
5542 MachineRegisterInfo &MRI = MF.getRegInfo();
5543 Register VReg;
5544
5545 if (!MRI.isLiveIn(Reg)) {
5546 VReg = MRI.createVirtualRegister(RC);
5547 MRI.addLiveIn(Reg, VReg);
5548 } else {
5549 VReg = MRI.getLiveInVirtReg(Reg);
5550 }
5551
5552 if (RawReg)
5553 return DAG.getRegister(VReg, VT);
5554
5555 return DAG.getCopyFromReg(DAG.getEntryNode(), SL, VReg, VT);
5556}
5557
5558// This may be called multiple times, and nothing prevents creating multiple
5559// objects at the same offset. See if we already defined this object.
5560static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size,
5561 int64_t Offset) {
5562 for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
5563 if (MFI.getObjectOffset(I) == Offset) {
5564 assert(MFI.getObjectSize(I) == Size);
5565 return I;
5566 }
5567 }
5568
5569 return MFI.CreateFixedObject(Size, Offset, true);
5570}
5571
5572SDValue AMDGPUTargetLowering::loadStackInputValue(SelectionDAG &DAG,
5573 EVT VT,
5574 const SDLoc &SL,
5575 int64_t Offset) const {
5576 MachineFunction &MF = DAG.getMachineFunction();
5577 MachineFrameInfo &MFI = MF.getFrameInfo();
5578 int FI = getOrCreateFixedStackObject(MFI, VT.getStoreSize(), Offset);
5579
5580 auto SrcPtrInfo = MachinePointerInfo::getStack(MF, Offset);
5581 SDValue Ptr = DAG.getFrameIndex(FI, MVT::i32);
5582
5583 return DAG.getLoad(VT, SL, DAG.getEntryNode(), Ptr, SrcPtrInfo, Align(4),
5584 MachineMemOperand::MODereferenceable |
5585 MachineMemOperand::MOInvariant);
5586}
5587
5588SDValue AMDGPUTargetLowering::storeStackInputValue(SelectionDAG &DAG,
5589 const SDLoc &SL,
5590 SDValue Chain,
5591 SDValue ArgVal,
5592 int64_t Offset) const {
5593 MachineFunction &MF = DAG.getMachineFunction();
5594 const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
5595 MachinePointerInfo DstInfo = MachinePointerInfo::getStack(MF, Offset);
5596
5597 SDValue Ptr = DAG.getConstant(Offset, SL, MVT::i32);
5598 // Stores to the argument stack area are relative to the stack pointer.
5599 SDValue SP =
5600 DAG.getCopyFromReg(Chain, SL, Info->getStackPtrOffsetReg(), MVT::i32);
5601 Ptr = DAG.getNode(ISD::ADD, SL, MVT::i32, SP, Ptr);
5602 SDValue Store = DAG.getStore(Chain, SL, ArgVal, Ptr, DstInfo, Align(4),
5603 MachineMemOperand::MODereferenceable);
5604 return Store;
5605}
5606
5607SDValue AMDGPUTargetLowering::loadInputValue(SelectionDAG &DAG,
5608 const TargetRegisterClass *RC,
5609 EVT VT, const SDLoc &SL,
5610 const ArgDescriptor &Arg) const {
5611 assert(Arg && "Attempting to load missing argument");
5612
5613 SDValue V = Arg.isRegister() ?
5614 CreateLiveInRegister(DAG, RC, Arg.getRegister(), VT, SL) :
5615 loadStackInputValue(DAG, VT, SL, Arg.getStackOffset());
5616
5617 if (!Arg.isMasked())
5618 return V;
5619
5620 unsigned Mask = Arg.getMask();
5621 unsigned Shift = llvm::countr_zero<unsigned>(Mask);
5622 V = DAG.getNode(ISD::SRL, SL, VT, V,
5623 DAG.getShiftAmountConstant(Shift, VT, SL));
5624 return DAG.getNode(ISD::AND, SL, VT, V,
5625 DAG.getConstant(Mask >> Shift, SL, VT));
5626}
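// For a masked argument this reduces to an unsigned bit-field extract. Small
// worked example (assumed mask, not from the source): with Mask = 0x3FF0,
//   Shift  = countr_zero(0x3FF0) = 4
//   Result = (V >> 4) & 0x3FF   // a 10-bit field starting at bit 4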
5627
5628uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5629 uint64_t ExplicitKernArgSize, const ImplicitParameter Param) const {
5630 unsigned ExplicitArgOffset = Subtarget->getExplicitKernelArgOffset();
5631 const Align Alignment = Subtarget->getAlignmentForImplicitArgPtr();
5632 uint64_t ArgOffset =
5633 alignTo(ExplicitKernArgSize, Alignment) + ExplicitArgOffset;
5634 switch (Param) {
5635 case FIRST_IMPLICIT:
5636 return ArgOffset;
5637 case PRIVATE_BASE:
5638 return ArgOffset + AMDGPU::ImplicitArg::PRIVATE_BASE_OFFSET;
5639 case SHARED_BASE:
5640 return ArgOffset + AMDGPU::ImplicitArg::SHARED_BASE_OFFSET;
5641 case QUEUE_PTR:
5642 return ArgOffset + AMDGPU::ImplicitArg::QUEUE_PTR_OFFSET;
5643 }
5644 llvm_unreachable("unexpected implicit parameter type");
5645}
5646
5647uint32_t AMDGPUTargetLowering::getImplicitParameterOffset(
5648 const MachineFunction &MF, const ImplicitParameter Param) const {
5649 const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
5650 return getImplicitParameterOffset(MFI->getExplicitKernArgSize(), Param);
5651}
5652
5653#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node;
5654
5655const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
5656 switch ((AMDGPUISD::NodeType)Opcode) {
5657 case AMDGPUISD::FIRST_NUMBER: break;
5658 // AMDIL DAG nodes
5659 NODE_NAME_CASE(BRANCH_COND);
5660
5661 // AMDGPU DAG nodes
5662 NODE_NAME_CASE(IF)
5663 NODE_NAME_CASE(ELSE)
5664 NODE_NAME_CASE(LOOP)
5665 NODE_NAME_CASE(CALL)
5666 NODE_NAME_CASE(TC_RETURN)
5667 NODE_NAME_CASE(TC_RETURN_GFX)
5668 NODE_NAME_CASE(TC_RETURN_GFX_WholeWave)
5669 NODE_NAME_CASE(TC_RETURN_CHAIN)
5670 NODE_NAME_CASE(TC_RETURN_CHAIN_DVGPR)
5671 NODE_NAME_CASE(TRAP)
5672 NODE_NAME_CASE(RET_GLUE)
5673 NODE_NAME_CASE(WAVE_ADDRESS)
5674 NODE_NAME_CASE(RETURN_TO_EPILOG)
5675 NODE_NAME_CASE(ENDPGM)
5676 NODE_NAME_CASE(ENDPGM_TRAP)
5677 NODE_NAME_CASE(SIMULATED_TRAP)
5678 NODE_NAME_CASE(DWORDADDR)
5679 NODE_NAME_CASE(FRACT)
5680 NODE_NAME_CASE(SETCC)
5681 NODE_NAME_CASE(DENORM_MODE)
5682 NODE_NAME_CASE(FMA_W_CHAIN)
5683 NODE_NAME_CASE(FMUL_W_CHAIN)
5684 NODE_NAME_CASE(CLAMP)
5685 NODE_NAME_CASE(COS_HW)
5686 NODE_NAME_CASE(SIN_HW)
5687 NODE_NAME_CASE(FMAX_LEGACY)
5688 NODE_NAME_CASE(FMIN_LEGACY)
5689 NODE_NAME_CASE(FMAX3)
5690 NODE_NAME_CASE(SMAX3)
5691 NODE_NAME_CASE(UMAX3)
5692 NODE_NAME_CASE(FMIN3)
5693 NODE_NAME_CASE(SMIN3)
5694 NODE_NAME_CASE(UMIN3)
5695 NODE_NAME_CASE(FMED3)
5696 NODE_NAME_CASE(SMED3)
5697 NODE_NAME_CASE(UMED3)
5698 NODE_NAME_CASE(FMAXIMUM3)
5699 NODE_NAME_CASE(FMINIMUM3)
5700 NODE_NAME_CASE(FDOT2)
5701 NODE_NAME_CASE(URECIP)
5702 NODE_NAME_CASE(DIV_SCALE)
5703 NODE_NAME_CASE(DIV_FMAS)
5704 NODE_NAME_CASE(DIV_FIXUP)
5705 NODE_NAME_CASE(FMAD_FTZ)
5706 NODE_NAME_CASE(RCP)
5707 NODE_NAME_CASE(RSQ)
5708 NODE_NAME_CASE(RCP_LEGACY)
5709 NODE_NAME_CASE(RCP_IFLAG)
5710 NODE_NAME_CASE(LOG)
5711 NODE_NAME_CASE(EXP)
5712 NODE_NAME_CASE(FMUL_LEGACY)
5713 NODE_NAME_CASE(RSQ_CLAMP)
5714 NODE_NAME_CASE(FP_CLASS)
5715 NODE_NAME_CASE(DOT4)
5716 NODE_NAME_CASE(CARRY)
5717 NODE_NAME_CASE(BORROW)
5718 NODE_NAME_CASE(BFE_U32)
5719 NODE_NAME_CASE(BFE_I32)
5720 NODE_NAME_CASE(BFI)
5721 NODE_NAME_CASE(BFM)
5722 NODE_NAME_CASE(FFBH_U32)
5723 NODE_NAME_CASE(FFBH_I32)
5724 NODE_NAME_CASE(FFBL_B32)
5725 NODE_NAME_CASE(MUL_U24)
5726 NODE_NAME_CASE(MUL_I24)
5727 NODE_NAME_CASE(MULHI_U24)
5728 NODE_NAME_CASE(MULHI_I24)
5729 NODE_NAME_CASE(MAD_U24)
5730 NODE_NAME_CASE(MAD_I24)
5731 NODE_NAME_CASE(MAD_I64_I32)
5732 NODE_NAME_CASE(MAD_U64_U32)
5733 NODE_NAME_CASE(PERM)
5734 NODE_NAME_CASE(TEXTURE_FETCH)
5735 NODE_NAME_CASE(R600_EXPORT)
5736 NODE_NAME_CASE(CONST_ADDRESS)
5737 NODE_NAME_CASE(REGISTER_LOAD)
5738 NODE_NAME_CASE(REGISTER_STORE)
5739 NODE_NAME_CASE(CVT_F32_UBYTE0)
5740 NODE_NAME_CASE(CVT_F32_UBYTE1)
5741 NODE_NAME_CASE(CVT_F32_UBYTE2)
5742 NODE_NAME_CASE(CVT_F32_UBYTE3)
5743 NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
5744 NODE_NAME_CASE(CVT_PKNORM_I16_F32)
5745 NODE_NAME_CASE(CVT_PKNORM_U16_F32)
5746 NODE_NAME_CASE(CVT_PK_I16_I32)
5747 NODE_NAME_CASE(CVT_PK_U16_U32)
5748 NODE_NAME_CASE(FP_TO_FP16)
5749 NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
5750 NODE_NAME_CASE(CONST_DATA_PTR)
5751 NODE_NAME_CASE(PC_ADD_REL_OFFSET)
5752 NODE_NAME_CASE(PC_ADD_REL_OFFSET64)
5753 NODE_NAME_CASE(LDS)
5754 NODE_NAME_CASE(DUMMY_CHAIN)
5755 NODE_NAME_CASE(LOAD_D16_HI)
5756 NODE_NAME_CASE(LOAD_D16_LO)
5757 NODE_NAME_CASE(LOAD_D16_HI_I8)
5758 NODE_NAME_CASE(LOAD_D16_HI_U8)
5759 NODE_NAME_CASE(LOAD_D16_LO_I8)
5760 NODE_NAME_CASE(LOAD_D16_LO_U8)
5761 NODE_NAME_CASE(STORE_MSKOR)
5762 NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
5763 NODE_NAME_CASE(TBUFFER_STORE_FORMAT_D16)
5764 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT)
5765 NODE_NAME_CASE(TBUFFER_LOAD_FORMAT_D16)
5766 NODE_NAME_CASE(DS_ORDERED_COUNT)
5767 NODE_NAME_CASE(ATOMIC_CMP_SWAP)
5768 NODE_NAME_CASE(BUFFER_LOAD)
5769 NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
5770 NODE_NAME_CASE(BUFFER_LOAD_USHORT)
5771 NODE_NAME_CASE(BUFFER_LOAD_BYTE)
5772 NODE_NAME_CASE(BUFFER_LOAD_SHORT)
5773 NODE_NAME_CASE(BUFFER_LOAD_TFE)
5774 NODE_NAME_CASE(BUFFER_LOAD_UBYTE_TFE)
5775 NODE_NAME_CASE(BUFFER_LOAD_USHORT_TFE)
5776 NODE_NAME_CASE(BUFFER_LOAD_BYTE_TFE)
5777 NODE_NAME_CASE(BUFFER_LOAD_SHORT_TFE)
5778 NODE_NAME_CASE(BUFFER_LOAD_FORMAT)
5779 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_TFE)
5780 NODE_NAME_CASE(BUFFER_LOAD_FORMAT_D16)
5781 NODE_NAME_CASE(SBUFFER_LOAD)
5782 NODE_NAME_CASE(SBUFFER_LOAD_BYTE)
5783 NODE_NAME_CASE(SBUFFER_LOAD_UBYTE)
5784 NODE_NAME_CASE(SBUFFER_LOAD_SHORT)
5785 NODE_NAME_CASE(SBUFFER_LOAD_USHORT)
5786 NODE_NAME_CASE(SBUFFER_PREFETCH_DATA)
5787 NODE_NAME_CASE(BUFFER_STORE)
5788 NODE_NAME_CASE(BUFFER_STORE_BYTE)
5789 NODE_NAME_CASE(BUFFER_STORE_SHORT)
5790 NODE_NAME_CASE(BUFFER_STORE_FORMAT)
5791 NODE_NAME_CASE(BUFFER_STORE_FORMAT_D16)
5792 NODE_NAME_CASE(BUFFER_ATOMIC_SWAP)
5793 NODE_NAME_CASE(BUFFER_ATOMIC_ADD)
5794 NODE_NAME_CASE(BUFFER_ATOMIC_SUB)
5795 NODE_NAME_CASE(BUFFER_ATOMIC_SMIN)
5796 NODE_NAME_CASE(BUFFER_ATOMIC_UMIN)
5797 NODE_NAME_CASE(BUFFER_ATOMIC_SMAX)
5798 NODE_NAME_CASE(BUFFER_ATOMIC_UMAX)
5799 NODE_NAME_CASE(BUFFER_ATOMIC_AND)
5800 NODE_NAME_CASE(BUFFER_ATOMIC_OR)
5801 NODE_NAME_CASE(BUFFER_ATOMIC_XOR)
5802 NODE_NAME_CASE(BUFFER_ATOMIC_INC)
5803 NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
5804 NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
5805 NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
5806 NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
5807 NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
5808 NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
5809 NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
5810 NODE_NAME_CASE(WHOLE_WAVE_SETUP)
5811 NODE_NAME_CASE(WHOLE_WAVE_RETURN)
5812 }
5813 return nullptr;
5814}
5815
5816SDValue AMDGPUTargetLowering::getSqrtEstimate(SDValue Operand,
5817 SelectionDAG &DAG, int Enabled,
5818 int &RefinementSteps,
5819 bool &UseOneConstNR,
5820 bool Reciprocal) const {
5821 EVT VT = Operand.getValueType();
5822
5823 if (VT == MVT::f32) {
5824 RefinementSteps = 0;
5825 return DAG.getNode(AMDGPUISD::RSQ, SDLoc(Operand), VT, Operand);
5826 }
5827
5828 // TODO: There is also f64 rsq instruction, but the documentation is less
5829 // clear on its precision.
5830
5831 return SDValue();
5832}
5833
5834SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
5835 SelectionDAG &DAG, int Enabled,
5836 int &RefinementSteps) const {
5837 EVT VT = Operand.getValueType();
5838
5839 if (VT == MVT::f32) {
5840 // Reciprocal, < 1 ulp error.
5841 //
5842 // This reciprocal approximation converges to < 0.5 ulp error with one
5843 // Newton-Raphson iteration performed with two fused multiply-adds (FMAs).
5844
5845 RefinementSteps = 0;
5846 return DAG.getNode(AMDGPUISD::RCP, SDLoc(Operand), VT, Operand);
5847 }
5848
5849 // TODO: There is also f64 rcp instruction, but the documentation is less
5850 // clear on its precision.
5851
5852 return SDValue();
5853}
5854
5855static unsigned workitemIntrinsicDim(unsigned ID) {
5856 switch (ID) {
5857 case Intrinsic::amdgcn_workitem_id_x:
5858 return 0;
5859 case Intrinsic::amdgcn_workitem_id_y:
5860 return 1;
5861 case Intrinsic::amdgcn_workitem_id_z:
5862 return 2;
5863 default:
5864 llvm_unreachable("not a workitem intrinsic");
5865 }
5866}
5867
5868void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
5869 const SDValue Op, KnownBits &Known,
5870 const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
5871
5872 Known.resetAll(); // Don't know anything.
5873
5874 unsigned Opc = Op.getOpcode();
5875
5876 switch (Opc) {
5877 default:
5878 break;
5879 case AMDGPUISD::CARRY:
5880 case AMDGPUISD::BORROW: {
5881 Known.Zero = APInt::getHighBitsSet(32, 31);
5882 break;
5883 }
5884
5885 case AMDGPUISD::BFE_I32:
5886 case AMDGPUISD::BFE_U32: {
5887 ConstantSDNode *CWidth = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5888 if (!CWidth)
5889 return;
5890
5891 uint32_t Width = CWidth->getZExtValue() & 0x1f;
5892
5893 if (Opc == AMDGPUISD::BFE_U32)
5894 Known.Zero = APInt::getHighBitsSet(32, 32 - Width);
5895
5896 break;
5897 }
5898 case AMDGPUISD::FP_TO_FP16: {
5899 unsigned BitWidth = Known.getBitWidth();
5900
5901 // High bits are zero.
5902 Known.Zero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
5903 break;
5904 }
5905 case AMDGPUISD::MUL_U24:
5906 case AMDGPUISD::MUL_I24: {
5907 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5908 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5909 unsigned TrailZ = LHSKnown.countMinTrailingZeros() +
5910 RHSKnown.countMinTrailingZeros();
5911 Known.Zero.setLowBits(std::min(TrailZ, 32u));
5912 // Skip extra check if all bits are known zeros.
5913 if (TrailZ >= 32)
5914 break;
5915
5916 // Truncate to 24 bits.
5917 LHSKnown = LHSKnown.trunc(24);
5918 RHSKnown = RHSKnown.trunc(24);
5919
5920 if (Opc == AMDGPUISD::MUL_I24) {
5921 unsigned LHSValBits = LHSKnown.countMaxSignificantBits();
5922 unsigned RHSValBits = RHSKnown.countMaxSignificantBits();
5923 unsigned MaxValBits = LHSValBits + RHSValBits;
5924 if (MaxValBits > 32)
5925 break;
5926 unsigned SignBits = 32 - MaxValBits + 1;
5927 bool LHSNegative = LHSKnown.isNegative();
5928 bool LHSNonNegative = LHSKnown.isNonNegative();
5929 bool LHSPositive = LHSKnown.isStrictlyPositive();
5930 bool RHSNegative = RHSKnown.isNegative();
5931 bool RHSNonNegative = RHSKnown.isNonNegative();
5932 bool RHSPositive = RHSKnown.isStrictlyPositive();
5933
5934 if ((LHSNonNegative && RHSNonNegative) || (LHSNegative && RHSNegative))
5935 Known.Zero.setHighBits(SignBits);
5936 else if ((LHSNegative && RHSPositive) || (LHSPositive && RHSNegative))
5937 Known.One.setHighBits(SignBits);
5938 } else {
5939 unsigned LHSValBits = LHSKnown.countMaxActiveBits();
5940 unsigned RHSValBits = RHSKnown.countMaxActiveBits();
5941 unsigned MaxValBits = LHSValBits + RHSValBits;
5942 if (MaxValBits >= 32)
5943 break;
5944 Known.Zero.setBitsFrom(MaxValBits);
5945 }
5946 break;
5947 }
5948 case AMDGPUISD::PERM: {
5949 ConstantSDNode *CMask = dyn_cast<ConstantSDNode>(Op.getOperand(2));
5950 if (!CMask)
5951 return;
5952
5953 KnownBits LHSKnown = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
5954 KnownBits RHSKnown = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
5955 unsigned Sel = CMask->getZExtValue();
5956
5957 for (unsigned I = 0; I < 32; I += 8) {
5958 unsigned SelBits = Sel & 0xff;
5959 if (SelBits < 4) {
5960 SelBits *= 8;
5961 Known.One |= ((RHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5962 Known.Zero |= ((RHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5963 } else if (SelBits < 7) {
5964 SelBits = (SelBits & 3) * 8;
5965 Known.One |= ((LHSKnown.One.getZExtValue() >> SelBits) & 0xff) << I;
5966 Known.Zero |= ((LHSKnown.Zero.getZExtValue() >> SelBits) & 0xff) << I;
5967 } else if (SelBits == 0x0c) {
5968 Known.Zero |= 0xFFull << I;
5969 } else if (SelBits > 0x0c) {
5970 Known.One |= 0xFFull << I;
5971 }
5972 Sel >>= 8;
5973 }
5974 break;
5975 }
5976 case AMDGPUISD::BUFFER_LOAD_UBYTE: {
5977 Known.Zero.setHighBits(24);
5978 break;
5979 }
5980 case AMDGPUISD::BUFFER_LOAD_USHORT: {
5981 Known.Zero.setHighBits(16);
5982 break;
5983 }
5984 case AMDGPUISD::LDS: {
5985 auto *GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
5986 Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
5987
5988 Known.Zero.setHighBits(16);
5989 Known.Zero.setLowBits(Log2(Alignment));
5990 break;
5991 }
5992 case AMDGPUISD::SMIN3:
5993 case AMDGPUISD::SMAX3:
5994 case AMDGPUISD::SMED3:
5995 case AMDGPUISD::UMIN3:
5996 case AMDGPUISD::UMAX3:
5997 case AMDGPUISD::UMED3: {
5998 KnownBits Known2 = DAG.computeKnownBits(Op.getOperand(2), Depth + 1);
5999 if (Known2.isUnknown())
6000 break;
6001
6002 KnownBits Known1 = DAG.computeKnownBits(Op.getOperand(1), Depth + 1);
6003 if (Known1.isUnknown())
6004 break;
6005
6006 KnownBits Known0 = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
6007 if (Known0.isUnknown())
6008 break;
6009
6010 // TODO: Handle LeadZero/LeadOne from UMIN/UMAX handling.
6011 Known.Zero = Known0.Zero & Known1.Zero & Known2.Zero;
6012 Known.One = Known0.One & Known1.One & Known2.One;
6013 break;
6014 }
6015 case ISD::INTRINSIC_WO_CHAIN: {
6016 unsigned IID = Op.getConstantOperandVal(0);
6017 switch (IID) {
6018 case Intrinsic::amdgcn_workitem_id_x:
6019 case Intrinsic::amdgcn_workitem_id_y:
6020 case Intrinsic::amdgcn_workitem_id_z: {
6021 unsigned MaxValue = Subtarget->getMaxWorkitemID(
6022 DAG.getMachineFunction().getFunction(), workitemIntrinsicDim(IID));
6023 Known.Zero.setHighBits(llvm::countl_zero(MaxValue));
6024 break;
6025 }
6026 default:
6027 break;
6028 }
6029 }
6030 }
6031}
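// Reading of the PERM handling above (illustrative): a selector byte in [0,3]
// copies a byte of operand 1, a byte in [4,6] copies a byte of operand 0,
// 0x0c forces a known-zero byte, and anything above 0x0c forces a known-0xff
// byte. For example (AMDGPUISD::PERM x, y, 0x0c0c0c0c) has
// Known.Zero == 0xffffffff regardless of what is known about x and y.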
6032
6033unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
6034 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6035 unsigned Depth) const {
6036 switch (Op.getOpcode()) {
6037 case AMDGPUISD::BFE_I32: {
6038 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6039 if (!Width)
6040 return 1;
6041
6042 unsigned SignBits = 32 - Width->getZExtValue() + 1;
6043 if (!isNullConstant(Op.getOperand(1)))
6044 return SignBits;
6045
6046 // TODO: Could probably figure something out with non-0 offsets.
6047 unsigned Op0SignBits = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6048 return std::max(SignBits, Op0SignBits);
6049 }
6050
6051 case AMDGPUISD::BFE_U32: {
6052 ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
6053 return Width ? 32 - (Width->getZExtValue() & 0x1f) : 1;
6054 }
6055
6056 case AMDGPUISD::CARRY:
6057 case AMDGPUISD::BORROW:
6058 return 31;
6059 case AMDGPUISD::BUFFER_LOAD_BYTE:
6060 return 25;
6061 case AMDGPUISD::BUFFER_LOAD_SHORT:
6062 return 17;
6063 case AMDGPUISD::BUFFER_LOAD_UBYTE:
6064 return 24;
6065 case AMDGPUISD::BUFFER_LOAD_USHORT:
6066 return 16;
6067 case AMDGPUISD::FP_TO_FP16:
6068 return 16;
6069 case AMDGPUISD::SMIN3:
6070 case AMDGPUISD::SMAX3:
6071 case AMDGPUISD::SMED3:
6072 case AMDGPUISD::UMIN3:
6073 case AMDGPUISD::UMAX3:
6074 case AMDGPUISD::UMED3: {
6075 unsigned Tmp2 = DAG.ComputeNumSignBits(Op.getOperand(2), Depth + 1);
6076 if (Tmp2 == 1)
6077 return 1; // Early out.
6078
6079 unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
6080 if (Tmp1 == 1)
6081 return 1; // Early out.
6082
6083 unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
6084 if (Tmp0 == 1)
6085 return 1; // Early out.
6086
6087 return std::min({Tmp0, Tmp1, Tmp2});
6088 }
6089 default:
6090 return 1;
6091 }
6092}
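// Quick check of the BFE_I32 arithmetic above (illustrative): extracting an
// 8-bit signed field at offset 0 reports 32 - 8 + 1 = 25 sign bits, matching
// what a sext_inreg from i8 would report.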
6093
6094unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
6095 GISelValueTracking &Analysis, Register R, const APInt &DemandedElts,
6096 const MachineRegisterInfo &MRI, unsigned Depth) const {
6097 const MachineInstr *MI = MRI.getVRegDef(R);
6098 if (!MI)
6099 return 1;
6100
6101 // TODO: Check range metadata on MMO.
6102 switch (MI->getOpcode()) {
6103 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
6104 return 25;
6105 case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
6106 return 17;
6107 case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
6108 return 24;
6109 case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
6110 return 16;
6111 case AMDGPU::G_AMDGPU_SMED3:
6112 case AMDGPU::G_AMDGPU_UMED3: {
6113 auto [Dst, Src0, Src1, Src2] = MI->getFirst4Regs();
6114 unsigned Tmp2 = Analysis.computeNumSignBits(Src2, DemandedElts, Depth + 1);
6115 if (Tmp2 == 1)
6116 return 1;
6117 unsigned Tmp1 = Analysis.computeNumSignBits(Src1, DemandedElts, Depth + 1);
6118 if (Tmp1 == 1)
6119 return 1;
6120 unsigned Tmp0 = Analysis.computeNumSignBits(Src0, DemandedElts, Depth + 1);
6121 if (Tmp0 == 1)
6122 return 1;
6123 return std::min({Tmp0, Tmp1, Tmp2});
6124 }
6125 default:
6126 return 1;
6127 }
6128}
6129
6130bool AMDGPUTargetLowering::canCreateUndefOrPoisonForTargetNode(
6131 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
6132 bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const {
6133 unsigned Opcode = Op.getOpcode();
6134 switch (Opcode) {
6135 case AMDGPUISD::BFE_I32:
6136 case AMDGPUISD::BFE_U32:
6137 return false;
6138 }
6139 return TargetLowering::canCreateUndefOrPoisonForTargetNode(
6140 Op, DemandedElts, DAG, PoisonOnly, ConsiderFlags, Depth);
6141}
6142
6143bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(
6144 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN,
6145 unsigned Depth) const {
6146 unsigned Opcode = Op.getOpcode();
6147 switch (Opcode) {
6148 case AMDGPUISD::FMIN_LEGACY:
6149 case AMDGPUISD::FMAX_LEGACY: {
6150 if (SNaN)
6151 return true;
6152
6153 // TODO: Can check no nans on one of the operands for each one, but which
6154 // one?
6155 return false;
6156 }
6157 case AMDGPUISD::FMUL_LEGACY:
6158 case AMDGPUISD::CVT_PKRTZ_F16_F32: {
6159 if (SNaN)
6160 return true;
6161 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6162 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6163 }
6164 case AMDGPUISD::FMED3:
6165 case AMDGPUISD::FMIN3:
6166 case AMDGPUISD::FMAX3:
6167 case AMDGPUISD::FMINIMUM3:
6168 case AMDGPUISD::FMAXIMUM3:
6169 case AMDGPUISD::FMAD_FTZ: {
6170 if (SNaN)
6171 return true;
6172 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1) &&
6173 DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6174 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6175 }
6176 case AMDGPUISD::CVT_F32_UBYTE0:
6177 case AMDGPUISD::CVT_F32_UBYTE1:
6178 case AMDGPUISD::CVT_F32_UBYTE2:
6179 case AMDGPUISD::CVT_F32_UBYTE3:
6180 return true;
6181
6182 case AMDGPUISD::RCP:
6183 case AMDGPUISD::RSQ:
6184 case AMDGPUISD::RCP_LEGACY:
6185 case AMDGPUISD::RSQ_CLAMP: {
6186 if (SNaN)
6187 return true;
6188
6189 // TODO: Need is known positive check.
6190 return false;
6191 }
6192 case ISD::FLDEXP:
6193 case AMDGPUISD::FRACT: {
6194 if (SNaN)
6195 return true;
6196 return DAG.isKnownNeverNaN(Op.getOperand(0), SNaN, Depth + 1);
6197 }
6198 case AMDGPUISD::DIV_SCALE:
6199 case AMDGPUISD::DIV_FMAS:
6200 case AMDGPUISD::DIV_FIXUP:
6201 // TODO: Refine on operands.
6202 return SNaN;
6203 case AMDGPUISD::SIN_HW:
6204 case AMDGPUISD::COS_HW: {
6205 // TODO: Need check for infinity
6206 return SNaN;
6207 }
6208 case ISD::INTRINSIC_WO_CHAIN: {
6209 unsigned IntrinsicID = Op.getConstantOperandVal(0);
6210 // TODO: Handle more intrinsics
6211 switch (IntrinsicID) {
6212 case Intrinsic::amdgcn_cubeid:
6213 case Intrinsic::amdgcn_cvt_off_f32_i4:
6214 return true;
6215
6216 case Intrinsic::amdgcn_frexp_mant: {
6217 if (SNaN)
6218 return true;
6219 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1);
6220 }
6221 case Intrinsic::amdgcn_cvt_pkrtz: {
6222 if (SNaN)
6223 return true;
6224 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6225 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
6226 }
6227 case Intrinsic::amdgcn_rcp:
6228 case Intrinsic::amdgcn_rsq:
6229 case Intrinsic::amdgcn_rcp_legacy:
6230 case Intrinsic::amdgcn_rsq_legacy:
6231 case Intrinsic::amdgcn_rsq_clamp:
6232 case Intrinsic::amdgcn_tanh: {
6233 if (SNaN)
6234 return true;
6235
6236 // TODO: Need is known positive check.
6237 return false;
6238 }
6239 case Intrinsic::amdgcn_trig_preop:
6240 case Intrinsic::amdgcn_fdot2:
6241 // TODO: Refine on operand
6242 return SNaN;
6243 case Intrinsic::amdgcn_fma_legacy:
6244 if (SNaN)
6245 return true;
6246 return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
6247 DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1) &&
6248 DAG.isKnownNeverNaN(Op.getOperand(3), SNaN, Depth + 1);
6249 default:
6250 return false;
6251 }
6252 }
6253 default:
6254 return false;
6255 }
6256}
6257
6258bool AMDGPUTargetLowering::isReassocProfitable(MachineRegisterInfo &MRI,
6259 Register N0, Register N1) const {
6260 return MRI.hasOneNonDBGUse(N0); // FIXME: handle regbanks
6261}
unsigned const MachineRegisterInfo * MRI
return SDValue()
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static LLVM_READONLY bool hasSourceMods(const MachineInstr &MI)
static bool isInv2Pi(const APFloat &APF)
static LLVM_READONLY bool opMustUseVOP3Encoding(const MachineInstr &MI, const MachineRegisterInfo &MRI)
returns true if the operation will definitely need to use a 64-bit encoding, and thus will use a VOP3...
static unsigned inverseMinMax(unsigned Opc)
static SDValue extractF64Exponent(SDValue Hi, const SDLoc &SL, SelectionDAG &DAG)
static unsigned workitemIntrinsicDim(unsigned ID)
static int getOrCreateFixedStackObject(MachineFrameInfo &MFI, unsigned Size, int64_t Offset)
static SDValue constantFoldBFE(SelectionDAG &DAG, IntTy Src0, uint32_t Offset, uint32_t Width, const SDLoc &DL)
static SDValue getMad(SelectionDAG &DAG, const SDLoc &SL, EVT VT, SDValue X, SDValue Y, SDValue C, SDNodeFlags Flags=SDNodeFlags())
static SDValue getAddOneOp(const SDNode *V)
If V is an add of a constant 1, returns the other operand.
#define NODE_NAME_CASE(node)
static LLVM_READONLY bool selectSupportsSourceMods(const SDNode *N)
Return true if v_cndmask_b32 will support fabs/fneg source modifiers for the type for ISD::SELECT.
static cl::opt< bool > AMDGPUBypassSlowDiv("amdgpu-bypass-slow-div", cl::desc("Skip 64-bit divide for dynamic 32-bit values"), cl::init(true))
static SDValue getMul24(SelectionDAG &DAG, const SDLoc &SL, SDValue N0, SDValue N1, unsigned Size, bool Signed)
static bool fnegFoldsIntoOp(const SDNode *N)
static bool isI24(SDValue Op, SelectionDAG &DAG)
static bool isCttzOpc(unsigned Opc)
static bool isU24(SDValue Op, SelectionDAG &DAG)
static SDValue peekFPSignOps(SDValue Val)
static bool valueIsKnownNeverF32Denorm(SDValue Src)
Return true if it's known that Src can never be an f32 denormal value.
static SDValue distributeOpThroughSelect(TargetLowering::DAGCombinerInfo &DCI, unsigned Op, const SDLoc &SL, SDValue Cond, SDValue N1, SDValue N2)
static SDValue peekFNeg(SDValue Val)
static SDValue simplifyMul24(SDNode *Node24, TargetLowering::DAGCombinerInfo &DCI)
static bool isCtlzOpc(unsigned Opc)
static LLVM_READNONE bool fnegFoldsIntoOpcode(unsigned Opc)
static bool hasVolatileUser(SDNode *Val)
Interface definition of the TargetLowering class that is common to all AMD GPUs.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
AMDGPU promote alloca to vector or LDS
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
Function Alias Analysis Results
block Block Frequency Analysis
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
#define LLVM_READNONE
Definition Compiler.h:315
#define LLVM_READONLY
Definition Compiler.h:322
Provides analysis for querying information about KnownBits during GISel passes.
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
#define I(x, y, z)
Definition MD5.cpp:58
#define G(x, y, z)
Definition MD5.cpp:56
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
#define T
const SmallVectorImpl< MachineOperand > & Cond
#define CH(x, y, z)
Definition SHA256.cpp:34
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static TableGen::Emitter::OptClass< SkeletonEmitter > X("gen-skeleton-class", "Generate example skeleton class")
Value * RHS
Value * LHS
BinaryOperator * Mul
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
static std::optional< uint32_t > getLDSAbsoluteAddress(const GlobalValue &GV)
void recordNumNamedBarriers(uint32_t GVAddr, unsigned BarCnt)
unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV)
static const AMDGPUSubtarget & get(const MachineFunction &MF)
static unsigned numBitsSigned(SDValue Op, SelectionDAG &DAG)
SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
Generate Min/Max node.
unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue performMulhuCombine(SDNode *N, DAGCombinerInfo &DCI) const
EVT getTypeForExtReturn(LLVMContext &Context, EVT VT, ISD::NodeType ExtendKind) const override
Return the type that should be used to zero or sign extend a zeroext/signext integer return value.
SDValue SplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Split a vector load into 2 loads of half the vector.
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const
SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const
void analyzeFormalArgumentsCompute(CCState &State, const SmallVectorImpl< ISD::InputArg > &Ins) const
The SelectionDAGBuilder will automatically promote function arguments with illegal types.
SDValue LowerF64ToF16Safe(SDValue Src, const SDLoc &DL, SelectionDAG &DAG) const
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const
SDValue storeStackInputValue(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain, SDValue ArgVal, int64_t Offset) const
bool storeOfVectorConstantIsCheap(bool IsZero, EVT MemVT, unsigned NumElem, unsigned AS) const override
Return true if it is expected to be cheaper to do a store of vector constant with the given size and ...
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth=0) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
bool shouldCombineMemoryType(EVT VT) const
SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, uint32_t ValLo, uint32_t ValHi) const
Split the 64-bit value LHS into two 32-bit components, and perform the binary operation Opc to it wit...
SDValue lowerUnhandledCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals, StringRef Reason) const
SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isTruncateFree(EVT Src, EVT Dest) const override
bool aggressivelyPreferBuildVectorSources(EVT VecVT) const override
SDValue LowerFCEIL(SDValue Op, SelectionDAG &DAG) const
TargetLowering::NegatibleCost getConstantNegateCost(const ConstantFPSDNode *C) const
SDValue LowerFLOGUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, bool IsLog10, SDNodeFlags Flags) const
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
SDValue performMulhsCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSDNodeAlwaysUniform(const SDNode *N) const override
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
SDValue performShlCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
SDValue LowerSDIVREM(SDValue Op, SelectionDAG &DAG) const
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
SDValue LowerFLOG10(SDValue Op, SelectionDAG &DAG) const
SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const
unsigned computeNumSignBitsForTargetInstr(GISelValueTracking &Analysis, Register R, const APInt &DemandedElts, const MachineRegisterInfo &MRI, unsigned Depth=0) const override
This method can be implemented by targets that want to expose additional information about sign bits ...
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const
SDValue addTokenForArgument(SDValue Chain, SelectionDAG &DAG, MachineFrameInfo &MFI, int ClobberedFI) const
bool isConstantCheaperToNegate(SDValue N) const
bool isReassocProfitable(MachineRegisterInfo &MRI, Register N0, Register N1) const override
bool isKnownNeverNaNForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool SNaN=false, unsigned Depth=0) const override
If SNaN is false,.
static bool needsDenormHandlingF32(const SelectionDAG &DAG, SDValue Src, SDNodeFlags Flags)
uint32_t getImplicitParameterOffset(const MachineFunction &MF, const ImplicitParameter Param) const
Helper function that returns the byte offset of the given type of implicit parameter.
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const
SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const
virtual SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op, SelectionDAG &DAG) const
bool isConstantCostlierToNegate(SDValue N) const
SDValue loadInputValue(SelectionDAG &DAG, const TargetRegisterClass *RC, EVT VT, const SDLoc &SL, const ArgDescriptor &Arg) const
SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const
SDValue lowerFEXP10Unsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
Emit approx-funcs appropriate lowering for exp10.
bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtType, EVT ExtVT, std::optional< unsigned > ByteOffset) const override
Return true if it is profitable to reduce a load to a smaller type.
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue performCtlz_CttzCombine(const SDLoc &SL, SDValue Cond, SDValue LHS, SDValue RHS, DAGCombinerInfo &DCI) const
SDValue performSraCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isSelectSupported(SelectSupportKind) const override
bool isZExtFree(Type *Src, Type *Dest) const override
Return true if any actual instruction that defines a value of type FromTy implicitly zero-extends the...
SDValue lowerFEXP2(SDValue Op, SelectionDAG &DAG) const
SDValue LowerCall(CallLoweringInfo &CLI, SmallVectorImpl< SDValue > &InVals) const override
This hook must be implemented to lower calls into the specified DAG.
SDValue performSrlCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const
SDValue getIsLtSmallestNormal(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool mayIgnoreSignedZero(SDValue Op) const
SDValue getIsFinite(SelectionDAG &DAG, SDValue Op, SDNodeFlags Flags) const
bool isLoadBitCastBeneficial(EVT, EVT, const SelectionDAG &DAG, const MachineMemOperand &MMO) const final
Return true if the following transform is beneficial: fold (conv (load x)) -> (load (conv*)x) On arch...
std::pair< SDValue, SDValue > splitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HighVT, SelectionDAG &DAG) const
Split a vector value into two parts of types LoVT and HiVT.
SDValue LowerFLOGCommon(SDValue Op, SelectionDAG &DAG) const
SDValue foldFreeOpFromSelect(TargetLowering::DAGCombinerInfo &DCI, SDValue N) const
SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const
bool isFAbsFree(EVT VT) const override
Return true if an fabs operation is free to the point where it is never worthwhile to replace it with...
SDValue loadStackInputValue(SelectionDAG &DAG, EVT VT, const SDLoc &SL, int64_t Offset) const
Similar to CreateLiveInRegister, except value maybe loaded from a stack slot rather than passed in a ...
SDValue LowerFLOG2(SDValue Op, SelectionDAG &DAG) const
static EVT getEquivalentMemType(LLVMContext &Context, EVT VT)
SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps, bool &UseOneConstNR, bool Reciprocal) const override
Hooks for building estimates in place of slower divisions and square roots.
SDValue performTruncateCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerSINT_TO_FP(SDValue Op, SelectionDAG &DAG) const
static SDValue stripBitcast(SDValue Val)
SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, Register Reg, EVT VT, const SDLoc &SL, bool RawReg=false) const
Helper function that adds Reg to the LiveIn list of the DAG's MachineFunction.
SDValue SplitVectorStore(SDValue Op, SelectionDAG &DAG) const
Split a vector store into 2 stores of half the vector.
SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize, NegatibleCost &Cost, unsigned Depth) const override
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > split64BitValue(SDValue Op, SelectionDAG &DAG) const
Return 64-bit value Op as two 32-bit integers.
SDValue performMulCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, int &RefinementSteps) const override
Return a reciprocal estimate value for the input operand.
AMDGPUTargetLowering(const TargetMachine &TM, const AMDGPUSubtarget &STI)
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const
const char * getTargetNodeName(unsigned Opcode) const override
This method returns the name of a target specific DAG node.
SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const
static CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool IsVarArg)
std::pair< SDValue, SDValue > getScaledLogInput(SelectionDAG &DAG, const SDLoc SL, SDValue Op, SDNodeFlags Flags) const
If denormal handling is required return the scaled input to FLOG2, and the check for denormal range.
static CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg)
Selects the correct CCAssignFn for a given CallingConvention value.
static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold=4)
SDValue LowerFROUNDEVEN(SDValue Op, SelectionDAG &DAG) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize) const override
Returns true if the target can instruction select the specified FP immediate natively.
static unsigned numBitsUnsigned(SDValue Op, SelectionDAG &DAG)
SDValue lowerFEXPUnsafe(SDValue Op, const SDLoc &SL, SelectionDAG &DAG, SDNodeFlags Flags) const
SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const
SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
static bool allowApproxFunc(const SelectionDAG &DAG, SDNodeFlags Flags)
bool ShouldShrinkFPConstant(EVT VT) const override
If true, then instruction selection should seek to shrink the FP constant of the specified type to a ...
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl< ISD::OutputArg > &Outs, const SmallVectorImpl< SDValue > &OutVals, const SDLoc &DL, SelectionDAG &DAG) const override
This hook must be implemented to lower outgoing return values, described by the Outs array,...
SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
This callback is invoked when a node result type is illegal for the target, and the operation was reg...
SDValue performRcpCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue getLoHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const
SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerFP_TO_INT64(SDValue Op, SelectionDAG &DAG, bool Signed) const
static bool shouldFoldFNegIntoSrc(SDNode *FNeg, SDValue FNegSrc)
bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override
Return true if it's profitable to narrow operations of type SrcVT to DestVT.
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const
SDValue performIntrinsicWOChainCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const
SDValue performMulLoHiCombine(SDNode *N, DAGCombinerInfo &DCI) const
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG, SmallVectorImpl< SDValue > &Results) const
SDValue WidenOrSplitVectorLoad(SDValue Op, SelectionDAG &DAG) const
Widen a suitably aligned v3 load.
std::pair< EVT, EVT > getSplitDestVTs(const EVT &VT, SelectionDAG &DAG) const
Split a vector type into two parts.
SDValue getHiHalf64(SDValue Op, SelectionDAG &DAG) const
SDValue combineFMinMaxLegacyImpl(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, SDValue True, SDValue False, SDValue CC, DAGCombinerInfo &DCI) const
unsigned getVectorIdxWidth(const DataLayout &) const override
Returns the type to be used for the index operand vector operations.
static const fltSemantics & IEEEsingle()
Definition APFloat.h:296
static const fltSemantics & IEEEdouble()
Definition APFloat.h:297
static constexpr roundingMode rmNearestTiesToEven
Definition APFloat.h:344
static const fltSemantics & IEEEhalf()
Definition APFloat.h:294
bool bitwiseIsEqual(const APFloat &RHS) const
Definition APFloat.h:1396
opStatus add(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1163
const fltSemantics & getSemantics() const
Definition APFloat.h:1439
opStatus multiply(const APFloat &RHS, roundingMode RM)
Definition APFloat.h:1181
static APFloat getSmallestNormalized(const fltSemantics &Sem, bool Negative=false)
Returns the smallest (by magnitude) normalized finite number in the given semantics.
Definition APFloat.h:1140
static APFloat getInf(const fltSemantics &Sem, bool Negative=false)
Factory for Positive and Negative Infinity.
Definition APFloat.h:1080
Class for arbitrary precision integers.
Definition APInt.h:78
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1541
void setHighBits(unsigned hiBits)
Set the top hiBits bits.
Definition APInt.h:1392
void setBitsFrom(unsigned loBit)
Set the top bits starting from loBit.
Definition APInt.h:1386
static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit)
Get a value with a block of bits set.
Definition APInt.h:259
bool ule(const APInt &RHS) const
Unsigned less or equal comparison.
Definition APInt.h:1151
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
void setLowBits(unsigned loBits)
Set the bottom loBits bits.
Definition APInt.h:1389
This class represents an incoming formal argument to a Function.
Definition Argument.h:32
CCState - This class holds information needed while lowering arguments and return values.
static CCValAssign getCustomMem(unsigned ValNo, MVT ValVT, int64_t Offset, MVT LocVT, LocInfo HTP)
const APFloat & getValueAPF() const
bool isNegative() const
Return true if the value is negative.
uint64_t getZExtValue() const
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:63
Diagnostic information for unsupported feature in backend.
const DataLayout & getDataLayout() const
Get the data layout of the module this function belongs to.
Definition Function.cpp:363
iterator_range< arg_iterator > args()
Definition Function.h:890
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:270
Module * getParent()
Get the module that this global value is contained inside of...
Type * getValueType() const
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Machine Value Type.
static auto integer_fixedlen_vector_valuetypes()
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
bool isInteger() const
Return true if this is an integer or a vector integer type.
static auto integer_valuetypes()
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
MVT getScalarType() const
If this is a vector, return the element type, otherwise return this.
The MachineFrameInfo class represents an abstract stack frame until prolog/epilog code is inserted.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
int getObjectIndexBegin() const
Return the minimum frame object index.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
DenormalMode getDenormalMode(const fltSemantics &FPType) const
Returns the denormal handling type for the default rounding mode of the function.
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
Function & getFunction()
Return the LLVM function that this machine code represents.
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
Representation of each machine instruction.
A description of a memory reference used in the backend.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOInvariant
The memory access always returns the same value (or traps).
Flags getFlags() const
Return the raw flags of the source value.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
This is an abstract virtual class for memory operations.
unsigned getAddressSpace() const
Return the address space for the associated pointer.
Align getAlign() const
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return a MachineMemOperand object describing the memory reference performed by operation.
const SDValue & getChain() const
bool isInvariant() const
EVT getMemoryVT() const
Return the type of the in-memory value.
LLVMContext & getContext() const
Get the global data context.
Definition Module.h:285
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
ArrayRef< SDUse > ops() const
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
SDNodeFlags getFlags() const
SDVTList getVTList() const
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
iterator_range< user_iterator > users()
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
TypeSize getValueSizeInBits() const
Returns the size of the value in bits.
const SDValue & getOperand(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This class keeps track of the SPI_SP_INPUT_ADDR config register, which tells the hardware which inter...
SIModeRegisterDefaults getMode() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI unsigned ComputeMaxSignificantBits(SDValue Op, unsigned Depth=0) const
Get the upper bound on bit size for this Value Op as a signed integer.
const SDValue & getRoot() const
Return the root tag of the SelectionDAG.
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getShiftAmountConstant(uint64_t Val, EVT VT, const SDLoc &DL)
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI void ExtractVectorElements(SDValue Op, SmallVectorImpl< SDValue > &Args, unsigned Start=0, unsigned Count=0, EVT EltVT=EVT())
Append the extracted elements from Start to Count out of the vector Op in Args.
LLVM_ABI SDValue getFreeze(SDValue V)
Return a freeze using the SDLoc of the value operand.
SDValue getSetCC(const SDLoc &DL, EVT VT, SDValue LHS, SDValue RHS, ISD::CondCode Cond, SDValue Chain=SDValue(), bool IsSignaling=false)
Helper function to make it easier to build SetCC's if you just have an ISD::CondCode instead of an SD...
LLVM_ABI SDValue getConstantFP(double Val, const SDLoc &DL, EVT VT, bool isTarget=false)
Create a ConstantFPSDNode wrapping a constant value.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
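A hedged sketch of that equivalence; buildNot is a hypothetical helper name, and DAG, DL, Val, and VT are assumed to be supplied by the caller:

#include "llvm/CodeGen/SelectionDAG.h"
using namespace llvm;

// Produces the same node that DAG.getNOT(DL, Val, VT) would create.
static SDValue buildNot(SelectionDAG &DAG, const SDLoc &DL, SDValue Val, EVT VT) {
  return DAG.getNode(ISD::XOR, DL, VT, Val, DAG.getAllOnesConstant(DL, VT));
}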
const TargetLowering & getTargetLoweringInfo() const
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
SDValue getSelect(const SDLoc &DL, EVT VT, SDValue Cond, SDValue LHS, SDValue RHS, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build Select's if you just have operands and don't want to check...
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
bool isConstantValueOfAnyType(SDValue N) const
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI bool isKnownNeverNaN(SDValue Op, const APInt &DemandedElts, bool SNaN=false, unsigned Depth=0) const
Test whether the given SDValue (or all elements of it, if it is a vector) is known to never be NaN in...
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
SDValue getPOISON(EVT VT)
Return a POISON node. POISON does not have a useful SDLoc.
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
const SDValue & setRoot(SDValue N)
Set the current root tag of the SelectionDAG.
LLVM_ABI SDNode * UpdateNodeOperands(SDNode *N, SDValue Op)
Mutate the specified node in-place to have the specified operands.
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
virtual bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy, EVT NewVT, std::optional< unsigned > ByteOffset=std::nullopt) const
Return true if it is profitable to reduce a load to a smaller type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
const TargetMachine & getTargetMachine() const
virtual unsigned getNumRegistersForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain targets require unusual breakdowns of certain types.
unsigned MaxGluedStoresPerMemcpy
Specify max number of store instructions to glue in inlined memcpy.
virtual MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC, EVT VT) const
Certain combinations of ABIs, Targets and features require that types are legal for some operations a...
void addBypassSlowDiv(unsigned int SlowBitWidth, unsigned int FastBitWidth)
Tells the code generator which bitwidths to bypass.
void setMaxLargeFPConvertBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum fp to/from int conversion the backend supports.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
SelectSupportKind
Enum that describes what type of support for selects the target has.
virtual bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *=nullptr) const
Determine if the target supports unaligned memory accesses.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
EVT getShiftAmountTy(EVT LHSTy, const DataLayout &DL) const
Returns the type for the shift amount of a shift opcode.
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
virtual EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const
Return the ValueType of the result of SETCC operations.
virtual EVT getTypeToTransformTo(LLVMContext &Context, EVT VT) const
For types supported by the target, this is an identity function.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setSupportsUnalignedAtomics(bool UnalignedSupported)
Sets whether unaligned atomic operations are supported.
bool isOperationLegal(unsigned Op, EVT VT) const
Return true if the specified operation is legal on this target.
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
void setMinCmpXchgSizeInBits(unsigned SizeInBits)
Sets the minimum cmpxchg or ll/sc size supported by the backend.
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
unsigned GatherAllAliasesMaxDepth
Depth that GatherAllAliases should continue looking for chain dependencies when trying to find a more...
NegatibleCost
Enum that specifies when a float negation is beneficial.
bool allowsMemoryAccessForAlignment(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
This function returns true if the memory access is aligned or if the target allows this specific unal...
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
void setJumpIsExpensive(bool isExpensive=true)
Tells the code generator not to expand logic operations on comparison predicates into separate sequen...
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
SDValue scalarizeVectorStore(StoreSDNode *ST, SelectionDAG &DAG) const
SDValue SimplifyMultipleUseDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, SelectionDAG &DAG, unsigned Depth=0) const
More limited version of SimplifyDemandedBits that can be used to "lookthrough" ops that don't contrib...
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
bool ShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const
Check to see if the specified operand of the specified instruction is a constant integer.
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps, bool OptForSize, NegatibleCost &Cost, unsigned Depth=0) const
Return the newly negated expression if the cost is not expensive and set the cost in Cost to indicate...
std::pair< SDValue, SDValue > scalarizeVectorLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Turn load of vector type into a load of the individual elements.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
TargetLowering(const TargetLowering &)=delete
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, bool PoisonOnly, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
Primary interface to the complete machine description for the target machine.
TargetOptions Options
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:344
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:45
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:231
LLVM Value Representation.
Definition Value.h:75
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ REGION_ADDRESS
Address space for region memory. (GDS)
@ LOCAL_ADDRESS
Address space for local memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
@ GLOBAL_ADDRESS
Address space for global memory (RAT0, VTX0).
bool isIntrinsicAlwaysUniform(unsigned IntrID)
TargetExtType * isNamedBarrier(const GlobalVariable &GV)
bool isUniformMMO(const MachineMemOperand *MMO)
unsigned ID
LLVM IR allows the use of arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ AMDGPU_CS
Used for Mesa/AMDPAL compute shaders.
@ AMDGPU_VS
Used for Mesa vertex shaders, or AMDPAL last shader stage before rasterization (vertex shader if tess...
@ AMDGPU_KERNEL
Used for AMDGPU code object kernels.
@ AMDGPU_Gfx
Used for AMD graphics targets.
@ AMDGPU_CS_ChainPreserve
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_HS
Used for Mesa/AMDPAL hull shaders (= tessellation control shaders).
@ AMDGPU_GS
Used for Mesa/AMDPAL geometry shaders.
@ AMDGPU_CS_Chain
Used on AMDGPUs to give the middle-end more control over argument placement.
@ AMDGPU_PS
Used for Mesa/AMDPAL pixel shaders.
@ Cold
Attempts to make code in the caller as efficient as possible under the assumption that the call is no...
Definition CallingConv.h:47
@ SPIR_KERNEL
Used for SPIR kernel functions.
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ AMDGPU_ES
Used for AMDPAL shader stage before geometry shader if geometry is in use.
@ AMDGPU_LS
Used for AMDPAL vertex shader if tessellation is in use.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:807
@ CTLZ_ZERO_UNDEF
Definition ISDOpcodes.h:780
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:270
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:593
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:771
@ ADDC
Carry-setting nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:289
@ FMAD
FMAD - Perform a * b + c, while getting the same result as the separately rounded operations.
Definition ISDOpcodes.h:515
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:259
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:841
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:511
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:868
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:577
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:410
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:275
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:249
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:832
@ CTTZ_ZERO_UNDEF
Bit counting operators with an undefined result for zero inputs.
Definition ISDOpcodes.h:779
@ FCANONICALIZE
Returns platform specific canonical encoding of a floating point number.
Definition ISDOpcodes.h:534
@ IS_FPCLASS
Performs a check of floating point class property, defined by IEEE-754.
Definition ISDOpcodes.h:541
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:784
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:242
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:701
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:762
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:642
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:607
@ EntryToken
EntryToken - This is the marker used to indicate the start of a region.
Definition ISDOpcodes.h:48
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:569
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:219
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:838
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:799
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:876
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:724
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:793
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:323
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:914
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:736
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:200
@ ADDE
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:299
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:558
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:947
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:844
@ AssertSext
AssertSext, AssertZext - These nodes record if a register contains a value that has already been zero...
Definition ISDOpcodes.h:62
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:527
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:208
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:549
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
initializer< Ty > init(const Ty &Val)
constexpr double ln2
constexpr double ln10
constexpr float log2ef
Definition MathExtras.h:51
constexpr double log2e
This is an optimization pass for GlobalISel generic memory operations.
@ Offset
Definition DWP.cpp:477
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1725
MaybeAlign getAlign(const CallInst &I, unsigned Index)
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
void ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL, Type *Ty, SmallVectorImpl< EVT > &ValueVTs, SmallVectorImpl< EVT > *MemVTs=nullptr, SmallVectorImpl< TypeSize > *Offsets=nullptr, TypeSize StartingOffset=TypeSize::getZero())
ComputeValueVTs - Given an LLVM IR type, compute a sequence of EVTs that represent all the individual...
Definition Analysis.cpp:119
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
uint64_t PowerOf2Ceil(uint64_t A)
Returns the power of two which is greater than or equal to the given value.
Definition MathExtras.h:385
int countr_zero(T Val)
Count the number of 0's from the least significant bit to the most significant, stopping at the first 1.
Definition bit.h:202
int countl_zero(T Val)
Count the number of 0's from the most significant bit to the least significant, stopping at the first 1.
Definition bit.h:236
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
constexpr uint32_t Hi_32(uint64_t Value)
Return the high 32 bits of a 64 bit value.
Definition MathExtras.h:150
constexpr uint32_t Lo_32(uint64_t Value)
Return the low 32 bits of a 64 bit value.
Definition MathExtras.h:155
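A small, self-contained sketch (illustrative only) of the integer helpers above: Hi_32/Lo_32, PowerOf2Ceil, and the bit-counting functions:

#include "llvm/ADT/bit.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
using namespace llvm;

void mathExtrasSketch() {
  uint64_t Wide = 0x123456789ABCDEF0ULL;
  uint32_t Hi = Hi_32(Wide);      // 0x12345678
  uint32_t Lo = Lo_32(Wide);      // 0x9ABCDEF0

  (void)PowerOf2Ceil(33);         // 64: next power of two >= 33
  (void)countr_zero(0x100u);      // 8: trailing zero bits
  (void)countl_zero(0x100u);      // 23: leading zero bits in a 32-bit value
  (void)Hi; (void)Lo;
}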
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
CombineLevel
Definition DAGCombine.h:15
@ AfterLegalizeDAG
Definition DAGCombine.h:19
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
@ AfterLegalizeTypes
Definition DAGCombine.h:17
To bit_cast(const From &from) noexcept
Definition bit.h:90
@ Mul
Product of integers.
@ Add
Sum of integers.
uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
DWARFExpression::Operation Op
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
APFloat neg(APFloat X)
Returns the negated value of the argument.
Definition APFloat.h:1551
unsigned Log2(Align A)
Returns the log2 of the alignment.
Definition Alignment.h:197
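A short sketch (illustrative only) of the alignment helpers above:

#include "llvm/Support/Alignment.h"
using namespace llvm;

void alignmentSketch() {
  Align A(16);

  // Round a byte size up to the next multiple of the alignment.
  (void)alignTo(13, A);            // 16

  // Alignment guaranteed for an access at A plus a byte offset.
  (void)commonAlignment(A, 4);     // Align(4)
  (void)commonAlignment(A, 32);    // Align(16): the offset keeps the full alignment

  (void)Log2(A);                   // 4
}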
static cl::opt< unsigned > CostThreshold("dfa-cost-threshold", cl::desc("Maximum cost accepted for the transformation"), cl::Hidden, cl::init(50))
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:180
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:869
#define N
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
MCRegister getRegister() const
unsigned getStackOffset() const
DenormalModeKind Input
Denormal treatment kind for floating point instruction inputs in the default floating-point environme...
@ PreserveSign
The sign of a flushed-to-zero number is preserved in the sign of 0.
static constexpr DenormalMode getPreserveSign()
Extended Value Type.
Definition ValueTypes.h:35
TypeSize getStoreSize() const
Return the number of bytes overwritten by a store of the specified value type.
Definition ValueTypes.h:395
EVT getPow2VectorType(LLVMContext &Context) const
Widens the length of the given vector EVT up to the nearest power of 2 and returns that type.
Definition ValueTypes.h:477
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:137
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:74
EVT changeTypeToInteger() const
Return the type converted to an equivalently sized integer or vector with integer element type.
Definition ValueTypes.h:121
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:147
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:463
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:373
bool isByteSized() const
Return true if the bit size is a multiple of 8.
Definition ValueTypes.h:243
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:385
EVT getHalfSizedIntegerVT(LLVMContext &Context) const
Finds the smallest simple value type that is greater than or equal to half the width of this EVT.
Definition ValueTypes.h:430
bool isPow2VectorType() const
Returns true if the given vector type has a power-of-2 number of elements.
Definition ValueTypes.h:470
TypeSize getStoreSizeInBits() const
Return the number of bits overwritten by a store of the specified value type.
Definition ValueTypes.h:412
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:316
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:65
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:381
EVT getRoundIntegerType(LLVMContext &Context) const
Rounds the bit-width of the given integer EVT up to the nearest power of two (and at least to eight),...
Definition ValueTypes.h:419
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:168
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:323
bool bitsGE(EVT VT) const
Return true if this has no less bits than VT.
Definition ValueTypes.h:292
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:328
bool isExtended() const
Test if the given EVT is extended (as opposed to being simple).
Definition ValueTypes.h:142
EVT changeVectorElementType(EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:102
LLVM_ABI const fltSemantics & getFltSemantics() const
Returns an APFloat semantics tag appropriate for the value type.
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:336
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:308
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:152
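A rough sketch (illustrative only, given an LLVMContext) of the EVT factories and queries listed above:

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/CodeGenTypes/MachineValueType.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

void evtSketch(LLVMContext &Ctx) {
  EVT I24 = EVT::getIntegerVT(Ctx, 24);            // no MVT::i24, so this is extended
  EVT V3F32 = EVT::getVectorVT(Ctx, MVT::f32, 3);

  (void)I24.isSimple();                            // false
  (void)I24.getRoundIntegerType(Ctx);              // i32
  (void)V3F32.isPow2VectorType();                  // false
  (void)V3F32.getPow2VectorType(Ctx);              // v4f32
  (void)V3F32.changeTypeToInteger();               // v3i32
}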
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
MVT VT
Legalized type of this argument part.
bool isNonNegative() const
Returns true if this value is known to be non-negative.
Definition KnownBits.h:108
unsigned countMinTrailingZeros() const
Returns the minimum number of trailing zero bits.
Definition KnownBits.h:242
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:66
KnownBits trunc(unsigned BitWidth) const
Return known bits for a truncation of the value we're tracking.
Definition KnownBits.h:161
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:74
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:296
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:248
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:145
APInt getMinValue() const
Return the minimal unsigned value possible given these KnownBits.
Definition KnownBits.h:129
bool isStrictlyPositive() const
Returns true if this value is known to be positive.
Definition KnownBits.h:114
bool isNegative() const
Returns true if this value is known to be negative.
Definition KnownBits.h:105
unsigned countMaxSignificantBits() const
Returns the maximum number of bits needed to represent all possible signed values with these known bi...
Definition KnownBits.h:269
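A minimal sketch (illustrative only) of the KnownBits queries above, for a 32-bit value whose top 24 bits are known to be zero:

#include "llvm/Support/KnownBits.h"
using namespace llvm;

void knownBitsSketch() {
  KnownBits Known(32);
  Known.Zero.setHighBits(24);           // top 24 bits known zero

  (void)Known.countMinLeadingZeros();   // 24
  (void)Known.countMaxActiveBits();     // 8: at most the low 8 bits may be set
  (void)Known.isNonNegative();          // true: the sign bit is known zero
  (void)Known.getMaxValue();            // 0x000000FF
  (void)Known.getMinValue();            // 0

  Known.resetAll();                     // back to "nothing known"
  (void)Known.isUnknown();              // true
}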
Matching combinators.
This class contains a discriminated union of information about pointers in memory operands,...
LLVM_ABI bool isDereferenceable(unsigned Size, LLVMContext &C, const DataLayout &DL) const
Return true if memory region [V, V+Offset+Size) is known to be dereferenceable.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
MachinePointerInfo getWithOffset(int64_t O) const
These are IR-level optimization flags that may be propagated to SDNodes.
void setAllowContract(bool b)
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
DenormalMode FP32Denormals
If this is set, neither input nor output denormals are flushed for most f32 instructions.
This structure contains all information that is necessary for lowering calls.
SmallVector< ISD::InputArg, 32 > Ins
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
LLVM_ABI void CommitTargetLoweringOpt(const TargetLoweringOpt &TLO)
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...