LLVM 23.0.0git
AMDGPURegBankLegalizeRules.cpp
Go to the documentation of this file.
1//===-- AMDGPURegBankLegalizeRules.cpp ------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9/// Definitions of RegBankLegalize Rules for all opcodes.
10/// Implementation of container for all the Rules and search.
11/// Fast search for most common case when Rule.Predicate checks LLT and
12/// uniformity of register in operand 0.
13//
14//===----------------------------------------------------------------------===//
15
17#include "AMDGPUInstrInfo.h"
18#include "GCNSubtarget.h"
21#include "llvm/IR/IntrinsicsAMDGPU.h"
23
24#define DEBUG_TYPE "amdgpu-regbanklegalize"
25
26using namespace llvm;
27using namespace AMDGPU;
28
29bool AMDGPU::isAnyPtr(LLT Ty, unsigned Width) {
30 return Ty.isPointer() && Ty.getSizeInBits() == Width;
31}
32
34 std::initializer_list<RegBankLLTMappingApplyID> DstOpMappingList,
35 std::initializer_list<RegBankLLTMappingApplyID> SrcOpMappingList,
37 : DstOpMapping(DstOpMappingList), SrcOpMapping(SrcOpMappingList),
39
41 std::initializer_list<UniformityLLTOpPredicateID> OpList,
42 std::function<bool(const MachineInstr &)> TestFunc)
44
46 const MachineUniformityInfo &MUI,
47 const MachineRegisterInfo &MRI) {
48 switch (UniID) {
49 case S1:
50 return MRI.getType(Reg) == LLT::scalar(1);
51 case S16:
52 return MRI.getType(Reg) == LLT::scalar(16);
53 case S32:
54 return MRI.getType(Reg) == LLT::scalar(32);
55 case S64:
56 return MRI.getType(Reg) == LLT::scalar(64);
57 case S128:
58 return MRI.getType(Reg) == LLT::scalar(128);
59 case P0:
60 return MRI.getType(Reg) == LLT::pointer(0, 64);
61 case P1:
62 return MRI.getType(Reg) == LLT::pointer(1, 64);
63 case P2:
64 return MRI.getType(Reg) == LLT::pointer(2, 32);
65 case P3:
66 return MRI.getType(Reg) == LLT::pointer(3, 32);
67 case P4:
68 return MRI.getType(Reg) == LLT::pointer(4, 64);
69 case P5:
70 return MRI.getType(Reg) == LLT::pointer(5, 32);
71 case P8:
72 return MRI.getType(Reg) == LLT::pointer(8, 128);
73 case Ptr32:
74 return isAnyPtr(MRI.getType(Reg), 32);
75 case Ptr64:
76 return isAnyPtr(MRI.getType(Reg), 64);
77 case Ptr128:
78 return isAnyPtr(MRI.getType(Reg), 128);
79 case V2S16:
80 return MRI.getType(Reg) == LLT::fixed_vector(2, 16);
81 case V2S32:
82 return MRI.getType(Reg) == LLT::fixed_vector(2, 32);
83 case V3S32:
84 return MRI.getType(Reg) == LLT::fixed_vector(3, 32);
85 case V4S32:
86 return MRI.getType(Reg) == LLT::fixed_vector(4, 32);
87 case B32:
88 return MRI.getType(Reg).getSizeInBits() == 32;
89 case B64:
90 return MRI.getType(Reg).getSizeInBits() == 64;
91 case B96:
92 return MRI.getType(Reg).getSizeInBits() == 96;
93 case B128:
94 return MRI.getType(Reg).getSizeInBits() == 128;
95 case B160:
96 return MRI.getType(Reg).getSizeInBits() == 160;
97 case B256:
98 return MRI.getType(Reg).getSizeInBits() == 256;
99 case B512:
100 return MRI.getType(Reg).getSizeInBits() == 512;
101 case UniS1:
102 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isUniform(Reg);
103 case UniS16:
104 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isUniform(Reg);
105 case UniS32:
106 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isUniform(Reg);
107 case UniS64:
108 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isUniform(Reg);
109 case UniS128:
110 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isUniform(Reg);
111 case UniP0:
112 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isUniform(Reg);
113 case UniP1:
114 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isUniform(Reg);
115 case UniP2:
116 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isUniform(Reg);
117 case UniP3:
118 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isUniform(Reg);
119 case UniP4:
120 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isUniform(Reg);
121 case UniP5:
122 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isUniform(Reg);
123 case UniP8:
124 return MRI.getType(Reg) == LLT::pointer(8, 128) && MUI.isUniform(Reg);
125 case UniPtr32:
126 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isUniform(Reg);
127 case UniPtr64:
128 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isUniform(Reg);
129 case UniPtr128:
130 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isUniform(Reg);
131 case UniV2S16:
132 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isUniform(Reg);
133 case UniV2S32:
134 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isUniform(Reg);
135 case UniB32:
136 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isUniform(Reg);
137 case UniB64:
138 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isUniform(Reg);
139 case UniB96:
140 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isUniform(Reg);
141 case UniB128:
142 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isUniform(Reg);
143 case UniB160:
144 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isUniform(Reg);
145 case UniB256:
146 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isUniform(Reg);
147 case UniB512:
148 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isUniform(Reg);
149 case UniBRC: {
150 if (!MUI.isUniform(Reg))
151 return false;
152 // Check if there is SGPR register class of same size as the LLT.
153 const SIRegisterInfo *TRI =
154 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
155 // There is no 16 bit SGPR register class. Extra size check is required
156 // since getSGPRClassForBitWidth returns SReg_32RegClass for Size 16.
157 unsigned LLTSize = MRI.getType(Reg).getSizeInBits();
158 return LLTSize >= 32 && TRI->getSGPRClassForBitWidth(LLTSize);
159 }
160 case DivS1:
161 return MRI.getType(Reg) == LLT::scalar(1) && MUI.isDivergent(Reg);
162 case DivS16:
163 return MRI.getType(Reg) == LLT::scalar(16) && MUI.isDivergent(Reg);
164 case DivS32:
165 return MRI.getType(Reg) == LLT::scalar(32) && MUI.isDivergent(Reg);
166 case DivS64:
167 return MRI.getType(Reg) == LLT::scalar(64) && MUI.isDivergent(Reg);
168 case DivS128:
169 return MRI.getType(Reg) == LLT::scalar(128) && MUI.isDivergent(Reg);
170 case DivP0:
171 return MRI.getType(Reg) == LLT::pointer(0, 64) && MUI.isDivergent(Reg);
172 case DivP1:
173 return MRI.getType(Reg) == LLT::pointer(1, 64) && MUI.isDivergent(Reg);
174 case DivP2:
175 return MRI.getType(Reg) == LLT::pointer(2, 32) && MUI.isDivergent(Reg);
176 case DivP3:
177 return MRI.getType(Reg) == LLT::pointer(3, 32) && MUI.isDivergent(Reg);
178 case DivP4:
179 return MRI.getType(Reg) == LLT::pointer(4, 64) && MUI.isDivergent(Reg);
180 case DivP5:
181 return MRI.getType(Reg) == LLT::pointer(5, 32) && MUI.isDivergent(Reg);
182 case DivPtr32:
183 return isAnyPtr(MRI.getType(Reg), 32) && MUI.isDivergent(Reg);
184 case DivPtr64:
185 return isAnyPtr(MRI.getType(Reg), 64) && MUI.isDivergent(Reg);
186 case DivPtr128:
187 return isAnyPtr(MRI.getType(Reg), 128) && MUI.isDivergent(Reg);
188 case DivV2S16:
189 return MRI.getType(Reg) == LLT::fixed_vector(2, 16) && MUI.isDivergent(Reg);
190 case DivV2S32:
191 return MRI.getType(Reg) == LLT::fixed_vector(2, 32) && MUI.isDivergent(Reg);
192 case DivV3S32:
193 return MRI.getType(Reg) == LLT::fixed_vector(3, 32) && MUI.isDivergent(Reg);
194 case DivV4S16:
195 return MRI.getType(Reg) == LLT::fixed_vector(4, 16) && MUI.isDivergent(Reg);
196 case DivB32:
197 return MRI.getType(Reg).getSizeInBits() == 32 && MUI.isDivergent(Reg);
198 case DivB64:
199 return MRI.getType(Reg).getSizeInBits() == 64 && MUI.isDivergent(Reg);
200 case DivB96:
201 return MRI.getType(Reg).getSizeInBits() == 96 && MUI.isDivergent(Reg);
202 case DivB128:
203 return MRI.getType(Reg).getSizeInBits() == 128 && MUI.isDivergent(Reg);
204 case DivB160:
205 return MRI.getType(Reg).getSizeInBits() == 160 && MUI.isDivergent(Reg);
206 case DivB256:
207 return MRI.getType(Reg).getSizeInBits() == 256 && MUI.isDivergent(Reg);
208 case DivB512:
209 return MRI.getType(Reg).getSizeInBits() == 512 && MUI.isDivergent(Reg);
210 case DivBRC: {
211 if (!MUI.isDivergent(Reg))
212 return false;
213 // Check if there is VGPR register class of same size as the LLT.
214 const SIRegisterInfo *TRI =
215 static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
216 return TRI->getSGPRClassForBitWidth(MRI.getType(Reg).getSizeInBits());
217 }
218 case _:
219 return true;
220 default:
221 llvm_unreachable("missing matchUniformityAndLLT");
222 }
223}
224
226 const MachineUniformityInfo &MUI,
227 const MachineRegisterInfo &MRI) const {
228 // Check LLT signature.
229 for (unsigned i = 0; i < OpUniformityAndTypes.size(); ++i) {
230 const MachineOperand &MO = MI.getOperand(i);
231 if (OpUniformityAndTypes[i] == _) {
232 assert((!MI.getOperand(i).isReg() ||
233 !MI.getOperand(i).getReg().isVirtual()) &&
234 "_ is for non-register and physical register operands only");
235 continue;
236 }
237
238 // Remaining IDs check registers.
239 if (!MO.isReg())
240 return false;
241
242 if (!matchUniformityAndLLT(MO.getReg(), OpUniformityAndTypes[i], MUI, MRI))
243 return false;
244 }
245
246 // More complex check.
247 if (TestFunc)
248 return TestFunc(MI);
249
250 return true;
251}
252
254
256 : FastTypes(FastTypes) {}
257
259 if (Ty == LLT::scalar(16))
260 return S16;
261 if (Ty == LLT::scalar(32))
262 return S32;
263 if (Ty == LLT::scalar(64))
264 return S64;
265 if (Ty == LLT::fixed_vector(2, 16))
266 return V2S16;
267 if (Ty == LLT::fixed_vector(2, 32))
268 return V2S32;
269 if (Ty == LLT::fixed_vector(3, 32))
270 return V3S32;
271 if (Ty == LLT::fixed_vector(4, 32))
272 return V4S32;
273 return _;
274}
275
277 if (Ty == LLT::scalar(32) || Ty == LLT::fixed_vector(2, 16) ||
278 isAnyPtr(Ty, 32))
279 return B32;
280 if (Ty == LLT::scalar(64) || Ty == LLT::fixed_vector(2, 32) ||
281 Ty == LLT::fixed_vector(4, 16) || isAnyPtr(Ty, 64))
282 return B64;
283 if (Ty == LLT::fixed_vector(3, 32))
284 return B96;
285 if (Ty == LLT::fixed_vector(4, 32) || Ty == LLT::fixed_vector(2, 64) ||
286 Ty == LLT::fixed_vector(8, 16) || isAnyPtr(Ty, 128))
287 return B128;
288 return _;
289}
290
291const RegBankLLTMapping *
293 const MachineRegisterInfo &MRI,
294 const MachineUniformityInfo &MUI) const {
295 // Search in "Fast Rules".
296 // Note: if fast rules are enabled, RegBankLLTMapping must be added in each
297 // slot that could "match fast Predicate". If not, InvalidMapping is
298 // returned which results in failure, does not search "Slow Rules".
299 if (FastTypes != NoFastRules) {
300 Register Reg = MI.getOperand(0).getReg();
301 int Slot;
302 if (FastTypes == StandardB)
303 Slot = getFastPredicateSlot(LLTToBId(MRI.getType(Reg)));
304 else
305 Slot = getFastPredicateSlot(LLTToId(MRI.getType(Reg)));
306
307 if (Slot != -1)
308 return MUI.isUniform(Reg) ? &Uni[Slot] : &Div[Slot];
309 }
310
311 // Slow search for more complex rules.
312 for (const RegBankLegalizeRule &Rule : Rules) {
313 if (Rule.Predicate.match(MI, MUI, MRI))
314 return &Rule.OperandMapping;
315 }
316
317 return nullptr;
318}
319
321 Rules.push_back(Rule);
322}
323
325 RegBankLLTMapping RuleApplyIDs) {
326 int Slot = getFastPredicateSlot(Ty);
327 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
328 Div[Slot] = std::move(RuleApplyIDs);
329}
330
332 RegBankLLTMapping RuleApplyIDs) {
333 int Slot = getFastPredicateSlot(Ty);
334 assert(Slot != -1 && "Ty unsupported in this FastRulesTypes");
335 Uni[Slot] = std::move(RuleApplyIDs);
336}
337
338int SetOfRulesForOpcode::getFastPredicateSlot(
340 switch (FastTypes) {
341 case Standard: {
342 switch (Ty) {
343 case S32:
344 return 0;
345 case S16:
346 return 1;
347 case S64:
348 return 2;
349 case V2S16:
350 return 3;
351 default:
352 return -1;
353 }
354 }
355 case StandardB: {
356 switch (Ty) {
357 case B32:
358 return 0;
359 case B64:
360 return 1;
361 case B96:
362 return 2;
363 case B128:
364 return 3;
365 default:
366 return -1;
367 }
368 }
369 case Vector: {
370 switch (Ty) {
371 case S32:
372 return 0;
373 case V2S32:
374 return 1;
375 case V3S32:
376 return 2;
377 case V4S32:
378 return 3;
379 default:
380 return -1;
381 }
382 }
383 default:
384 return -1;
385 }
386}
387
388RegBankLegalizeRules::RuleSetInitializer
389RegBankLegalizeRules::addRulesForGOpcs(std::initializer_list<unsigned> OpcList,
390 FastRulesTypes FastTypes) {
391 return RuleSetInitializer(OpcList, GRulesAlias, GRules, FastTypes);
392}
393
394RegBankLegalizeRules::RuleSetInitializer
395RegBankLegalizeRules::addRulesForIOpcs(std::initializer_list<unsigned> OpcList,
396 FastRulesTypes FastTypes) {
397 return RuleSetInitializer(OpcList, IRulesAlias, IRules, FastTypes);
398}
399
402 unsigned Opc = MI.getOpcode();
403 if (Opc == AMDGPU::G_INTRINSIC || Opc == AMDGPU::G_INTRINSIC_CONVERGENT ||
404 Opc == AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS ||
405 Opc == AMDGPU::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS) {
406 unsigned IntrID = cast<GIntrinsic>(MI).getIntrinsicID();
407 auto IRAIt = IRulesAlias.find(IntrID);
408 if (IRAIt == IRulesAlias.end())
409 return nullptr;
410 return &IRules.at(IRAIt->second);
411 }
412
413 auto GRAIt = GRulesAlias.find(Opc);
414 if (GRAIt == GRulesAlias.end())
415 return nullptr;
416 return &GRules.at(GRAIt->second);
417}
418
419// Syntactic sugar wrapper for predicate lambda that enables '&&', '||' and '!'.
420class Predicate {
421private:
422 struct Elt {
423 // Save formula composed of Pred, '&&', '||' and '!' as a jump table.
424 // Sink ! to Pred. For example !((A && !B) || C) -> (!A || B) && !C
425 // Sequences of && and || will be represented by jumps, for example:
426 // (A && B && ... X) or (A && B && ... X) || Y
427 // A == true jump to B
428 // A == false jump to end or Y, result is A(false) or Y
429 // (A || B || ... X) or (A || B || ... X) && Y
430 // A == true jump to end or Y, result is A(true) or Y
431 // A == false jump to B
432 // Notice that when negating expression, we simply flip Neg on each Pred
433 // and swap TJumpOffset and FJumpOffset (&& becomes ||, || becomes &&).
434 std::function<bool(const MachineInstr &)> Pred;
435 bool Neg; // Neg of Pred is calculated before jump
436 unsigned TJumpOffset;
437 unsigned FJumpOffset;
438 };
439
440 SmallVector<Elt, 8> Expression;
441
442 Predicate(SmallVectorImpl<Elt> &&Expr) { Expression.swap(Expr); };
443
444public:
445 Predicate(std::function<bool(const MachineInstr &)> Pred) {
446 Expression.push_back({Pred, false, 1, 1});
447 };
448
449 bool operator()(const MachineInstr &MI) const {
450 unsigned Idx = 0;
451 unsigned ResultIdx = Expression.size();
452 bool Result;
453 do {
454 Result = Expression[Idx].Pred(MI);
455 Result = Expression[Idx].Neg ? !Result : Result;
456 if (Result) {
457 Idx += Expression[Idx].TJumpOffset;
458 } else {
459 Idx += Expression[Idx].FJumpOffset;
460 }
461 } while ((Idx != ResultIdx));
462
463 return Result;
464 };
465
466 Predicate operator!() const {
467 SmallVector<Elt, 8> NegExpression;
468 for (const Elt &ExprElt : Expression) {
469 NegExpression.push_back({ExprElt.Pred, !ExprElt.Neg, ExprElt.FJumpOffset,
470 ExprElt.TJumpOffset});
471 }
472 return Predicate(std::move(NegExpression));
473 };
474
475 Predicate operator&&(const Predicate &RHS) const {
476 SmallVector<Elt, 8> AndExpression = Expression;
477
478 unsigned RHSSize = RHS.Expression.size();
479 unsigned ResultIdx = Expression.size();
480 for (unsigned i = 0; i < ResultIdx; ++i) {
481 // LHS results in false, whole expression results in false.
482 if (i + AndExpression[i].FJumpOffset == ResultIdx)
483 AndExpression[i].FJumpOffset += RHSSize;
484 }
485
486 AndExpression.append(RHS.Expression);
487
488 return Predicate(std::move(AndExpression));
489 }
490
491 Predicate operator||(const Predicate &RHS) const {
492 SmallVector<Elt, 8> OrExpression = Expression;
493
494 unsigned RHSSize = RHS.Expression.size();
495 unsigned ResultIdx = Expression.size();
496 for (unsigned i = 0; i < ResultIdx; ++i) {
497 // LHS results in true, whole expression results in true.
498 if (i + OrExpression[i].TJumpOffset == ResultIdx)
499 OrExpression[i].TJumpOffset += RHSSize;
500 }
501
502 OrExpression.append(RHS.Expression);
503
504 return Predicate(std::move(OrExpression));
505 }
506};
507
508// Initialize rules
511 : ST(&_ST), MRI(&_MRI) {
512
513 addRulesForGOpcs({G_ADD, G_SUB}, Standard)
514 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
515 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
516 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
517 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
519 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
520 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}})
521 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}});
522
523 addRulesForGOpcs({G_UADDO, G_USUBO}, Standard)
524 .Uni(S32, {{Sgpr32, Sgpr32Trunc}, {Sgpr32, Sgpr32}})
525 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32}});
526
527 addRulesForGOpcs({G_UADDE, G_USUBE, G_SADDE, G_SSUBE}, Standard)
529 .Div(S32, {{Vgpr32, Vcc}, {Vgpr32, Vgpr32, Vcc}});
530
531 addRulesForGOpcs({G_UADDSAT, G_SADDSAT, G_USUBSAT, G_SSUBSAT}, Standard)
532 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
533 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
534 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
535 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
537 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
538
539 bool HasVecMulU64 = ST->hasVectorMulU64();
540 addRulesForGOpcs({G_MUL}, Standard)
541 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
542 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
543 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
544 .Uni(S64, {{SgprB64}, {SgprB64, SgprB64}})
546 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
547 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32AExt}})
548 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}}, HasVecMulU64)
549 .Div(S64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32Mul}, !HasVecMulU64);
550
551 bool hasMulHi = ST->hasScalarMulHiInsts();
552 addRulesForGOpcs({G_UMULH, G_SMULH}, Standard)
553 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
554 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasMulHi)
555 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasMulHi);
556
557 addRulesForGOpcs({G_AMDGPU_MAD_U64_U32}, Standard)
558 .Div(S64, {{Vgpr64, Vcc}, {Vgpr32, Vgpr32, Vgpr64}})
560
561 bool HasScalarSMulU64 = ST->hasScalarSMulU64();
562 addRulesForGOpcs({G_AMDGPU_S_MUL_U64_U32, G_AMDGPU_S_MUL_I64_I32}, Standard)
563 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr64}, UniMul64}, HasScalarSMulU64)
564 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}, DivSMulToMAD});
565
566 addRulesForGOpcs({G_XOR, G_OR, G_AND}, StandardB)
568 .Any({{DivS1}, {{Vcc}, {Vcc, Vcc}}})
569 .Any({{UniS16}, {{Sgpr16}, {Sgpr16, Sgpr16}}})
570 .Any({{DivS16}, {{Vgpr16}, {Vgpr16, Vgpr16}}})
571 .Uni(B32, {{SgprB32}, {SgprB32, SgprB32}})
572 .Div(B32, {{VgprB32}, {VgprB32, VgprB32}})
573 .Uni(B64, {{SgprB64}, {SgprB64, SgprB64}})
574 .Div(B64, {{VgprB64}, {VgprB64, VgprB64}, SplitTo32});
575
576 addRulesForGOpcs({G_SHL}, Standard)
577 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32AExt, Sgpr32ZExt}})
578 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
580 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
581 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
582 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
583 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
584 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
585
586 addRulesForGOpcs({G_LSHR}, Standard)
587 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
588 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
590 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
591 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
592 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
593 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
594 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
595
596 addRulesForGOpcs({G_ASHR}, Standard)
597 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32ZExt}})
598 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
600 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
601 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
602 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32}})
603 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
604 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
605
606 addRulesForGOpcs({G_FSHR}, Standard)
607 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
608 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
609
610 addRulesForGOpcs({G_BSWAP}, Standard)
611 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
612 .Div(S16, {{Vgpr16}, {Vgpr16}})
613 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
614 .Div(S32, {{Vgpr32}, {Vgpr32}})
615 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
616 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}});
617
618 addRulesForGOpcs({G_AMDGPU_CVT_F32_UBYTE0, G_AMDGPU_CVT_F32_UBYTE1,
619 G_AMDGPU_CVT_F32_UBYTE2, G_AMDGPU_CVT_F32_UBYTE3,
620 G_AMDGPU_RCP_IFLAG},
621 Standard)
622 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
623 .Div(S32, {{Vgpr32}, {Vgpr32}});
624
625 addRulesForGOpcs({G_FRAME_INDEX}).Any({{UniP5, _}, {{SgprP5}, {None}}});
626
627 addRulesForGOpcs({G_UBFX, G_SBFX}, Standard)
628 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}, S_BFE})
629 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
630 .Uni(S64, {{Sgpr64}, {Sgpr64, Sgpr32, Sgpr32}, S_BFE})
631 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32, Vgpr32}, V_BFE});
632
633 addRulesForGOpcs({G_SMIN, G_SMAX}, Standard)
634 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt, Sgpr32SExt}})
635 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
636 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
637 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
639 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
640
641 addRulesForGOpcs({G_UMIN, G_UMAX}, Standard)
642 .Uni(S16, {{Sgpr32Trunc}, {Sgpr32ZExt, Sgpr32ZExt}})
643 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
644 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}})
645 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
647 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
648
649 // Note: we only write S1 rules for G_IMPLICIT_DEF, G_CONSTANT and G_FCONSTANT
650 // here, rest is trivially regbankselected earlier
651 addRulesForGOpcs({G_IMPLICIT_DEF}).Any({{UniS1}, {{Sgpr32Trunc}, {}}});
652 addRulesForGOpcs({G_CONSTANT})
653 .Any({{UniS1, _}, {{Sgpr32Trunc}, {None}, UniCstExt}});
654
655 addRulesForGOpcs({G_FREEZE})
656 .Any({{UniS1}, {{Sgpr32Trunc}, {Sgpr32AExt}}})
657 .Any({{DivS1}, {{Vcc}, {Vcc}}})
658 .Any({{UniS16}, {{Sgpr16}, {Sgpr16}}})
659 .Any({{UniBRC}, {{SgprBRC}, {SgprBRC}}})
660 .Any({{DivBRC}, {{VgprBRC}, {VgprBRC}}});
661
662 addRulesForGOpcs({G_UNMERGE_VALUES})
663 .Any({{UniS16}, {{}, {}, UnmergeToShiftTrunc}})
664 .Any({{UniBRC}, {{}, {}, VerifyAllSgpr}})
665 .Any({{DivBRC}, {{}, {}, ApplyAllVgpr}});
666
667 addRulesForGOpcs({G_PHI})
668 .Any({{UniS1}, {{}, {}, AextToS32InIncomingBlockGPHI}})
669 .Any({{UniS16}, {{}, {}, VerifyAllSgprGPHI}})
670 .Any({{UniBRC}, {{}, {}, VerifyAllSgprGPHI}})
671 .Any({{DivBRC}, {{}, {}, VerifyAllSgprOrVgprGPHI}});
672
673 // LOAD {Div}, {{VgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
674 // LOAD {Uni}, {{UniInVgprDst...}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
675 // LOAD_NORET {}, {{}, {Imm, VgprSrc, ..., Sgpr_WF_RsrcIdx}}
676 // STORE {}, {{}, {VgprSrc, ..., Sgpr_WF_RsrcIdx}}
677 addRulesForGOpcs({G_AMDGPU_INTRIN_IMAGE_LOAD, G_AMDGPU_INTRIN_IMAGE_LOAD_D16,
678 G_AMDGPU_INTRIN_IMAGE_LOAD_NORET,
679 G_AMDGPU_INTRIN_IMAGE_STORE,
680 G_AMDGPU_INTRIN_IMAGE_STORE_D16})
681 .Any({{}, {{}, {}, ApplyINTRIN_IMAGE}});
682
683 Predicate isSignedICmp([](const MachineInstr &MI) -> bool {
684 auto Pred =
685 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
686 return CmpInst::isSigned(Pred);
687 });
688
689 Predicate isEqualityICmp([](const MachineInstr &MI) -> bool {
690 auto Pred =
691 static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
692 return ICmpInst::isEquality(Pred);
693 });
694
695 bool HasScalarCompareEq64 = ST->hasScalarCompareEq64();
696 // clang-format off
697 addRulesForGOpcs({G_ICMP})
698 .Any({{{UniS1, _, S16}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
699 .Any({{{UniS1, _, S16}, !isEqualityICmp && isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32SExt, Sgpr32SExt}}})
700 .Any({{{UniS1, _, S16}, !isEqualityICmp && !isSignedICmp}, {{Sgpr32Trunc}, {None, Sgpr32ZExt, Sgpr32ZExt}}})
701 .Any({{{DivS1, _, S16}}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
702 .Any({{{UniS1, _, S32}}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}})
703 .Any({{{DivS1, _, S32}}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
704 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, Sgpr64, Sgpr64}}}, HasScalarCompareEq64)
705 .Any({{{UniS1, _, S64}, isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}}, !HasScalarCompareEq64)
706 .Any({{{UniS1, _, S64}, !isEqualityICmp}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
707 .Any({{{DivS1, _, S64}}, {{Vcc}, {None, Vgpr64, Vgpr64}}})
708 .Any({{{UniS1, _, Ptr32}}, {{Sgpr32Trunc}, {None, SgprPtr32, SgprPtr32}}})
709 .Any({{{DivS1, _, Ptr32}}, {{Vcc}, {None, VgprPtr32, VgprPtr32}}})
710 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{Sgpr32Trunc}, {None, SgprPtr64, SgprPtr64}}}, HasScalarCompareEq64)
711 .Any({{{UniS1, _, Ptr64}, isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}}, !HasScalarCompareEq64)
712 .Any({{{UniS1, _, Ptr64}, !isEqualityICmp}, {{UniInVcc}, {None, VgprPtr64, VgprPtr64}}})
713 .Any({{{DivS1, _, Ptr64}}, {{Vcc}, {None, VgprPtr64, VgprPtr64}}});
714 // clang-format on
715
716 addRulesForGOpcs({G_BRCOND})
717 .Any({{UniS1}, {{}, {Sgpr32AExtBoolInReg}}})
718 .Any({{DivS1}, {{}, {Vcc}}});
719
720 addRulesForGOpcs({G_BR}).Any({{_}, {{}, {None}}});
721
722 addRulesForGOpcs({G_SELECT}, StandardB)
723 .Any({{DivS16}, {{Vgpr16}, {Vcc, Vgpr16, Vgpr16}}})
725 .Div(B32, {{VgprB32}, {Vcc, VgprB32, VgprB32}})
729
730 addRulesForGOpcs({G_ANYEXT})
731 .Any({{UniS16, S1}, {{None}, {None}}}) // should be combined away
732 .Any({{UniS32, S1}, {{None}, {None}}}) // should be combined away
733 .Any({{UniS64, S1}, {{None}, {None}}}) // should be combined away
734 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
735 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
736 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
737 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
738 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
739 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
740 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
741
742 bool Has16bitCmp = ST->has16BitInsts();
743
744 // In global-isel G_TRUNC in-reg is treated as no-op, inst selected into COPY.
745 // It is up to user to deal with truncated bits.
746 addRulesForGOpcs({G_TRUNC})
747 .Any({{UniS1, UniS16}, {{None}, {None}}}) // should be combined away
748 .Any({{UniS1, UniS32}, {{None}, {None}}}) // should be combined away
749 .Any({{UniS1, UniS64}, {{None}, {None}}}) // should be combined away
750 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}})
751 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
752 .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
753 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
754 .Any({{UniV2S16, V2S32}, {{SgprV2S16}, {SgprV2S32}}})
755 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
756 // This is non-trivial. VgprToVccCopy is done using compare instruction.
757 .Any({{DivS1, DivS16}, {{Vcc}, {Vgpr16}, VgprToVccCopy}}, Has16bitCmp)
759 !Has16bitCmp)
760 .Any({{DivS1, DivS32}, {{Vcc}, {Vgpr32}, VgprToVccCopy}})
761 .Any({{DivS1, DivS64}, {{Vcc}, {Vgpr64}, VgprToVccCopy}});
762
763 addRulesForGOpcs({G_ZEXT})
767 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
768 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
769 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
770 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
771 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
772 // not extending S16 to S32 is questionable.
773 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32ZExt}, Ext32To64}})
774 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32ZExt}, Ext32To64}})
775 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
776 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
777
778 addRulesForGOpcs({G_SEXT})
782 .Any({{DivS16, S1}, {{Vgpr16}, {Vcc}, VccExtToSel}})
783 .Any({{DivS32, S1}, {{Vgpr32}, {Vcc}, VccExtToSel}})
784 .Any({{DivS64, S1}, {{Vgpr64}, {Vcc}, VccExtToSel}})
785 .Any({{UniS64, S32}, {{Sgpr64}, {Sgpr32}, Ext32To64}})
786 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}, Ext32To64}})
787 // not extending S16 to S32 is questionable.
788 .Any({{UniS64, S16}, {{Sgpr64}, {Sgpr32SExt}, Ext32To64}})
789 .Any({{DivS64, S16}, {{Vgpr64}, {Vgpr32SExt}, Ext32To64}})
790 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}})
791 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}});
792
793 addRulesForGOpcs({G_SEXT_INREG})
794 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
795 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
796 .Any({{UniS64, S64}, {{Sgpr64}, {Sgpr64}}})
798
799 addRulesForGOpcs({G_ASSERT_ZEXT, G_ASSERT_SEXT}, Standard)
800 .Uni(S32, {{Sgpr32}, {Sgpr32, Imm}})
801 .Div(S32, {{Vgpr32}, {Vgpr32, Imm}})
802 .Uni(S64, {{Sgpr64}, {Sgpr64, Imm}})
803 .Div(S64, {{Vgpr64}, {Vgpr64, Imm}});
804
805 addRulesForGOpcs({G_ASSERT_ALIGN}, Standard)
806 .Uni(S32, {{Sgpr32}, {Sgpr32}})
807 .Div(S32, {{Vgpr32}, {Vgpr32}})
808 .Uni(S64, {{Sgpr64}, {Sgpr64}})
809 .Div(S64, {{Vgpr64}, {Vgpr64}})
810 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32}}})
811 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32}}})
812 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64}}})
813 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64}}});
814
815 // Atomic read-modify-write operations: result and value are always VGPR,
816 // pointer varies by address space.
817 addRulesForGOpcs({G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_XCHG,
818 G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR,
819 G_ATOMICRMW_MIN, G_ATOMICRMW_MAX, G_ATOMICRMW_UMIN,
820 G_ATOMICRMW_UMAX, G_ATOMICRMW_UINC_WRAP,
821 G_ATOMICRMW_UDEC_WRAP, G_ATOMICRMW_FMIN, G_ATOMICRMW_FMAX})
822 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
823 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
824 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
825 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
826 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
827 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}});
828
829 bool HasAtomicFlatPkAdd16Insts = ST->hasAtomicFlatPkAdd16Insts();
830 bool HasAtomicBufferGlobalPkAddF16Insts =
831 ST->hasAtomicBufferGlobalPkAddF16NoRtnInsts() ||
832 ST->hasAtomicBufferGlobalPkAddF16Insts();
833 bool HasAtomicDsPkAdd16Insts = ST->hasAtomicDsPkAdd16Insts();
834 addRulesForGOpcs({G_ATOMICRMW_FADD})
835 .Any({{DivS32, P0, S32}, {{Vgpr32}, {VgprP0, Vgpr32}}})
836 .Any({{DivS64, P0, S64}, {{Vgpr64}, {VgprP0, Vgpr64}}})
837 .Any({{DivS32, P1, S32}, {{Vgpr32}, {VgprP1, Vgpr32}}})
838 .Any({{DivS64, P1, S64}, {{Vgpr64}, {VgprP1, Vgpr64}}})
839 .Any({{DivS32, P3, S32}, {{Vgpr32}, {VgprP3, Vgpr32}}})
840 .Any({{DivS64, P3, S64}, {{Vgpr64}, {VgprP3, Vgpr64}}})
841 .Any({{DivV2S16, P0, V2S16}, {{VgprV2S16}, {VgprP0, VgprV2S16}}},
842 HasAtomicFlatPkAdd16Insts)
843 .Any({{DivV2S16, P1, V2S16}, {{VgprV2S16}, {VgprP1, VgprV2S16}}},
844 HasAtomicBufferGlobalPkAddF16Insts)
845 .Any({{DivV2S16, P3, V2S16}, {{VgprV2S16}, {VgprP3, VgprV2S16}}},
846 HasAtomicDsPkAdd16Insts);
847
848 addRulesForGOpcs({G_ATOMIC_CMPXCHG})
849 .Any({{DivS32, P2}, {{Vgpr32}, {VgprP2, Vgpr32, Vgpr32}}})
850 .Any({{DivS64, P2}, {{Vgpr64}, {VgprP2, Vgpr64, Vgpr64}}})
851 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3, Vgpr32, Vgpr32}}})
852 .Any({{DivS64, P3}, {{Vgpr64}, {VgprP3, Vgpr64, Vgpr64}}});
853
854 addRulesForGOpcs({G_AMDGPU_ATOMIC_CMPXCHG})
855 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0, VgprV2S32}}})
856 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1, VgprV2S32}}})
857 .Any({{DivS64, P0}, {{Vgpr64}, {VgprP0, VgprV2S64}}})
858 .Any({{DivS64, P1}, {{Vgpr64}, {VgprP1, VgprV2S64}}});
859
860 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_CMPSWAP}, Standard)
861 .Div(S32, {{Vgpr32},
863 .Div(S64, {{Vgpr64},
865
866 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_ADD, G_AMDGPU_BUFFER_ATOMIC_AND,
867 G_AMDGPU_BUFFER_ATOMIC_DEC, G_AMDGPU_BUFFER_ATOMIC_FMAX,
868 G_AMDGPU_BUFFER_ATOMIC_FMIN, G_AMDGPU_BUFFER_ATOMIC_INC,
869 G_AMDGPU_BUFFER_ATOMIC_OR, G_AMDGPU_BUFFER_ATOMIC_SMAX,
870 G_AMDGPU_BUFFER_ATOMIC_SMIN, G_AMDGPU_BUFFER_ATOMIC_SUB,
871 G_AMDGPU_BUFFER_ATOMIC_SWAP, G_AMDGPU_BUFFER_ATOMIC_UMAX,
872 G_AMDGPU_BUFFER_ATOMIC_UMIN, G_AMDGPU_BUFFER_ATOMIC_XOR},
873 Standard)
876
877 bool hasSMRDx3 = ST->hasScalarDwordx3Loads();
878 bool hasSMRDSmall = ST->hasScalarSubwordLoads();
879 bool usesTrue16 = ST->useRealTrue16Insts();
880
881 Predicate isAlign16([](const MachineInstr &MI) -> bool {
882 return (*MI.memoperands_begin())->getAlign() >= Align(16);
883 });
884
885 Predicate isAlign4([](const MachineInstr &MI) -> bool {
886 return (*MI.memoperands_begin())->getAlign() >= Align(4);
887 });
888
889 Predicate isAtomicMMO([](const MachineInstr &MI) -> bool {
890 return (*MI.memoperands_begin())->isAtomic();
891 });
892
893 Predicate isUniMMO([](const MachineInstr &MI) -> bool {
894 return AMDGPU::isUniformMMO(*MI.memoperands_begin());
895 });
896
897 Predicate isConst([](const MachineInstr &MI) -> bool {
898 // The address space in the MMO can be different than the address space on the pointer.
899 const MachineMemOperand *MMO = *MI.memoperands_begin();
900 const unsigned AS = MMO->getAddrSpace();
901 return AS == AMDGPUAS::CONSTANT_ADDRESS ||
903 });
904
905 Predicate isVolatileMMO([](const MachineInstr &MI) -> bool {
906 return (*MI.memoperands_begin())->isVolatile();
907 });
908
909 Predicate isInvMMO([](const MachineInstr &MI) -> bool {
910 return (*MI.memoperands_begin())->isInvariant();
911 });
912
913 Predicate isNoClobberMMO([](const MachineInstr &MI) -> bool {
914 return (*MI.memoperands_begin())->getFlags() & MONoClobber;
915 });
916
917 Predicate isNaturalAligned([](const MachineInstr &MI) -> bool {
918 const MachineMemOperand *MMO = *MI.memoperands_begin();
919 return MMO->getAlign() >= Align(MMO->getSize().getValue());
920 });
921
922 Predicate is8Or16BitMMO([](const MachineInstr &MI) -> bool {
923 const MachineMemOperand *MMO = *MI.memoperands_begin();
924 const unsigned MemSize = 8 * MMO->getSize().getValue();
925 return MemSize == 16 || MemSize == 8;
926 });
927
928 Predicate is32BitMMO([](const MachineInstr &MI) -> bool {
929 const MachineMemOperand *MMO = *MI.memoperands_begin();
930 return 8 * MMO->getSize().getValue() == 32;
931 });
932
933 auto isUL = !isAtomicMMO && isUniMMO && (isConst || !isVolatileMMO) &&
934 (isConst || isInvMMO || isNoClobberMMO);
935
936 // clang-format off
937 // TODO: S32Dst, 16-bit any-extending load should not appear on True16 targets
938 addRulesForGOpcs({G_LOAD})
939 // flat, addrspace(0), never uniform - flat_load
940 .Any({{DivS16, P0}, {{Vgpr16}, {VgprP0}}}, usesTrue16)
941 .Any({{DivB32, P0}, {{VgprB32}, {VgprP0}}}) // 32-bit load, 8-bit and 16-bit any-extending load
942 .Any({{DivB64, P0}, {{VgprB64}, {VgprP0}}})
943 .Any({{DivB96, P0}, {{VgprB96}, {VgprP0}}})
944 .Any({{DivB128, P0}, {{VgprB128}, {VgprP0}}})
945
946 // global, addrspace(1)
947 // divergent - global_load
948 .Any({{DivS16, P1}, {{Vgpr16}, {VgprP1}}}, usesTrue16)
949 .Any({{DivB32, P1}, {{VgprB32}, {VgprP1}}}) //32-bit load, 8-bit and 16-bit any-extending load
950 .Any({{DivB64, P1}, {{VgprB64}, {VgprP1}}})
951 .Any({{DivB96, P1}, {{VgprB96}, {VgprP1}}})
952 .Any({{DivB128, P1}, {{VgprB128}, {VgprP1}}})
953 .Any({{DivB256, P1}, {{VgprB256}, {VgprP1}, SplitLoad}})
954 .Any({{DivB512, P1}, {{VgprB512}, {VgprP1}, SplitLoad}})
955
956 // uniform - s_load
957 .Any({{{UniS16, P1}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
958 .Any({{{UniS16, P1}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP1}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
959 .Any({{{UniB32, P1}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
960 // TODO: SplitLoad when !isNaturalAligned && isUL and target hasSMRDSmall
961 .Any({{{UniB32, P1}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
962 .Any({{{UniB32, P1}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP1}}}) //32-bit load
963 .Any({{{UniB64, P1}, isAlign4 && isUL}, {{SgprB64}, {SgprP1}}})
964 .Any({{{UniB96, P1}, isAlign16 && isUL}, {{SgprB96}, {SgprP1}, WidenLoad}}, !hasSMRDx3)
965 .Any({{{UniB96, P1}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP1}, SplitLoad}}, !hasSMRDx3)
966 .Any({{{UniB96, P1}, isAlign4 && isUL}, {{SgprB96}, {SgprP1}}}, hasSMRDx3)
967 .Any({{{UniB128, P1}, isAlign4 && isUL}, {{SgprB128}, {SgprP1}}})
968 .Any({{{UniB256, P1}, isAlign4 && isUL}, {{SgprB256}, {SgprP1}}})
969 .Any({{{UniB512, P1}, isAlign4 && isUL}, {{SgprB512}, {SgprP1}}})
970
971 // Uniform via global or buffer load, for example volatile or non-aligned
972 // uniform load. Not using standard {{UniInVgprTy}, {VgprP1}} since it is
973 // selected as global_load, use SgprP1 for pointer instead to match
974 // patterns without flat-for-global, default for GFX7 and older.
975 // -> +flat-for-global + {{UniInVgprTy}, {SgprP1}} - global_load
976 // -> -flat-for-global + {{UniInVgprTy}, {SgprP1}} - buffer_load
977 .Any({{{UniS16, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && hasSMRDSmall) // s16 load
978 .Any({{{UniS16, P1}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP1}}}, usesTrue16 && !hasSMRDSmall) // s16 load
979 .Any({{{UniB32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP1}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
980 .Any({{{UniB32, P1}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP1}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
981 .Any({{{UniB64, P1}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP1}}})
982 .Any({{{UniB96, P1}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP1}}})
983 .Any({{{UniB128, P1}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP1}}})
984 .Any({{{UniB256, P1}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP1}, SplitLoad}})
985 .Any({{{UniB512, P1}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP1}, SplitLoad}})
986
987 // local, addrspace(3) - ds_load
988 .Any({{DivS16, P3}, {{Vgpr16}, {VgprP3}}}, usesTrue16)
989 .Any({{DivB32, P3}, {{VgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
990 .Any({{DivB64, P3}, {{VgprB64}, {VgprP3}}})
991 .Any({{DivB96, P3}, {{VgprB96}, {VgprP3}}})
992 .Any({{DivB128, P3}, {{VgprB128}, {VgprP3}}})
993
994 .Any({{UniS16, P3}, {{UniInVgprS16}, {SgprP3}}}, usesTrue16) // 16-bit load
995 .Any({{UniB32, P3}, {{UniInVgprB32}, {VgprP3}}}) // 32-bit load, 8-bit and 16-bit any-extending load
996 .Any({{UniB64, P3}, {{UniInVgprB64}, {VgprP3}}})
997 .Any({{UniB96, P3}, {{UniInVgprB96}, {VgprP3}}})
998 .Any({{UniB128, P3}, {{UniInVgprB128}, {VgprP3}}})
999
1000 // constant, addrspace(4)
1001 // divergent - global_load
1002 .Any({{DivS16, P4}, {{Vgpr16}, {VgprP4}}}, usesTrue16)
1003 .Any({{DivB32, P4}, {{VgprB32}, {VgprP4}}}) //32-bit load, 8-bit and 16-bit any-extending load
1004 .Any({{DivB64, P4}, {{VgprB64}, {VgprP4}}})
1005 .Any({{DivB96, P4}, {{VgprB96}, {VgprP4}}})
1006 .Any({{DivB128, P4}, {{VgprB128}, {VgprP4}}})
1007 .Any({{DivB256, P4}, {{VgprB256}, {VgprP4}, SplitLoad}})
1008 .Any({{DivB512, P4}, {{VgprB512}, {VgprP4}, SplitLoad}})
1009
1010 // uniform - s_load
1011 .Any({{{UniS16, P4}, isNaturalAligned && isUL}, {{Sgpr32Trunc}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1012 .Any({{{UniS16, P4}, isAlign4 && isUL}, {{Sgpr32Trunc}, {SgprP4}, WidenMMOToS32}}, usesTrue16 && !hasSMRDSmall) // s16 load to 32-bit load
1013 .Any({{{UniB32, P4}, isNaturalAligned && isUL}, {{SgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1014 .Any({{{UniB32, P4}, is8Or16BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall) //8-bit and 16-bit any-extending load to 32-bit load
1015 .Any({{{UniB32, P4}, is32BitMMO && isAlign4 && isUL}, {{SgprB32}, {SgprP4}}}) //32-bit load
1016 .Any({{{UniB64, P4}, isAlign4 && isUL}, {{SgprB64}, {SgprP4}}})
1017 .Any({{{UniB96, P4}, isAlign16 && isUL}, {{SgprB96}, {SgprP4}, WidenLoad}}, !hasSMRDx3)
1018 .Any({{{UniB96, P4}, isAlign4 && !isAlign16 && isUL}, {{SgprB96}, {SgprP4}, SplitLoad}}, !hasSMRDx3)
1019 .Any({{{UniB96, P4}, isAlign4 && isUL}, {{SgprB96}, {SgprP4}}}, hasSMRDx3)
1020 .Any({{{UniB128, P4}, isAlign4 && isUL}, {{SgprB128}, {SgprP4}}})
1021 .Any({{{UniB256, P4}, isAlign4 && isUL}, {{SgprB256}, {SgprP4}}})
1022 .Any({{{UniB512, P4}, isAlign4 && isUL}, {{SgprB512}, {SgprP4}}})
1023
1024 // uniform in vgpr - global_load or buffer_load
1025 .Any({{{UniS16, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && hasSMRDSmall) // s16 load
1026 .Any({{{UniS16, P4}, !isAlign4 || !isUL}, {{UniInVgprS16}, {SgprP4}}}, usesTrue16 && !hasSMRDSmall) // s16 load
1027 .Any({{{UniB32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprB32}, {SgprP4}}}, hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1028 .Any({{{UniB32, P4}, !isAlign4 || !isUL}, {{UniInVgprB32}, {SgprP4}}}, !hasSMRDSmall) //32-bit load, 8-bit and 16-bit any-extending load
1029 .Any({{{UniB64, P4}, !isAlign4 || !isUL}, {{UniInVgprB64}, {SgprP4}}})
1030 .Any({{{UniB96, P4}, !isAlign4 || !isUL}, {{UniInVgprB96}, {SgprP4}}})
1031 .Any({{{UniB128, P4}, !isAlign4 || !isUL}, {{UniInVgprB128}, {SgprP4}}})
1032 .Any({{{UniB256, P4}, !isAlign4 || !isUL}, {{UniInVgprB256}, {SgprP4}, SplitLoad}})
1033 .Any({{{UniB512, P4}, !isAlign4 || !isUL}, {{UniInVgprB512}, {SgprP4}, SplitLoad}})
1034
1035 // private, addrspace(5), never uniform - scratch_load
1036 .Any({{DivS16, P5}, {{Vgpr16}, {VgprP5}}}, usesTrue16)
1037 .Any({{DivB32, P5}, {{VgprB32}, {VgprP5}}}) // 32-bit load, 8-bit and 16-bit any-extending load
1038 .Any({{DivB64, P5}, {{VgprB64}, {VgprP5}}})
1039 .Any({{DivB96, P5}, {{VgprB96}, {VgprP5}}})
1040 .Any({{DivB128, P5}, {{VgprB128}, {VgprP5}}})
1041
1042 .Any({{DivS32, Ptr128}, {{Vgpr32}, {VgprPtr128}}});
1043
1044
1045 addRulesForGOpcs({G_ZEXTLOAD, G_SEXTLOAD}) // i8 and i16 zeroextending loads
1046 .Any({{DivS32, P0}, {{Vgpr32}, {VgprP0}}})
1047
1048 .Any({{DivS32, P1}, {{Vgpr32}, {VgprP1}}})
1049 .Any({{{UniS32, P1}, isAlign4 && isUL}, {{Sgpr32}, {SgprP1}, WidenMMOToS32}}, !hasSMRDSmall)
1050 .Any({{{UniS32, P1}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP1}}}, hasSMRDSmall)
1051 .Any({{{UniS32, P1}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP1}}}, !hasSMRDSmall)
1052 .Any({{{UniS32, P1}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP1}}}, hasSMRDSmall)
1053
1054 .Any({{DivS32, P3}, {{Vgpr32}, {VgprP3}}})
1055 .Any({{UniS32, P3}, {{UniInVgprS32}, {VgprP3}}})
1056
1057 .Any({{DivS32, P4}, {{Vgpr32}, {VgprP4}}})
1058 .Any({{{UniS32, P4}, isAlign4 && isUL}, {{Sgpr32}, {SgprP4}, WidenMMOToS32}}, !hasSMRDSmall)
1059 .Any({{{UniS32, P4}, isNaturalAligned && isUL}, {{Sgpr32}, {SgprP4}}}, hasSMRDSmall)
1060 .Any({{{UniS32, P4}, !isAlign4 || !isUL}, {{UniInVgprS32}, {SgprP4}}}, !hasSMRDSmall)
1061 .Any({{{UniS32, P4}, !isNaturalAligned || !isUL}, {{UniInVgprS32}, {SgprP4}}}, hasSMRDSmall)
1062
1063 .Any({{DivS32, P5}, {{Vgpr32}, {VgprP5}}});
1064
1065 addRulesForGOpcs({G_STORE})
1066 // addrspace(0)
1067 .Any({{S16, P0}, {{}, {Vgpr16, VgprP0}}}, usesTrue16) // 16-bit store
1068 .Any({{B32, P0}, {{}, {VgprB32, VgprP0}}}) // 32-bit store, 8-bit and 16-bit truncating store
1069 .Any({{B64, P0}, {{}, {VgprB64, VgprP0}}})
1070 .Any({{B96, P0}, {{}, {VgprB96, VgprP0}}})
1071 .Any({{B128, P0}, {{}, {VgprB128, VgprP0}}})
1072
1073 // addrspace(1), there are no stores to addrspace(4)
1074 // For targets:
1075 // - with "+flat-for-global" - global_store
1076 // - without(-flat-for-global) - buffer_store addr64
1077 .Any({{S16, DivP1}, {{}, {Vgpr16, VgprP1}}}, usesTrue16) // 16-bit store
1078 .Any({{B32, DivP1}, {{}, {VgprB32, VgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1079 .Any({{B64, DivP1}, {{}, {VgprB64, VgprP1}}})
1080 .Any({{B96, DivP1}, {{}, {VgprB96, VgprP1}}})
1081 .Any({{B128, DivP1}, {{}, {VgprB128, VgprP1}}})
1082
1083 // For UniP1, use sgpr ptr to match flat-for-global patterns. Targets:
1084 // - with "+flat-for-global" - global_store for both sgpr and vgpr ptr
1085 // - without(-flat-for-global) - need sgpr ptr to select buffer_store
1086 .Any({{S16, UniP1}, {{}, {Vgpr16, SgprP1}}}, usesTrue16) // 16-bit store
1087 .Any({{B32, UniP1}, {{}, {VgprB32, SgprP1}}}) // 32-bit store, 8-bit and 16-bit truncating store
1088 .Any({{B64, UniP1}, {{}, {VgprB64, SgprP1}}})
1089 .Any({{B96, UniP1}, {{}, {VgprB96, SgprP1}}})
1090 .Any({{B128, UniP1}, {{}, {VgprB128, SgprP1}}})
1091
1092 // addrspace(3) and addrspace(5)
1093 .Any({{S16, Ptr32}, {{}, {Vgpr16, VgprPtr32}}}, usesTrue16) // 16-bit store
1094 .Any({{B32, Ptr32}, {{}, {VgprB32, VgprPtr32}}}) // 32-bit store, 8-bit and 16-bit truncating store
1095 .Any({{B64, Ptr32}, {{}, {VgprB64, VgprPtr32}}})
1096 .Any({{B96, Ptr32}, {{}, {VgprB96, VgprPtr32}}})
1097 .Any({{B128, Ptr32}, {{}, {VgprB128, VgprPtr32}}});
1098
1099 // clang-format on
1100
1101 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD, G_AMDGPU_BUFFER_LOAD_FORMAT,
1102 G_AMDGPU_TBUFFER_LOAD_FORMAT},
1103 StandardB)
1112
1113 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_USHORT, G_AMDGPU_BUFFER_LOAD_UBYTE,
1114 G_AMDGPU_BUFFER_LOAD_SSHORT, G_AMDGPU_BUFFER_LOAD_SBYTE},
1115 StandardB)
1118
1119 addRulesForGOpcs(
1120 {G_AMDGPU_BUFFER_LOAD_UBYTE_TFE, G_AMDGPU_BUFFER_LOAD_USHORT_TFE},
1121 StandardB)
1124
1125 addRulesForGOpcs({G_AMDGPU_BUFFER_LOAD_TFE, G_AMDGPU_BUFFER_LOAD_FORMAT_TFE},
1126 StandardB)
1134 .Any({{UniB160},
1136
1137 addRulesForGOpcs(
1138 {G_AMDGPU_BUFFER_LOAD_FORMAT_D16, G_AMDGPU_TBUFFER_LOAD_FORMAT_D16},
1139 StandardB)
1146
1147 addRulesForGOpcs({G_AMDGPU_BUFFER_STORE, G_AMDGPU_BUFFER_STORE_BYTE,
1148 G_AMDGPU_BUFFER_STORE_SHORT, G_AMDGPU_BUFFER_STORE_FORMAT,
1149 G_AMDGPU_BUFFER_STORE_FORMAT_D16,
1150 G_AMDGPU_TBUFFER_STORE_FORMAT,
1151 G_AMDGPU_TBUFFER_STORE_FORMAT_D16})
1152 .Any({{B32}, {{}, {VgprB32, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1153 .Any({{B64}, {{}, {VgprB64, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1154 .Any({{B96}, {{}, {VgprB96, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}})
1155 .Any({{B128}, {{}, {VgprB128, SgprV4S32_WF, Vgpr32, Vgpr32, Sgpr32_WF}}});
1156
1157 // Buffer atomics: resource descriptor + scalar offset are SGPR, data and
1158 // address components are VGPR.
1159 //
1160 // Operand order (SIInstructions.td BufferAtomicGenericInstruction):
1161 // dst = op vdata, rsrc, vindex, voffset, soffset, offset_imm, cachepolicy,
1162 // idxen_imm
1163 addRulesForGOpcs({G_AMDGPU_BUFFER_ATOMIC_FADD})
1164 .Any({{S32, S32, V4S32, S32, S32, S32},
1166 .Any({{S64, S64, V4S32, S32, S32, S32},
1168 .Any({{V2S16, V2S16, V4S32, S32, S32, S32},
1169 {{VgprV2S16},
1171
1172 addRulesForGOpcs({G_PTR_ADD})
1173 .Any({{UniPtr32}, {{SgprPtr32}, {SgprPtr32, Sgpr32}}})
1174 .Any({{DivPtr32}, {{VgprPtr32}, {VgprPtr32, Vgpr32}}})
1175 .Any({{UniPtr64}, {{SgprPtr64}, {SgprPtr64, Sgpr64}}})
1176 .Any({{DivPtr64}, {{VgprPtr64}, {VgprPtr64, Vgpr64}}});
1177
1178 addRulesForGOpcs({G_INTTOPTR})
1179 .Any({{UniPtr32}, {{SgprPtr32}, {Sgpr32}}})
1180 .Any({{DivPtr32}, {{VgprPtr32}, {Vgpr32}}})
1181 .Any({{UniPtr64}, {{SgprPtr64}, {Sgpr64}}})
1182 .Any({{DivPtr64}, {{VgprPtr64}, {Vgpr64}}})
1183 .Any({{UniPtr128}, {{SgprPtr128}, {Sgpr128}}})
1184 .Any({{DivPtr128}, {{VgprPtr128}, {Vgpr128}}});
1185
1186 addRulesForGOpcs({G_PTRTOINT})
1187 .Any({{UniS32}, {{Sgpr32}, {SgprPtr32}}})
1188 .Any({{DivS32}, {{Vgpr32}, {VgprPtr32}}})
1189 .Any({{UniS64}, {{Sgpr64}, {SgprPtr64}}})
1190 .Any({{DivS64}, {{Vgpr64}, {VgprPtr64}}})
1191 .Any({{UniS128}, {{Sgpr128}, {SgprPtr128}}})
1192 .Any({{DivS128}, {{Vgpr128}, {VgprPtr128}}});
1193
1194 // FIXME: Update llvm/test/CodeGen/AMDGPU/ptrmask.ll to use GlobalISel.
1195 // Currently crashes on P8 (buffer resource) tests due to legalizer issue.
1196 addRulesForGOpcs({G_PTRMASK})
1197 .Any({{UniP1}, {{SgprP1}, {SgprP1, Sgpr64}}})
1198 .Any({{DivP1}, {{VgprP1}, {VgprP1, Vgpr64}}})
1199 .Any({{UniP3}, {{SgprP3}, {SgprP3, Sgpr32}}})
1200 .Any({{DivP3}, {{VgprP3}, {VgprP3, Vgpr32}}});
1201
1202 addRulesForGOpcs({G_ABS}, Standard).Uni(S16, {{Sgpr32Trunc}, {Sgpr32SExt}});
1203
1204 addRulesForGOpcs({G_BITREVERSE}, Standard)
1205 .Uni(S32, {{Sgpr32}, {Sgpr32}})
1206 .Div(S32, {{Vgpr32}, {Vgpr32}})
1207 .Uni(S64, {{Sgpr64}, {Sgpr64}})
1208 .Div(S64, {{Vgpr64}, {Vgpr64}});
1209
1210 addRulesForGOpcs({G_AMDGPU_FFBH_U32, G_AMDGPU_FFBL_B32, G_CTLZ_ZERO_UNDEF,
1211 G_CTTZ_ZERO_UNDEF})
1212 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}})
1213 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1214 .Any({{UniS32, S64}, {{Sgpr32}, {Sgpr64}}})
1216
1217 addRulesForGOpcs({G_FENCE}).Any({{{}}, {{}, {}}});
1218
1219 addRulesForGOpcs({G_READSTEADYCOUNTER, G_READCYCLECOUNTER}, Standard)
1220 .Uni(S64, {{Sgpr64}, {}});
1221
1222 addRulesForGOpcs({G_BLOCK_ADDR}).Any({{UniP0}, {{SgprP0}, {}}});
1223
1224 addRulesForGOpcs({G_GLOBAL_VALUE})
1225 .Any({{UniP0}, {{SgprP0}, {}}})
1226 .Any({{UniP1}, {{SgprP1}, {}}})
1227 .Any({{UniP3}, {{SgprP3}, {}}})
1228 .Any({{UniP4}, {{SgprP4}, {}}})
1229 .Any({{UniP8}, {{SgprP8}, {}}});
1230
1231 addRulesForGOpcs({G_AMDGPU_WAVE_ADDRESS}).Any({{UniP5}, {{SgprP5}, {}}});
1232
1233 addRulesForGOpcs({G_SI_CALL})
1234 .Any({{_, UniP0}, {{None}, {SgprP0}}})
1235 .Any({{_, DivP0}, {{None}, {SgprP0Call_WF}}})
1236 .Any({{_, UniP4}, {{None}, {SgprP4}}})
1237 .Any({{_, DivP4}, {{None}, {SgprP4Call_WF}}});
1238
1239 bool hasSALUFloat = ST->hasSALUFloatInsts();
1240
1241 addRulesForGOpcs({G_FADD, G_FMUL, G_STRICT_FADD, G_STRICT_FMUL}, Standard)
1242 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1243 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1244 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1245 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1246 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat)
1247 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1248 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1249 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1250 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat)
1252 hasSALUFloat)
1253 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
1254
1255 addRulesForGOpcs({G_FSUB, G_STRICT_FSUB}, Standard)
1256 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1257 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1258 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1259 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1260 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1261 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1262
1263 addRulesForGOpcs({G_FMAD}, Standard)
1264 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1265 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1266 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1267 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1268
1269 addRulesForGOpcs({G_FLDEXP, G_STRICT_FLDEXP}, Standard)
1270 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1271 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1272 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}})
1273 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1274 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr32}})
1275 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr32}});
1276
1277 addRulesForGOpcs({G_FMA, G_STRICT_FMA}, Standard)
1278 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1279 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}})
1280 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64, Vgpr64}})
1281 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64, Vgpr64}})
1285 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16, Sgpr16}}, hasSALUFloat)
1286 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}}, !hasSALUFloat)
1287 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32, Sgpr32}}, hasSALUFloat)
1288 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}}, !hasSALUFloat)
1289 .Uni(V2S16,
1291 hasSALUFloat)
1293 !hasSALUFloat);
1294
1295 addRulesForGOpcs({G_AMDGPU_FMED3}, Standard)
1296 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16, Vgpr16}})
1297 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16, Vgpr16}})
1298 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1299 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1300
1301 // TODO: This opcode is generated from the i64->i16 signed clamped pattern in
1302 // the PreLegalizerCombiner. Move the combine to RegBankCombiner to keep more
1303 // instructions on SALU.
1304 addRulesForGOpcs({G_AMDGPU_SMED3}, Standard)
1305 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32, Vgpr32}})
1306 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32, Vgpr32}});
1307
1308 // FNEG and FABS are either folded as source modifiers or can be selected as
1309 // bitwise XOR and AND with Mask. XOR and AND are available on SALU but for
1310 // targets without SALU float we still select them as VGPR since there would
1311 // be no real sgpr use.
1312 addRulesForGOpcs({G_FNEG, G_FABS}, Standard)
1313 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasSALUFloat)
1314 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasSALUFloat)
1315 .Div(S16, {{Vgpr16}, {Vgpr16}})
1316 .Uni(S32, {{UniInVgprS32}, {Vgpr32}}, !hasSALUFloat)
1317 .Uni(S32, {{Sgpr32}, {Sgpr32}}, hasSALUFloat)
1318 .Div(S32, {{Vgpr32}, {Vgpr32}})
1319 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1320 .Div(S64, {{Vgpr64}, {Vgpr64}})
1321 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}}, !hasSALUFloat)
1322 .Uni(V2S16, {{SgprV2S16}, {SgprV2S16}, ScalarizeToS16}, hasSALUFloat)
1323 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1324 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1325 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
1326
1327 addRulesForGOpcs({G_FCANONICALIZE}, Standard)
1328 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1329 .Div(S32, {{Vgpr32}, {Vgpr32}})
1330 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1331 .Div(S16, {{Vgpr16}, {Vgpr16}})
1332 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1333 .Div(S64, {{Vgpr64}, {Vgpr64}})
1334 .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16}})
1335 .Div(V2S16, {{VgprV2S16}, {VgprV2S16}})
1336 .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32}}})
1337 .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32}}});
1338
1339 bool hasPST = ST->hasPseudoScalarTrans();
1340 addRulesForGOpcs({G_FSQRT}, Standard)
1341 .Div(S16, {{Vgpr16}, {Vgpr16}})
1342 .Uni(S16, {{Sgpr16}, {Sgpr16}}, hasPST)
1343 .Uni(S16, {{UniInVgprS16}, {Vgpr16}}, !hasPST);
1344
1345 addRulesForGOpcs({G_FPTOUI, G_FPTOSI})
1346 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1347 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1348 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1349 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat)
1350 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1351 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1352 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1353 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1354 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1355 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}});
1356
1357 addRulesForGOpcs({G_UITOFP, G_SITOFP})
1358 .Any({{UniS16, S16}, {{UniInVgprS16}, {Vgpr16}}})
1359 .Any({{DivS16, S16}, {{Vgpr16}, {Vgpr16}}})
1360 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1361 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat)
1362 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1363 .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat)
1364 .Any({{UniS32, S32}, {{UniInVgprS32}, {Vgpr32}}}, !hasSALUFloat)
1365 .Any({{DivS32, S32}, {{Vgpr32}, {Vgpr32}}})
1366 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1367 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}});
1368
1369 addRulesForGOpcs({G_FPEXT})
1370 .Any({{DivS32, S16}, {{Vgpr32}, {Vgpr16}}})
1371 .Any({{UniS64, S32}, {{UniInVgprS64}, {Vgpr32}}})
1372 .Any({{DivS64, S32}, {{Vgpr64}, {Vgpr32}}})
1373 .Any({{UniS32, S16}, {{Sgpr32}, {Sgpr16}}}, hasSALUFloat)
1374 .Any({{UniS32, S16}, {{UniInVgprS32}, {Vgpr16}}}, !hasSALUFloat);
1375
1376 addRulesForGOpcs({G_AMDGPU_CVT_PK_I16_I32}, Standard)
1377 .Uni(V2S16, {{UniInVgprV2S16}, {Vgpr32, Vgpr32}})
1378 .Div(V2S16, {{VgprV2S16}, {Vgpr32, Vgpr32}});
1379
1380 addRulesForGOpcs({G_AMDGPU_FMIN_LEGACY, G_AMDGPU_FMAX_LEGACY}, Standard)
1381 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}})
1382 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}});
1383
1384 bool hasSALUMinimumMaximumInsts = ST->hasSALUMinimumMaximumInsts();
1385
1386 addRulesForGOpcs({G_FMINIMUM, G_FMAXIMUM}, Standard)
1387 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUMinimumMaximumInsts)
1388 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUMinimumMaximumInsts)
1389 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1390 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUMinimumMaximumInsts)
1391 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUMinimumMaximumInsts)
1392 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1393 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1394 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1396 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}});
1397
1398 addRulesForGOpcs({G_FMINNUM_IEEE, G_FMAXNUM_IEEE, G_FMINNUM, G_FMAXNUM},
1399 Standard)
1400 .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}})
1401 .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}})
1402 .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}})
1403 .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}})
1405 .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}})
1406 .Uni(S16, {{Sgpr16}, {Sgpr16, Sgpr16}}, hasSALUFloat)
1407 .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat)
1408 .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat)
1409 .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat);
1410
1411 addRulesForGOpcs({G_FPTRUNC})
1412 .Any({{DivS16, S32}, {{Vgpr16}, {Vgpr32}}})
1413 .Any({{UniS32, S64}, {{UniInVgprS32}, {Vgpr64}}})
1414 .Any({{DivS32, S64}, {{Vgpr32}, {Vgpr64}}})
1416 .Any({{DivV2S16, V2S32}, {{VgprV2S16}, {VgprV2S32}}})
1417 .Any({{UniS16, S32}, {{Sgpr16}, {Sgpr32}}}, hasSALUFloat)
1418 .Any({{UniS16, S32}, {{UniInVgprS16}, {Vgpr32}}}, !hasSALUFloat);
1419
1420 addRulesForGOpcs({G_IS_FPCLASS})
1421 .Any({{DivS1, S16}, {{Vcc}, {Vgpr16}}})
1422 .Any({{UniS1, S16}, {{UniInVcc}, {Vgpr16}}})
1423 .Any({{DivS1, S32}, {{Vcc}, {Vgpr32}}})
1424 .Any({{UniS1, S32}, {{UniInVcc}, {Vgpr32}}})
1425 .Any({{DivS1, S64}, {{Vcc}, {Vgpr64}}})
1426 .Any({{UniS1, S64}, {{UniInVcc}, {Vgpr64}}});
1427
1428 addRulesForGOpcs({G_FCMP}, Standard)
1429 .Any({{UniS1, _, S16}, {{Sgpr32Trunc}, {None, Sgpr16, Sgpr16}}},
1430 hasSALUFloat)
1431 .Any({{UniS1, _, S16}, {{UniInVcc}, {None, Vgpr16, Vgpr16}}},
1432 !hasSALUFloat)
1433 .Any({{DivS1, _, S16}, {{Vcc}, {None, Vgpr16, Vgpr16}}})
1434 .Any({{UniS1, _, S32}, {{Sgpr32Trunc}, {None, Sgpr32, Sgpr32}}},
1435 hasSALUFloat)
1436 .Any({{UniS1, _, S32}, {{UniInVcc}, {None, Vgpr32, Vgpr32}}},
1437 !hasSALUFloat)
1438 .Any({{DivS1, _, S32}, {{Vcc}, {None, Vgpr32, Vgpr32}}})
1439 .Any({{UniS1, _, S64}, {{UniInVcc}, {None, Vgpr64, Vgpr64}}})
1440 .Any({{DivS1, _, S64}, {{Vcc}, {None, Vgpr64, Vgpr64}}});
1441
1442 addRulesForGOpcs({G_INTRINSIC_TRUNC, G_INTRINSIC_ROUNDEVEN, G_FFLOOR, G_FCEIL,
1443 G_FEXP2, G_FLOG2},
1444 Standard)
1445 .Uni(S16, {{UniInVgprS16}, {Vgpr16}})
1446 .Div(S16, {{Vgpr16}, {Vgpr16}})
1447 .Uni(S32, {{UniInVgprS32}, {Vgpr32}})
1448 .Div(S32, {{Vgpr32}, {Vgpr32}})
1449 .Uni(S64, {{UniInVgprS64}, {Vgpr64}})
1450 .Div(S64, {{Vgpr64}, {Vgpr64}});
1451
1452 using namespace Intrinsic;
1453
1454 addRulesForIOpcs({amdgcn_s_getpc}).Any({{UniS64, _}, {{Sgpr64}, {None}}});
1455
1456 addRulesForIOpcs({amdgcn_s_getreg}).Any({{}, {{Sgpr32}, {IntrId, Imm}}});
1457
1458 addRulesForIOpcs({amdgcn_s_setreg})
1459 .Any({{_, _, S32}, {{}, {IntrId, Imm, SgprB32_ReadFirstLane}}});
1460
1461 addRulesForIOpcs({amdgcn_s_sendmsg, amdgcn_s_sendmsghalt})
1462 .Any({{}, {{}, {IntrId, Imm, SgprB32_M0}}});
1463
1464 addRulesForIOpcs({amdgcn_s_sendmsg_rtn})
1465 .Any({{S32}, {{Sgpr32}, {}}})
1466 .Any({{S64}, {{Sgpr64}, {}}});
1467
1468 addRulesForIOpcs({amdgcn_s_memrealtime}, Standard)
1469 .Uni(S64, {{Sgpr64}, {IntrId}});
1470
1471 addRulesForIOpcs({amdgcn_groupstaticsize, amdgcn_pops_exiting_wave_id,
1472 amdgcn_reloc_constant},
1473 Standard)
1474 .Uni(S32, {{Sgpr32}, {IntrId}});
1475
1476 // Intrinsics with no register operands.
1477 addRulesForIOpcs({amdgcn_endpgm,
1478 amdgcn_s_barrier,
1479 amdgcn_s_barrier_signal,
1480 amdgcn_s_barrier_wait,
1481 amdgcn_s_nop,
1482 amdgcn_s_sethalt,
1483 amdgcn_s_setprio,
1484 amdgcn_s_sleep,
1485 amdgcn_s_ttracedata_imm,
1486 amdgcn_s_wait_asynccnt,
1487 amdgcn_s_wait_bvhcnt,
1488 amdgcn_s_wait_dscnt,
1489 amdgcn_s_wait_event,
1490 amdgcn_s_wait_event_export_ready,
1491 amdgcn_s_wait_expcnt,
1492 amdgcn_s_wait_kmcnt,
1493 amdgcn_s_wait_loadcnt,
1494 amdgcn_s_wait_samplecnt,
1495 amdgcn_s_wait_storecnt,
1496 amdgcn_s_wait_tensorcnt,
1497 amdgcn_s_waitcnt,
1498 amdgcn_wave_barrier})
1499 .Any({{}, {{}, {}}});
1500
1501 addRulesForIOpcs({amdgcn_s_ttracedata}).Any({{}, {{}, {IntrId, SgprB32_M0}}});
1502
1503 addRulesForIOpcs({amdgcn_s_sleep_var})
1504 .Any({{}, {{}, {IntrId, SgprB32_ReadFirstLane}}});
1505
1506 addRulesForIOpcs({amdgcn_s_prefetch_data})
1508
1509 addRulesForIOpcs({amdgcn_class})
1510 .Any({{UniS1, _, S16}, {{UniInVcc}, {IntrId, Vgpr16, Vgpr32}}})
1511 .Any({{DivS1, _, S16}, {{Vcc}, {IntrId, Vgpr16, Vgpr32}}})
1512 .Any({{UniS1, _, S32}, {{UniInVcc}, {IntrId, Vgpr32, Vgpr32}}})
1513 .Any({{DivS1, _, S32}, {{Vcc}, {IntrId, Vgpr32, Vgpr32}}})
1514 .Any({{UniS1, _, S64}, {{UniInVcc}, {IntrId, Vgpr64, Vgpr32}}})
1515 .Any({{DivS1, _, S64}, {{Vcc}, {IntrId, Vgpr64, Vgpr32}}});
1516
1517 // This is an "intrinsic lane mask"; it was set to i32/i64 in LLVM IR.
1518 addRulesForIOpcs({amdgcn_end_cf})
1519 .Any({{_, UniS32}, {{}, {IntrId, Sgpr32}}})
1520 .Any({{_, UniS64}, {{}, {IntrId, Sgpr64}}});
1521
1522 addRulesForIOpcs({amdgcn_if_break}, Standard)
1523 .Uni(S64, {{Sgpr64}, {IntrId, Vcc, Sgpr64}})
1524 .Uni(S32, {{Sgpr32}, {IntrId, Vcc, Sgpr32}});
1525
1526 addRulesForIOpcs({amdgcn_exp})
1527 .Any({{_, _, _, S32, S32, S32, S32},
1528 {{}, {IntrId, Imm, Imm, Vgpr32, Vgpr32, Vgpr32, Vgpr32}}});
1529
1530 addRulesForIOpcs({amdgcn_exp_compr})
1531 .Any({{_, _, _, V2S16}, {{}, {IntrId, Imm, Imm, VgprV2S16, VgprV2S16}}});
1532
1533 addRulesForIOpcs({amdgcn_exp_row})
1534 .Any({{_, _, _, S32, S32, S32, S32, _, S32},
1535 {{},
1537 SgprB32_M0}}});
1538
1539 addRulesForIOpcs({amdgcn_lds_direct_load}, StandardB)
1540 .Div(B32, {{VgprB32}, {IntrId, SgprB32_M0}});
1541
1542 addRulesForIOpcs({amdgcn_lds_param_load}, Standard)
1543 .Div(S32, {{Vgpr32}, {IntrId, Imm, Imm, SgprB32_M0}});
1544
1545 addRulesForIOpcs({amdgcn_mbcnt_lo, amdgcn_mbcnt_hi}, Standard)
1546 .Div(S32, {{}, {Vgpr32, None, Vgpr32, Vgpr32}});
1547
1548 addRulesForIOpcs({amdgcn_readfirstlane})
1549 .Any({{UniB32, _, DivB32}, {{}, {SgprB32, None, VgprB32}}})
1550 // This should not exist in the first place; it comes from call lowering,
1551 // which performs a readfirstlane just in case the register is not in an sgpr.
1552 .Any({{UniS32, _, UniS32}, {{}, {Sgpr32, None, Vgpr32}}});
1553
1554 addRulesForIOpcs({amdgcn_readlane}, StandardB)
1556
1557 addRulesForIOpcs({amdgcn_writelane}, StandardB)
1558 .Div(B32,
1559 {{VgprB32},
1561
1562 addRulesForIOpcs({amdgcn_permlane16, amdgcn_permlanex16}, Standard)
1563 .Div(S32, {{Vgpr32},
1566
1567 addRulesForIOpcs({amdgcn_perm}, Standard)
1568 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1569 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1570
1571 addRulesForIOpcs({amdgcn_wave_reduce_umax, amdgcn_wave_reduce_umin}, Standard)
1572 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
1573 .Div(S32, {{Sgpr32ToVgprDst}, {IntrId, VgprB32}})
1574 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64}})
1575 .Div(S64, {{Sgpr64ToVgprDst}, {IntrId, VgprB64}});
1576
1577 addRulesForIOpcs({amdgcn_bitop3, amdgcn_fmad_ftz}, Standard)
1578 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1579 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1580 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1581 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1582
1583 addRulesForIOpcs({amdgcn_udot4, amdgcn_sdot4, amdgcn_udot8, amdgcn_sdot8,
1584 amdgcn_dot4_f32_bf8_bf8, amdgcn_dot4_f32_bf8_fp8,
1585 amdgcn_dot4_f32_fp8_fp8, amdgcn_dot4_f32_fp8_bf8},
1586 Standard)
1587 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1588 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1589
1590 addRulesForIOpcs({amdgcn_mul_u24, amdgcn_mul_i24}, Standard)
1591 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1592 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}})
1593 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr32, Vgpr32}})
1594 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32, Vgpr32}});
1595
1596 addRulesForIOpcs({amdgcn_ds_bpermute, amdgcn_ds_bpermute_fi_b32,
1597 amdgcn_ds_permute, amdgcn_fmul_legacy, amdgcn_mulhi_i24,
1598 amdgcn_mulhi_u24},
1599 Standard)
1600 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32}})
1601 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1602
1603 addRulesForIOpcs({amdgcn_cubesc, amdgcn_cubetc, amdgcn_cubema, amdgcn_cubeid,
1604 amdgcn_fma_legacy},
1605 Standard)
1606 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1607 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1608
1609 addRulesForIOpcs({amdgcn_frexp_mant, amdgcn_fract}, Standard)
1610 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
1611 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1612 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
1613 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1614 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64}})
1615 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64}});
1616
1617 addRulesForIOpcs({amdgcn_prng_b32})
1618 .Any({{UniS32}, {{UniInVgprS32}, {IntrId, Vgpr32}}})
1619 .Any({{DivS32}, {{Vgpr32}, {IntrId, Vgpr32}}});
1620
1621 addRulesForIOpcs({amdgcn_sffbh}, Standard)
1622 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32}})
1623 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
1624
1625 addRulesForIOpcs({amdgcn_ubfe, amdgcn_sbfe}, Standard)
1626 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1627 .Uni(S32, {{Sgpr32}, {IntrId, Sgpr32, Sgpr32, Sgpr32}, S_BFE})
1628 .Uni(S64, {{Sgpr64}, {IntrId, Sgpr64, Sgpr32, Sgpr32}, S_BFE})
1629 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32, Vgpr32}, V_BFE});
1630
1631 addRulesForIOpcs({amdgcn_cvt_pk_i16, amdgcn_cvt_pk_u16, amdgcn_cvt_pknorm_i16,
1632 amdgcn_cvt_pknorm_u16, amdgcn_cvt_pkrtz},
1633 Standard)
1634 .Uni(V2S16, {{UniInVgprV2S16}, {IntrId, Vgpr32, Vgpr32}})
1635 .Div(V2S16, {{VgprV2S16}, {IntrId, Vgpr32, Vgpr32}});
1636
1637 addRulesForIOpcs({amdgcn_global_load_tr_b64})
1638 .Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}})
1639 .Any({{DivB32}, {{VgprB32}, {IntrId, SgprP1}}});
1640
1641 addRulesForIOpcs({amdgcn_global_load_tr_b128})
1642 .Any({{DivB64}, {{VgprB64}, {IntrId, SgprP1}}})
1643 .Any({{DivB128}, {{VgprB128}, {IntrId, SgprP1}}});
1644
1645 addRulesForIOpcs({amdgcn_global_atomic_ordered_add_b64})
1646 .Any({{DivS64}, {{Vgpr64}, {IntrId, VgprP1, Vgpr64}}});
1647
1648 addRulesForIOpcs(
1649 {amdgcn_global_atomic_fmin_num, amdgcn_global_atomic_fmax_num}, Standard)
1650 .Div(S32, {{Vgpr32}, {IntrId, VgprP1, Vgpr32}});
1651
1652 addRulesForIOpcs({amdgcn_flat_atomic_fmin_num, amdgcn_flat_atomic_fmax_num},
1653 Standard)
1654 .Div(S32, {{Vgpr32}, {IntrId, VgprP0, Vgpr32}});
1655
1656 addRulesForIOpcs({amdgcn_raw_buffer_load_lds})
1657 .Any({{_}, {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Sgpr32}}});
1658
1659 addRulesForIOpcs({amdgcn_struct_buffer_load_lds})
1660 .Any({{_},
1661 {{}, {IntrId, SgprV4S32, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
1662
1663 addRulesForIOpcs({amdgcn_raw_ptr_buffer_load_lds})
1664 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Sgpr32}}});
1665
1666 addRulesForIOpcs({amdgcn_struct_ptr_buffer_load_lds})
1667 .Any({{_}, {{}, {IntrId, SgprP8, SgprP3, Imm, Vgpr32, Vgpr32, Sgpr32}}});
1668
1669 addRulesForIOpcs({amdgcn_global_load_lds})
1670 .Any({{}, {{}, {IntrId, VgprP1, SgprB32_M0}}});
1671
1672 addRulesForIOpcs({amdgcn_wwm, amdgcn_strict_wwm, amdgcn_wqm, amdgcn_softwqm,
1673 amdgcn_strict_wqm},
1674 StandardB)
1675 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
1676 .Uni(B32, {{SgprB32}, {IntrId, SgprB32}})
1677 .Div(B64, {{VgprB64}, {IntrId, VgprB64}})
1678 .Uni(B64, {{SgprB64}, {IntrId, SgprB64}})
1679 .Div(B96, {{VgprB96}, {IntrId, VgprB96}})
1680 .Uni(B96, {{SgprB96}, {IntrId, SgprB96}})
1681 .Div(B128, {{VgprB128}, {IntrId, VgprB128}})
1682 .Uni(B128, {{SgprB128}, {IntrId, SgprB128}})
1683 .Any({{UniB256}, {{SgprB256}, {IntrId, SgprB256}}})
1684 .Any({{DivB256}, {{VgprB256}, {IntrId, VgprB256}}})
1685 .Any({{UniB512}, {{SgprB512}, {IntrId, SgprB512}}})
1686 .Any({{DivB512}, {{VgprB512}, {IntrId, VgprB512}}});
1687
1688 addRulesForIOpcs({amdgcn_wqm_demote}).Any({{}, {{}, {IntrId, Vcc}}});
1689
1690 addRulesForIOpcs({amdgcn_live_mask, amdgcn_ps_live})
1691 .Any({{DivS1}, {{Vcc}, {}}});
1692
1693 addRulesForIOpcs({amdgcn_mov_dpp, amdgcn_mov_dpp8}, StandardB)
1694 .Div(B32, {{VgprB32}, {IntrId, VgprB32}})
1695 .Div(B64, {{VgprB64}, {IntrId, VgprB64}});
1696
1697 addRulesForIOpcs({amdgcn_sin, amdgcn_cos}, Standard)
1698 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16}})
1699 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16}})
1700 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1701 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}});
1702
1703 addRulesForIOpcs({amdgcn_trig_preop}, Standard)
1704 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr32}})
1705 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr32}});
1706
1707 addRulesForIOpcs({amdgcn_ds_add_gs_reg_rtn, amdgcn_ds_sub_gs_reg_rtn},
1708 Standard)
1709 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}})
1710 .Div(S64, {{Vgpr64}, {IntrId, Vgpr32}});
1711
1712 addRulesForIOpcs({amdgcn_ds_append, amdgcn_ds_consume}, Standard)
1713 .Uni(S32, {{UniInVgprS32}, {IntrId, SgprB32_M0}})
1714 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0}});
1715
1716 addRulesForIOpcs(
1717 {amdgcn_ds_bvh_stack_rtn, amdgcn_ds_bvh_stack_push4_pop1_rtn}, Standard)
1718 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV4S32}});
1719
1720 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop1_rtn}, Standard)
1721 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
1722
1723 addRulesForIOpcs({amdgcn_ds_bvh_stack_push8_pop2_rtn}, Standard)
1724 .Div(S64, {{Vgpr64, Vgpr32}, {IntrId, Vgpr32, Vgpr32, VgprV8S32}});
1725
1726 addRulesForIOpcs({amdgcn_ds_ordered_add, amdgcn_ds_ordered_swap}, Standard)
1727 .Div(S32, {{Vgpr32}, {IntrId, SgprB32_M0, Vgpr32}});
1728
1729 addRulesForIOpcs({amdgcn_ds_swizzle}, Standard)
1730 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32}})
1731 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32}});
1732
1733 addRulesForIOpcs({amdgcn_permlane16_var, amdgcn_permlanex16_var}, Standard)
1734 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1735
1736 addRulesForIOpcs({amdgcn_permlane16_swap, amdgcn_permlane32_swap}, Standard)
1737 .Div(S32, {{Vgpr32, Vgpr32}, {IntrId, Vgpr32, Vgpr32}});
1738
1739 addRulesForIOpcs({amdgcn_permlane64}, StandardB)
1740 .Div(B32, {{VgprB32}, {IntrId, VgprB32}});
1741
1742 addRulesForIOpcs({amdgcn_ds_read_tr4_b64, amdgcn_ds_read_tr8_b64})
1743 .Any({{DivV2S32}, {{VgprV2S32}, {IntrId, VgprP3}}});
1744
1745 addRulesForIOpcs({amdgcn_ds_read_tr6_b96})
1746 .Any({{DivV3S32}, {{VgprV3S32}, {IntrId, VgprP3}}});
1747
1748 addRulesForIOpcs({amdgcn_ds_read_tr16_b64})
1749 .Any({{DivV4S16}, {{VgprV4S16}, {IntrId, VgprP3}}});
1750
1751 addRulesForIOpcs({amdgcn_interp_inreg_p10, amdgcn_interp_inreg_p2,
1752 amdgcn_interp_inreg_p10_f16, amdgcn_interp_p10_rtz_f16},
1753 Standard)
1754 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1755 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1756
1757 addRulesForIOpcs({amdgcn_interp_inreg_p2_f16, amdgcn_interp_p2_rtz_f16},
1758 Standard)
1759 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1760 .Div(S16, {{Vgpr16}, {IntrId, Vgpr32, Vgpr32, Vgpr32}});
1761
1762 addRulesForIOpcs({amdgcn_div_fmas}, Standard)
1763 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
1764 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32, Vcc}})
1765 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}})
1766 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64, Vcc}});
1767
1768 addRulesForIOpcs({amdgcn_div_fixup}, Standard)
1769 .Div(S16, {{Vgpr16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1770 .Uni(S16, {{UniInVgprS16}, {IntrId, Vgpr16, Vgpr16, Vgpr16}})
1771 .Div(S32, {{Vgpr32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1772 .Uni(S32, {{UniInVgprS32}, {IntrId, Vgpr32, Vgpr32, Vgpr32}})
1773 .Div(S64, {{Vgpr64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}})
1774 .Uni(S64, {{UniInVgprS64}, {IntrId, Vgpr64, Vgpr64, Vgpr64}});
1775
1776 addRulesForIOpcs({amdgcn_div_scale}, Standard)
1777 .Div(S32, {{Vgpr32, Vcc}, {IntrId, Vgpr32, Vgpr32}})
1778 .Uni(S32, {{UniInVgprS32, UniInVcc}, {IntrId, Vgpr32, Vgpr32}})
1779 .Div(S64, {{Vgpr64, Vcc}, {IntrId, Vgpr64, Vgpr64}})
1780 .Uni(S64, {{UniInVgprS64, UniInVcc}, {IntrId, Vgpr64, Vgpr64}});
1781
1782 addRulesForIOpcs({amdgcn_udot2, amdgcn_sdot2}, Standard)
1784 .Div(S32, {{Vgpr32}, {IntrId, VgprV2S16, VgprV2S16, Vgpr32}});
1785
1786 addRulesForIOpcs({amdgcn_sudot4, amdgcn_sudot8}, Standard)
1787 .Uni(S32, {{UniInVgprS32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}})
1788 .Div(S32, {{Vgpr32}, {IntrId, Imm, Vgpr32, Imm, Vgpr32, Vgpr32}});
1789
1790} // end initialize rules
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU address space definition.
Contains the definition of a TargetInstrInfo class that is common to all AMD GPUs.
constexpr LLT S16
constexpr LLT S1
constexpr LLT V2S16
constexpr LLT S32
constexpr LLT V4S32
constexpr LLT V3S32
constexpr LLT S64
constexpr LLT V2S32
constexpr LLT S128
UniformityLLTOpPredicateID LLTToBId(LLT Ty)
bool matchUniformityAndLLT(Register Reg, UniformityLLTOpPredicateID UniID, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI)
UniformityLLTOpPredicateID LLTToId(LLT Ty)
AMD GCN specific subclass of TargetSubtarget.
Declares convenience wrapper classes for interpreting MachineInstr instances as specific generic oper...
#define _
IRTranslator LLVM IR MI
Register Reg
Register const TargetRegisterInfo * TRI
Machine IR instance of the generic uniformity analysis.
bool operator()(const MachineInstr &MI) const
Predicate operator||(const Predicate &RHS) const
Predicate operator&&(const Predicate &RHS) const
Predicate(std::function< bool(const MachineInstr &)> Pred)
Predicate operator!() const
RegBankLegalizeRules(const GCNSubtarget &ST, MachineRegisterInfo &MRI)
const SetOfRulesForOpcode * getRulesForOpc(MachineInstr &MI) const
const RegBankLLTMapping * findMappingForMI(const MachineInstr &MI, const MachineRegisterInfo &MRI, const MachineUniformityInfo &MUI) const
void addFastRuleDivergent(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
void addFastRuleUniform(UniformityLLTOpPredicateID Ty, RegBankLLTMapping RuleApplyIDs)
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:676
bool isSigned() const
Definition InstrTypes.h:930
bool isDivergent(ConstValueRefT V) const
Whether V is divergent at its definition.
bool isUniform(ConstValueRefT V) const
Whether V is uniform/non-divergent.
bool isEquality() const
Return true if this predicate is either EQ or NE.
static constexpr LLT scalar(unsigned SizeInBits)
Get a low-level scalar or aggregate "bag of bits".
static constexpr LLT pointer(unsigned AddressSpace, unsigned SizeInBits)
Get a low-level pointer in the given address space.
constexpr TypeSize getSizeInBits() const
Returns the total size of the type. Must only be called on sized types.
static constexpr LLT fixed_vector(unsigned NumElements, unsigned ScalarSizeInBits)
Get a low-level fixed-width vector of some number of elements and element width.
TypeSize getValue() const
Representation of each machine instruction.
A description of a memory reference used in the backend.
LocationSize getSize() const
Return the size in bytes of the memory reference.
unsigned getAddrSpace() const
LLVM_ABI Align getAlign() const
Return the minimum known alignment in bytes of the actual memory reference.
MachineOperand class - Representation of each machine instruction operand.
bool isReg() const
isReg - Tests if this is a MO_Register operand.
Register getReg() const
getReg - Returns the register number.
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLT getType(Register Reg) const
Get the low-level type of Reg or LLT{} if Reg is not a generic (target independent) virtual register.
const TargetRegisterInfo * getTargetRegisterInfo() const
Wrapper class representing virtual and physical registers.
Definition Register.h:20
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void swap(SmallVectorImpl &RHS)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
@ CONSTANT_ADDRESS_32BIT
Address space for 32-bit constant memory.
@ CONSTANT_ADDRESS
Address space for constant memory (VTX2).
bool isAnyPtr(LLT Ty, unsigned Width)
bool isUniformMMO(const MachineMemOperand *MMO)
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
This is an optimization pass for GlobalISel generic memory operations.
GenericUniformityInfo< MachineSSAContext > MachineUniformityInfo
static const MachineMemOperand::Flags MONoClobber
Mark the MMO of a uniform load if there are no potentially clobbering stores on any path from the sta...
Definition SIInstrInfo.h:44
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
SmallVector< UniformityLLTOpPredicateID, 4 > OpUniformityAndTypes
PredicateMapping(std::initializer_list< UniformityLLTOpPredicateID > OpList, std::function< bool(const MachineInstr &)> TestFunc=nullptr)
bool match(const MachineInstr &MI, const MachineUniformityInfo &MUI, const MachineRegisterInfo &MRI) const
std::function< bool(const MachineInstr &)> TestFunc
RegBankLLTMapping(std::initializer_list< RegBankLLTMappingApplyID > DstOpMappingList, std::initializer_list< RegBankLLTMappingApplyID > SrcOpMappingList, LoweringMethodID LoweringMethod=DoNotLower)
SmallVector< RegBankLLTMappingApplyID, 2 > DstOpMapping
SmallVector< RegBankLLTMappingApplyID, 4 > SrcOpMapping
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39