LLVM 23.0.0git
AArch64TargetTransformInfo.cpp
Go to the documentation of this file.
1//===-- AArch64TargetTransformInfo.cpp - AArch64 specific TTI -------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "AArch64ExpandImm.h"
14#include "llvm/ADT/DenseMap.h"
22#include "llvm/IR/Intrinsics.h"
23#include "llvm/IR/IntrinsicsAArch64.h"
25#include "llvm/Support/Debug.h"
30#include <algorithm>
31#include <optional>
32using namespace llvm;
33using namespace llvm::PatternMatch;
34
35#define DEBUG_TYPE "aarch64tti"
36
// Hidden boolean command-line option "enable-falkor-hwpf-unroll-fix";
// initialised to true.
37static cl::opt<bool> EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix",
38 cl::init(true), cl::Hidden);
39
41 "sve-prefer-fixed-over-scalable-if-equal", cl::Hidden);
42
43static cl::opt<unsigned> SVEGatherOverhead("sve-gather-overhead", cl::init(10),
45
// Hidden unsigned command-line option "sve-scatter-overhead"; initialised to
// 10. NOTE(review): presumably an extra cost applied to SVE scatter operations
// by the cost model -- the use site is not visible here, confirm.
46static cl::opt<unsigned> SVEScatterOverhead("sve-scatter-overhead",
47 cl::init(10), cl::Hidden);
48
// Hidden unsigned command-line option "sve-tail-folding-insn-threshold";
// initialised to 15. NOTE(review): presumably an instruction-count threshold
// consulted when deciding on tail-folding -- the use site is not visible here,
// confirm.
49static cl::opt<unsigned> SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold",
50 cl::init(15), cl::Hidden);
51
53 NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10),
55
57 "call-penalty-sm-change", cl::init(5), cl::Hidden,
59 "Penalty of calling a function that requires a change to PSTATE.SM"));
60
62 "inline-call-penalty-sm-change", cl::init(10), cl::Hidden,
63 cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"));
64
// Hidden boolean command-line option "enable-aarch64-or-like-select";
// initialised to true.
65static cl::opt<bool> EnableOrLikeSelectOpt("enable-aarch64-or-like-select",
66 cl::init(true), cl::Hidden);
67
// Hidden boolean command-line option "enable-aarch64-lsr-cost-opt";
// initialised to true.
68static cl::opt<bool> EnableLSRCostOpt("enable-aarch64-lsr-cost-opt",
69 cl::init(true), cl::Hidden);
70
71// A complete guess as to a reasonable cost.
73 BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden,
74 cl::desc("The cost of a histcnt instruction"));
75
77 "dmb-lookahead-threshold", cl::init(10), cl::Hidden,
78 cl::desc("The number of instructions to search for a redundant dmb"));
79
81 "aarch64-force-unroll-threshold", cl::init(0), cl::Hidden,
82 cl::desc("Threshold for forced unrolling of small loops in AArch64"));
83
84namespace {
85class TailFoldingOption {
86 // These bitfields will only ever be set to something non-zero in operator=,
87 // when setting the -sve-tail-folding option. This option should always be of
88 // the form (default|simple|all|disabled)[+(Flag1|Flag2|etc)], where here
89 // InitialBits is one of (disabled|all|simple). EnableBits represents
90 // additional flags we're enabling, and DisableBits for those flags we're
91 // disabling. The default flag is tracked in the variable NeedsDefault, since
92 // at the time of setting the option we may not know what the default value
93 // for the CPU is.
97
98 // This value needs to be initialised to true in case the user does not
99 // explicitly set the -sve-tail-folding option.
100 bool NeedsDefault = true;
101
102 void setInitialBits(TailFoldingOpts Bits) { InitialBits = Bits; }
103
104 void setNeedsDefault(bool V) { NeedsDefault = V; }
105
106 void setEnableBit(TailFoldingOpts Bit) {
107 EnableBits |= Bit;
108 DisableBits &= ~Bit;
109 }
110
111 void setDisableBit(TailFoldingOpts Bit) {
112 EnableBits &= ~Bit;
113 DisableBits |= Bit;
114 }
115
116 TailFoldingOpts getBits(TailFoldingOpts DefaultBits) const {
117 TailFoldingOpts Bits = TailFoldingOpts::Disabled;
118
119 assert((InitialBits == TailFoldingOpts::Disabled || !NeedsDefault) &&
120 "Initial bits should only include one of "
121 "(disabled|all|simple|default)");
122 Bits = NeedsDefault ? DefaultBits : InitialBits;
123 Bits |= EnableBits;
124 Bits &= ~DisableBits;
125
126 return Bits;
127 }
128
129 void reportError(std::string Opt) {
130 errs() << "invalid argument '" << Opt
131 << "' to -sve-tail-folding=; the option should be of the form\n"
132 " (disabled|all|default|simple)[+(reductions|recurrences"
133 "|reverse|noreductions|norecurrences|noreverse)]\n";
134 report_fatal_error("Unrecognised tail-folding option");
135 }
136
137public:
138
139 void operator=(const std::string &Val) {
140 // If the user explicitly sets -sve-tail-folding= then treat as an error.
141 if (Val.empty()) {
142 reportError("");
143 return;
144 }
145
146 // Since the user is explicitly setting the option we don't automatically
147 // need the default unless they require it.
148 setNeedsDefault(false);
149
150 SmallVector<StringRef, 4> TailFoldTypes;
151 StringRef(Val).split(TailFoldTypes, '+', -1, false);
152
153 unsigned StartIdx = 1;
154 if (TailFoldTypes[0] == "disabled")
155 setInitialBits(TailFoldingOpts::Disabled);
156 else if (TailFoldTypes[0] == "all")
157 setInitialBits(TailFoldingOpts::All);
158 else if (TailFoldTypes[0] == "default")
159 setNeedsDefault(true);
160 else if (TailFoldTypes[0] == "simple")
161 setInitialBits(TailFoldingOpts::Simple);
162 else {
163 StartIdx = 0;
164 setInitialBits(TailFoldingOpts::Disabled);
165 }
166
167 for (unsigned I = StartIdx; I < TailFoldTypes.size(); I++) {
168 if (TailFoldTypes[I] == "reductions")
169 setEnableBit(TailFoldingOpts::Reductions);
170 else if (TailFoldTypes[I] == "recurrences")
171 setEnableBit(TailFoldingOpts::Recurrences);
172 else if (TailFoldTypes[I] == "reverse")
173 setEnableBit(TailFoldingOpts::Reverse);
174 else if (TailFoldTypes[I] == "noreductions")
175 setDisableBit(TailFoldingOpts::Reductions);
176 else if (TailFoldTypes[I] == "norecurrences")
177 setDisableBit(TailFoldingOpts::Recurrences);
178 else if (TailFoldTypes[I] == "noreverse")
179 setDisableBit(TailFoldingOpts::Reverse);
180 else
181 reportError(Val);
182 }
183 }
184
185 bool satisfies(TailFoldingOpts DefaultBits, TailFoldingOpts Required) const {
186 return (getBits(DefaultBits) & Required) == Required;
187 }
188};
189} // namespace
190
// File-level TailFoldingOption object. NOTE(review): presumably the storage
// bound to the "sve-tail-folding" option declared just below; the line that
// would show that binding is not visible in this listing -- confirm.
191TailFoldingOption TailFoldingOptionLoc;
192
194 "sve-tail-folding",
195 cl::desc(
196 "Control the use of vectorisation using tail-folding for SVE where the"
197 " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:"
198 "\ndisabled (Initial) No loop types will vectorize using "
199 "tail-folding"
200 "\ndefault (Initial) Uses the default tail-folding settings for "
201 "the target CPU"
202 "\nall (Initial) All legal loop types will vectorize using "
203 "tail-folding"
204 "\nsimple (Initial) Use tail-folding for simple loops (not "
205 "reductions or recurrences)"
206 "\nreductions Use tail-folding for loops containing reductions"
207 "\nnoreductions Inverse of above"
208 "\nrecurrences Use tail-folding for loops containing fixed order "
209 "recurrences"
210 "\nnorecurrences Inverse of above"
211 "\nreverse Use tail-folding for loops requiring reversed "
212 "predicates"
213 "\nnoreverse Inverse of above"),
215
216// Experimental option that will only be fully functional when the
217// code-generator is changed to use SVE instead of NEON for all fixed-width
218// operations.
220 "enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
221
222// Experimental option that will only be fully functional when the cost-model
223// and code-generator have been changed to avoid using scalable vector
224// instructions that are not legal in streaming SVE mode.
226 "enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden);
227
228static bool isSMEABIRoutineCall(const CallInst &CI,
229 const AArch64TargetLowering &TLI) {
230 const auto *F = CI.getCalledFunction();
231 return F &&
233}
234
235/// Returns true if the function has explicit operations that can only be
236/// lowered using incompatible instructions for the selected mode. This also
237/// returns true if the function F may use or modify ZA state.
239 const AArch64TargetLowering &TLI) {
240 for (const BasicBlock &BB : *F) {
241 for (const Instruction &I : BB) {
242 // Be conservative for now and assume that any call to inline asm or to
243 // intrinsics could result in non-streaming ops (e.g. calls to
244 // @llvm.aarch64.* or @llvm.gather/scatter intrinsics). We can assume that
245 // all native LLVM instructions can be lowered to compatible instructions.
246 if (isa<CallInst>(I) && !I.isDebugOrPseudoInst() &&
247 (cast<CallInst>(I).isInlineAsm() || isa<IntrinsicInst>(I) ||
249 return true;
250 }
251 }
252 return false;
253}
254
256 SmallVectorImpl<StringRef> &Features) {
257 StringRef AttributeStr =
258 TTI->isMultiversionedFunction(F) ? "fmv-features" : "target-features";
259 StringRef FeatureStr = F.getFnAttribute(AttributeStr).getValueAsString();
260 FeatureStr.split(Features, ",");
261}
262
265 extractAttrFeatures(F, this, Features);
266 return AArch64::getCpuSupportsMask(Features);
267}
268
271 extractAttrFeatures(F, this, Features);
272 return AArch64::getFMVPriority(Features);
273}
274
276 return F.hasFnAttribute("fmv-features");
277}
278
280 const Function *Callee) const {
281 SMECallAttrs CallAttrs(*Caller, *Callee);
282
283 // Never inline a function explicitly marked as being streaming,
284 // into a non-streaming function. Assume it was marked as streaming
285 // for a reason.
286 if (CallAttrs.caller().hasNonStreamingInterfaceAndBody() &&
288 return false;
289
290 // When inlining, we should consider the body of the function, not the
291 // interface.
292 if (CallAttrs.callee().hasStreamingBody()) {
293 CallAttrs.callee().set(SMEAttrs::SM_Compatible, false);
294 CallAttrs.callee().set(SMEAttrs::SM_Enabled, true);
295 }
296
297 if (CallAttrs.callee().isNewZA() || CallAttrs.callee().isNewZT0())
298 return false;
299
300 if (CallAttrs.requiresLazySave() || CallAttrs.requiresSMChange() ||
301 CallAttrs.requiresPreservingZT0() ||
302 CallAttrs.requiresPreservingAllZAState()) {
303 if (hasPossibleIncompatibleOps(Callee, *getTLI()))
304 return false;
305 }
306
307 return BaseT::areInlineCompatible(Caller, Callee);
308}
309
311 const Function *Callee,
312 ArrayRef<Type *> Types) const {
313 if (!BaseT::areTypesABICompatible(Caller, Callee, Types))
314 return false;
315
316 // We need to ensure that argument promotion does not attempt to promote
317 // pointers to fixed-length vector types larger than 128 bits like
318 // <8 x float> (and pointers to aggregate types which have such fixed-length
319 // vector type members) into the values of the pointees. Such vector types
320 // are used for SVE VLS but there is no ABI for SVE VLS arguments and the
321 // backend cannot lower such value arguments. The 128-bit fixed-length SVE
322 // types can be safely treated as 128-bit NEON types and they cannot be
323 // distinguished in IR.
324 if (ST->useSVEForFixedLengthVectors() && llvm::any_of(Types, [](Type *Ty) {
325 auto FVTy = dyn_cast<FixedVectorType>(Ty);
326 return FVTy &&
327 FVTy->getScalarSizeInBits() * FVTy->getNumElements() > 128;
328 }))
329 return false;
330
331 return true;
332}
333
334unsigned
336 unsigned DefaultCallPenalty) const {
337 // This function calculates a penalty for executing Call in F.
338 //
339 // There are two ways this function can be called:
340 // (1) F:
341 // call from F -> G (the call here is Call)
342 //
343 // For (1), Call.getCaller() == F, so it will always return a high cost if
344 // a streaming-mode change is required (thus promoting the need to inline the
345 // function)
346 //
347 // (2) F:
348 // call from F -> G (the call here is not Call)
349 // G:
350 // call from G -> H (the call here is Call)
351 //
352 // For (2), if after inlining the body of G into F the call to H requires a
353 // streaming-mode change, and the call to G from F would also require a
354 // streaming-mode change, then there is benefit to do the streaming-mode
355 // change only once and avoid inlining of G into F.
356
357 SMEAttrs FAttrs(*F);
358 SMECallAttrs CallAttrs(Call, &getTLI()->getRuntimeLibcallsInfo());
359
360 if (SMECallAttrs(FAttrs, CallAttrs.callee()).requiresSMChange()) {
361 if (F == Call.getCaller()) // (1)
362 return CallPenaltyChangeSM * DefaultCallPenalty;
363 if (SMECallAttrs(FAttrs, CallAttrs.caller()).requiresSMChange()) // (2)
364 return InlineCallPenaltyChangeSM * DefaultCallPenalty;
365 }
366
367 return DefaultCallPenalty;
368}
369
373
374 if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable())
375 return true;
376
378 ST->isSVEorStreamingSVEAvailable() &&
379 !ST->disableMaximizeScalableBandwidth();
380}
381
382/// Calculate the cost of materializing a 64-bit value. This helper
383/// method might only calculate a fraction of a larger immediate. Therefore it
384/// is valid to return a cost of ZERO.
386 // Check if the immediate can be encoded within an instruction.
387 if (Val == 0 || AArch64_AM::isLogicalImmediate(Val, 64))
388 return 0;
389
390 if (Val < 0)
391 Val = ~Val;
392
393 // Calculate how many moves we will need to materialize this constant.
395 AArch64_IMM::expandMOVImm(Val, 64, Insn);
396 return Insn.size();
397}
398
399/// Calculate the cost of materializing the given constant.
403 assert(Ty->isIntegerTy());
404
405 unsigned BitSize = Ty->getPrimitiveSizeInBits();
406 if (BitSize == 0)
407 return ~0U;
408
409 // Sign-extend all constants to a multiple of 64-bit.
410 APInt ImmVal = Imm;
411 if (BitSize & 0x3f)
412 ImmVal = Imm.sext((BitSize + 63) & ~0x3fU);
413
414 // Split the constant into 64-bit chunks and calculate the cost for each
415 // chunk.
417 for (unsigned ShiftVal = 0; ShiftVal < BitSize; ShiftVal += 64) {
418 APInt Tmp = ImmVal.ashr(ShiftVal).sextOrTrunc(64);
419 int64_t Val = Tmp.getSExtValue();
420 Cost += getIntImmCost(Val);
421 }
422 // We need at least one instruction to materialize the constant.
423 return std::max<InstructionCost>(1, Cost);
424}
425
427 const APInt &Imm, Type *Ty,
429 Instruction *Inst) const {
430 assert(Ty->isIntegerTy());
431
432 unsigned BitSize = Ty->getPrimitiveSizeInBits();
433 // There is no cost model for constants with a bit size of 0. Return TCC_Free
434 // here, so that constant hoisting will ignore this constant.
435 if (BitSize == 0)
436 return TTI::TCC_Free;
437
438 unsigned ImmIdx = ~0U;
439 switch (Opcode) {
440 default:
441 return TTI::TCC_Free;
442 case Instruction::GetElementPtr:
443 // Always hoist the base address of a GetElementPtr.
444 if (Idx == 0)
445 return 2 * TTI::TCC_Basic;
446 return TTI::TCC_Free;
447 case Instruction::Store:
448 ImmIdx = 0;
449 break;
450 case Instruction::Add:
451 case Instruction::Sub:
452 case Instruction::Mul:
453 case Instruction::UDiv:
454 case Instruction::SDiv:
455 case Instruction::URem:
456 case Instruction::SRem:
457 case Instruction::And:
458 case Instruction::Or:
459 case Instruction::Xor:
460 case Instruction::ICmp:
461 ImmIdx = 1;
462 break;
463 // Always return TCC_Free for the shift value of a shift instruction.
464 case Instruction::Shl:
465 case Instruction::LShr:
466 case Instruction::AShr:
467 if (Idx == 1)
468 return TTI::TCC_Free;
469 break;
470 case Instruction::Trunc:
471 case Instruction::ZExt:
472 case Instruction::SExt:
473 case Instruction::IntToPtr:
474 case Instruction::PtrToInt:
475 case Instruction::BitCast:
476 case Instruction::PHI:
477 case Instruction::Call:
478 case Instruction::Select:
479 case Instruction::Ret:
480 case Instruction::Load:
481 break;
482 }
483
484 if (Idx == ImmIdx) {
485 int NumConstants = (BitSize + 63) / 64;
487 return (Cost <= NumConstants * TTI::TCC_Basic)
488 ? static_cast<int>(TTI::TCC_Free)
489 : Cost;
490 }
492}
493
496 const APInt &Imm, Type *Ty,
498 assert(Ty->isIntegerTy());
499
500 unsigned BitSize = Ty->getPrimitiveSizeInBits();
501 // There is no cost model for constants with a bit size of 0. Return TCC_Free
502 // here, so that constant hoisting will ignore this constant.
503 if (BitSize == 0)
504 return TTI::TCC_Free;
505
506 // Most (all?) AArch64 intrinsics do not support folding immediates into the
507 // selected instruction, so we compute the materialization cost for the
508 // immediate directly.
509 if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
511
512 switch (IID) {
513 default:
514 return TTI::TCC_Free;
515 case Intrinsic::sadd_with_overflow:
516 case Intrinsic::uadd_with_overflow:
517 case Intrinsic::ssub_with_overflow:
518 case Intrinsic::usub_with_overflow:
519 case Intrinsic::smul_with_overflow:
520 case Intrinsic::umul_with_overflow:
521 if (Idx == 1) {
522 int NumConstants = (BitSize + 63) / 64;
524 return (Cost <= NumConstants * TTI::TCC_Basic)
525 ? static_cast<int>(TTI::TCC_Free)
526 : Cost;
527 }
528 break;
529 case Intrinsic::experimental_stackmap:
530 if ((Idx < 2) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
531 return TTI::TCC_Free;
532 break;
533 case Intrinsic::experimental_patchpoint_void:
534 case Intrinsic::experimental_patchpoint:
535 if ((Idx < 4) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
536 return TTI::TCC_Free;
537 break;
538 case Intrinsic::experimental_gc_statepoint:
539 if ((Idx < 5) || (Imm.getBitWidth() <= 64 && isInt<64>(Imm.getSExtValue())))
540 return TTI::TCC_Free;
541 break;
542 }
544}
545
547AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) const {
548 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
549 if (TyWidth == 32 || TyWidth == 64)
551 // TODO: AArch64TargetLowering::LowerCTPOP() supports 128bit popcount.
552 return TTI::PSK_Software;
553}
554
556 // MispredictPenalty is defined per-CPU in AArch64Sched*.td (e.g.,
557 // AArch64SchedNeoverseV2.td).
558 return ST->getSchedModel().MispredictPenalty;
559}
560
561static bool isUnpackedVectorVT(EVT VecVT) {
562 return VecVT.isScalableVector() &&
564}
565
567 const IntrinsicCostAttributes &ICA) {
568 // We need to know at least the number of elements in the vector of buckets
569 // and the size of each element to update.
570 if (ICA.getArgTypes().size() < 2)
572
573 // Only interested in costing for the hardware instruction from SVE2.
574 if (!ST->hasSVE2())
576
577 Type *BucketPtrsTy = ICA.getArgTypes()[0]; // Type of vector of pointers
578 Type *EltTy = ICA.getArgTypes()[1]; // Type of bucket elements
579 unsigned TotalHistCnts = 1;
580
581 unsigned EltSize = EltTy->getScalarSizeInBits();
582 // Only allow (up to 64b) integers or pointers
583 if ((!EltTy->isIntegerTy() && !EltTy->isPointerTy()) || EltSize > 64)
585
586 // FIXME: We should be able to generate histcnt for fixed-length vectors
587 // using ptrue with a specific VL.
588 if (VectorType *VTy = dyn_cast<VectorType>(BucketPtrsTy)) {
589 unsigned EC = VTy->getElementCount().getKnownMinValue();
590 if (!isPowerOf2_64(EC) || !VTy->isScalableTy())
592
593 // HistCnt only supports 32b and 64b element types
594 unsigned LegalEltSize = EltSize <= 32 ? 32 : 64;
595
596 if (EC == 2 || (LegalEltSize == 32 && EC == 4))
598
599 unsigned NaturalVectorWidth = AArch64::SVEBitsPerBlock / LegalEltSize;
600 TotalHistCnts = EC / NaturalVectorWidth;
601
602 return InstructionCost(BaseHistCntCost * TotalHistCnts);
603 }
604
606}
607
611 // The code-generator is currently not able to handle scalable vectors
612 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
613 // it. This change will be removed when code-generation for these types is
614 // sufficiently reliable.
615 auto *RetTy = ICA.getReturnType();
616 if (auto *VTy = dyn_cast<ScalableVectorType>(RetTy))
617 if (VTy->getElementCount() == ElementCount::getScalable(1))
619
620 switch (ICA.getID()) {
621 case Intrinsic::experimental_vector_histogram_add: {
622 InstructionCost HistCost = getHistogramCost(ST, ICA);
623 // If the cost isn't valid, we may still be able to scalarize
624 if (HistCost.isValid())
625 return HistCost;
626 break;
627 }
628 case Intrinsic::clmul: {
629 auto LT = getTypeLegalizationCost(RetTy);
630
631 // PMUL v8i8/v16i8 is always available on AArch64
632 if (ST->hasNEON()) {
633 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
634 return LT.first;
635
636 // Scalar i8 lowers through scalar/vector moves around PMUL.
637 if (TLI->getValueType(DL, RetTy, true) == MVT::i8) {
638 auto *VecTy =
639 FixedVectorType::get(Type::getInt8Ty(RetTy->getContext()), 8);
640 return 1 +
641 getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
642 -1, nullptr, nullptr) *
643 2 +
644 getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
645 -1, nullptr, nullptr);
646 }
647 }
648
649 if (LT.second.SimpleTy == MVT::nxv2i64)
650 if (ST->hasSVEAES() && (ST->isSVEAvailable() || ST->hasSSVE_AES()))
651 return LT.first * 3;
652
653 if (ST->hasSVE2() || ST->hasSME()) {
654 switch (LT.second.SimpleTy) {
655 case MVT::nxv16i8:
656 return LT.first;
657 case MVT::nxv8i16:
658 return LT.first * 6;
659 case MVT::nxv4i32:
660 return LT.first * 3;
661 case MVT::nxv2i64:
662 return LT.first * 8;
663 default:
664 break;
665 }
666 }
667
668 // Avoid +sve giving this cost 2 due to custom lowering: It's very slow
669 if (LT.second.SimpleTy == MVT::nxv2i64)
670 return 192;
671
672 if (ST->hasAES()) {
673 switch (LT.second.SimpleTy) {
674 case MVT::i16:
675 case MVT::i32:
676 case MVT::i64:
677 case MVT::i128: {
678 auto *VecTy =
679 FixedVectorType::get(Type::getInt64Ty(RetTy->getContext()), 1);
680 return LT.first *
681 (1 +
682 getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
683 -1, nullptr, nullptr) *
684 2 +
685 getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind,
686 -1, nullptr, nullptr));
687 }
688 case MVT::v1i64:
689 return LT.first;
690 case MVT::v2i64:
691 return LT.first * 3;
692 case MVT::v2i32:
693 return LT.first * 6;
694 case MVT::v4i32:
695 return LT.first * 11;
696 case MVT::v4i16:
697 return LT.first * 14;
698 default:
699 break;
700 }
701 }
702 break;
703 }
704 case Intrinsic::umin:
705 case Intrinsic::umax:
706 case Intrinsic::smin:
707 case Intrinsic::smax: {
708 static const auto ValidMinMaxTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
709 MVT::v8i16, MVT::v2i32, MVT::v4i32,
710 MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32,
711 MVT::nxv2i64};
712 auto LT = getTypeLegalizationCost(RetTy);
713 // v2i64 types get converted to cmp+bif hence the cost of 2
714 if (LT.second == MVT::v2i64)
715 return LT.first * 2;
716 if (any_of(ValidMinMaxTys, equal_to(LT.second)))
717 return LT.first;
718 break;
719 }
720 case Intrinsic::scmp:
721 case Intrinsic::ucmp: {
722 static const CostTblEntry BitreverseTbl[] = {
723 {Intrinsic::scmp, MVT::i32, 3}, // cmp+cset+csinv
724 {Intrinsic::scmp, MVT::i64, 3}, // cmp+cset+csinv
725 {Intrinsic::scmp, MVT::v8i8, 3}, // cmgt+cmgt+sub
726 {Intrinsic::scmp, MVT::v16i8, 3}, // cmgt+cmgt+sub
727 {Intrinsic::scmp, MVT::v4i16, 3}, // cmgt+cmgt+sub
728 {Intrinsic::scmp, MVT::v8i16, 3}, // cmgt+cmgt+sub
729 {Intrinsic::scmp, MVT::v2i32, 3}, // cmgt+cmgt+sub
730 {Intrinsic::scmp, MVT::v4i32, 3}, // cmgt+cmgt+sub
731 {Intrinsic::scmp, MVT::v1i64, 3}, // cmgt+cmgt+sub
732 {Intrinsic::scmp, MVT::v2i64, 3}, // cmgt+cmgt+sub
733 };
734 const auto LT = getTypeLegalizationCost(RetTy);
735 const auto *Entry =
736 CostTableLookup(BitreverseTbl, Intrinsic::scmp, LT.second);
737 if (Entry)
738 return Entry->Cost * LT.first;
739 break;
740 }
741 case Intrinsic::sadd_sat:
742 case Intrinsic::ssub_sat:
743 case Intrinsic::uadd_sat:
744 case Intrinsic::usub_sat: {
745 static const auto ValidSatTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
746 MVT::v8i16, MVT::v2i32, MVT::v4i32,
747 MVT::v2i64};
748 auto LT = getTypeLegalizationCost(RetTy);
749 // This is a base cost of 1 for the vadd, plus 3 extract shifts if we
750 // need to extend the type, as it uses shr(qadd(shl, shl)).
751 unsigned Instrs =
752 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1 : 4;
753 if (any_of(ValidSatTys, equal_to(LT.second)))
754 return LT.first * Instrs;
755
757 uint64_t VectorSize = TS.getKnownMinValue();
758
759 if (ST->isSVEAvailable() && VectorSize >= 128 && isPowerOf2_64(VectorSize))
760 return LT.first * Instrs;
761
762 break;
763 }
764 case Intrinsic::abs: {
765 static const auto ValidAbsTys = {MVT::v8i8, MVT::v16i8, MVT::v4i16,
766 MVT::v8i16, MVT::v2i32, MVT::v4i32,
767 MVT::v2i64, MVT::nxv16i8, MVT::nxv8i16,
768 MVT::nxv4i32, MVT::nxv2i64};
769 auto LT = getTypeLegalizationCost(RetTy);
770 if (any_of(ValidAbsTys, equal_to(LT.second)))
771 return LT.first;
772 break;
773 }
774 case Intrinsic::bswap: {
775 static const auto ValidAbsTys = {MVT::v4i16, MVT::v8i16, MVT::v2i32,
776 MVT::v4i32, MVT::v2i64};
777 auto LT = getTypeLegalizationCost(RetTy);
778 if (any_of(ValidAbsTys, equal_to(LT.second)) &&
779 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits())
780 return LT.first;
781 break;
782 }
783 case Intrinsic::fma:
784 case Intrinsic::fmuladd: {
785 // Given an fma or fmuladd, cost it the same as an fmul instruction, since
786 // their costs are usually the same. TODO: Add fp16 and bf16 expansion costs.
787 Type *EltTy = RetTy->getScalarType();
788 if (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
789 (EltTy->isHalfTy() && ST->hasFullFP16()))
790 return getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
791 break;
792 }
793 case Intrinsic::stepvector: {
794 InstructionCost Cost = 1; // Cost of the `index' instruction
795 auto LT = getTypeLegalizationCost(RetTy);
796 // Legalisation of illegal vectors involves an `index' instruction plus
797 // (LT.first - 1) vector adds.
798 if (LT.first > 1) {
799 Type *LegalVTy = EVT(LT.second).getTypeForEVT(RetTy->getContext());
800 InstructionCost AddCost =
801 getArithmeticInstrCost(Instruction::Add, LegalVTy, CostKind);
802 Cost += AddCost * (LT.first - 1);
803 }
804 return Cost;
805 }
806 case Intrinsic::vector_extract:
807 case Intrinsic::vector_insert: {
808 // If both the vector and subvector types are legal types and the index
809 // is 0, then this should be a no-op or simple operation; return a
810 // relatively low cost.
811
812 // If arguments aren't actually supplied, then we cannot determine the
813 // value of the index. We also want to skip predicate types.
814 if (ICA.getArgs().size() != ICA.getArgTypes().size() ||
816 break;
817
818 LLVMContext &C = RetTy->getContext();
819 EVT VecVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
820 bool IsExtract = ICA.getID() == Intrinsic::vector_extract;
821 EVT SubVecVT = IsExtract ? getTLI()->getValueType(DL, RetTy)
822 : getTLI()->getValueType(DL, ICA.getArgTypes()[1]);
823 // Skip this if either the vector or subvector types are unpacked
824 // SVE types; they may get lowered to stack stores and loads.
825 if (isUnpackedVectorVT(VecVT) || isUnpackedVectorVT(SubVecVT))
826 break;
827
829 getTLI()->getTypeConversion(C, SubVecVT);
831 getTLI()->getTypeConversion(C, VecVT);
832 const Value *Idx = IsExtract ? ICA.getArgs()[1] : ICA.getArgs()[2];
833 const ConstantInt *CIdx = cast<ConstantInt>(Idx);
834 if (SubVecLK.first == TargetLoweringBase::TypeLegal &&
835 VecLK.first == TargetLoweringBase::TypeLegal && CIdx->isZero())
836 return TTI::TCC_Free;
837 break;
838 }
839 case Intrinsic::bitreverse: {
840 static const CostTblEntry BitreverseTbl[] = {
841 {Intrinsic::bitreverse, MVT::i32, 1},
842 {Intrinsic::bitreverse, MVT::i64, 1},
843 {Intrinsic::bitreverse, MVT::v8i8, 1},
844 {Intrinsic::bitreverse, MVT::v16i8, 1},
845 {Intrinsic::bitreverse, MVT::v4i16, 2},
846 {Intrinsic::bitreverse, MVT::v8i16, 2},
847 {Intrinsic::bitreverse, MVT::v2i32, 2},
848 {Intrinsic::bitreverse, MVT::v4i32, 2},
849 {Intrinsic::bitreverse, MVT::v1i64, 2},
850 {Intrinsic::bitreverse, MVT::v2i64, 2},
851 };
852 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
853 const auto *Entry =
854 CostTableLookup(BitreverseTbl, ICA.getID(), LegalisationCost.second);
855 if (Entry) {
856 // The cost model uses the legal type (i32) that i8 and i16 are converted
857 // to, so add 1 for those types to match the actual lowering cost.
858 if (TLI->getValueType(DL, RetTy, true) == MVT::i8 ||
859 TLI->getValueType(DL, RetTy, true) == MVT::i16)
860 return LegalisationCost.first * Entry->Cost + 1;
861
862 return LegalisationCost.first * Entry->Cost;
863 }
864 break;
865 }
866 case Intrinsic::ctpop: {
867 if (!ST->hasNEON()) {
868 // 32-bit or 64-bit ctpop without NEON is 12 instructions.
869 return getTypeLegalizationCost(RetTy).first * 12;
870 }
871 static const CostTblEntry CtpopCostTbl[] = {
872 {ISD::CTPOP, MVT::v2i64, 4},
873 {ISD::CTPOP, MVT::v4i32, 3},
874 {ISD::CTPOP, MVT::v8i16, 2},
875 {ISD::CTPOP, MVT::v16i8, 1},
876 {ISD::CTPOP, MVT::i64, 4},
877 {ISD::CTPOP, MVT::v2i32, 3},
878 {ISD::CTPOP, MVT::v4i16, 2},
879 {ISD::CTPOP, MVT::v8i8, 1},
880 {ISD::CTPOP, MVT::i32, 5},
881 // SVE types (For targets that override NEON for fixed length vectors)
882 {ISD::CTPOP, MVT::nxv2i64, 1},
883 {ISD::CTPOP, MVT::nxv4i32, 1},
884 {ISD::CTPOP, MVT::nxv8i16, 1},
885 {ISD::CTPOP, MVT::nxv16i8, 1},
886 };
887 auto LT = getTypeLegalizationCost(RetTy);
888 MVT MTy = LT.second;
889
890 // When SVE is available CNT will be used for fixed and scalable vectors.
891 if (ST->isSVEorStreamingSVEAvailable() && MTy.isFixedLengthVector())
893 128 / MTy.getScalarSizeInBits());
894
895 if (const auto *Entry = CostTableLookup(CtpopCostTbl, ISD::CTPOP, MTy)) {
896 // Extra cost of +1 when illegal vector types are legalized by promoting
897 // the integer type.
898 int ExtraCost = MTy.isVector() && MTy.getScalarSizeInBits() !=
899 RetTy->getScalarSizeInBits()
900 ? 1
901 : 0;
902 return LT.first * Entry->Cost + ExtraCost;
903 }
904 break;
905 }
906 case Intrinsic::sadd_with_overflow:
907 case Intrinsic::uadd_with_overflow:
908 case Intrinsic::ssub_with_overflow:
909 case Intrinsic::usub_with_overflow:
910 case Intrinsic::smul_with_overflow:
911 case Intrinsic::umul_with_overflow: {
912 static const CostTblEntry WithOverflowCostTbl[] = {
913 {Intrinsic::sadd_with_overflow, MVT::i8, 3},
914 {Intrinsic::uadd_with_overflow, MVT::i8, 3},
915 {Intrinsic::sadd_with_overflow, MVT::i16, 3},
916 {Intrinsic::uadd_with_overflow, MVT::i16, 3},
917 {Intrinsic::sadd_with_overflow, MVT::i32, 1},
918 {Intrinsic::uadd_with_overflow, MVT::i32, 1},
919 {Intrinsic::sadd_with_overflow, MVT::i64, 1},
920 {Intrinsic::uadd_with_overflow, MVT::i64, 1},
921 {Intrinsic::ssub_with_overflow, MVT::i8, 3},
922 {Intrinsic::usub_with_overflow, MVT::i8, 3},
923 {Intrinsic::ssub_with_overflow, MVT::i16, 3},
924 {Intrinsic::usub_with_overflow, MVT::i16, 3},
925 {Intrinsic::ssub_with_overflow, MVT::i32, 1},
926 {Intrinsic::usub_with_overflow, MVT::i32, 1},
927 {Intrinsic::ssub_with_overflow, MVT::i64, 1},
928 {Intrinsic::usub_with_overflow, MVT::i64, 1},
929 {Intrinsic::smul_with_overflow, MVT::i8, 5},
930 {Intrinsic::umul_with_overflow, MVT::i8, 4},
931 {Intrinsic::smul_with_overflow, MVT::i16, 5},
932 {Intrinsic::umul_with_overflow, MVT::i16, 4},
933 {Intrinsic::smul_with_overflow, MVT::i32, 2}, // eg umull;tst
934 {Intrinsic::umul_with_overflow, MVT::i32, 2}, // eg umull;cmp sxtw
935 {Intrinsic::smul_with_overflow, MVT::i64, 3}, // eg mul;smulh;cmp
936 {Intrinsic::umul_with_overflow, MVT::i64, 3}, // eg mul;umulh;cmp asr
937 };
938 EVT MTy = TLI->getValueType(DL, RetTy->getContainedType(0), true);
939 if (MTy.isSimple())
940 if (const auto *Entry = CostTableLookup(WithOverflowCostTbl, ICA.getID(),
941 MTy.getSimpleVT()))
942 return Entry->Cost;
943 break;
944 }
945 case Intrinsic::fptosi_sat:
946 case Intrinsic::fptoui_sat: {
947 if (ICA.getArgTypes().empty())
948 break;
949 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
950 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
951 EVT MTy = TLI->getValueType(DL, RetTy);
952 // Check for the legal types, which are where the size of the input and the
953 // output are the same, or we are using cvt f64->i32 or f32->i64.
954 if ((LT.second == MVT::f32 || LT.second == MVT::f64 ||
955 LT.second == MVT::v2f32 || LT.second == MVT::v4f32 ||
956 LT.second == MVT::v2f64)) {
957 if ((LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits() ||
958 (LT.second == MVT::f64 && MTy == MVT::i32) ||
959 (LT.second == MVT::f32 && MTy == MVT::i64)))
960 return LT.first;
961 // Extending vector types v2f32->v2i64, fcvtl*2 + fcvt*2
962 if (LT.second.getScalarType() == MVT::f32 && MTy.isFixedLengthVector() &&
963 MTy.getScalarSizeInBits() == 64)
964 return LT.first * (MTy.getVectorNumElements() > 2 ? 4 : 2);
965 }
966 // Similarly for fp16 sizes. Without FullFP16 we generally need to fcvt to
967 // f32.
968 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
969 return LT.first + getIntrinsicInstrCost(
970 {ICA.getID(),
971 RetTy,
972 {ICA.getArgTypes()[0]->getWithNewType(
973 Type::getFloatTy(RetTy->getContext()))}},
974 CostKind);
975 if ((LT.second == MVT::f16 && MTy == MVT::i32) ||
976 (LT.second == MVT::f16 && MTy == MVT::i64) ||
977 ((LT.second == MVT::v4f16 || LT.second == MVT::v8f16) &&
978 (LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())))
979 return LT.first;
980 // Extending vector types v8f16->v8i32, fcvtl*2 + fcvt*2
981 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
982 MTy.getScalarSizeInBits() == 32)
983 return LT.first * (MTy.getVectorNumElements() > 4 ? 4 : 2);
984 // Extending vector types v8f16->v8i64. These currently scalarize but the
985 // codegen could be better.
986 if (LT.second.getScalarType() == MVT::f16 && MTy.isFixedLengthVector() &&
987 MTy.getScalarSizeInBits() == 64)
988 return MTy.getVectorNumElements() * 3;
989
990 // If we can we use a legal convert followed by a min+max
991 if ((LT.second.getScalarType() == MVT::f32 ||
992 LT.second.getScalarType() == MVT::f64 ||
993 LT.second.getScalarType() == MVT::f16) &&
994 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
995 Type *LegalTy =
996 Type::getIntNTy(RetTy->getContext(), LT.second.getScalarSizeInBits());
997 if (LT.second.isVector())
998 LegalTy = VectorType::get(LegalTy, LT.second.getVectorElementCount());
1000 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
1001 : Intrinsic::umin,
1002 LegalTy, {LegalTy, LegalTy});
1004 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
1005 : Intrinsic::umax,
1006 LegalTy, {LegalTy, LegalTy});
1008 return LT.first * Cost +
1009 ((LT.second.getScalarType() != MVT::f16 || ST->hasFullFP16()) ? 0
1010 : 1);
1011 }
1012 // Otherwise we need to follow the default expansion that clamps the value
1013 // using a float min/max with a fcmp+sel for nan handling when signed.
1014 Type *FPTy = ICA.getArgTypes()[0]->getScalarType();
1015 RetTy = RetTy->getScalarType();
1016 if (LT.second.isVector()) {
1017 FPTy = VectorType::get(FPTy, LT.second.getVectorElementCount());
1018 RetTy = VectorType::get(RetTy, LT.second.getVectorElementCount());
1019 }
1020 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
1022 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
1024 Cost +=
1025 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1026 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
1027 if (IsSigned) {
1028 Type *CondTy = RetTy->getWithNewBitWidth(1);
1029 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
1031 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1033 }
1034 return LT.first * Cost;
1035 }
1036 case Intrinsic::fshl:
1037 case Intrinsic::fshr: {
1038 if (ICA.getArgs().empty())
1039 break;
1040
1041 const TTI::OperandValueInfo OpInfoZ = TTI::getOperandInfo(ICA.getArgs()[2]);
1042
1043 // ROTR / ROTL is a funnel shift with equal first and second operand. For
1044 // ROTR on integer registers (i32/i64) this can be done in a single ror
1045 // instruction. A fshl with a non-constant shift uses a neg + ror.
1046 if (RetTy->isIntegerTy() && ICA.getArgs()[0] == ICA.getArgs()[1] &&
1047 (RetTy->getPrimitiveSizeInBits() == 32 ||
1048 RetTy->getPrimitiveSizeInBits() == 64)) {
1049 InstructionCost NegCost =
1050 (ICA.getID() == Intrinsic::fshl && !OpInfoZ.isConstant()) ? 1 : 0;
1051 return 1 + NegCost;
1052 }
1053
1054 // TODO: Add handling for fshl where third argument is not a constant.
1055 if (!OpInfoZ.isConstant())
1056 break;
1057
1058 const auto LegalisationCost = getTypeLegalizationCost(RetTy);
1059 if (OpInfoZ.isUniform()) {
1060 static const CostTblEntry FshlTbl[] = {
1061 {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
1062 {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
1063 {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
1064 {Intrinsic::fshl, MVT::v8i8, 2}, {Intrinsic::fshl, MVT::v4i16, 2}};
1065 // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
1066 // to avoid having to duplicate the costs.
1067 const auto *Entry =
1068 CostTableLookup(FshlTbl, Intrinsic::fshl, LegalisationCost.second);
1069 if (Entry)
1070 return LegalisationCost.first * Entry->Cost;
1071 }
1072
1073 auto TyL = getTypeLegalizationCost(RetTy);
1074 if (!RetTy->isIntegerTy())
1075 break;
1076
1077 // Estimate cost manually, as types like i8 and i16 will get promoted to
1078 // i32 and CostTableLookup will ignore the extra conversion cost.
1079 bool HigherCost = (RetTy->getScalarSizeInBits() != 32 &&
1080 RetTy->getScalarSizeInBits() < 64) ||
1081 (RetTy->getScalarSizeInBits() % 64 != 0);
1082 unsigned ExtraCost = HigherCost ? 1 : 0;
1083 if (RetTy->getScalarSizeInBits() == 32 ||
1084 RetTy->getScalarSizeInBits() == 64)
1085 ExtraCost = 0; // fhsl/fshr for i32 and i64 can be lowered to a single
1086 // extr instruction.
1087 else if (HigherCost)
1088 ExtraCost = 1;
1089 else
1090 break;
1091 return TyL.first + ExtraCost;
1092 }
1093 case Intrinsic::get_active_lane_mask: {
1094 auto RetTy = cast<VectorType>(ICA.getReturnType());
1095 EVT RetVT = getTLI()->getValueType(DL, RetTy);
1096 EVT OpVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1097 if (getTLI()->shouldExpandGetActiveLaneMask(RetVT, OpVT))
1098 break;
1099
1100 if (RetTy->isScalableTy()) {
1101 if (TLI->getTypeAction(RetTy->getContext(), RetVT) !=
1103 break;
1104
1105 auto LT = getTypeLegalizationCost(RetTy);
1106 InstructionCost Cost = LT.first;
1107 // When SVE2p1 or SME2 is available, we can halve getTypeLegalizationCost
1108 // as get_active_lane_mask may lower to the sve_whilelo_x2 intrinsic, e.g.
1109 // nxv32i1 = get_active_lane_mask(base, idx) ->
1110 // {nxv16i1, nxv16i1} = sve_whilelo_x2(base, idx)
1111 if (ST->hasSVE2p1() || ST->hasSME2()) {
1112 Cost /= 2;
1113 if (Cost == 1)
1114 return Cost;
1115 }
1116
1117 // If more than one whilelo intrinsic is required, include the extra cost
1118 // required by the saturating add & select required to increment the
1119 // start value after the first intrinsic call.
1120 Type *OpTy = ICA.getArgTypes()[0];
1121 IntrinsicCostAttributes AddAttrs(Intrinsic::uadd_sat, OpTy, {OpTy, OpTy});
1122 InstructionCost SplitCost = getIntrinsicInstrCost(AddAttrs, CostKind);
1123 Type *CondTy = OpTy->getWithNewBitWidth(1);
1124 SplitCost += getCmpSelInstrCost(Instruction::Select, OpTy, CondTy,
1126 return Cost + (SplitCost * (Cost - 1));
1127 } else if (!getTLI()->isTypeLegal(RetVT)) {
1128 // We don't have enough context at this point to determine if the mask
1129 // is going to be kept live after the block, which will force the vXi1
1130 // type to be expanded to legal vectors of integers, e.g. v4i1->v4i32.
1131 // For now, we just assume the vectorizer created this intrinsic and
1132 // the result will be the input for a PHI. In this case the cost will
1133 // be extremely high for fixed-width vectors.
1134 // NOTE: getScalarizationOverhead returns a cost that's far too
1135 // pessimistic for the actual generated codegen. In reality there are
1136 // two instructions generated per lane.
1137 return cast<FixedVectorType>(RetTy)->getNumElements() * 2;
1138 }
1139 break;
1140 }
1141 case Intrinsic::experimental_vector_match: {
1142 auto *NeedleTy = cast<FixedVectorType>(ICA.getArgTypes()[1]);
1143 EVT SearchVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1144 unsigned SearchSize = NeedleTy->getNumElements();
1145 if (!getTLI()->shouldExpandVectorMatch(SearchVT, SearchSize)) {
1146 // Base cost for MATCH instructions. At least on the Neoverse V2 and
1147 // Neoverse V3, these are cheap operations with the same latency as a
1148 // vector ADD. In most cases, however, we also need to do an extra DUP.
1149 // For fixed-length vectors we currently need an extra five--six
1150 // instructions besides the MATCH.
1152 if (isa<FixedVectorType>(RetTy))
1153 Cost += 10;
1154 return Cost;
1155 }
1156 break;
1157 }
1158 case Intrinsic::cttz: {
1159 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1160 if (LT.second == MVT::v8i8 || LT.second == MVT::v16i8)
1161 return LT.first * 2;
1162 if (LT.second == MVT::v4i16 || LT.second == MVT::v8i16 ||
1163 LT.second == MVT::v2i32 || LT.second == MVT::v4i32)
1164 return LT.first * 3;
1165 break;
1166 }
1167 case Intrinsic::experimental_cttz_elts: {
1168 EVT ArgVT = getTLI()->getValueType(DL, ICA.getArgTypes()[0]);
1169 if (!getTLI()->shouldExpandCttzElements(ArgVT)) {
1170 // This will consist of a SVE brkb and a cntp instruction. These
1171 // typically have the same latency and half the throughput as a vector
1172 // add instruction.
1173 return 4;
1174 }
1175 break;
1176 }
1177 case Intrinsic::loop_dependence_raw_mask:
1178 case Intrinsic::loop_dependence_war_mask: {
1179 // The whilewr/rw instructions require SVE2 or SME.
1180 if (ST->hasSVE2() || ST->hasSME()) {
1181 EVT VecVT = getTLI()->getValueType(DL, RetTy);
1182 unsigned EltSizeInBytes =
1183 cast<ConstantInt>(ICA.getArgs()[2])->getZExtValue();
1184 if (!is_contained({1u, 2u, 4u, 8u}, EltSizeInBytes) ||
1185 VecVT.getVectorMinNumElements() != (16 / EltSizeInBytes))
1186 break;
1187 // For fixed-vector types we need to AND the mask with a ptrue vl<N>.
1188 return isa<FixedVectorType>(RetTy) ? 2 : 1;
1189 }
1190 break;
1191 }
1192 case Intrinsic::experimental_vector_extract_last_active:
1193 if (ST->isSVEorStreamingSVEAvailable()) {
1194 auto [LegalCost, _] = getTypeLegalizationCost(ICA.getArgTypes()[0]);
1195 // This should turn into chained clastb instructions.
1196 return LegalCost;
1197 }
1198 break;
1199 case Intrinsic::pow: {
1200 // For scalar calls we know the target has the libcall, and for fixed-width
1201 // vectors we know for the worst case it can be scalarised.
1202 EVT VT = getTLI()->getValueType(DL, RetTy);
1203 RTLIB::Libcall LC = RTLIB::getPOW(VT);
1204 bool HasLibcall = getTLI()->getLibcallImpl(LC) != RTLIB::Unsupported;
1205 bool CanLowerWithLibcalls = !isa<ScalableVectorType>(RetTy) || HasLibcall;
1206
1207 // If we know that the call can be lowered with libcalls then it's safe to
1208 // reduce the costs in some cases. This is important for scalable vectors,
1209 // since we cannot scalarize the call in the absence of a vector math
1210 // library.
1211 if (CanLowerWithLibcalls && ICA.getInst() && !ICA.getArgs().empty()) {
1212 // If we know the fast math flags and the exponent is a constant then the
1213 // cost may be less for some exponents like 0.25 and 0.75.
1214 const Constant *ExpC = dyn_cast<Constant>(ICA.getArgs()[1]);
1215 if (ExpC && isa<VectorType>(ExpC->getType()))
1216 ExpC = ExpC->getSplatValue();
1217 if (auto *ExpF = dyn_cast_or_null<ConstantFP>(ExpC)) {
1218 // The argument must be a FP constant.
1219 bool Is025 = ExpF->getValueAPF().isExactlyValue(0.25);
1220 bool Is075 = ExpF->getValueAPF().isExactlyValue(0.75);
1221 FastMathFlags FMF = ICA.getInst()->getFastMathFlags();
1222 if ((Is025 || Is075) && FMF.noInfs() && FMF.approxFunc() &&
1223 (!Is025 || FMF.noSignedZeros())) {
1224 IntrinsicCostAttributes Attrs(Intrinsic::sqrt, RetTy, {RetTy}, FMF);
1226 if (Is025)
1227 return 2 * Sqrt;
1229 getArithmeticInstrCost(Instruction::FMul, RetTy, CostKind);
1230 return (Sqrt * 2) + FMul;
1231 }
1232 // TODO: For 1/3 exponents we expect the cbrt call to be slightly
1233 // cheaper than pow.
1234 }
1235 }
1236
1237 if (HasLibcall)
1238 return getCallInstrCost(nullptr, RetTy, ICA.getArgTypes(), CostKind);
1239 break;
1240 }
1241 case Intrinsic::sqrt:
1242 case Intrinsic::fabs:
1243 case Intrinsic::ceil:
1244 case Intrinsic::floor:
1245 case Intrinsic::nearbyint:
1246 case Intrinsic::round:
1247 case Intrinsic::rint:
1248 case Intrinsic::roundeven:
1249 case Intrinsic::trunc:
1250 case Intrinsic::minnum:
1251 case Intrinsic::maxnum:
1252 case Intrinsic::minimum:
1253 case Intrinsic::maximum: {
1254 if (isa<ScalableVectorType>(RetTy) && ST->isSVEorStreamingSVEAvailable()) {
1255 auto LT = getTypeLegalizationCost(RetTy);
1256 return LT.first;
1257 }
1258 break;
1259 }
1260 default:
1261 break;
1262 }
1264}
1265
1266/// The function will remove redundant reinterprets casting in the presence
1267/// of the control flow
1268static std::optional<Instruction *> processPhiNode(InstCombiner &IC,
1269 IntrinsicInst &II) {
1271 auto RequiredType = II.getType();
1272
1273 auto *PN = dyn_cast<PHINode>(II.getArgOperand(0));
1274 assert(PN && "Expected Phi Node!");
1275
1276 // Don't create a new Phi unless we can remove the old one.
1277 if (!PN->hasOneUse())
1278 return std::nullopt;
1279
1280 for (Value *IncValPhi : PN->incoming_values()) {
1281 auto *Reinterpret = dyn_cast<IntrinsicInst>(IncValPhi);
1282 if (!Reinterpret ||
1283 Reinterpret->getIntrinsicID() !=
1284 Intrinsic::aarch64_sve_convert_to_svbool ||
1285 RequiredType != Reinterpret->getArgOperand(0)->getType())
1286 return std::nullopt;
1287 }
1288
1289 // Create the new Phi
1290 IC.Builder.SetInsertPoint(PN);
1291 PHINode *NPN = IC.Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
1292 Worklist.push_back(PN);
1293
1294 for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
1295 auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
1296 NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
1297 Worklist.push_back(Reinterpret);
1298 }
1299
1300 // Cleanup Phi Node and reinterprets
1301 return IC.replaceInstUsesWith(II, NPN);
1302}
1303
1304// A collection of properties common to SVE intrinsics that allow for combines
1305// to be written without needing to know the specific intrinsic.
1307 //
1308 // Helper routines for common intrinsic definitions.
1309 //
1310
1311 // e.g. llvm.aarch64.sve.add pg, op1, op2
1312 // with IID ==> llvm.aarch64.sve.add_u
1313 static SVEIntrinsicInfo
1320
1321 // e.g. llvm.aarch64.sve.neg inactive, pg, op
1328
1329 // e.g. llvm.aarch64.sve.fcvtnt inactive, pg, op
1335
1336 // e.g. llvm.aarch64.sve.add_u pg, op1, op2
1342
1343 // e.g. llvm.aarch64.sve.prf pg, ptr (GPIndex = 0)
1344 // llvm.aarch64.sve.st1 data, pg, ptr (GPIndex = 1)
1345 static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex) {
1346 return SVEIntrinsicInfo()
1349 }
1350
1351 // e.g. llvm.aarch64.sve.cmpeq pg, op1, op2
1352 // llvm.aarch64.sve.ld1 pg, ptr
1359
1360 // All properties relate to predication and thus having a general predicate
1361 // is the minimum requirement to say there is intrinsic info to act on.
1362 explicit operator bool() const { return hasGoverningPredicate(); }
1363
1364 //
1365 // Properties relating to the governing predicate.
1366 //
1367
1369 return GoverningPredicateIdx != std::numeric_limits<unsigned>::max();
1370 }
1371
1373 assert(hasGoverningPredicate() && "Propery not set!");
1374 return GoverningPredicateIdx;
1375 }
1376
1378 assert(!hasGoverningPredicate() && "Cannot set property twice!");
1379 GoverningPredicateIdx = Index;
1380 return *this;
1381 }
1382
1383 //
1384 // Properties relating to operations the intrinsic could be transformed into.
1385 // NOTE: This does not mean such a transformation is always possible, but the
1386 // knowledge makes it possible to reuse existing optimisations without needing
1387 // to embed specific handling for each intrinsic. For example, instruction
1388 // simplification can be used to optimise an intrinsic's active lanes.
1389 //
1390
1392 return UndefIntrinsic != Intrinsic::not_intrinsic;
1393 }
1394
1396 assert(hasMatchingUndefIntrinsic() && "Propery not set!");
1397 return UndefIntrinsic;
1398 }
1399
1401 assert(!hasMatchingUndefIntrinsic() && "Cannot set property twice!");
1402 UndefIntrinsic = IID;
1403 return *this;
1404 }
1405
1406 bool hasMatchingIROpode() const { return IROpcode != 0; }
1407
1408 unsigned getMatchingIROpode() const {
1409 assert(hasMatchingIROpode() && "Propery not set!");
1410 return IROpcode;
1411 }
1412
1414 assert(!hasMatchingIROpode() && "Cannot set property twice!");
1415 IROpcode = Opcode;
1416 return *this;
1417 }
1418
1419 //
1420 // Properties relating to the result of inactive lanes.
1421 //
1422
1424 return ResultLanes == InactiveLanesTakenFromOperand;
1425 }
1426
1428 assert(inactiveLanesTakenFromOperand() && "Propery not set!");
1429 return OperandIdxForInactiveLanes;
1430 }
1431
1433 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1434 ResultLanes = InactiveLanesTakenFromOperand;
1435 OperandIdxForInactiveLanes = Index;
1436 return *this;
1437 }
1438
1440 return ResultLanes == InactiveLanesAreNotDefined;
1441 }
1442
1444 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1445 ResultLanes = InactiveLanesAreNotDefined;
1446 return *this;
1447 }
1448
1450 return ResultLanes == InactiveLanesAreUnused;
1451 }
1452
1454 assert(ResultLanes == Uninitialized && "Cannot set property twice!");
1455 ResultLanes = InactiveLanesAreUnused;
1456 return *this;
1457 }
1458
1459 // NOTE: Whilst not limited to only inactive lanes, the common use case is:
1460 // inactiveLanesAreZeroed =
1461 // resultIsZeroInitialized() && inactiveLanesAreUnused()
1462 bool resultIsZeroInitialized() const { return ResultIsZeroInitialized; }
1463
1465 ResultIsZeroInitialized = true;
1466 return *this;
1467 }
1468
1469 //
1470 // The first operand of unary merging operations is typically only used to
1471 // set the result for inactive lanes. Knowing this allows us to deadcode the
1472 // operand when we can prove there are no inactive lanes.
1473 //
1474
1476 return OperandIdxWithNoActiveLanes != std::numeric_limits<unsigned>::max();
1477 }
1478
1480 assert(hasOperandWithNoActiveLanes() && "Propery not set!");
1481 return OperandIdxWithNoActiveLanes;
1482 }
1483
1485 assert(!hasOperandWithNoActiveLanes() && "Cannot set property twice!");
1486 OperandIdxWithNoActiveLanes = Index;
1487 return *this;
1488 }
1489
1490private:
1491 unsigned GoverningPredicateIdx = std::numeric_limits<unsigned>::max();
1492
1493 Intrinsic::ID UndefIntrinsic = Intrinsic::not_intrinsic;
1494 unsigned IROpcode = 0;
1495
1496 enum PredicationStyle {
1498 InactiveLanesTakenFromOperand,
1499 InactiveLanesAreNotDefined,
1500 InactiveLanesAreUnused
1501 } ResultLanes = Uninitialized;
1502
1503 bool ResultIsZeroInitialized = false;
1504 unsigned OperandIdxForInactiveLanes = std::numeric_limits<unsigned>::max();
1505 unsigned OperandIdxWithNoActiveLanes = std::numeric_limits<unsigned>::max();
1506};
1507
1509 // Some SVE intrinsics do not use scalable vector types, but since they are
1510 // not relevant from an SVEIntrinsicInfo perspective, they are also ignored.
1511 if (!isa<ScalableVectorType>(II.getType()) &&
1512 all_of(II.args(), [&](const Value *V) {
1513 return !isa<ScalableVectorType>(V->getType());
1514 }))
1515 return SVEIntrinsicInfo();
1516
1517 Intrinsic::ID IID = II.getIntrinsicID();
1518 switch (IID) {
1519 default:
1520 break;
1521 case Intrinsic::aarch64_sve_fcvt_bf16f32_v2:
1522 case Intrinsic::aarch64_sve_fcvt_f16f32:
1523 case Intrinsic::aarch64_sve_fcvt_f16f64:
1524 case Intrinsic::aarch64_sve_fcvt_f32f16:
1525 case Intrinsic::aarch64_sve_fcvt_f32f64:
1526 case Intrinsic::aarch64_sve_fcvt_f64f16:
1527 case Intrinsic::aarch64_sve_fcvt_f64f32:
1528 case Intrinsic::aarch64_sve_fcvtlt_f32f16:
1529 case Intrinsic::aarch64_sve_fcvtlt_f64f32:
1530 case Intrinsic::aarch64_sve_fcvtx_f32f64:
1531 case Intrinsic::aarch64_sve_fcvtzs:
1532 case Intrinsic::aarch64_sve_fcvtzs_i32f16:
1533 case Intrinsic::aarch64_sve_fcvtzs_i32f64:
1534 case Intrinsic::aarch64_sve_fcvtzs_i64f16:
1535 case Intrinsic::aarch64_sve_fcvtzs_i64f32:
1536 case Intrinsic::aarch64_sve_fcvtzu:
1537 case Intrinsic::aarch64_sve_fcvtzu_i32f16:
1538 case Intrinsic::aarch64_sve_fcvtzu_i32f64:
1539 case Intrinsic::aarch64_sve_fcvtzu_i64f16:
1540 case Intrinsic::aarch64_sve_fcvtzu_i64f32:
1541 case Intrinsic::aarch64_sve_revb:
1542 case Intrinsic::aarch64_sve_revh:
1543 case Intrinsic::aarch64_sve_revw:
1544 case Intrinsic::aarch64_sve_revd:
1545 case Intrinsic::aarch64_sve_scvtf:
1546 case Intrinsic::aarch64_sve_scvtf_f16i32:
1547 case Intrinsic::aarch64_sve_scvtf_f16i64:
1548 case Intrinsic::aarch64_sve_scvtf_f32i64:
1549 case Intrinsic::aarch64_sve_scvtf_f64i32:
1550 case Intrinsic::aarch64_sve_ucvtf:
1551 case Intrinsic::aarch64_sve_ucvtf_f16i32:
1552 case Intrinsic::aarch64_sve_ucvtf_f16i64:
1553 case Intrinsic::aarch64_sve_ucvtf_f32i64:
1554 case Intrinsic::aarch64_sve_ucvtf_f64i32:
1556
1557 case Intrinsic::aarch64_sve_fcvtnt_bf16f32_v2:
1558 case Intrinsic::aarch64_sve_fcvtnt_f16f32:
1559 case Intrinsic::aarch64_sve_fcvtnt_f32f64:
1560 case Intrinsic::aarch64_sve_fcvtxnt_f32f64:
1562
1563 case Intrinsic::aarch64_sve_fabd:
1564 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fabd_u);
1565 case Intrinsic::aarch64_sve_fadd:
1566 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fadd_u)
1567 .setMatchingIROpcode(Instruction::FAdd);
1568 case Intrinsic::aarch64_sve_fdiv:
1569 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fdiv_u)
1570 .setMatchingIROpcode(Instruction::FDiv);
1571 case Intrinsic::aarch64_sve_fmax:
1572 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmax_u);
1573 case Intrinsic::aarch64_sve_fmaxnm:
1574 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmaxnm_u);
1575 case Intrinsic::aarch64_sve_fmin:
1576 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmin_u);
1577 case Intrinsic::aarch64_sve_fminnm:
1578 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fminnm_u);
1579 case Intrinsic::aarch64_sve_fmla:
1580 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmla_u);
1581 case Intrinsic::aarch64_sve_fmls:
1582 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmls_u);
1583 case Intrinsic::aarch64_sve_fmul:
1584 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmul_u)
1585 .setMatchingIROpcode(Instruction::FMul);
1586 case Intrinsic::aarch64_sve_fmulx:
1587 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fmulx_u);
1588 case Intrinsic::aarch64_sve_fnmla:
1589 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmla_u);
1590 case Intrinsic::aarch64_sve_fnmls:
1591 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fnmls_u);
1592 case Intrinsic::aarch64_sve_fsub:
1593 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_fsub_u)
1594 .setMatchingIROpcode(Instruction::FSub);
1595 case Intrinsic::aarch64_sve_add:
1596 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_add_u)
1597 .setMatchingIROpcode(Instruction::Add);
1598 case Intrinsic::aarch64_sve_mla:
1599 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mla_u);
1600 case Intrinsic::aarch64_sve_mls:
1601 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mls_u);
1602 case Intrinsic::aarch64_sve_mul:
1603 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_mul_u)
1604 .setMatchingIROpcode(Instruction::Mul);
1605 case Intrinsic::aarch64_sve_sabd:
1606 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sabd_u);
1607 case Intrinsic::aarch64_sve_sdiv:
1608 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sdiv_u)
1609 .setMatchingIROpcode(Instruction::SDiv);
1610 case Intrinsic::aarch64_sve_smax:
1611 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smax_u);
1612 case Intrinsic::aarch64_sve_smin:
1613 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smin_u);
1614 case Intrinsic::aarch64_sve_smulh:
1615 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_smulh_u);
1616 case Intrinsic::aarch64_sve_sub:
1617 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sub_u)
1618 .setMatchingIROpcode(Instruction::Sub);
1619 case Intrinsic::aarch64_sve_uabd:
1620 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uabd_u);
1621 case Intrinsic::aarch64_sve_udiv:
1622 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_udiv_u)
1623 .setMatchingIROpcode(Instruction::UDiv);
1624 case Intrinsic::aarch64_sve_umax:
1625 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umax_u);
1626 case Intrinsic::aarch64_sve_umin:
1627 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umin_u);
1628 case Intrinsic::aarch64_sve_umulh:
1629 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_umulh_u);
1630 case Intrinsic::aarch64_sve_asr:
1631 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_asr_u)
1632 .setMatchingIROpcode(Instruction::AShr);
1633 case Intrinsic::aarch64_sve_lsl:
1634 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsl_u)
1635 .setMatchingIROpcode(Instruction::Shl);
1636 case Intrinsic::aarch64_sve_lsr:
1637 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_lsr_u)
1638 .setMatchingIROpcode(Instruction::LShr);
1639 case Intrinsic::aarch64_sve_and:
1640 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_and_u)
1641 .setMatchingIROpcode(Instruction::And);
1642 case Intrinsic::aarch64_sve_bic:
1643 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_bic_u);
1644 case Intrinsic::aarch64_sve_eor:
1645 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_eor_u)
1646 .setMatchingIROpcode(Instruction::Xor);
1647 case Intrinsic::aarch64_sve_orr:
1648 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_orr_u)
1649 .setMatchingIROpcode(Instruction::Or);
1650 case Intrinsic::aarch64_sve_shsub:
1651 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_shsub_u);
1652 case Intrinsic::aarch64_sve_shsubr:
1654 case Intrinsic::aarch64_sve_sqrshl:
1655 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqrshl_u);
1656 case Intrinsic::aarch64_sve_sqshl:
1657 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqshl_u);
1658 case Intrinsic::aarch64_sve_sqsub:
1659 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_sqsub_u);
1660 case Intrinsic::aarch64_sve_srshl:
1661 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_srshl_u);
1662 case Intrinsic::aarch64_sve_uhsub:
1663 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uhsub_u);
1664 case Intrinsic::aarch64_sve_uhsubr:
1666 case Intrinsic::aarch64_sve_uqrshl:
1667 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqrshl_u);
1668 case Intrinsic::aarch64_sve_uqshl:
1669 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqshl_u);
1670 case Intrinsic::aarch64_sve_uqsub:
1671 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_uqsub_u);
1672 case Intrinsic::aarch64_sve_urshl:
1673 return SVEIntrinsicInfo::defaultMergingOp(Intrinsic::aarch64_sve_urshl_u);
1674
1675 case Intrinsic::aarch64_sve_add_u:
1677 Instruction::Add);
1678 case Intrinsic::aarch64_sve_and_u:
1680 Instruction::And);
1681 case Intrinsic::aarch64_sve_asr_u:
1683 Instruction::AShr);
1684 case Intrinsic::aarch64_sve_eor_u:
1686 Instruction::Xor);
1687 case Intrinsic::aarch64_sve_fadd_u:
1689 Instruction::FAdd);
1690 case Intrinsic::aarch64_sve_fdiv_u:
1692 Instruction::FDiv);
1693 case Intrinsic::aarch64_sve_fmul_u:
1695 Instruction::FMul);
1696 case Intrinsic::aarch64_sve_fsub_u:
1698 Instruction::FSub);
1699 case Intrinsic::aarch64_sve_lsl_u:
1701 Instruction::Shl);
1702 case Intrinsic::aarch64_sve_lsr_u:
1704 Instruction::LShr);
1705 case Intrinsic::aarch64_sve_mul_u:
1707 Instruction::Mul);
1708 case Intrinsic::aarch64_sve_orr_u:
1710 Instruction::Or);
1711 case Intrinsic::aarch64_sve_sdiv_u:
1713 Instruction::SDiv);
1714 case Intrinsic::aarch64_sve_sub_u:
1716 Instruction::Sub);
1717 case Intrinsic::aarch64_sve_udiv_u:
1719 Instruction::UDiv);
1720
1721 case Intrinsic::aarch64_sve_addqv:
1722 case Intrinsic::aarch64_sve_and_z:
1723 case Intrinsic::aarch64_sve_bic_z:
1724 case Intrinsic::aarch64_sve_brka_z:
1725 case Intrinsic::aarch64_sve_brkb_z:
1726 case Intrinsic::aarch64_sve_brkn_z:
1727 case Intrinsic::aarch64_sve_brkpa_z:
1728 case Intrinsic::aarch64_sve_brkpb_z:
1729 case Intrinsic::aarch64_sve_cntp:
1730 case Intrinsic::aarch64_sve_compact:
1731 case Intrinsic::aarch64_sve_eor_z:
1732 case Intrinsic::aarch64_sve_eorv:
1733 case Intrinsic::aarch64_sve_eorqv:
1734 case Intrinsic::aarch64_sve_nand_z:
1735 case Intrinsic::aarch64_sve_nor_z:
1736 case Intrinsic::aarch64_sve_orn_z:
1737 case Intrinsic::aarch64_sve_orr_z:
1738 case Intrinsic::aarch64_sve_orv:
1739 case Intrinsic::aarch64_sve_orqv:
1740 case Intrinsic::aarch64_sve_pnext:
1741 case Intrinsic::aarch64_sve_rdffr_z:
1742 case Intrinsic::aarch64_sve_saddv:
1743 case Intrinsic::aarch64_sve_uaddv:
1744 case Intrinsic::aarch64_sve_umaxv:
1745 case Intrinsic::aarch64_sve_umaxqv:
1746 case Intrinsic::aarch64_sve_cmpeq:
1747 case Intrinsic::aarch64_sve_cmpeq_wide:
1748 case Intrinsic::aarch64_sve_cmpge:
1749 case Intrinsic::aarch64_sve_cmpge_wide:
1750 case Intrinsic::aarch64_sve_cmpgt:
1751 case Intrinsic::aarch64_sve_cmpgt_wide:
1752 case Intrinsic::aarch64_sve_cmphi:
1753 case Intrinsic::aarch64_sve_cmphi_wide:
1754 case Intrinsic::aarch64_sve_cmphs:
1755 case Intrinsic::aarch64_sve_cmphs_wide:
1756 case Intrinsic::aarch64_sve_cmple_wide:
1757 case Intrinsic::aarch64_sve_cmplo_wide:
1758 case Intrinsic::aarch64_sve_cmpls_wide:
1759 case Intrinsic::aarch64_sve_cmplt_wide:
1760 case Intrinsic::aarch64_sve_cmpne:
1761 case Intrinsic::aarch64_sve_cmpne_wide:
1762 case Intrinsic::aarch64_sve_facge:
1763 case Intrinsic::aarch64_sve_facgt:
1764 case Intrinsic::aarch64_sve_fcmpeq:
1765 case Intrinsic::aarch64_sve_fcmpge:
1766 case Intrinsic::aarch64_sve_fcmpgt:
1767 case Intrinsic::aarch64_sve_fcmpne:
1768 case Intrinsic::aarch64_sve_fcmpuo:
1769 case Intrinsic::aarch64_sve_ld1:
1770 case Intrinsic::aarch64_sve_ld1_gather:
1771 case Intrinsic::aarch64_sve_ld1_gather_index:
1772 case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
1773 case Intrinsic::aarch64_sve_ld1_gather_sxtw:
1774 case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
1775 case Intrinsic::aarch64_sve_ld1_gather_uxtw:
1776 case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
1777 case Intrinsic::aarch64_sve_ld1q_gather_index:
1778 case Intrinsic::aarch64_sve_ld1q_gather_scalar_offset:
1779 case Intrinsic::aarch64_sve_ld1q_gather_vector_offset:
1780 case Intrinsic::aarch64_sve_ld1ro:
1781 case Intrinsic::aarch64_sve_ld1rq:
1782 case Intrinsic::aarch64_sve_ld1udq:
1783 case Intrinsic::aarch64_sve_ld1uwq:
1784 case Intrinsic::aarch64_sve_ld2_sret:
1785 case Intrinsic::aarch64_sve_ld2q_sret:
1786 case Intrinsic::aarch64_sve_ld3_sret:
1787 case Intrinsic::aarch64_sve_ld3q_sret:
1788 case Intrinsic::aarch64_sve_ld4_sret:
1789 case Intrinsic::aarch64_sve_ld4q_sret:
1790 case Intrinsic::aarch64_sve_ldff1:
1791 case Intrinsic::aarch64_sve_ldff1_gather:
1792 case Intrinsic::aarch64_sve_ldff1_gather_index:
1793 case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
1794 case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
1795 case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
1796 case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
1797 case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
1798 case Intrinsic::aarch64_sve_ldnf1:
1799 case Intrinsic::aarch64_sve_ldnt1:
1800 case Intrinsic::aarch64_sve_ldnt1_gather:
1801 case Intrinsic::aarch64_sve_ldnt1_gather_index:
1802 case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
1803 case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
1805
1806 case Intrinsic::aarch64_sve_prf:
1807 case Intrinsic::aarch64_sve_prfb_gather_index:
1808 case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
1809 case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
1810 case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
1811 case Intrinsic::aarch64_sve_prfd_gather_index:
1812 case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
1813 case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
1814 case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
1815 case Intrinsic::aarch64_sve_prfh_gather_index:
1816 case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
1817 case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
1818 case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
1819 case Intrinsic::aarch64_sve_prfw_gather_index:
1820 case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
1821 case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
1822 case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
1824
1825 case Intrinsic::aarch64_sve_st1_scatter:
1826 case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
1827 case Intrinsic::aarch64_sve_st1_scatter_sxtw:
1828 case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
1829 case Intrinsic::aarch64_sve_st1_scatter_uxtw:
1830 case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
1831 case Intrinsic::aarch64_sve_st1dq:
1832 case Intrinsic::aarch64_sve_st1q_scatter_index:
1833 case Intrinsic::aarch64_sve_st1q_scatter_scalar_offset:
1834 case Intrinsic::aarch64_sve_st1q_scatter_vector_offset:
1835 case Intrinsic::aarch64_sve_st1wq:
1836 case Intrinsic::aarch64_sve_stnt1:
1837 case Intrinsic::aarch64_sve_stnt1_scatter:
1838 case Intrinsic::aarch64_sve_stnt1_scatter_index:
1839 case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
1840 case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
1842 case Intrinsic::aarch64_sve_st2:
1843 case Intrinsic::aarch64_sve_st2q:
1845 case Intrinsic::aarch64_sve_st3:
1846 case Intrinsic::aarch64_sve_st3q:
1848 case Intrinsic::aarch64_sve_st4:
1849 case Intrinsic::aarch64_sve_st4q:
1851 }
1852
1853 return SVEIntrinsicInfo();
1854}
1855
// Returns true when `Pred` is known to have every lane active: after looking
// through the predicate casts matched below, the remaining value must be a
// Constant for which isAllOnesValue() holds.
// NOTE(review): the opening `if (match(...` lines of both cast patterns
// (listing lines 1860 and 1865) are absent from this listing; confirm the
// exact intrinsics being matched against the upstream file.
1856static bool isAllActivePredicate(Value *Pred) {
1857 Value *UncastedPred;
1858
1859 // Look through predicate casts that only remove lanes.
1861 m_Value(UncastedPred)))) {
1862 auto *OrigPredTy = cast<ScalableVectorType>(Pred->getType());
1863 Pred = UncastedPred;
1864
1866 m_Value(UncastedPred))))
1867 // If the predicate has the same or fewer lanes than the uncasted predicate
1868 // then we know the casting has no effect.
1869 if (OrigPredTy->getMinNumElements() <=
1870 cast<ScalableVectorType>(UncastedPred->getType())
1871 ->getMinNumElements())
1872 Pred = UncastedPred;
1873 }
1874
// Only a constant all-ones predicate is treated as "all active".
1875 auto *C = dyn_cast<Constant>(Pred);
1876 return C && C->isAllOnesValue();
1877}
1878
1879// Simplify `V` by only considering the operations that affect active lanes.
1880// This function should only return existing Values or newly created Constants.
//
// The single case handled here is an aarch64_sve_dup call whose operand 1 is
// exactly `Pg` and whose operand 2 is a Constant; every other `V` is returned
// unchanged.
// NOTE(review): the line that opens the return expression (listing line 1885)
// is absent from this listing; presumably it builds a constant splat of the
// dup's operand 2 with V's element count - confirm upstream.
1881static Value *stripInactiveLanes(Value *V, const Value *Pg) {
1882 auto *Dup = dyn_cast<IntrinsicInst>(V);
1883 if (Dup && Dup->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
1884 Dup->getOperand(1) == Pg && isa<Constant>(Dup->getOperand(2)))
1886 cast<VectorType>(V->getType())->getElementCount(),
1887 cast<Constant>(Dup->getOperand(2)));
1888
1889 return V;
1890}
1891
// Tries to simplify the predicated SVE binary-operation intrinsic `II` by
// running simplifyBinOp on its two data operands, using the IR opcode that
// `IInfo` reports for it. Operand 0 is the governing predicate, operands 1
// and 2 are the data operands.
//
// Returns std::nullopt when nothing simplifies (or the simplification would be
// undefined); returns `&II` after swapping operands in place; otherwise
// returns the result of replacing all uses of `II`.
// NOTE(review): the line carrying the function name and leading parameters
// (listing line 1893) and the first line of the condition guarding the
// operand swap (listing line 1904) are absent from this listing.
1892static std::optional<Instruction *>
1894 const SVEIntrinsicInfo &IInfo) {
1895 const unsigned Opc = IInfo.getMatchingIROpode();
1896 assert(Instruction::isBinaryOp(Opc) && "Expected a binary operation!");
1897
1898 Value *Pg = II.getOperand(0);
1899 Value *Op1 = II.getOperand(1);
1900 Value *Op2 = II.getOperand(2);
1901 const DataLayout &DL = II.getDataLayout();
1902
1903 // Canonicalise constants to the RHS.
1905 isa<Constant>(Op1) && !isa<Constant>(Op2)) {
1906 IC.replaceOperand(II, 1, Op2);
1907 IC.replaceOperand(II, 2, Op1);
1908 return &II;
1909 }
1910
1911 // Only active lanes matter when simplifying the operation.
1912 Op1 = stripInactiveLanes(Op1, Pg);
1913 Op2 = stripInactiveLanes(Op2, Pg);
1914
// Forward the call's fast-math flags when it is a floating-point operation.
1915 Value *SimpleII;
1916 if (auto FII = dyn_cast<FPMathOperator>(&II))
1917 SimpleII = simplifyBinOp(Opc, Op1, Op2, FII->getFastMathFlags(), DL);
1918 else
1919 SimpleII = simplifyBinOp(Opc, Op1, Op2, DL);
1920
1921 // An SVE intrinsic's result is always defined. However, this is not the case
1922 // for its equivalent IR instruction (e.g. when shifting by an amount more
1923 // than the data's bitwidth). Simplifications to an undefined result must be
1924 // ignored to preserve the intrinsic's expected behaviour.
1925 if (!SimpleII || isa<UndefValue>(SimpleII))
1926 return std::nullopt;
1927
// With undefined inactive lanes the simplified value can be used directly.
1928 if (IInfo.inactiveLanesAreNotDefined())
1929 return IC.replaceInstUsesWith(II, SimpleII);
1930
1931 Value *Inactive = II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom());
1932
1933 // The intrinsic does nothing (e.g. sve.mul(pg, A, 1.0)).
1934 if (SimpleII == Inactive)
1935 return IC.replaceInstUsesWith(II, SimpleII);
1936
1937 // Inactive lanes must be preserved.
1938 SimpleII = IC.Builder.CreateSelect(Pg, SimpleII, Inactive);
1939 return IC.replaceInstUsesWith(II, SimpleII);
1940}
1941
1942// Use SVE intrinsic info to eliminate redundant operands and/or canonicalise
1943// to operations with less strict inactive lane requirements.
//
// Does nothing (returns std::nullopt) for intrinsics without a governing
// predicate. Otherwise it handles, in order: an all-false predicate, an
// all-active predicate, and finally operation-specific binary-op folding.
// NOTE(review): the line carrying the function name and leading parameters
// (listing line 1945) and three lines inside the body (listing lines 1954,
// 1960 and 1984) are absent from this listing; the conditions and statements
// they hold must be confirmed against the upstream file.
1944static std::optional<Instruction *>
1946 const SVEIntrinsicInfo &IInfo) {
1947 if (!IInfo.hasGoverningPredicate())
1948 return std::nullopt;
1949
1950 auto *OpPredicate = II.getOperand(IInfo.getGoverningPredicateOperandIdx());
1951
1952 // If there are no active lanes.
1953 if (match(OpPredicate, m_ZeroInt())) {
1955 return IC.replaceInstUsesWith(
1956 II, II.getOperand(IInfo.getOperandIdxInactiveLanesTakenFrom()));
1957
1958 if (IInfo.inactiveLanesAreUnused()) {
1959 if (IInfo.resultIsZeroInitialized())
1961
1962 return IC.eraseInstFromFunction(II);
1963 }
1964 }
1965
1966 // If there are no inactive lanes.
1967 if (isAllActivePredicate(OpPredicate)) {
// An operand that has no active lanes is never read; replace it with undef
// (skipped when it already is undef, so the rewrite is not repeated).
1968 if (IInfo.hasOperandWithNoActiveLanes()) {
1969 unsigned OpIdx = IInfo.getOperandIdxWithNoActiveLanes();
1970 if (!isa<UndefValue>(II.getOperand(OpIdx)))
1971 return IC.replaceOperand(II, OpIdx, UndefValue::get(II.getType()));
1972 }
1973
// Retarget the call to the matching "undef" variant of the intrinsic.
1974 if (IInfo.hasMatchingUndefIntrinsic()) {
1975 auto *NewDecl = Intrinsic::getOrInsertDeclaration(
1976 II.getModule(), IInfo.getMatchingUndefIntrinsic(), {II.getType()});
1977 II.setCalledFunction(NewDecl);
1978 return &II;
1979 }
1980 }
1981
1982 // Operation specific simplifications.
1983 if (IInfo.hasMatchingIROpode() &&
1985 return simplifySVEIntrinsicBinOp(IC, II, IInfo);
1986
1987 return std::nullopt;
1988}
1989
1990// (from_svbool (binop (to_svbool pred) (svbool_t _) (svbool_t _)))
1991// => (binop (pred) (from_svbool _) (from_svbool _))
1992//
1993// The above transformation eliminates a `to_svbool` in the predicate
1994// operand of bitwise operation `binop` by narrowing the vector width of
1995// the operation. For example, it would convert a `<vscale x 16 x i1>
1996// and` into a `<vscale x 4 x i1> and`. This is profitable because
1997// to_svbool must zero the new lanes during widening, whereas
1998// from_svbool is free.
//
// Returns std::nullopt unless operand 0 of `II` is one of the zeroing
// predicate logical intrinsics listed in the switch, its governing predicate
// is a convert_to_svbool call, and that call's source type equals II's type.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2000) is absent from this listing; the name used here is taken from
// the call site in the function that follows.
1999static std::optional<Instruction *>
2001 auto BinOp = dyn_cast<IntrinsicInst>(II.getOperand(0));
2002 if (!BinOp)
2003 return std::nullopt;
2004
2005 auto IntrinsicID = BinOp->getIntrinsicID();
2006 switch (IntrinsicID) {
2007 case Intrinsic::aarch64_sve_and_z:
2008 case Intrinsic::aarch64_sve_bic_z:
2009 case Intrinsic::aarch64_sve_eor_z:
2010 case Intrinsic::aarch64_sve_nand_z:
2011 case Intrinsic::aarch64_sve_nor_z:
2012 case Intrinsic::aarch64_sve_orn_z:
2013 case Intrinsic::aarch64_sve_orr_z:
2014 break;
2015 default:
2016 return std::nullopt;
2017 }
2018
2019 auto BinOpPred = BinOp->getOperand(0);
2020 auto BinOpOp1 = BinOp->getOperand(1);
2021 auto BinOpOp2 = BinOp->getOperand(2);
2022
2023 auto PredIntr = dyn_cast<IntrinsicInst>(BinOpPred);
2024 if (!PredIntr ||
2025 PredIntr->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
2026 return std::nullopt;
2027
// The narrow predicate must already have the type this from_svbool produces.
2028 auto PredOp = PredIntr->getOperand(0);
2029 auto PredOpTy = cast<VectorType>(PredOp->getType());
2030 if (PredOpTy != II.getType())
2031 return std::nullopt;
2032
// Build the narrowed operand list; when both data operands are the same value
// a single from_svbool conversion is shared between them.
2033 SmallVector<Value *> NarrowedBinOpArgs = {PredOp};
2034 auto NarrowBinOpOp1 = IC.Builder.CreateIntrinsic(
2035 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp1});
2036 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
2037 if (BinOpOp1 == BinOpOp2)
2038 NarrowedBinOpArgs.push_back(NarrowBinOpOp1);
2039 else
2040 NarrowedBinOpArgs.push_back(IC.Builder.CreateIntrinsic(
2041 Intrinsic::aarch64_sve_convert_from_svbool, {PredOpTy}, {BinOpOp2}));
2042
2043 auto NarrowedBinOp =
2044 IC.Builder.CreateIntrinsic(IntrinsicID, {PredOpTy}, NarrowedBinOpArgs);
2045 return IC.replaceInstUsesWith(II, NarrowedBinOp);
2046}
2047
// Simplifies a predicate conversion `II`: PHI operands are delegated to
// processPhiNode, the from_svbool-of-binop pattern to
// tryCombineFromSVBoolBinOp, and otherwise the chain of to/from svbool
// conversions feeding `II` is walked to find the earliest value that already
// has II's type and can replace it.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2049) is absent from this listing.
2048static std::optional<Instruction *>
2050 // If the reinterpret instruction operand is a PHI Node
2051 if (isa<PHINode>(II.getArgOperand(0)))
2052 return processPhiNode(IC, II);
2053
2054 if (auto BinOpCombine = tryCombineFromSVBoolBinOp(IC, II))
2055 return BinOpCombine;
2056
2057 // Ignore converts to/from svcount_t.
2058 if (isa<TargetExtType>(II.getArgOperand(0)->getType()) ||
2059 isa<TargetExtType>(II.getType()))
2060 return std::nullopt;
2061
// NOTE(review): CandidatesForRemoval is filled during the walk below but is
// not read anywhere in the visible body.
2062 SmallVector<Instruction *, 32> CandidatesForRemoval;
2063 Value *Cursor = II.getOperand(0), *EarliestReplacement = nullptr;
2064
2065 const auto *IVTy = cast<VectorType>(II.getType());
2066
2067 // Walk the chain of conversions.
2068 while (Cursor) {
2069 // If the type of the cursor has fewer lanes than the final result, zeroing
2070 // must take place, which breaks the equivalence chain.
2071 const auto *CursorVTy = cast<VectorType>(Cursor->getType());
2072 if (CursorVTy->getElementCount().getKnownMinValue() <
2073 IVTy->getElementCount().getKnownMinValue())
2074 break;
2075
2076 // If the cursor has the same type as II, it is a viable replacement.
2077 if (Cursor->getType() == IVTy)
2078 EarliestReplacement = Cursor;
2079
2080 auto *IntrinsicCursor = dyn_cast<IntrinsicInst>(Cursor);
2081
2082 // If this is not an SVE conversion intrinsic, this is the end of the chain.
2083 if (!IntrinsicCursor || !(IntrinsicCursor->getIntrinsicID() ==
2084 Intrinsic::aarch64_sve_convert_to_svbool ||
2085 IntrinsicCursor->getIntrinsicID() ==
2086 Intrinsic::aarch64_sve_convert_from_svbool))
2087 break;
2088
2089 CandidatesForRemoval.insert(CandidatesForRemoval.begin(), IntrinsicCursor);
2090 Cursor = IntrinsicCursor->getOperand(0);
2091 }
2092
2093 // If no viable replacement in the conversion chain was found, there is
2094 // nothing to do.
2095 if (!EarliestReplacement)
2096 return std::nullopt;
2097
2098 return IC.replaceInstUsesWith(II, EarliestReplacement);
2099}
2100
2101static std::optional<Instruction *> instCombineSVESel(InstCombiner &IC,
2102 IntrinsicInst &II) {
2103 // svsel(ptrue, x, y) => x
2104 auto *OpPredicate = II.getOperand(0);
2105 if (isAllActivePredicate(OpPredicate))
2106 return IC.replaceInstUsesWith(II, II.getOperand(1));
2107
2108 auto Select =
2109 IC.Builder.CreateSelect(OpPredicate, II.getOperand(1), II.getOperand(2));
2110 return IC.replaceInstUsesWith(II, Select);
2111}
2112
// Rewrites sve.dup(V, Pg, X) into generic IR for two predicate shapes:
// an all-active predicate becomes a vector splat of X, and the pattern
// guarded by the second check becomes an insertelement of X into lane 0 of V.
// Any other predicate leaves the call untouched (std::nullopt).
// NOTE(review): the first line of the second guard (listing line 2125) is
// absent from this listing; from the visible remainder and the comment below
// it presumably matches `Pg` against sve.ptrue(vl1) - confirm upstream.
2113static std::optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
2114 IntrinsicInst &II) {
2115 Value *Pg = II.getOperand(1);
2116
2117 // sve.dup(V, all_active, X) ==> splat(X)
2118 if (isAllActivePredicate(Pg)) {
2119 auto *RetTy = cast<ScalableVectorType>(II.getType());
2120 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2121 II.getArgOperand(2));
2122 return IC.replaceInstUsesWith(II, Splat);
2123 }
2124
2126 m_SpecificInt(AArch64SVEPredPattern::vl1))))
2127 return std::nullopt;
2128
2129 // sve.dup(V, sve.ptrue(vl1), X) ==> insertelement V, X, 0
2130 Value *Insert = IC.Builder.CreateInsertElement(
2131 II.getArgOperand(0), II.getArgOperand(2), uint64_t(0));
2132 return IC.replaceInstUsesWith(II, Insert);
2133}
2134
2135static std::optional<Instruction *> instCombineSVEDupX(InstCombiner &IC,
2136 IntrinsicInst &II) {
2137 // Replace DupX with a regular IR splat.
2138 auto *RetTy = cast<ScalableVectorType>(II.getType());
2139 Value *Splat = IC.Builder.CreateVectorSplat(RetTy->getElementCount(),
2140 II.getArgOperand(0));
2141 Splat->takeName(&II);
2142 return IC.replaceInstUsesWith(II, Splat);
2143}
2144
// Folds a cmpne whose result is fully determined by a constant:
//   cmpne(all_active, dupq_lane(vector.insert(undef, ConstVec, 0), 0), zero)
// The non-zero elements of ConstVec are expanded into a 16-bit, byte-granular
// predicate image. An all-zero image yields the null predicate; otherwise, if
// the image matches an all-true predicate of some element size, the call is
// replaced by that all-true predicate converted to svbool and back to II's
// type. Any mismatch along the way returns std::nullopt.
// NOTE(review): the line initialising SplatValue (listing line 2154) is
// absent from this listing.
2145static std::optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
2146 IntrinsicInst &II) {
2147 LLVMContext &Ctx = II.getContext();
2148
2149 if (!isAllActivePredicate(II.getArgOperand(0)))
2150 return std::nullopt;
2151
2152 // Check that we have a compare of zero..
2153 auto *SplatValue =
2155 if (!SplatValue || !SplatValue->isZero())
2156 return std::nullopt;
2157
2158 // ..against a dupq
2159 auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
2160 if (!DupQLane ||
2161 DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
2162 return std::nullopt;
2163
2164 // Where the dupq is a lane 0 replicate of a vector insert
2165 auto *DupQLaneIdx = dyn_cast<ConstantInt>(DupQLane->getArgOperand(1));
2166 if (!DupQLaneIdx || !DupQLaneIdx->isZero())
2167 return std::nullopt;
2168
2169 auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
2170 if (!VecIns || VecIns->getIntrinsicID() != Intrinsic::vector_insert)
2171 return std::nullopt;
2172
2173 // Where the vector insert is a fixed constant vector insert into undef at
2174 // index zero
2175 if (!isa<UndefValue>(VecIns->getArgOperand(0)))
2176 return std::nullopt;
2177
2178 if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
2179 return std::nullopt;
2180
2181 auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
2182 if (!ConstVec)
2183 return std::nullopt;
2184
// The inserted fixed vector must supply exactly one element per minimum lane
// of the scalable result.
2185 auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
2186 auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
2187 if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
2188 return std::nullopt;
2189
2190 unsigned NumElts = VecTy->getNumElements();
2191 unsigned PredicateBits = 0;
2192
2193 // Expand intrinsic operands to a 16-bit byte level predicate
// NOTE(review): getAggregateElement may return null for some constants, and
// dyn_cast on a null pointer asserts - confirm whether dyn_cast_or_null is
// needed here.
2194 for (unsigned I = 0; I < NumElts; ++I) {
2195 auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
2196 if (!Arg)
2197 return std::nullopt;
2198 if (!Arg->isZero())
2199 PredicateBits |= 1 << (I * (16 / NumElts));
2200 }
2201
2202 // If all bits are zero bail early with an empty predicate
2203 if (PredicateBits == 0) {
2204 auto *PFalse = Constant::getNullValue(II.getType());
2205 PFalse->takeName(&II);
2206 return IC.replaceInstUsesWith(II, PFalse);
2207 }
2208
2209 // Calculate largest predicate type used (where byte predicate is largest)
// OR together the (mod 8) positions of all set bits, seeded with 8; the lowest
// set bit of the result (Mask & -Mask) is then used as the element size in
// bytes.
2210 unsigned Mask = 8;
2211 for (unsigned I = 0; I < 16; ++I)
2212 if ((PredicateBits & (1 << I)) != 0)
2213 Mask |= (I % 8);
2214
2215 unsigned PredSize = Mask & -Mask;
2216 auto *PredType = ScalableVectorType::get(
2217 Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
2218
2219 // Ensure all relevant bits are set
2220 for (unsigned I = 0; I < 16; I += PredSize)
2221 if ((PredicateBits & (1 << I)) == 0)
2222 return std::nullopt;
2223
// Materialise the all-true predicate of PredType and reinterpret it as II's
// type via svbool.
2224 auto *ConvertToSVBool =
2225 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_to_svbool,
2226 PredType, ConstantInt::getTrue(PredType));
2227 auto *ConvertFromSVBool =
2228 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
2229 II.getType(), ConvertToSVBool);
2230
2231 ConvertFromSVBool->takeName(&II);
2232 return IC.replaceInstUsesWith(II, ConvertFromSVBool);
2233}
2234
// Simplifies the lasta/lastb intrinsics (operand 0: predicate, operand 1:
// vector). Handled cases, in order:
//   - the vector is a splat: the result is the splatted scalar;
//   - the vector is a single-use binop with a splat operand: the extraction is
//     pushed through the binop;
//   - lasta with an all-false constant predicate: extract lane 0;
//   - the predicate is a ptrue with a pattern of known element count: extract
//     the corresponding fixed lane, provided it lies within the known minimum
//     vector length.
// NOTE(review): the line that begins the NewBinOp definition (listing line
// 2257) is absent from this listing.
2235static std::optional<Instruction *> instCombineSVELast(InstCombiner &IC,
2236 IntrinsicInst &II) {
2237 Value *Pg = II.getArgOperand(0);
2238 Value *Vec = II.getArgOperand(1);
2239 auto IntrinsicID = II.getIntrinsicID();
2240 bool IsAfter = IntrinsicID == Intrinsic::aarch64_sve_lasta;
2241
2242 // lastX(splat(X)) --> X
2243 if (auto *SplatVal = getSplatValue(Vec))
2244 return IC.replaceInstUsesWith(II, SplatVal);
2245
2246 // If x and/or y is a splat value then:
2247 // lastX (binop (x, y)) --> binop(lastX(x), lastX(y))
2248 Value *LHS, *RHS;
2249 if (match(Vec, m_OneUse(m_BinOp(m_Value(LHS), m_Value(RHS))))) {
2250 if (isSplatValue(LHS) || isSplatValue(RHS)) {
2251 auto *OldBinOp = cast<BinaryOperator>(Vec);
2252 auto OpC = OldBinOp->getOpcode();
2253 auto *NewLHS =
2254 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, LHS});
2255 auto *NewRHS =
2256 IC.Builder.CreateIntrinsic(IntrinsicID, {Vec->getType()}, {Pg, RHS});
2258 OpC, NewLHS, NewRHS, OldBinOp, OldBinOp->getName(), II.getIterator());
2259 return IC.replaceInstUsesWith(II, NewBinOp);
2260 }
2261 }
2262
2263 auto *C = dyn_cast<Constant>(Pg);
2264 if (IsAfter && C && C->isNullValue()) {
2265 // The intrinsic is extracting lane 0 so use an extract instead.
2266 auto *IdxTy = Type::getInt64Ty(II.getContext());
2267 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
2268 Extract->insertBefore(II.getIterator());
2269 Extract->takeName(&II);
2270 return IC.replaceInstUsesWith(II, Extract);
2271 }
2272
// Everything below requires the predicate to be a ptrue intrinsic call.
2273 auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
2274 if (!IntrPG)
2275 return std::nullopt;
2276
2277 if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
2278 return std::nullopt;
2279
2280 const auto PTruePattern =
2281 cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
2282
2283 // Can the intrinsic's predicate be converted to a known constant index?
2284 unsigned MinNumElts = getNumElementsFromSVEPredPattern(PTruePattern);
2285 if (!MinNumElts)
2286 return std::nullopt;
2287
2288 unsigned Idx = MinNumElts - 1;
2289 // Increment the index if extracting the element after the last active
2290 // predicate element.
2291 if (IsAfter)
2292 ++Idx;
2293
2294 // Ignore extracts whose index is larger than the known minimum vector
2295 // length. NOTE: This is an artificial constraint where we prefer to
2296 // maintain what the user asked for until an alternative is proven faster.
2297 auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
2298 if (Idx >= PgVTy->getMinNumElements())
2299 return std::nullopt;
2300
2301 // The intrinsic is extracting a fixed lane so use an extract instead.
2302 auto *IdxTy = Type::getInt64Ty(II.getContext());
2303 auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
2304 Extract->insertBefore(II.getIterator());
2305 Extract->takeName(&II);
2306 return IC.replaceInstUsesWith(II, Extract);
2307}
2308
2309static std::optional<Instruction *> instCombineSVECondLast(InstCombiner &IC,
2310 IntrinsicInst &II) {
2311 // The SIMD&FP variant of CLAST[AB] is significantly faster than the scalar
2312 // integer variant across a variety of micro-architectures. Replace scalar
2313 // integer CLAST[AB] intrinsic with optimal SIMD&FP variant. A simple
2314 // bitcast-to-fp + clast[ab] + bitcast-to-int will cost a cycle or two more
2315 // depending on the micro-architecture, but has been observed as generally
2316 // being faster, particularly when the CLAST[AB] op is a loop-carried
2317 // dependency.
2318 Value *Pg = II.getArgOperand(0);
2319 Value *Fallback = II.getArgOperand(1);
2320 Value *Vec = II.getArgOperand(2);
2321 Type *Ty = II.getType();
2322
2323 if (!Ty->isIntegerTy())
2324 return std::nullopt;
2325
2326 Type *FPTy;
2327 switch (cast<IntegerType>(Ty)->getBitWidth()) {
2328 default:
2329 return std::nullopt;
2330 case 16:
2331 FPTy = IC.Builder.getHalfTy();
2332 break;
2333 case 32:
2334 FPTy = IC.Builder.getFloatTy();
2335 break;
2336 case 64:
2337 FPTy = IC.Builder.getDoubleTy();
2338 break;
2339 }
2340
2341 Value *FPFallBack = IC.Builder.CreateBitCast(Fallback, FPTy);
2342 auto *FPVTy = VectorType::get(
2343 FPTy, cast<VectorType>(Vec->getType())->getElementCount());
2344 Value *FPVec = IC.Builder.CreateBitCast(Vec, FPVTy);
2345 auto *FPII = IC.Builder.CreateIntrinsic(
2346 II.getIntrinsicID(), {FPVec->getType()}, {Pg, FPFallBack, FPVec});
2347 Value *FPIItoInt = IC.Builder.CreateBitCast(FPII, II.getType());
2348 return IC.replaceInstUsesWith(II, FPIItoInt);
2349}
2350
2351static std::optional<Instruction *> instCombineRDFFR(InstCombiner &IC,
2352 IntrinsicInst &II) {
2353 // Replace rdffr with predicated rdffr.z intrinsic, so that optimizePTestInstr
2354 // can work with RDFFR_PP for ptest elimination.
2355 auto *RDFFR = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr_z,
2356 ConstantInt::getTrue(II.getType()));
2357 RDFFR->takeName(&II);
2358 return IC.replaceInstUsesWith(II, RDFFR);
2359}
2360
// Folds an element-count intrinsic whose operand 0 is a constant predicate
// pattern. For the `all` pattern the call is replaced by the value `Cnt`;
// for a pattern with a known element count that does not exceed `NumElts`
// it is replaced by that constant. Otherwise std::nullopt is returned.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2362, which must declare `NumElts`) and the line that begins the
// definition of `Cnt` (listing line 2366) are absent from this listing.
2361static std::optional<Instruction *>
2363 const auto Pattern = cast<ConstantInt>(II.getArgOperand(0))->getZExtValue();
2364
2365 if (Pattern == AArch64SVEPredPattern::all) {
2367 II.getType(), ElementCount::getScalable(NumElts));
2368 Cnt->takeName(&II);
2369 return IC.replaceInstUsesWith(II, Cnt);
2370 }
2371
// A result of 0 from getNumElementsFromSVEPredPattern is treated as "no fixed
// count" and leaves the call untouched.
2372 unsigned MinNumElts = getNumElementsFromSVEPredPattern(Pattern);
2373
2374 return MinNumElts && NumElts >= MinNumElts
2375 ? std::optional<Instruction *>(IC.replaceInstUsesWith(
2376 II, ConstantInt::get(II.getType(), MinNumElts)))
2377 : std::nullopt;
2378}
2379
// Rewrites the SME cntsd intrinsic when the subtarget is in streaming mode;
// in any other mode the call is left alone (std::nullopt).
// NOTE(review): the line carrying the function name and leading parameters
// (listing line 2381) and the expression that initialises `Cnt` (listing line
// 2389) are absent from this listing.
2380static std::optional<Instruction *>
2382 const AArch64Subtarget *ST) {
2383 if (!ST->isStreaming())
2384 return std::nullopt;
2385
2386 // In streaming-mode, aarch64_sme_cntds is equivalent to aarch64_sve_cntd
2387 // with SVEPredPattern::all
2388 Value *Cnt =
2390 Cnt->takeName(&II);
2391 return IC.replaceInstUsesWith(II, Cnt);
2392}
2393
// Canonicalises the ptest_{any,first,last} intrinsics (operand 0: governing
// predicate, operand 1: tested predicate). Three rewrites are attempted:
//   1. ptest_first/last(X, X) becomes ptest_any(X, X);
//   2. when both operands are convert_to_svbool calls from the same type, the
//      test is performed directly on the unconverted predicates;
//   3. ptest_any(X, X), where X is one of the listed predicated operations,
//      is re-governed by X's own predicate operand.
// NOTE(review): the two lines that define `Pg` and `Op` (listing lines 2414
// and 2415) are absent from this listing; from their uses below they are
// presumably the operands viewed as IntrinsicInst - confirm upstream.
2394static std::optional<Instruction *> instCombineSVEPTest(InstCombiner &IC,
2395 IntrinsicInst &II) {
2396 Value *PgVal = II.getArgOperand(0);
2397 Value *OpVal = II.getArgOperand(1);
2398
2399 // PTEST_<FIRST|LAST>(X, X) is equivalent to PTEST_ANY(X, X).
2400 // Later optimizations prefer this form.
2401 if (PgVal == OpVal &&
2402 (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_first ||
2403 II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_last)) {
2404 Value *Ops[] = {PgVal, OpVal};
2405 Type *Tys[] = {PgVal->getType()};
2406
2407 auto *PTest =
2408 IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, Tys, Ops);
2409 PTest->takeName(&II);
2410
2411 return IC.replaceInstUsesWith(II, PTest);
2412 }
2413
2416
2417 if (!Pg || !Op)
2418 return std::nullopt;
2419
2420 Intrinsic::ID OpIID = Op->getIntrinsicID();
2421
// Both operands were widened from the same predicate type: test the narrow
// predicates instead.
2422 if (Pg->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
2423 OpIID == Intrinsic::aarch64_sve_convert_to_svbool &&
2424 Pg->getArgOperand(0)->getType() == Op->getArgOperand(0)->getType()) {
2425 Value *Ops[] = {Pg->getArgOperand(0), Op->getArgOperand(0)};
2426 Type *Tys[] = {Pg->getArgOperand(0)->getType()};
2427
2428 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2429
2430 PTest->takeName(&II);
2431 return IC.replaceInstUsesWith(II, PTest);
2432 }
2433
2434 // Transform PTEST_ANY(X=OP(PG,...), X) -> PTEST_ANY(PG, X)).
2435 // Later optimizations may rewrite sequence to use the flag-setting variant
2436 // of instruction X to remove PTEST.
2437 if ((Pg == Op) && (II.getIntrinsicID() == Intrinsic::aarch64_sve_ptest_any) &&
2438 ((OpIID == Intrinsic::aarch64_sve_brka_z) ||
2439 (OpIID == Intrinsic::aarch64_sve_brkb_z) ||
2440 (OpIID == Intrinsic::aarch64_sve_brkpa_z) ||
2441 (OpIID == Intrinsic::aarch64_sve_brkpb_z) ||
2442 (OpIID == Intrinsic::aarch64_sve_rdffr_z) ||
2443 (OpIID == Intrinsic::aarch64_sve_and_z) ||
2444 (OpIID == Intrinsic::aarch64_sve_bic_z) ||
2445 (OpIID == Intrinsic::aarch64_sve_eor_z) ||
2446 (OpIID == Intrinsic::aarch64_sve_nand_z) ||
2447 (OpIID == Intrinsic::aarch64_sve_nor_z) ||
2448 (OpIID == Intrinsic::aarch64_sve_orn_z) ||
2449 (OpIID == Intrinsic::aarch64_sve_orr_z))) {
2450 Value *Ops[] = {Pg->getArgOperand(0), Pg};
2451 Type *Tys[] = {Pg->getType()};
2452
2453 auto *PTest = IC.Builder.CreateIntrinsic(II.getIntrinsicID(), Tys, Ops);
2454 PTest->takeName(&II);
2455
2456 return IC.replaceInstUsesWith(II, PTest);
2457 }
2458
2459 return std::nullopt;
2460}
2461
// Fuses a predicated add/sub `II` with a single-use multiply feeding it into
// the fused intrinsic `FuseOpc`.
//
// `MulOpc` is the multiply intrinsic to look for. `MergeIntoAddendOp` selects
// the operand roles: when true the addend is operand 1 and the multiply is
// operand 2, and the fused call is built as (P, Addend, MulOp0, MulOp1); when
// false the roles are swapped and the fused call is built as
// (P, MulOp0, MulOp1, Addend).
//
// For floating-point types the combine only fires when `II` and the multiply
// carry identical fast-math flags and those flags allow contraction.
// NOTE(review): the line carrying the function name and leading parameters
// (listing line 2464) and the first line of the multiply pattern match
// (listing line 2476) are absent from this listing.
2462template <Intrinsic::ID MulOpc, Intrinsic::ID FuseOpc>
2463static std::optional<Instruction *>
2465 bool MergeIntoAddendOp) {
2466 Value *P = II.getOperand(0);
2467 Value *MulOp0, *MulOp1, *AddendOp, *Mul;
2468 if (MergeIntoAddendOp) {
2469 AddendOp = II.getOperand(1);
2470 Mul = II.getOperand(2);
2471 } else {
2472 AddendOp = II.getOperand(2);
2473 Mul = II.getOperand(1);
2474 }
2475
2477 m_Value(MulOp1))))
2478 return std::nullopt;
2479
// The multiply must have no other users.
2480 if (!Mul->hasOneUse())
2481 return std::nullopt;
2482
2483 Instruction *FMFSource = nullptr;
2484 if (II.getType()->isFPOrFPVectorTy()) {
2485 llvm::FastMathFlags FAddFlags = II.getFastMathFlags();
2486 // Stop the combine when the flags on the inputs differ in case dropping
2487 // flags would lead to us missing out on more beneficial optimizations.
2488 if (FAddFlags != cast<CallInst>(Mul)->getFastMathFlags())
2489 return std::nullopt;
2490 if (!FAddFlags.allowContract())
2491 return std::nullopt;
2492 FMFSource = &II;
2493 }
2494
2495 Value *Res;
2496 if (MergeIntoAddendOp)
2497 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2498 {P, AddendOp, MulOp0, MulOp1}, FMFSource);
2499 else
2500 Res = IC.Builder.CreateIntrinsic(FuseOpc, {II.getType()},
2501 {P, MulOp0, MulOp1, AddendOp}, FMFSource);
2502
2503 return IC.replaceInstUsesWith(II, Res);
2504}
2505
// Lowers a predicated SVE load intrinsic (operand 0: predicate, operand 1:
// pointer) to generic IR: a plain load when the predicate is all active,
// otherwise a masked load whose passthrough is zero. Metadata of the original
// call is copied onto the new instruction.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2507, which must declare `DL`) is absent from this listing.
2506static std::optional<Instruction *>
2508 Value *Pred = II.getOperand(0);
2509 Value *PtrOp = II.getOperand(1);
2510 Type *VecTy = II.getType();
2511
2512 if (isAllActivePredicate(Pred)) {
2513 LoadInst *Load = IC.Builder.CreateLoad(VecTy, PtrOp);
2514 Load->copyMetadata(II);
2515 return IC.replaceInstUsesWith(II, Load);
2516 }
2517
// The alignment is whatever can be derived from the pointer itself.
2518 CallInst *MaskedLoad =
2519 IC.Builder.CreateMaskedLoad(VecTy, PtrOp, PtrOp->getPointerAlignment(DL),
2520 Pred, ConstantAggregateZero::get(VecTy));
2521 MaskedLoad->copyMetadata(II);
2522 return IC.replaceInstUsesWith(II, MaskedLoad);
2523}
2524
// Lowers a predicated SVE store intrinsic (operand 0: value, operand 1:
// predicate, operand 2: pointer) to generic IR: a plain store when the
// predicate is all active, otherwise a masked store. Metadata of the original
// call is copied onto the new instruction and the call is erased.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2526, which must declare `DL`) is absent from this listing.
2525static std::optional<Instruction *>
2527 Value *VecOp = II.getOperand(0);
2528 Value *Pred = II.getOperand(1);
2529 Value *PtrOp = II.getOperand(2);
2530
2531 if (isAllActivePredicate(Pred)) {
2532 StoreInst *Store = IC.Builder.CreateStore(VecOp, PtrOp);
2533 Store->copyMetadata(II);
2534 return IC.eraseInstFromFunction(II);
2535 }
2536
// The alignment is whatever can be derived from the pointer itself.
2537 CallInst *MaskedStore = IC.Builder.CreateMaskedStore(
2538 VecOp, PtrOp, PtrOp->getPointerAlignment(DL), Pred);
2539 MaskedStore->copyMetadata(II);
2540 return IC.eraseInstFromFunction(II);
2541}
2542
// Maps the fmul_u / fadd_u / fsub_u SVE intrinsics to the equivalent IR binary
// opcode (FMul / FAdd / FSub). Any other intrinsic yields
// Instruction::BinaryOpsEnd, which callers use as the "no mapping" marker.
// NOTE(review): the signature line (listing line 2543) is absent from this
// listing; the body switches on a value named `Intrinsic`.
2544 switch (Intrinsic) {
2545 case Intrinsic::aarch64_sve_fmul_u:
2546 return Instruction::BinaryOps::FMul;
2547 case Intrinsic::aarch64_sve_fadd_u:
2548 return Instruction::BinaryOps::FAdd;
2549 case Intrinsic::aarch64_sve_fsub_u:
2550 return Instruction::BinaryOps::FSub;
2551 default:
2552 return Instruction::BinaryOpsEnd;
2553 }
2554}
2555
// Replaces an SVE floating-point binary intrinsic governed by an all-active
// predicate with the plain IR binary operator, carrying over the call's
// fast-math flags. Returns std::nullopt for strict-FP calls, for intrinsics
// that intrinsicIDToBinOpCode does not map, and for predicates not known to
// be all active.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2557) is absent from this listing.
2556static std::optional<Instruction *>
2558 // Bail due to missing support for ISD::STRICT_ scalable vector operations.
2559 if (II.isStrictFP())
2560 return std::nullopt;
2561
2562 auto *OpPredicate = II.getOperand(0);
2563 auto BinOpCode = intrinsicIDToBinOpCode(II.getIntrinsicID());
2564 if (BinOpCode == Instruction::BinaryOpsEnd ||
2565 !isAllActivePredicate(OpPredicate))
2566 return std::nullopt;
2567 auto BinOp = IC.Builder.CreateBinOpFMF(
2568 BinOpCode, II.getOperand(1), II.getOperand(2), II.getFastMathFlags());
2569 return IC.replaceInstUsesWith(II, BinOp);
2570}
2571
// Simplifies the mla_u intrinsic (operand 1: accumulator, operands 2 and 3:
// multiplicands). A multiplicand of 1 turns the call into an add of the
// accumulator and the other multiplicand; a multiplicand of -1 (all ones)
// turns it into a sub. Failing that, a constant first multiplicand is swapped
// to the second position.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2573) is absent from this listing.
2572static std::optional<Instruction *>
2574 assert(II.getIntrinsicID() == Intrinsic::aarch64_sve_mla_u &&
2575 "Expected MLA_U intrinsic");
2576 Value *Acc = II.getArgOperand(1);
2577 Value *MulOp0 = II.getArgOperand(2);
2578 Value *MulOp1 = II.getArgOperand(3);
2579
2580 // For mla_u, inactive lanes are undefined, so it is valid to drop the
2581 // predicate when replacing mla_u(acc, x, 1) with add(acc, x) or
2582 // mla_u(acc, x, -1) with sub(acc, x).
2583 if (match(MulOp0, m_One()))
2584 return IC.replaceInstUsesWith(II, IC.Builder.CreateAdd(Acc, MulOp1));
2585 if (match(MulOp1, m_One()))
2586 return IC.replaceInstUsesWith(II, IC.Builder.CreateAdd(Acc, MulOp0));
2587 if (match(MulOp0, m_AllOnes()))
2588 return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Acc, MulOp1));
2589 if (match(MulOp1, m_AllOnes()))
2590 return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Acc, MulOp0));
2591
// Canonicalise a constant multiplicand to the last operand. The extra
// !isa<Constant>(MulOp1) test keeps two constants from being swapped forever.
2592 if (isa<Constant>(MulOp0) && !isa<Constant>(MulOp1)) {
2593 II.setArgOperand(2, MulOp1);
2594 II.setArgOperand(3, MulOp0);
2595 return &II;
2596 }
2597
2598 return std::nullopt;
2599}
2600
// Folds an add into a sadalp/uadalp whose accumulator operand is zero:
//   add(adalp(pred, zeroinitializer, in), acc) --> adalp(pred, acc, in)
// The combine requires that the adalp has exactly one use and that this use is
// the add. The add's uses are replaced by the new adalp call and `&II` is
// returned.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2602) and two statements in the body (listing lines 2616 and 2622) are
// absent from this listing; confirm against the upstream file what they do
// (e.g. where the new call is inserted and what happens to the old add).
2601static std::optional<Instruction *>
2603 assert((II.getIntrinsicID() == Intrinsic::aarch64_sve_sadalp ||
2604 II.getIntrinsicID() == Intrinsic::aarch64_sve_uadalp) &&
2605 "Expected SADALP or UADALP intrinsic");
2606
2607 // We are looking for add(adalp(%pred, zeroinitializer, %in), %acc)
2608 if (!II.hasOneUse() || !match(II.getArgOperand(1), m_Zero()))
2609 return std::nullopt;
2610
// The single user must be an add with `II` on either side.
2611 auto *User = cast<Instruction>(*II.user_begin());
2612 Value *Acc;
2613 if (!match(User, m_c_Add(m_Specific(&II), m_Value(Acc))))
2614 return std::nullopt;
2615
2617 Value *PairwiseAddLong = IC.Builder.CreateIntrinsic(
2618 II.getIntrinsicID(), {II.getType()},
2619 {II.getArgOperand(0), Acc, II.getArgOperand(2)});
2620
2621 IC.replaceInstUsesWith(*User, PairwiseAddLong);
2623 return &II; // II is now trivially dead and will get erased.
2624}
2625
2626static std::optional<Instruction *> instCombineSVEVectorAdd(InstCombiner &IC,
2627 IntrinsicInst &II) {
2628 if (auto MLA = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2629 Intrinsic::aarch64_sve_mla>(
2630 IC, II, true))
2631 return MLA;
2632 if (auto MAD = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2633 Intrinsic::aarch64_sve_mad>(
2634 IC, II, false))
2635 return MAD;
2636 return std::nullopt;
2637}
2638
// Attempts to fuse a predicated floating-point add with a multiply feeding it.
// Tried in order: fmul -> fmla (addend first), fmul -> fmad (multiply first),
// fmul_u -> fmla (addend first). Returns std::nullopt when none applies.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2640) is absent from this listing.
2639static std::optional<Instruction *>
2641 if (auto FMLA =
2642 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2643 Intrinsic::aarch64_sve_fmla>(IC, II,
2644 true))
2645 return FMLA;
2646 if (auto FMAD =
2647 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2648 Intrinsic::aarch64_sve_fmad>(IC, II,
2649 false))
2650 return FMAD;
2651 if (auto FMLA =
2652 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2653 Intrinsic::aarch64_sve_fmla>(IC, II,
2654 true))
2655 return FMLA;
2656 return std::nullopt;
2657}
2658
// Attempts to fuse a floating-point add with a multiply feeding it, and falls
// back to instCombineSVEVectorBinOp when no fusion applies. Tried in order:
// fmul -> fmla (addend first), fmul -> fmad (multiply first),
// fmul_u -> fmla_u (addend first).
// NOTE(review): the line carrying the function name and parameters (listing
// line 2660) is absent from this listing; the fmla_u target suggests this is
// the handler for the "_u" add variant - confirm upstream.
2659static std::optional<Instruction *>
2661 if (auto FMLA =
2662 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2663 Intrinsic::aarch64_sve_fmla>(IC, II,
2664 true))
2665 return FMLA;
2666 if (auto FMAD =
2667 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2668 Intrinsic::aarch64_sve_fmad>(IC, II,
2669 false))
2670 return FMAD;
2671 if (auto FMLA_U =
2672 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2673 Intrinsic::aarch64_sve_fmla_u>(
2674 IC, II, true))
2675 return FMLA_U;
2676 return instCombineSVEVectorBinOp(IC, II);
2677}
2678
// Attempts to fuse a predicated floating-point subtract with a multiply
// feeding it. Tried in order: fmul -> fmls (addend first), fmul -> fnmsb
// (multiply first), fmul_u -> fmls (addend first). Returns std::nullopt when
// none applies.
// NOTE(review): the line carrying the function name and parameters (listing
// line 2680) is absent from this listing.
2679static std::optional<Instruction *>
2681 if (auto FMLS =
2682 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2683 Intrinsic::aarch64_sve_fmls>(IC, II,
2684 true))
2685 return FMLS;
2686 if (auto FMSB =
2687 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2688 Intrinsic::aarch64_sve_fnmsb>(
2689 IC, II, false))
2690 return FMSB;
2691 if (auto FMLS =
2692 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2693 Intrinsic::aarch64_sve_fmls>(IC, II,
2694 true))
2695 return FMLS;
2696 return std::nullopt;
2697}
2698
// Attempts to fuse a floating-point subtract with a multiply feeding it, and
// falls back to instCombineSVEVectorBinOp when no fusion applies. Tried in
// order: fmul -> fmls (addend first), fmul -> fnmsb (multiply first),
// fmul_u -> fmls_u (addend first).
// NOTE(review): the line carrying the function name and parameters (listing
// line 2700) is absent from this listing; the fmls_u target suggests this is
// the handler for the "_u" subtract variant - confirm upstream.
2699static std::optional<Instruction *>
2701 if (auto FMLS =
2702 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2703 Intrinsic::aarch64_sve_fmls>(IC, II,
2704 true))
2705 return FMLS;
2706 if (auto FMSB =
2707 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul,
2708 Intrinsic::aarch64_sve_fnmsb>(
2709 IC, II, false))
2710 return FMSB;
2711 if (auto FMLS_U =
2712 instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_fmul_u,
2713 Intrinsic::aarch64_sve_fmls_u>(
2714 IC, II, true))
2715 return FMLS_U;
2716 return instCombineSVEVectorBinOp(IC, II);
2717}
2718
2719static std::optional<Instruction *> instCombineSVEVectorSub(InstCombiner &IC,
2720 IntrinsicInst &II) {
2721 if (auto MLS = instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul,
2722 Intrinsic::aarch64_sve_mls>(
2723 IC, II, true))
2724 return MLS;
2725 return std::nullopt;
2726}
2727
2728static std::optional<Instruction *> instCombineSVEUnpack(InstCombiner &IC,
2729 IntrinsicInst &II) {
2730 Value *UnpackArg = II.getArgOperand(0);
2731 auto *RetTy = cast<ScalableVectorType>(II.getType());
2732 bool IsSigned = II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpkhi ||
2733 II.getIntrinsicID() == Intrinsic::aarch64_sve_sunpklo;
2734
2735 // Hi = uunpkhi(splat(X)) --> Hi = splat(extend(X))
2736 // Lo = uunpklo(splat(X)) --> Lo = splat(extend(X))
2737 if (auto *ScalarArg = getSplatValue(UnpackArg)) {
2738 ScalarArg =
2739 IC.Builder.CreateIntCast(ScalarArg, RetTy->getScalarType(), IsSigned);
2740 Value *NewVal =
2741 IC.Builder.CreateVectorSplat(RetTy->getElementCount(), ScalarArg);
2742 NewVal->takeName(&II);
2743 return IC.replaceInstUsesWith(II, NewVal);
2744 }
2745
2746 return std::nullopt;
2747}
2748static std::optional<Instruction *> instCombineSVETBL(InstCombiner &IC,
2749 IntrinsicInst &II) {
2750 auto *OpVal = II.getOperand(0);
2751 auto *OpIndices = II.getOperand(1);
2752 VectorType *VTy = cast<VectorType>(II.getType());
2753
2754 // Check whether OpIndices is a constant splat value < minimal element count
2755 // of result.
2756 auto *SplatValue = dyn_cast_or_null<ConstantInt>(getSplatValue(OpIndices));
2757 if (!SplatValue ||
2758 SplatValue->getValue().uge(VTy->getElementCount().getKnownMinValue()))
2759 return std::nullopt;
2760
2761 // Convert sve_tbl(OpVal sve_dup_x(SplatValue)) to
2762 // splat_vector(extractelement(OpVal, SplatValue)) for further optimization.
2763 auto *Extract = IC.Builder.CreateExtractElement(OpVal, SplatValue);
2764 auto *VectorSplat =
2765 IC.Builder.CreateVectorSplat(VTy->getElementCount(), Extract);
2766
2767 VectorSplat->takeName(&II);
2768 return IC.replaceInstUsesWith(II, VectorSplat);
2769}
2770
2771static std::optional<Instruction *> instCombineSVEUzp1(InstCombiner &IC,
2772 IntrinsicInst &II) {
2773 Value *A, *B;
2774 Type *RetTy = II.getType();
2775 constexpr Intrinsic::ID FromSVB = Intrinsic::aarch64_sve_convert_from_svbool;
2776 constexpr Intrinsic::ID ToSVB = Intrinsic::aarch64_sve_convert_to_svbool;
2777
2778 // uzp1(to_svbool(A), to_svbool(B)) --> <A, B>
2779 // uzp1(from_svbool(to_svbool(A)), from_svbool(to_svbool(B))) --> <A, B>
2780 if ((match(II.getArgOperand(0),
2782 match(II.getArgOperand(1),
2784 (match(II.getArgOperand(0), m_Intrinsic<ToSVB>(m_Value(A))) &&
2785 match(II.getArgOperand(1), m_Intrinsic<ToSVB>(m_Value(B))))) {
2786 auto *TyA = cast<ScalableVectorType>(A->getType());
2787 if (TyA == B->getType() &&
2789 auto *SubVec = IC.Builder.CreateInsertVector(
2790 RetTy, PoisonValue::get(RetTy), A, uint64_t(0));
2791 auto *ConcatVec = IC.Builder.CreateInsertVector(RetTy, SubVec, B,
2792 TyA->getMinNumElements());
2793 ConcatVec->takeName(&II);
2794 return IC.replaceInstUsesWith(II, ConcatVec);
2795 }
2796 }
2797
2798 return std::nullopt;
2799}
2800
2801static std::optional<Instruction *> instCombineSVEZip(InstCombiner &IC,
2802 IntrinsicInst &II) {
2803 // zip1(uzp1(A, B), uzp2(A, B)) --> A
2804 // zip2(uzp1(A, B), uzp2(A, B)) --> B
2805 Value *A, *B;
2806 if (match(II.getArgOperand(0),
2809 m_Specific(A), m_Specific(B))))
2810 return IC.replaceInstUsesWith(
2811 II, (II.getIntrinsicID() == Intrinsic::aarch64_sve_zip1 ? A : B));
2812
2813 return std::nullopt;
2814}
2815
2816static std::optional<Instruction *>
2818 Value *Mask = II.getOperand(0);
2819 Value *BasePtr = II.getOperand(1);
2820 Value *Index = II.getOperand(2);
2821 Type *Ty = II.getType();
2822 Value *PassThru = ConstantAggregateZero::get(Ty);
2823
2824 // Contiguous gather => masked load.
2825 // (sve.ld1.gather.index Mask BasePtr (sve.index IndexBase 1))
2826 // => (masked.load (gep BasePtr IndexBase) Align Mask zeroinitializer)
2827 Value *IndexBase;
2829 m_One()))) {
2830 Align Alignment =
2831 BasePtr->getPointerAlignment(II.getDataLayout());
2832
2833 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2834 BasePtr, IndexBase);
2835 CallInst *MaskedLoad =
2836 IC.Builder.CreateMaskedLoad(Ty, Ptr, Alignment, Mask, PassThru);
2837 MaskedLoad->takeName(&II);
2838 return IC.replaceInstUsesWith(II, MaskedLoad);
2839 }
2840
2841 return std::nullopt;
2842}
2843
2844static std::optional<Instruction *>
2846 Value *Val = II.getOperand(0);
2847 Value *Mask = II.getOperand(1);
2848 Value *BasePtr = II.getOperand(2);
2849 Value *Index = II.getOperand(3);
2850 Type *Ty = Val->getType();
2851
2852 // Contiguous scatter => masked store.
2853 // (sve.st1.scatter.index Value Mask BasePtr (sve.index IndexBase 1))
2854 // => (masked.store Value (gep BasePtr IndexBase) Align Mask)
2855 Value *IndexBase;
2857 m_One()))) {
2858 Align Alignment =
2859 BasePtr->getPointerAlignment(II.getDataLayout());
2860
2861 Value *Ptr = IC.Builder.CreateGEP(cast<VectorType>(Ty)->getElementType(),
2862 BasePtr, IndexBase);
2863 (void)IC.Builder.CreateMaskedStore(Val, Ptr, Alignment, Mask);
2864
2865 return IC.eraseInstFromFunction(II);
2866 }
2867
2868 return std::nullopt;
2869}
2870
2871static std::optional<Instruction *> instCombineSVESDIV(InstCombiner &IC,
2872 IntrinsicInst &II) {
2873 Type *Int32Ty = IC.Builder.getInt32Ty();
2874 Value *Pred = II.getOperand(0);
2875 Value *Vec = II.getOperand(1);
2876 Value *DivVec = II.getOperand(2);
2877
2878 Value *SplatValue = getSplatValue(DivVec);
2879 ConstantInt *SplatConstantInt = dyn_cast_or_null<ConstantInt>(SplatValue);
2880 if (!SplatConstantInt)
2881 return std::nullopt;
2882
2883 APInt Divisor = SplatConstantInt->getValue();
2884 const int64_t DivisorValue = Divisor.getSExtValue();
2885 if (DivisorValue == -1)
2886 return std::nullopt;
2887 if (DivisorValue == 1)
2888 IC.replaceInstUsesWith(II, Vec);
2889
2890 if (Divisor.isPowerOf2()) {
2891 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2892 auto ASRD = IC.Builder.CreateIntrinsic(
2893 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2894 return IC.replaceInstUsesWith(II, ASRD);
2895 }
2896 if (Divisor.isNegatedPowerOf2()) {
2897 Divisor.negate();
2898 Constant *DivisorLog2 = ConstantInt::get(Int32Ty, Divisor.logBase2());
2899 auto ASRD = IC.Builder.CreateIntrinsic(
2900 Intrinsic::aarch64_sve_asrd, {II.getType()}, {Pred, Vec, DivisorLog2});
2901 auto NEG = IC.Builder.CreateIntrinsic(
2902 Intrinsic::aarch64_sve_neg, {ASRD->getType()}, {ASRD, Pred, ASRD});
2903 return IC.replaceInstUsesWith(II, NEG);
2904 }
2905
2906 return std::nullopt;
2907}
2908
2909bool SimplifyValuePattern(SmallVector<Value *> &Vec, bool AllowPoison) {
2910 size_t VecSize = Vec.size();
2911 if (VecSize == 1)
2912 return true;
2913 if (!isPowerOf2_64(VecSize))
2914 return false;
2915 size_t HalfVecSize = VecSize / 2;
2916
2917 for (auto LHS = Vec.begin(), RHS = Vec.begin() + HalfVecSize;
2918 RHS != Vec.end(); LHS++, RHS++) {
2919 if (*LHS != nullptr && *RHS != nullptr) {
2920 if (*LHS == *RHS)
2921 continue;
2922 else
2923 return false;
2924 }
2925 if (!AllowPoison)
2926 return false;
2927 if (*LHS == nullptr && *RHS != nullptr)
2928 *LHS = *RHS;
2929 }
2930
2931 Vec.resize(HalfVecSize);
2932 SimplifyValuePattern(Vec, AllowPoison);
2933 return true;
2934}
2935
2936// Try to simplify dupqlane patterns like dupqlane(f32 A, f32 B, f32 A, f32 B)
2937// to dupqlane(f64(C)) where C is A concatenated with B
2938static std::optional<Instruction *> instCombineSVEDupqLane(InstCombiner &IC,
2939 IntrinsicInst &II) {
2940 Value *CurrentInsertElt = nullptr, *Default = nullptr;
2941 if (!match(II.getOperand(0),
2943 m_Value(Default), m_Value(CurrentInsertElt), m_Value())) ||
2944 !isa<FixedVectorType>(CurrentInsertElt->getType()))
2945 return std::nullopt;
2946 auto IIScalableTy = cast<ScalableVectorType>(II.getType());
2947
2948 // Insert the scalars into a container ordered by InsertElement index
2949 SmallVector<Value *> Elts(IIScalableTy->getMinNumElements(), nullptr);
2950 while (auto InsertElt = dyn_cast<InsertElementInst>(CurrentInsertElt)) {
2951 auto Idx = cast<ConstantInt>(InsertElt->getOperand(2));
2952 Elts[Idx->getValue().getZExtValue()] = InsertElt->getOperand(1);
2953 CurrentInsertElt = InsertElt->getOperand(0);
2954 }
2955
2956 bool AllowPoison =
2957 isa<PoisonValue>(CurrentInsertElt) && isa<PoisonValue>(Default);
2958 if (!SimplifyValuePattern(Elts, AllowPoison))
2959 return std::nullopt;
2960
2961 // Rebuild the simplified chain of InsertElements. e.g. (a, b, a, b) as (a, b)
2962 Value *InsertEltChain = PoisonValue::get(CurrentInsertElt->getType());
2963 for (size_t I = 0; I < Elts.size(); I++) {
2964 if (Elts[I] == nullptr)
2965 continue;
2966 InsertEltChain = IC.Builder.CreateInsertElement(InsertEltChain, Elts[I],
2967 IC.Builder.getInt64(I));
2968 }
2969 if (InsertEltChain == nullptr)
2970 return std::nullopt;
2971
2972 // Splat the simplified sequence, e.g. (f16 a, f16 b, f16 c, f16 d) as one i64
2973 // value or (f16 a, f16 b) as one i32 value. This requires an InsertSubvector
2974 // be bitcast to a type wide enough to fit the sequence, be splatted, and then
2975 // be narrowed back to the original type.
2976 unsigned PatternWidth = IIScalableTy->getScalarSizeInBits() * Elts.size();
2977 unsigned PatternElementCount = IIScalableTy->getScalarSizeInBits() *
2978 IIScalableTy->getMinNumElements() /
2979 PatternWidth;
2980
2981 IntegerType *WideTy = IC.Builder.getIntNTy(PatternWidth);
2982 auto *WideScalableTy = ScalableVectorType::get(WideTy, PatternElementCount);
2983 auto *WideShuffleMaskTy =
2984 ScalableVectorType::get(IC.Builder.getInt32Ty(), PatternElementCount);
2985
2986 auto InsertSubvector = IC.Builder.CreateInsertVector(
2987 II.getType(), PoisonValue::get(II.getType()), InsertEltChain,
2988 uint64_t(0));
2989 auto WideBitcast =
2990 IC.Builder.CreateBitOrPointerCast(InsertSubvector, WideScalableTy);
2991 auto WideShuffleMask = ConstantAggregateZero::get(WideShuffleMaskTy);
2992 auto WideShuffle = IC.Builder.CreateShuffleVector(
2993 WideBitcast, PoisonValue::get(WideScalableTy), WideShuffleMask);
2994 auto NarrowBitcast =
2995 IC.Builder.CreateBitOrPointerCast(WideShuffle, II.getType());
2996
2997 return IC.replaceInstUsesWith(II, NarrowBitcast);
2998}
2999
3000static std::optional<Instruction *> instCombineMaxMinNM(InstCombiner &IC,
3001 IntrinsicInst &II) {
3002 Value *A = II.getArgOperand(0);
3003 Value *B = II.getArgOperand(1);
3004 if (A == B)
3005 return IC.replaceInstUsesWith(II, A);
3006
3007 return std::nullopt;
3008}
3009
3010static std::optional<Instruction *> instCombineSVESrshl(InstCombiner &IC,
3011 IntrinsicInst &II) {
3012 Value *Pred = II.getOperand(0);
3013 Value *Vec = II.getOperand(1);
3014 Value *Shift = II.getOperand(2);
3015
3016 // Convert SRSHL into the simpler LSL intrinsic when fed by an ABS intrinsic.
3017 Value *AbsPred, *MergedValue;
3019 m_Value(MergedValue), m_Value(AbsPred), m_Value())) &&
3021 m_Value(MergedValue), m_Value(AbsPred), m_Value())))
3022
3023 return std::nullopt;
3024
3025 // Transform is valid if any of the following are true:
3026 // * The ABS merge value is an undef or non-negative
3027 // * The ABS predicate is all active
3028 // * The ABS predicate and the SRSHL predicates are the same
3029 if (!isa<UndefValue>(MergedValue) && !match(MergedValue, m_NonNegative()) &&
3030 AbsPred != Pred && !isAllActivePredicate(AbsPred))
3031 return std::nullopt;
3032
3033 // Only valid when the shift amount is non-negative, otherwise the rounding
3034 // behaviour of SRSHL cannot be ignored.
3035 if (!match(Shift, m_NonNegative()))
3036 return std::nullopt;
3037
3038 auto LSL = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_lsl,
3039 {II.getType()}, {Pred, Vec, Shift});
3040
3041 return IC.replaceInstUsesWith(II, LSL);
3042}
3043
3044static std::optional<Instruction *> instCombineSVEInsr(InstCombiner &IC,
3045 IntrinsicInst &II) {
3046 Value *Vec = II.getOperand(0);
3047
3048 if (getSplatValue(Vec) == II.getOperand(1))
3049 return IC.replaceInstUsesWith(II, Vec);
3050
3051 return std::nullopt;
3052}
3053
3054static std::optional<Instruction *> instCombineDMB(InstCombiner &IC,
3055 IntrinsicInst &II) {
3056 // If this barrier is post-dominated by identical one we can remove it
3057 auto *NI = II.getNextNode();
3058 unsigned LookaheadThreshold = DMBLookaheadThreshold;
3059 auto CanSkipOver = [](Instruction *I) {
3060 return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects();
3061 };
3062 while (LookaheadThreshold-- && CanSkipOver(NI)) {
3063 auto *NIBB = NI->getParent();
3064 NI = NI->getNextNode();
3065 if (!NI) {
3066 if (auto *SuccBB = NIBB->getUniqueSuccessor())
3067 NI = &*SuccBB->getFirstNonPHIOrDbgOrLifetime();
3068 else
3069 break;
3070 }
3071 }
3072 auto *NextII = dyn_cast_or_null<IntrinsicInst>(NI);
3073 if (NextII && II.isIdenticalTo(NextII))
3074 return IC.eraseInstFromFunction(II);
3075
3076 return std::nullopt;
3077}
3078
3079static std::optional<Instruction *> instCombineWhilelo(InstCombiner &IC,
3080 IntrinsicInst &II) {
3081 return IC.replaceInstUsesWith(
3082 II,
3083 IC.Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask,
3084 {II.getType(), II.getOperand(0)->getType()},
3085 {II.getOperand(0), II.getOperand(1)}));
3086}
3087
3088static std::optional<Instruction *> instCombinePTrue(InstCombiner &IC,
3089 IntrinsicInst &II) {
3090 unsigned PredPattern = cast<ConstantInt>(II.getOperand(0))->getZExtValue();
3091 // SVE vector length is a power-of-two, thus pow2 is synonymous with all.
3092 if (PredPattern == AArch64SVEPredPattern::all ||
3093 PredPattern == AArch64SVEPredPattern::pow2)
3094 return IC.replaceInstUsesWith(II, ConstantInt::getTrue(II.getType()));
3095 return std::nullopt;
3096}
3097
3098static std::optional<Instruction *> instCombineSVEUxt(InstCombiner &IC,
3100 unsigned NumBits) {
3101 Value *Passthru = II.getOperand(0);
3102 Value *Pg = II.getOperand(1);
3103 Value *Op = II.getOperand(2);
3104
3105 // Convert UXT[BHW] to AND.
3106 if (isa<UndefValue>(Passthru) || isAllActivePredicate(Pg)) {
3107 auto *Ty = cast<VectorType>(II.getType());
3108 auto MaskValue = APInt::getLowBitsSet(Ty->getScalarSizeInBits(), NumBits);
3109 auto *Mask = ConstantInt::get(Ty, MaskValue);
3110 auto *And = IC.Builder.CreateIntrinsic(Intrinsic::aarch64_sve_and_u, {Ty},
3111 {Pg, Op, Mask});
3112 return IC.replaceInstUsesWith(II, And);
3113 }
3114
3115 return std::nullopt;
3116}
3117
3118static std::optional<Instruction *>
3120 SMEAttrs FnSMEAttrs(*II.getFunction());
3121 bool IsStreaming = FnSMEAttrs.hasStreamingInterfaceOrBody();
3122 if (IsStreaming || !FnSMEAttrs.hasStreamingCompatibleInterface())
3123 return IC.replaceInstUsesWith(
3124 II, ConstantInt::getBool(II.getType(), IsStreaming));
3125 return std::nullopt;
3126}
3127
3128std::optional<Instruction *>
3130 IntrinsicInst &II) const {
3132 if (std::optional<Instruction *> I = simplifySVEIntrinsic(IC, II, IInfo))
3133 return I;
3134
3135 Intrinsic::ID IID = II.getIntrinsicID();
3136 switch (IID) {
3137 default:
3138 break;
3139 case Intrinsic::aarch64_dmb:
3140 return instCombineDMB(IC, II);
3141 case Intrinsic::aarch64_neon_fmaxnm:
3142 case Intrinsic::aarch64_neon_fminnm:
3143 return instCombineMaxMinNM(IC, II);
3144 case Intrinsic::aarch64_sve_convert_from_svbool:
3145 return instCombineConvertFromSVBool(IC, II);
3146 case Intrinsic::aarch64_sve_dup:
3147 return instCombineSVEDup(IC, II);
3148 case Intrinsic::aarch64_sve_dup_x:
3149 return instCombineSVEDupX(IC, II);
3150 case Intrinsic::aarch64_sve_cmpne:
3151 case Intrinsic::aarch64_sve_cmpne_wide:
3152 return instCombineSVECmpNE(IC, II);
3153 case Intrinsic::aarch64_sve_rdffr:
3154 return instCombineRDFFR(IC, II);
3155 case Intrinsic::aarch64_sve_lasta:
3156 case Intrinsic::aarch64_sve_lastb:
3157 return instCombineSVELast(IC, II);
3158 case Intrinsic::aarch64_sve_clasta_n:
3159 case Intrinsic::aarch64_sve_clastb_n:
3160 return instCombineSVECondLast(IC, II);
3161 case Intrinsic::aarch64_sve_cntd:
3162 return instCombineSVECntElts(IC, II, 2);
3163 case Intrinsic::aarch64_sve_cntw:
3164 return instCombineSVECntElts(IC, II, 4);
3165 case Intrinsic::aarch64_sve_cnth:
3166 return instCombineSVECntElts(IC, II, 8);
3167 case Intrinsic::aarch64_sve_cntb:
3168 return instCombineSVECntElts(IC, II, 16);
3169 case Intrinsic::aarch64_sme_cntsd:
3170 return instCombineSMECntsd(IC, II, ST);
3171 case Intrinsic::aarch64_sve_ptest_any:
3172 case Intrinsic::aarch64_sve_ptest_first:
3173 case Intrinsic::aarch64_sve_ptest_last:
3174 return instCombineSVEPTest(IC, II);
3175 case Intrinsic::aarch64_sve_fadd:
3176 return instCombineSVEVectorFAdd(IC, II);
3177 case Intrinsic::aarch64_sve_fadd_u:
3178 return instCombineSVEVectorFAddU(IC, II);
3179 case Intrinsic::aarch64_sve_fmul_u:
3180 return instCombineSVEVectorBinOp(IC, II);
3181 case Intrinsic::aarch64_sve_fsub:
3182 return instCombineSVEVectorFSub(IC, II);
3183 case Intrinsic::aarch64_sve_fsub_u:
3184 return instCombineSVEVectorFSubU(IC, II);
3185 case Intrinsic::aarch64_sve_add:
3186 return instCombineSVEVectorAdd(IC, II);
3187 case Intrinsic::aarch64_sve_add_u:
3188 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3189 Intrinsic::aarch64_sve_mla_u>(
3190 IC, II, true);
3191 case Intrinsic::aarch64_sve_mla_u:
3192 return instCombineSVEVectorMlaU(IC, II);
3193 case Intrinsic::aarch64_sve_sadalp:
3194 case Intrinsic::aarch64_sve_uadalp:
3196 case Intrinsic::aarch64_sve_sub:
3197 return instCombineSVEVectorSub(IC, II);
3198 case Intrinsic::aarch64_sve_sub_u:
3199 return instCombineSVEVectorFuseMulAddSub<Intrinsic::aarch64_sve_mul_u,
3200 Intrinsic::aarch64_sve_mls_u>(
3201 IC, II, true);
3202 case Intrinsic::aarch64_sve_tbl:
3203 return instCombineSVETBL(IC, II);
3204 case Intrinsic::aarch64_sve_uunpkhi:
3205 case Intrinsic::aarch64_sve_uunpklo:
3206 case Intrinsic::aarch64_sve_sunpkhi:
3207 case Intrinsic::aarch64_sve_sunpklo:
3208 return instCombineSVEUnpack(IC, II);
3209 case Intrinsic::aarch64_sve_uzp1:
3210 return instCombineSVEUzp1(IC, II);
3211 case Intrinsic::aarch64_sve_zip1:
3212 case Intrinsic::aarch64_sve_zip2:
3213 return instCombineSVEZip(IC, II);
3214 case Intrinsic::aarch64_sve_ld1_gather_index:
3215 return instCombineLD1GatherIndex(IC, II);
3216 case Intrinsic::aarch64_sve_st1_scatter_index:
3217 return instCombineST1ScatterIndex(IC, II);
3218 case Intrinsic::aarch64_sve_ld1:
3219 return instCombineSVELD1(IC, II, DL);
3220 case Intrinsic::aarch64_sve_st1:
3221 return instCombineSVEST1(IC, II, DL);
3222 case Intrinsic::aarch64_sve_sdiv:
3223 return instCombineSVESDIV(IC, II);
3224 case Intrinsic::aarch64_sve_sel:
3225 return instCombineSVESel(IC, II);
3226 case Intrinsic::aarch64_sve_srshl:
3227 return instCombineSVESrshl(IC, II);
3228 case Intrinsic::aarch64_sve_dupq_lane:
3229 return instCombineSVEDupqLane(IC, II);
3230 case Intrinsic::aarch64_sve_insr:
3231 return instCombineSVEInsr(IC, II);
3232 case Intrinsic::aarch64_sve_whilelo:
3233 return instCombineWhilelo(IC, II);
3234 case Intrinsic::aarch64_sve_ptrue:
3235 return instCombinePTrue(IC, II);
3236 case Intrinsic::aarch64_sve_uxtb:
3237 return instCombineSVEUxt(IC, II, 8);
3238 case Intrinsic::aarch64_sve_uxth:
3239 return instCombineSVEUxt(IC, II, 16);
3240 case Intrinsic::aarch64_sve_uxtw:
3241 return instCombineSVEUxt(IC, II, 32);
3242 case Intrinsic::aarch64_sme_in_streaming_mode:
3243 return instCombineInStreamingMode(IC, II);
3244 }
3245
3246 return std::nullopt;
3247}
3248
3250 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
3251 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
3252 std::function<void(Instruction *, unsigned, APInt, APInt &)>
3253 SimplifyAndSetOp) const {
3254 switch (II.getIntrinsicID()) {
3255 default:
3256 break;
3257 case Intrinsic::aarch64_neon_fcvtxn:
3258 case Intrinsic::aarch64_neon_rshrn:
3259 case Intrinsic::aarch64_neon_sqrshrn:
3260 case Intrinsic::aarch64_neon_sqrshrun:
3261 case Intrinsic::aarch64_neon_sqshrn:
3262 case Intrinsic::aarch64_neon_sqshrun:
3263 case Intrinsic::aarch64_neon_sqxtn:
3264 case Intrinsic::aarch64_neon_sqxtun:
3265 case Intrinsic::aarch64_neon_uqrshrn:
3266 case Intrinsic::aarch64_neon_uqshrn:
3267 case Intrinsic::aarch64_neon_uqxtn:
3268 SimplifyAndSetOp(&II, 0, OrigDemandedElts, UndefElts);
3269 break;
3270 }
3271
3272 return std::nullopt;
3273}
3274
3276 return ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3278}
3279
3282 switch (K) {
3284 return TypeSize::getFixed(64);
3286 if (ST->useSVEForFixedLengthVectors() &&
3287 (ST->isSVEAvailable() || EnableFixedwidthAutovecInStreamingMode))
3288 return TypeSize::getFixed(
3289 std::max(ST->getMinSVEVectorSizeInBits(), 128u));
3290 else if (ST->isNeonAvailable())
3291 return TypeSize::getFixed(128);
3292 else
3293 return TypeSize::getFixed(0);
3295 if (ST->isSVEAvailable() || (ST->isSVEorStreamingSVEAvailable() &&
3297 return TypeSize::getScalable(128);
3298 else
3299 return TypeSize::getScalable(0);
3300 }
3301 llvm_unreachable("Unsupported register kind");
3302}
3303
3304bool AArch64TTIImpl::isSingleExtWideningInstruction(
3305 unsigned Opcode, Type *DstTy, ArrayRef<const Value *> Args,
3306 Type *SrcOverrideTy) const {
3307 // A helper that returns a vector type from the given type. The number of
3308 // elements in type Ty determines the vector width.
3309 auto toVectorTy = [&](Type *ArgTy) {
3310 return VectorType::get(ArgTy->getScalarType(),
3311 cast<VectorType>(DstTy)->getElementCount());
3312 };
3313
3314 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3315 // i32, i64]. SVE doesn't generally have the same set of instructions to
3316 // perform an extend with the add/sub/mul. There are SMULLB style
3317 // instructions, but they operate on top/bottom, requiring some sort of lane
3318 // interleaving to be used with zext/sext.
3319 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3320 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3321 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3322 return false;
3323
3324 Type *SrcTy = SrcOverrideTy;
3325 switch (Opcode) {
3326 case Instruction::Add: // UADDW(2), SADDW(2).
3327 case Instruction::Sub: { // USUBW(2), SSUBW(2).
3328 // The second operand needs to be an extend
3329 if (isa<SExtInst>(Args[1]) || isa<ZExtInst>(Args[1])) {
3330 if (!SrcTy)
3331 SrcTy =
3332 toVectorTy(cast<Instruction>(Args[1])->getOperand(0)->getType());
3333 break;
3334 }
3335
3336 if (Opcode == Instruction::Sub)
3337 return false;
3338
3339 // UADDW(2), SADDW(2) can be commutted.
3340 if (isa<SExtInst>(Args[0]) || isa<ZExtInst>(Args[0])) {
3341 if (!SrcTy)
3342 SrcTy =
3343 toVectorTy(cast<Instruction>(Args[0])->getOperand(0)->getType());
3344 break;
3345 }
3346 return false;
3347 }
3348 default:
3349 return false;
3350 }
3351
3352 // Legalize the destination type and ensure it can be used in a widening
3353 // operation.
3354 auto DstTyL = getTypeLegalizationCost(DstTy);
3355 if (!DstTyL.second.isVector() || DstEltSize != DstTy->getScalarSizeInBits())
3356 return false;
3357
3358 // Legalize the source type and ensure it can be used in a widening
3359 // operation.
3360 assert(SrcTy && "Expected some SrcTy");
3361 auto SrcTyL = getTypeLegalizationCost(SrcTy);
3362 unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
3363 if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
3364 return false;
3365
3366 // Get the total number of vector elements in the legalized types.
3367 InstructionCost NumDstEls =
3368 DstTyL.first * DstTyL.second.getVectorMinNumElements();
3369 InstructionCost NumSrcEls =
3370 SrcTyL.first * SrcTyL.second.getVectorMinNumElements();
3371
3372 // Return true if the legalized types have the same number of vector elements
3373 // and the destination element type size is twice that of the source type.
3374 return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstEltSize;
3375}
3376
3377Type *AArch64TTIImpl::isBinExtWideningInstruction(unsigned Opcode, Type *DstTy,
3379 Type *SrcOverrideTy) const {
3380 if (Opcode != Instruction::Add && Opcode != Instruction::Sub &&
3381 Opcode != Instruction::Mul)
3382 return nullptr;
3383
3384 // Exit early if DstTy is not a vector type whose elements are one of [i16,
3385 // i32, i64]. SVE doesn't generally have the same set of instructions to
3386 // perform an extend with the add/sub/mul. There are SMULLB style
3387 // instructions, but they operate on top/bottom, requiring some sort of lane
3388 // interleaving to be used with zext/sext.
3389 unsigned DstEltSize = DstTy->getScalarSizeInBits();
3390 if (!useNeonVector(DstTy) || Args.size() != 2 ||
3391 (DstEltSize != 16 && DstEltSize != 32 && DstEltSize != 64))
3392 return nullptr;
3393
3394 auto getScalarSizeWithOverride = [&](const Value *V) {
3395 if (SrcOverrideTy)
3396 return SrcOverrideTy->getScalarSizeInBits();
3397 return cast<Instruction>(V)
3398 ->getOperand(0)
3399 ->getType()
3400 ->getScalarSizeInBits();
3401 };
3402
3403 unsigned MaxEltSize = 0;
3404 if ((isa<SExtInst>(Args[0]) && isa<SExtInst>(Args[1])) ||
3405 (isa<ZExtInst>(Args[0]) && isa<ZExtInst>(Args[1]))) {
3406 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3407 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3408 MaxEltSize = std::max(EltSize0, EltSize1);
3409 } else if (isa<SExtInst, ZExtInst>(Args[0]) &&
3410 isa<SExtInst, ZExtInst>(Args[1])) {
3411 unsigned EltSize0 = getScalarSizeWithOverride(Args[0]);
3412 unsigned EltSize1 = getScalarSizeWithOverride(Args[1]);
3413 // mul(sext, zext) will become smull(sext, zext) if the extends are large
3414 // enough.
3415 if (EltSize0 >= DstEltSize / 2 || EltSize1 >= DstEltSize / 2)
3416 return nullptr;
3417 MaxEltSize = DstEltSize / 2;
3418 } else if (Opcode == Instruction::Mul &&
3419 (isa<ZExtInst>(Args[0]) || isa<ZExtInst>(Args[1]))) {
3420 // If one of the operands is a Zext and the other has enough zero bits
3421 // to be treated as unsigned, we can still generate a umull, meaning the
3422 // zext is free.
3423 KnownBits Known =
3424 computeKnownBits(isa<ZExtInst>(Args[0]) ? Args[1] : Args[0], DL);
3425 if (Args[0]->getType()->getScalarSizeInBits() -
3426 Known.Zero.countLeadingOnes() >
3427 DstTy->getScalarSizeInBits() / 2)
3428 return nullptr;
3429
3430 MaxEltSize =
3431 getScalarSizeWithOverride(isa<ZExtInst>(Args[0]) ? Args[0] : Args[1]);
3432 } else
3433 return nullptr;
3434
3435 if (MaxEltSize * 2 > DstEltSize)
3436 return nullptr;
3437
3438 Type *ExtTy = DstTy->getWithNewBitWidth(MaxEltSize * 2);
3439 if (ExtTy->getPrimitiveSizeInBits() <= 64)
3440 return nullptr;
3441 return ExtTy;
3442}
3443
3444// s/urhadd instructions implement the following pattern, making the
3445// extends free:
3446// %x = add ((zext i8 -> i16), 1)
3447// %y = (zext i8 -> i16)
3448// trunc i16 (lshr (add %x, %y), 1) -> i8
3449//
3451 Type *Src) const {
3452 // The source should be a legal vector type.
3453 if (!Src->isVectorTy() || !TLI->isTypeLegal(TLI->getValueType(DL, Src)) ||
3454 (Src->isScalableTy() && !ST->hasSVE2()))
3455 return false;
3456
3457 if (ExtUser->getOpcode() != Instruction::Add || !ExtUser->hasOneUse())
3458 return false;
3459
3460 // Look for trunc/lshr/add before trying to match the pattern.
3461 const Instruction *Add = ExtUser;
3462 auto *AddUser =
3463 dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3464 if (AddUser && AddUser->getOpcode() == Instruction::Add)
3465 Add = AddUser;
3466
3467 auto *Shr = dyn_cast_or_null<Instruction>(Add->getUniqueUndroppableUser());
3468 if (!Shr || Shr->getOpcode() != Instruction::LShr)
3469 return false;
3470
3471 auto *Trunc = dyn_cast_or_null<Instruction>(Shr->getUniqueUndroppableUser());
3472 if (!Trunc || Trunc->getOpcode() != Instruction::Trunc ||
3473 Src->getScalarSizeInBits() !=
3474 cast<CastInst>(Trunc)->getDestTy()->getScalarSizeInBits())
3475 return false;
3476
3477 // Try to match the whole pattern. Ext could be either the first or second
3478 // m_ZExtOrSExt matched.
3479 Instruction *Ex1, *Ex2;
3480 if (!(match(Add, m_c_Add(m_Instruction(Ex1),
3481 m_c_Add(m_Instruction(Ex2), m_One())))))
3482 return false;
3483
3484 // Ensure both extends are of the same type
3485 if (match(Ex1, m_ZExtOrSExt(m_Value())) &&
3486 Ex1->getOpcode() == Ex2->getOpcode())
3487 return true;
3488
3489 return false;
3490}
3491
3493 Type *Src,
3496 const Instruction *I) const {
3497 int ISD = TLI->InstructionOpcodeToISD(Opcode);
3498 assert(ISD && "Invalid opcode");
3499 // If the cast is observable, and it is used by a widening instruction (e.g.,
3500 // uaddl, saddw, etc.), it may be free.
3501 if (I && I->hasOneUser()) {
3502 auto *SingleUser = cast<Instruction>(*I->user_begin());
3503 SmallVector<const Value *, 4> Operands(SingleUser->operand_values());
3504 if (Type *ExtTy = isBinExtWideningInstruction(
3505 SingleUser->getOpcode(), Dst, Operands,
3506 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3507 // The cost from Src->Src*2 needs to be added if required, the cost from
3508 // Src*2->ExtTy is free.
3509 if (ExtTy->getScalarSizeInBits() > Src->getScalarSizeInBits() * 2) {
3510 Type *DoubleSrcTy =
3511 Src->getWithNewBitWidth(Src->getScalarSizeInBits() * 2);
3512 return getCastInstrCost(Opcode, DoubleSrcTy, Src,
3514 }
3515
3516 return 0;
3517 }
3518
3519 if (isSingleExtWideningInstruction(
3520 SingleUser->getOpcode(), Dst, Operands,
3521 Src != I->getOperand(0)->getType() ? Src : nullptr)) {
3522 // For adds only count the second operand as free if both operands are
3523 // extends but not the same operation. (i.e both operands are not free in
3524 // add(sext, zext)).
3525 if (SingleUser->getOpcode() == Instruction::Add) {
3526 if (I == SingleUser->getOperand(1) ||
3527 (isa<CastInst>(SingleUser->getOperand(1)) &&
3528 cast<CastInst>(SingleUser->getOperand(1))->getOpcode() == Opcode))
3529 return 0;
3530 } else {
3531 // Others are free so long as isSingleExtWideningInstruction
3532 // returned true.
3533 return 0;
3534 }
3535 }
3536
3537 // The cast will be free for the s/urhadd instructions
3538 if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) &&
3539 isExtPartOfAvgExpr(SingleUser, Dst, Src))
3540 return 0;
3541 }
3542
3543 EVT SrcTy = TLI->getValueType(DL, Src);
3544 EVT DstTy = TLI->getValueType(DL, Dst);
3545
3546 if (!SrcTy.isSimple() || !DstTy.isSimple())
3547 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
3548
3549 // For the moment we do not have lowering for SVE1-only fptrunc f64->bf16 as
3550 // we use fcvtx under SVE2. Give them invalid costs.
3551 if (!ST->hasSVE2() && !ST->isStreamingSVEAvailable() &&
3552 ISD == ISD::FP_ROUND && SrcTy.isScalableVector() &&
3553 DstTy.getScalarType() == MVT::bf16 && SrcTy.getScalarType() == MVT::f64)
3555
3556 static const TypeConversionCostTblEntry BF16Tbl[] = {
3557 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 1}, // bfcvt
3558 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 1}, // bfcvt
3559 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 1}, // bfcvtn
3560 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 2}, // bfcvtn+bfcvtn2
3561 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 2}, // bfcvtn+fcvtn
3562 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 3}, // fcvtn+fcvtl2+bfcvtn
3563 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+bfcvtn
3564 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 1}, // bfcvt
3565 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 1}, // bfcvt
3566 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 3}, // bfcvt+bfcvt+uzp1
3567 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 2}, // fcvtx+bfcvt
3568 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 5}, // 2*fcvtx+2*bfcvt+uzp1
3569 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 11}, // 4*fcvt+4*bfcvt+3*uzp
3570 };
3571
3572 if (ST->hasBF16())
3573 if (const auto *Entry = ConvertCostTableLookup(
3574 BF16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
3575 return Entry->Cost;
3576
3577 // We have to estimate a cost of fixed length operation upon
3578 // SVE registers(operations) with the number of registers required
3579 // for a fixed type to be represented upon SVE registers.
3580 EVT WiderTy = SrcTy.bitsGT(DstTy) ? SrcTy : DstTy;
3581 if (SrcTy.isFixedLengthVector() && DstTy.isFixedLengthVector() &&
3582 SrcTy.getVectorNumElements() == DstTy.getVectorNumElements() &&
3583 ST->useSVEForFixedLengthVectors(WiderTy)) {
3584 std::pair<InstructionCost, MVT> LT =
3585 getTypeLegalizationCost(WiderTy.getTypeForEVT(Dst->getContext()));
3586 unsigned NumElements =
3587 AArch64::SVEBitsPerBlock / LT.second.getScalarSizeInBits();
3588 return LT.first *
3590 Opcode,
3591 ScalableVectorType::get(Dst->getScalarType(), NumElements),
3592 ScalableVectorType::get(Src->getScalarType(), NumElements), CCH,
3593 CostKind, I);
3594 }
3595
3596 // Symbolic constants for the SVE sitofp/uitofp entries in the table below
3597 // The cost of unpacking twice is artificially increased for now in order
3598 // to avoid regressions against NEON, which will use tbl instructions directly
3599 // instead of multiple layers of [s|u]unpk[lo|hi].
3600 // We use the unpacks in cases where the destination type is illegal and
3601 // requires splitting of the input, even if the input type itself is legal.
3602 const unsigned int SVE_EXT_COST = 1;
3603 const unsigned int SVE_FCVT_COST = 1;
3604 const unsigned int SVE_UNPACK_ONCE = 4;
3605 const unsigned int SVE_UNPACK_TWICE = 16;
3606
// Target-specific conversion costs, keyed by (ISD opcode, destination MVT,
// source MVT). The table is queried right after its definition with
// ConvertCostTableLookup using the simple destination/source value types.
// Every SVE [S|U]INT_TO_FP group lists the signed rows first and then the
// matching unsigned rows with the same costs.
3607 static const TypeConversionCostTblEntry ConversionTbl[] = {
3608 {ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1}, // xtn
3609 {ISD::TRUNCATE, MVT::v2i16, MVT::v2i64, 1}, // xtn
3610 {ISD::TRUNCATE, MVT::v2i32, MVT::v2i64, 1}, // xtn
3611 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1}, // xtn
3612 {ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 3}, // 2 xtn + 1 uzp1
3613 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1}, // xtn
3614 {ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2}, // 1 uzp1 + 1 xtn
3615 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1}, // 1 uzp1
3616 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1}, // 1 xtn
3617 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2}, // 1 uzp1 + 1 xtn
3618 {ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 4}, // 3 x uzp1 + xtn
3619 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1}, // 1 uzp1
3620 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 3}, // 3 x uzp1
3621 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 2}, // 2 x uzp1
3622 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 1}, // uzp1
3623 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 3}, // (2 + 1) x uzp1
3624 {ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 7}, // (4 + 2 + 1) x uzp1
3625 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2}, // 2 x uzp1
3626 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i64, 6}, // (4 + 2) x uzp1
3627 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i64, 4}, // 4 x uzp1
3628
3629 // Truncations on nxvmiN
3630 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i8, 2},
3631 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i16, 2},
3632 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i32, 2},
3633 {ISD::TRUNCATE, MVT::nxv2i1, MVT::nxv2i64, 2},
3634 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i8, 2},
3635 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i16, 2},
3636 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i32, 2},
3637 {ISD::TRUNCATE, MVT::nxv4i1, MVT::nxv4i64, 5},
3638 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i8, 2},
3639 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i16, 2},
3640 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i32, 5},
3641 {ISD::TRUNCATE, MVT::nxv8i1, MVT::nxv8i64, 11},
3642 {ISD::TRUNCATE, MVT::nxv16i1, MVT::nxv16i8, 2},
3643 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i16, 0},
3644 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i32, 0},
3645 {ISD::TRUNCATE, MVT::nxv2i8, MVT::nxv2i64, 0},
3646 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i32, 0},
3647 {ISD::TRUNCATE, MVT::nxv2i16, MVT::nxv2i64, 0},
3648 {ISD::TRUNCATE, MVT::nxv2i32, MVT::nxv2i64, 0},
3649 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i16, 0},
3650 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i32, 0},
3651 {ISD::TRUNCATE, MVT::nxv4i8, MVT::nxv4i64, 1},
3652 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i32, 0},
3653 {ISD::TRUNCATE, MVT::nxv4i16, MVT::nxv4i64, 1},
3654 {ISD::TRUNCATE, MVT::nxv4i32, MVT::nxv4i64, 1},
3655 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i16, 0},
3656 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i32, 1},
3657 {ISD::TRUNCATE, MVT::nxv8i8, MVT::nxv8i64, 3},
3658 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i32, 1},
3659 {ISD::TRUNCATE, MVT::nxv8i16, MVT::nxv8i64, 3},
3660 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i16, 1},
3661 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i32, 3},
3662 {ISD::TRUNCATE, MVT::nxv16i8, MVT::nxv16i64, 7},
3663
3664 // The number of shll instructions for the extension.
3665 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3666 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3},
3667 {ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3668 {ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 2},
3669 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3670 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3},
3671 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3672 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 2},
3673 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3674 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7},
3675 {ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3676 {ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6},
3677 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3678 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 2},
3679 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3680 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6},
3681
3682 // FP Ext and trunc
3683 {ISD::FP_EXTEND, MVT::f64, MVT::f32, 1}, // fcvt
3684 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f32, 1}, // fcvtl
3685 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f32, 2}, // fcvtl+fcvtl2
3686 // FP16
3687 {ISD::FP_EXTEND, MVT::f32, MVT::f16, 1}, // fcvt
3688 {ISD::FP_EXTEND, MVT::f64, MVT::f16, 1}, // fcvt
3689 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1}, // fcvtl
3690 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 2}, // fcvtl+fcvtl2
3691 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2f16, 2}, // fcvtl+fcvtl
3692 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4f16, 3}, // fcvtl+fcvtl2+fcvtl
3693 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8f16, 6}, // 2 * fcvtl+fcvtl2+fcvtl
3694 // BF16 (uses shift)
3695 {ISD::FP_EXTEND, MVT::f32, MVT::bf16, 1}, // shl
3696 {ISD::FP_EXTEND, MVT::f64, MVT::bf16, 2}, // shl+fcvt
3697 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4bf16, 1}, // shll
3698 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8bf16, 2}, // shll+shll2
3699 {ISD::FP_EXTEND, MVT::v2f64, MVT::v2bf16, 2}, // shll+fcvtl
3700 {ISD::FP_EXTEND, MVT::v4f64, MVT::v4bf16, 3}, // shll+fcvtl+fcvtl2
3701 {ISD::FP_EXTEND, MVT::v8f64, MVT::v8bf16, 6}, // 2 * shll+fcvtl+fcvtl2
3702 // FP Ext and trunc
3703 {ISD::FP_ROUND, MVT::f32, MVT::f64, 1}, // fcvt
3704 {ISD::FP_ROUND, MVT::v2f32, MVT::v2f64, 1}, // fcvtn
3705 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f64, 2}, // fcvtn+fcvtn2
3706 // FP16
3707 {ISD::FP_ROUND, MVT::f16, MVT::f32, 1}, // fcvt
3708 {ISD::FP_ROUND, MVT::f16, MVT::f64, 1}, // fcvt
3709 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f32, 1}, // fcvtn
3710 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f32, 2}, // fcvtn+fcvtn2
3711 {ISD::FP_ROUND, MVT::v2f16, MVT::v2f64, 2}, // fcvtn+fcvtn
3712 {ISD::FP_ROUND, MVT::v4f16, MVT::v4f64, 3}, // fcvtn+fcvtn2+fcvtn
3713 {ISD::FP_ROUND, MVT::v8f16, MVT::v8f64, 6}, // 2 * fcvtn+fcvtn2+fcvtn
3714 // BF16 (more complex, with +bf16 is handled above)
3715 {ISD::FP_ROUND, MVT::bf16, MVT::f32, 8}, // Expansion is ~8 insns
3716 {ISD::FP_ROUND, MVT::bf16, MVT::f64, 9}, // fcvtn + above
3717 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f32, 8},
3718 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f32, 8},
3719 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f32, 15},
3720 {ISD::FP_ROUND, MVT::v2bf16, MVT::v2f64, 9},
3721 {ISD::FP_ROUND, MVT::v4bf16, MVT::v4f64, 10},
3722 {ISD::FP_ROUND, MVT::v8bf16, MVT::v8f64, 19},
3723
3724 // LowerVectorINT_TO_FP:
3725 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3726 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3727 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3728 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1},
3729 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1},
3730 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1},
3731
3732 // SVE: to nxv2f16
3733 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3734 SVE_EXT_COST + SVE_FCVT_COST},
3735 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3736 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3737 {ISD::SINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3738 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i8,
3739 SVE_EXT_COST + SVE_FCVT_COST},
3740 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i16, SVE_FCVT_COST},
3741 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i32, SVE_FCVT_COST},
3742 {ISD::UINT_TO_FP, MVT::nxv2f16, MVT::nxv2i64, SVE_FCVT_COST},
3743
3744 // SVE: to nxv4f16
3745 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3746 SVE_EXT_COST + SVE_FCVT_COST},
3747 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3748 {ISD::SINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3749 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i8,
3750 SVE_EXT_COST + SVE_FCVT_COST},
3751 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i16, SVE_FCVT_COST},
3752 {ISD::UINT_TO_FP, MVT::nxv4f16, MVT::nxv4i32, SVE_FCVT_COST},
3753
3754 // SVE: to nxv8f16
3755 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3756 SVE_EXT_COST + SVE_FCVT_COST},
3757 {ISD::SINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3758 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i8,
3759 SVE_EXT_COST + SVE_FCVT_COST},
3760 {ISD::UINT_TO_FP, MVT::nxv8f16, MVT::nxv8i16, SVE_FCVT_COST},
3761
3762 // SVE: to nxv16f16
3763 {ISD::SINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3764 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3765 {ISD::UINT_TO_FP, MVT::nxv16f16, MVT::nxv16i8,
3766 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3767
3768 // Complex: to v2f32
3769 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3770 {ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3771 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3},
3772 {ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 3},
3773
3774 // SVE: to nxv2f32
3775 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3776 SVE_EXT_COST + SVE_FCVT_COST},
3777 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3778 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3779 {ISD::SINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3780 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i8,
3781 SVE_EXT_COST + SVE_FCVT_COST},
3782 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i16, SVE_FCVT_COST},
3783 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i32, SVE_FCVT_COST},
3784 {ISD::UINT_TO_FP, MVT::nxv2f32, MVT::nxv2i64, SVE_FCVT_COST},
3785
3786 // Complex: to v4f32
3787 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 4},
3788 {ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3789 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3},
3790 {ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2},
3791
3792 // SVE: to nxv4f32
3793 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3794 SVE_EXT_COST + SVE_FCVT_COST},
3795 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3796 {ISD::SINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3797 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i8,
3798 SVE_EXT_COST + SVE_FCVT_COST},
3799 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i16, SVE_FCVT_COST},
3800 {ISD::UINT_TO_FP, MVT::nxv4f32, MVT::nxv4i32, SVE_FCVT_COST},
3801
3802 // Complex: to v8f32
3803 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3804 {ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3805 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 10},
3806 {ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4},
3807
3808 // SVE: to nxv8f32
3809 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3810 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3811 {ISD::SINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3812 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3813 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i8,
3814 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3815 {ISD::UINT_TO_FP, MVT::nxv8f32, MVT::nxv8i16,
3816 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3817
3818 // SVE: to nxv16f32
3819 {ISD::SINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3820 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3821 {ISD::UINT_TO_FP, MVT::nxv16f32, MVT::nxv16i8,
3822 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3823
3824 // Complex: to v16f32
3825 {ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3826 {ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 21},
3827
3828 // Complex: to v2f64
3829 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3830 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3831 {ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3832 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4},
3833 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 4},
3834 {ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2},
3835
3836 // SVE: to nxv2f64
3837 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3838 SVE_EXT_COST + SVE_FCVT_COST},
3839 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3840 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3841 {ISD::SINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3842 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i8,
3843 SVE_EXT_COST + SVE_FCVT_COST},
3844 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i16, SVE_FCVT_COST},
3845 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i32, SVE_FCVT_COST},
3846 {ISD::UINT_TO_FP, MVT::nxv2f64, MVT::nxv2i64, SVE_FCVT_COST},
3847
3848 // Complex: to v4f64
3849 {ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3850 {ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 4},
3851
3852 // SVE: to nxv4f64
3853 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3854 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3855 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3856 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3857 {ISD::SINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3858 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3859 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i8,
3860 SVE_EXT_COST + SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3861 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i16,
3862 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3863 {ISD::UINT_TO_FP, MVT::nxv4f64, MVT::nxv4i32,
3864 SVE_UNPACK_ONCE + 2 * SVE_FCVT_COST},
3865
3866 // SVE: to nxv8f64
3867 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3868 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3869 {ISD::SINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3870 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3871 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i8,
3872 SVE_EXT_COST + SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3873 {ISD::UINT_TO_FP, MVT::nxv8f64, MVT::nxv8i16,
3874 SVE_UNPACK_TWICE + 4 * SVE_FCVT_COST},
3875
3876 // LowerVectorFP_TO_INT
3877 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f32, 1},
3878 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1},
3879 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1},
3880 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1},
3881 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1},
3882 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1},
3883
3884 // Complex, from v2f32: legal type is v2i32 (no cost) or v2i64 (1 ext).
3885 {ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 2},
3886 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 1},
3887 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 1},
3888 {ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 2},
3889 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 1},
3890 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 1},
3891
3892 // Complex, from v4f32: legal type is v4i16, 1 narrowing => ~2
3893 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2},
3894 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 2},
3895 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2},
3896 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 2},
3897
3898 // Complex, from v2f64: legal type is v2i32, 1 narrowing => ~2.
3899 {ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2},
3900 {ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2},
3901 {ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 2},
3902 {ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2},
3903 {ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2},
3904 {ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 2},
3905
3906 // Complex, from nxv2f32.
3907 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3908 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3909 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3910 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3911 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f32, 1},
3912 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f32, 1},
3913 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f32, 1},
3914 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f32, 1},
3915
3916 // Complex, from nxv2f64.
3917 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3918 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3919 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3920 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3921 {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3922 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
3923 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
3924 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
3925 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
3926 {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
3927
3928 // Complex, from nxv4f32.
3929 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3930 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3931 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3932 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3933 {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3934 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
3935 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
3936 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
3937 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
3938 {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
3939
3940 // Complex, from nxv8f64. Illegal -> illegal conversions not required.
3941 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3942 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3943 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f64, 7},
3944 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f64, 7},
3945
3946 // Complex, from nxv4f64. Illegal -> illegal conversions not required.
3947 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3948 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3949 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3950 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f64, 3},
3951 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f64, 3},
3952 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f64, 3},
3953
3954 // Complex, from nxv8f32. Illegal -> illegal conversions not required.
3955 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3956 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3957 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f32, 3},
3958 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f32, 3},
3959
3960 // Complex, from nxv8f16.
3961 {ISD::FP_TO_SINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3962 {ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3963 {ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3964 {ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3965 {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3966 {ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
3967 {ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
3968 {ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
3969 {ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
3970 {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
3971
3972 // Complex, from nxv4f16.
3973 {ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3974 {ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3975 {ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3976 {ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3977 {ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f16, 4},
3978 {ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f16, 1},
3979 {ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f16, 1},
3980 {ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f16, 1},
3981
3982 // Complex, from nxv2f16.
3983 {ISD::FP_TO_SINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3984 {ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3985 {ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3986 {ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3987 {ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f16, 1},
3988 {ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f16, 1},
3989 {ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f16, 1},
3990 {ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f16, 1},
3991
3992 // Truncate from nxvmf32 to nxvmf16.
3993 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f32, 1},
3994 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f32, 1},
3995 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 3},
3996
3997 // Truncate from nxvmf32 to nxvmbf16.
3998 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f32, 8},
3999 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f32, 8},
4000 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f32, 17},
4001
4002 // Truncate from nxvmf64 to nxvmf16.
4003 {ISD::FP_ROUND, MVT::nxv2f16, MVT::nxv2f64, 1},
4004 {ISD::FP_ROUND, MVT::nxv4f16, MVT::nxv4f64, 3},
4005 {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f64, 7},
4006
4007 // Truncate from nxvmf64 to nxvmbf16.
4008 {ISD::FP_ROUND, MVT::nxv2bf16, MVT::nxv2f64, 9},
4009 {ISD::FP_ROUND, MVT::nxv4bf16, MVT::nxv4f64, 19},
4010 {ISD::FP_ROUND, MVT::nxv8bf16, MVT::nxv8f64, 39},
4011
4012 // Truncate from nxvmf64 to nxvmf32.
4013 {ISD::FP_ROUND, MVT::nxv2f32, MVT::nxv2f64, 1},
4014 {ISD::FP_ROUND, MVT::nxv4f32, MVT::nxv4f64, 3},
4015 {ISD::FP_ROUND, MVT::nxv8f32, MVT::nxv8f64, 6},
4016
4017 // Extend from nxvmf16 to nxvmf32.
4018 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2f16, 1},
4019 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4f16, 1},
4020 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8f16, 2},
4021
4022 // Extend from nxvmbf16 to nxvmf32.
4023 {ISD::FP_EXTEND, MVT::nxv2f32, MVT::nxv2bf16, 1}, // lsl
4024 {ISD::FP_EXTEND, MVT::nxv4f32, MVT::nxv4bf16, 1}, // lsl
4025 {ISD::FP_EXTEND, MVT::nxv8f32, MVT::nxv8bf16, 4}, // unpck+unpck+lsl+lsl
4026
4027 // Extend from nxvmf16 to nxvmf64.
4028 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f16, 1},
4029 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f16, 2},
4030 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f16, 4},
4031
4032 // Extend from nxvmbf16 to nxvmf64.
4033 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2bf16, 2}, // lsl+fcvt
4034 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4bf16, 6}, // 2*unpck+2*lsl+2*fcvt
4035 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8bf16, 14}, // 6*unpck+4*lsl+4*fcvt
4036
4037 // Extend from nxvmf32 to nxvmf64.
4038 {ISD::FP_EXTEND, MVT::nxv2f64, MVT::nxv2f32, 1},
4039 {ISD::FP_EXTEND, MVT::nxv4f64, MVT::nxv4f32, 2},
4040 {ISD::FP_EXTEND, MVT::nxv8f64, MVT::nxv8f32, 6},
4041
4042 // Bitcasts from float to integer
4043 {ISD::BITCAST, MVT::nxv2f16, MVT::nxv2i16, 0},
4044 {ISD::BITCAST, MVT::nxv4f16, MVT::nxv4i16, 0},
4045 {ISD::BITCAST, MVT::nxv2f32, MVT::nxv2i32, 0},
4046
4047 // Bitcasts from integer to float
4048 {ISD::BITCAST, MVT::nxv2i16, MVT::nxv2f16, 0},
4049 {ISD::BITCAST, MVT::nxv4i16, MVT::nxv4f16, 0},
4050 {ISD::BITCAST, MVT::nxv2i32, MVT::nxv2f32, 0},
4051
4052 // Add cost for extending to illegal -too wide- scalable vectors.
4053 // zero/sign extend are implemented by multiple unpack operations,
4054 // where each operation has a cost of 1.
4055 {ISD::ZERO_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
4056 {ISD::ZERO_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
4057 {ISD::ZERO_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
4058 {ISD::ZERO_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
4059 {ISD::ZERO_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
4060 {ISD::ZERO_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
4061
4062 {ISD::SIGN_EXTEND, MVT::nxv16i16, MVT::nxv16i8, 2},
4063 {ISD::SIGN_EXTEND, MVT::nxv16i32, MVT::nxv16i8, 6},
4064 {ISD::SIGN_EXTEND, MVT::nxv16i64, MVT::nxv16i8, 14},
4065 {ISD::SIGN_EXTEND, MVT::nxv8i32, MVT::nxv8i16, 2},
4066 {ISD::SIGN_EXTEND, MVT::nxv8i64, MVT::nxv8i16, 6},
4067 {ISD::SIGN_EXTEND, MVT::nxv4i64, MVT::nxv4i32, 2},
4068 };
4069
4070 if (const auto *Entry = ConvertCostTableLookup(
4071 ConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
4072 return Entry->Cost;
4073
// Conversion costs between integer vectors and f16 vectors, keyed by
// (ISD opcode, destination MVT, source MVT). The lookup that follows the table
// only consults it when the subtarget reports hasFullFP16(). Trailing comments
// name the instruction sequence behind each cost; an unsigned row shares the
// sequence of the signed row above it.
4074 static const TypeConversionCostTblEntry FP16Tbl[] = {
4075 {ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f16, 1}, // fcvtzs
4076 {ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f16, 1},
4077 {ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f16, 1}, // fcvtzs
4078 {ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f16, 1},
4079 {ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f16, 2}, // fcvtl+fcvtzs
4080 {ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f16, 2},
4081 {ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f16, 2}, // fcvtzs+xtn
4082 {ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f16, 2},
4083 {ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f16, 1}, // fcvtzs
4084 {ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f16, 1},
4085 {ISD::FP_TO_SINT, MVT::v8i32, MVT::v8f16, 4}, // 2*fcvtl+2*fcvtzs
4086 {ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f16, 4},
4087 {ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f16, 3}, // 2*fcvtzs+xtn
4088 {ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f16, 3},
4089 {ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f16, 2}, // 2*fcvtzs
4090 {ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f16, 2},
4091 {ISD::FP_TO_SINT, MVT::v16i32, MVT::v16f16, 8}, // 4*fcvtl+4*fcvtzs
4092 {ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f16, 8},
4093 {ISD::UINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // ushll + ucvtf
4094 {ISD::SINT_TO_FP, MVT::v8f16, MVT::v8i8, 2}, // sshll + scvtf
4095 {ISD::UINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * ushll(2) + 2 * ucvtf
4096 {ISD::SINT_TO_FP, MVT::v16f16, MVT::v16i8, 4}, // 2 * sshll(2) + 2 * scvtf
4097 };
4098
4099 if (ST->hasFullFP16())
4100 if (const auto *Entry = ConvertCostTableLookup(
4101 FP16Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
4102 return Entry->Cost;
4103
4104 // INT_TO_FP of i64->f32 will scalarize, which is required to avoid
4105 // double-rounding issues.
4106 if ((ISD == ISD::SINT_TO_FP || ISD == ISD::UINT_TO_FP) &&
4107 DstTy.getScalarType() == MVT::f32 && SrcTy.getScalarSizeInBits() > 32 &&
4109 return cast<FixedVectorType>(Dst)->getNumElements() *
4110 getCastInstrCost(Opcode, Dst->getScalarType(),
4111 Src->getScalarType(), CCH, CostKind) +
4113 true, CostKind) +
4115 false, CostKind);
4116
4117 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4119 ST->isSVEorStreamingSVEAvailable() &&
4120 TLI->getTypeAction(Src->getContext(), SrcTy) ==
4122 TLI->getTypeAction(Dst->getContext(), DstTy) ==
4124 // The standard behaviour in the backend for these cases is to split the
4125 // extend up into two parts:
4126 // 1. Perform an extending load or masked load up to the legal type.
4127 // 2. Extend the loaded data to the final type.
4128 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
4129 Type *LegalTy = EVT(SrcLT.second).getTypeForEVT(Src->getContext());
4131 Opcode, LegalTy, Src, CCH, CostKind, I);
4133 Opcode, Dst, LegalTy, TTI::CastContextHint::None, CostKind, I);
4134 return Part1 + Part2;
4135 }
4136
4137 // The BasicTTIImpl version only deals with CCH==TTI::CastContextHint::Normal,
4138 // but we also want to include the TTI::CastContextHint::Masked case too.
4139 if ((ISD == ISD::ZERO_EXTEND || ISD == ISD::SIGN_EXTEND) &&
4141 ST->isSVEorStreamingSVEAvailable() && TLI->isTypeLegal(DstTy))
4143
4144 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
4145}
4146
4149 VectorType *VecTy, unsigned Index,
4151
// Cost of extracting element `Index` from a `VecTy` vector and sign- or
// zero-extending it to `Dst`. The result is the extract cost alone when the
// extension is folded into the lane move (see the switch below), otherwise the
// extract cost plus the stand-alone cast cost.
// NOTE(review): the leading lines of this signature (return type, function
// name, the Opcode/Dst/CostKind parameters) are absent from this listing;
// confirm them against the original file.
4152 // Make sure we were given a valid extend opcode.
4153 assert((Opcode == Instruction::SExt || Opcode == Instruction::ZExt) &&
4154 "Invalid opcode");
4155
4156 // We are extending an element we extract from a vector, so the source type
4157 // of the extend is the element type of the vector.
4158 auto *Src = VecTy->getElementType();
4159
4160 // Sign- and zero-extends are for integer types only.
4161 assert(isa<IntegerType>(Dst) && isa<IntegerType>(Src) && "Invalid type");
4162
4163 // Get the cost for the extract. We compute the cost (if any) for the extend
4164 // below.
4165 InstructionCost Cost = getVectorInstrCost(Instruction::ExtractElement, VecTy,
4166 CostKind, Index, nullptr, nullptr);
4167
4168 // Legalize the types.
4169 auto VecLT = getTypeLegalizationCost(VecTy);
4170 auto DstVT = TLI->getValueType(DL, Dst);
4171 auto SrcVT = TLI->getValueType(DL, Src);
4172
4173 // If the resulting type is still a vector and the destination type is legal,
4174 // we may get the extension for free. If not, get the default cost for the
4175 // extend.
4176 if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
4177 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4178 CostKind);
4179
4180 // The destination type should be larger than the element type. If not, get
4181 // the default cost for the extend.
// Only a strictly smaller destination takes this path; equal sizes continue to
// the switch below.
4182 if (DstVT.getFixedSizeInBits() < SrcVT.getFixedSizeInBits())
4183 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4184 CostKind);
4185
4186 switch (Opcode) {
4187 default:
4188 llvm_unreachable("Opcode should be either SExt or ZExt");
4189
4190 // For sign-extends, we only need a smov, which performs the extension
4191 // automatically.
4192 case Instruction::SExt:
4193 return Cost;
4194
4195 // For zero-extends, the extend is performed automatically by a umov unless
4196 // the destination type is i64 and the element type is i8 or i16.
// When the condition below is false, control leaves the switch and reaches the
// default-cost return at the end of the function.
4197 case Instruction::ZExt:
4198 if (DstVT.getSizeInBits() != 64u || SrcVT.getSizeInBits() == 32u)
4199 return Cost;
4200 }
4201
4202 // If we are unable to perform the extend for free, get the default cost.
4203 return Cost + getCastInstrCost(Opcode, Dst, Src, TTI::CastContextHint::None,
4204 CostKind);
4205}
4206
4209 const Instruction *I) const {
// Cost of a control-flow instruction identified by `Opcode`.
// NOTE(review): the leading signature lines and embedded listing line 4210 are
// absent here. The return just below is presumably guarded by a check that
// CostKind is not TCK_RecipThroughput (the assert after it is only reachable
// in that case) -- confirm against the original file.
// On that path a PHI costs 0 and every other opcode costs 1.
4211 return Opcode == Instruction::PHI ? 0 : 1;
4212 assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
4213 // Branches are assumed to be predicted.
4214 return 0;
4215}
4216
4217InstructionCost AArch64TTIImpl::getVectorInstrCostHelper(
4218 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4219 const Instruction *I, Value *Scalar,
4220 ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4221 TTI::VectorInstrContext VIC) const {
4222 assert(Val->isVectorTy() && "This must be a vector type");
4223
4224 if (Index != -1U) {
4225 // Legalize the type.
4226 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
4227
4228 // This type is legalized to a scalar type.
4229 if (!LT.second.isVector())
4230 return 0;
4231
4232 // The type may be split. For fixed-width vectors we can normalize the
4233 // index to the new type.
4234 if (LT.second.isFixedLengthVector()) {
4235 unsigned Width = LT.second.getVectorNumElements();
4236 Index = Index % Width;
4237 }
4238
4239 // The element at index zero is already inside the vector, so accessing
4240 // it is free -- except for integer elements: an insert-element or
4241 // extract-element of an integer needs an explicit move between an FPR
4242 // and a GPR, so it has a non-zero cost.
4243 if (Index == 0 && !Val->getScalarType()->isIntegerTy())
4244 return 0;
4245
4246 // This is recognising a LD1 single-element structure to one lane of one
4247 // register instruction. I.e., if this is an `insertelement` instruction,
4248 // and its second operand is a load, then we will generate a LD1, which
4249 // are expensive instructions on some uArchs.
4250 if (VIC == TTI::VectorInstrContext::Load) {
4251 if (ST->hasFastLD1Single())
4252 return 0;
4253 return CostKind == TTI::TCK_CodeSize
4254 ? 0
4256 }
4257
4258 // i1 inserts and extract will include an extra cset or cmp of the vector
4259 // value. Increase the cost by 1 to account.
4260 if (Val->getScalarSizeInBits() == 1)
4261 return CostKind == TTI::TCK_CodeSize
4262 ? 2
4263 : ST->getVectorInsertExtractBaseCost() + 1;
4264
4265 // FIXME:
4266 // If the extract-element and insert-element instructions could be
4267 // simplified away (e.g., could be combined into users by looking at use-def
4268 // context), they have no cost. This is not done in the first place for
4269 // compile-time considerations.
4270 }
4271
4272 // In case of Neon, if there exists extractelement from lane != 0 such that
4273 // 1. extractelement does not necessitate a move from vector_reg -> GPR.
4274 // 2. extractelement result feeds into fmul.
4275 // 3. Other operand of fmul is an extractelement from lane 0 or lane
4276 // equivalent to 0.
4277 // then the extractelement can be merged with fmul in the backend and it
4278 // incurs no cost.
4279 // e.g.
4280 // define double @foo(<2 x double> %a) {
4281 // %1 = extractelement <2 x double> %a, i32 0
4282 // %2 = extractelement <2 x double> %a, i32 1
4283 // %res = fmul double %1, %2
4284 // ret double %res
4285 // }
4286 // %2 and %res can be merged in the backend to generate fmul d0, d0, v1.d[1]
4287 auto ExtractCanFuseWithFmul = [&]() {
4288 // We bail out if the extract is from lane 0.
4289 if (Index == 0)
4290 return false;
4291
4292 // Check if the scalar element type of the vector operand of ExtractElement
4293 // instruction is one of the allowed types.
4294 auto IsAllowedScalarTy = [&](const Type *T) {
4295 return T->isFloatTy() || T->isDoubleTy() ||
4296 (T->isHalfTy() && ST->hasFullFP16());
4297 };
4298
4299 // Check if the extractelement user is scalar fmul.
4300 auto IsUserFMulScalarTy = [](const Value *EEUser) {
4301 // Check if the user is scalar fmul.
4302 const auto *BO = dyn_cast<BinaryOperator>(EEUser);
4303 return BO && BO->getOpcode() == BinaryOperator::FMul &&
4304 !BO->getType()->isVectorTy();
4305 };
4306
4307 // Check if the extract index is from lane 0 or lane equivalent to 0 for a
4308 // certain scalar type and a certain vector register width.
4309 auto IsExtractLaneEquivalentToZero = [&](unsigned Idx, unsigned EltSz) {
4310 auto RegWidth =
4312 .getFixedValue();
4313 return Idx == 0 || (RegWidth != 0 && (Idx * EltSz) % RegWidth == 0);
4314 };
4315
4316 // Check if the type constraints on input vector type and result scalar type
4317 // of extractelement instruction are satisfied.
4318 if (!isa<FixedVectorType>(Val) || !IsAllowedScalarTy(Val->getScalarType()))
4319 return false;
4320
4321 if (Scalar) {
4322 DenseMap<User *, unsigned> UserToExtractIdx;
4323 for (auto *U : Scalar->users()) {
4324 if (!IsUserFMulScalarTy(U))
4325 return false;
4326 // Recording entry for the user is important. Index value is not
4327 // important.
4328 UserToExtractIdx[U];
4329 }
4330 if (UserToExtractIdx.empty())
4331 return false;
4332 for (auto &[S, U, L] : ScalarUserAndIdx) {
4333 for (auto *U : S->users()) {
4334 if (UserToExtractIdx.contains(U)) {
4335 auto *FMul = cast<BinaryOperator>(U);
4336 auto *Op0 = FMul->getOperand(0);
4337 auto *Op1 = FMul->getOperand(1);
4338 if ((Op0 == S && Op1 == S) || Op0 != S || Op1 != S) {
4339 UserToExtractIdx[U] = L;
4340 break;
4341 }
4342 }
4343 }
4344 }
4345 for (auto &[U, L] : UserToExtractIdx) {
4346 if (!IsExtractLaneEquivalentToZero(Index, Val->getScalarSizeInBits()) &&
4347 !IsExtractLaneEquivalentToZero(L, Val->getScalarSizeInBits()))
4348 return false;
4349 }
4350 } else {
4351 const auto *EE = cast<ExtractElementInst>(I);
4352
4353 const auto *IdxOp = dyn_cast<ConstantInt>(EE->getIndexOperand());
4354 if (!IdxOp)
4355 return false;
4356
4357 return !EE->users().empty() && all_of(EE->users(), [&](const User *U) {
4358 if (!IsUserFMulScalarTy(U))
4359 return false;
4360
4361 // Check if the other operand of extractelement is also extractelement
4362 // from lane equivalent to 0.
4363 const auto *BO = cast<BinaryOperator>(U);
4364 const auto *OtherEE = dyn_cast<ExtractElementInst>(
4365 BO->getOperand(0) == EE ? BO->getOperand(1) : BO->getOperand(0));
4366 if (OtherEE) {
4367 const auto *IdxOp = dyn_cast<ConstantInt>(OtherEE->getIndexOperand());
4368 if (!IdxOp)
4369 return false;
4370 return IsExtractLaneEquivalentToZero(
4371 cast<ConstantInt>(OtherEE->getIndexOperand())
4372 ->getValue()
4373 .getZExtValue(),
4374 OtherEE->getType()->getScalarSizeInBits());
4375 }
4376 return true;
4377 });
4378 }
4379 return true;
4380 };
4381
4382 if (Opcode == Instruction::ExtractElement && (I || Scalar) &&
4383 ExtractCanFuseWithFmul())
4384 return 0;
4385
4386 // All other insert/extracts cost this much.
4387 return CostKind == TTI::TCK_CodeSize ? 1
4388 : ST->getVectorInsertExtractBaseCost();
4389}
4390
4392 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4393 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
// Operand-based query: only the first operand (Op0) is inspected here; Op1 is
// not consulted. Apart from the special case below, the cost comes from the
// shared helper, which is called with no instruction, no scalar value and an
// empty scalar-user list.
4394 // Treat insert at lane 0 into a poison vector as having zero cost. This
4395 // ensures vector broadcasts via an insert + shuffle (which will be lowered to
4396 // a single dup) are treated as cheap.
4397 if (Opcode == Instruction::InsertElement && Index == 0 && Op0 &&
4398 isa<PoisonValue>(Op0))
4399 return 0;
4400 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr,
4401 nullptr, {}, VIC);
4402}
4403
4405 unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index,
4406 Value *Scalar, ArrayRef<std::tuple<Value *, User *, int>> ScalarUserAndIdx,
4407 TTI::VectorInstrContext VIC) const {
// Scalar-based query: the scalar value and its (value, user, lane) tuples are
// forwarded unchanged to the shared helper. No instruction is supplied, so the
// helper receives nullptr in the instruction slot.
4408 return getVectorInstrCostHelper(Opcode, Val, CostKind, Index, nullptr, Scalar,
4409 ScalarUserAndIdx, VIC);
4410}
4411
4414 TTI::TargetCostKind CostKind, unsigned Index,
4415 TTI::VectorInstrContext VIC) const {
// Instruction-based query: the opcode is taken from I, and I itself is handed
// to the shared helper as context. No scalar value or scalar-user list is
// supplied (nullptr and an empty list respectively).
4416 return getVectorInstrCostHelper(I.getOpcode(), Val, CostKind, Index, &I,
4417 nullptr, {}, VIC);
4418}
4419
4423 unsigned Index) const {
// Fixed-length vectors are handled by the early return below; the remainder
// of the function only applies to the other vector types.
// NOTE(review): the first line of that return statement is not visible in
// this listing - confirm which implementation it forwards to.
4424 if (isa<FixedVectorType>(Val))
4426 Index);
4427
4428 // This typically requires both while and lastb instructions in order
4429 // to extract the last element. If this is in a loop the while
4430 // instruction can at least be hoisted out, although it will consume a
4431 // predicate register. The cost should be more expensive than the base
4432 // extract cost, which is 2 for most CPUs.
// Hence: 2 for code size, otherwise one more than the subtarget's base
// insert/extract cost.
4433 return CostKind == TTI::TCK_CodeSize
4434 ? 2
4435 : ST->getVectorInsertExtractBaseCost() + 1;
4436}
4437
4439 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
4440 TTI::TargetCostKind CostKind, bool ForPoisonSrc, ArrayRef<Value *> VL,
4441 TTI::VectorInstrContext VIC) const {
// NOTE(review): a guard that originally preceded the check below is not
// visible in this listing.
// Vectors with floating-point elements defer to the generic implementation.
// ForPoisonSrc, VL and VIC are not used by any of the visible code.
4444 if (Ty->getElementType()->isFloatingPointTy())
4445 return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract,
4446 CostKind);
// Remaining element types: every demanded lane pays one base insert/extract
// cost (1 when optimising for code size) for each of the requested
// operations. Insert and Extract are bools, so (Insert + Extract) is 0, 1
// or 2.
4447 unsigned VecInstCost =
4448 CostKind == TTI::TCK_CodeSize ? 1 : ST->getVectorInsertExtractBaseCost();
4449 return DemandedElts.popcount() * (Insert + Extract) * VecInstCost;
4450}
4451
/// Cost of performing an operation on half/bfloat values by promoting it to
/// the equivalent float type.
///
/// Returns std::nullopt when no promotion cost applies: the scalar type is
/// neither half nor bfloat, it is half and the subtarget has full FP16, or
/// \p CanUseSVE is set and the subtarget has SVE B16B16 with non-streaming
/// SVE or SME2 available.
///
/// Otherwise the result is the fpext cost (doubled when neither operand is a
/// constant), plus \p InstCost evaluated on the promoted type, plus the
/// fptrunc cost back to \p Ty when \p IncludeTrunc is set.
4452std::optional<InstructionCost> AArch64TTIImpl::getFP16BF16PromoteCost(
4454 TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE,
4455 std::function<InstructionCost(Type *)> InstCost) const {
4456 if (!Ty->getScalarType()->isHalfTy() && !Ty->getScalarType()->isBFloatTy())
4457 return std::nullopt;
4458 if (Ty->getScalarType()->isHalfTy() && ST->hasFullFP16())
4459 return std::nullopt;
4460 // If we have +sve-b16b16 the operation can be promoted to SVE.
4461 if (CanUseSVE && ST->hasSVEB16B16() && ST->isNonStreamingSVEorSME2Available())
4462 return std::nullopt;
4463
// Same shape as Ty (scalar or vector), but with float elements.
4464 Type *PromotedTy = Ty->getWithNewType(Type::getFloatTy(Ty->getContext()));
4465 InstructionCost Cost = getCastInstrCost(Instruction::FPExt, PromotedTy, Ty,
// Presumably both operands need extending unless one of them is a constant.
4467 if (!Op1Info.isConstant() && !Op2Info.isConstant())
4468 Cost *= 2;
4469 Cost += InstCost(PromotedTy);
4470 if (IncludeTrunc)
4471 Cost += getCastInstrCost(Instruction::FPTrunc, Ty, PromotedTy,
4473 return Cost;
4474}
4475
4477 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
4479 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
// Overview of the visible flow:
//  1. early exits (<vscale x 1 x ty>, cost kinds handled by the base class);
//  2. half/bfloat promotion and fp128 handling for FP arithmetic;
//  3. widening multiply-style operations whose operands are both extends;
//  4. a per-ISD-opcode switch on the legalized type.
// LT.first is the legalization cost factor of Ty and LT.second the legalized
// machine value type; most simple cases return LT.first directly.
4480
4481 // The code-generator is currently not able to handle scalable vectors
4482 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
4483 // it. This change will be removed when code-generation for these types is
4484 // sufficiently reliable.
4485 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
4486 if (VTy->getElementCount() == ElementCount::getScalable(1))
4488
4489 // TODO: Handle more cost kinds.
4491 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4492 Op2Info, Args, CxtI);
4493
4494 // Legalize the type.
4495 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
4496 int ISD = TLI->InstructionOpcodeToISD(Opcode);
4497
4498 // Increase the cost for half and bfloat types if not architecturally
4499 // supported.
4500 if (ISD == ISD::FADD || ISD == ISD::FSUB || ISD == ISD::FMUL ||
4501 ISD == ISD::FDIV || ISD == ISD::FREM) {
4502 if (auto PromotedCost = getFP16BF16PromoteCost(
4503 Ty, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/true,
4504 // There is no native support for fdiv/frem even with +sve-b16b16.
4505 /*CanUseSVE=*/ISD != ISD::FDIV && ISD != ISD::FREM,
4506 [&](Type *PromotedTy) {
4507 return getArithmeticInstrCost(Opcode, PromotedTy, CostKind,
4508 Op1Info, Op2Info);
4509 }))
4510 return *PromotedCost;
4511
4512 // fp128 all go via libcalls
4513 if (Ty->getScalarType()->isFP128Ty())
4514 return (CostKind == TTI::TCK_CodeSize ? 1 : 10) * LT.first;
4515 }
4516
4517 // If the operation is a widening instruction (smull or umull) and both
4518 // operands are extends the cost can be cheaper by considering that the
4519 // operation will operate on the narrowest type size possible (double the
4520 // largest input size) and a further extend.
4521 if (Type *ExtTy = isBinExtWideningInstruction(Opcode, Ty, Args)) {
// When the narrower type differs from Ty, cost the operation at ExtTy plus
// one extend up to Ty; otherwise a single instruction per legalized part.
4522 if (ExtTy != Ty)
4523 return getArithmeticInstrCost(Opcode, ExtTy, CostKind) +
4524 getCastInstrCost(Instruction::ZExt, Ty, ExtTy,
4526 return LT.first;
4527 }
4528
4529 switch (ISD) {
4530 default:
4531 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4532 Op2Info);
4533 case ISD::ADD:
4534 case ISD::SUB:
4535 return LT.first; // Also works for i128
4536 case ISD::MUL:
4537 if (LT.second == MVT::v2i64) {
4538 // When SVE is available, then we can lower the v2i64 operation using
4539 // the SVE mul instruction, which has a lower cost.
4540 if (ST->hasSVE())
4541 return LT.first;
4542
4543 // When SVE is not available, there is no MUL.2d instruction,
4544 // which means mul <2 x i64> is expensive as elements are extracted
4545 // from the vectors and the muls scalarized.
4546 // As getScalarizationOverhead is a bit too pessimistic, we
4547 // estimate the cost for a i64 vector directly here, which is:
4548 // - four 2-cost i64 extracts,
4549 // - two 2-cost i64 inserts, and
4550 // - two 1-cost muls.
4551 // So, for a v2i64 with LT.first = 1 the cost is 14, and for a v4i64 with
4552 // LT.first = 2 the cost is 28.
// Per element: one scalar mul + two extracts + one insert.
4553 return cast<VectorType>(Ty)->getElementCount().getKnownMinValue() *
4554 (getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind) +
4555 getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind, -1,
4556 nullptr, nullptr) *
4557 2 +
4558 getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4559 nullptr, nullptr));
4560 }
4561 return LT.first;
4562 case ISD::SREM:
4563 case ISD::SDIV:
4564 /*
4565 Notes for sdiv/srem specific costs:
4566 1. This only considers the cases where the divisor is constant, uniform and
4567 (pow-of-2/non-pow-of-2). Other cases are not important since they either
4568 result in some form of (ldr + adrp), corresponding to constant vectors, or
4569 scalarization of the division operation.
4570 2. Constant divisors, either negative in whole or partially, don't result in
4571 significantly different codegen as compared to positive constant divisors.
4572 So, we don't consider negative divisors separately.
4573 3. If the codegen is significantly different with SVE, it has been indicated
4574 using comments at appropriate places.
4575
4576 sdiv specific cases:
4577 -----------------------------------------------------------------------
4578 codegen | pow-of-2 | Type
4579 -----------------------------------------------------------------------
4580 add + cmp + csel + asr | Y | i64
4581 add + cmp + csel + asr | Y | i32
4582 -----------------------------------------------------------------------
4583
4584 srem specific cases:
4585 -----------------------------------------------------------------------
4586 codegen | pow-of-2 | Type
4587 -----------------------------------------------------------------------
4588 negs + and + and + csneg | Y | i64
4589 negs + and + and + csneg | Y | i32
4590 -----------------------------------------------------------------------
4591
4592 other sdiv/srem cases:
4593 -------------------------------------------------------------------------
4594 common codegen | + srem | + sdiv | pow-of-2 | Type
4595 -------------------------------------------------------------------------
4596 smulh + asr + add + add | - | - | N | i64
4597 smull + lsr + add + add | - | - | N | i32
4598 usra | and + sub | sshr | Y | <2 x i64>
4599 2 * (scalar code) | - | - | N | <2 x i64>
4600 usra | bic + sub | sshr + neg | Y | <4 x i32>
4601 smull2 + smull + uzp2 | mls | - | N | <4 x i32>
4602 + sshr + usra | | | |
4603 -------------------------------------------------------------------------
4604 */
4605 if (Op2Info.isConstant() && Op2Info.isUniform()) {
// Building-block costs, queried with operand properties stripped so that the
// nested queries do not themselves take constant-operand shortcuts.
4606 InstructionCost AddCost =
4607 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4608 Op1Info.getNoProps(), Op2Info.getNoProps());
4609 InstructionCost AsrCost =
4610 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4611 Op1Info.getNoProps(), Op2Info.getNoProps());
4612 InstructionCost MulCost =
4613 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4614 Op1Info.getNoProps(), Op2Info.getNoProps());
4615 // add/cmp/csel/csneg should have similar cost while asr/negs/and should
4616 // have similar cost.
4617 auto VT = TLI->getValueType(DL, Ty);
4618 if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) {
4619 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4620 // Neg can be folded into the asr instruction.
4621 return ISD == ISD::SDIV ? (3 * AddCost + AsrCost)
4622 : (3 * AsrCost + AddCost);
4623 } else {
4624 return MulCost + AsrCost + 2 * AddCost;
4625 }
4626 } else if (VT.isVector()) {
// usra is modelled as two shifts.
4627 InstructionCost UsraCost = 2 * AsrCost;
4628 if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) {
4629 // Division with scalable types corresponds to native 'asrd'
4630 // instruction when SVE is available.
4631 // e.g. %1 = sdiv <vscale x 4 x i32> %a, splat (i32 8)
4632
4633 // One more for the negation in SDIV
4635 (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0;
4636 if (Ty->isScalableTy() && ST->hasSVE())
4637 Cost += 2 * AsrCost;
4638 else {
4639 Cost +=
4640 UsraCost +
4641 (ISD == ISD::SDIV
4642 ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost
4643 : 2 * AddCost);
4644 }
4645 return Cost;
4646 } else if (LT.second == MVT::v2i64) {
// Non-power-of-2 divisor on v2i64: cost as one scalar op per element.
4647 return VT.getVectorNumElements() *
4648 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind,
4649 Op1Info.getNoProps(),
4650 Op2Info.getNoProps());
4651 } else {
4652 // When SVE is available, we get:
4653 // smulh + lsr + add/sub + asr + add/sub.
4654 if (Ty->isScalableTy() && ST->hasSVE())
4655 return MulCost /*smulh cost*/ + 2 * AddCost + 2 * AsrCost;
4656 return 2 * MulCost + AddCost /*uzp2 cost*/ + AsrCost + UsraCost;
4657 }
4658 }
4659 }
4660 if (Op2Info.isConstant() && !Op2Info.isUniform() &&
4661 LT.second.isFixedLengthVector()) {
4662 // FIXME: When the constant vector is non-uniform, this may result in
4663 // loading the vector from constant pool or in some cases, may also result
4664 // in scalarization. For now, we are approximating this with the
4665 // scalarization cost.
4666 auto ExtractCost = 2 * getVectorInstrCost(Instruction::ExtractElement, Ty,
4667 CostKind, -1, nullptr, nullptr);
4668 auto InsertCost = getVectorInstrCost(Instruction::InsertElement, Ty,
4669 CostKind, -1, nullptr, nullptr);
4670 unsigned NElts = cast<FixedVectorType>(Ty)->getNumElements();
4671 return ExtractCost + InsertCost +
4672 NElts * getArithmeticInstrCost(Opcode, Ty->getScalarType(),
4673 CostKind, Op1Info.getNoProps(),
4674 Op2Info.getNoProps());
4675 }
// Signed divisions/remainders not costed above share the handling below; the
// unsigned-only shortcuts there are all guarded by explicit ISD checks.
4676 [[fallthrough]];
4677 case ISD::UDIV:
4678 case ISD::UREM: {
4679 auto VT = TLI->getValueType(DL, Ty);
4680 if (Op2Info.isConstant()) {
4681 // If the operand is a power of 2 we can use the shift or and cost.
4682 if (ISD == ISD::UDIV && Op2Info.isPowerOf2())
4683 return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
4684 Op1Info.getNoProps(),
4685 Op2Info.getNoProps());
4686 if (ISD == ISD::UREM && Op2Info.isPowerOf2())
4687 return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
4688 Op1Info.getNoProps(),
4689 Op2Info.getNoProps());
4690
4691 if (ISD == ISD::UDIV || ISD == ISD::UREM) {
4692 // Divides by a constant are expanded to MULHU + SUB + SRL + ADD + SRL.
4693 // The MULHU will be expanded to UMULL for the types not listed below,
4694 // and will become a pair of UMULL+MULL2 for 128bit vectors.
4695 bool HasMULH = VT == MVT::i64 || LT.second == MVT::nxv2i64 ||
4696 LT.second == MVT::nxv4i32 || LT.second == MVT::nxv8i16 ||
4697 LT.second == MVT::nxv16i8;
4698 bool Is128bit = LT.second.is128BitVector();
4699
4700 InstructionCost MulCost =
4701 getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
4702 Op1Info.getNoProps(), Op2Info.getNoProps());
4703 InstructionCost AddCost =
4704 getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
4705 Op1Info.getNoProps(), Op2Info.getNoProps());
4706 InstructionCost ShrCost =
4707 getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
4708 Op1Info.getNoProps(), Op2Info.getNoProps());
4709 InstructionCost DivCost = MulCost * (Is128bit ? 2 : 1) + // UMULL/UMULH
4710 (HasMULH ? 0 : ShrCost) + // UMULL shift
4711 AddCost * 2 + ShrCost;
// A remainder additionally pays one multiply and one add/sub on top of the
// division sequence.
4712 return DivCost + (ISD == ISD::UREM ? MulCost + AddCost : 0);
4713 }
4714 }
4715
4716 // div i128's are lowered as libcalls. Pass nullptr as (u)divti3 calls are
4717 // emitted by the backend even when those functions are not declared in the
4718 // module.
4719 if (!VT.isVector() && VT.getSizeInBits() > 64)
4720 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4721
// NOTE(review): the declaration that starts the statement below (it defines
// the Cost variable used from here on) is not visible in this listing.
4723 Opcode, Ty, CostKind, Op1Info, Op2Info);
4724 if (Ty->isVectorTy() && (ISD == ISD::SDIV || ISD == ISD::UDIV)) {
4725 if (TLI->isOperationLegalOrCustom(ISD, LT.second) && ST->hasSVE()) {
4726 // When SDIV/UDIV operations are lowered using SVE, the cost is
4727 // lower.
4728 if (VT.isSimple() && isa<FixedVectorType>(Ty) &&
4729 Ty->getPrimitiveSizeInBits().getFixedValue() < 128) {
// Fixed costs for sub-128-bit fixed-length vectors.
4730 static const CostTblEntry DivTbl[]{
4731 {ISD::SDIV, MVT::v2i8, 5}, {ISD::SDIV, MVT::v4i8, 8},
4732 {ISD::SDIV, MVT::v8i8, 8}, {ISD::SDIV, MVT::v2i16, 5},
4733 {ISD::SDIV, MVT::v4i16, 5}, {ISD::SDIV, MVT::v2i32, 1},
4734 {ISD::UDIV, MVT::v2i8, 5}, {ISD::UDIV, MVT::v4i8, 8},
4735 {ISD::UDIV, MVT::v8i8, 8}, {ISD::UDIV, MVT::v2i16, 5},
4736 {ISD::UDIV, MVT::v4i16, 5}, {ISD::UDIV, MVT::v2i32, 1}};
4737
4738 const auto *Entry = CostTableLookup(DivTbl, ISD, VT.getSimpleVT());
4739 if (nullptr != Entry)
4740 return Entry->Cost;
4741 }
4742 // For 8/16-bit elements, the cost is higher because the type
4743 // requires promotion and possibly splitting:
4744 if (LT.second.getScalarType() == MVT::i8)
4745 Cost *= 8;
4746 else if (LT.second.getScalarType() == MVT::i16)
4747 Cost *= 4;
4748 return Cost;
4749 } else {
4750 // If one of the operands is a uniform constant then the cost for each
4751 // element is Cost for insertion, extraction and division.
4752 // Insertion cost = 2, Extraction Cost = 2, Division = cost for the
4753 // operation with scalar type
4754 if ((Op1Info.isConstant() && Op1Info.isUniform()) ||
4755 (Op2Info.isConstant() && Op2Info.isUniform())) {
4756 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
// NOTE(review): the declaration of DivCost begins on a line that is not
// visible in this listing.
4758 Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info);
4759 return (4 + DivCost) * VTy->getNumElements();
4760 }
4761 }
4762 // On AArch64, without SVE, vector divisions are expanded
4763 // into scalar divisions of each pair of elements.
4764 Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, CostKind,
4765 -1, nullptr, nullptr);
4766 Cost += getVectorInstrCost(Instruction::InsertElement, Ty, CostKind, -1,
4767 nullptr, nullptr);
4768 }
4769
4770 // TODO: if one of the arguments is scalar, then it's not necessary to
4771 // double the cost of handling the vector elements.
4772 Cost += Cost;
4773 }
4774 return Cost;
4775 }
4776 case ISD::XOR:
4777 case ISD::OR:
4778 case ISD::AND:
4779 case ISD::SRL:
4780 case ISD::SRA:
4781 case ISD::SHL:
4782 // These nodes are marked as 'custom' for combining purposes only.
4783 // We know that they are legal. See LowerAdd in ISelLowering.
4784 return LT.first;
4785
4786 case ISD::FNEG:
4787 // Scalar fmul(fneg) or fneg(fmul) can be converted to fnmul
// Requires the context instruction: either its single user is an fmul, or
// its own operand is an fmul.
4788 if ((Ty->isFloatTy() || Ty->isDoubleTy() ||
4789 (Ty->isHalfTy() && ST->hasFullFP16())) &&
4790 CxtI &&
4791 ((CxtI->hasOneUse() &&
4792 match(*CxtI->user_begin(), m_FMul(m_Value(), m_Value()))) ||
4793 match(CxtI->getOperand(0), m_FMul(m_Value(), m_Value()))))
4794 return 0;
4795 [[fallthrough]];
4796 case ISD::FADD:
4797 case ISD::FSUB:
4798 if (!Ty->getScalarType()->isFP128Ty())
4799 return LT.first;
4800 [[fallthrough]];
4801 case ISD::FMUL:
4802 case ISD::FDIV:
4803 // These nodes are marked as 'custom' just to lower them to SVE.
4804 // We know said lowering will incur no additional cost.
4805 if (!Ty->getScalarType()->isFP128Ty())
4806 return 2 * LT.first;
4807
// fp128 (including fneg/fadd/fsub that fell through) uses the generic cost.
4808 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4809 Op2Info);
4810 case ISD::FREM:
4811 // Pass nullptr as fmod/fmodf calls are emitted by the backend even when
4812 // those functions are not declared in the module.
4813 if (!Ty->isVectorTy())
4814 return getCallInstrCost(/*Function*/ nullptr, Ty, {Ty, Ty}, CostKind);
4815 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
4816 Op2Info);
4817 }
4818}
4819
4822 const SCEV *Ptr,
// Returns the configured non-constant-stride overhead for vector pointer
// types whose access is not a constant stride of at most MaxMergeDistance
// (only checked when a ScalarEvolution instance is available), and 1 in every
// other case.
4824 // Address computations in vectorized code with non-consecutive addresses will
4825 // likely result in more instructions compared to scalar code where the
4826 // computation can more often be merged into the index mode. The resulting
4827 // extra micro-ops can significantly decrease throughput.
4828 unsigned NumVectorInstToHideOverhead = NeonNonConstStrideOverhead;
4829 int MaxMergeDistance = 64;
4830
4831 if (PtrTy->isVectorTy() && SE &&
4832 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
4833 return NumVectorInstToHideOverhead;
4834
4835 // In many cases the address computation is not merged into the instruction
4836 // addressing mode.
4837 return 1;
4838}
4839
4840/// Check whether Opcode1 has less throughput according to the scheduling
4841/// model than Opcode2.
/// Returns false whenever the answer cannot be determined: the subtarget has
/// no per-instruction scheduling model, or either scheduling class is
/// invalid.
4843 unsigned Opcode1, unsigned Opcode2) const {
4844 const MCSchedModel &Sched = ST->getSchedModel();
4845 const TargetInstrInfo *TII = ST->getInstrInfo();
4846 if (!Sched.hasInstrSchedModel())
4847 return false;
4848
// Look up the scheduling class of each opcode from its instruction
// description.
4849 const MCSchedClassDesc *SCD1 =
4850 Sched.getSchedClassDesc(TII->get(Opcode1).getSchedClass());
4851 const MCSchedClassDesc *SCD2 =
4852 Sched.getSchedClassDesc(TII->get(Opcode2).getSchedClass());
4853 // We cannot handle variant scheduling classes without an MI. If we need to
4854 // support them for any of the instructions we query the information of we
4855 // might need to add a way to resolve them without a MI or not use the
4856 // scheduling info.
4857 assert(!SCD1->isVariant() && !SCD2->isVariant() &&
4858 "Cannot handle variant scheduling classes without an MI");
4859 if (!SCD1->isValid() || !SCD2->isValid())
4860 return false;
4861
// A larger reciprocal throughput means lower throughput.
// NOTE(review): the right-hand side of this comparison is not visible in this
// listing; presumably it is the reciprocal throughput of SCD2 - confirm.
4862 return MCSchedModel::getReciprocalThroughput(*ST, *SCD1) >
4864}
4865
4867 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
4869 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
// Three target-specific cases are handled before deferring to the base
// implementation: fixed-length vector selects, floating-point compares, and
// integer compares of an `and` that can be folded into a flag-setting and.
4870 // We don't lower some vector selects well that are wider than the register
4871 // width. TODO: Improve this with different cost kinds.
4872 if (isa<FixedVectorType>(ValTy) && Opcode == Instruction::Select) {
4873 // We would need this many instructions to hide the scalarization happening.
4874 const int AmortizationCost = 20;
4875
4876 // If VecPred is not set, check if we can get a predicate from the context
4877 // instruction, if its type matches the requested ValTy.
4878 if (VecPred == CmpInst::BAD_ICMP_PREDICATE && I && I->getType() == ValTy) {
4879 CmpPredicate CurrentPred;
4880 if (match(I, m_Select(m_Cmp(CurrentPred, m_Value(), m_Value()), m_Value(),
4881 m_Value())))
4882 VecPred = CurrentPred;
4883 }
4884 // Check if we have a compare/select chain that can be lowered using
4885 // a (F)CMxx & BFI pair.
4886 if (CmpInst::isIntPredicate(VecPred) || VecPred == CmpInst::FCMP_OLE ||
4887 VecPred == CmpInst::FCMP_OLT || VecPred == CmpInst::FCMP_OGT ||
4888 VecPred == CmpInst::FCMP_OGE || VecPred == CmpInst::FCMP_OEQ ||
4889 VecPred == CmpInst::FCMP_UNE) {
4890 static const auto ValidMinMaxTys = {
4891 MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16, MVT::v2i32,
4892 MVT::v4i32, MVT::v2i64, MVT::v2f32, MVT::v4f32, MVT::v2f64};
4893 static const auto ValidFP16MinMaxTys = {MVT::v4f16, MVT::v8f16};
4894
// One instruction per legalized part when the legalized type is in one of
// the lists above (the f16 list only counts with full FP16).
4895 auto LT = getTypeLegalizationCost(ValTy);
4896 if (any_of(ValidMinMaxTys, equal_to(LT.second)) ||
4897 (ST->hasFullFP16() &&
4898 any_of(ValidFP16MinMaxTys, equal_to(LT.second))))
4899 return LT.first;
4900 }
4901
// Fixed costs keyed on (condition type, value type).
4902 static const TypeConversionCostTblEntry VectorSelectTbl[] = {
4903 {Instruction::Select, MVT::v2i1, MVT::v2f32, 2},
4904 {Instruction::Select, MVT::v2i1, MVT::v2f64, 2},
4905 {Instruction::Select, MVT::v4i1, MVT::v4f32, 2},
4906 {Instruction::Select, MVT::v4i1, MVT::v4f16, 2},
4907 {Instruction::Select, MVT::v8i1, MVT::v8f16, 2},
4908 {Instruction::Select, MVT::v16i1, MVT::v16i16, 16},
4909 {Instruction::Select, MVT::v8i1, MVT::v8i32, 8},
4910 {Instruction::Select, MVT::v16i1, MVT::v16i32, 16},
4911 {Instruction::Select, MVT::v4i1, MVT::v4i64, 4 * AmortizationCost},
4912 {Instruction::Select, MVT::v8i1, MVT::v8i64, 8 * AmortizationCost},
4913 {Instruction::Select, MVT::v16i1, MVT::v16i64, 16 * AmortizationCost}};
4914
4915 EVT SelCondTy = TLI->getValueType(DL, CondTy);
4916 EVT SelValTy = TLI->getValueType(DL, ValTy);
4917 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
4918 if (const auto *Entry = ConvertCostTableLookup(VectorSelectTbl, Opcode,
4919 SelCondTy.getSimpleVT(),
4920 SelValTy.getSimpleVT()))
4921 return Entry->Cost;
4922 }
4923 }
4924
4925 if (Opcode == Instruction::FCmp) {
// Half/bfloat compares that need promotion: no truncation of the result
// back to the narrow FP type is included (IncludeTrunc is false).
// NOTE(review): several lines of the lambda below are not visible in this
// listing.
4926 if (auto PromotedCost = getFP16BF16PromoteCost(
4927 ValTy, CostKind, Op1Info, Op2Info, /*IncludeTrunc=*/false,
4928 // TODO: Consider costing SVE FCMPs.
4929 /*CanUseSVE=*/false, [&](Type *PromotedTy) {
4931 getCmpSelInstrCost(Opcode, PromotedTy, CondTy, VecPred,
4932 CostKind, Op1Info, Op2Info);
4933 if (isa<VectorType>(PromotedTy))
4935 Instruction::Trunc,
4939 return Cost;
4940 }))
4941 return *PromotedCost;
4942
4943 auto LT = getTypeLegalizationCost(ValTy);
4944 // Model unknown fp compares as a libcall.
4945 if (LT.second.getScalarType() != MVT::f64 &&
4946 LT.second.getScalarType() != MVT::f32 &&
4947 LT.second.getScalarType() != MVT::f16)
4948 return LT.first * getCallInstrCost(/*Function*/ nullptr, ValTy,
4949 {ValTy, ValTy}, CostKind);
4950
4951 // Some comparison operators require expanding to multiple compares + or.
4952 unsigned Factor = 1;
4953 if (!CondTy->isVectorTy() &&
4954 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4955 Factor = 2; // fcmp with 2 selects
4956 else if (isa<FixedVectorType>(ValTy) &&
4957 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ ||
4958 VecPred == FCmpInst::FCMP_ORD || VecPred == FCmpInst::FCMP_UNO))
4959 Factor = 3; // fcmxx+fcmyy+or
4960 else if (isa<ScalableVectorType>(ValTy) &&
4961 (VecPred == FCmpInst::FCMP_ONE || VecPred == FCmpInst::FCMP_UEQ))
4962 Factor = 3; // fcmxx+fcmyy+or
4963
// Double the factor for scalable vectors when the scheduling model says the
// SVE predicated compare has lower throughput than the NEON compare.
// NOTE(review): one line of this condition is not visible in this listing.
4964 if (isa<ScalableVectorType>(ValTy) &&
4966 hasKnownLowerThroughputFromSchedulingModel(AArch64::FCMEQ_PPzZZ_S,
4967 AArch64::FCMEQv4f32))
4968 Factor *= 2;
4969
// Latency uses a fixed per-compare cost of 2; other cost kinds scale with
// the number of legalized parts.
4970 return Factor * (CostKind == TTI::TCK_Latency ? 2 : LT.first);
4971 }
4972
4973 // Treat the icmp in icmp(and, 0) or icmp(and, -1/1) when it can be folded to
4974 // icmp(and, 0) as free, as we can make use of ands, but only if the
4975 // comparison is not unsigned. FIXME: Enable for non-throughput cost kinds
4976 // providing it will not cause performance regressions.
4977 if (CostKind == TTI::TCK_RecipThroughput && ValTy->isIntegerTy() &&
4978 Opcode == Instruction::ICmp && I && !CmpInst::isUnsigned(VecPred) &&
4979 TLI->isTypeLegal(TLI->getValueType(DL, ValTy)) &&
4980 match(I->getOperand(0), m_And(m_Value(), m_Value()))) {
4981 if (match(I->getOperand(1), m_Zero()))
4982 return 0;
4983
4984 // x >= 1 / x < 1 -> x > 0 / x <= 0
4985 if (match(I->getOperand(1), m_One()) &&
4986 (VecPred == CmpInst::ICMP_SLT || VecPred == CmpInst::ICMP_SGE))
4987 return 0;
4988
4989 // x <= -1 / x > -1 -> x > 0 / x <= 0
4990 if (match(I->getOperand(1), m_AllOnes()) &&
4991 (VecPred == CmpInst::ICMP_SLE || VecPred == CmpInst::ICMP_SGT))
4992 return 0;
4993 }
4994
4995 // The base case handles scalable vectors fine for now, since it treats the
4996 // cost as 1 * legalization cost.
4997 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
4998 Op1Info, Op2Info, I);
4999}
5000
5002AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
// Fills in and returns the memcmp expansion options for this subtarget.
// IsZeroCmp is not used by the visible code.
// NOTE(review): the declaration of Options is not visible in this listing.
5004 if (ST->requiresStrictAlign()) {
5005 // TODO: Add cost modeling for strict align. Misaligned loads expand to
5006 // a bunch of instructions when strict align is enabled.
// Options is returned without any of the settings applied below.
5007 return Options;
5008 }
5009 Options.AllowOverlappingLoads = true;
5010 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
// No separate per-block limit: a block may use the full load budget.
5011 Options.NumLoadsPerBlock = Options.MaxNumLoads;
5012 // TODO: Though vector loads usually perform well on AArch64, in some targets
5013 // they may wake up the FP unit, which raises the power consumption. Perhaps
5014 // they could be used with no holds barred (-O3).
5015 Options.LoadSizes = {8, 4, 2, 1};
5016 Options.AllowedTailExpansions = {3, 5, 6};
5017 return Options;
5018}
5019
// The result is simply whether the subtarget has SVE.
5021 return ST->hasSVE();
5022}
5023
// Dispatch on the intrinsic ID: masked gather/scatter go to the
// gather/scatter cost routine, masked load/expandload/store go to the masked
// memory-op cost routine.
5027 switch (MICA.getID()) {
5028 case Intrinsic::masked_scatter:
5029 case Intrinsic::masked_gather:
5030 return getGatherScatterOpCost(MICA, CostKind);
5031 case Intrinsic::masked_load:
5032 case Intrinsic::masked_expandload:
5033 case Intrinsic::masked_store:
5034 return getMaskedMemoryOpCost(MICA, CostKind);
5035 }
// NOTE(review): the statement that handles every other intrinsic ID is not
// visible in this listing.
5037}
5038
// Cost of the masked memory intrinsic described by MICA, based on the
// legalization of its data type.
// NOTE(review): the statements controlled by the guards below (for NEON
// vectors, invalid legalization, i1 elements, <vscale x 1 x ty> and illegal
// expand-loads) are not visible in this listing.
5042 Type *Src = MICA.getDataType();
5043
5044 if (useNeonVector(Src))
5046 auto LT = getTypeLegalizationCost(Src);
5047 if (!LT.first.isValid())
5049
5050 // Return an invalid cost for element types that we are unable to lower.
5051 auto *VT = cast<VectorType>(Src);
5052 if (VT->getElementType()->isIntegerTy(1))
5054
5055 // The code-generator is currently not able to handle scalable vectors
5056 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5057 // it. This change will be removed when code-generation for these types is
5058 // sufficiently reliable.
5059 if (VT->getElementCount() == ElementCount::getScalable(1))
5061
// Baseline: one memory operation per legalized part.
5062 InstructionCost MemOpCost = LT.first;
5063 if (MICA.getID() == Intrinsic::masked_expandload) {
5064 if (!isLegalMaskedExpandLoad(Src, MICA.getAlignment()))
5066
5067 // Operation will be split into expand of masked.load
5068 MemOpCost *= 2;
5069 }
5070
5071 // If we need to split the memory operation, we will also need to split the
5072 // mask. This will likely lead to overestimating the cost in some cases if
5073 // multiple memory operations use the same mask, but we often don't have
5074 // enough context to figure that out here.
5075 //
5076 // If the elements being loaded are bytes then the mask will already be split,
5077 // since the number of bits in a P register matches the number of bytes in a
5078 // Z register.
5079 if (LT.first > 1 && LT.second.getScalarSizeInBits() > 8)
5080 return MemOpCost * 2;
5081
5082 return MemOpCost;
5083}
5084
5085// This function returns gather/scatter overhead either from
5086// user-provided value or specialized values per-target from \p ST.
5087static unsigned getSVEGatherScatterOverhead(unsigned Opcode,
5088 const AArch64Subtarget *ST) {
5089 assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
5090 "Should be called on only load or stores.");
5091 switch (Opcode) {
5092 case Instruction::Load:
5093 if (SVEGatherOverhead.getNumOccurrences() > 0)
5094 return SVEGatherOverhead;
5095 return ST->getGatherOverhead();
5096 break;
5097 case Instruction::Store:
5098 if (SVEScatterOverhead.getNumOccurrences() > 0)
5099 return SVEScatterOverhead;
5100 return ST->getScatterOverhead();
5101 break;
5102 default:
5103 llvm_unreachable("Shouldn't have reached here");
5104 }
5105}
5106
5110
// Gathers (masked or VP) are costed as loads; every other intrinsic reaching
// this point is costed as a store.
5111 unsigned Opcode = (MICA.getID() == Intrinsic::masked_gather ||
5112 MICA.getID() == Intrinsic::vp_gather)
5113 ? Instruction::Load
5114 : Instruction::Store;
5115
5116 Type *DataTy = MICA.getDataType();
5117 Align Alignment = MICA.getAlignment();
5118 const Instruction *I = MICA.getInst();
5119
// NOTE(review): the statements controlled by the guards below are not visible
// in this listing.
5120 if (useNeonVector(DataTy) || !isLegalMaskedGatherScatter(DataTy))
5122 auto *VT = cast<VectorType>(DataTy);
5123 auto LT = getTypeLegalizationCost(DataTy);
5124 if (!LT.first.isValid())
5126
5127 // Return an invalid cost for element types that we are unable to lower.
5128 if (!LT.second.isVector() ||
5129 !isElementTypeLegalForScalableVector(VT->getElementType()) ||
5130 VT->getElementType()->isIntegerTy(1))
5132
5133 // The code-generator is currently not able to handle scalable vectors
5134 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5135 // it. This change will be removed when code-generation for these types is
5136 // sufficiently reliable.
5137 if (VT->getElementCount() == ElementCount::getScalable(1))
5139
5140 ElementCount LegalVF = LT.second.getVectorElementCount();
// Start from the cost of a single-element load/store of the element type.
5141 InstructionCost MemOpCost =
5142 getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind,
5143 {TTI::OK_AnyValue, TTI::OP_None}, I);
5144 // Add on an overhead cost for using gathers/scatters.
5145 MemOpCost *= getSVEGatherScatterOverhead(Opcode, ST);
// Total: legalized parts x per-element cost x maximum number of elements in
// one legal vector.
5146 return LT.first * MemOpCost * getMaxNumElements(LegalVF);
5147}
5148
// True when Ty is a fixed-length vector type and the subtarget is not using
// SVE for fixed-length vectors.
// NOTE(review): the signature line is missing from this listing.
5150 return isa<FixedVectorType>(Ty) && !ST->useSVEForFixedLengthVectors();
5151}
5152
5154 Align Alignment,
5155 unsigned AddressSpace,
5157 TTI::OperandValueInfo OpInfo,
5158 const Instruction *I) const {
// Cost of a load or store (Opcode) of type Ty. Struct types are deferred to
// the base implementation; TCK_Latency gets a scheduling-model based estimate;
// the remaining paths handle slow misaligned 128-bit stores, pointer types and
// NEON vectors, and otherwise return the legalization count.
// NOTE(review): several lines of this function (parts of the signature and
// some guarded statements) are missing from this listing.
5159 EVT VT = TLI->getValueType(DL, Ty, true);
5160 // Type legalization can't handle structs
5161 if (VT == MVT::Other)
5162 return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
5163 CostKind);
5164
5165 auto LT = getTypeLegalizationCost(Ty);
5166 if (!LT.first.isValid())
5168
5169 // The code-generator is currently not able to handle scalable vectors
5170 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5171 // it. This change will be removed when code-generation for these types is
5172 // sufficiently reliable.
5173 // We also only support full register predicate loads and stores.
5174 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5175 if (VTy->getElementCount() == ElementCount::getScalable(1) ||
5176 (VTy->getElementType()->isIntegerTy(1) &&
5177 !VTy->getElementCount().isKnownMultipleOf(
5180
5181 // TODO: consider latency as well for TCK_SizeAndLatency.
5183 return LT.first;
5184
5185 if (CostKind == TTI::TCK_Latency) {
5186 // Latency doesn't make much sense for stores, so just return 1
5187 if (Opcode == Instruction::Store)
5188 return 1;
5189 // If the subtarget has overridden the load latency then use that instead of
5190 // querying the SchedModel.
5191 if (ST->getFixedLoadLatency())
5192 return (LT.first - 1) + ST->getFixedLoadLatency();
5193 // We expect the load to become LT.first loads of type LT.second. The
5194 // latency will be the latency of the last load plus the time it takes to
5195 // get there, which will be the amount of other loads before that (i.e. total
5196 // loads - 1) multiplied by how long it takes to get through them (the
5197 // reciprocal of the throughput). We get the latency and reciprocal
5198 // throughput from the SchedModel, and assume that the loads become the
5199 // variant with unsigned integer offset.
5200 unsigned Inst = 0;
// Pick a representative load opcode: LDR_ZXI for scalable/SVE-lowered types,
// an FP/SIMD register load for vector or floating-point types, and a
// general-purpose register load otherwise, selected by the legal type's size.
5201 if (LT.second.isScalableVector() ||
5202 ST->useSVEForFixedLengthVectors(LT.second)) {
5203 Inst = AArch64::LDR_ZXI;
5204 } else if (LT.second.isVector() || LT.second.isFloatingPoint()) {
5205 switch (LT.second.getSizeInBits()) {
5206 case 8:
5207 Inst = AArch64::LDRBui;
5208 break;
5209 case 16:
5210 Inst = AArch64::LDRHui;
5211 break;
5212 case 32:
5213 Inst = AArch64::LDRSui;
5214 break;
5215 case 64:
5216 Inst = AArch64::LDRDui;
5217 break;
5218 case 128:
5219 Inst = AArch64::LDRQui;
5220 break;
5221 default:
5222 llvm_unreachable("Unexpected float or vector type");
5223 }
5224 } else {
5225 switch (LT.second.getSizeInBits()) {
5226 case 8:
5227 Inst = AArch64::LDRBBui;
5228 break;
5229 case 16:
5230 Inst = AArch64::LDRHHui;
5231 break;
5232 case 32:
5233 Inst = AArch64::LDRWui;
5234 break;
5235 case 64:
5236 Inst = AArch64::LDRXui;
5237 break;
5238 default:
5239 llvm_unreachable("Unexpected integer type");
5240 }
5241 }
5242 const MCSchedModel &Sched = ST->getSchedModel();
5243 const TargetInstrInfo *TII = ST->getInstrInfo();
5244 unsigned SchedClass = TII->get(Inst).getSchedClass();
5245 const MCSchedClassDesc *SCD = Sched.getSchedClassDesc(SchedClass);
5246 // We need to convert the number of loads before the last to a float here,
5247 // as the reciprocal throughput may be fractional.
5248 float NumLoads = (LT.first - 1).getValue();
5249 return NumLoads * Sched.getReciprocalThroughput(*ST, *SCD) +
5250 Sched.computeInstrLatency(*ST, *SCD);
5251 }
5252
5253 if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
5254 LT.second.is128BitVector() && Alignment < Align(16)) {
5255 // Unaligned stores are extremely inefficient. We don't split all
5256 // unaligned 128-bit stores because of the negative impact that has shown in
5257 // practice on inlined block copy code.
5258 // We make such stores expensive so that we will only vectorize if there
5259 // are 6 other instructions getting vectorized.
5260 const int AmortizationCost = 6;
5261
5262 return LT.first * 2 * AmortizationCost;
5263 }
5264
5265 // Opaque ptr or ptr vector types are i64s and can be lowered to STP/LDPs.
5266 if (Ty->isPtrOrPtrVectorTy())
5267 return LT.first;
5268
5269 if (useNeonVector(Ty)) {
5270 // Check truncating stores and extending loads.
5271 if (Ty->getScalarSizeInBits() != LT.second.getScalarSizeInBits()) {
5272 // v4i8 types are lowered to a scalar load/store and sshll/xtn.
5273 if (VT == MVT::v4i8)
5274 return 2;
5275 // Otherwise we need to scalarize.
5276 return cast<FixedVectorType>(Ty)->getNumElements() * 2;
5277 }
5278 EVT EltVT = VT.getVectorElementType();
5279 unsigned EltSize = EltVT.getScalarSizeInBits();
// The decomposition below only applies to vectors with a power-of-2 element
// size in [8, 64] bits, fewer than 128 / EltSize elements, and alignment 1;
// everything else costs LT.first.
5280 if (!isPowerOf2_32(EltSize) || EltSize < 8 || EltSize > 64 ||
5281 VT.getVectorNumElements() >= (128 / EltSize) || Alignment != Align(1))
5282 return LT.first;
5283 // FIXME: v3i8 lowering currently is very inefficient, due to automatic
5284 // widening to v4i8, which produces suboptimal results.
5285 if (VT.getVectorNumElements() == 3 && EltVT == MVT::i8)
5286 return LT.first;
5287
5288 // Check non-power-of-2 loads/stores for legal vector element types with
5289 // NEON. Non-power-of-2 memory ops will get broken down to a set of
5290 // operations on smaller power-of-2 ops, including ld1/st1.
5291 LLVMContext &C = Ty->getContext();
5293 SmallVector<EVT> TypeWorklist;
5294 TypeWorklist.push_back(VT);
// Repeatedly split a non-power-of-2 vector into its largest power-of-2 prefix
// and the remainder; each power-of-2 piece adds 1 to Cost.
5295 while (!TypeWorklist.empty()) {
5296 EVT CurrVT = TypeWorklist.pop_back_val();
5297 unsigned CurrNumElements = CurrVT.getVectorNumElements();
5298 if (isPowerOf2_32(CurrNumElements)) {
5299 Cost += 1;
5300 continue;
5301 }
5302
5303 unsigned PrevPow2 = NextPowerOf2(CurrNumElements) / 2;
5304 TypeWorklist.push_back(EVT::getVectorVT(C, EltVT, PrevPow2));
5305 TypeWorklist.push_back(
5306 EVT::getVectorVT(C, EltVT, CurrNumElements - PrevPow2));
5307 }
5308 return Cost;
5309 }
5310
5311 return LT.first;
5312}
5313
5315 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
5316 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
5317 bool UseMaskForCond, bool UseMaskForGaps) const {
// Cost of an interleaved access group with the given Factor. When the
// sub-vector type is legal for interleaved access the cost is
// Factor * number of interleaved accesses; otherwise the base implementation
// is used.
// NOTE(review): the first signature line and the statements guarded by the
// three bare `if`s below are missing from this listing.
5318 assert(Factor >= 2 && "Invalid interleave factor");
5319 auto *VecVTy = cast<VectorType>(VecTy);
5320
5321 if (VecTy->isScalableTy() && !ST->hasSVE())
5323
5324 // Scalable VFs will emit vector.[de]interleave intrinsics, and currently we
5325 // only have lowering for power-of-2 factors.
5326 // TODO: Add lowering for vector.[de]interleave3 intrinsics and support in
5327 // InterleavedAccessPass for ld3/st3
5328 if (VecTy->isScalableTy() && !isPowerOf2_32(Factor))
5330
5331 // Vectorization for masked interleaved accesses is only enabled for scalable
5332 // VF.
5333 if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
5335
5336 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
5337 unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
// Type of one de-interleaved member: same element type, 1/Factor of the
// element count.
5338 auto *SubVecTy =
5339 VectorType::get(VecVTy->getElementType(),
5340 VecVTy->getElementCount().divideCoefficientBy(Factor));
5341
5342 // ldN/stN only support legal vector types of size 64 or 128 in bits.
5343 // Accesses having vector types that are a multiple of 128 bits can be
5344 // matched to more than one ldN/stN instruction.
5345 bool UseScalable;
5346 if (MinElts % Factor == 0 &&
5347 TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
5348 return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
5349 }
5350
5351 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
5352 Alignment, AddressSpace, CostKind,
5353 UseMaskForCond, UseMaskForGaps);
5354}
5355
// For every type in Tys that is a vector whose total size is exactly 128 bits,
// add the cost of one store plus one load of that type (both queried with
// Align(128) and address space 0) to Cost. Non-vector types are ignored.
// NOTE(review): the signature and the declaration of Cost are missing from
// this listing. cast<FixedVectorType> assumes no scalable vector type reaches
// this loop — confirm against callers.
5360 for (auto *I : Tys) {
5361 if (!I->isVectorTy())
5362 continue;
5363 if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
5364 128)
5365 Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
5366 getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
5367 }
5368 return Cost;
5369}
5370
5372 Align Alignment) const {
// Reports whether a masked expand-load of DataTy can be lowered natively.
// Alignment is not consulted in the visible body.
// NOTE(review): the first signature line is missing from this listing.
5373 // Neon types should be scalarised when we are not choosing to use SVE.
5374 if (useNeonVector(DataTy))
5375 return false;
5376
5377 // Return true only if we are able to lower using the SVE2p2/SME2p2
5378 // expand instruction.
5379 return (ST->isSVEAvailable() && ST->hasSVE2p2()) ||
5380 (ST->isSVEorStreamingSVEAvailable() && ST->hasSME2p2());
5381}
5382
// Returns 4 for a scalar VF, or when HasUnorderedReductions is set and the
// known minimum VF is at most 4; otherwise returns the subtarget's maximum
// interleave factor.
// NOTE(review): the line carrying the function name is missing from this
// listing.
5383 unsigned
5385 bool HasUnorderedReductions) const {
5386 if (VF.isScalar() || (HasUnorderedReductions && VF.getKnownMinValue() <= 4))
5387 return 4;
5388 return ST->getMaxInterleaveFactor();
5389}
5390
5391 // For Falkor, we want to avoid having too many strided loads in a loop since
5392 // that can exhaust the HW prefetcher resources. We adjust the unroller
5393 // MaxCount preference below to attempt to ensure unrolling doesn't create too
5394 // many strided loads.
// NOTE(review): the signature lines (name and parameters) are missing from
// this listing; the body uses L, SE and UP.
5395 static void
5398 enum { MaxStridedLoads = 7 };
// Counts loads in the loop whose pointer is not loop-invariant and whose SCEV
// is an affine add-recurrence. Counting stops early once the count exceeds
// MaxStridedLoads / 2 (integer division, i.e. 3).
5399 auto countStridedLoads = [](Loop *L, ScalarEvolution &SE) {
5400 int StridedLoads = 0;
5401 // FIXME? We could make this more precise by looking at the CFG and
5402 // e.g. not counting loads in each side of an if-then-else diamond.
5403 for (const auto BB : L->blocks()) {
5404 for (auto &I : *BB) {
5405 LoadInst *LMemI = dyn_cast<LoadInst>(&I);
5406 if (!LMemI)
5407 continue;
5408
5409 Value *PtrValue = LMemI->getPointerOperand();
5410 if (L->isLoopInvariant(PtrValue))
5411 continue;
5412
5413 const SCEV *LSCEV = SE.getSCEV(PtrValue);
5414 const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
5415 if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
5416 continue;
5417
5418 // FIXME? We could take pairing of unrolled load copies into account
5419 // by looking at the AddRec, but we would probably have to limit this
5420 // to loops with no stores or other memory optimization barriers.
5421 ++StridedLoads;
5422 // We've seen enough strided loads that seeing more won't make a
5423 // difference.
5424 if (StridedLoads > MaxStridedLoads / 2)
5425 return StridedLoads;
5426 }
5427 }
5428 return StridedLoads;
5429 };
5430
5431 int StridedLoads = countStridedLoads(L, SE);
5432 LLVM_DEBUG(dbgs() << "falkor-hwpf: detected " << StridedLoads
5433 << " strided loads\n");
5434 // Pick the largest power of 2 unroll count that won't result in too many
5435 // strided loads.
// UP.MaxCount is left untouched when no strided loads were found.
5436 if (StridedLoads) {
5437 UP.MaxCount = 1 << Log2_32(MaxStridedLoads / StridedLoads);
5438 LLVM_DEBUG(dbgs() << "falkor-hwpf: setting unroll MaxCount to "
5439 << UP.MaxCount << '\n');
5440 }
5441}
5442
5443 // This function returns true if the loop:
5444 // 1. Has a valid cost, and
5445 // 2. Has a cost within the supplied budget.
5446 // Otherwise it returns false.
// Costs are queried with TTI::TCK_CodeSize. On success, and only if FinalSize
// is non-null, the accumulated cost is written to *FinalSize.
// NOTE(review): the first signature line is missing from this listing.
5448 InstructionCost Budget,
5449 unsigned *FinalSize) {
5450 // Estimate the size of the loop.
5451 InstructionCost LoopCost = 0;
5452
5453 for (auto *BB : L->getBlocks()) {
5454 for (auto &I : *BB) {
5455 SmallVector<const Value *, 4> Operands(I.operand_values());
5456 InstructionCost Cost =
5457 TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
5458 // This can happen with intrinsics that don't currently have a cost model
5459 // or for some operations that require SVE.
5460 if (!Cost.isValid())
5461 return false;
5462
// Bail out as soon as the running total exceeds the budget.
5463 LoopCost += Cost;
5464 if (LoopCost > Budget)
5465 return false;
5466 }
5467 }
5468
5469 if (FinalSize)
5470 *FinalSize = LoopCost.getValue();
5471 return true;
5472}
5473
5475 const AArch64TTIImpl &TTI) {
// Heuristic deciding whether a small multi-exit loop is worth runtime
// unrolling; every check below must pass for it to return true.
// NOTE(review): the first signature line, the condition that guards the first
// `return false`, and the body of the any_of lambda are missing from this
// listing.
5476 // Only consider loops with unknown trip counts for which we can determine
5477 // a symbolic expression. Multi-exit loops with small known trip counts will
5478 // likely be unrolled anyway.
5479 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5481 return false;
5482
5483 // It might not be worth unrolling loops with low max trip counts. Restrict
5484 // this to max trip counts > 32 for now.
5485 unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
5486 if (MaxTC > 0 && MaxTC <= 32)
5487 return false;
5488
5489 // Make sure the loop size is <= 5.
5490 if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
5491 return false;
5492
5493 // Small search loops with multiple exits can be highly beneficial to unroll.
5494 // We only care about loops with exactly two exiting blocks, although each
5495 // block could jump to the same exit block.
// NOTE(review): the check below counts all blocks of the loop
// (L->getBlocks()), not just its exiting blocks.
5496 ArrayRef<BasicBlock *> Blocks = L->getBlocks();
5497 if (Blocks.size() != 2)
5498 return false;
5499
5500 if (any_of(Blocks, [](BasicBlock *BB) {
5502 }))
5503 return false;
5504
5505 return true;
5506}
5507
5508 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
5509 /// OOO engine's wide instruction window and various predictors.
// NOTE(review): the signature lines carrying the name and leading parameters,
// and several interior lines, are missing from this listing.
5510 static void
5513 const AArch64TTIImpl &TTI) {
5514 // Limit loops with structure that is highly likely to benefit from runtime
5515 // unrolling; that is we exclude outer loops and loops with many blocks (i.e.
5516 // likely with complex control flow). Note that the heuristics here may be
5517 // overly conservative and we err on the side of avoiding runtime unrolling
5518 // rather than unroll excessively. They are all subject to further refinement.
5519 if (!L->isInnermost() || L->getNumBlocks() > 8)
5520 return;
5521
5522 // Loops with multiple exits are handled by common code.
5523 if (!L->getExitBlock())
5524 return;
5525
5526 // Check if the loop contains any reductions that could be parallelized when
5527 // unrolling. If so, enable partial unrolling, if the trip count is known to
5528 // be a multiple of 2.
// Only single-block loops whose size fits within a budget of 12 qualify.
5529 bool HasParellelizableReductions =
5530 L->getNumBlocks() == 1 &&
5531 any_of(L->getHeader()->phis(),
5532 [&SE, L](PHINode &Phi) {
5533 return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
5534 }) &&
5535 isLoopSizeWithinBudget(L, TTI, 12, nullptr);
5536 if (HasParellelizableReductions &&
5537 SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
5538 UP.Partial = true;
5539 UP.MaxCount = 4;
5540 UP.AddAdditionalAccumulators = true;
5541 }
5542
5543 const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
5545 (SE.getSmallConstantMaxTripCount(L) > 0 &&
5546 SE.getSmallConstantMaxTripCount(L) <= 32))
5547 return;
5548
5549 if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
5550 return;
5551
5553 return;
5554
5555 // Limit to loops with trip counts that are cheap to expand.
5556 UP.SCEVExpansionBudget = 1;
5557
5558 if (HasParellelizableReductions) {
5559 UP.Runtime = true;
5561 UP.AddAdditionalAccumulators = true;
5562 }
5563
5564 // Try to unroll small loops, of few-blocks with low budget, if they have
5565 // load/store dependencies, to expose more parallel memory access streams,
5566 // or if they do little work inside a block (i.e. load -> X -> store pattern).
5567 BasicBlock *Header = L->getHeader();
5568 BasicBlock *Latch = L->getLoopLatch();
5569 if (Header == Latch) {
5570 // Estimate the size of the loop.
5571 unsigned Size;
5572 unsigned Width = 10;
5573 if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
5574 return;
5575
5576 // Try to find an unroll count that maximizes the use of the instruction
5577 // window, i.e. trying to fetch as many instructions per cycle as possible.
// Candidates are UC = 1..8 with an unrolled size of at most 48. A candidate
// replaces the current best when its size is a multiple of MaxInstsPerLine or
// its remainder modulo MaxInstsPerLine is larger than the current best's.
5578 unsigned MaxInstsPerLine = 16;
5579 unsigned UC = 1;
5580 unsigned BestUC = 1;
5581 unsigned SizeWithBestUC = BestUC * Size;
5582 while (UC <= 8) {
5583 unsigned SizeWithUC = UC * Size;
5584 if (SizeWithUC > 48)
5585 break;
5586 if ((SizeWithUC % MaxInstsPerLine) == 0 ||
5587 (SizeWithBestUC % MaxInstsPerLine) < (SizeWithUC % MaxInstsPerLine)) {
5588 BestUC = UC;
5589 SizeWithBestUC = BestUC * Size;
5590 }
5591 UC++;
5592 }
5593
5594 if (BestUC == 1)
5595 return;
5596
// Collect values loaded from loop-varying addresses plus their in-loop users,
// and the stores to loop-varying addresses.
5597 SmallPtrSet<Value *, 8> LoadedValuesPlus;
5599 for (auto *BB : L->blocks()) {
5600 for (auto &I : *BB) {
5602 if (!Ptr)
5603 continue;
5604 const SCEV *PtrSCEV = SE.getSCEV(Ptr);
5605 if (SE.isLoopInvariant(PtrSCEV, L))
5606 continue;
5607 if (isa<LoadInst>(&I)) {
5608 LoadedValuesPlus.insert(&I);
5609 // Include in-loop 1st users of loaded values.
5610 for (auto *U : I.users())
5611 if (L->contains(cast<Instruction>(U)))
5612 LoadedValuesPlus.insert(U);
5613 } else
5614 Stores.push_back(cast<StoreInst>(&I));
5615 }
5616 }
5617
// Give up unless at least one collected store writes a value from that set.
5618 if (none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
5619 return LoadedValuesPlus.contains(SI->getOperand(0));
5620 }))
5621 return;
5622
5623 UP.Runtime = true;
5624 UP.DefaultUnrollRuntimeCount = BestUC;
5625 return;
5626 }
5627
5628 // Try to runtime-unroll loops with early-continues depending on loop-varying
5629 // loads; this helps with branch-prediction for the early-continues.
5630 auto *Term = dyn_cast<CondBrInst>(Header->getTerminator());
5632 if (!Term || Preds.size() == 1 || !llvm::is_contained(Preds, Header) ||
5633 none_of(Preds, [L](BasicBlock *Pred) { return L->contains(Pred); }))
5634 return;
5635
// Recursively walks operands (to a depth of at most 8) looking for a load;
// PHIs and loop-invariant instructions terminate the walk.
5636 std::function<bool(Instruction *, unsigned)> DependsOnLoopLoad =
5637 [&](Instruction *I, unsigned Depth) -> bool {
5638 if (isa<PHINode>(I) || L->isLoopInvariant(I) || Depth > 8)
5639 return false;
5640
5641 if (isa<LoadInst>(I))
5642 return true;
5643
5644 return any_of(I->operands(), [&](Value *V) {
5645 auto *I = dyn_cast<Instruction>(V);
5646 return I && DependsOnLoopLoad(I, Depth + 1);
5647 });
5648 };
5649 CmpPredicate Pred;
5650 Instruction *I;
5651 if (match(Term, m_Br(m_ICmp(Pred, m_Instruction(I), m_Value()), m_Value(),
5652 m_Value())) &&
5653 DependsOnLoopLoad(I, 0)) {
5654 UP.Runtime = true;
5655 }
5656}
5657
5660 OptimizationRemarkEmitter *ORE) const {
// Starts from the base-class unrolling preferences and then applies
// AArch64-specific adjustments to UP.
// NOTE(review): the leading signature lines and a number of interior
// statements are missing from this listing.
5661 // Enable partial unrolling and runtime unrolling.
5662 BaseT::getUnrollingPreferences(L, SE, UP, ORE);
5663
5664 UP.UpperBound = true;
5665
5666 // For inner loop, it is more likely to be a hot one, and the runtime check
5667 // can be promoted out from LICM pass, so the overhead is less, let's try
5668 // a larger threshold to unroll more loops.
5669 if (L->getLoopDepth() > 1)
5670 UP.PartialThreshold *= 2;
5671
5672 // Disable partial & runtime unrolling on -Os.
5674
5675 // Scan the loop: don't unroll loops with calls as this could prevent
5676 // inlining. Don't unroll auto-vectorized loops either, though do allow
5677 // unrolling of the scalar remainder.
5678 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
5680 for (auto *BB : L->getBlocks()) {
5681 for (auto &I : *BB) {
5682 // Both auto-vectorized loops and the scalar remainder have the
5683 // isvectorized attribute, so differentiate between them by the presence
5684 // of vector instructions.
5685 if (IsVectorized && I.getType()->isVectorTy())
5686 return;
5687 if (isa<CallBase>(I)) {
5690 if (!isLoweredToCall(F))
5691 continue;
5692 return;
5693 }
5694
5695 SmallVector<const Value *, 4> Operands(I.operand_values());
5696 Cost += getInstructionCost(&I, Operands,
5698 }
5699 }
5700
5701 // Apply subtarget-specific unrolling preferences.
5702 if (ST->isAppleMLike())
5703 getAppleRuntimeUnrollPreferences(L, SE, UP, *this);
5704 else if (ST->getProcFamily() == AArch64Subtarget::Falkor &&
5707
5708 // If this is a small, multi-exit loop similar to something like std::find,
5709 // then there is typically a performance improvement achieved by unrolling.
5710 if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
5711 UP.RuntimeUnrollMultiExit = true;
5712 UP.Runtime = true;
5713 // Limit unroll count.
5715 // Allow slightly more costly trip-count expansion to catch search loops
5716 // with pointer inductions.
5717 UP.SCEVExpansionBudget = 5;
5718 return;
5719 }
5720
5721 // Enable runtime unrolling for in-order models
5722 // If mcpu is omitted, getProcFamily() returns AArch64Subtarget::Generic (the
5723 // value tested below), so by checking for that case, we can ensure that the
5724 // default behaviour is unchanged
5725 if (ST->getProcFamily() != AArch64Subtarget::Generic &&
5726 !ST->getSchedModel().isOutOfOrder()) {
5727 UP.Runtime = true;
5728 UP.Partial = true;
5729 UP.UnrollRemainder = true;
5731
5732 UP.UnrollAndJam = true;
5734 }
5735
5736 // Forcing the unrolling of small loops can be very useful because of the
5737 // branch taken cost of the backedge.
5739 UP.Force = true;
5740}
5741
5746
5748 Type *ExpectedType,
5749 bool CanCreate) const {
// For NEON stN/st1xN intrinsics: when CanCreate is set and ExpectedType is a
// struct whose element types match the stored operands, builds that struct
// from the operands (inserting new instructions before Inst). For NEON
// ldN/ld1xN intrinsics: returns Inst itself when its type equals ExpectedType.
// Returns nullptr in every other case.
// NOTE(review): the first signature line is missing from this listing.
5750 switch (Inst->getIntrinsicID()) {
5751 default:
5752 return nullptr;
5753 case Intrinsic::aarch64_neon_st1x2:
5754 case Intrinsic::aarch64_neon_st1x3:
5755 case Intrinsic::aarch64_neon_st1x4:
5756 case Intrinsic::aarch64_neon_st2:
5757 case Intrinsic::aarch64_neon_st3:
5758 case Intrinsic::aarch64_neon_st4: {
5759 // Create a struct type
5760 StructType *ST = dyn_cast<StructType>(ExpectedType);
5761 if (!CanCreate || !ST)
5762 return nullptr;
// All arguments except the last one are treated as stored values.
5763 unsigned NumElts = Inst->arg_size() - 1;
5764 if (ST->getNumElements() != NumElts)
5765 return nullptr;
5766 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5767 if (Inst->getArgOperand(i)->getType() != ST->getElementType(i))
5768 return nullptr;
5769 }
5770 Value *Res = PoisonValue::get(ExpectedType);
5771 IRBuilder<> Builder(Inst);
5772 for (unsigned i = 0, e = NumElts; i != e; ++i) {
5773 Value *L = Inst->getArgOperand(i);
5774 Res = Builder.CreateInsertValue(Res, L, i);
5775 }
5776 return Res;
5777 }
5778 case Intrinsic::aarch64_neon_ld1x2:
5779 case Intrinsic::aarch64_neon_ld1x3:
5780 case Intrinsic::aarch64_neon_ld1x4:
5781 case Intrinsic::aarch64_neon_ld2:
5782 case Intrinsic::aarch64_neon_ld3:
5783 case Intrinsic::aarch64_neon_ld4:
5784 if (Inst->getType() == ExpectedType)
5785 return Inst;
5786 return nullptr;
5787 }
5788}
5789
5791 MemIntrinsicInfo &Info) const {
// Fills Info for the NEON ldN/ld1xN and stN/st1xN intrinsics and returns true;
// returns false for any other intrinsic ID.
// NOTE(review): the first signature line is missing from this listing.
// First switch: read/write flags and the pointer operand (argument 0 for
// loads, the last argument for stores).
5792 switch (Inst->getIntrinsicID()) {
5793 default:
5794 break;
5795 case Intrinsic::aarch64_neon_ld1x2:
5796 case Intrinsic::aarch64_neon_ld1x3:
5797 case Intrinsic::aarch64_neon_ld1x4:
5798 case Intrinsic::aarch64_neon_ld2:
5799 case Intrinsic::aarch64_neon_ld3:
5800 case Intrinsic::aarch64_neon_ld4:
5801 Info.ReadMem = true;
5802 Info.WriteMem = false;
5803 Info.PtrVal = Inst->getArgOperand(0);
5804 break;
5805 case Intrinsic::aarch64_neon_st1x2:
5806 case Intrinsic::aarch64_neon_st1x3:
5807 case Intrinsic::aarch64_neon_st1x4:
5808 case Intrinsic::aarch64_neon_st2:
5809 case Intrinsic::aarch64_neon_st3:
5810 case Intrinsic::aarch64_neon_st4:
5811 Info.ReadMem = false;
5812 Info.WriteMem = true;
5813 Info.PtrVal = Inst->getArgOperand(Inst->arg_size() - 1);
5814 break;
5815 }
5816
5817 // Use the ID of neon load as the "matching id".
// Each store intrinsic shares its MatchingId with the corresponding load.
5818 switch (Inst->getIntrinsicID()) {
5819 default:
5820 return false;
5821 case Intrinsic::aarch64_neon_ld1x2:
5822 case Intrinsic::aarch64_neon_st1x2:
5823 Info.MatchingId = Intrinsic::aarch64_neon_ld1x2;
5824 break;
5825 case Intrinsic::aarch64_neon_ld1x3:
5826 case Intrinsic::aarch64_neon_st1x3:
5827 Info.MatchingId = Intrinsic::aarch64_neon_ld1x3;
5828 break;
5829 case Intrinsic::aarch64_neon_ld1x4:
5830 case Intrinsic::aarch64_neon_st1x4:
5831 Info.MatchingId = Intrinsic::aarch64_neon_ld1x4;
5832 break;
5833 case Intrinsic::aarch64_neon_ld2:
5834 case Intrinsic::aarch64_neon_st2:
5835 Info.MatchingId = Intrinsic::aarch64_neon_ld2;
5836 break;
5837 case Intrinsic::aarch64_neon_ld3:
5838 case Intrinsic::aarch64_neon_st3:
5839 Info.MatchingId = Intrinsic::aarch64_neon_ld3;
5840 break;
5841 case Intrinsic::aarch64_neon_ld4:
5842 case Intrinsic::aarch64_neon_st4:
5843 Info.MatchingId = Intrinsic::aarch64_neon_ld4;
5844 break;
5845 }
5846 return true;
5847}
5848
5849 /// See if \p I should be considered for address type promotion. We check if \p
5850 /// I is a sext with right type and used in memory accesses. If it is used in a
5851 /// "complex" getelementptr, we allow it to be promoted without finding other
5852 /// sext instructions that sign extended the same initial value. A getelementptr
5853 /// is considered as "complex" if it has more than 2 operands.
// NOTE(review): the first signature line is missing from this listing.
5855 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
5856 bool Considerable = false;
5857 AllowPromotionWithoutCommonHeader = false;
5858 if (!isa<SExtInst>(&I))
5859 return false;
// Only sign-extensions producing i64 are considered.
5860 Type *ConsideredSExtType =
5861 Type::getInt64Ty(I.getParent()->getParent()->getContext());
5862 if (I.getType() != ConsideredSExtType)
5863 return false;
5864 // See if the sext is the one with the right type and used in at least one
5865 // GetElementPtrInst.
5866 for (const User *U : I.users()) {
5867 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
5868 Considerable = true;
5869 // A getelementptr is considered as "complex" if it has more than 2
5870 // operands. We will promote a SExt used in such complex GEP as we
5871 // expect some computation to be merged if they are done on 64 bits.
5872 if (GEPInst->getNumOperands() > 2) {
5873 AllowPromotionWithoutCommonHeader = true;
5874 break;
5875 }
5876 }
5877 }
5878 return Considerable;
5879}
5880
5882 const RecurrenceDescriptor &RdxDesc, ElementCount VF) const {
// Fixed-width VFs are always accepted. For scalable VFs the recurrence type
// must not be bfloat, must be a legal scalable-vector element type, and the
// recurrence kind must be one of those listed in the switch.
// NOTE(review): the first signature line and a few case labels are missing
// from this listing.
5883 if (!VF.isScalable())
5884 return true;
5885
5886 Type *Ty = RdxDesc.getRecurrenceType();
5887 if (Ty->isBFloatTy() || !isElementTypeLegalForScalableVector(Ty))
5888 return false;
5889
5890 switch (RdxDesc.getRecurrenceKind()) {
5891 case RecurKind::Sub:
5892 case RecurKind::FSub:
5895 case RecurKind::Add:
5896 case RecurKind::FAdd:
5897 case RecurKind::And:
5898 case RecurKind::Or:
5899 case RecurKind::Xor:
5900 case RecurKind::SMin:
5901 case RecurKind::SMax:
5902 case RecurKind::UMin:
5903 case RecurKind::UMax:
5904 case RecurKind::FMin:
5905 case RecurKind::FMax:
5906 case RecurKind::FMulAdd:
5907 case RecurKind::AnyOf:
5909 return true;
5910 default:
5911 return false;
5912 }
5913}
5914
5917 FastMathFlags FMF,
// Cost of a min/max reduction (intrinsic IID) over Ty: one legal-width
// intrinsic per extra legalization part, plus a fixed 2 for the final
// horizontal reduction. f16 without full FP16 support defers to the base
// implementation.
// NOTE(review): parts of the signature and the statement guarded by the
// <vscale x 1 x eltty> check are missing from this listing.
5919 // The code-generator is currently not able to handle scalable vectors
5920 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5921 // it. This change will be removed when code-generation for these types is
5922 // sufficiently reliable.
5923 if (auto *VTy = dyn_cast<ScalableVectorType>(Ty))
5924 if (VTy->getElementCount() == ElementCount::getScalable(1))
5926
5927 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
5928
5929 if (LT.second.getScalarType() == MVT::f16 && !ST->hasFullFP16())
5930 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
5931
5932 InstructionCost LegalizationCost = 0;
// When the type is split into LT.first parts, LT.first - 1 element-wise
// min/max operations on the legal type are charged.
5933 if (LT.first > 1) {
5934 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
5935 IntrinsicCostAttributes Attrs(IID, LegalVTy, {LegalVTy, LegalVTy}, FMF);
5936 LegalizationCost = getIntrinsicInstrCost(Attrs, CostKind) * (LT.first - 1);
5937 }
5938
5939 return LegalizationCost + /*Cost of horizontal reduction*/ 2;
5940}
5941
5943 unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const {
// Cost of an arithmetic reduction: LT.first - 1 arithmetic operations on the
// legal type to combine the split parts, plus 2 for the final horizontal
// reduction when the opcode maps to ADD, AND, OR, XOR or FADD.
// NOTE(review): the first signature line and the statement in the `default`
// case are missing from this listing.
5944 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
5945 InstructionCost LegalizationCost = 0;
5946 if (LT.first > 1) {
5947 Type *LegalVTy = EVT(LT.second).getTypeForEVT(ValTy->getContext());
5948 LegalizationCost = getArithmeticInstrCost(Opcode, LegalVTy, CostKind);
5949 LegalizationCost *= LT.first - 1;
5950 }
5951
5952 int ISD = TLI->InstructionOpcodeToISD(Opcode);
5953 assert(ISD && "Invalid opcode");
5954 // Add the final reduction cost for the legal horizontal reduction
5955 switch (ISD) {
5956 case ISD::ADD:
5957 case ISD::AND:
5958 case ISD::OR:
5959 case ISD::XOR:
5960 case ISD::FADD:
5961 return LegalizationCost + 2;
5962 default:
5964 }
5965}
5966
5969 std::optional<FastMathFlags> FMF,
// Cost of an arithmetic reduction of ValTy with the given Opcode. Scalable
// vectors are routed to getArithmeticReductionCostSVE; fixed-width vectors use
// the table and FADD handling below, falling back to the base implementation
// when nothing matches.
// NOTE(review): parts of the signature, the condition opening the block that
// closes at listing line 5996, and a few guarded statements are missing from
// this listing.
5971 // The code-generator is currently not able to handle scalable vectors
5972 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
5973 // it. This change will be removed when code-generation for these types is
5974 // sufficiently reliable.
5975 if (auto *VTy = dyn_cast<ScalableVectorType>(ValTy))
5976 if (VTy->getElementCount() == ElementCount::getScalable(1))
5978
5980 if (auto *FixedVTy = dyn_cast<FixedVectorType>(ValTy)) {
5981 InstructionCost BaseCost =
5982 BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
5983 // Add on extra cost to reflect the extra overhead on some CPUs. We still
5984 // end up vectorizing for more computationally intensive loops.
5985 return BaseCost + FixedVTy->getNumElements();
5986 }
5987
5988 if (Opcode != Instruction::FAdd || ValTy->getElementType()->isBFloatTy())
5990
// Scalable FAdd case: scalar op cost times getMaxNumElements() of the type.
5991 auto *VTy = cast<ScalableVectorType>(ValTy);
5993 getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
5994 Cost *= getMaxNumElements(VTy->getElementCount());
5995 return Cost;
5996 }
5997
5998 if (isa<ScalableVectorType>(ValTy))
5999 return getArithmeticReductionCostSVE(Opcode, ValTy, CostKind);
6000
6001 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
6002 MVT MTy = LT.second;
6003 int ISD = TLI->InstructionOpcodeToISD(Opcode);
6004 assert(ISD && "Invalid opcode");
6005
6006 // Horizontal adds can use the 'addv' instruction. We model the cost of these
6007 // instructions as twice a normal vector add, plus 1 for each legalization
6008 // step (LT.first). This is the only arithmetic vector reduction operation for
6009 // which we have an instruction.
6010 // OR, XOR and AND costs should match the codegen from:
6011 // OR: llvm/test/CodeGen/AArch64/reduce-or.ll
6012 // XOR: llvm/test/CodeGen/AArch64/reduce-xor.ll
6013 // AND: llvm/test/CodeGen/AArch64/reduce-and.ll
6014 static const CostTblEntry CostTblNoPairwise[]{
6015 {ISD::ADD, MVT::v8i8, 2},
6016 {ISD::ADD, MVT::v16i8, 2},
6017 {ISD::ADD, MVT::v4i16, 2},
6018 {ISD::ADD, MVT::v8i16, 2},
6019 {ISD::ADD, MVT::v2i32, 2},
6020 {ISD::ADD, MVT::v4i32, 2},
6021 {ISD::ADD, MVT::v2i64, 2},
6022 {ISD::OR, MVT::v8i8, 5}, // fmov + orr_lsr + orr_lsr + lsr + orr
6023 {ISD::OR, MVT::v16i8, 7}, // ext + orr + same as v8i8
6024 {ISD::OR, MVT::v4i16, 4}, // fmov + orr_lsr + lsr + orr
6025 {ISD::OR, MVT::v8i16, 6}, // ext + orr + same as v4i16
6026 {ISD::OR, MVT::v2i32, 3}, // fmov + lsr + orr
6027 {ISD::OR, MVT::v4i32, 5}, // ext + orr + same as v2i32
6028 {ISD::OR, MVT::v2i64, 3}, // ext + orr + fmov
6029 {ISD::XOR, MVT::v8i8, 5}, // Same as above for or...
6030 {ISD::XOR, MVT::v16i8, 7},
6031 {ISD::XOR, MVT::v4i16, 4},
6032 {ISD::XOR, MVT::v8i16, 6},
6033 {ISD::XOR, MVT::v2i32, 3},
6034 {ISD::XOR, MVT::v4i32, 5},
6035 {ISD::XOR, MVT::v2i64, 3},
6036 {ISD::AND, MVT::v8i8, 5}, // Same as above for or...
6037 {ISD::AND, MVT::v16i8, 7},
6038 {ISD::AND, MVT::v4i16, 4},
6039 {ISD::AND, MVT::v8i16, 6},
6040 {ISD::AND, MVT::v2i32, 3},
6041 {ISD::AND, MVT::v4i32, 5},
6042 {ISD::AND, MVT::v2i64, 3},
6043 };
6044 switch (ISD) {
6045 default:
6046 break;
6047 case ISD::FADD:
6048 if (Type *EltTy = ValTy->getScalarType();
6049 // FIXME: For half types without fullfp16 support, this could extend and
6050 // use a fp32 faddp reduction but current codegen unrolls.
6051 MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
6052 (EltTy->isHalfTy() && ST->hasFullFP16()))) {
6053 const unsigned NElts = MTy.getVectorNumElements();
6054 if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
6055 isPowerOf2_32(NElts))
6056 // Reduction corresponding to series of fadd instructions is lowered to
6057 // series of faddp instructions. faddp has latency/throughput that
6058 // matches fadd instruction and hence, every faddp instruction can be
6059 // considered to have a relative cost = 1 with
6060 // CostKind = TCK_RecipThroughput.
6061 // An faddp will pairwise add vector elements, so the size of input
6062 // vector reduces by half every time, requiring
6063 // #(faddp instructions) = log2_32(NElts).
6064 return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts);
6065 }
6066 break;
6067 case ISD::ADD:
6068 if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
6069 return (LT.first - 1) + Entry->Cost;
6070 break;
6071 case ISD::XOR:
6072 case ISD::AND:
6073 case ISD::OR:
6074 const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy);
6075 if (!Entry)
6076 break;
6077 auto *ValVTy = cast<FixedVectorType>(ValTy);
// The table cost is only used when the legal type has no more elements than
// the original type and the original element count is a power of 2.
6078 if (MTy.getVectorNumElements() <= ValVTy->getNumElements() &&
6079 isPowerOf2_32(ValVTy->getNumElements())) {
6080 InstructionCost ExtraCost = 0;
6081 if (LT.first != 1) {
6082 // Type needs to be split, so there is an extra cost of LT.first - 1
6083 // arithmetic ops.
6084 auto *Ty = FixedVectorType::get(ValTy->getElementType(),
6085 MTy.getVectorNumElements());
6086 ExtraCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
6087 ExtraCost *= LT.first - 1;
6088 }
6089 // All and/or/xor of i1 will be lowered with maxv/minv/addv + fmov
6090 auto Cost = ValVTy->getElementType()->isIntegerTy(1) ? 2 : Entry->Cost;
6091 return Cost + ExtraCost;
6092 }
6093 break;
6094 }
6095 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
6096}
6097
// NOTE(review): the declarator line that names this function is absent from
// this listing; the BaseT fallback at the end suggests it is the AArch64
// override of getExtendedReductionCost -- confirm against the full file.
//
// Costs a reduction over VecTy whose result type is ResTy. Only
// Opcode == Instruction::Add with simple value types and a vector of at least
// 64 bits is special-cased here; every other case is forwarded to BaseT.
6099 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *VecTy,
6100 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
6101 EVT VecVT = TLI->getValueType(DL, VecTy);
6102 EVT ResVT = TLI->getValueType(DL, ResTy);
6103
6104 if (Opcode == Instruction::Add && VecVT.isSimple() && ResVT.isSimple() &&
6105 VecVT.getSizeInBits() >= 64) {
6106 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
6107
6108 // The legal cases are:
6109 // UADDLV 8/16/32->32
6110 // UADDLP 32->64
// RevVTSize is the bit width of the result type (ResVT).
6111 unsigned RevVTSize = ResVT.getSizeInBits();
6112 if (((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6113 RevVTSize <= 32) ||
6114 ((LT.second == MVT::v4i16 || LT.second == MVT::v8i16) &&
6115 RevVTSize <= 32) ||
6116 ((LT.second == MVT::v2i32 || LT.second == MVT::v4i32) &&
6117 RevVTSize <= 64))
// Base cost of 2, plus 2 for every additional legalization step.
6118 return (LT.first - 1) * 2 + 2;
6119 }
6120
6121 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, VecTy, FMF,
6122 CostKind);
6123}
6124
// Costs a multiply-accumulate reduction of VecTy into ResTy. When the
// subtarget has dot-product instructions, both value types are simple and the
// reduction opcode is Add, a legalized input of v8i8 or v16i8 with an i32
// result is costed as LT.first + 2. All other cases use the BaseT
// implementation.
// NOTE(review): the return-type line and the final parameter line of the
// declarator are missing from this listing.
6126AArch64TTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
6127 Type *ResTy, VectorType *VecTy,
6129 EVT VecVT = TLI->getValueType(DL, VecTy);
6130 EVT ResVT = TLI->getValueType(DL, ResTy);
6131
6132 if (ST->hasDotProd() && VecVT.isSimple() && ResVT.isSimple() &&
6133 RedOpcode == Instruction::Add) {
6134 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VecTy);
6135
6136 // The legal cases with dotprod are
6137 // UDOT 8->32
6138 // Which requires an additional uaddv to sum the i32 values.
6139 if ((LT.second == MVT::v8i8 || LT.second == MVT::v16i8) &&
6140 ResVT == MVT::i32)
6141 return LT.first + 2;
6142 }
6143
6144 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, VecTy,
6145 CostKind);
6146}
6147
// NOTE(review): the declarator of this function is not part of this listing.
// The SK_Splice-only table and the "Illegal Type for Splice" assert suggest it
// is the splice-cost helper used by the shuffle costing code -- confirm.
//
// Computes a splice cost for type Tp: the per-type table cost, plus extra
// compare/select cost when Index is negative and extend/truncate cost when the
// legal type is an i1 predicate, all multiplied by the legalization factor
// LT.first. Only scalable (nxv*) types have table entries.
6151 static const CostTblEntry ShuffleTbl[] = {
6152 { TTI::SK_Splice, MVT::nxv16i8, 1 },
6153 { TTI::SK_Splice, MVT::nxv8i16, 1 },
6154 { TTI::SK_Splice, MVT::nxv4i32, 1 },
6155 { TTI::SK_Splice, MVT::nxv2i64, 1 },
6156 { TTI::SK_Splice, MVT::nxv2f16, 1 },
6157 { TTI::SK_Splice, MVT::nxv4f16, 1 },
6158 { TTI::SK_Splice, MVT::nxv8f16, 1 },
6159 { TTI::SK_Splice, MVT::nxv2bf16, 1 },
6160 { TTI::SK_Splice, MVT::nxv4bf16, 1 },
6161 { TTI::SK_Splice, MVT::nxv8bf16, 1 },
6162 { TTI::SK_Splice, MVT::nxv2f32, 1 },
6163 { TTI::SK_Splice, MVT::nxv4f32, 1 },
6164 { TTI::SK_Splice, MVT::nxv2f64, 1 },
6165 };
6166
6167 // The code-generator is currently not able to handle scalable vectors
6168 // of <vscale x 1 x eltty> yet, so return an invalid cost to avoid selecting
6169 // it. This change will be removed when code-generation for these types is
6170 // sufficiently reliable.
// NOTE(review): the statement implementing the check described above is
// missing from this listing.
6173
6174 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
6175 Type *LegalVTy = EVT(LT.second).getTypeForEVT(Tp->getContext());
// i1 (predicate) vectors are costed on the promoted type the target lowering
// reports; every other type is costed as-is.
6176 EVT PromotedVT = LT.second.getScalarType() == MVT::i1
6177 ? TLI->getPromotedVTForPredicate(EVT(LT.second))
6178 : LT.second;
6179 Type *PromotedVTy = EVT(PromotedVT).getTypeForEVT(Tp->getContext());
6180 InstructionCost LegalizationCost = 0;
// A negative Index additionally pays for an ICmp and a Select on the promoted
// type. NOTE(review): the trailing argument lines of both calls, including
// the operator joining them, are missing from this listing.
6181 if (Index < 0) {
6182 LegalizationCost =
6183 getCmpSelInstrCost(Instruction::ICmp, PromotedVTy, PromotedVTy,
6185 getCmpSelInstrCost(Instruction::Select, PromotedVTy, LegalVTy,
6187 }
6188
6189 // Predicate splices are promoted when lowering (see AArch64ISelLowering.cpp),
6190 // so the cost is computed on the promoted type.
6191 if (LT.second.getScalarType() == MVT::i1) {
6192 LegalizationCost +=
6193 getCastInstrCost(Instruction::ZExt, PromotedVTy, LegalVTy,
6195 getCastInstrCost(Instruction::Trunc, LegalVTy, PromotedVTy,
6197 }
6198 const auto *Entry =
6199 CostTableLookup(ShuffleTbl, TTI::SK_Splice, PromotedVT.getSimpleVT());
6200 assert(Entry && "Illegal Type for Splice");
6201 LegalizationCost += Entry->Cost;
6202 return LegalizationCost * LT.first;
6203}
6204
// NOTE(review): the declarator line, one parameter line and the definition of
// `Invalid` are missing from this listing; the BaseT fallback at the end
// indicates this is the AArch64 override of getPartialReductionCost.
//
// Costs a partial reduction that accumulates (optionally multiplied) extended
// inputs of type InputTypeA/InputTypeB into AccumType at vectorization factor
// VF. Returns `Invalid` for unsupported opcode, extend, binary-op or
// legalization combinations, a cost derived from InputLT.first for the
// combinations matched below, and defers to BaseT for everything else.
6206 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
6208 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
6209 TTI::TargetCostKind CostKind, std::optional<FastMathFlags> FMF) const {
6211
// NOTE(review): the condition guarding this early return is missing from this
// listing.
6213 return Invalid;
6214
// Only add/sub style reductions (integer or floating point) with an extended
// first operand are considered.
6215 if ((Opcode != Instruction::Add && Opcode != Instruction::Sub &&
6216 Opcode != Instruction::FAdd && Opcode != Instruction::FSub) ||
6217 OpAExtend == TTI::PR_None)
6218 return Invalid;
6219
6220 // Floating-point partial reductions are invalid if `reassoc` and `contract`
6221 // are not allowed.
6222 if (AccumType->isFloatingPointTy()) {
6223 assert(FMF && "Missing FastMathFlags for floating-point partial reduction");
6224 if (!FMF->allowReassoc() || !FMF->allowContract())
6225 return Invalid;
6226 } else {
6227 assert(!FMF &&
6228 "FastMathFlags only apply to floating-point partial reductions");
6229 }
6230
// Either there is a binary op with a second extended input, or there is
// neither a binary op nor a second input.
6231 assert((BinOp || (OpBExtend == TTI::PR_None && !InputTypeB)) &&
6232 (!BinOp || (OpBExtend != TTI::PR_None && InputTypeB)) &&
6233 "Unexpected values for OpBExtend or InputTypeB");
6234
6235 // We only support multiply binary operations for now, and for muls we
6236 // require the types being extended to be the same.
6237 if (BinOp && ((*BinOp != Instruction::Mul && *BinOp != Instruction::FMul) ||
6238 InputTypeA != InputTypeB))
6239 return Invalid;
6240
// IsUSDot: both operands are extended, but with different extend kinds.
6241 bool IsUSDot = OpBExtend != TTI::PR_None && OpAExtend != OpBExtend;
6242 // USDot is natively supported with +i8mm. With plain +dotprod, SUMLA is
6243 // lowered to two udots plus an eor and a sub.
6244 if (IsUSDot && !ST->hasMatMulInt8() && !ST->hasDotProd())
6245 // FIXME: Remove this early bailout in favour of expand cost.
6246 return Invalid;
6247
// Ratio is the accumulator element width divided by the input element width;
// the accumulator vector has VF / Ratio elements. A VF that is not strictly
// larger than Ratio is rejected.
6248 unsigned Ratio =
6249 AccumType->getScalarSizeInBits() / InputTypeA->getScalarSizeInBits();
6250 if (VF.getKnownMinValue() <= Ratio)
6251 return Invalid;
6252
6253 VectorType *InputVectorType = VectorType::get(InputTypeA, VF);
6254 VectorType *AccumVectorType =
6255 VectorType::get(AccumType, VF.divideCoefficientBy(Ratio));
6256 // We don't yet support all kinds of legalization.
6257 auto TC = TLI->getTypeConversion(AccumVectorType->getContext(),
6258 EVT::getEVT(AccumVectorType));
// NOTE(review): the accepted `case` labels of this switch and the right-hand
// side of the comparison below are missing from this listing.
6259 switch (TC.first) {
6260 default:
6261 return Invalid;
6265 // The legalised type (e.g. after splitting) must be legal too.
6266 if (TLI->getTypeAction(AccumVectorType->getContext(), TC.second) !=
6268 return Invalid;
6269 break;
6270 }
6271
6272 std::pair<InstructionCost, MVT> AccumLT =
6273 getTypeLegalizationCost(AccumVectorType);
6274 std::pair<InstructionCost, MVT> InputLT =
6275 getTypeLegalizationCost(InputVectorType);
6276
6277 // Returns true if the subtarget supports the operation for a given type.
// SVEPred is honoured whenever SVE or streaming SVE is available; NEONPred
// only for fixed-length legal accumulator types of at most 128 bits with NEON
// available.
6278 auto IsSupported = [&](bool SVEPred, bool NEONPred) -> bool {
6279 return (ST->isSVEorStreamingSVEAvailable() && SVEPred) ||
6280 (AccumLT.second.isFixedLengthVector() &&
6281 AccumLT.second.getSizeInBits() <= 128 && ST->isNeonAvailable() &&
6282 NEONPred);
6283 };
6284
6285 bool IsSub = Opcode == Instruction::Sub || Opcode == Instruction::FSub;
// Base cost: one basic operation per legalized input part.
6286 InstructionCost Cost = InputLT.first * TTI::TCC_Basic;
6287 // Integer partial sub-reductions that don't map to a specific instruction,
6288 // carry an extra cost for implementing a double negation:
6289 // partial_reduce_umls acc, lhs, rhs
6290 // <=> -partial_reduce_umla -acc, lhs, rhs
6291 InstructionCost INegCost = IsSub ? 2 * InputLT.first * TTI::TCC_Basic : 0;
6292
6293 if (AccumLT.second.getScalarType() == MVT::i32 &&
6294 InputLT.second.getScalarType() == MVT::i8) {
6295 // i8 -> i32 is natively supported with udot/sdot for both NEON and SVE.
6296 if (!IsUSDot && IsSupported(true, ST->hasDotProd()))
6297 return Cost + INegCost;
6298 // i8 -> i32 usdot requires +i8mm
6299 if (IsUSDot && IsSupported(ST->hasMatMulInt8(), ST->hasMatMulInt8()))
6300 return Cost + INegCost;
6301 // Without +i8mm, lower SUMLA via two udots plus an eor and a sub on plain
6302 // +dotprod targets. Note that this is only implemented for NEON, as all
6303 // modern CPUs with SVE also have +i8mm. Charge an extra factor for the
6304 // expansion.
6305 if (IsUSDot && IsSupported(false, ST->hasDotProd()))
6306 return Cost * 3 + INegCost;
6307 }
6308
6309 if (ST->isSVEorStreamingSVEAvailable() && !IsUSDot) {
6310 // i16 -> i64 is natively supported for udot/sdot
6311 if (AccumLT.second.getScalarType() == MVT::i64 &&
6312 InputLT.second.getScalarType() == MVT::i16)
6313 return Cost + INegCost;
6314 // i16 -> i32 is natively supported with SVE2p1 udot/sdot.
6315 // For sub-reductions, we prefer using the *mlslb/t instructions.
6316 if (AccumLT.second.getScalarType() == MVT::i32 &&
6317 InputLT.second.getScalarType() == MVT::i16 &&
6318 (ST->hasSVE2p1() || ST->hasSME2()) && !IsSub)
6319 return Cost;
6320 // i8 -> i64 is supported with an extra level of extends
6321 if (AccumLT.second.getScalarType() == MVT::i64 &&
6322 InputLT.second.getScalarType() == MVT::i8)
6323 // FIXME: This cost should probably be a little higher, e.g. Cost + 2
6324 // because it requires two extra extends on the inputs. But if we'd change
6325 // that now, a regular reduction would be cheaper because the costs of
6326 // the extends in the IR are still counted. This can be fixed
6327 // after https://github.com/llvm/llvm-project/pull/147302 has landed.
6328 return Cost + INegCost;
6329 // i8 -> i16 is natively supported with SVE2p3 udot/sdot
6330 // For sub-reductions, we prefer using the *mlslb/t instructions.
6331 if (AccumLT.second.getScalarType() == MVT::i16 &&
6332 InputLT.second.getScalarType() == MVT::i8 &&
6333 (ST->hasSVE2p3() || ST->hasSME2p3()) && !IsSub)
6334 return Cost;
6335 }
6336
6337 // f16 -> f32 is natively supported for fdot using either
6338 // SVE or NEON instruction.
6339 if (Opcode == Instruction::FAdd && !IsSub &&
6340 IsSupported(ST->hasSME2() || ST->hasSVE2p1(), ST->hasF16F32DOT()) &&
6341 AccumLT.second.getScalarType() == MVT::f32 &&
6342 InputLT.second.getScalarType() == MVT::f16)
6343 return Cost;
6344
6345 // For a ratio of 2, we can use *mlal and *mlsl top/bottom instructions.
// All matches in this branch cost twice the base cost.
6346 if (Ratio == 2 && !IsUSDot) {
6347 MVT InVT = InputLT.second.getScalarType();
6348
6349 // SVE2 [us]ml[as]lb/t and NEON [us]ml[as]l(2)
6350 if (IsSupported(ST->hasSVE2() || ST->hasSME(), true) &&
6351 llvm::is_contained({MVT::i8, MVT::i16, MVT::i32}, InVT.SimpleTy))
6352 return Cost * 2;
6353
6354 // SVE2 fml[as]lb/t and NEON fml[as]l(2)
6355 if (IsSupported(ST->hasSVE2(), ST->hasFP16FML()) && InVT == MVT::f16)
6356 return Cost * 2;
6357
6358 // SME2/SVE2p1 bfmlslb/t
6359 if (IsSupported(ST->hasSVE2p1() || ST->hasSME2(), false) &&
6360 InVT == MVT::bf16 && IsSub)
6361 return Cost * 2;
6362
6363 // FP partial sub-reductions that don't map to a specific instruction,
6364 // carry an extra cost for implementing an extra negation:
6365 // partial_reduce_fmls acc, lhs, rhs
6366 // <=> partial_reduce_fmla acc, lhs, -rhs
6367 InstructionCost FNegCost = IsSub ? InputLT.first * TTI::TCC_Basic : 0;
6368
6369 // SVE and NEON bfmlalb/t
6370 if (IsSupported(ST->hasBF16(), ST->hasBF16()) && InVT == MVT::bf16)
6371 return Cost * 2 + FNegCost;
6372 }
6373
6374 return BaseT::getPartialReductionCost(Opcode, InputTypeA, InputTypeB,
6375 AccumType, VF, OpAExtend, OpBExtend,
6376 BinOp, CostKind, FMF);
6377}
6378
// NOTE(review): the declarator and several interior lines of this function
// are missing from this listing; the recursive call and the BaseT fallback
// identify it as the AArch64 override of getShuffleCost.
//
// Costs a shuffle of kind Kind from SrcTy to DstTy with the optional Mask.
// The visible logic proceeds in this order:
//  1. Oversized fixed-length masks are split into legal-width chunks which
//     are costed individually (identical chunks are only costed once).
//  2. Half-width subvector extracts, identity masks, segmented shuffles,
//     broadcast loads (code size only), 4-element perfect shuffles and
//     masks matching a single native instruction are special-cased.
//  3. Remaining kinds are looked up in a per-type cost table.
//  4. Scalable splices and aligned subvector inserts are handled next, and
//     anything left is forwarded to BaseT::getShuffleCost.
6381 VectorType *SrcTy, ArrayRef<int> Mask,
6382 TTI::TargetCostKind CostKind, int Index,
6384 const Instruction *CxtI) const {
6385 assert((Mask.empty() || DstTy->isScalableTy() ||
6386 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
6387 "Expected the Mask to match the return size if given");
6388 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
6389 "Expected the same scalar types");
6390 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
6391
6392 // If we have a Mask, and the LT is being legalized somehow, split the Mask
6393 // into smaller vectors and sum the cost of each shuffle.
6394 if (!Mask.empty() && isa<FixedVectorType>(SrcTy) && LT.second.isVector() &&
6395 LT.second.getScalarSizeInBits() * Mask.size() > 128 &&
6396 SrcTy->getScalarSizeInBits() == LT.second.getScalarSizeInBits() &&
6397 Mask.size() > LT.second.getVectorNumElements() && !Index && !SubTp) {
6398 // Check for LD3/LD4 instructions, which are represented in llvm IR as
6399 // deinterleaving-shuffle(load). The shuffle cost could potentially be free,
6400 // but we model it with a cost of LT.first so that LD3/LD4 have a higher
6401 // cost than just the load.
// NOTE(review): the value returned below is max(1, LT.first / 4), not
// LT.first as the comment above states -- one of the two is stale. Part of
// the condition is also missing from this listing.
6402 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
6405 return std::max<InstructionCost>(1, LT.first / 4);
6406
6407 // Check for ST3/ST4 instructions, which are represented in llvm IR as
6408 // store(interleaving-shuffle). The shuffle cost could potentially be free,
6409 // but we model it with a cost of LT.first so that ST3/ST4 have a higher
6410 // cost than just the store.
6411 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
6413 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2) ||
6415 Mask, 3, SrcTy->getElementCount().getKnownMinValue() * 2)))
6416 return LT.first;
6417
// NumVecs is the number of legal-width chunks needed to cover the mask
// (rounded up); NTp is the vector type of one such chunk.
6418 unsigned TpNumElts = Mask.size();
6419 unsigned LTNumElts = LT.second.getVectorNumElements();
6420 unsigned NumVecs = (TpNumElts + LTNumElts - 1) / LTNumElts;
6421 VectorType *NTp = VectorType::get(SrcTy->getScalarType(),
6422 LT.second.getVectorElementCount());
// NOTE(review): the declaration of the running total `Cost` is missing from
// this listing. PreviousCosts remembers every (Source1, Source2, sub-mask)
// combination that has already been costed.
6424 std::map<std::tuple<unsigned, unsigned, SmallVector<int>>, InstructionCost>
6425 PreviousCosts;
6426 for (unsigned N = 0; N < NumVecs; N++) {
6427 SmallVector<int> NMask;
6428 // Split the existing mask into chunks of size LTNumElts. Track the source
6429 // sub-vectors to ensure the result has at most 2 inputs.
6430 unsigned Source1 = -1U, Source2 = -1U;
6431 unsigned NumSources = 0;
6432 for (unsigned E = 0; E < LTNumElts; E++) {
// NOTE(review): the fallback value used past the end of the mask, and the
// statement executed for negative mask elements, are missing from this
// listing.
6433 int MaskElt = (N * LTNumElts + E < TpNumElts) ? Mask[N * LTNumElts + E]
6435 if (MaskElt < 0) {
6437 continue;
6438 }
6439
6440 // Calculate which source from the input this comes from and whether it
6441 // is new to us.
6442 unsigned Source = MaskElt / LTNumElts;
6443 if (NumSources == 0) {
6444 Source1 = Source;
6445 NumSources = 1;
6446 } else if (NumSources == 1 && Source != Source1) {
6447 Source2 = Source;
6448 NumSources = 2;
6449 } else if (NumSources >= 2 && Source != Source1 && Source != Source2) {
6450 NumSources++;
6451 }
6452
6453 // Add to the new mask. For the NumSources>2 case these are not correct,
6454 // but are only used for the modular lane number.
6455 if (Source == Source1)
6456 NMask.push_back(MaskElt % LTNumElts);
6457 else if (Source == Source2)
6458 NMask.push_back(MaskElt % LTNumElts + LTNumElts);
6459 else
6460 NMask.push_back(MaskElt % LTNumElts);
6461 }
6462 // Check if we have already generated this sub-shuffle, which means we
6463 // will have already generated the output. For example a <16 x i32> splat
6464 // will be the same sub-splat 4 times, which only needs to be generated
6465 // once and reused.
6466 auto Result =
6467 PreviousCosts.insert({std::make_tuple(Source1, Source2, NMask), 0});
6468 // Check if it was already in the map (already costed).
6469 if (!Result.second)
6470 continue;
6471 // If the sub-mask has at most 2 input sub-vectors then re-cost it using
6472 // getShuffleCost. If not then cost it using the worst case as the number
6473 // of element moves into a new vector.
6474 InstructionCost NCost =
6475 NumSources <= 2
6476 ? getShuffleCost(NumSources <= 1 ? TTI::SK_PermuteSingleSrc
6478 NTp, NTp, NMask, CostKind, 0, nullptr, Args,
6479 CxtI)
6480 : LTNumElts;
6481 Result.first->second = NCost;
6482 Cost += NCost;
6483 }
6484 return Cost;
6485 }
6486
6487 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
6488 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
6489 // A subvector extract can be implemented with a NEON/SVE ext (or trivial
6490 // extract, if from lane 0) for 128-bit NEON vectors or legal SVE vectors.
6491 // This currently only handles low or high extracts to prevent SLP vectorizer
6492 // regressions.
6493 // Note that SVE's ext instruction is destructive, but it can be fused with
6494 // a movprfx to act like a constructive instruction.
6495 if (IsExtractSubvector && LT.second.isFixedLengthVector()) {
6496 if (LT.second.getFixedSizeInBits() >= 128 &&
6497 cast<FixedVectorType>(SubTp)->getNumElements() ==
6498 LT.second.getVectorNumElements() / 2) {
// Low half: free. High half: one instruction.
6499 if (Index == 0)
6500 return 0;
6501 if (Index == (int)LT.second.getVectorNumElements() / 2)
6502 return 1;
6503 }
// NOTE(review): one statement is missing from this listing here; see the
// "Restore optimal kind" step near the end of the function.
6505 }
6506 // FIXME: This was added to keep the costs equal when adding DstTys. Update
6507 // the code to handle length-changing shuffles.
6508 if (Kind == TTI::SK_InsertSubvector) {
6509 LT = getTypeLegalizationCost(DstTy);
6510 SrcTy = DstTy;
6511 }
6512
6513 // Check for identity masks, which we can treat as free for both fixed and
6514 // scalable vector paths.
// NOTE(review): the condition below only accepts fixed-length legal types,
// although the comment above also mentions scalable vectors.
6515 if (!Mask.empty() && LT.second.isFixedLengthVector() &&
6516 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
6517 all_of(enumerate(Mask), [](const auto &M) {
6518 return M.value() < 0 || M.value() == (int)M.index();
6519 }))
6520 return 0;
6521
6522 // Segmented shuffle matching.
// NOTE(review): the end of this condition and the definitions of `VTy` and
// of the `Segments` initializer are missing from this listing.
6523 if (Kind == TTI::SK_PermuteSingleSrc && isa<FixedVectorType>(SrcTy) &&
6524 !Mask.empty() && SrcTy->getPrimitiveSizeInBits().isNonZero() &&
6525 SrcTy->getPrimitiveSizeInBits().isKnownMultipleOf(
6527
6529 unsigned Segments =
6531 unsigned SegmentElts = VTy->getNumElements() / Segments;
6532
6533 // dupq zd.t, zn.t[idx]
6534 if ((ST->hasSVE2p1() || ST->hasSME2p1()) &&
6535 ST->isSVEorStreamingSVEAvailable() &&
6536 isDUPQMask(Mask, Segments, SegmentElts))
6537 return LT.first;
6538
6539 // mov zd.q, vn
6540 if (ST->isSVEorStreamingSVEAvailable() &&
6541 isDUPFirstSegmentMask(Mask, Segments, SegmentElts))
6542 return LT.first;
6543 }
6544
6545 // Check for broadcast loads, which are supported by the LD1R instruction.
6546 // In terms of code-size, the shuffle vector is free when a load + dup get
6547 // folded into a LD1R. That's what we check and return here. For performance
6548 // and reciprocal throughput, a LD1R is not completely free. In this case, we
6549 // return the cost for the broadcast below (i.e. 1 for most/all types), so
6550 // that we model the load + dup sequence slightly higher because LD1R is a
6551 // high latency instruction.
6552 if (CostKind == TTI::TCK_CodeSize && Kind == TTI::SK_Broadcast) {
6553 bool IsLoad = !Args.empty() && isa<LoadInst>(Args[0]);
6554 if (IsLoad && LT.second.isVector() &&
6555 isLegalBroadcastLoad(SrcTy->getElementType(),
6556 LT.second.getVectorElementCount()))
6557 return 0;
6558 }
6559
6560 // If we have 4 elements for the shuffle and a Mask, get the cost straight
6561 // from the perfect shuffle tables.
6562 if (Mask.size() == 4 &&
6563 SrcTy->getElementCount() == ElementCount::getFixed(4) &&
6564 (SrcTy->getScalarSizeInBits() == 16 ||
6565 SrcTy->getScalarSizeInBits() == 32) &&
6566 all_of(Mask, [](int E) { return E < 8; }))
6567 return getPerfectShuffleCost(Mask);
6568
6569 // Check for other shuffles that are not SK_ kinds but we have native
6570 // instructions for, for example ZIP and UZP.
6571 unsigned Unused;
6572 if (LT.second.isFixedLengthVector() &&
6573 LT.second.getVectorNumElements() == Mask.size() &&
6574 (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc ||
6575 // Discrepancies between isTRNMask and ShuffleVectorInst::isTransposeMask
6576 // mean that we can end up with shuffles that satisfy isTRNMask, but end
6577 // up labelled as TTI::SK_InsertSubvector. (e.g. {2, 0}).
6578 Kind == TTI::SK_InsertSubvector) &&
6579 (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6580 isTRNMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
6581 isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
6582 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6583 LT.second.getVectorNumElements(), 16) ||
6584 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6585 LT.second.getVectorNumElements(), 32) ||
6586 isREVMask(Mask, LT.second.getScalarSizeInBits(),
6587 LT.second.getVectorNumElements(), 64) ||
6588 // Check for non-zero lane splats
6589 all_of(drop_begin(Mask),
6590 [&Mask](int M) { return M < 0 || M == Mask[0]; })))
6591 return 1;
6592
// Table-driven costs for the remaining shuffle kinds; the table value is
// multiplied by the legalization factor LT.first.
6593 if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
6594 Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc ||
6595 Kind == TTI::SK_Reverse || Kind == TTI::SK_Splice) {
6596 static const CostTblEntry ShuffleTbl[] = {
6597 // Broadcast shuffle kinds can be performed with 'dup'.
6598 {TTI::SK_Broadcast, MVT::v8i8, 1},
6599 {TTI::SK_Broadcast, MVT::v16i8, 1},
6600 {TTI::SK_Broadcast, MVT::v4i16, 1},
6601 {TTI::SK_Broadcast, MVT::v8i16, 1},
6602 {TTI::SK_Broadcast, MVT::v2i32, 1},
6603 {TTI::SK_Broadcast, MVT::v4i32, 1},
6604 {TTI::SK_Broadcast, MVT::v2i64, 1},
6605 {TTI::SK_Broadcast, MVT::v4f16, 1},
6606 {TTI::SK_Broadcast, MVT::v8f16, 1},
6607 {TTI::SK_Broadcast, MVT::v4bf16, 1},
6608 {TTI::SK_Broadcast, MVT::v8bf16, 1},
6609 {TTI::SK_Broadcast, MVT::v2f32, 1},
6610 {TTI::SK_Broadcast, MVT::v4f32, 1},
6611 {TTI::SK_Broadcast, MVT::v2f64, 1},
6612 // Transpose shuffle kinds can be performed with 'trn1/trn2' and
6613 // 'zip1/zip2' instructions.
6614 {TTI::SK_Transpose, MVT::v8i8, 1},
6615 {TTI::SK_Transpose, MVT::v16i8, 1},
6616 {TTI::SK_Transpose, MVT::v4i16, 1},
6617 {TTI::SK_Transpose, MVT::v8i16, 1},
6618 {TTI::SK_Transpose, MVT::v2i32, 1},
6619 {TTI::SK_Transpose, MVT::v4i32, 1},
6620 {TTI::SK_Transpose, MVT::v2i64, 1},
6621 {TTI::SK_Transpose, MVT::v4f16, 1},
6622 {TTI::SK_Transpose, MVT::v8f16, 1},
6623 {TTI::SK_Transpose, MVT::v4bf16, 1},
6624 {TTI::SK_Transpose, MVT::v8bf16, 1},
6625 {TTI::SK_Transpose, MVT::v2f32, 1},
6626 {TTI::SK_Transpose, MVT::v4f32, 1},
6627 {TTI::SK_Transpose, MVT::v2f64, 1},
6628 // Select shuffle kinds.
6629 // TODO: handle vXi8/vXi16.
6630 {TTI::SK_Select, MVT::v2i32, 1}, // mov.
6631 {TTI::SK_Select, MVT::v4i32, 2}, // rev+trn (or similar).
6632 {TTI::SK_Select, MVT::v2i64, 1}, // mov.
6633 {TTI::SK_Select, MVT::v2f32, 1}, // mov.
6634 {TTI::SK_Select, MVT::v4f32, 2}, // rev+trn (or similar).
6635 {TTI::SK_Select, MVT::v2f64, 1}, // mov.
6636 // PermuteSingleSrc shuffle kinds.
6637 {TTI::SK_PermuteSingleSrc, MVT::v2i32, 1}, // mov.
6638 {TTI::SK_PermuteSingleSrc, MVT::v4i32, 3}, // perfectshuffle worst case.
6639 {TTI::SK_PermuteSingleSrc, MVT::v2i64, 1}, // mov.
6640 {TTI::SK_PermuteSingleSrc, MVT::v2f32, 1}, // mov.
6641 {TTI::SK_PermuteSingleSrc, MVT::v4f32, 3}, // perfectshuffle worst case.
6642 {TTI::SK_PermuteSingleSrc, MVT::v2f64, 1}, // mov.
6643 {TTI::SK_PermuteSingleSrc, MVT::v4i16, 3}, // perfectshuffle worst case.
6644 {TTI::SK_PermuteSingleSrc, MVT::v4f16, 3}, // perfectshuffle worst case.
6645 {TTI::SK_PermuteSingleSrc, MVT::v4bf16, 3}, // same
6646 {TTI::SK_PermuteSingleSrc, MVT::v8i16, 8}, // constpool + load + tbl
6647 {TTI::SK_PermuteSingleSrc, MVT::v8f16, 8}, // constpool + load + tbl
6648 {TTI::SK_PermuteSingleSrc, MVT::v8bf16, 8}, // constpool + load + tbl
6649 {TTI::SK_PermuteSingleSrc, MVT::v8i8, 8}, // constpool + load + tbl
6650 {TTI::SK_PermuteSingleSrc, MVT::v16i8, 8}, // constpool + load + tbl
6651 // Reverse can be lowered with `rev`.
6652 {TTI::SK_Reverse, MVT::v2i32, 1}, // REV64
6653 {TTI::SK_Reverse, MVT::v4i32, 2}, // REV64; EXT
6654 {TTI::SK_Reverse, MVT::v2i64, 1}, // EXT
6655 {TTI::SK_Reverse, MVT::v2f32, 1}, // REV64
6656 {TTI::SK_Reverse, MVT::v4f32, 2}, // REV64; EXT
6657 {TTI::SK_Reverse, MVT::v2f64, 1}, // EXT
6658 {TTI::SK_Reverse, MVT::v8f16, 2}, // REV64; EXT
6659 {TTI::SK_Reverse, MVT::v8bf16, 2}, // REV64; EXT
6660 {TTI::SK_Reverse, MVT::v8i16, 2}, // REV64; EXT
6661 {TTI::SK_Reverse, MVT::v16i8, 2}, // REV64; EXT
6662 {TTI::SK_Reverse, MVT::v4f16, 1}, // REV64
6663 {TTI::SK_Reverse, MVT::v4bf16, 1}, // REV64
6664 {TTI::SK_Reverse, MVT::v4i16, 1}, // REV64
6665 {TTI::SK_Reverse, MVT::v8i8, 1}, // REV64
6666 // Splice can all be lowered as `ext`.
6667 {TTI::SK_Splice, MVT::v2i32, 1},
6668 {TTI::SK_Splice, MVT::v4i32, 1},
6669 {TTI::SK_Splice, MVT::v2i64, 1},
6670 {TTI::SK_Splice, MVT::v2f32, 1},
6671 {TTI::SK_Splice, MVT::v4f32, 1},
6672 {TTI::SK_Splice, MVT::v2f64, 1},
6673 {TTI::SK_Splice, MVT::v8f16, 1},
6674 {TTI::SK_Splice, MVT::v8bf16, 1},
6675 {TTI::SK_Splice, MVT::v8i16, 1},
6676 {TTI::SK_Splice, MVT::v16i8, 1},
6677 {TTI::SK_Splice, MVT::v4f16, 1},
6678 {TTI::SK_Splice, MVT::v4bf16, 1},
6679 {TTI::SK_Splice, MVT::v4i16, 1},
6680 {TTI::SK_Splice, MVT::v8i8, 1},
6681 // Broadcast shuffle kinds for scalable vectors
6682 {TTI::SK_Broadcast, MVT::nxv16i8, 1},
6683 {TTI::SK_Broadcast, MVT::nxv8i16, 1},
6684 {TTI::SK_Broadcast, MVT::nxv4i32, 1},
6685 {TTI::SK_Broadcast, MVT::nxv2i64, 1},
6686 {TTI::SK_Broadcast, MVT::nxv2f16, 1},
6687 {TTI::SK_Broadcast, MVT::nxv4f16, 1},
6688 {TTI::SK_Broadcast, MVT::nxv8f16, 1},
6689 {TTI::SK_Broadcast, MVT::nxv2bf16, 1},
6690 {TTI::SK_Broadcast, MVT::nxv4bf16, 1},
6691 {TTI::SK_Broadcast, MVT::nxv8bf16, 1},
6692 {TTI::SK_Broadcast, MVT::nxv2f32, 1},
6693 {TTI::SK_Broadcast, MVT::nxv4f32, 1},
6694 {TTI::SK_Broadcast, MVT::nxv2f64, 1},
6695 {TTI::SK_Broadcast, MVT::nxv16i1, 1},
6696 {TTI::SK_Broadcast, MVT::nxv8i1, 1},
6697 {TTI::SK_Broadcast, MVT::nxv4i1, 1},
6698 {TTI::SK_Broadcast, MVT::nxv2i1, 1},
6699 // Handle the cases for vector.reverse with scalable vectors
6700 {TTI::SK_Reverse, MVT::nxv16i8, 1},
6701 {TTI::SK_Reverse, MVT::nxv8i16, 1},
6702 {TTI::SK_Reverse, MVT::nxv4i32, 1},
6703 {TTI::SK_Reverse, MVT::nxv2i64, 1},
6704 {TTI::SK_Reverse, MVT::nxv2f16, 1},
6705 {TTI::SK_Reverse, MVT::nxv4f16, 1},
6706 {TTI::SK_Reverse, MVT::nxv8f16, 1},
6707 {TTI::SK_Reverse, MVT::nxv2bf16, 1},
6708 {TTI::SK_Reverse, MVT::nxv4bf16, 1},
6709 {TTI::SK_Reverse, MVT::nxv8bf16, 1},
6710 {TTI::SK_Reverse, MVT::nxv2f32, 1},
6711 {TTI::SK_Reverse, MVT::nxv4f32, 1},
6712 {TTI::SK_Reverse, MVT::nxv2f64, 1},
6713 {TTI::SK_Reverse, MVT::nxv16i1, 1},
6714 {TTI::SK_Reverse, MVT::nxv8i1, 1},
6715 {TTI::SK_Reverse, MVT::nxv4i1, 1},
6716 {TTI::SK_Reverse, MVT::nxv2i1, 1},
6717 };
6718 if (const auto *Entry = CostTableLookup(ShuffleTbl, Kind, LT.second))
6719 return LT.first * Entry->Cost;
6720 }
6721
6722 if (Kind == TTI::SK_Splice && isa<ScalableVectorType>(SrcTy))
6723 return getSpliceCost(SrcTy, Index, CostKind);
6724
6725 // Inserting a subvector can often be done with either a D, S or H register
6726 // move, so long as the inserted vector is "aligned".
// "Aligned" here means Index is a multiple of the legal subvector length and
// that length evenly divides the legal vector length.
6727 if (Kind == TTI::SK_InsertSubvector && LT.second.isFixedLengthVector() &&
6728 LT.second.getSizeInBits() <= 128 && SubTp) {
6729 std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
6730 if (SubLT.second.isVector()) {
6731 int NumElts = LT.second.getVectorNumElements();
6732 int NumSubElts = SubLT.second.getVectorNumElements();
6733 if ((Index % NumSubElts) == 0 && (NumElts % NumSubElts) == 0)
6734 return SubLT.first;
6735 }
6736 }
6737
6738 // Restore optimal kind.
// NOTE(review): the statement guarded by this `if` is missing from this
// listing.
6739 if (IsExtractSubvector)
6741 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index, SubTp,
6742 Args, CxtI);
6743}
6744
// NOTE(review): the first declarator lines (function name and leading
// parameters) and the definition of `Ptr` are missing from this listing; the
// name is presumably containsDecreasingPointers -- confirm.
//
// Returns true as soon as any load or store in TheLoop has a pointer whose
// stride, as reported by getPtrStride, is negative; returns false otherwise.
// A stride that getPtrStride cannot determine is treated as 0.
6747 const DominatorTree &DT) {
// No symbolic strides are supplied: an empty map is passed to getPtrStride.
6748 const auto &Strides = DenseMap<Value *, const SCEV *>();
6749 for (BasicBlock *BB : TheLoop->blocks()) {
6750 // Scan the instructions in the block and look for addresses that are
6751 // consecutive and decreasing.
6752 for (Instruction &I : *BB) {
6753 if (isa<LoadInst>(&I) || isa<StoreInst>(&I)) {
6755 Type *AccessTy = getLoadStoreType(&I);
6756 if (getPtrStride(*PSE, AccessTy, Ptr, TheLoop, DT, Strides,
6757 /*Assume=*/true, /*ShouldCheckWrap=*/false)
6758 .value_or(0) < 0)
6759 return true;
6760 }
6761 }
6762 }
6763 return false;
6764}
6765
// NOTE(review): the declarator line is missing from this listing; the name is
// presumably preferFixedOverScalableIfEqualCost, taking a bool IsEpilogue --
// confirm.
//
// Decides whether fixed-width vectorization is preferred over scalable
// vectorization when both have equal cost. Epilogue loops always prefer
// fixed-width; otherwise the subtarget default is used.
// NOTE(review): the statement guarded by the getNumOccurrences() check below
// is missing from this listing (presumably it returns the value given on the
// command line).
6767 if (SVEPreferFixedOverScalableIfEqualCost.getNumOccurrences())
6769 // For cases like post-LTO vectorization, when we eventually know the trip
6770 // count, epilogue with fixed-width vectorization can be deleted if the trip
6771 // count is less than the epilogue iterations. That's why we prefer
6772 // fixed-width vectorization in epilogue in case of equal costs.
6773 if (IsEpilogue)
6774 return true;
6775 return ST->useFixedOverScalableIfEqualCost();
6776}
6777
// NOTE(review): the declarator line is missing from this listing; presumably
// this is the TTI hook of the same name as the subtarget query below.
// Forwards the subtarget's minimum VF for epilogue vectorization.
6779 return ST->getEpilogueVectorizationMinVF();
6780}
6781
// NOTE(review): the declarator line is missing from this listing; the use of
// `TFI` and the tail-folding options suggest this is the hook that decides
// whether to tail-fold (predicate) a loop instead of emitting an epilogue,
// presumably preferPredicateOverEpilogue -- confirm.
//
// Returns true only if SVE is available, the loop has no interleave groups,
// the tail-folding features the loop requires are enabled, and the loop
// contains at least SVETailFoldInsnThreshold instructions.
6783 if (!ST->hasSVE())
6784 return false;
6785
6786 // We don't currently support vectorisation with interleaving for SVE - with
6787 // such loops we're better off not using tail-folding. This gives us a chance
6788 // to fall back on fixed-width vectorisation using NEON's ld2/st2/etc.
6789 if (TFI->IAI->hasGroups())
6790 return false;
6791
// Collect the tail-folding features this loop needs.
// NOTE(review): the declaration of `Required` is missing from this listing.
6793 if (TFI->LVL->getReductionVars().size())
6794 Required |= TailFoldingOpts::Reductions;
6795 if (TFI->LVL->getFixedOrderRecurrences().size())
6796 Required |= TailFoldingOpts::Recurrences;
6797
6798 // We call this to discover whether any load/store pointers in the loop have
6799 // negative strides. This will require extra work to reverse the loop
6800 // predicate, which may be expensive.
// NOTE(review): the start of the call described above is missing from this
// listing; only its last argument line is visible.
6803 *TFI->LVL->getDominatorTree()))
6804 Required |= TailFoldingOpts::Reverse;
// A loop that needs none of the special features still needs simple
// tail-folding to be enabled.
6805 if (Required == TailFoldingOpts::Disabled)
6806 Required |= TailFoldingOpts::Simple;
6807
6808 if (!TailFoldingOptionLoc.satisfies(ST->getSVETailFoldingDefaultOpts(),
6809 Required))
6810 return false;
6811
6812 // Don't tail-fold for tight loops where we would be better off interleaving
6813 // with an unpredicated loop.
6814 unsigned NumInsns = 0;
6815 for (BasicBlock *BB : TFI->LVL->getLoop()->blocks()) {
6816 NumInsns += BB->size();
6817 }
6818
6819 // We expect 4 of these to be an IV PHI, IV add, IV compare and branch.
6820 return NumInsns >= SVETailFoldInsnThreshold;
6821}
6822
// NOTE(review): the first declarator lines, the declaration of `AM` and the
// final return statement are missing from this listing; the name is
// presumably getScalingFactorCost -- confirm.
//
// Builds an addressing mode from the arguments and, if the target considers
// it legal for Ty in AddrSpace, returns 1 when a scale other than 0 or 1 is
// used and 0 otherwise.
6825 StackOffset BaseOffset, bool HasBaseReg,
6826 int64_t Scale, unsigned AddrSpace) const {
6827 // Scaling factors are not free at all.
6828 // Operands | Rt Latency
6829 // -------------------------------------------
6830 // Rt, [Xn, Xm] | 4
6831 // -------------------------------------------
6832 // Rt, [Xn, Xm, lsl #imm] | Rn: 4 Rm: 5
6833 // Rt, [Xn, Wm, <extend> #imm] |
6835 AM.BaseGV = BaseGV;
// The fixed and scalable parts of BaseOffset are stored separately.
6836 AM.BaseOffs = BaseOffset.getFixed();
6837 AM.HasBaseReg = HasBaseReg;
6838 AM.Scale = Scale;
6839 AM.ScalableOffset = BaseOffset.getScalable();
6840 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace))
6841 // Scale represents reg2 * scale, thus account for 1 if
6842 // it is not equal to 0 or 1.
6843 return AM.Scale != 0 && AM.Scale != 1;
// NOTE(review): the return for an illegal addressing mode is missing from
// this listing.
6845}
6846
6848 const Instruction *I) const {
// Visible logic: returns true for an `or` that is immediately followed by an
// unconditional branch, and for any `add` or `sub`.
// NOTE(review): the start of the signature (listing line 6847), the statement
// that opens the brace closed on 6861 (6849) and the final statement (6862)
// are missing from this listing - confirm against upstream.
6850 // For the binary operators (e.g. or) we need to be more careful than
6851 // selects, here we only transform them if they are already at a natural
6852 // break point in the code - the end of a block with an unconditional
6853 // terminator.
6854 if (I->getOpcode() == Instruction::Or &&
6855 isa<UncondBrInst>(I->getNextNode()))
6856 return true;
6857
// Unlike `or`, add/sub are accepted without any position check.
6858 if (I->getOpcode() == Instruction::Add ||
6859 I->getOpcode() == Instruction::Sub)
6860 return true;
6861 }
6863}
6864
6867 const TargetTransformInfo::LSRCost &C2) const {
// When EnableLSRCostOpt is set, returns whether C1 orders before C2 under a
// lexicographic comparison of the fields in the order listed in std::tie
// below (NumRegs first, SetupCost last).
// NOTE(review): the start of the signature (listing lines 6865-6866) and the
// statement used when EnableLSRCostOpt is false (6879) are missing from this
// listing - confirm against upstream.
6868 // AArch64 specific here is adding the number of instructions to the
6869 // comparison (though not as the first consideration, as some targets do)
6870 // along with changing the priority of the base additions.
6871 // TODO: Maybe a more nuanced tradeoff between instruction count
6872 // and number of registers? To be investigated at a later date.
6873 if (EnableLSRCostOpt)
6874 return std::tie(C1.NumRegs, C1.Insns, C1.NumBaseAdds, C1.AddRecCost,
6875 C1.NumIVMuls, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
6876 std::tie(C2.NumRegs, C2.Insns, C2.NumBaseAdds, C2.AddRecCost,
6877 C2.NumIVMuls, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
6878
6880}
6881
6882static bool isSplatShuffle(Value *V) {
6883 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V))
6884 return all_equal(Shuf->getShuffleMask());
6885 return false;
6886}
6887
6888/// Check if both Op1 and Op2 are shufflevector extracts of either the lower
6889/// or upper half of the vector elements.
6890static bool areExtractShuffleVectors(Value *Op1, Value *Op2,
6891 bool AllowSplat = false) {
6892 // Scalable types can't be extract shuffle vectors.
6893 if (Op1->getType()->isScalableTy() || Op2->getType()->isScalableTy())
6894 return false;
6895
6896 auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
6897 auto *FullTy = FullV->getType();
6898 auto *HalfTy = HalfV->getType();
6899 return FullTy->getPrimitiveSizeInBits().getFixedValue() ==
6900 2 * HalfTy->getPrimitiveSizeInBits().getFixedValue();
6901 };
6902
6903 auto extractHalf = [](Value *FullV, Value *HalfV) {
6904 auto *FullVT = cast<FixedVectorType>(FullV->getType());
6905 auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
6906 return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
6907 };
6908
6909 ArrayRef<int> M1, M2;
6910 Value *S1Op1 = nullptr, *S2Op1 = nullptr;
6911 if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
6912 !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
6913 return false;
6914
6915 // If we allow splats, set S1Op1/S2Op1 to nullptr for the relevant arg so that
6916 // it is not checked as an extract below.
6917 if (AllowSplat && isSplatShuffle(Op1))
6918 S1Op1 = nullptr;
6919 if (AllowSplat && isSplatShuffle(Op2))
6920 S2Op1 = nullptr;
6921
6922 // Check that the operands are half as wide as the result and we extract
6923 // half of the elements of the input vectors.
6924 if ((S1Op1 && (!areTypesHalfed(S1Op1, Op1) || !extractHalf(S1Op1, Op1))) ||
6925 (S2Op1 && (!areTypesHalfed(S2Op1, Op2) || !extractHalf(S2Op1, Op2))))
6926 return false;
6927
6928 // Check the mask extracts either the lower or upper half of vector
6929 // elements.
6930 int M1Start = 0;
6931 int M2Start = 0;
6932 int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
6933 if ((S1Op1 &&
6934 !ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start)) ||
6935 (S2Op1 &&
6936 !ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start)))
6937 return false;
6938
6939 if ((M1Start != 0 && M1Start != (NumElements / 2)) ||
6940 (M2Start != 0 && M2Start != (NumElements / 2)))
6941 return false;
6942 if (S1Op1 && S2Op1 && M1Start != M2Start)
6943 return false;
6944
6945 return true;
6946}
6947
6948/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
6949/// of the vector elements.
6950static bool areExtractExts(Value *Ext1, Value *Ext2) {
6951 auto areExtDoubled = [](Instruction *Ext) {
6952 return Ext->getType()->getScalarSizeInBits() ==
6953 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
6954 };
6955
6956 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
6957 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
6958 !areExtDoubled(cast<Instruction>(Ext1)) ||
6959 !areExtDoubled(cast<Instruction>(Ext2)))
6960 return false;
6961
6962 return true;
6963}
6964
6965/// Check if Op could be used with vmull_high_p64 intrinsic.
6967 Value *VectorOperand = nullptr;
6968 ConstantInt *ElementIndex = nullptr;
6969 return match(Op, m_ExtractElt(m_Value(VectorOperand),
6970 m_ConstantInt(ElementIndex))) &&
6971 ElementIndex->getValue() == 1 &&
6972 isa<FixedVectorType>(VectorOperand->getType()) &&
6973 cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
6974}
6975
6976/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
6977static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
6979}
6980
6982 // Restrict ourselves to the form CodeGenPrepare typically constructs.
6983 auto *GEP = dyn_cast<GetElementPtrInst>(Ptrs);
6984 if (!GEP || GEP->getNumOperands() != 2)
6985 return false;
6986
6987 Value *Base = GEP->getOperand(0);
6988 Value *Offsets = GEP->getOperand(1);
6989
6990 // We only care about scalar_base+vector_offsets.
6991 if (Base->getType()->isVectorTy() || !Offsets->getType()->isVectorTy())
6992 return false;
6993
6994 // Sink extends that would allow us to use 32-bit offset vectors.
6995 if (isa<SExtInst>(Offsets) || isa<ZExtInst>(Offsets)) {
6996 auto *OffsetsInst = cast<Instruction>(Offsets);
6997 if (OffsetsInst->getType()->getScalarSizeInBits() > 32 &&
6998 OffsetsInst->getOperand(0)->getType()->getScalarSizeInBits() <= 32)
6999 Ops.push_back(&GEP->getOperandUse(1));
7000 }
7001
7002 // Sink the GEP.
7003 return true;
7004}
7005
7006/// We want to sink following cases:
7007/// (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A, vscale;
7008/// (add|sub|gep) A, ((mul|shl) zext(vscale), imm);
7010 if (match(Op, m_VScale()))
7011 return true;
7012 if (match(Op, m_Shl(m_VScale(), m_ConstantInt())) ||
7014 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7015 return true;
7016 }
7017 if (match(Op, m_Shl(m_ZExt(m_VScale()), m_ConstantInt())) ||
7019 Value *ZExtOp = cast<Instruction>(Op)->getOperand(0);
7020 Ops.push_back(&cast<Instruction>(ZExtOp)->getOperandUse(0));
7021 Ops.push_back(&cast<Instruction>(Op)->getOperandUse(0));
7022 return true;
7023 }
7024 return false;
7025}
7026
7027static bool isFNeg(Value *Op) { return match(Op, m_FNeg(m_Value())); }
7028
7029/// Check if sinking \p I's operands to I's basic block is profitable, because
7030/// the operands can be folded into a target instruction, e.g.
7031/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
// NOTE(review): the function header and the statement that introduces `II`
// (listing lines 7032-7034) are missing from this listing, as are lines 7154,
// 7156, 7230, 7240 and 7353 further down - restore them from upstream before
// building.
// Flow visible here: (1) a switch on the intrinsic ID, every path of which
// returns; (2) a switch on I's opcode for cases that do not depend on I being
// a vector; (3) after an early return for non-vector types, a second opcode
// switch for vector instructions. Uses chosen for sinking are appended to Ops.
7035 switch (II->getIntrinsicID()) {
7036 case Intrinsic::aarch64_neon_smull:
7037 case Intrinsic::aarch64_neon_umull:
7038 if (areExtractShuffleVectors(II->getOperand(0), II->getOperand(1),
7039 /*AllowSplat=*/true)) {
7040 Ops.push_back(&II->getOperandUse(0));
7041 Ops.push_back(&II->getOperandUse(1));
7042 return true;
7043 }
7044 [[fallthrough]];
7045
7046 case Intrinsic::fma:
7047 case Intrinsic::fmuladd:
// Half-element vectors are rejected when the subtarget lacks full FP16.
7048 if (isa<VectorType>(I->getType()) &&
7049 cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7050 !ST->hasFullFP16())
7051 return false;
7052
7053 if (isFNeg(II->getOperand(0)))
7054 Ops.push_back(&II->getOperandUse(0));
7055 if (isFNeg(II->getOperand(1)))
7056 Ops.push_back(&II->getOperandUse(1));
7057
7058 [[fallthrough]];
7059 case Intrinsic::aarch64_neon_sqdmull:
7060 case Intrinsic::aarch64_neon_sqdmulh:
7061 case Intrinsic::aarch64_neon_sqrdmulh:
7062 // Sink splats for index lane variants
7063 if (isSplatShuffle(II->getOperand(0)))
7064 Ops.push_back(&II->getOperandUse(0));
7065 if (isSplatShuffle(II->getOperand(1)))
7066 Ops.push_back(&II->getOperandUse(1));
7067 return !Ops.empty();
7068 case Intrinsic::aarch64_neon_fmlal:
7069 case Intrinsic::aarch64_neon_fmlal2:
7070 case Intrinsic::aarch64_neon_fmlsl:
7071 case Intrinsic::aarch64_neon_fmlsl2:
7072 // Sink splats for index lane variants
7073 if (isSplatShuffle(II->getOperand(1)))
7074 Ops.push_back(&II->getOperandUse(1));
7075 if (isSplatShuffle(II->getOperand(2)))
7076 Ops.push_back(&II->getOperandUse(2));
7077 return !Ops.empty();
7078 case Intrinsic::aarch64_sve_ptest_first:
7079 case Intrinsic::aarch64_sve_ptest_last:
// Only a ptrue intrinsic feeding operand 0 is sunk.
7080 if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
7081 if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
7082 Ops.push_back(&II->getOperandUse(0));
7083 return !Ops.empty();
7084 case Intrinsic::aarch64_sme_write_horiz:
7085 case Intrinsic::aarch64_sme_write_vert:
7086 case Intrinsic::aarch64_sme_writeq_horiz:
7087 case Intrinsic::aarch64_sme_writeq_vert: {
// Operand 1 is sunk only when it is an `add` instruction.
7088 auto *Idx = dyn_cast<Instruction>(II->getOperand(1));
7089 if (!Idx || Idx->getOpcode() != Instruction::Add)
7090 return false;
7091 Ops.push_back(&II->getOperandUse(1));
7092 return true;
7093 }
7094 case Intrinsic::aarch64_sme_read_horiz:
7095 case Intrinsic::aarch64_sme_read_vert:
7096 case Intrinsic::aarch64_sme_readq_horiz:
7097 case Intrinsic::aarch64_sme_readq_vert:
7098 case Intrinsic::aarch64_sme_ld1b_vert:
7099 case Intrinsic::aarch64_sme_ld1h_vert:
7100 case Intrinsic::aarch64_sme_ld1w_vert:
7101 case Intrinsic::aarch64_sme_ld1d_vert:
7102 case Intrinsic::aarch64_sme_ld1q_vert:
7103 case Intrinsic::aarch64_sme_st1b_vert:
7104 case Intrinsic::aarch64_sme_st1h_vert:
7105 case Intrinsic::aarch64_sme_st1w_vert:
7106 case Intrinsic::aarch64_sme_st1d_vert:
7107 case Intrinsic::aarch64_sme_st1q_vert:
7108 case Intrinsic::aarch64_sme_ld1b_horiz:
7109 case Intrinsic::aarch64_sme_ld1h_horiz:
7110 case Intrinsic::aarch64_sme_ld1w_horiz:
7111 case Intrinsic::aarch64_sme_ld1d_horiz:
7112 case Intrinsic::aarch64_sme_ld1q_horiz:
7113 case Intrinsic::aarch64_sme_st1b_horiz:
7114 case Intrinsic::aarch64_sme_st1h_horiz:
7115 case Intrinsic::aarch64_sme_st1w_horiz:
7116 case Intrinsic::aarch64_sme_st1d_horiz:
7117 case Intrinsic::aarch64_sme_st1q_horiz: {
// Same rule as the write case above, applied to operand 3.
7118 auto *Idx = dyn_cast<Instruction>(II->getOperand(3));
7119 if (!Idx || Idx->getOpcode() != Instruction::Add)
7120 return false;
7121 Ops.push_back(&II->getOperandUse(3));
7122 return true;
7123 }
7124 case Intrinsic::aarch64_neon_pmull:
7125 if (!areExtractShuffleVectors(II->getOperand(0), II->getOperand(1)))
7126 return false;
7127 Ops.push_back(&II->getOperandUse(0));
7128 Ops.push_back(&II->getOperandUse(1));
7129 return true;
7130 case Intrinsic::aarch64_neon_pmull64:
7131 if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
7132 II->getArgOperand(1)))
7133 return false;
7134 Ops.push_back(&II->getArgOperandUse(0));
7135 Ops.push_back(&II->getArgOperandUse(1));
7136 return true;
7137 case Intrinsic::masked_gather:
7138 if (!shouldSinkVectorOfPtrs(II->getArgOperand(0), Ops))
7139 return false;
7140 Ops.push_back(&II->getArgOperandUse(0));
7141 return true;
7142 case Intrinsic::masked_scatter:
7143 if (!shouldSinkVectorOfPtrs(II->getArgOperand(1), Ops))
7144 return false;
7145 Ops.push_back(&II->getArgOperandUse(1));
7146 return true;
7147 default:
7148 return false;
7149 }
7150 }
7151
// Helper for select/branch conditions: accepts a vector_reduce_or over a
// scalable vector and also records the reduction's operand when it is a
// compare.
// NOTE(review): listing lines 7154 and 7156, which guard on and define the
// lambda-local `II`, are missing here.
7152 auto ShouldSinkCondition = [](Value *Cond,
7153 SmallVectorImpl<Use *> &Ops) -> bool {
7155 return false;
7157 if (II->getIntrinsicID() != Intrinsic::vector_reduce_or ||
7158 !isa<ScalableVectorType>(II->getOperand(0)->getType()))
7159 return false;
7160 if (isa<CmpInst>(II->getOperand(0)))
7161 Ops.push_back(&II->getOperandUse(0));
7162 return true;
7163 };
7164
7165 switch (I->getOpcode()) {
7166 case Instruction::GetElementPtr:
7167 case Instruction::Add:
7168 case Instruction::Sub:
7169 // Sink vscales closer to uses for better isel
// Returns on the first operand that qualifies; later operands are not tried.
7170 for (unsigned Op = 0; Op < I->getNumOperands(); ++Op) {
7171 if (shouldSinkVScale(I->getOperand(Op), Ops)) {
7172 Ops.push_back(&I->getOperandUse(Op));
7173 return true;
7174 }
7175 }
7176 break;
7177 case Instruction::Select: {
7178 if (!ShouldSinkCondition(I->getOperand(0), Ops))
7179 return false;
7180
7181 Ops.push_back(&I->getOperandUse(0));
7182 return true;
7183 }
7184 case Instruction::UncondBr:
7185 return false;
7186 case Instruction::CondBr: {
7187 if (!ShouldSinkCondition(cast<CondBrInst>(I)->getCondition(), Ops))
7188 return false;
7189
7190 Ops.push_back(&I->getOperandUse(0));
7191 return true;
7192 }
7193 case Instruction::FMul:
7194 // fmul with contract flag can be combined with fadd into fma.
7195 // Sinking fneg into this block enables fmls pattern.
7196 if (cast<FPMathOperator>(I)->hasAllowContract()) {
7197 if (isFNeg(I->getOperand(0)))
7198 Ops.push_back(&I->getOperandUse(0));
7199 if (isFNeg(I->getOperand(1)))
7200 Ops.push_back(&I->getOperandUse(1));
7201 }
7202 break;
7203
7204 // Type | BIC | ORN | EON
7205 // ----------------+-----------+-----------+-----------
7206 // scalar | Base | Base | Base
7207 // scalar w/shift | - | - | -
7208 // fixed vector | NEON/Base | NEON/Base | BSL2N/Base
7209 // scalable vector | SVE | - | BSL2N
7210 case Instruction::Xor:
7211 // EON only for scalars (possibly expanded fixed vectors)
7212 // and vectors using the SVE2/SME BSL2N instruction.
7213 if (I->getType()->isVectorTy() && ST->isNeonAvailable()) {
7214 bool HasBSL2N =
7215 ST->isSVEorStreamingSVEAvailable() && (ST->hasSVE2() || ST->hasSME());
7216 if (!HasBSL2N)
7217 break;
7218 }
7219 [[fallthrough]];
7220 case Instruction::And:
7221 case Instruction::Or:
7222 // Even though we could use the SVE2/SME BSL2N instruction,
7223 // it might pessimize with an extra MOV depending on register allocation.
7224 if (I->getOpcode() == Instruction::Or &&
7225 isa<ScalableVectorType>(I->getType()))
7226 break;
7227 // Shift can be folded into scalar AND/ORR/EOR,
7228 // but not the non-negated operand of BIC/ORN/EON.
// NOTE(review): the second half of this condition (listing line 7230) is
// missing from this listing.
7229 if (!(I->getType()->isVectorTy() && ST->hasNEON()) &&
7231 break;
7232 for (auto &Op : I->operands()) {
7233 // (and/or/xor X, (not Y)) -> (bic/orn/eon X, Y)
7234 if (match(Op.get(), m_Not(m_Value()))) {
7235 Ops.push_back(&Op);
7236 return true;
7237 }
7238 // (and/or/xor X, (splat (not Y))) -> (bic/orn/eon X, (splat Y))
// NOTE(review): the start of the pattern (listing line 7240) is missing.
7239 if (match(Op.get(),
7241 m_Value(), m_ZeroMask()))) {
// Recorded innermost first: the not, then the insertelement, then the splat.
7242 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
7243 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
7244 Ops.push_back(&Not);
7245 Ops.push_back(&InsertElt);
7246 Ops.push_back(&Op);
7247 return true;
7248 }
7249 }
7250 break;
7251 default:
7252 break;
7253 }
7254
// Everything below applies to vector-typed instructions only; for other types
// the answer is whether the switch above recorded anything.
7255 if (!I->getType()->isVectorTy())
7256 return !Ops.empty();
7257
7258 switch (I->getOpcode()) {
7259 case Instruction::Sub:
7260 case Instruction::Add: {
7261 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
7262 return false;
7263
7264 // If the exts' operands extract either the lower or upper elements, we
7265 // can sink them too.
7266 auto Ext1 = cast<Instruction>(I->getOperand(0));
7267 auto Ext2 = cast<Instruction>(I->getOperand(1));
7268 if (areExtractShuffleVectors(Ext1->getOperand(0), Ext2->getOperand(0))) {
7269 Ops.push_back(&Ext1->getOperandUse(0));
7270 Ops.push_back(&Ext2->getOperandUse(0));
7271 }
7272
7273 Ops.push_back(&I->getOperandUse(0));
7274 Ops.push_back(&I->getOperandUse(1));
7275
7276 return true;
7277 }
7278 case Instruction::Or: {
7279 // Pattern: Or(And(MaskValue, A), And(Not(MaskValue), B)) ->
7280 // bitselect(MaskValue, A, B) where Not(MaskValue) = Xor(MaskValue, -1)
7281 if (ST->hasNEON()) {
7282 Instruction *OtherAnd, *IA, *IB;
7283 Value *MaskValue;
7284 // MainAnd refers to And instruction that has 'Not' as one of its operands
7285 if (match(I, m_c_Or(m_OneUse(m_Instruction(OtherAnd)),
7286 m_OneUse(m_c_And(m_OneUse(m_Not(m_Value(MaskValue))),
7287 m_Instruction(IA)))))) {
7288 if (match(OtherAnd,
7289 m_c_And(m_Specific(MaskValue), m_Instruction(IB)))) {
// MainAnd is whichever operand of the Or is not OtherAnd.
7290 Instruction *MainAnd = I->getOperand(0) == OtherAnd
7291 ? cast<Instruction>(I->getOperand(1))
7292 : cast<Instruction>(I->getOperand(0));
7293
7294 // Both Ands should be in same basic block as Or
7295 if (I->getParent() != MainAnd->getParent() ||
7296 I->getParent() != OtherAnd->getParent())
7297 return false;
7298
7299 // Non-mask operands of both Ands should also be in same basic block
7300 if (I->getParent() != IA->getParent() ||
7301 I->getParent() != IB->getParent())
7302 return false;
7303
// Record MainAnd's operand that is not IA, then both operands of the Or.
7304 Ops.push_back(
7305 &MainAnd->getOperandUse(MainAnd->getOperand(0) == IA ? 1 : 0));
7306 Ops.push_back(&I->getOperandUse(0));
7307 Ops.push_back(&I->getOperandUse(1));
7308
7309 return true;
7310 }
7311 }
7312 }
7313
7314 return false;
7315 }
7316 case Instruction::Mul: {
7317 auto ShouldSinkSplatForIndexedVariant = [](Value *V) {
7318 auto *Ty = cast<VectorType>(V->getType());
7319 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7320 if (Ty->isScalableTy())
7321 return false;
7322
7323 // Indexed variants of Mul exist for i16 and i32 element types only.
7324 return Ty->getScalarSizeInBits() == 16 || Ty->getScalarSizeInBits() == 32;
7325 };
7326
// Count how many operands behave as zero- or sign-extended values while
// collecting the uses that would have to be sunk with them.
7327 int NumZExts = 0, NumSExts = 0;
7328 for (auto &Op : I->operands()) {
7329 // Make sure we are not already sinking this operand
7330 if (any_of(Ops, [&](Use *U) { return U->get() == Op; }))
7331 continue;
7332
7333 if (match(&Op, m_ZExtOrSExt(m_Value()))) {
7334 auto *Ext = cast<Instruction>(Op);
7335 auto *ExtOp = Ext->getOperand(0);
7336 if (isSplatShuffle(ExtOp) && ShouldSinkSplatForIndexedVariant(ExtOp))
7337 Ops.push_back(&Ext->getOperandUse(0));
7338 Ops.push_back(&Op);
7339
7340 if (isa<SExtInst>(Ext)) {
7341 NumSExts++;
7342 } else {
7343 NumZExts++;
7344 // A zext(a) is also a sext(zext(a)), if we take more than 2 steps.
7345 if (Ext->getOperand(0)->getType()->getScalarSizeInBits() * 2 <
7346 I->getType()->getScalarSizeInBits())
7347 NumSExts++;
7348 }
7349
7350 continue;
7351 }
7352
// NOTE(review): the definition of `Shuffle` (listing line 7353) is missing.
7354 if (!Shuffle)
7355 continue;
7356
7357 // If the Shuffle is a splat and the operand is a zext/sext, sinking the
7358 // operand and the s/zext can help create indexed s/umull. This is
7359 // especially useful to prevent i64 mul being scalarized.
7360 if (isSplatShuffle(Shuffle) &&
7361 match(Shuffle->getOperand(0), m_ZExtOrSExt(m_Value()))) {
7362 Ops.push_back(&Shuffle->getOperandUse(0));
7363 Ops.push_back(&Op);
7364 if (match(Shuffle->getOperand(0), m_SExt(m_Value())))
7365 NumSExts++;
7366 else
7367 NumZExts++;
7368 continue;
7369 }
7370
7371 Value *ShuffleOperand = Shuffle->getOperand(0);
7372 InsertElementInst *Insert = dyn_cast<InsertElementInst>(ShuffleOperand);
7373 if (!Insert)
7374 continue;
7375
7376 Instruction *OperandInstr = dyn_cast<Instruction>(Insert->getOperand(1));
7377 if (!OperandInstr)
7378 continue;
7379
7380 ConstantInt *ElementConstant =
7381 dyn_cast<ConstantInt>(Insert->getOperand(2));
7382 // Check that the insertelement is inserting into element 0
7383 if (!ElementConstant || !ElementConstant->isZero())
7384 continue;
7385
7386 unsigned Opcode = OperandInstr->getOpcode();
7387 if (Opcode == Instruction::SExt)
7388 NumSExts++;
7389 else if (Opcode == Instruction::ZExt)
7390 NumZExts++;
7391 else {
7392 // If we find that the top bits are known 0, then we can sink and allow
7393 // the backend to generate a umull.
7394 unsigned Bitwidth = I->getType()->getScalarSizeInBits();
7395 APInt UpperMask = APInt::getHighBitsSet(Bitwidth, Bitwidth / 2);
7396 if (!MaskedValueIsZero(OperandInstr, UpperMask, DL))
7397 continue;
7398 NumZExts++;
7399 }
7400
7401 // And(Load) is excluded to prevent CGP getting stuck in a loop of sinking
7402 // the And, just to hoist it again back to the load.
7403 if (!match(OperandInstr, m_And(m_Load(m_Value()), m_Value())))
7404 Ops.push_back(&Insert->getOperandUse(1));
7405 Ops.push_back(&Shuffle->getOperandUse(0));
7406 Ops.push_back(&Op);
7407 }
7408
7409 // It is profitable to sink if we found two of the same type of extends.
7410 if (!Ops.empty() && (NumSExts == 2 || NumZExts == 2))
7411 return true;
7412
7413 // Otherwise, see if we should sink splats for indexed variants.
7414 if (!ShouldSinkSplatForIndexedVariant(I))
7415 return false;
7416
// Discard anything collected above and record only splat operands.
7417 Ops.clear();
7418 if (isSplatShuffle(I->getOperand(0)))
7419 Ops.push_back(&I->getOperandUse(0));
7420 if (isSplatShuffle(I->getOperand(1)))
7421 Ops.push_back(&I->getOperandUse(1));
7422
7423 return !Ops.empty();
7424 }
7425 case Instruction::FMul: {
7426 // For SVE the lane-indexing is within 128-bits, so we can't fold splats.
7427 if (I->getType()->isScalableTy())
7428 return !Ops.empty();
7429
7430 if (cast<VectorType>(I->getType())->getElementType()->isHalfTy() &&
7431 !ST->hasFullFP16())
7432 return !Ops.empty();
7433
7434 // Sink splats for index lane variants
7435 if (isSplatShuffle(I->getOperand(0)))
7436 Ops.push_back(&I->getOperandUse(0));
7437 if (isSplatShuffle(I->getOperand(1)))
7438 Ops.push_back(&I->getOperandUse(1));
7439 return !Ops.empty();
7440 }
7441 default:
7442 return false;
7443 }
7444 return false;
7445}
static bool isAllActivePredicate(const SelectionDAG &DAG, SDValue N)
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static std::optional< Instruction * > instCombinePTrue(InstCombiner &IC, IntrinsicInst &II)
TailFoldingOption TailFoldingOptionLoc
static std::optional< Instruction * > instCombineSVEVectorFAdd(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFuseMulAddSub(InstCombiner &IC, IntrinsicInst &II, bool MergeIntoAddendOp)
static void getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP)
bool SimplifyValuePattern(SmallVector< Value * > &Vec, bool AllowPoison)
static std::optional< Instruction * > instCombineSVESel(InstCombiner &IC, IntrinsicInst &II)
static bool hasPossibleIncompatibleOps(const Function *F, const AArch64TargetLowering &TLI)
Returns true if the function has explicit operations that can only be lowered using incompatible inst...
static bool shouldSinkVScale(Value *Op, SmallVectorImpl< Use * > &Ops)
We want to sink following cases: (add|sub|gep) A, ((mul|shl) vscale, imm); (add|sub|gep) A,...
static InstructionCost getHistogramCost(const AArch64Subtarget *ST, const IntrinsicCostAttributes &ICA)
static std::optional< Instruction * > tryCombineFromSVBoolBinOp(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEUnpack(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVETailFoldInsnThreshold("sve-tail-folding-insn-threshold", cl::init(15), cl::Hidden)
static cl::opt< bool > EnableFixedwidthAutovecInStreamingMode("enable-fixedwidth-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static void getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE, TargetTransformInfo::UnrollingPreferences &UP, const AArch64TTIImpl &TTI)
For Apple CPUs, we want to runtime-unroll loops to make better use of the OOO engine's wide instructi...
static std::optional< Instruction * > instCombineWhilelo(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFAddU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPairwiseAddLong(InstCombiner &IC, IntrinsicInst &II)
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
static cl::opt< bool > EnableLSRCostOpt("enable-aarch64-lsr-cost-opt", cl::init(true), cl::Hidden)
static bool shouldSinkVectorOfPtrs(Value *Ptrs, SmallVectorImpl< Use * > &Ops)
static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE, const AArch64TTIImpl &TTI)
static std::optional< Instruction * > simplifySVEIntrinsicBinOp(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static std::optional< Instruction * > instCombineSVEVectorSub(InstCombiner &IC, IntrinsicInst &II)
static bool isLoopSizeWithinBudget(Loop *L, const AArch64TTIImpl &TTI, InstructionCost Budget, unsigned *FinalSize)
static std::optional< Instruction * > instCombineLD1GatherIndex(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSub(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > processPhiNode(InstCombiner &IC, IntrinsicInst &II)
The function will remove redundant reinterprets casting in the presence of the control flow.
static std::optional< Instruction * > instCombineSVEInsr(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSMECntsd(InstCombiner &IC, IntrinsicInst &II, const AArch64Subtarget *ST)
static void extractAttrFeatures(const Function &F, const AArch64TTIImpl *TTI, SmallVectorImpl< StringRef > &Features)
static std::optional< Instruction * > instCombineST1ScatterIndex(InstCombiner &IC, IntrinsicInst &II)
static bool isSMEABIRoutineCall(const CallInst &CI, const AArch64TargetLowering &TLI)
static std::optional< Instruction * > instCombineSVESDIV(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEST1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static Value * stripInactiveLanes(Value *V, const Value *Pg)
static cl::opt< bool > SVEPreferFixedOverScalableIfEqualCost("sve-prefer-fixed-over-scalable-if-equal", cl::Hidden)
static bool isUnpackedVectorVT(EVT VecVT)
static std::optional< Instruction * > instCombineSVEDupX(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVECmpNE(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineDMB(InstCombiner &IC, IntrinsicInst &II)
static SVEIntrinsicInfo constructSVEIntrinsicInfo(IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorFSubU(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineRDFFR(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineMaxMinNM(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > SVEGatherOverhead("sve-gather-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVECondLast(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEPTest(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEZip(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< int > Aarch64ForceUnrollThreshold("aarch64-force-unroll-threshold", cl::init(0), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in AArch64"))
static std::optional< Instruction * > instCombineSVEDup(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction"))
static std::optional< Instruction * > instCombineConvertFromSVBool(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > CallPenaltyChangeSM("call-penalty-sm-change", cl::init(5), cl::Hidden, cl::desc("Penalty of calling a function that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVEUzp1(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVEVectorBinOp(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableScalableAutovecInStreamingMode("enable-scalable-autovec-in-streaming-mode", cl::init(false), cl::Hidden)
static std::optional< Instruction * > instCombineSVETBL(InstCombiner &IC, IntrinsicInst &II)
static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2)
Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
static bool isFNeg(Value *Op)
static Instruction::BinaryOps intrinsicIDToBinOpCode(unsigned Intrinsic)
static bool containsDecreasingPointers(Loop *TheLoop, PredicatedScalarEvolution *PSE, const DominatorTree &DT)
static bool isSplatShuffle(Value *V)
static cl::opt< unsigned > InlineCallPenaltyChangeSM("inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM"))
static std::optional< Instruction * > instCombineSVELD1(InstCombiner &IC, IntrinsicInst &II, const DataLayout &DL)
static std::optional< Instruction * > instCombineSVESrshl(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > DMBLookaheadThreshold("dmb-lookahead-threshold", cl::init(10), cl::Hidden, cl::desc("The number of instructions to search for a redundant dmb"))
static std::optional< Instruction * > simplifySVEIntrinsic(InstCombiner &IC, IntrinsicInst &II, const SVEIntrinsicInfo &IInfo)
static unsigned getSVEGatherScatterOverhead(unsigned Opcode, const AArch64Subtarget *ST)
static std::optional< Instruction * > instCombineSVEVectorMlaU(InstCombiner &IC, IntrinsicInst &II)
static bool isOperandOfVmullHighP64(Value *Op)
Check if Op could be used with vmull_high_p64 intrinsic.
static std::optional< Instruction * > instCombineInStreamingMode(InstCombiner &IC, IntrinsicInst &II)
static std::optional< Instruction * > instCombineSVELast(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< unsigned > NeonNonConstStrideOverhead("neon-nonconst-stride-overhead", cl::init(10), cl::Hidden)
static cl::opt< bool > EnableFalkorHWPFUnrollFix("enable-falkor-hwpf-unroll-fix", cl::init(true), cl::Hidden)
static std::optional< Instruction * > instCombineSVECntElts(InstCombiner &IC, IntrinsicInst &II, unsigned NumElts)
static std::optional< Instruction * > instCombineSVEUxt(InstCombiner &IC, IntrinsicInst &II, unsigned NumBits)
static cl::opt< TailFoldingOption, true, cl::parser< std::string > > SVETailFolding("sve-tail-folding", cl::desc("Control the use of vectorisation using tail-folding for SVE where the" " option is specified in the form (Initial)[+(Flag1|Flag2|...)]:" "\ndisabled (Initial) No loop types will vectorize using " "tail-folding" "\ndefault (Initial) Uses the default tail-folding settings for " "the target CPU" "\nall (Initial) All legal loop types will vectorize using " "tail-folding" "\nsimple (Initial) Use tail-folding for simple loops (not " "reductions or recurrences)" "\nreductions Use tail-folding for loops containing reductions" "\nnoreductions Inverse of above" "\nrecurrences Use tail-folding for loops containing fixed order " "recurrences" "\nnorecurrences Inverse of above" "\nreverse Use tail-folding for loops requiring reversed " "predicates" "\nnoreverse Inverse of above"), cl::location(TailFoldingOptionLoc))
static bool areExtractShuffleVectors(Value *Op1, Value *Op2, bool AllowSplat=false)
Check if both Op1 and Op2 are shufflevector extracts of either the lower or upper half of the vector ...
static std::optional< Instruction * > instCombineSVEVectorAdd(InstCombiner &IC, IntrinsicInst &II)
static cl::opt< bool > EnableOrLikeSelectOpt("enable-aarch64-or-like-select", cl::init(true), cl::Hidden)
static cl::opt< unsigned > SVEScatterOverhead("sve-scatter-overhead", cl::init(10), cl::Hidden)
static std::optional< Instruction * > instCombineSVEDupqLane(InstCombiner &IC, IntrinsicInst &II)
This file defines a TargetTransformInfoImplBase conforming object specific to the AArch64 target machine.
AMDGPU Register Bank Select
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
This file provides a helper that implements much of the TTI interface in terms of the target-independ...
static Error reportError(StringRef Message)
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
This file defines the DenseMap class.
@ Default
static Value * getCondition(Instruction *I)
Hexagon Common GEP
const HexagonInstrInfo * TII
#define _
This file provides the interface for the instcombine pass implementation.
static constexpr Value * getValue(Ty &ValueOrUse)
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static LVOptions Options
Definition LVOptions.cpp:25
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
#define P(N)
const SmallVectorImpl< MachineOperand > & Cond
static uint64_t getBits(uint64_t Val, int Start, int End)
#define LLVM_DEBUG(...)
Definition Debug.h:119
static unsigned getScalarSizeInBits(Type *Ty)
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
This pass exposes codegen information to IR-level passes.
static unsigned getBitWidth(Type *Ty, const DataLayout &DL)
Returns the bitwidth of the given scalar or pointer type.
Value * RHS
Value * LHS
BinaryOperator * Mul
unsigned getVectorInsertExtractBaseCost() const
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getCostOfKeepingLiveOverCall(ArrayRef< Type * > Tys) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
bool isLegalBroadcastLoad(Type *ElementTy, ElementCount NumElements) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
bool isExtPartOfAvgExpr(const Instruction *ExtUser, Type *Dst, Type *Src) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getIntImmCost(int64_t Val) const
Calculate the cost of materializing a 64-bit value.
std::optional< InstructionCost > getFP16BF16PromoteCost(Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info, bool IncludeTrunc, bool CanUseSVE, std::function< InstructionCost(Type *)> InstCost) const
FP16 and BF16 operations are lowered to fptrunc(op(fpext, fpext)) if the architecture features are not...
bool prefersVectorizedAddressing() const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
bool isElementTypeLegalForScalableVector(Type *Ty) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const override
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
APInt getPriorityMask(const Function &F) const override
bool shouldMaximizeVectorBandwidth(TargetTransformInfo::RegisterKind K) const override
bool isLSRCostLess(const TargetTransformInfo::LSRCost &C1, const TargetTransformInfo::LSRCost &C2) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool useNeonVector(const Type *Ty) const
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedExpandLoad(Type *DataTy, Align Alignment) const override
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) const override
InstructionCost getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy, unsigned Index, TTI::TargetCostKind CostKind) const override
unsigned getInlineCallPenalty(const Function *F, const CallBase &Call, unsigned DefaultCallPenalty) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
unsigned getMaxNumElements(ElementCount VF) const
Try to return an estimate cost factor that can be used as a multiplier when scalarizing an operation ...
bool shouldTreatInstructionLikeSelect(const Instruction *I) const override
bool isMultiversionedFunction(const Function &F) const override
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const override
bool isLegalToVectorizeReduction(const RecurrenceDescriptor &RdxDesc, ElementCount VF) const override
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override
InstructionCost getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool isLegalMaskedGatherScatter(Type *DataType) const
InstructionCost getBranchMispredictPenalty() const override
bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const override
See if I should be considered for address type promotion.
APInt getFeatureMask(const Function &F) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const override
bool enableScalableVectorization() const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
Value * getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst, Type *ExpectedType, bool CanCreate=true) const override
bool hasKnownLowerThroughputFromSchedulingModel(unsigned Opcode1, unsigned Opcode2) const
Check whether Opcode1 has less throughput according to the scheduling model than Opcode2.
unsigned getEpilogueVectorizationMinVF() const override
InstructionCost getSpliceCost(VectorType *Tp, int Index, TTI::TargetCostKind CostKind) const
InstructionCost getArithmeticReductionCostSVE(unsigned Opcode, VectorType *ValTy, TTI::TargetCostKind CostKind) const
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
Return the cost of the scaling factor used in the addressing mode represented by AM for this target,...
bool preferFixedOverScalableIfEqualCost(bool IsEpilogue) const override
unsigned getMaxInterleaveFactor(ElementCount VF, bool HasUnorderedReductions) const override
Class for arbitrary precision integers.
Definition APInt.h:78
bool isNegatedPowerOf2() const
Check if this APInt's negated value is a power of two greater than zero.
Definition APInt.h:450
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
unsigned countLeadingOnes() const
Definition APInt.h:1647
void negate()
Negate this APInt in place.
Definition APInt.h:1491
LLVM_ABI APInt sextOrTrunc(unsigned width) const
Sign extend or truncate to width.
Definition APInt.cpp:1084
unsigned logBase2() const
Definition APInt.h:1784
APInt ashr(unsigned ShiftAmt) const
Arithmetic right-shift function.
Definition APInt.h:834
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
LLVM Basic Block Representation.
Definition BasicBlock.h:62
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
bool areInlineCompatible(const Function *Caller, const Function *Callee) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getIndexedVectorInstrCostFromEnd(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getPartialReductionCost(unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType, ElementCount VF, TTI::PartialReductionExtendKind OpAExtend, TTI::PartialReductionExtendKind OpBExtend, std::optional< unsigned > BinOp, TTI::TargetCostKind CostKind, std::optional< FastMathFlags > FMF) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
bool isTypeLegal(Type *Ty) const override
static BinaryOperator * CreateWithCopiedFlags(BinaryOps Opc, Value *V1, Value *V2, Value *CopyO, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Definition InstrTypes.h:254
Base class for all callable instructions (InvokeInst and CallInst) Holds everything related to callin...
Function * getCalledFunction() const
Returns the function called, or null if this is an indirect function invocation or the function signa...
Value * getArgOperand(unsigned i) const
unsigned arg_size() const
This class represents a function call, abstracting a target machine's calling convention.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ FCMP_OEQ
0 0 0 1 True if ordered and equal
Definition InstrTypes.h:743
@ ICMP_SLT
signed less than
Definition InstrTypes.h:769
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ FCMP_OLT
0 1 0 0 True if ordered and less than
Definition InstrTypes.h:746
@ FCMP_OGT
0 0 1 0 True if ordered and greater than
Definition InstrTypes.h:744
@ FCMP_OGE
0 0 1 1 True if ordered and greater than or equal
Definition InstrTypes.h:745
@ ICMP_UGT
unsigned greater than
Definition InstrTypes.h:763
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_ONE
0 1 1 0 True if ordered and operands are unequal
Definition InstrTypes.h:748
@ FCMP_UEQ
1 0 0 1 True if unordered or equal
Definition InstrTypes.h:751
@ FCMP_OLE
0 1 0 1 True if ordered and less than or equal
Definition InstrTypes.h:747
@ FCMP_ORD
0 1 1 1 True if ordered (no nans)
Definition InstrTypes.h:749
@ ICMP_SGE
signed greater or equal
Definition InstrTypes.h:768
@ FCMP_UNE
1 1 1 0 True if unordered or not equal
Definition InstrTypes.h:756
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
static bool isIntPredicate(Predicate P)
Definition InstrTypes.h:839
bool isUnsigned() const
Definition InstrTypes.h:999
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
static LLVM_ABI ConstantAggregateZero * get(Type *Ty)
This is the shared class of boolean and integer constants.
Definition Constants.h:87
static LLVM_ABI ConstantInt * getTrue(LLVMContext &Context)
bool isZero() const
This is just a convenience method to make client code smaller for a common case.
Definition Constants.h:219
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
static LLVM_ABI ConstantInt * getBool(LLVMContext &Context, bool V)
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
This is an important base class in LLVM.
Definition Constant.h:43
LLVM_ABI Constant * getSplatValue(bool AllowPoison=false) const
If all elements of the vector constant have the same value, return that value.
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
TypeSize getTypeSizeInBits(Type *Ty) const
Size examples:
Definition DataLayout.h:791
bool empty() const
Definition DenseMap.h:173
bool contains(const_arg_type_t< KeyT > Val) const
Return true if the specified key is in the map, false otherwise.
Definition DenseMap.h:216
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
static constexpr ElementCount getScalable(ScalarTy MinVal)
Definition TypeSize.h:312
static constexpr ElementCount getFixed(ScalarTy MinVal)
Definition TypeSize.h:309
constexpr bool isScalar() const
Exactly one element.
Definition TypeSize.h:320
static ExtractElementInst * Create(Value *Vec, Value *Idx, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
This provides a helper for copying FMF from an instruction or setting specified flags.
Definition IRBuilder.h:93
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noSignedZeros() const
Definition FMF.h:67
bool noInfs() const
Definition FMF.h:66
bool approxFunc() const
Definition FMF.h:70
bool allowContract() const
Definition FMF.h:69
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2617
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2605
IntegerType * getIntNTy(unsigned N)
Fetch the type representing an N-bit integer.
Definition IRBuilder.h:547
Type * getDoubleTy()
Fetch the type representing a 64-bit floating point value.
Definition IRBuilder.h:567
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains V broadcast to NumElts elements.
LLVM_ABI CallInst * CreateMaskedLoad(Type *Ty, Value *Ptr, Align Alignment, Value *Mask, Value *PassThru=nullptr, const Twine &Name="")
Create a call to Masked Load intrinsic.
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
IntegerType * getInt32Ty()
Fetch the type representing a 32-bit integer.
Definition IRBuilder.h:534
Type * getHalfTy()
Fetch the type representing a 16-bit floating point value.
Definition IRBuilder.h:552
Value * CreateGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="", GEPNoWrapFlags NW=GEPNoWrapFlags::none())
Definition IRBuilder.h:2000
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:482
Value * CreateBitOrPointerCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2314
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2529
Value * CreateBinOpFMF(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1737
Value * CreateSub(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1439
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2232
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1906
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2639
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1919
LLVM_ABI CallInst * CreateMaskedStore(Value *Val, Value *Ptr, Align Alignment, Value *Mask)
Create a call to Masked Store intrinsic.
Value * CreateAdd(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1422
Type * getFloatTy()
Fetch the type representing a 32-bit floating point value.
Definition IRBuilder.h:562
Value * CreateIntCast(Value *V, Type *DestTy, bool isSigned, const Twine &Name="")
Definition IRBuilder.h:2305
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:181
Value * CreateInsertVector(Type *DstType, Value *SrcVec, Value *SubVec, Value *Idx, const Twine &Name="")
Create a call to the vector.insert intrinsic.
Definition IRBuilder.h:1126
LLVM_ABI Value * CreateElementCount(Type *Ty, ElementCount EC)
Create an expression which evaluates to the number of elements in EC at runtime.
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2848
This instruction inserts a single (scalar) element into a VectorType value.
The core instruction combiner logic.
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
static InstructionCost getInvalid(CostType Val=0)
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
LLVM_ABI bool isCommutative() const LLVM_READONLY
Return true if the instruction is commutative:
bool isBinaryOp() const
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
Class to represent integer types.
bool hasGroups() const
Returns true if we have any interleave groups.
const SmallVectorImpl< Type * > & getArgTypes() const
const SmallVectorImpl< const Value * > & getArgs() const
const IntrinsicInst * getInst() const
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
Value * getPointerOperand()
iterator_range< block_iterator > blocks() const
RecurrenceSet & getFixedOrderRecurrences()
Return the fixed-order recurrences found in the loop.
PredicatedScalarEvolution * getPredicatedScalarEvolution() const
const ReductionList & getReductionVars() const
Returns the reduction variables found in the loop.
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Machine Value Type.
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isVector() const
Return true if this is a vector value type.
static MVT getScalableVectorVT(MVT VT, unsigned NumElements)
bool isFixedLengthVector() const
MVT getVectorElementType() const
size_type size() const
Definition MapVector.h:58
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
The RecurrenceDescriptor is used to identify recurrence variables in a loop.
Type * getRecurrenceType() const
Returns the type of the recurrence.
RecurKind getRecurrenceKind() const
This node represents a polynomial recurrence on the trip count of the specified loop.
bool isAffine() const
Return true if this represents an expression A + B*x where A and B are loop invariant values.
This class represents an analyzed expression in the program.
SMEAttrs is a utility class to parse the SME ACLE attributes on functions.
bool hasNonStreamingInterfaceAndBody() const
bool hasStreamingCompatibleInterface() const
bool hasStreamingInterfaceOrBody() const
bool isSMEABIRoutine() const
bool hasStreamingBody() const
void set(unsigned M, bool Enable=true)
SMECallAttrs is a utility class to hold the SMEAttrs for a callsite.
bool requiresPreservingZT0() const
bool requiresPreservingAllZAState() const
static LLVM_ABI ScalableVectorType * get(Type *ElementType, unsigned MinNumElts)
Definition Type.cpp:889
static ScalableVectorType * getDoubleElementsVectorType(ScalableVectorType *VTy)
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI unsigned getSmallConstantTripMultiple(const Loop *L, const SCEV *ExitCount)
Returns the largest constant divisor of the trip count as a normal unsigned value,...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
LLVM_ABI unsigned getSmallConstantMaxTripCount(const Loop *L, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
Returns the upper bound of the loop trip count as a normal unsigned value.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
const SCEV * getSymbolicMaxBackedgeTakenCount(const Loop *L)
When successful, this returns a SCEV that is greater than or equal to (i.e.
This instruction constructs a fixed permutation of two input vectors.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isExtractSubvectorMask(ArrayRef< int > Mask, int NumSrcElts, int &Index)
Return true if this shuffle mask is an extract subvector mask.
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or fewer elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
An instruction for storing to memory.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
std::pair< StringRef, StringRef > split(char Separator) const
Split into two substrings around the first occurrence of a separator character.
Definition StringRef.h:736
Class to represent struct types.
TargetInstrInfo - Interface to description of machine instruction set.
std::pair< LegalizeTypeAction, EVT > LegalizeKind
LegalizeKind holds the legalization kind that needs to happen to EVT in order to type-legalize it.
const RTLIB::RuntimeLibcallsInfo & getRuntimeLibcallsInfo() const
virtual const DataLayout & getDataLayout() const
virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) const
virtual bool isLoweredToCall(const Function *F) const
virtual bool isLSRCostLess(const TTI::LSRCost &C1, const TTI::LSRCost &C2) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
virtual bool areTypesABICompatible(const Function *Caller, const Function *Callee, ArrayRef< Type * > Types) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
@ Load
The value being inserted comes from a load (InsertElement only).
static LLVM_ABI OperandValueInfo getOperandInfo(const Value *V)
Collect properties of V used in cost analysis, e.g. OP_PowerOf2.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
PopcntSupportKind
Flags indicating the kind of support for population count.
@ TCC_Free
Expected to fold away in lowering.
@ TCC_Basic
The cost of a typical 'add' instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_InsertSubvector
InsertSubvector. Index indicates start offset.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Transpose
Transpose two vectors.
@ SK_Splice
Concatenates elements from the first input vector with elements of the second input vector.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector. Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
static constexpr TypeSize getScalable(ScalarTy MinimumSize)
Definition TypeSize.h:346
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
bool isBFloatTy() const
Return true if this is 'bfloat', a 16-bit bfloat type.
Definition Type.h:147
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVM_ABI Type * getWithNewType(Type *EltTy) const
Given vector type, change the element type, whilst keeping the old number of elements.
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isDoubleTy() const
Return true if this is 'double', a 64-bit IEEE fp type.
Definition Type.h:158
static LLVM_ABI IntegerType * getInt1Ty(LLVMContext &C)
Definition Type.cpp:306
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
static LLVM_ABI Type * getFloatTy(LLVMContext &C)
Definition Type.cpp:286
static LLVM_ABI UndefValue * get(Type *T)
Static factory methods - Return an 'undef' object of the specified type.
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:993
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:400
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
static VectorType * getInteger(VectorType *VTy)
This static method gets a VectorType with the same number of elements as the input type,...
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
constexpr LeafTy divideCoefficientBy(ScalarTy RHS) const
We do not provide the '/' operator here because division for polynomial types does not work in the sa...
Definition TypeSize.h:252
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
static bool isLogicalImmediate(uint64_t imm, unsigned regSize)
isLogicalImmediate - Return true if the immediate is valid for a logical immediate instruction of the...
void expandMOVImm(uint64_t Imm, unsigned BitSize, SmallVectorImpl< ImmInsnModel > &Insn)
Expand a MOVi32imm or MOVi64imm pseudo instruction to one or more real move-immediate instructions to...
LLVM_ABI APInt getCpuSupportsMask(ArrayRef< StringRef > Features)
static constexpr unsigned SVEBitsPerBlock
LLVM_ABI APInt getFMVPriority(ArrayRef< StringRef > Features)
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SrcTy, SpecificConstantMatch, TargetOpcode::G_XOR, true > m_Not(const SrcTy &&Src)
Matches a register not-ed by a G_XOR.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
BinaryOp_match< LHS, RHS, Instruction::FMul > m_FMul(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
cst_pred_ty< is_nonnegative > m_NonNegative()
Match an integer or vector of non-negative values.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
IntrinsicID_match m_VScale()
Matches a call to llvm.vscale().
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches an Add with LHS and RHS in either order.
AnyBinaryOp_match< LHS, RHS, true > m_c_BinOp(const LHS &L, const RHS &R)
Matches a BinaryOperator with LHS and RHS in either order.
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinOpPred_match< LHS, RHS, is_shift_op > m_Shift(const LHS &L, const RHS &R)
Matches shift operations.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
brc_match< Cond_t, match_bind< BasicBlock >, match_bind< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
auto m_Undef()
Match an arbitrary undef constant.
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
LLVM_ABI Libcall getPOW(EVT RetVT)
getPOW - Return the POW_* value for the given types, or UNKNOWN_LIBCALL if there is none.
initializer< Ty > init(const Ty &Val)
LocationClass< Ty > location(Ty &L)
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
std::optional< unsigned > isDUPQMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPQMask - matches a splat of equivalent lanes within segments of a given number of elements.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
bool isZIPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for zip1 or zip2 masks of the form: <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0,...
TailFoldingOpts
An enum to describe what types of loops we should attempt to tail-fold: Disabled: None Reductions: Lo...
InstructionCost Cost
constexpr bool isInt(int64_t x)
Checks if an integer fits into the given bit width.
Definition MathExtras.h:165
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
bool isDUPFirstSegmentMask(ArrayRef< int > Mask, unsigned Segments, unsigned SegmentSize)
isDUPFirstSegmentMask - matches a splat of the first 128b segment.
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Uninitialized
Definition Threading.h:60
LLVM_ABI std::optional< const MDOperand * > findStringMetadataForLoop(const Loop *TheLoop, StringRef Name)
Find string metadata for loop.
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
unsigned M1(unsigned Val)
Definition VE.h:377
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned getPerfectShuffleCost(llvm::ArrayRef< int > M)
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool none_of(R &&Range, UnaryPredicate P)
Provide wrappers to std::none_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1753
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
bool isUZPMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut)
Return true for uzp1 or uzp2 masks of the form: <0, 2, 4, 6, 8, 10, 12, 14> or <1,...
bool isREVMask(ArrayRef< int > M, unsigned EltSize, unsigned NumElts, unsigned BlockSize)
isREVMask - Check if a vector shuffle corresponds to a REV instruction with the specified blocksize.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
TargetTransformInfo TTI
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
@ UMin
Unsigned integer min implemented in terms of select(cmp()).
@ Or
Bitwise or logical OR of integers.
@ FSub
Subtraction of floats.
@ FAddChainWithSubs
A chain of fadds and fsubs.
@ AnyOf
AnyOf reduction with select(cmp(),x,y) where one of (x,y) is loop invariant, and both x and y are int...
@ Xor
Bitwise or logical XOR of integers.
@ FindLast
FindLast reduction with select(cmp(),x,y) where x and y are an integer type, one is the current recurrence value, and the other is an arbitrary value.
@ FMax
FP max implemented in terms of select(cmp()).
@ FMulAdd
Sum of float products with llvm.fmuladd(a * b + sum).
@ FMul
Product of floats.
@ SMax
Signed integer max implemented in terms of select(cmp()).
@ And
Bitwise or logical AND of integers.
@ SMin
Signed integer min implemented in terms of select(cmp()).
@ FMin
FP min implemented in terms of select(cmp()).
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ AddChainWithSubs
A chain of adds and subs.
@ FAdd
Sum of floats.
@ UMax
Unsigned integer max implemented in terms of select(cmp()).
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
unsigned getNumElementsFromSVEPredPattern(unsigned Pattern)
Return the number of active elements for VL1 to VL256 predicate pattern, zero for all other patterns.
auto predecessors(const MachineBasicBlock *BB)
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
Type * toVectorTy(Type *Scalar, ElementCount EC)
A helper function for converting Scalar types to vector types.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool ShouldCheckWrap=true, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
constexpr uint64_t NextPowerOf2(uint64_t A)
Returns the next power of two (in 64-bits) that is strictly greater than A.
Definition MathExtras.h:373
bool isTRNMask(ArrayRef< int > M, unsigned NumElts, unsigned &WhichResultOut, unsigned &OperandOrderOut)
Return true for trn1 or trn2 masks of the form: <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0,...
#define N
static SVEIntrinsicInfo defaultMergingUnaryNarrowingTopOp()
static SVEIntrinsicInfo defaultZeroingOp()
SVEIntrinsicInfo & setOperandIdxInactiveLanesTakenFrom(unsigned Index)
static SVEIntrinsicInfo defaultMergingOp(Intrinsic::ID IID=Intrinsic::not_intrinsic)
SVEIntrinsicInfo & setOperandIdxWithNoActiveLanes(unsigned Index)
unsigned getOperandIdxWithNoActiveLanes() const
SVEIntrinsicInfo & setInactiveLanesAreUnused()
SVEIntrinsicInfo & setInactiveLanesAreNotDefined()
SVEIntrinsicInfo & setGoverningPredicateOperandIdx(unsigned Index)
static SVEIntrinsicInfo defaultUndefOp()
Intrinsic::ID getMatchingUndefIntrinsic() const
SVEIntrinsicInfo & setResultIsZeroInitialized()
static SVEIntrinsicInfo defaultMergingUnaryOp()
SVEIntrinsicInfo & setMatchingUndefIntrinsic(Intrinsic::ID IID)
unsigned getGoverningPredicateOperandIdx() const
SVEIntrinsicInfo & setMatchingIROpcode(unsigned Opcode)
unsigned getOperandIdxInactiveLanesTakenFrom() const
static SVEIntrinsicInfo defaultVoidOp(unsigned GPIndex)
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:382
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool isFixedLengthVector() const
Definition ValueTypes.h:199
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
bool isScalableVector() const
Return true if this is a vector type where the runtime length is machine dependent.
Definition ValueTypes.h:187
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
Summarize the scheduling resources required for an instruction of a particular scheduling class.
Definition MCSchedule.h:129
bool isVariant() const
Definition MCSchedule.h:150
Machine model for scheduling, bundling, and heuristics.
Definition MCSchedule.h:264
static LLVM_ABI double getReciprocalThroughput(const MCSubtargetInfo &STI, const MCSchedClassDesc &SCDesc)
Matching combinators.
Information about a load/store intrinsic defined by the target.
InterleavedAccessInfo * IAI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
unsigned Insns
TODO: Some of these could be merged.
Returns options for expansion of memcmp. IsZeroCmp is true if this is the expansion of memcmp(p1, p2, s) == 0.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
bool RuntimeUnrollMultiExit
Allow runtime unrolling multi-exit loops.
unsigned SCEVExpansionBudget
Don't allow runtime unrolling if expanding the trip count takes more than SCEVExpansionBudget.
bool AddAdditionalAccumulators
Allow unrolling to add parallel reduction phis.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
unsigned PartialThreshold
The cost threshold for the unrolled loop, like Threshold, but used for partial/runtime unrolling (set...
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...