LLVM 23.0.0git
ARMTargetTransformInfo.cpp
Go to the documentation of this file.
1//===- ARMTargetTransformInfo.cpp - ARM specific TTI ----------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
10#include "ARMSubtarget.h"
12#include "llvm/ADT/APInt.h"
19#include "llvm/IR/BasicBlock.h"
20#include "llvm/IR/DataLayout.h"
22#include "llvm/IR/Instruction.h"
25#include "llvm/IR/Intrinsics.h"
26#include "llvm/IR/IntrinsicsARM.h"
28#include "llvm/IR/Type.h"
37#include <algorithm>
38#include <cassert>
39#include <cstdint>
40#include <optional>
41#include <utility>
42
43using namespace llvm;
44
45#define DEBUG_TYPE "armtti"
46
48 "enable-arm-maskedldst", cl::Hidden, cl::init(true),
49 cl::desc("Enable the generation of masked loads and stores"));
50
52 "disable-arm-loloops", cl::Hidden, cl::init(false),
53 cl::desc("Disable the generation of low-overhead loops"));
54
// Hidden boolean command-line option "allow-arm-wlsloops"; it is initialised
// to true, i.e. the generation of WLS loops is enabled unless overridden.
55static cl::opt<bool>
56 AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true),
57 cl::desc("Enable the generation of WLS loops"));
58
60 "widen-global-strings", cl::Hidden, cl::init(true),
61 cl::desc("Enable the widening of global strings to alignment boundaries"));
62
64
66
68
70 "arm-force-unroll-threshold", cl::init(12), cl::Hidden,
72 "Threshold for forced unrolling of small loops in Arm architecture"));
73
74/// Convert a vector load intrinsic into a simple llvm load instruction.
75/// This is beneficial when the underlying object being addressed comes
76/// from a constant, since we get constant-folding for free.
77static Value *simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign,
78 InstCombiner::BuilderTy &Builder) {
79 auto *IntrAlign = dyn_cast<ConstantInt>(II.getArgOperand(1));
80
81 if (!IntrAlign)
82 return nullptr;
83
84 unsigned Alignment = IntrAlign->getLimitedValue() < MemAlign
85 ? MemAlign
86 : IntrAlign->getLimitedValue();
87
88 if (!isPowerOf2_32(Alignment))
89 return nullptr;
90
91 return Builder.CreateAlignedLoad(II.getType(), II.getArgOperand(0),
92 Align(Alignment));
93}
94
97 ScalarEvolution *SE) const {
98 if (ST->hasMVEIntegerOps())
100
101 if (L->getHeader()->getParent()->hasOptSize())
102 return TTI::AMK_None;
103
104 if (ST->isMClass() && ST->isThumb2() &&
105 L->getNumBlocks() == 1)
106 return TTI::AMK_PreIndexed;
107
108 return TTI::AMK_None;
109}
110
111std::optional<Instruction *>
113 using namespace PatternMatch;
114 Intrinsic::ID IID = II.getIntrinsicID();
115 switch (IID) {
116 default:
117 break;
118 case Intrinsic::arm_neon_vld1: {
119 Align MemAlign =
120 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
122 if (Value *V = simplifyNeonVld1(II, MemAlign.value(), IC.Builder)) {
123 return IC.replaceInstUsesWith(II, V);
124 }
125 break;
126 }
127
128 case Intrinsic::arm_neon_vld2:
129 case Intrinsic::arm_neon_vld3:
130 case Intrinsic::arm_neon_vld4:
131 case Intrinsic::arm_neon_vld2lane:
132 case Intrinsic::arm_neon_vld3lane:
133 case Intrinsic::arm_neon_vld4lane:
134 case Intrinsic::arm_neon_vst1:
135 case Intrinsic::arm_neon_vst2:
136 case Intrinsic::arm_neon_vst3:
137 case Intrinsic::arm_neon_vst4:
138 case Intrinsic::arm_neon_vst2lane:
139 case Intrinsic::arm_neon_vst3lane:
140 case Intrinsic::arm_neon_vst4lane: {
141 Align MemAlign =
142 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
144 unsigned AlignArg = II.arg_size() - 1;
145 Value *AlignArgOp = II.getArgOperand(AlignArg);
146 MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
147 if (Align && *Align < MemAlign) {
148 return IC.replaceOperand(
149 II, AlignArg,
150 ConstantInt::get(Type::getInt32Ty(II.getContext()), MemAlign.value(),
151 false));
152 }
153 break;
154 }
155
156 case Intrinsic::arm_neon_vld1x2:
157 case Intrinsic::arm_neon_vld1x3:
158 case Intrinsic::arm_neon_vld1x4:
159 case Intrinsic::arm_neon_vst1x2:
160 case Intrinsic::arm_neon_vst1x3:
161 case Intrinsic::arm_neon_vst1x4: {
162 Align NewAlign =
163 getKnownAlignment(II.getArgOperand(0), IC.getDataLayout(), &II,
165 Align OldAlign = II.getParamAlign(0).valueOrOne();
166 if (NewAlign > OldAlign)
167 II.addParamAttr(0,
168 Attribute::getWithAlignment(II.getContext(), NewAlign));
169 break;
170 }
171
172 case Intrinsic::arm_mve_pred_i2v: {
173 Value *Arg = II.getArgOperand(0);
174 Value *ArgArg;
176 PatternMatch::m_Value(ArgArg))) &&
177 II.getType() == ArgArg->getType()) {
178 return IC.replaceInstUsesWith(II, ArgArg);
179 }
180 Constant *XorMask;
182 PatternMatch::m_Value(ArgArg)),
183 PatternMatch::m_Constant(XorMask))) &&
184 II.getType() == ArgArg->getType()) {
185 if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
186 if (CI->getValue().trunc(16).isAllOnes()) {
187 auto TrueVector = IC.Builder.CreateVectorSplat(
188 cast<FixedVectorType>(II.getType())->getNumElements(),
189 IC.Builder.getTrue());
190 return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
191 }
192 }
193 }
194 KnownBits ScalarKnown(32);
195 if (IC.SimplifyDemandedBits(&II, 0, APInt::getLowBitsSet(32, 16),
196 ScalarKnown)) {
197 return &II;
198 }
199 break;
200 }
201 case Intrinsic::arm_mve_pred_v2i: {
202 Value *Arg = II.getArgOperand(0);
203 Value *ArgArg;
205 PatternMatch::m_Value(ArgArg)))) {
206 return IC.replaceInstUsesWith(II, ArgArg);
207 }
208
209 if (II.getMetadata(LLVMContext::MD_range))
210 break;
211
212 ConstantRange Range(APInt(32, 0), APInt(32, 0x10000));
213
214 if (auto CurrentRange = II.getRange()) {
215 Range = Range.intersectWith(*CurrentRange);
216 if (Range == CurrentRange)
217 break;
218 }
219
220 II.addRangeRetAttr(Range);
221 II.addRetAttr(Attribute::NoUndef);
222 return &II;
223 }
224 case Intrinsic::arm_mve_vadc:
225 case Intrinsic::arm_mve_vadc_predicated: {
226 unsigned CarryOp =
227 (II.getIntrinsicID() == Intrinsic::arm_mve_vadc_predicated) ? 3 : 2;
228 assert(II.getArgOperand(CarryOp)->getType()->getScalarSizeInBits() == 32 &&
229 "Bad type for intrinsic!");
230
231 KnownBits CarryKnown(32);
232 if (IC.SimplifyDemandedBits(&II, CarryOp, APInt::getOneBitSet(32, 29),
233 CarryKnown)) {
234 return &II;
235 }
236 break;
237 }
238 case Intrinsic::arm_mve_vmldava: {
240 if (I->hasOneUse()) {
241 auto *User = cast<Instruction>(*I->user_begin());
242 Value *OpZ;
243 if (match(User, m_c_Add(m_Specific(I), m_Value(OpZ))) &&
244 match(I->getOperand(3), m_Zero())) {
245 Value *OpX = I->getOperand(4);
246 Value *OpY = I->getOperand(5);
247 Type *OpTy = OpX->getType();
248
250 Value *V =
251 IC.Builder.CreateIntrinsic(Intrinsic::arm_mve_vmldava, {OpTy},
252 {I->getOperand(0), I->getOperand(1),
253 I->getOperand(2), OpZ, OpX, OpY});
254
256 return IC.eraseInstFromFunction(*User);
257 }
258 }
259 return std::nullopt;
260 }
261 }
262 return std::nullopt;
263}
264
266 InstCombiner &IC, IntrinsicInst &II, APInt OrigDemandedElts,
267 APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3,
268 std::function<void(Instruction *, unsigned, APInt, APInt &)>
269 SimplifyAndSetOp) const {
270
271 // Compute the demanded bits for a narrowing MVE intrinsic. The TopOpc is the
272 // opcode specifying a Top/Bottom instruction, which can change between
273 // instructions.
274 auto SimplifyNarrowInstrTopBottom =[&](unsigned TopOpc) {
275 unsigned NumElts = cast<FixedVectorType>(II.getType())->getNumElements();
276 unsigned IsTop = cast<ConstantInt>(II.getOperand(TopOpc))->getZExtValue();
277
278 // Only the odd or even lanes of operand 0 will be demanded, depending
279 // on whether this is a top/bottom instruction.
280 APInt DemandedElts =
281 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
282 : APInt::getHighBitsSet(2, 1));
283 SimplifyAndSetOp(&II, 0, OrigDemandedElts & DemandedElts, UndefElts);
284 // The other lanes will be defined from the inserted elements.
285 UndefElts &= APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
286 : APInt::getHighBitsSet(2, 1));
287 return std::nullopt;
288 };
289
290 switch (II.getIntrinsicID()) {
291 default:
292 break;
293 case Intrinsic::arm_mve_vcvt_narrow:
294 SimplifyNarrowInstrTopBottom(2);
295 break;
296 case Intrinsic::arm_mve_vqmovn:
297 SimplifyNarrowInstrTopBottom(4);
298 break;
299 case Intrinsic::arm_mve_vshrn:
300 SimplifyNarrowInstrTopBottom(7);
301 break;
302 }
303
304 return std::nullopt;
305}
306
309 assert(Ty->isIntegerTy());
310
311 unsigned Bits = Ty->getPrimitiveSizeInBits();
312 if (Bits == 0 || Imm.getActiveBits() >= 64)
313 return 4;
314
315 int64_t SImmVal = Imm.getSExtValue();
316 uint64_t ZImmVal = Imm.getZExtValue();
317 if (!ST->isThumb()) {
318 if ((SImmVal >= 0 && SImmVal < 65536) ||
319 (ARM_AM::getSOImmVal(ZImmVal) != -1) ||
320 (ARM_AM::getSOImmVal(~ZImmVal) != -1))
321 return 1;
322 return ST->hasV6T2Ops() ? 2 : 3;
323 }
324 if (ST->isThumb2()) {
325 if ((SImmVal >= 0 && SImmVal < 65536) ||
326 (ARM_AM::getT2SOImmVal(ZImmVal) != -1) ||
327 (ARM_AM::getT2SOImmVal(~ZImmVal) != -1))
328 return 1;
329 return ST->hasV6T2Ops() ? 2 : 3;
330 }
331 // Thumb1, any i8 imm cost 1.
332 if (Bits == 8 || (SImmVal >= 0 && SImmVal < 256))
333 return 1;
334 if ((~SImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal))
335 return 2;
336 // Load from constantpool.
337 return 3;
338}
339
340// Constants smaller than 256 fit in the immediate field of
341// Thumb1 instructions so we return a zero cost and 1 otherwise.
343 const APInt &Imm,
344 Type *Ty) const {
345 if (Imm.isNonNegative() && Imm.getLimitedValue() < 256)
346 return 0;
347
348 return 1;
349}
350
351// Checks whether Inst is part of a min(max()) or max(min()) pattern
352// that will match to an SSAT instruction. Returns the instruction being
353// saturated, or null if no saturation pattern was found.
354static Value *isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm) {
355 Value *LHS, *RHS;
356 ConstantInt *C;
358
359 if (InstSPF == SPF_SMAX &&
361 C->getValue() == Imm && Imm.isNegative() && Imm.isNegatedPowerOf2()) {
362
363 auto isSSatMin = [&](Value *MinInst) {
364 if (isa<SelectInst>(MinInst)) {
365 Value *MinLHS, *MinRHS;
366 ConstantInt *MinC;
367 SelectPatternFlavor MinSPF =
368 matchSelectPattern(MinInst, MinLHS, MinRHS).Flavor;
369 if (MinSPF == SPF_SMIN &&
371 MinC->getValue() == ((-Imm) - 1))
372 return true;
373 }
374 return false;
375 };
376
377 if (isSSatMin(Inst->getOperand(1)))
378 return cast<Instruction>(Inst->getOperand(1))->getOperand(1);
379 if (Inst->hasNUses(2) &&
380 (isSSatMin(*Inst->user_begin()) || isSSatMin(*(++Inst->user_begin()))))
381 return Inst->getOperand(1);
382 }
383 return nullptr;
384}
385
386// Look for a FP Saturation pattern, where the instruction can be simplified to
387// a fptosi.sat. max(min(fptosi)). The constant in this case is always free.
388static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm) {
389 if (Imm.getBitWidth() != 64 ||
390 Imm != APInt::getHighBitsSet(64, 33)) // -2147483648
391 return false;
392 Value *FP = isSSATMinMaxPattern(Inst, Imm);
393 if (!FP && isa<ICmpInst>(Inst) && Inst->hasOneUse())
395 if (!FP)
396 return false;
397 return isa<FPToSIInst>(FP);
398}
399
400InstructionCost ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
401 const APInt &Imm, Type *Ty,
403 Instruction *Inst) const {
404 // Division by a constant can be turned into multiplication, but only if we
405 // know it's constant. So it's not so much that the immediate is cheap (it's
406 // not), but that the alternative is worse.
407 // FIXME: this is probably unneeded with GlobalISel.
408 if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
409 Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
410 Idx == 1)
411 return 0;
412
413 // Leave any gep offsets for the CodeGenPrepare, which will do a better job at
414 // splitting any large offsets.
415 if (Opcode == Instruction::GetElementPtr && Idx != 0)
416 return 0;
417
418 if (Opcode == Instruction::And) {
419 // UXTB/UXTH
420 if (Imm == 255 || Imm == 65535)
421 return 0;
422 // Conversion to BIC is free, and means we can use ~Imm instead.
423 return std::min(getIntImmCost(Imm, Ty, CostKind),
424 getIntImmCost(~Imm, Ty, CostKind));
425 }
426
427 if (Opcode == Instruction::Add)
428 // Conversion to SUB is free, and means we can use -Imm instead.
429 return std::min(getIntImmCost(Imm, Ty, CostKind),
430 getIntImmCost(-Imm, Ty, CostKind));
431
432 if (Opcode == Instruction::ICmp && Imm.isNegative() &&
433 Ty->getIntegerBitWidth() == 32) {
434 int64_t NegImm = -Imm.getSExtValue();
435 if (ST->isThumb2() && NegImm < 1<<12)
436 // icmp X, #-C -> cmn X, #C
437 return 0;
438 if (ST->isThumb() && NegImm < 1<<8)
439 // icmp X, #-C -> adds X, #C
440 return 0;
441 }
442
443 // xor a, -1 can always be folded to MVN
444 if (Opcode == Instruction::Xor && Imm.isAllOnes())
445 return 0;
446
447 // Ensure that negative constants of min(max()) or max(min()) patterns that
448 // match to SSAT instructions don't get hoisted.
449 if (Inst && ((ST->hasV6Ops() && !ST->isThumb()) || ST->isThumb2()) &&
450 Ty->getIntegerBitWidth() <= 32) {
451 if (isSSATMinMaxPattern(Inst, Imm) ||
452 (isa<ICmpInst>(Inst) && Inst->hasOneUse() &&
454 return 0;
455 }
456
457 if (Inst && ST->hasVFP2Base() && isFPSatMinMaxPattern(Inst, Imm))
458 return 0;
459
460 // We can convert <= -1 to < 0, which is generally quite cheap.
461 if (Inst && Opcode == Instruction::ICmp && Idx == 1 && Imm.isAllOnes()) {
462 ICmpInst::Predicate Pred = cast<ICmpInst>(Inst)->getPredicate();
463 if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE)
464 return std::min(getIntImmCost(Imm, Ty, CostKind),
465 getIntImmCost(Imm + 1, Ty, CostKind));
466 }
467
468 return getIntImmCost(Imm, Ty, CostKind);
469}
470
473 const Instruction *I) const {
475 (ST->hasNEON() || ST->hasMVEIntegerOps())) {
476 // FIXME: The vectorizer is highly sensitive to the cost of these
477 // instructions, which suggests that it may be using the costs incorrectly.
478 // But, for now, just make them free to avoid performance regressions for
479 // vector targets.
480 return 0;
481 }
482 return BaseT::getCFInstrCost(Opcode, CostKind, I);
483}
484
486 Type *Src,
489 const Instruction *I) const {
490 int ISD = TLI->InstructionOpcodeToISD(Opcode);
491 assert(ISD && "Invalid opcode");
492
493 // TODO: Allow non-throughput costs that aren't binary.
494 auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
496 return Cost == 0 ? 0 : 1;
497 return Cost;
498 };
499 auto IsLegalFPType = [this](EVT VT) {
500 EVT EltVT = VT.getScalarType();
501 return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
502 (EltVT == MVT::f64 && ST->hasFP64()) ||
503 (EltVT == MVT::f16 && ST->hasFullFP16());
504 };
505
506 EVT SrcTy = TLI->getValueType(DL, Src);
507 EVT DstTy = TLI->getValueType(DL, Dst);
508
509 if (!SrcTy.isSimple() || !DstTy.isSimple())
510 return AdjustCost(
511 BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
512
513 // Extending masked load/Truncating masked stores is expensive because we
514 // currently don't split them. This means that we'll likely end up
515 // loading/storing each element individually (hence the high cost).
516 if ((ST->hasMVEIntegerOps() &&
517 (Opcode == Instruction::Trunc || Opcode == Instruction::ZExt ||
518 Opcode == Instruction::SExt)) ||
519 (ST->hasMVEFloatOps() &&
520 (Opcode == Instruction::FPExt || Opcode == Instruction::FPTrunc) &&
521 IsLegalFPType(SrcTy) && IsLegalFPType(DstTy)))
522 if (CCH == TTI::CastContextHint::Masked && DstTy.getSizeInBits() > 128)
523 return 2 * DstTy.getVectorNumElements() *
524 ST->getMVEVectorCostFactor(CostKind);
525
526 // The extend of other kinds of load is free
527 if (CCH == TTI::CastContextHint::Normal ||
529 static const TypeConversionCostTblEntry LoadConversionTbl[] = {
530 {ISD::SIGN_EXTEND, MVT::i32, MVT::i16, 0},
531 {ISD::ZERO_EXTEND, MVT::i32, MVT::i16, 0},
532 {ISD::SIGN_EXTEND, MVT::i32, MVT::i8, 0},
533 {ISD::ZERO_EXTEND, MVT::i32, MVT::i8, 0},
534 {ISD::SIGN_EXTEND, MVT::i16, MVT::i8, 0},
535 {ISD::ZERO_EXTEND, MVT::i16, MVT::i8, 0},
536 {ISD::SIGN_EXTEND, MVT::i64, MVT::i32, 1},
537 {ISD::ZERO_EXTEND, MVT::i64, MVT::i32, 1},
538 {ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 1},
539 {ISD::ZERO_EXTEND, MVT::i64, MVT::i16, 1},
540 {ISD::SIGN_EXTEND, MVT::i64, MVT::i8, 1},
541 {ISD::ZERO_EXTEND, MVT::i64, MVT::i8, 1},
542 };
543 if (const auto *Entry = ConvertCostTableLookup(
544 LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
545 return AdjustCost(Entry->Cost);
546
547 static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
548 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
549 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0},
550 {ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 0},
551 {ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
552 {ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
553 {ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
554 // The following extend from a legal type to an illegal type, so need to
555 // split the load. This introduces an extra load operation, but the
556 // extend is still "free".
557 {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
558 {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
559 {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
560 {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
561 {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
562 {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
563 };
564 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
565 if (const auto *Entry =
566 ConvertCostTableLookup(MVELoadConversionTbl, ISD,
567 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
568 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
569 }
570
571 static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
572 // FPExtends are similar but also require the VCVT instructions.
573 {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
574 {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
575 };
576 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
577 if (const auto *Entry =
578 ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
579 DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
580 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
581 }
582
583 // The truncate of a store is free. This is the mirror of extends above.
584 static const TypeConversionCostTblEntry MVEStoreConversionTbl[] = {
585 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
586 {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
587 {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
588 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
589 {ISD::TRUNCATE, MVT::v8i32, MVT::v8i8, 1},
590 {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
591 {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
592 };
593 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
594 if (const auto *Entry =
595 ConvertCostTableLookup(MVEStoreConversionTbl, ISD,
596 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
597 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
598 }
599
600 static const TypeConversionCostTblEntry MVEFStoreConversionTbl[] = {
601 {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
602 {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
603 };
604 if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
605 if (const auto *Entry =
606 ConvertCostTableLookup(MVEFStoreConversionTbl, ISD,
607 SrcTy.getSimpleVT(), DstTy.getSimpleVT()))
608 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
609 }
610 }
611
612 // NEON vector operations that can extend their inputs.
613 if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
614 I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
615 static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
616 // vaddl
617 { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
618 { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
619 // vsubl
620 { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
621 { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
622 // vmull
623 { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
624 { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
625 // vshll
626 { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
627 { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
628 };
629
630 auto *User = cast<Instruction>(*I->user_begin());
631 int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
632 if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
633 DstTy.getSimpleVT(),
634 SrcTy.getSimpleVT())) {
635 return AdjustCost(Entry->Cost);
636 }
637 }
638
639 // Single to/from double precision conversions.
640 if (Src->isVectorTy() && ST->hasNEON() &&
641 ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
642 DstTy.getScalarType() == MVT::f32) ||
643 (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
644 DstTy.getScalarType() == MVT::f64))) {
645 static const CostTblEntry NEONFltDblTbl[] = {
646 // Vector fptrunc/fpext conversions.
647 {ISD::FP_ROUND, MVT::v2f64, 2},
648 {ISD::FP_EXTEND, MVT::v2f32, 2},
649 {ISD::FP_EXTEND, MVT::v4f32, 4}};
650
651 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
652 if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
653 return AdjustCost(LT.first * Entry->Cost);
654 }
655
656 // Some arithmetic, load and store operations have specific instructions
657 // to cast up/down their types automatically at no extra cost.
658 // TODO: Get these tables to know at least what the related operations are.
659 static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
660 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
661 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
662 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
663 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
664 { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
665 { ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
666
667 // The number of vmovl instructions for the extension.
668 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
669 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
670 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
671 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
672 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
673 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
674 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
675 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
676 { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
677 { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
678 { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
679 { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
680 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
681 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i8, 7 },
682 { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
683 { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i16, 6 },
684 { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
685 { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 6 },
686
687 // Operations that we legalize using splitting.
688 { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 6 },
689 { ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 3 },
690
691 // Vector float <-> i32 conversions.
692 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
693 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
694
695 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
696 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i8, 3 },
697 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
698 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i16, 2 },
699 { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
700 { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 1 },
701 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
702 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
703 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
704 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 3 },
705 { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
706 { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i16, 2 },
707 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
708 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 4 },
709 { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
710 { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 2 },
711 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
712 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 8 },
713 { ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
714 { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 4 },
715
716 { ISD::FP_TO_SINT, MVT::v4i32, MVT::v4f32, 1 },
717 { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
718 { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
719 { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
720 { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
721 { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
722
723 // Vector double <-> i32 conversions.
724 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
725 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
726
727 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
728 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 4 },
729 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
730 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 3 },
731 { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
732 { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 2 },
733
734 { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 2 },
735 { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 2 },
736 { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 4 },
737 { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 4 },
738 { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 8 },
739 { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 8 }
740 };
741
742 if (SrcTy.isVector() && ST->hasNEON()) {
743 if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
744 DstTy.getSimpleVT(),
745 SrcTy.getSimpleVT()))
746 return AdjustCost(Entry->Cost);
747 }
748
749 // Scalar float to integer conversions.
750 static const TypeConversionCostTblEntry NEONFloatConversionTbl[] = {
751 { ISD::FP_TO_SINT, MVT::i1, MVT::f32, 2 },
752 { ISD::FP_TO_UINT, MVT::i1, MVT::f32, 2 },
753 { ISD::FP_TO_SINT, MVT::i1, MVT::f64, 2 },
754 { ISD::FP_TO_UINT, MVT::i1, MVT::f64, 2 },
755 { ISD::FP_TO_SINT, MVT::i8, MVT::f32, 2 },
756 { ISD::FP_TO_UINT, MVT::i8, MVT::f32, 2 },
757 { ISD::FP_TO_SINT, MVT::i8, MVT::f64, 2 },
758 { ISD::FP_TO_UINT, MVT::i8, MVT::f64, 2 },
759 { ISD::FP_TO_SINT, MVT::i16, MVT::f32, 2 },
760 { ISD::FP_TO_UINT, MVT::i16, MVT::f32, 2 },
761 { ISD::FP_TO_SINT, MVT::i16, MVT::f64, 2 },
762 { ISD::FP_TO_UINT, MVT::i16, MVT::f64, 2 },
763 { ISD::FP_TO_SINT, MVT::i32, MVT::f32, 2 },
764 { ISD::FP_TO_UINT, MVT::i32, MVT::f32, 2 },
765 { ISD::FP_TO_SINT, MVT::i32, MVT::f64, 2 },
766 { ISD::FP_TO_UINT, MVT::i32, MVT::f64, 2 },
767 { ISD::FP_TO_SINT, MVT::i64, MVT::f32, 10 },
768 { ISD::FP_TO_UINT, MVT::i64, MVT::f32, 10 },
769 { ISD::FP_TO_SINT, MVT::i64, MVT::f64, 10 },
770 { ISD::FP_TO_UINT, MVT::i64, MVT::f64, 10 }
771 };
772 if (SrcTy.isFloatingPoint() && ST->hasNEON()) {
773 if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
774 DstTy.getSimpleVT(),
775 SrcTy.getSimpleVT()))
776 return AdjustCost(Entry->Cost);
777 }
778
779 // Scalar integer to float conversions.
780 static const TypeConversionCostTblEntry NEONIntegerConversionTbl[] = {
781 { ISD::SINT_TO_FP, MVT::f32, MVT::i1, 2 },
782 { ISD::UINT_TO_FP, MVT::f32, MVT::i1, 2 },
783 { ISD::SINT_TO_FP, MVT::f64, MVT::i1, 2 },
784 { ISD::UINT_TO_FP, MVT::f64, MVT::i1, 2 },
785 { ISD::SINT_TO_FP, MVT::f32, MVT::i8, 2 },
786 { ISD::UINT_TO_FP, MVT::f32, MVT::i8, 2 },
787 { ISD::SINT_TO_FP, MVT::f64, MVT::i8, 2 },
788 { ISD::UINT_TO_FP, MVT::f64, MVT::i8, 2 },
789 { ISD::SINT_TO_FP, MVT::f32, MVT::i16, 2 },
790 { ISD::UINT_TO_FP, MVT::f32, MVT::i16, 2 },
791 { ISD::SINT_TO_FP, MVT::f64, MVT::i16, 2 },
792 { ISD::UINT_TO_FP, MVT::f64, MVT::i16, 2 },
793 { ISD::SINT_TO_FP, MVT::f32, MVT::i32, 2 },
794 { ISD::UINT_TO_FP, MVT::f32, MVT::i32, 2 },
795 { ISD::SINT_TO_FP, MVT::f64, MVT::i32, 2 },
796 { ISD::UINT_TO_FP, MVT::f64, MVT::i32, 2 },
797 { ISD::SINT_TO_FP, MVT::f32, MVT::i64, 10 },
798 { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 10 },
799 { ISD::SINT_TO_FP, MVT::f64, MVT::i64, 10 },
800 { ISD::UINT_TO_FP, MVT::f64, MVT::i64, 10 }
801 };
802
803 if (SrcTy.isInteger() && ST->hasNEON()) {
804 if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
805 ISD, DstTy.getSimpleVT(),
806 SrcTy.getSimpleVT()))
807 return AdjustCost(Entry->Cost);
808 }
809
810 // MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
811 // instruction, i8->i32 is two. i64 zexts are a VAND with a constant, sext
812 // are linearised so take more.
813 static const TypeConversionCostTblEntry MVEVectorConversionTbl[] = {
814 { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
815 { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
816 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
817 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
818 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 10 },
819 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 2 },
820 { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
821 { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
822 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 10 },
823 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
824 { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 8 },
825 { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 2 },
826 };
827
828 if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
829 if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
830 ISD, DstTy.getSimpleVT(),
831 SrcTy.getSimpleVT()))
832 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
833 }
834
835 if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
836 // As a general rule, fp converts that were not matched above are scalarized
837 // and cost 1 vcvt for each lane, so long as the instruction is available.
838 // If not it will become a series of function calls.
839 const InstructionCost CallCost =
840 getCallInstrCost(nullptr, Dst, {Src}, CostKind);
841 int Lanes = 1;
842 if (SrcTy.isFixedLengthVector())
843 Lanes = SrcTy.getVectorNumElements();
844
845 if (IsLegalFPType(SrcTy) && IsLegalFPType(DstTy))
846 return Lanes;
847 else
848 return Lanes * CallCost;
849 }
850
851 if (ISD == ISD::TRUNCATE && ST->hasMVEIntegerOps() &&
852 SrcTy.isFixedLengthVector()) {
853 // Treat a truncate with larger than legal source (128bits for MVE) as
854 // expensive, 2 instructions per lane.
855 if ((SrcTy.getScalarType() == MVT::i8 ||
856 SrcTy.getScalarType() == MVT::i16 ||
857 SrcTy.getScalarType() == MVT::i32) &&
858 SrcTy.getSizeInBits() > 128 &&
859 SrcTy.getSizeInBits() > DstTy.getSizeInBits())
860 return SrcTy.getVectorNumElements() * 2;
861 }
862
863 // Scalar integer conversion costs.
864 static const TypeConversionCostTblEntry ARMIntegerConversionTbl[] = {
865 // i16 -> i64 requires two dependent operations.
866 { ISD::SIGN_EXTEND, MVT::i64, MVT::i16, 2 },
867
868 // Truncates on i64 are assumed to be free.
869 { ISD::TRUNCATE, MVT::i32, MVT::i64, 0 },
870 { ISD::TRUNCATE, MVT::i16, MVT::i64, 0 },
871 { ISD::TRUNCATE, MVT::i8, MVT::i64, 0 },
872 { ISD::TRUNCATE, MVT::i1, MVT::i64, 0 }
873 };
874
875 if (SrcTy.isInteger()) {
876 if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
877 DstTy.getSimpleVT(),
878 SrcTy.getSimpleVT()))
879 return AdjustCost(Entry->Cost);
880 }
881
882 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
883 ? ST->getMVEVectorCostFactor(CostKind)
884 : 1;
885 return AdjustCost(
886 BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
887}
888
890 unsigned Opcode, Type *ValTy, TTI::TargetCostKind CostKind, unsigned Index,
891 const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC) const {
892 // Penalize inserting into a D-subregister. We end up with a three times
893 // lower estimated throughput on swift.
894 if (ST->hasSlowLoadDSubregister() && Opcode == Instruction::InsertElement &&
895 ValTy->isVectorTy() && ValTy->getScalarSizeInBits() <= 32)
896 return 3;
897
898 if (ST->hasNEON() && (Opcode == Instruction::InsertElement ||
899 Opcode == Instruction::ExtractElement)) {
900 // Cross-class copies are expensive on many microarchitectures,
901 // so assume they are expensive by default.
902 if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
903 return 3;
904
905 // Even if it's not a cross class copy, this likely leads to mixing
906 // of NEON and VFP code and should be therefore penalized.
907 if (ValTy->isVectorTy() &&
908 ValTy->getScalarSizeInBits() <= 32)
909 return std::max<InstructionCost>(
910 BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
911 VIC),
912 2U);
913 }
914
915 if (ST->hasMVEIntegerOps() && (Opcode == Instruction::InsertElement ||
916 Opcode == Instruction::ExtractElement)) {
917 // Integer cross-lane moves are more expensive than float, which can
918 // sometimes just be vmovs. Integer involve being passes to GPR registers,
919 // causing more of a delay.
920 std::pair<InstructionCost, MVT> LT =
922 return LT.first * (ValTy->getScalarType()->isIntegerTy() ? 4 : 1);
923 }
924
925 return BaseT::getVectorInstrCost(Opcode, ValTy, CostKind, Index, Op0, Op1,
926 VIC);
927}
928
// Cost of a compare or select. Handles, in order: scalar selects on Thumb
// (code-size oriented), vector min/max/abs select patterns (costed as the
// matching intrinsic), NEON vector selects, MVE vector compares, and finally
// the base-class cost scaled by the MVE vector cost factor where applicable.
// NOTE(review): listing lines 929, 931, 936, 946, 1031 and 1033 are absent
// (the numbering skips them). That hides the declarator, part of the
// parameter list (CostKind/Op1Info are used below but not declared in view),
// the opening of the Thumb `if`, and the declarations of `Cost` and
// `VecCondTy`. Restore them from upstream before compiling.
930 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
932 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
933 int ISD = TLI->InstructionOpcodeToISD(Opcode);
934
935 // Thumb scalar code size cost for select.
937 ST->isThumb() && !ValTy->isVectorTy()) {
938 // Assume expensive structs.
939 if (TLI->getValueType(DL, ValTy, true) == MVT::Other)
940 return TTI::TCC_Expensive;
941
942 // Select costs can vary because they:
943 // - may require one or more conditional mov (including an IT),
944 // - can't operate directly on immediates,
945 // - require live flags, which we can't copy around easily.
947
948 // Possible IT instruction for Thumb2, or more for Thumb1.
949 ++Cost;
950
951 // i1 values may need rematerialising by using mov immediates and/or
952 // flag setting instructions.
953 if (ValTy->isIntegerTy(1))
954 ++Cost;
955
956 return Cost;
957 }
958
959 // If this is a vector min/max/abs, use the cost of that intrinsic directly
960 // instead. Hopefully when min/max intrinsics are more prevalent this code
961 // will not be needed.
// When costing a compare with a single use, look through to that user so the
// select pattern can be matched on it.
962 const Instruction *Sel = I;
963 if ((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) && Sel &&
964 Sel->hasOneUse())
965 Sel = cast<Instruction>(Sel->user_back());
966 if (Sel && ValTy->isVectorTy() &&
967 (ValTy->isIntOrIntVectorTy() || ValTy->isFPOrFPVectorTy())) {
968 const Value *LHS, *RHS;
969 SelectPatternFlavor SPF = matchSelectPattern(Sel, LHS, RHS).Flavor;
// IID stays 0 when the select is not one of the recognised flavors.
970 unsigned IID = 0;
971 switch (SPF) {
972 case SPF_ABS:
973 IID = Intrinsic::abs;
974 break;
975 case SPF_SMIN:
976 IID = Intrinsic::smin;
977 break;
978 case SPF_SMAX:
979 IID = Intrinsic::smax;
980 break;
981 case SPF_UMIN:
982 IID = Intrinsic::umin;
983 break;
984 case SPF_UMAX:
985 IID = Intrinsic::umax;
986 break;
987 case SPF_FMINNUM:
988 IID = Intrinsic::minnum;
989 break;
990 case SPF_FMAXNUM:
991 IID = Intrinsic::maxnum;
992 break;
993 default:
994 break;
995 }
996 if (IID) {
997 // The ICmp is free, the select gets the cost of the min/max/etc
998 if (Sel != I)
999 return 0;
1000 IntrinsicCostAttributes CostAttrs(IID, ValTy, {ValTy, ValTy});
1001 return getIntrinsicInstrCost(CostAttrs, CostKind);
1002 }
1003 }
1004
1005 // On NEON a vector select gets lowered to vbsl.
1006 if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT && CondTy) {
1007 // Lowering of some vector selects is currently far from perfect.
1008 static const TypeConversionCostTblEntry NEONVectorSelectTbl[] = {
1009 { ISD::SELECT, MVT::v4i1, MVT::v4i64, 4*4 + 1*2 + 1 },
1010 { ISD::SELECT, MVT::v8i1, MVT::v8i64, 50 },
1011 { ISD::SELECT, MVT::v16i1, MVT::v16i64, 100 }
1012 };
1013
1014 EVT SelCondTy = TLI->getValueType(DL, CondTy);
1015 EVT SelValTy = TLI->getValueType(DL, ValTy);
1016 if (SelCondTy.isSimple() && SelValTy.isSimple()) {
1017 if (const auto *Entry = ConvertCostTableLookup(NEONVectorSelectTbl, ISD,
1018 SelCondTy.getSimpleVT(),
1019 SelValTy.getSimpleVT()))
1020 return Entry->Cost;
1021 }
1022
// No table entry: use the type legalization cost of the value type.
1023 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1024 return LT.first;
1025 }
1026
1027 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy() &&
1028 (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
1029 cast<FixedVectorType>(ValTy)->getNumElements() > 1) {
1030 FixedVectorType *VecValTy = cast<FixedVectorType>(ValTy);
1032 if (!VecCondTy)
1034
1035 // If we don't have mve.fp any fp operations will need to be scalarized.
1036 if (Opcode == Instruction::FCmp && !ST->hasMVEFloatOps()) {
1037 // One scalarization insert, one scalarization extract and the cost of the
1038 // fcmps.
1039 return BaseT::getScalarizationOverhead(VecValTy, /*Insert*/ false,
1040 /*Extract*/ true, CostKind) +
1041 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1042 /*Extract*/ false, CostKind) +
1043 VecValTy->getNumElements() *
1044 getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
1045 VecCondTy->getScalarType(), VecPred,
1046 CostKind, Op1Info, Op2Info, I);
1047 }
1048
1049 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1050 int BaseCost = ST->getMVEVectorCostFactor(CostKind);
1051 // There are two types - the input that specifies the type of the compare
1052 // and the output vXi1 type. Because we don't know how the output will be
1053 // split, we may need an expensive shuffle to get two in sync. This has the
1054 // effect of making larger than legal compares (v8i32 for example)
1055 // expensive.
1056 if (LT.second.isVector() && LT.second.getVectorNumElements() > 2) {
1057 if (LT.first > 1)
1058 return LT.first * BaseCost +
1059 BaseT::getScalarizationOverhead(VecCondTy, /*Insert*/ true,
1060 /*Extract*/ false, CostKind);
1061 return BaseCost;
1062 }
1063 }
1064
1065 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1066 // for "multiple beats" potentially needed by MVE instructions.
1067 int BaseCost = 1;
1068 if (ST->hasMVEIntegerOps() && ValTy->isVectorTy())
1069 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1070
1071 return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred,
1072 CostKind, Op1Info, Op2Info, I);
1073}
1074
// Cost of an address computation. With NEON: vector pointer types whose
// access is not a constant stride below MaxMergeDistance + 1 (per
// BaseT::isConstantStridedAccessLessThan, checked only when SE is non-null)
// cost NumVectorInstToHideOverhead; everything else costs 1. Without NEON the
// base-class implementation is used.
// NOTE(review): listing lines 1075, 1076 and 1078 are absent, so the
// declarator and the PtrTy/SE/CostKind parameters used below are not visible
// here; presumably this is ARMTTIImpl::getAddressComputationCost -- confirm
// against upstream.
1077 const SCEV *Ptr,
1079 // Address computations in vectorized code with non-consecutive addresses will
1080 // likely result in more instructions compared to scalar code where the
1081 // computation can more often be merged into the index mode. The resulting
1082 // extra micro-ops can significantly decrease throughput.
1083 unsigned NumVectorInstToHideOverhead = 10;
1084 int MaxMergeDistance = 64;
1085
1086 if (ST->hasNEON()) {
1087 if (PtrTy->isVectorTy() && SE &&
1088 !BaseT::isConstantStridedAccessLessThan(SE, Ptr, MaxMergeDistance + 1))
1089 return NumVectorInstToHideOverhead;
1090
1091 // In many cases the address computation is not merged into the instruction
1092 // addressing mode.
1093 return 1;
1094 }
1095 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1096}
1097
// Returns true when the intrinsic `II` is one of the MVE VCTP intrinsics
// (vctp8/16/32/64), and false otherwise.
// NOTE(review): listing lines 1098 and 1099 are absent, so the declarator and
// the statement that introduces `II` (and opens the scope closed at listing
// line 1111) are not visible here. Presumably this is the LSR chain-element
// profitability hook -- confirm against upstream.
1100 // If a VCTP is part of a chain, it's already profitable and shouldn't be
1101 // optimized, else LSR may block tail-predication.
1102 switch (II->getIntrinsicID()) {
1103 case Intrinsic::arm_mve_vctp8:
1104 case Intrinsic::arm_mve_vctp16:
1105 case Intrinsic::arm_mve_vctp32:
1106 case Intrinsic::arm_mve_vctp64:
1107 return true;
1108 default:
1109 break;
1110 }
1111 }
1112 return false;
1113}
1114
// Returns whether a masked load of DataTy with the given alignment is legal.
// Requires the EnableMaskedLoadStores option and MVE integer ops. The address
// space and mask kind arguments are ignored.
// NOTE(review): the declarator line (listing line 1115), which declares
// DataTy and Alignment, is absent from this listing; presumably
// ARMTTIImpl::isLegalMaskedLoad -- confirm against upstream.
1116 unsigned /*AddressSpace*/,
1117 TTI::MaskKind /*MaskKind*/) const {
1118 if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
1119 return false;
1120
1121 if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
1122 // Don't support v2i1 yet.
1123 if (VecTy->getNumElements() == 2)
1124 return false;
1125
1126 // We don't support extending fp types.
// That is, floating-point vectors must be exactly 128 bits wide.
1127 unsigned VecWidth = DataTy->getPrimitiveSizeInBits();
1128 if (VecWidth != 128 && VecTy->getElementType()->isFloatingPointTy())
1129 return false;
1130 }
1131
// 32-bit and 16-bit elements need at least natural alignment (4 and 2);
// 8-bit elements are accepted at any alignment. Other widths are rejected.
1132 unsigned EltWidth = DataTy->getScalarSizeInBits();
1133 return (EltWidth == 32 && Alignment >= 4) ||
1134 (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
1135}
1136
// Returns whether a masked gather of type Ty with the given alignment is
// legal. Requires the EnableMaskedGatherScatters option and MVE integer ops.
1137bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) const {
1138 if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
1139 return false;
1140
// 32-bit and 16-bit elements need at least natural alignment (4 and 2);
// 8-bit elements are accepted at any alignment. Other widths are rejected.
1141 unsigned EltWidth = Ty->getScalarSizeInBits();
1142 return ((EltWidth == 32 && Alignment >= 4) ||
1143 (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
1144}
1145
1146/// Given a memcpy/memset/memmove instruction, return the number of memory
1147/// operations performed, via querying findOptimalMemOpLowering. Returns -1 if a
1148/// call is used.
// -1 is returned when the length is not a constant or when no memop lowering
// is found.
// NOTE(review): the declarator line (listing line 1149), which declares the
// intrinsic parameter `I`, is absent from this listing -- restore from
// upstream.
1150 MemOp MOp;
1151 unsigned DstAddrSpace = ~0u;
1152 unsigned SrcAddrSpace = ~0u;
1153 const Function *F = I->getParent()->getParent();
1154
1155 if (const auto *MC = dyn_cast<MemTransferInst>(I)) {
1156 ConstantInt *C = dyn_cast<ConstantInt>(MC->getLength());
1157 // If 'size' is not a constant, a library call will be generated.
1158 if (!C)
1159 return -1;
1160
1161 const unsigned Size = C->getValue().getZExtValue();
1162 const Align DstAlign = MC->getDestAlign().valueOrOne();
1163 const Align SrcAlign = MC->getSourceAlign().valueOrOne();
1164
1165 MOp = MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
1166 /*IsVolatile*/ false);
1167 DstAddrSpace = MC->getDestAddressSpace();
1168 SrcAddrSpace = MC->getSourceAddressSpace();
1169 }
1170 else if (const auto *MS = dyn_cast<MemSetInst>(I)) {
1171 ConstantInt *C = dyn_cast<ConstantInt>(MS->getLength());
1172 // If 'size' is not a constant, a library call will be generated.
1173 if (!C)
1174 return -1;
1175
1176 const unsigned Size = C->getValue().getZExtValue();
1177 const Align DstAlign = MS->getDestAlign().valueOrOne();
1178
1179 MOp = MemOp::Set(Size, /*DstAlignCanChange*/ false, DstAlign,
1180 /*IsZeroMemset*/ false, /*IsVolatile*/ false);
1181 DstAddrSpace = MS->getDestAddressSpace();
1182 }
1183 else
1184 llvm_unreachable("Expected a memcpy/move or memset!");
1185
// Limit is the per-intrinsic store budget from the target lowering (queried
// with the function's minsize setting). Factor is the multiplier applied to
// the number of MemOps entries in the result.
1186 unsigned Limit, Factor = 2;
1187 switch(I->getIntrinsicID()) {
1188 case Intrinsic::memcpy:
1189 Limit = TLI->getMaxStoresPerMemcpy(F->hasMinSize());
1190 break;
1191 case Intrinsic::memmove:
1192 Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
1193 break;
1194 case Intrinsic::memset:
1195 Limit = TLI->getMaxStoresPerMemset(F->hasMinSize());
1196 Factor = 1;
1197 break;
1198 default:
1199 llvm_unreachable("Expected a memcpy/move or memset!");
1200 }
1201
1202 // MemOps will be populated with a list of data types that need to be
1203 // loaded and stored. That's why we multiply the number of elements by
1204 // Factor (2 for memcpy/memmove, 1 for memset) to get the cost.
1205 std::vector<EVT> MemOps;
1206 LLVMContext &C = F->getContext();
1207 if (getTLI()->findOptimalMemOpLowering(C, MemOps, Limit, MOp, DstAddrSpace,
1208 SrcAddrSpace, F->getAttributes(),
1209 nullptr))
1210 return MemOps.size() * Factor;
1211
1212 // If we can't find an optimal memop lowering, return the default cost
1213 return -1;
1214}
1215
1218
// Returns NumOps as the cost, or 4 when NumOps is -1 (library call).
// NOTE(review): listing lines 1216 and 1217 are absent, so the declarator and
// the definition of NumOps are not visible here. Presumably this is
// ARMTTIImpl::getMemcpyCost with NumOps computed by the memory-op counting
// helper defined just above in this file -- confirm against upstream.
1219 // To model the cost of a library call, we assume 1 for the call, and
1220 // 3 for the argument setup.
1221 if (NumOps == -1)
1222 return 4;
1223 return NumOps;
1224}
1225
// Cost of a vector shuffle. Consults NEON cost tables for broadcast, reverse
// and select shuffles, then MVE rules for broadcasts, (de)interleaving
// shuffles next to loads/stores and VREV masks, and finally falls back to
// BaseT::getShuffleCost scaled by the MVE vector cost factor where applicable.
// NOTE(review): listing lines 1226, 1229, 1231, 1243, 1339, 1341, 1355, 1358
// and 1372 are absent (the numbering skips them). That hides the declarator,
// the Kind/CostKind/Args parameters used below, the statements guarded by the
// two `if (IsExtractSubvector)` checks, and the heads of the mask-predicate
// calls in the interleave checks. Restore them from upstream before compiling.
1227 VectorType *DstTy, VectorType *SrcTy,
1228 ArrayRef<int> Mask,
1230 int Index, VectorType *SubTp,
1232 const Instruction *CxtI) const {
1233 assert((Mask.empty() || DstTy->isScalableTy() ||
1234 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
1235 "Expected the Mask to match the return size if given");
1236 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
1237 "Expected the same scalar types");
1238
1239 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
1240 // Treat extractsubvector as single op permutation.
1241 bool IsExtractSubvector = Kind == TTI::SK_ExtractSubvector;
1242 if (IsExtractSubvector)
1244 if (ST->hasNEON()) {
1245 if (Kind == TTI::SK_Broadcast) {
1246 static const CostTblEntry NEONDupTbl[] = {
1247 // VDUP handles these cases.
1248 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1249 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1250 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1251 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1252 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1253 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1254
1255 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1256 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1257 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1258 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1}};
1259
1260 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1261 if (const auto *Entry =
1262 CostTableLookup(NEONDupTbl, ISD::VECTOR_SHUFFLE, LT.second))
1263 return LT.first * Entry->Cost;
1264 }
1265 if (Kind == TTI::SK_Reverse) {
1266 static const CostTblEntry NEONShuffleTbl[] = {
1267 // Reverse shuffle cost one instruction if we are shuffling within a
1268 // double word (vrev) or two if we shuffle a quad word (vrev, vext).
1269 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1270 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1271 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1272 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1273 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 1},
1274 {ISD::VECTOR_SHUFFLE, MVT::v8i8, 1},
1275
1276 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1277 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1278 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 2},
1279 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 2}};
1280
1281 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1282 if (const auto *Entry =
1283 CostTableLookup(NEONShuffleTbl, ISD::VECTOR_SHUFFLE, LT.second))
1284 return LT.first * Entry->Cost;
1285 }
1286 if (Kind == TTI::SK_Select) {
1287 static const CostTblEntry NEONSelShuffleTbl[] = {
1288 // Select shuffle cost table for ARM. Cost is the number of
1289 // instructions
1290 // required to create the shuffled vector.
1291
1292 {ISD::VECTOR_SHUFFLE, MVT::v2f32, 1},
1293 {ISD::VECTOR_SHUFFLE, MVT::v2i64, 1},
1294 {ISD::VECTOR_SHUFFLE, MVT::v2f64, 1},
1295 {ISD::VECTOR_SHUFFLE, MVT::v2i32, 1},
1296
1297 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 2},
1298 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 2},
1299 {ISD::VECTOR_SHUFFLE, MVT::v4i16, 2},
1300
1301 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 16},
1302
1303 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 32}};
1304
1305 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1306 if (const auto *Entry = CostTableLookup(NEONSelShuffleTbl,
1307 ISD::VECTOR_SHUFFLE, LT.second))
1308 return LT.first * Entry->Cost;
1309 }
1310 }
1311 if (ST->hasMVEIntegerOps()) {
1312 if (Kind == TTI::SK_Broadcast) {
1313 static const CostTblEntry MVEDupTbl[] = {
1314 // VDUP handles these cases.
1315 {ISD::VECTOR_SHUFFLE, MVT::v4i32, 1},
1316 {ISD::VECTOR_SHUFFLE, MVT::v8i16, 1},
1317 {ISD::VECTOR_SHUFFLE, MVT::v16i8, 1},
1318 {ISD::VECTOR_SHUFFLE, MVT::v4f32, 1},
1319 {ISD::VECTOR_SHUFFLE, MVT::v8f16, 1}};
1320
1321 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1322 if (const auto *Entry = CostTableLookup(MVEDupTbl, ISD::VECTOR_SHUFFLE,
1323 LT.second))
1324 return LT.first * Entry->Cost * ST->getMVEVectorCostFactor(CostKind);
1325 }
1326
1327 if (!Mask.empty()) {
1328 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
1329 // Check for LD2/LD4 instructions, which are represented in llvm IR as
1330 // deinterleaving-shuffle(load). The shuffle cost could potentially be
1331 // free, but we model it with a cost of LT.first so that LD2/LD4 have a
1332 // higher cost than just the load.
// NOTE(review): the returned value below is max(1, LT.first / 4) times the
// MVE cost factor, not LT.first as the comment above says -- confirm intent.
1333 if (Args.size() >= 1 && isa<LoadInst>(Args[0]) &&
1334 (LT.second.getScalarSizeInBits() == 8 ||
1335 LT.second.getScalarSizeInBits() == 16 ||
1336 LT.second.getScalarSizeInBits() == 32) &&
1337 LT.second.getSizeInBits() == 128 &&
1338 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1340 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1342 return ST->getMVEVectorCostFactor(CostKind) *
1343 std::max<InstructionCost>(1, LT.first / 4);
1344
1345 // Check for ST2/ST4 instructions, which are represented in llvm IR as
1346 // store(interleaving-shuffle). The shuffle cost could potentially be
1347 // free, but we model it with a cost of LT.first so that ST2/ST4 have a
1348 // higher cost than just the store.
1349 if (CxtI && CxtI->hasOneUse() && isa<StoreInst>(*CxtI->user_begin()) &&
1350 (LT.second.getScalarSizeInBits() == 8 ||
1351 LT.second.getScalarSizeInBits() == 16 ||
1352 LT.second.getScalarSizeInBits() == 32) &&
1353 LT.second.getSizeInBits() == 128 &&
1354 ((TLI->getMaxSupportedInterleaveFactor() >= 2 &&
1356 Mask, 2, SrcTy->getElementCount().getKnownMinValue() * 2)) ||
1357 (TLI->getMaxSupportedInterleaveFactor() == 4 &&
1359 Mask, 4, SrcTy->getElementCount().getKnownMinValue() * 2))))
1360 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1361
// Masks that isVREVMask recognises for block sizes 16, 32 or 64 cost one
// MVE-scaled operation per legalized part.
1362 if (LT.second.isVector() &&
1363 Mask.size() <= LT.second.getVectorNumElements() &&
1364 (isVREVMask(Mask, LT.second, 16) || isVREVMask(Mask, LT.second, 32) ||
1365 isVREVMask(Mask, LT.second, 64)))
1366 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1367 }
1368 }
1369
1370 // Restore optimal kind.
1371 if (IsExtractSubvector)
1373 int BaseCost = ST->hasMVEIntegerOps() && SrcTy->isVectorTy()
1374 ? ST->getMVEVectorCostFactor(CostKind)
1375 : 1;
1376 return BaseCost * BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind,
1377 Index, SubTp);
1378}
1379
// Cost of an arithmetic instruction. Handles, in order: i1 logic ops on Thumb
// when costing for code size, NEON (division table, then the base-class cost
// with a v2i64 penalty), scalar shifts that can fold into their user, scalar
// multiplies in a DSP (S/U)MLAL pattern, and finally a legality-based cost
// with scalarization for expanded vector operations.
// NOTE(review): listing lines 1380, 1382, 1449 and 1565 are absent (the
// numbering skips them). That hides the declarator, the Op1Info/Op2Info
// parameters used below, and the heads of the two `Cost` definitions.
// Restore them from upstream before compiling.
1381 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
1383 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
1384 int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
1385 if (ST->isThumb() && CostKind == TTI::TCK_CodeSize && Ty->isIntegerTy(1)) {
1386 // Make operations on i1 relatively expensive as this often involves
1387 // combining predicates. AND and XOR should be easier to handle with IT
1388 // blocks.
1389 switch (ISDOpcode) {
1390 default:
1391 break;
1392 case ISD::AND:
1393 case ISD::XOR:
1394 return 2;
1395 case ISD::OR:
1396 return 3;
1397 }
1398 }
1399
1400 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1401
1402 if (ST->hasNEON()) {
1403 const unsigned FunctionCallDivCost = 20;
1404 const unsigned ReciprocalDivCost = 10;
1405 static const CostTblEntry CostTbl[] = {
1406 // Division.
1407 // These costs are somewhat random. Choose a cost of 20 to indicate that
1408 // vectorizing division (added function call) is going to be very expensive.
1409 // Double registers types.
1410 { ISD::SDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1411 { ISD::UDIV, MVT::v1i64, 1 * FunctionCallDivCost},
1412 { ISD::SREM, MVT::v1i64, 1 * FunctionCallDivCost},
1413 { ISD::UREM, MVT::v1i64, 1 * FunctionCallDivCost},
1414 { ISD::SDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1415 { ISD::UDIV, MVT::v2i32, 2 * FunctionCallDivCost},
1416 { ISD::SREM, MVT::v2i32, 2 * FunctionCallDivCost},
1417 { ISD::UREM, MVT::v2i32, 2 * FunctionCallDivCost},
1418 { ISD::SDIV, MVT::v4i16, ReciprocalDivCost},
1419 { ISD::UDIV, MVT::v4i16, ReciprocalDivCost},
1420 { ISD::SREM, MVT::v4i16, 4 * FunctionCallDivCost},
1421 { ISD::UREM, MVT::v4i16, 4 * FunctionCallDivCost},
1422 { ISD::SDIV, MVT::v8i8, ReciprocalDivCost},
1423 { ISD::UDIV, MVT::v8i8, ReciprocalDivCost},
1424 { ISD::SREM, MVT::v8i8, 8 * FunctionCallDivCost},
1425 { ISD::UREM, MVT::v8i8, 8 * FunctionCallDivCost},
1426 // Quad register types.
1427 { ISD::SDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1428 { ISD::UDIV, MVT::v2i64, 2 * FunctionCallDivCost},
1429 { ISD::SREM, MVT::v2i64, 2 * FunctionCallDivCost},
1430 { ISD::UREM, MVT::v2i64, 2 * FunctionCallDivCost},
1431 { ISD::SDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1432 { ISD::UDIV, MVT::v4i32, 4 * FunctionCallDivCost},
1433 { ISD::SREM, MVT::v4i32, 4 * FunctionCallDivCost},
1434 { ISD::UREM, MVT::v4i32, 4 * FunctionCallDivCost},
1435 { ISD::SDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1436 { ISD::UDIV, MVT::v8i16, 8 * FunctionCallDivCost},
1437 { ISD::SREM, MVT::v8i16, 8 * FunctionCallDivCost},
1438 { ISD::UREM, MVT::v8i16, 8 * FunctionCallDivCost},
1439 { ISD::SDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1440 { ISD::UDIV, MVT::v16i8, 16 * FunctionCallDivCost},
1441 { ISD::SREM, MVT::v16i8, 16 * FunctionCallDivCost},
1442 { ISD::UREM, MVT::v16i8, 16 * FunctionCallDivCost},
1443 // Multiplication.
1444 };
1445
1446 if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
1447 return LT.first * Entry->Cost;
1448
1450 Opcode, Ty, CostKind, Op1Info, Op2Info);
1451
1452 // This is somewhat of a hack. The problem that we are facing is that SROA
1453 // creates a sequence of shift, and, or instructions to construct values.
1454 // These sequences are recognized by the ISel and have zero-cost. Not so for
1455 // the vectorized code. Because we have support for v2i64 but not i64 those
1456 // sequences look particularly beneficial to vectorize.
1457 // To work around this we increase the cost of v2i64 operations to make them
1458 // seem less beneficial.
1459 if (LT.second == MVT::v2i64 && Op2Info.isUniform() && Op2Info.isConstant())
1460 Cost += 4;
1461
1462 return Cost;
1463 }
1464
1465 // If this operation is a shift on arm/thumb2, it might well be folded into
1466 // the following instruction, hence having a cost of 0.
1467 auto LooksLikeAFreeShift = [&]() {
1468 if (ST->isThumb1Only() || Ty->isVectorTy())
1469 return false;
1470
1471 if (!CxtI || !CxtI->hasOneUse() || !CxtI->isShift())
1472 return false;
1473 if (!Op2Info.isUniform() || !Op2Info.isConstant())
1474 return false;
1475
1476 // Folded into a ADC/ADD/AND/BIC/CMP/EOR/MVN/ORR/ORN/RSB/SBC/SUB
1477 switch (cast<Instruction>(CxtI->user_back())->getOpcode()) {
1478 case Instruction::Add:
1479 case Instruction::Sub:
1480 case Instruction::And:
1481 case Instruction::Xor:
1482 case Instruction::Or:
1483 case Instruction::ICmp:
1484 return true;
1485 default:
1486 return false;
1487 }
1488 };
1489 if (LooksLikeAFreeShift())
1490 return 0;
1491
1492 // When targets have both DSP and MVE we find that
1493 // the compiler will attempt to vectorize as well as using
1494 // scalar (S/U)MLAL operations. This is in cases where we have
1495 // the pattern ext(mul(ext(i16), ext(i16))) we find
1496 // that codegen performs better when only using (S/U)MLAL scalar
1497 // ops instead of trying to mix vector ops with (S/U)MLAL ops. We therefore
1498 // check if a mul instruction is used in a (U/S)MLAL pattern.
1499 auto MulInDSPMLALPattern = [&](const Instruction *I, unsigned Opcode,
1500 Type *Ty) -> bool {
1501 if (!ST->hasDSP())
1502 return false;
1503
1504 if (!I)
1505 return false;
1506
1507 if (Opcode != Instruction::Mul)
1508 return false;
1509
1510 if (Ty->isVectorTy())
1511 return false;
1512
// The two cast-based helpers below are only called after IsExtInst has
// confirmed that the value is a zext/sext instruction.
1513 auto ValueOpcodesEqual = [](const Value *LHS, const Value *RHS) -> bool {
1514 return cast<Instruction>(LHS)->getOpcode() ==
1515 cast<Instruction>(RHS)->getOpcode();
1516 };
1517 auto IsExtInst = [](const Value *V) -> bool {
1518 return isa<ZExtInst>(V) || isa<SExtInst>(V);
1519 };
1520 auto IsExtensionFromHalf = [](const Value *V) -> bool {
1521 return cast<Instruction>(V)->getOperand(0)->getType()->isIntegerTy(16);
1522 };
1523
1524 // We check the arguments of the instruction to see if they're extends
1525 auto *BinOp = dyn_cast<BinaryOperator>(I);
1526 if (!BinOp)
1527 return false;
1528 Value *Op0 = BinOp->getOperand(0);
1529 Value *Op1 = BinOp->getOperand(1);
1530 if (IsExtInst(Op0) && IsExtInst(Op1) && ValueOpcodesEqual(Op0, Op1)) {
1531 // We're interested in an ext of an i16
1532 if (!I->getType()->isIntegerTy(32) || !IsExtensionFromHalf(Op0) ||
1533 !IsExtensionFromHalf(Op1))
1534 return false;
1535 // We need to check that every user of this result is itself an
1536 // extend (zext or sext); the extended type is not checked here.
1537 for (auto *U : I->users())
1538 if (!IsExtInst(U))
1539 return false;
1540 return true;
1541 }
1542
1543 return false;
1544 };
1545
1546 if (MulInDSPMLALPattern(CxtI, Opcode, Ty))
1547 return 0;
1548
1549 // Default to cheap (throughput/size of 1 instruction) but adjust throughput
1550 // for "multiple beats" potentially needed by MVE instructions.
1551 int BaseCost = 1;
1552 if (ST->hasMVEIntegerOps() && Ty->isVectorTy())
1553 BaseCost = ST->getMVEVectorCostFactor(CostKind);
1554
1555 // The rest of this mostly follows what is done in
1556 // BaseT::getArithmeticInstrCost, without treating floats as more expensive
1557 // than scalars or increasing the costs for custom operations. The result is
1558 // also multiplied by the MVEVectorCostFactor where appropriate.
1559 if (TLI->isOperationLegalOrCustomOrPromote(ISDOpcode, LT.second))
1560 return LT.first * BaseCost;
1561
1562 // Else this is expand, assume that we need to scalarize this op.
1563 if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
1564 unsigned Num = VTy->getNumElements();
1566 getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
1567 // Return the cost of multiple scalar invocation plus the cost of
1568 // inserting and extracting the values.
1569 SmallVector<Type *> Tys(Args.size(), Ty);
1570 return BaseT::getScalarizationOverhead(VTy, Args, Tys, CostKind) +
1571 Num * Cost;
1572 }
1573
1574 return BaseCost;
1575}
1576
// Cost of a plain load or store of type Src. Special-cases load latency,
// struct types, f64 vectors on NEON that are not 16-byte aligned, and MVE
// half<->float extending loads / truncating stores; otherwise returns the
// base-class cost scaled by the MVE vector cost factor for vectors.
// NOTE(review): listing lines 1577, 1580, 1589 and 1611 are absent (the
// numbering skips them). That hides the declarator, the Opcode/Src/CostKind
// parameters, the condition guarding `return 1;`, and the declaration of
// SrcVTy. Restore them from upstream before compiling.
1578 Align Alignment,
1579 unsigned AddressSpace,
1581 TTI::OperandValueInfo OpInfo,
1582 const Instruction *I) const {
1583 // FIXME: Load latency isn't handled here
1584 if (Opcode == Instruction::Load && CostKind == TTI::TCK_Latency)
1585 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1586 CostKind, OpInfo, I);
1587
1588 // TODO: Handle other cost kinds.
1590 return 1;
1591
1592 // Type legalization can't handle structs
1593 if (TLI->getValueType(DL, Src, true) == MVT::Other)
1594 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1595 CostKind);
1596
1597 if (ST->hasNEON() && Src->isVectorTy() && Alignment != Align(16) &&
1598 cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
1599 // Unaligned loads/stores are extremely inefficient.
1600 // We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
1601 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
1602 return LT.first * 4;
1603 }
1604
1605 // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
1606 // Same for stores.
1607 if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
1608 ((Opcode == Instruction::Load && I->hasOneUse() &&
1609 isa<FPExtInst>(*I->user_begin())) ||
1610 (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
// DstTy is the wider type: the fpext result for a load, or the fptrunc
// operand for a store.
1612 Type *DstTy =
1613 Opcode == Instruction::Load
1614 ? (*I->user_begin())->getType()
1615 : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
1616 if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
1617 DstTy->getScalarType()->isFloatTy())
1618 return ST->getMVEVectorCostFactor(CostKind);
1619 }
1620
1621 int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
1622 ? ST->getMVEVectorCostFactor(CostKind)
1623 : 1;
1624 return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1625 CostKind, OpInfo, I);
1626}
1627
// Dispatches on the memory intrinsic ID: masked gather/scatter go to
// getGatherScatterOpCost, masked load/store go to getMaskedMemoryOpCost.
// NOTE(review): listing lines 1628-1630 (declarator and parameters) and 1639
// (the statement after the switch, presumably the fallback for other IDs)
// are absent from this listing. The function name used for this block is
// inferred -- confirm against upstream.
1631 switch (MICA.getID()) {
1632 case Intrinsic::masked_scatter:
1633 case Intrinsic::masked_gather:
1634 return getGatherScatterOpCost(MICA, CostKind);
1635 case Intrinsic::masked_load:
1636 case Intrinsic::masked_store:
1637 return getMaskedMemoryOpCost(MICA, CostKind);
1638 }
1640}
1641
// Cost of a masked load/store. With MVE integer ops, a legal masked
// load/store costs one MVE vector cost factor; otherwise fixed vectors are
// costed as scalarized at 8 per element.
// NOTE(review): listing lines 1642-1644 (declarator and parameters) and 1658
// (the statement guarded by `if (!isa<FixedVectorType>(Src))`) are absent
// from this listing -- restore from upstream.
1645 unsigned IID = MICA.getID();
1646 Type *Src = MICA.getDataType();
1647 Align Alignment = MICA.getAlignment();
1648 unsigned AddressSpace = MICA.getAddressSpace();
1649 if (ST->hasMVEIntegerOps()) {
1650 if (IID == Intrinsic::masked_load &&
1651 isLegalMaskedLoad(Src, Alignment, AddressSpace))
1652 return ST->getMVEVectorCostFactor(CostKind);
1653 if (IID == Intrinsic::masked_store &&
1654 isLegalMaskedStore(Src, Alignment, AddressSpace))
1655 return ST->getMVEVectorCostFactor(CostKind);
1656 }
1657 if (!isa<FixedVectorType>(Src))
1659 // Scalar cost, which is currently very high due to the efficiency of the
1660 // generated code.
1661 return cast<FixedVectorType>(Src)->getNumElements() * 8;
1662}
1663
// Cost of an interleaved load/store group. Unmasked groups with a supported
// factor and non-64-bit elements are costed as vldN/vstN accesses (or as a
// cheap MVE pattern for small integer vectors); everything else defers to
// BaseT::getInterleavedMemoryOpCost.
// NOTE(review): the declarator line (listing line 1664) is absent from this
// listing; presumably ARMTTIImpl::getInterleavedMemoryOpCost -- confirm
// against upstream.
1665 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1666 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1667 bool UseMaskForCond, bool UseMaskForGaps) const {
1668 assert(Factor >= 2 && "Invalid interleave factor");
1669 assert(isa<VectorType>(VecTy) && "Expect a vector type");
1670
1671 // vldN/vstN doesn't support vector types of i64/f64 element.
1672 bool EltIs64Bits = DL.getTypeSizeInBits(VecTy->getScalarType()) == 64;
1673
1674 if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
1675 !UseMaskForCond && !UseMaskForGaps) {
1676 unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
// SubVecTy is the type of one de-interleaved member of the group.
1677 auto *SubVecTy =
1678 FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
1679
1680 // vldN/vstN only support legal vector types of size 64 or 128 in bits.
1681 // Accesses having vector types that are a multiple of 128 bits can be
1682 // matched to more than one vldN/vstN instruction.
1683 int BaseCost =
1684 ST->hasMVEIntegerOps() ? ST->getMVEVectorCostFactor(CostKind) : 1;
1685 if (NumElts % Factor == 0 &&
1686 TLI->isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
1687 return Factor * BaseCost * TLI->getNumInterleavedAccesses(SubVecTy, DL);
1688
1689 // Some smaller than legal interleaved patterns are cheap as we can make
1690 // use of the vmovn or vrev patterns to interleave a standard load. This is
1691 // true for v4i8, v8i8 and v4i16 at least (but not for v4f16 as it is
1692 // promoted differently). The cost of 2 here is then a load and vrev or
1693 // vmovn.
1694 if (ST->hasMVEIntegerOps() && Factor == 2 && NumElts / Factor > 2 &&
1695 VecTy->isIntOrIntVectorTy() &&
1696 DL.getTypeSizeInBits(SubVecTy).getFixedValue() <= 64)
1697 return 2 * BaseCost;
1698 }
1699
1700 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1701 Alignment, AddressSpace, CostKind,
1702 UseMaskForCond, UseMaskForGaps);
1703}
1704
1708
// Cost of a masked gather/scatter. Computes a "vector" cost (serialised MVE
// gather) and a "scalar" cost (scalarized access), then returns the vector
// cost only when the element size, alignment, effective (extended/truncated)
// width and address computation all fit what is checked below; every other
// path returns the scalar cost.
// NOTE(review): listing lines 1705-1707 (declarator and parameters), 1717
// (the statement guarded by the MVE/option check), and 1756 and 1773 (the
// second operands of the two `||` conditions) are absent from this listing
// -- restore from upstream before compiling.
1709 Type *DataTy = MICA.getDataType();
1710 const Value *Ptr = MICA.getPointer();
1711 bool VariableMask = MICA.getVariableMask();
1712 Align Alignment = MICA.getAlignment();
1713 const Instruction *I = MICA.getInst();
1714
1715 using namespace PatternMatch;
1716 if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
1718
1719 assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
1720 auto *VTy = cast<FixedVectorType>(DataTy);
1721
1722 // TODO: Splitting, once we do that.
1723
1724 unsigned NumElems = VTy->getNumElements();
1725 unsigned EltSize = VTy->getScalarSizeInBits();
1726 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(DataTy);
1727
1728 // For now, it is assumed that for the MVE gather instructions the loads are
1729 // all effectively serialised. This means the cost is the scalar cost
1730 // multiplied by the number of elements being loaded. This is possibly very
1731 // conservative, but even so we still end up vectorising loops because the
1732 // cost per iteration for many loops is lower than for scalar loops.
1733 InstructionCost VectorCost =
1734 NumElems * LT.first * ST->getMVEVectorCostFactor(CostKind);
1735 // The scalarization cost should be a lot higher. We use the number of vector
1736 // elements plus the scalarization overhead. If masking is required then a lot
1737 // of little blocks will be needed and potentially a scalarized p0 mask,
1738 // greatly increasing the cost.
1739 InstructionCost ScalarCost =
1740 NumElems * LT.first + (VariableMask ? NumElems * 5 : 0) +
1741 BaseT::getScalarizationOverhead(VTy, /*Insert*/ true, /*Extract*/ false,
1742 CostKind) +
1743 BaseT::getScalarizationOverhead(VTy, /*Insert*/ false, /*Extract*/ true,
1744 CostKind);
1745
// Sub-byte elements, or alignment below the element size in bytes, are
// costed as scalarized.
1746 if (EltSize < 8 || Alignment < EltSize / 8)
1747 return ScalarCost;
1748
// ExtSize is the effective element width once a single extending user (for
// loads) or a truncating operand (for stores) is taken into account.
1749 unsigned ExtSize = EltSize;
1750 // Check whether there's a single user that asks for an extended type
1751 if (I != nullptr) {
1752 // Depending on the caller of this function, a gather instruction will
1753 // either have opcode Instruction::Load or be a call to the masked_gather
1754 // intrinsic
1755 if ((I->getOpcode() == Instruction::Load ||
1757 I->hasOneUse()) {
1758 const User *Us = *I->users().begin();
1759 if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
1760 // only allow valid type combinations
1761 unsigned TypeSize =
1762 cast<Instruction>(Us)->getType()->getScalarSizeInBits();
1763 if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
1764 (TypeSize == 16 && EltSize == 8)) &&
1765 TypeSize * NumElems == 128) {
1766 ExtSize = TypeSize;
1767 }
1768 }
1769 }
1770 // Check whether the input data needs to be truncated
1771 TruncInst *T;
1772 if ((I->getOpcode() == Instruction::Store ||
1774 (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
1775 // Only allow valid type combinations
1776 unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
1777 if (((EltSize == 16 && TypeSize == 32) ||
1778 (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
1779 TypeSize * NumElems == 128)
1780 ExtSize = TypeSize;
1781 }
1782 }
1783
// Only accesses whose effective width fills exactly 128 bits with at least
// four elements are considered for the vector cost.
1784 if (ExtSize * NumElems != 128 || NumElems < 4)
1785 return ScalarCost;
1786
1787 // Any (aligned) i32 gather will not need to be scalarised.
1788 if (ExtSize == 32)
1789 return VectorCost;
1790 // For smaller types, we need to ensure that the gep's inputs are correctly
1791 // extended from a small enough value. Other sizes (including i64) are
1792 // scalarized for now.
1793 if (ExtSize != 8 && ExtSize != 16)
1794 return ScalarCost;
1795
// Look through a bitcast of the pointer before inspecting the GEP.
1796 if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
1797 Ptr = BC->getOperand(0);
1798 if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
1799 if (GEP->getNumOperands() != 2)
1800 return ScalarCost;
1801 unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
1802 // Scale needs to be correct (which is only relevant for i16s).
1803 if (Scale != 1 && Scale * 8 != ExtSize)
1804 return ScalarCost;
1805 // And we need to zext (not sext) the indexes from a small enough type.
1806 if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
1807 if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
1808 return VectorCost;
1809 }
1810 return ScalarCost;
1811 }
1812 return ScalarCost;
1813}
1814
// Cost of a vector arithmetic reduction. The fallback calls at the end suggest
// this is the ARM override of getArithmeticReductionCost.
// NOTE(review): the start of the signature (return type, name, leading
// parameters) and the line carrying the opening brace are absent from this
// listing; the body uses Opcode, ValTy, FMF and CostKind.
//
// Cases costed here:
//  - FADD/FMUL when the subtarget has scalar FP support for the element size;
//  - AND/OR/XOR on 8/16/32/64-bit elements;
//  - ADD with MVE integer ops, looked up in a small cost table.
// Everything else is deferred to BaseT::getArithmeticReductionCost.
1817 std::optional<FastMathFlags> FMF,
1819
1820 EVT ValVT = TLI->getValueType(DL, ValTy);
1821 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1822 unsigned EltSize = ValVT.getScalarSizeInBits();
1823
1824 // In general floating point reductions are a series of elementwise
1825 // operations, with free extracts on each step. These are either in-order or
1826 // treewise depending on whether that is allowed by the fast math flags.
1827 if ((ISD == ISD::FADD || ISD == ISD::FMUL) &&
1828 ((EltSize == 32 && ST->hasVFP2Base()) ||
1829 (EltSize == 64 && ST->hasFP64()) ||
1830 (EltSize == 16 && ST->hasFullFP16()))) {
1831 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
// Widest vector (in bits) that is operated on directly. The -1 converts to the
// maximum unsigned value, so the halving loop below never runs when neither
// MVE float ops nor NEON are available.
1832 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1833 InstructionCost VecCost = 0;
// While the reduction may be reassociated and the vector is wider than
// VecLimit, charge one vector op on the half-width type and halve NumElts.
1834 while (!TTI::requiresOrderedReduction(FMF) && isPowerOf2_32(NumElts) &&
1835 NumElts * EltSize > VecLimit) {
1836 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1837 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1838 NumElts /= 2;
1839 }
1840
1841 // For fp16 we need to extract the upper lane elements. MVE can add a
1842 // VREV+FMIN/MAX to perform another vector step instead.
1843 InstructionCost ExtractCost = 0;
1844 if (!TTI::requiresOrderedReduction(FMF) && ST->hasMVEFloatOps() &&
1845 ValVT.getVectorElementType() == MVT::f16 && NumElts == 8) {
1846 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1847 NumElts /= 2;
1848 } else if (ValVT.getVectorElementType() == MVT::f16)
1849 ExtractCost = NumElts / 2;
1850
// NOTE(review): the right-hand operand of `NumElts *` (listing line 1853) is
// absent from this listing.
1851 return VecCost + ExtractCost +
1852 NumElts *
1854 }
1855
1856 if ((ISD == ISD::AND || ISD == ISD::OR || ISD == ISD::XOR) &&
1857 (EltSize == 64 || EltSize == 32 || EltSize == 16 || EltSize == 8)) {
1858 unsigned NumElts = cast<FixedVectorType>(ValTy)->getNumElements();
// Same VecLimit convention as the floating point case above, keyed on MVE
// integer ops instead of MVE float ops.
1859 unsigned VecLimit =
1860 ST->hasMVEIntegerOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1861 InstructionCost VecCost = 0;
1862 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1863 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts / 2);
1864 VecCost += getArithmeticInstrCost(Opcode, VecTy, CostKind);
1865 NumElts /= 2;
1866 }
1867 // For i16/i8, MVE will perform a VREV + VORR/VAND/VEOR for the 64bit vector
1868 // step.
1869 if (ST->hasMVEIntegerOps() && ValVT.getScalarSizeInBits() <= 16 &&
1870 NumElts * EltSize == 64) {
1871 Type *VecTy = FixedVectorType::get(ValTy->getElementType(), NumElts);
1872 VecCost += ST->getMVEVectorCostFactor(CostKind) +
1873 getArithmeticInstrCost(Opcode, VecTy, CostKind);
1874 NumElts /= 2;
1875 }
1876
1877 // From here we extract the elements and perform the and/or/xor.
// One extract per remaining element plus NumElts - 1 scalar operations.
1878 InstructionCost ExtractCost = NumElts;
1879 return VecCost + ExtractCost +
1880 (NumElts - 1) * getArithmeticInstrCost(
1881 Opcode, ValTy->getElementType(), CostKind);
1882 }
1883
// NOTE(review): the end of this condition (listing line 1885) is absent from
// this listing.
1884 if (!ST->hasMVEIntegerOps() || !ValVT.isSimple() || ISD != ISD::ADD ||
1886 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1887
1888 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1889
// Integer add reductions on the three 128-bit MVE integer types: table cost
// scaled by the MVE cost factor and by the legalization split count LT.first.
1890 static const CostTblEntry CostTblAdd[]{
1891 {ISD::ADD, MVT::v16i8, 1},
1892 {ISD::ADD, MVT::v8i16, 1},
1893 {ISD::ADD, MVT::v4i32, 1},
1894 };
1895 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD, LT.second))
1896 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
1897
1898 return BaseT::getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
1899}
1900
// Cost of an extended reduction; the fallback at the end suggests this is the
// ARM override of getExtendedReductionCost.
// NOTE(review): the first line of the signature (return type and name) is
// absent from this listing.
//
// Only ISD::ADD is special-cased: with MVE integer ops and simple value and
// result types, the cost is the MVE cost factor times LT.first when the input
// vector is at most 128 bits and the legalized type / result width pair is one
// of the combinations checked below. All other cases defer to BaseT.
1902 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
1903 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
1904 EVT ValVT = TLI->getValueType(DL, ValTy);
1905 EVT ResVT = TLI->getValueType(DL, ResTy);
1906
1907 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1908
1909 switch (ISD) {
1910 case ISD::ADD:
1911 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1912 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1913
1914 // The legal cases are:
1915 // VADDV u/s 8/16/32
1916 // VADDLV u/s 32
1917 // Codegen currently cannot always handle larger than legal vectors very
1918 // well, especially for predicated reductions where the mask needs to be
1919 // split, so restrict to 128bit or smaller input types.
// RevVTSize is the bit width of the result type. Only v4i32 inputs accept a
// result of up to 64 bits; v16i8 and v8i16 are limited to 32 bits.
1920 unsigned RevVTSize = ResVT.getSizeInBits();
1921 if (ValVT.getSizeInBits() <= 128 &&
1922 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1923 (LT.second == MVT::v8i16 && RevVTSize <= 32) ||
1924 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1925 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1926 }
1927 break;
1928 default:
1929 break;
1930 }
1931 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy, FMF,
1932 CostKind);
1933}
1934
// Cost of a multiply-accumulate reduction.
// NOTE(review): the return type (listing line 1935), the end of the parameter
// list (line 1938) and the statement controlled by the RedOpcode check
// (line 1940) are absent from this listing.
//
// With MVE integer ops and simple value and result types, the cost is the MVE
// cost factor times LT.first when the input vector is at most 128 bits and the
// legalized type / result width pair is one of the combinations checked below.
// Otherwise the generic BaseT implementation is used.
1936ARMTTIImpl::getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode,
1937 Type *ResTy, VectorType *ValTy,
1939 if (RedOpcode != Instruction::Add)
1941 EVT ValVT = TLI->getValueType(DL, ValTy);
1942 EVT ResVT = TLI->getValueType(DL, ResTy);
1943
1944 if (ST->hasMVEIntegerOps() && ValVT.isSimple() && ResVT.isSimple()) {
1945 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
1946
1947 // The legal cases are:
1948 // VMLAV u/s 8/16/32
1949 // VMLALV u/s 16/32
1950 // Codegen currently cannot always handle larger than legal vectors very
1951 // well, especially for predicated reductions where the mask needs to be
1952 // split, so restrict to 128bit or smaller input types.
// RevVTSize is the bit width of the result type. v8i16 and v4i32 inputs accept
// a result of up to 64 bits; v16i8 is limited to 32 bits.
1953 unsigned RevVTSize = ResVT.getSizeInBits();
1954 if (ValVT.getSizeInBits() <= 128 &&
1955 ((LT.second == MVT::v16i8 && RevVTSize <= 32) ||
1956 (LT.second == MVT::v8i16 && RevVTSize <= 64) ||
1957 (LT.second == MVT::v4i32 && RevVTSize <= 64)))
1958 return ST->getMVEVectorCostFactor(CostKind) * LT.first;
1959 }
1960
1961 return BaseT::getMulAccReductionCost(IsUnsigned, RedOpcode, ResTy, ValTy,
1962 CostKind);
1963}
1964
// Cost of a min/max reduction; the fallback at the end suggests this is the
// ARM override of getMinMaxReductionCost.
// NOTE(review): the start of the signature and the line carrying the opening
// brace are absent from this listing; the body uses IID, Ty, FMF and CostKind.
//
// Cases costed here:
//  - minnum/maxnum when the subtarget has scalar FP support for the element
//    type (f32 / f64 / f16);
//  - smin/smax/umin/umax whose legalized type is found in the table below.
// Everything else is deferred to BaseT::getMinMaxReductionCost.
1967 FastMathFlags FMF,
1969 EVT ValVT = TLI->getValueType(DL, Ty);
1970
1971 // In general floating point reductions are a series of elementwise
1972 // operations, with free extracts on each step. These are either in-order or
1973 // treewise depending on whether that is allowed by the fast math flags.
1974 if ((IID == Intrinsic::minnum || IID == Intrinsic::maxnum) &&
1975 ((ValVT.getVectorElementType() == MVT::f32 && ST->hasVFP2Base()) ||
1976 (ValVT.getVectorElementType() == MVT::f64 && ST->hasFP64()) ||
1977 (ValVT.getVectorElementType() == MVT::f16 && ST->hasFullFP16()))) {
1978 unsigned NumElts = cast<FixedVectorType>(Ty)->getNumElements();
1979 unsigned EltSize = ValVT.getScalarSizeInBits();
// Widest vector (in bits) that is operated on directly. The -1 converts to the
// maximum unsigned value, so the halving loop below never runs when neither
// MVE float ops nor NEON are available.
1980 unsigned VecLimit = ST->hasMVEFloatOps() ? 128 : (ST->hasNEON() ? 64 : -1);
1981 InstructionCost VecCost;
// While the vector is wider than VecLimit, charge one vector min/max intrinsic
// on the half-width type and halve NumElts.
1982 while (isPowerOf2_32(NumElts) && NumElts * EltSize > VecLimit) {
1983 Type *VecTy = FixedVectorType::get(Ty->getElementType(), NumElts/2);
1984 IntrinsicCostAttributes ICA(IID, VecTy, {VecTy, VecTy}, FMF);
1985 VecCost += getIntrinsicInstrCost(ICA, CostKind);
1986 NumElts /= 2;
1987 }
1988
1989 // For fp16 we need to extract the upper lane elements. MVE can add a
1990 // VREV+FMIN/MAX to perform another vector step instead.
1991 InstructionCost ExtractCost = 0;
1992 if (ST->hasMVEFloatOps() && ValVT.getVectorElementType() == MVT::f16 &&
1993 NumElts == 8) {
1994 VecCost += ST->getMVEVectorCostFactor(CostKind) * 2;
1995 NumElts /= 2;
1996 } else if (ValVT.getVectorElementType() == MVT::f16)
// Uses the original element count of Ty here, not the halved NumElts.
1997 ExtractCost = cast<FixedVectorType>(Ty)->getNumElements() / 2;
1998
// The remaining NumElts values are combined with NumElts - 1 scalar min/max
// intrinsics.
1999 IntrinsicCostAttributes ICA(IID, Ty->getElementType(),
2000 {Ty->getElementType(), Ty->getElementType()},
2001 FMF);
2002 return VecCost + ExtractCost +
2003 (NumElts - 1) * getIntrinsicInstrCost(ICA, CostKind);
2004 }
2005
2006 if (IID == Intrinsic::smin || IID == Intrinsic::smax ||
2007 IID == Intrinsic::umin || IID == Intrinsic::umax) {
2008 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2009
2010 // All costs are the same for u/s min/max. These lower to vminv, which are
2011 // given a slightly higher cost as they tend to take multiple cycles for
2012 // smaller type sizes.
// The table is keyed on ISD::SMIN only; the lookup below always passes
// ISD::SMIN, whichever of the four intrinsics IID is.
2013 static const CostTblEntry CostTblAdd[]{
2014 {ISD::SMIN, MVT::v16i8, 4},
2015 {ISD::SMIN, MVT::v8i16, 3},
2016 {ISD::SMIN, MVT::v4i32, 2},
2017 };
2018 if (const auto *Entry = CostTableLookup(CostTblAdd, ISD::SMIN, LT.second))
2019 return Entry->Cost * ST->getMVEVectorCostFactor(CostKind) * LT.first;
2020 }
2021
2022 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
2023}
2024
// Cost of an intrinsic call described by ICA. The recursive calls below
// suggest this is the ARM override of getIntrinsicInstrCost.
// NOTE(review): the signature (listing lines 2025-2027) and several interior
// lines are absent from this listing: the declaration of `Pred` (2056), the
// declaration and updates of `Cost` (2134, 2139, 2143, 2151, 2153), the
// trailing arguments of two getCmpSelInstrCost calls (2160, 2162) and the
// final return after the switch (2168).
//
// Each case returns a target-specific cost when the subtarget feature and the
// legalized type match, and otherwise breaks out of the switch.
2028 unsigned Opc = ICA.getID();
2029 switch (Opc) {
2030 case Intrinsic::get_active_lane_mask:
2031 // Currently we make a somewhat optimistic assumption that
2032 // active_lane_mask's are always free. In reality it may be freely folded
2033 // into a tail predicated loop, expanded into a VCPT or expanded into a lot
2034 // of add/icmp code. We may need to improve this in the future, but being
2035 // able to detect if it is free or not involves looking at a lot of other
2036 // code. We currently assume that the vectorizer inserted these, and knew
2037 // what it was doing in adding one.
2038 if (ST->hasMVEIntegerOps())
2039 return 0;
2040 break;
2041 case Intrinsic::sadd_sat:
2042 case Intrinsic::ssub_sat:
2043 case Intrinsic::uadd_sat:
2044 case Intrinsic::usub_sat: {
// IsAdd separates the add forms from the sub forms (it picks Instruction::Add
// or Instruction::Sub in the scalar expansion below); IsSigned separates the
// signed forms from the unsigned ones. IsAdd previously used the same
// expression as IsSigned, which costed ssub_sat as an Add and uadd_sat as a
// Sub.
2045 bool IsAdd = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::uadd_sat);
2046 bool IsSigned = (Opc == Intrinsic::sadd_sat || Opc == Intrinsic::ssub_sat);
2047 Type *RetTy = ICA.getReturnType();
2048
// Scalar integer results: DSP saturating instructions where available,
// otherwise the cost of the expanded sequence.
2049 if (auto *ITy = dyn_cast<IntegerType>(RetTy)) {
2050 if (IsSigned && ST->hasDSP() && ITy->getBitWidth() == 32)
2051 return 1; // qadd / qsub
2052 if (ST->hasDSP() && (ITy->getBitWidth() == 8 || ITy->getBitWidth() == 16))
2053 return 2; // uqadd16 / qadd16 / uqsub16 / qsub16 + possible extend.
2054 // Otherwise return the cost of expanding the node. Generally an add +
2055 // icmp + sel.
2057 Type *CondTy = RetTy->getWithNewBitWidth(1);
2058 return getArithmeticInstrCost(IsAdd ? Instruction::Add : Instruction::Sub,
2059 RetTy, CostKind) +
2060 2 * getCmpSelInstrCost(BinaryOperator::ICmp, RetTy, CondTy, Pred,
2061 CostKind) +
2062 2 * getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy, Pred,
2063 CostKind);
2064 }
2065
2066 if (!ST->hasMVEIntegerOps())
2067 break;
2068
2069 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(RetTy);
2070 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2071 LT.second == MVT::v16i8) {
2072 // This is a base cost of 1 for the vqadd, plus 3 extract shifts if we
2073 // need to extend the type, as it uses shr(qadd(shl, shl)).
2074 unsigned Instrs =
2075 LT.second.getScalarSizeInBits() == RetTy->getScalarSizeInBits() ? 1
2076 : 4;
2077 return LT.first * ST->getMVEVectorCostFactor(CostKind) * Instrs;
2078 }
2079 break;
2080 }
2081 case Intrinsic::abs:
2082 case Intrinsic::smin:
2083 case Intrinsic::smax:
2084 case Intrinsic::umin:
2085 case Intrinsic::umax: {
2086 if (!ST->hasMVEIntegerOps())
2087 break;
2088 Type *VT = ICA.getReturnType();
2089
2090 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2091 if (LT.second == MVT::v4i32 || LT.second == MVT::v8i16 ||
2092 LT.second == MVT::v16i8)
2093 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2094 break;
2095 }
2096 case Intrinsic::minnum:
2097 case Intrinsic::maxnum: {
2098 if (!ST->hasMVEFloatOps())
2099 break;
2100 Type *VT = ICA.getReturnType();
2101 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VT);
2102 if (LT.second == MVT::v4f32 || LT.second == MVT::v8f16)
2103 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2104 break;
2105 }
2106 case Intrinsic::fptosi_sat:
2107 case Intrinsic::fptoui_sat: {
// Without argument types the source FP type cannot be legalized; fall through
// to the code after the switch.
2108 if (ICA.getArgTypes().empty())
2109 break;
2110 bool IsSigned = Opc == Intrinsic::fptosi_sat;
// LT describes the legalized source (floating point) type; MTy is the result
// (integer) value type.
2111 auto LT = getTypeLegalizationCost(ICA.getArgTypes()[0]);
2112 EVT MTy = TLI->getValueType(DL, ICA.getReturnType());
2113 // Check for the legal types, with the correct subtarget features.
2114 if ((ST->hasVFP2Base() && LT.second == MVT::f32 && MTy == MVT::i32) ||
2115 (ST->hasFP64() && LT.second == MVT::f64 && MTy == MVT::i32) ||
2116 (ST->hasFullFP16() && LT.second == MVT::f16 && MTy == MVT::i32))
2117 return LT.first;
2118
2119 // Equally for MVE vector types
2120 if (ST->hasMVEFloatOps() &&
2121 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16) &&
2122 LT.second.getScalarSizeInBits() == MTy.getScalarSizeInBits())
2123 return LT.first * ST->getMVEVectorCostFactor(CostKind);
2124
2125 // If we can we use a legal convert followed by a min+max
2126 if (((ST->hasVFP2Base() && LT.second == MVT::f32) ||
2127 (ST->hasFP64() && LT.second == MVT::f64) ||
2128 (ST->hasFullFP16() && LT.second == MVT::f16) ||
2129 (ST->hasMVEFloatOps() &&
2130 (LT.second == MVT::v4f32 || LT.second == MVT::v8f16))) &&
2131 LT.second.getScalarSizeInBits() >= MTy.getScalarSizeInBits()) {
2132 Type *LegalTy = Type::getIntNTy(ICA.getReturnType()->getContext(),
2133 LT.second.getScalarSizeInBits());
2135 LT.second.isVector() ? ST->getMVEVectorCostFactor(CostKind) : 1;
2136 IntrinsicCostAttributes Attrs1(IsSigned ? Intrinsic::smin
2137 : Intrinsic::umin,
2138 LegalTy, {LegalTy, LegalTy});
2140 IntrinsicCostAttributes Attrs2(IsSigned ? Intrinsic::smax
2141 : Intrinsic::umax,
2142 LegalTy, {LegalTy, LegalTy});
2144 return LT.first * Cost;
2145 }
2146 // Otherwise we need to follow the default expansion that clamps the value
2147 // using a float min/max with a fcmp+sel for nan handling when signed.
2148 Type *FPTy = ICA.getArgTypes()[0];
2149 Type *RetTy = ICA.getReturnType();
2150 IntrinsicCostAttributes Attrs1(Intrinsic::minnum, FPTy, {FPTy, FPTy});
2152 IntrinsicCostAttributes Attrs2(Intrinsic::maxnum, FPTy, {FPTy, FPTy});
2154 Cost +=
2155 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
2156 RetTy, FPTy, TTI::CastContextHint::None, CostKind);
2157 if (IsSigned) {
2158 Type *CondTy = RetTy->getWithNewBitWidth(1);
2159 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, FPTy, CondTy,
2161 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
2163 }
2164 return Cost;
2165 }
2166 }
2167
2169}
2170
// Reports whether a call to function F is expected to remain a real call; the
// fallback calls suggest this is the ARM override of isLoweredToCall.
// NOTE(review): the signature (listing line 2171) is absent from this listing;
// the body uses a function pointer F.
//
// Non-intrinsics and intrinsics not listed below are deferred to BaseT.
2172 if (!F->isIntrinsic())
2173 return BaseT::isLoweredToCall(F);
2174
2175 // Assume all Arm-specific intrinsics map to an instruction.
2176 if (F->getName().starts_with("llvm.arm"))
2177 return false;
2178
2179 switch (F->getIntrinsicID()) {
2180 default: break;
// These math intrinsics are always treated as calls.
2181 case Intrinsic::powi:
2182 case Intrinsic::sin:
2183 case Intrinsic::cos:
2184 case Intrinsic::sincos:
2185 case Intrinsic::pow:
2186 case Intrinsic::log:
2187 case Intrinsic::log10:
2188 case Intrinsic::log2:
2189 case Intrinsic::exp:
2190 case Intrinsic::exp2:
2191 return true;
// These are calls when the return type is double without FP64, or half
// without full FP16; otherwise only when neither FP-ARMv8 nor VFP2 is present.
2192 case Intrinsic::sqrt:
2193 case Intrinsic::fabs:
2194 case Intrinsic::copysign:
2195 case Intrinsic::floor:
2196 case Intrinsic::ceil:
2197 case Intrinsic::trunc:
2198 case Intrinsic::rint:
2199 case Intrinsic::nearbyint:
2200 case Intrinsic::round:
2201 case Intrinsic::canonicalize:
2202 case Intrinsic::lround:
2203 case Intrinsic::llround:
2204 case Intrinsic::lrint:
2205 case Intrinsic::llrint:
2206 if (F->getReturnType()->isDoubleTy() && !ST->hasFP64())
2207 return true;
2208 if (F->getReturnType()->isHalfTy() && !ST->hasFullFP16())
2209 return true;
2210 // Some operations can be handled by vector instructions and assume
2211 // unsupported vectors will be expanded into supported scalar ones.
2212 // TODO Handle scalar operations properly.
2213 return !ST->hasFPARMv8Base() && !ST->hasVFP2Base();
// Masked memory intrinsics are treated as calls unless MVE integer ops exist.
2214 case Intrinsic::masked_store:
2215 case Intrinsic::masked_load:
2216 case Intrinsic::masked_gather:
2217 case Intrinsic::masked_scatter:
2218 return !ST->hasMVEIntegerOps();
// Overflow-checking and saturating add/sub are never treated as calls.
2219 case Intrinsic::sadd_with_overflow:
2220 case Intrinsic::uadd_with_overflow:
2221 case Intrinsic::ssub_with_overflow:
2222 case Intrinsic::usub_with_overflow:
2223 case Intrinsic::sadd_sat:
2224 case Intrinsic::uadd_sat:
2225 case Intrinsic::ssub_sat:
2226 case Intrinsic::usub_sat:
2227 return false;
2228 }
2229
2230 return BaseT::isLoweredToCall(F);
2231}
2232
// Conservatively reports whether instruction I may end up as a call (library
// call or otherwise) after lowering.
// NOTE(review): the signature (listing line 2233) is absent from this listing;
// the body uses an instruction reference I. This is presumably the
// maybeLoweredToCall helper used by the hardware-loop scan - confirm.
2234 unsigned ISD = TLI->InstructionOpcodeToISD(I.getOpcode());
2235 EVT VT = TLI->getValueType(DL, I.getType(), true);
// An operation the target lowering marks as LibCall is a call by definition.
2236 if (TLI->getOperationAction(ISD, VT) == TargetLowering::LibCall)
2237 return true;
2238
2239 // Check if an intrinsic will be lowered to a call and assume that any
2240 // other CallInst will generate a bl.
2241 if (auto *Call = dyn_cast<CallInst>(&I)) {
2242 if (auto *II = dyn_cast<IntrinsicInst>(Call)) {
2243 switch(II->getIntrinsicID()) {
// For the memory intrinsics, a getNumMemOps result of -1 is treated as
// "becomes a call".
2244 case Intrinsic::memcpy:
2245 case Intrinsic::memset:
2246 case Intrinsic::memmove:
2247 return getNumMemOps(II) == -1;
2248 default:
2249 if (const Function *F = Call->getCalledFunction())
2250 return isLoweredToCall(F);
2251 }
2252 }
2253 return true;
2254 }
2255
2256 // FPv5 provides conversions between integer, double-precision,
2257 // single-precision, and half-precision formats.
2258 switch (I.getOpcode()) {
2259 default:
2260 break;
2261 case Instruction::FPToSI:
2262 case Instruction::FPToUI:
2263 case Instruction::SIToFP:
2264 case Instruction::UIToFP:
2265 case Instruction::FPTrunc:
2266 case Instruction::FPExt:
2267 return !ST->hasFPARMv8Base();
2268 }
2269
2270 // FIXME: Unfortunately the approach of checking the Operation Action does
2271 // not catch all cases of Legalization that use library calls. Our
2272 // Legalization step categorizes some transformations into library calls as
2273 // Custom, Expand or even Legal when doing type legalization. So for now
2274 // we have to special case for instance the SDIV of 64bit integers and the
2275 // use of floating point emulation.
2276 if (VT.isInteger() && VT.getSizeInBits() >= 64) {
2277 switch (ISD) {
2278 default:
2279 break;
2280 case ISD::SDIV:
2281 case ISD::UDIV:
2282 case ISD::SREM:
2283 case ISD::UREM:
2284 case ISD::SDIVREM:
2285 case ISD::UDIVREM:
2286 return true;
2287 }
2288 }
2289
2290 // Assume all other non-float operations are supported.
2291 if (!VT.isFloatingPoint())
2292 return false;
2293
2294 // We'll need a library call to handle most floats when using soft.
// Only the opcodes listed below are exempt under soft-float.
2295 if (TLI->useSoftFloat()) {
2296 switch (I.getOpcode()) {
2297 default:
2298 return true;
2299 case Instruction::Alloca:
2300 case Instruction::Load:
2301 case Instruction::Store:
2302 case Instruction::Select:
2303 case Instruction::PHI:
2304 return false;
2305 }
2306 }
2307
2308 // We'll need a libcall to perform double precision operations on a single
2309 // precision only FPU.
2310 if (I.getType()->isDoubleTy() && !ST->hasFP64())
2311 return true;
2312
2313 // Likewise for half precision arithmetic.
2314 if (I.getType()->isHalfTy() && !ST->hasFullFP16())
2315 return true;
2316
2317 return false;
2318}
2319
// Decides whether loop L should become a hardware (low-overhead) loop and, if
// so, fills in HWLoopInfo.
// NOTE(review): the first line of the signature (listing line 2320) is absent
// from this listing; the body uses a loop L and a ScalarEvolution SE. This is
// presumably isHardwareLoopProfitable - confirm.
//
// Returns false when low-overhead loops are unavailable or disabled, when the
// trip count cannot be computed or its type is wider than 32 bits, or when
// the loop or one of its direct sub-loops contains an instruction rejected by
// the scan below. Returns true after setting the HWLoopInfo fields at the end.
2321 AssumptionCache &AC,
2322 TargetLibraryInfo *LibInfo,
2323 HardwareLoopInfo &HWLoopInfo) const {
2324 // Low-overhead branches are only supported in the 'low-overhead branch'
2325 // extension of v8.1-m.
2326 if (!ST->hasLOB() || DisableLowOverheadLoops) {
2327 LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
2328 return false;
2329 }
2330
// NOTE(review): the condition guarding this "No BETC" exit (listing line 2331)
// is absent from this listing.
2332 LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
2333 return false;
2334 }
2335
2336 const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
2337 if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
2338 LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
2339 return false;
2340 }
2341
// Trip count = backedge-taken count + 1.
2342 const SCEV *TripCountSCEV =
2343 SE.getAddExpr(BackedgeTakenCount,
2344 SE.getOne(BackedgeTakenCount->getType()));
2345
2346 // We need to store the trip count in LR, a 32-bit register.
2347 if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
2348 LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
2349 return false;
2350 }
2351
2352 // Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
2353 // point in generating a hardware loop if that's going to happen.
2354
// True for the generic hardware-loop intrinsics listed below.
2355 auto IsHardwareLoopIntrinsic = [](Instruction &I) {
2356 if (auto *Call = dyn_cast<IntrinsicInst>(&I)) {
2357 switch (Call->getIntrinsicID()) {
2358 default:
2359 break;
2360 case Intrinsic::start_loop_iterations:
2361 case Intrinsic::test_start_loop_iterations:
2362 case Intrinsic::loop_decrement:
2363 case Intrinsic::loop_decrement_reg:
2364 return true;
2365 }
2366 }
2367 return false;
2368 };
2369
2370 // Scan the instructions to see if there's any that we know will turn into a
2371 // call or if this loop is already a low-overhead loop or will become a tail
2372 // predicated loop.
2373 bool IsTailPredLoop = false;
// Returns false on the first instruction that may become a call, is a
// hardware-loop intrinsic, or is matched by isa<InlineAsm>. As a side effect
// it sets IsTailPredLoop when an active-lane-mask or MVE VCTP intrinsic is
// seen.
2374 auto ScanLoop = [&](Loop *L) {
2375 for (auto *BB : L->getBlocks()) {
2376 for (auto &I : *BB) {
2377 if (maybeLoweredToCall(I) || IsHardwareLoopIntrinsic(I) ||
2378 isa<InlineAsm>(I)) {
2379 LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
2380 return false;
2381 }
2382 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2383 IsTailPredLoop |=
2384 II->getIntrinsicID() == Intrinsic::get_active_lane_mask ||
2385 II->getIntrinsicID() == Intrinsic::arm_mve_vctp8 ||
2386 II->getIntrinsicID() == Intrinsic::arm_mve_vctp16 ||
2387 II->getIntrinsicID() == Intrinsic::arm_mve_vctp32 ||
2388 II->getIntrinsicID() == Intrinsic::arm_mve_vctp64;
2389 }
2390 }
2391 return true;
2392 };
2393
2394 // Visit inner loops.
2395 for (auto *Inner : *L)
2396 if (!ScanLoop(Inner))
2397 return false;
2398
2399 if (!ScanLoop(L))
2400 return false;
2401
2402 // TODO: Check whether the trip count calculation is expensive. If L is the
2403 // inner loop but we know it has a low trip count, calculating that trip
2404 // count (in the parent loop) may be detrimental.
2405
// Describe the hardware loop: an i32 counter decremented by 1, no nesting, and
// an entry test only when WLS loops are allowed and the loop is not a
// tail-predicated one.
2406 LLVMContext &C = L->getHeader()->getContext();
2407 HWLoopInfo.CounterInReg = true;
2408 HWLoopInfo.IsNestingLegal = false;
2409 HWLoopInfo.PerformEntryTest = AllowWLSLoops && !IsTailPredLoop;
2410 HWLoopInfo.CountType = Type::getInt32Ty(C);
2411 HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
2412 return true;
2413}
2414
// Returns true if instruction I is acceptable inside a tail-predicated loop.
// ICmpCount is a running counter owned by the caller: it is incremented for
// every icmp and every integer min/max intrinsic seen, and the instruction
// that pushes it above 1 is rejected.
2415static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
2416 // We don't allow icmp's, and because we only look at single block loops,
2417 // we simply count the icmps, i.e. there should only be 1 for the backedge.
2418 if (isa<ICmpInst>(&I) && ++ICmpCount > 1)
2419 return false;
2420 // FIXME: This is a workaround for poor cost modelling. Min/Max intrinsics are
2421 // not currently canonical, but soon will be. Code without them uses icmp, and
2422 // so is not tail predicated as per the condition above. In order to get the
2423 // same performance we treat min and max the same as an icmp for tailpred
2424 // purposes for the moment (we often rely on non-tailpred and higher VF's to
2425 // pick more optimal instructions like VQDMULH. They need to be recognized
2426 // directly by the vectorizer).
2427 if (auto *II = dyn_cast<IntrinsicInst>(&I))
2428 if ((II->getIntrinsicID() == Intrinsic::smin ||
2429 II->getIntrinsicID() == Intrinsic::smax ||
2430 II->getIntrinsicID() == Intrinsic::umin ||
2431 II->getIntrinsicID() == Intrinsic::umax) &&
2432 ++ICmpCount > 1)
2433 return false;
2434
// Floating point compares are never accepted.
2435 if (isa<FCmpInst>(&I))
2436 return false;
2437
2438 // We could allow extending/narrowing FP loads/stores, but codegen is
2439 // too inefficient so reject this for now.
// NOTE(review): the condition guarding the next return (listing line 2440) is
// absent from this listing; as shown here the return would be unconditional.
// Presumably it tests for FP extend/truncate instructions - confirm.
2441 return false;
2442
2443 // Extends have to be extending-loads
// i.e. the extended value must be a load whose only use is this extend.
2444 if (isa<SExtInst>(&I) || isa<ZExtInst>(&I) )
2445 if (!I.getOperand(0)->hasOneUse() || !isa<LoadInst>(I.getOperand(0)))
2446 return false;
2447
2448 // Truncs have to be narrowing-stores
// i.e. the truncated value must have a single user, and that user is a store.
2449 if (isa<TruncInst>(&I) )
2450 if (!I.hasOneUse() || !isa<StoreInst>(*I.user_begin()))
2451 return false;
2452
2453 return true;
2454}
2455
2456// To set up a tail-predicated loop, we need to know the total number of
2457// elements processed by that loop. Thus, we need to determine the element
2458// size and:
2459// 1) it should be uniform for all operations in the vector loop, so we
2460// e.g. don't want any widening/narrowing operations.
2461// 2) it should be smaller than i64s because we don't have vector operations
2462// that work on i64s.
2463// 3) we don't want elements to be reversed or shuffled, to make sure the
2464// tail-predication masks/predicates the right lanes.
2465//
// Returns true only if every live-out value and every instruction in loop L
// passes the checks below.
// NOTE(review): the first line of the signature (listing line 2466) and
// several interior lines (2479, 2482-2483, 2504, 2517, 2529) are absent from
// this listing: the declaration of LiveOuts, the initializer of
// ReductionsDisabled, the condition guarding the first `continue`, the
// declaration of Ptr, and the start of the LLVM_DEBUG call for the
// interleaved-stride message. This is presumably canTailPredicateLoop -
// confirm.
2467 const DataLayout &DL,
2468 const LoopAccessInfo *LAI,
2469 const DominatorTree &DT) {
2470 LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
2471
2472 // If there are live-out values, it is probably a reduction. We can predicate
2473 // most reduction operations freely under MVE using a combination of
2474 // prefer-predicated-reduction-select and inloop reductions. We limit this to
2475 // floating point and integer reductions, but don't check for operators
2476 // specifically here. If the value ends up not being a reduction (and so the
2477 // vectorizer cannot tailfold the loop), we should fall back to standard
2478 // vectorization automatically.
2480 LiveOuts = llvm::findDefsUsedOutsideOfLoop(L);
2481 bool ReductionsDisabled =
2484
// Any live-out must be an integer, float or half value, and is only tolerated
// when reductions are not disabled.
2485 for (auto *I : LiveOuts) {
2486 if (!I->getType()->isIntegerTy() && !I->getType()->isFloatTy() &&
2487 !I->getType()->isHalfTy()) {
2488 LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer/float "
2489 "live-out value\n");
2490 return false;
2491 }
2492 if (ReductionsDisabled) {
2493 LLVM_DEBUG(dbgs() << "Reductions not enabled\n");
2494 return false;
2495 }
2496 }
2497
2498 // Next, check that all instructions can be tail-predicated.
2499 PredicatedScalarEvolution PSE = LAI->getPSE();
// Shared across all instructions so that at most one icmp / integer min-max is
// accepted in the whole loop.
2500 int ICmpCount = 0;
2501
2502 for (BasicBlock *BB : L->blocks()) {
2503 for (Instruction &I : *BB) {
2505 continue;
2506 if (!canTailPredicateInstruction(I, ICmpCount)) {
2507 LLVM_DEBUG(dbgs() << "Instruction not allowed: "; I.dump());
2508 return false;
2509 }
2510
// Result types with elements wider than 32 bits are rejected.
2511 Type *T = I.getType();
2512 if (T->getScalarSizeInBits() > 32) {
2513 LLVM_DEBUG(dbgs() << "Unsupported Type: "; T->dump());
2514 return false;
2515 }
2516 if (isa<StoreInst>(I) || isa<LoadInst>(I)) {
2518 Type *AccessTy = getLoadStoreType(&I);
// A stride that getPtrStride cannot determine is treated as 0 here.
2519 int64_t NextStride =
2520 getPtrStride(PSE, AccessTy, Ptr, L, DT).value_or(0);
2521 if (NextStride == 1) {
2522 // TODO: for now only allow consecutive strides of 1. We could support
2523 // other strides as long as it is uniform, but let's keep it simple
2524 // for now.
2525 continue;
// Rejects stride -1, and strides 2 and 4 when the supported interleave factor
// reaches them. The debug text only mentions a stride of 2.
2526 } else if (NextStride == -1 ||
2527 (NextStride == 2 && MVEMaxSupportedInterleaveFactor >= 2) ||
2528 (NextStride == 4 && MVEMaxSupportedInterleaveFactor >= 4)) {
2530 << "Consecutive strides of 2 found, vld2/vstr2 can't "
2531 "be tail-predicated\n.");
2532 return false;
2533 // TODO: don't tail predicate if there is a reversed load?
2534 } else if (EnableMaskedGatherScatters) {
2535 // Gather/scatters do allow loading from arbitrary strides, at
2536 // least if they are loop invariant.
2537 // TODO: Loop variant strides should in theory work, too, but
2538 // this requires further testing.
2539 const SCEV *PtrScev = PSE.getSE()->getSCEV(Ptr);
2540 if (auto AR = dyn_cast<SCEVAddRecExpr>(PtrScev)) {
2541 const SCEV *Step = AR->getStepRecurrence(*PSE.getSE());
2542 if (PSE.getSE()->isLoopInvariant(Step, L))
2543 continue;
2544 }
2545 }
// Reached for any other stride, including the gather/scatter case when the
// pointer is not an add-recurrence with a loop-invariant step.
2546 LLVM_DEBUG(dbgs() << "Bad stride found, can't "
2547 "tail-predicate\n.");
2548 return false;
2549 }
2550 }
2551 }
2552
2553 LLVM_DEBUG(dbgs() << "tail-predication: all instructions allowed!\n");
2554 return true;
2555}
2556
// Decides whether the vectorizer should tail-fold the loop described by TFI
// rather than emit a scalar epilogue.
// NOTE(review): the signature (listing line 2557) and the declarations of SE
// and AC (lines 2590-2591) are absent from this listing. The debug strings
// name the function preferTailFoldingOverEpilogue - confirm against the
// header.
//
// Returns false unless tail predication is enabled, MVE integer ops are
// available, the loop is a single block, and every hardware-loop and
// tail-predication check below succeeds.
2558 if (!EnableTailPredication) {
2559 LLVM_DEBUG(dbgs() << "Tail-folding not enabled.\n");
2560 return false;
2561 }
2562
2563 // Creating a tail-folded vector loop is the first step for generating a
2564 // tail-folded hardware loop, for which we need the MVE masked
2565 // load/stores instructions:
2566 if (!ST->hasMVEIntegerOps())
2567 return false;
2568
2569 LoopVectorizationLegality *LVL = TFI->LVL;
2570 Loop *L = LVL->getLoop();
2571
2572 // For now, restrict this to single block loops.
2573 if (L->getNumBlocks() > 1) {
2574 LLVM_DEBUG(dbgs() << "preferTailFoldingOverEpilogue: not a single block "
2575 "loop.\n");
2576 return false;
2577 }
2578
2579 assert(L->isInnermost() &&
2580 "preferTailFoldingOverEpilogue: inner-loop expected");
2581
2582 LoopInfo *LI = LVL->getLoopInfo();
2583 HardwareLoopInfo HWLoopInfo(L);
2584 if (!HWLoopInfo.canAnalyze(*LI)) {
2585 LLVM_DEBUG(dbgs() << "preferTailFoldingOverEpilogue: hardware-loop is not "
2586 "analyzable.\n");
2587 return false;
2588 }
2589
2592
2593 // This checks if we have the low-overhead branch architecture
2594 // extension, and if we will create a hardware-loop:
2595 if (!isHardwareLoopProfitable(L, *SE, *AC, TFI->TLI, HWLoopInfo)) {
2596 LLVM_DEBUG(dbgs() << "preferTailFoldingOverEpilogue: hardware-loop is not "
2597 "profitable.\n");
2598 return false;
2599 }
2600
2601 DominatorTree *DT = LVL->getDominatorTree();
2602 if (!HWLoopInfo.isHardwareLoopCandidate(*SE, *LI, *DT)) {
2603 LLVM_DEBUG(dbgs() << "preferTailFoldingOverEpilogue: hardware-loop is not "
2604 "a candidate.\n");
2605 return false;
2606 }
2607
// The final answer comes from the per-instruction tail-predication check.
2608 return canTailPredicateLoop(L, LI, *SE, DL, LVL->getLAI(),
2609 *LVL->getDominatorTree());
2610}
2611
// Chooses between two results depending on whether MVE integer ops are
// available and tail predication is enabled.
// NOTE(review): the signature (listing line 2612) and both return statements
// (lines 2614 and 2620) are absent from this listing, so the function name and
// the returned values cannot be confirmed from here. Presumably this is the
// tail-folding style hook - confirm.
2613 if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
2615
2616 // Intrinsic @llvm.get.active.lane.mask is supported.
2617 // It is used in the MVETailPredication pass, which requires the number of
2618 // elements processed by this vector loop to setup the tail-predicated
2619 // loop.
2621}
2624 OptimizationRemarkEmitter *ORE) const {
2625 // Enable Upper bound unrolling universally, providing that we do not see an
2626 // active lane mask, which will be better kept as a loop to become tail
2627 // predicated than to be conditionally unrolled.
 // Only the instructions of the loop header are scanned for the intrinsic.
2628 UP.UpperBound =
2629 !ST->hasMVEIntegerOps() || !any_of(*L->getHeader(), [](Instruction &I) {
2630 return isa<IntrinsicInst>(I) &&
2631 cast<IntrinsicInst>(I).getIntrinsicID() ==
2632 Intrinsic::get_active_lane_mask;
2633 });
2634
2635 // Only currently enable these preferences for M-Class cores.
 // Every other core falls back to the base implementation's preferences.
2636 if (!ST->isMClass())
2637 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2638
2639 // Disable loop unrolling for Oz and Os.
2640 UP.OptSizeThreshold = 0;
2642 if (L->getHeader()->getParent()->hasOptSize())
2643 return;
2644
2645 SmallVector<BasicBlock*, 4> ExitingBlocks;
2646 L->getExitingBlocks(ExitingBlocks);
2647 LLVM_DEBUG(dbgs() << "Loop has:\n"
2648 << "Blocks: " << L->getNumBlocks() << "\n"
2649 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2650
2651 // Only allow another exit other than the latch. This acts as an early exit
2652 // as it mirrors the profitability calculation of the runtime unroller.
2653 if (ExitingBlocks.size() > 2)
2654 return;
2655
2656 // Limit the CFG of the loop body for targets with a branch predictor.
2657 // Allowing 4 blocks permits if-then-else diamonds in the body.
2658 if (ST->hasBranchPredictor() && L->getNumBlocks() > 4)
2659 return;
2660
2661 // Don't unroll vectorized loops, including the remainder loop
2662 if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized"))
2663 return;
2664
2665 // Scan the loop: don't unroll loops with calls as this could prevent
2666 // inlining.
 // The same walk also accumulates the per-instruction cost into Cost.
2668 for (auto *BB : L->getBlocks()) {
2669 for (auto &I : *BB) {
2670 // Don't unroll vectorised loop. MVE does not benefit from it as much as
2671 // scalar code.
2672 if (I.getType()->isVectorTy())
2673 return;
2674
 // A call or invoke with a known callee that is not lowered to a real
 // call is skipped: it neither blocks unrolling nor adds to Cost. Any
 // other call or invoke abandons the unrolling preferences.
2675 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2676 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2677 if (!isLoweredToCall(F))
2678 continue;
2679 }
2680 return;
2681 }
2682
2683 SmallVector<const Value*, 4> Operands(I.operand_values());
2684 Cost += getInstructionCost(&I, Operands,
2686 }
2687 }
2688
2689 // On v6m cores, there are very few registers available. We can easily end up
2690 // spilling and reloading more registers in an unrolled loop. Look at the
2691 // number of LCSSA phis as a rough measure of how many registers will need to
2692 // be live out of the loop, reducing the default unroll count if more than 1
2693 // value is needed. In the long run, all of this should be being learnt by a
2694 // machine.
2695 unsigned UnrollCount = 4;
2696 if (ST->isThumb1Only()) {
2697 unsigned ExitingValues = 0;
2699 L->getExitBlocks(ExitBlocks);
2700 for (auto *Exit : ExitBlocks) {
2701 // Count the number of LCSSA phis. Exclude values coming from GEP's as
2702 // only the last is expected to be needed for address operands.
2703 unsigned LiveOuts = count_if(Exit->phis(), [](auto &PH) {
2704 return PH.getNumOperands() != 1 ||
2705 !isa<GetElementPtrInst>(PH.getOperand(0));
2706 });
 // Keep the largest live-out count seen over all exit blocks.
2707 ExitingValues = ExitingValues < LiveOuts ? LiveOuts : ExitingValues;
2708 }
 // Integer-divide the default count by that maximum; if the result is 1 or
 // less, return without setting the remaining preferences.
2709 if (ExitingValues)
2710 UnrollCount /= ExitingValues;
2711 if (UnrollCount <= 1)
2712 return;
2713 }
2714
2715 // For processors with low overhead branching (LOB), runtime unrolling the
2716 // innermost loop is often detrimental to performance. In these cases the loop
2717 // remainder gets unrolled into a series of compare-and-jump blocks, which in
2718 // deeply nested loops get executed multiple times, negating the benefits of
2719 // LOB. This is particularly noticeable when the loop trip count of the
2720 // innermost loop varies within the outer loop, such as in the case of
2721 // triangular matrix decompositions. In these cases we will prefer to not
2722 // unroll the innermost loop, with the intention for it to be executed as a
2723 // low overhead loop.
2724 bool Runtime = true;
2725 if (ST->hasLOB()) {
 // Runtime unrolling is switched off when L's parent is not the outermost
 // loop (L sits at least two levels below it), or when L is nested and its
 // backedge-taken count is not invariant in the outermost loop.
2727 const SCEV *BETC = SE.getBackedgeTakenCount(L);
2728 auto *Outer = L->getOutermostLoop();
2729 if ((L != Outer && Outer != L->getParentLoop()) ||
2730 (L != Outer && BETC && !SE.isLoopInvariant(BETC, Outer))) {
2731 Runtime = false;
2732 }
2733 }
2734 }
2735
2736 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2737 LLVM_DEBUG(dbgs() << "Default Runtime Unroll Count: " << UnrollCount << "\n");
2738
2739 UP.Partial = true;
2740 UP.Runtime = Runtime;
2741 UP.UnrollRemainder = true;
2743 UP.UnrollAndJam = true;
2745
2746 // Forcing small loops to unroll can be very useful because of the
2747 // branch-taken cost of the backedge.
 // NOTE(review): the listing skips a line just before this assignment, so
 // any condition guarding it is not visible here.
2749 UP.Force = true;
2750}
2751
2756
 // Without MVE integer ops the answer is always false.
2758 if (!ST->hasMVEIntegerOps())
2759 return false;
2760
 // Only add recurrences qualify, and only when the scalar element width of
 // Ty is at most 64 bits; every other recurrence kind yields false.
2761 unsigned ScalarBits = Ty->getScalarSizeInBits();
2762 switch (Kind) {
2763 case RecurKind::Add:
2764 return ScalarBits <= 64;
2765 default:
2766 return false;
2767 }
2768}
2769
 // The result mirrors MVE integer support: false without it, true with it.
2771 if (!ST->hasMVEIntegerOps())
2772 return false;
2773 return true;
2774}
2775
2777 StackOffset BaseOffset,
2778 bool HasBaseReg, int64_t Scale,
2779 unsigned AddrSpace) const {
 // Copy the addressing-mode components into AM so the target lowering can be
 // asked whether the combination is legal. The fixed and scalable parts of
 // BaseOffset are stored separately.
2781 AM.BaseGV = BaseGV;
2782 AM.BaseOffs = BaseOffset.getFixed();
2783 AM.HasBaseReg = HasBaseReg;
2784 AM.Scale = Scale;
2785 AM.ScalableOffset = BaseOffset.getScalable();
2786 if (getTLI()->isLegalAddressingMode(DL, AM, Ty, AddrSpace)) {
 // A legal mode costs 0, except that a negative scale costs 1 on subtargets
 // for which hasFPAO() is true.
2787 if (ST->hasFPAO())
2788 return AM.Scale < 0 ? 1 : 0; // positive offsets execute faster
2789 return 0;
2790 }
 // NOTE(review): the value returned for an illegal addressing mode is not
 // visible in this listing.
2792}
2793
2795 // MVE only has 8 vector registers, so we should consider register pressure to
2796 // avoid vectorizing when the cost of spills exceeds the gains from
2797 // vectorization.
 // The answer is therefore exactly whether MVE integer ops are available.
2798 return ST->hasMVEIntegerOps();
2799}
2800
2801bool ARMTTIImpl::hasArmWideBranch(bool Thumb) const {
2802 if (Thumb) {
2803 // B.W is available in any Thumb2-supporting target, and also in every
2804 // version of Armv8-M, even Baseline which does not include the rest of
2805 // Thumb2.
2806 return ST->isThumb2() || ST->hasV8MBaselineOps();
2807 } else {
2808 // B is available in all versions of the Arm ISA, so the only question is
2809 // whether that ISA is available at all.
2810 return ST->hasARMOps();
2811 }
2812}
2813
2814/// Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth
2815/// of the vector elements.
2816static bool areExtractExts(Value *Ext1, Value *Ext2) {
2817 using namespace PatternMatch;
2818
2819 auto areExtDoubled = [](Instruction *Ext) {
2820 return Ext->getType()->getScalarSizeInBits() ==
2821 2 * Ext->getOperand(0)->getType()->getScalarSizeInBits();
2822 };
2823
2824 if (!match(Ext1, m_ZExtOrSExt(m_Value())) ||
2825 !match(Ext2, m_ZExtOrSExt(m_Value())) ||
2826 !areExtDoubled(cast<Instruction>(Ext1)) ||
2827 !areExtDoubled(cast<Instruction>(Ext2)))
2828 return false;
2829
2830 return true;
2831}
2832
2833/// Check if sinking \p I's operands to I's basic block is profitable, because
2834/// the operands can be folded into a target instruction, e.g.
2835/// sext/zext can be folded into vsubl.
/// The uses that should be sunk are appended to \p Ops. Only instructions
/// producing a vector type are considered.
2837 SmallVectorImpl<Use *> &Ops) const {
2838 using namespace PatternMatch;
2839
2840 if (!I->getType()->isVectorTy())
2841 return false;
2842
 // NEON: only an add or sub whose two operands are both width-doubling
 // extends is accepted; both operand uses are then recorded. With NEON
 // present, nothing after this block is reached.
2843 if (ST->hasNEON()) {
2844 switch (I->getOpcode()) {
2845 case Instruction::Sub:
2846 case Instruction::Add: {
2847 if (!areExtractExts(I->getOperand(0), I->getOperand(1)))
2848 return false;
2849 Ops.push_back(&I->getOperandUse(0));
2850 Ops.push_back(&I->getOperandUse(1));
2851 return true;
2852 }
2853 default:
2854 return false;
2855 }
2856 }
2857
2858 if (!ST->hasMVEIntegerOps())
2859 return false;
2860
 // True when I has exactly one use and that user is an fsub taking I as its
 // second operand.
2861 auto IsFMSMul = [&](Instruction *I) {
2862 if (!I->hasOneUse())
2863 return false;
2864 auto *Sub = cast<Instruction>(*I->users().begin());
2865 return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
2866 };
 // True when either of I's first two operands is an fneg.
2867 auto IsFMS = [&](Instruction *I) {
2868 if (match(I->getOperand(0), m_FNeg(m_Value())) ||
2869 match(I->getOperand(1), m_FNeg(m_Value())))
2870 return true;
2871 return false;
2872 };
2873
 // Decides whether instruction I accepts a sunk operand at position Operand.
 // The first group of opcodes accepts any position; sub, fsub and the shifts
 // accept only operand 1; fmul and the fma intrinsic are rejected when the
 // IsFMSMul / IsFMS checks above fire. The intrinsic cases follow the same
 // split between "any position" and "operand 1 only".
2874 auto IsSinker = [&](Instruction *I, int Operand) {
2875 switch (I->getOpcode()) {
2876 case Instruction::Add:
2877 case Instruction::Mul:
2878 case Instruction::FAdd:
2879 case Instruction::ICmp:
2880 case Instruction::FCmp:
2881 return true;
2882 case Instruction::FMul:
2883 return !IsFMSMul(I);
2884 case Instruction::Sub:
2885 case Instruction::FSub:
2886 case Instruction::Shl:
2887 case Instruction::LShr:
2888 case Instruction::AShr:
2889 return Operand == 1;
2890 case Instruction::Call:
2891 if (auto *II = dyn_cast<IntrinsicInst>(I)) {
2892 switch (II->getIntrinsicID()) {
2893 case Intrinsic::fma:
2894 return !IsFMS(I);
2895 case Intrinsic::sadd_sat:
2896 case Intrinsic::uadd_sat:
2897 case Intrinsic::arm_mve_add_predicated:
2898 case Intrinsic::arm_mve_mul_predicated:
2899 case Intrinsic::arm_mve_qadd_predicated:
2900 case Intrinsic::arm_mve_vhadd:
2901 case Intrinsic::arm_mve_hadd_predicated:
2902 case Intrinsic::arm_mve_vqdmull:
2903 case Intrinsic::arm_mve_vqdmull_predicated:
2904 case Intrinsic::arm_mve_vqdmulh:
2905 case Intrinsic::arm_mve_qdmulh_predicated:
2906 case Intrinsic::arm_mve_vqrdmulh:
2907 case Intrinsic::arm_mve_qrdmulh_predicated:
2908 case Intrinsic::arm_mve_fma_predicated:
2909 return true;
2910 case Intrinsic::ssub_sat:
2911 case Intrinsic::usub_sat:
2912 case Intrinsic::arm_mve_sub_predicated:
2913 case Intrinsic::arm_mve_qsub_predicated:
2914 case Intrinsic::arm_mve_hsub_predicated:
2915 case Intrinsic::arm_mve_vhsub:
2916 return Operand == 1;
2917 default:
2918 return false;
2919 }
2920 }
2921 return false;
2922 default:
2923 return false;
2924 }
2925 };
2926
2927 for (auto OpIdx : enumerate(I->operands())) {
2928 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
2929 // Make sure we are not already sinking this operand
2930 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
2931 continue;
2932
 // Look through a single bitcast to find the underlying shuffle.
2933 Instruction *Shuffle = Op;
2934 if (Shuffle->getOpcode() == Instruction::BitCast)
2935 Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
2936 // We are looking for a splat that can be sunk.
2937 if (!Shuffle || !match(Shuffle, m_Shuffle(m_InsertElt(m_Undef(), m_Value(),
2938 m_ZeroInt()),
2939 m_Undef(), m_ZeroMask())))
2940 continue;
2941 if (!IsSinker(I, OpIdx.index()))
2942 continue;
2943
2944 // All uses of the shuffle should be sunk to avoid duplicating it across gpr
2945 // and vector registers
 // A single non-sinkable user makes the whole query return false.
2946 for (Use &U : Op->uses()) {
2947 Instruction *Insn = cast<Instruction>(U.getUser());
2948 if (!IsSinker(Insn, U.getOperandNo()))
2949 return false;
2950 }
2951
 // Record the shuffle's first operand use, then the bitcast's operand use
 // (only when a bitcast was looked through), then the use in I itself.
2952 Ops.push_back(&Shuffle->getOperandUse(0));
2953 if (Shuffle != Op)
2954 Ops.push_back(&Op->getOperandUse(0));
2955 Ops.push_back(&OpIdx.value());
2956 }
 // NOTE(review): this return is reached even when the loop added nothing to
 // Ops.
2957 return true;
2958}
2959
2961 Type *ArrayType) const {
 // Padding can be switched off through the UseWidenGlobalArrays option.
 // NOTE(review): every other early exit below returns 0; `false` here
 // converts to the same value but reads inconsistently.
2962 if (!UseWidenGlobalArrays) {
2963 LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n");
2964 return false;
2965 }
2966
2967 // Don't modify non-integer array types
2968 if (!ArrayType || !ArrayType->isArrayTy() ||
2970 return 0;
2971
2972 // We pad to 4 byte boundaries
 // A size that is already a multiple of 4 needs no padding.
2973 if (Size % 4 == 0)
2974 return 0;
2975
 // Bytes needed to round Size up to the next multiple of 4, and the resulting
 // padded size.
2976 unsigned NumBytesToPad = 4 - (Size % 4);
2977 unsigned NewSize = Size + NumBytesToPad;
2978
2979 // Max number of bytes that memcpy allows for lowering to load/stores before
2980 // it uses library function (__aeabi_memcpy).
2981 unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold();
2982
 // Padding is skipped when the padded size would exceed that threshold.
2983 if (NewSize > MaxMemIntrinsicSize)
2984 return 0;
2985
2986 return NumBytesToPad;
2987}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
static bool areExtractExts(Value *Ext1, Value *Ext2)
Check if Ext1 and Ext2 are extends of the same type, doubling the bitwidth of the vector elements.
This file implements a class to represent arbitrary precision integral constant values and operations...
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< int > ArmForceUnrollThreshold("arm-force-unroll-threshold", cl::init(12), cl::Hidden, cl::desc("Threshold for forced unrolling of small loops in Arm architecture"))
static Value * isSSATMinMaxPattern(Instruction *Inst, const APInt &Imm)
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE, const DataLayout &DL, const LoopAccessInfo *LAI, const DominatorTree &DT)
static cl::opt< bool > AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops"))
static Value * simplifyNeonVld1(const IntrinsicInst &II, unsigned MemAlign, InstCombiner::BuilderTy &Builder)
Convert a vector load intrinsic into a simple llvm load instruction.
static bool isFPSatMinMaxPattern(Instruction *Inst, const APInt &Imm)
static cl::opt< bool > UseWidenGlobalArrays("widen-global-strings", cl::Hidden, cl::init(true), cl::desc("Enable the widening of global strings to alignment boundaries"))
cl::opt< bool > EnableMaskedGatherScatters
static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount)
cl::opt< TailPredication::Mode > EnableTailPredication
static cl::opt< bool > DisableLowOverheadLoops("disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops"))
static cl::opt< bool > EnableMaskedLoadStores("enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores"))
This file a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
Cost tables and simple lookup functions.
Hexagon Common GEP
This file provides the interface for the instcombine pass implementation.
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static cl::opt< unsigned > UnrollCount("unroll-count", cl::Hidden, cl::desc("Use this unroll count for all loops including those with " "unroll_count pragma values, for testing purposes"))
This file defines the LoopVectorizationLegality class.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
static const Function * getCalledFunction(const Value *V)
#define T
MachineInstr unsigned OpIdx
ConstantRange Range(APInt(BitWidth, Low), APInt(BitWidth, High))
uint64_t IntrinsicInst * II
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
Value * RHS
Value * LHS
Class for arbitrary precision integers.
Definition APInt.h:78
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:652
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
InstructionCost getGatherScatterOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getAddressComputationCost(Type *Val, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const override
InstructionCost getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getMemcpyCost(const Instruction *I) const override
bool maybeLoweredToCall(Instruction &I) const
bool preferInLoopReduction(RecurKind Kind, Type *Ty) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *ValTy, TTI::TargetCostKind CostKind) const override
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) const override
bool hasArmWideBranch(bool Thumb) const override
bool shouldConsiderVectorizationRegPressure() const override
bool preferTailFoldingOverEpilogue(TailFoldingInfo *TFI) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
int getNumMemOps(const IntrinsicInst *I) const
Given a memcpy/memset/memmove instruction, return the number of memory operations performed,...
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
InstructionCost getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty) const override
bool isLoweredToCall(const Function *F) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl< Use * > &Ops) const override
Check if sinking I's operands to I's basic block is profitable, because the operands can be folded in...
uint64_t getMaxMemIntrinsicInlineSizeThreshold() const override
bool isLegalMaskedStore(Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
std::optional< Value * > simplifyDemandedVectorEltsIntrinsic(InstCombiner &IC, IntrinsicInst &II, APInt DemandedElts, APInt &UndefElts, APInt &UndefElts2, APInt &UndefElts3, std::function< void(Instruction *, unsigned, APInt, APInt &)> SimplifyAndSetOp) const override
bool isLegalMaskedLoad(Type *DataTy, Align Alignment, unsigned AddressSpace, TTI::MaskKind MaskKind=TTI::MaskKind::VariableOrConstantMask) const override
TailFoldingStyle getPreferredTailFoldingStyle() const override
InstructionCost getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind, Instruction *Inst=nullptr) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
std::optional< Instruction * > instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
TTI::AddressingModeKind getPreferredAddressingMode(const Loop *L, ScalarEvolution *SE) const override
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
bool preferPredicatedReductionSelect() const override
bool isLegalMaskedGather(Type *Ty, Align Alignment) const override
unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const override
bool isProfitableLSRChainElement(Instruction *I) const override
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
InstructionCost getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, StackOffset BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace) const override
getScalingFactorCost - Return the cost of the scaling used in addressing mode represented by AM.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
Class to represent array types.
A cache of @llvm.assume calls within a function.
static LLVM_ABI Attribute getWithAlignment(LLVMContext &Context, Align Alignment)
Return a uniquified Attribute object that has the specific alignment set.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
InstructionCost getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef< unsigned > Indices, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond=false, bool UseMaskForGaps=false) const override
InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
TTI::ShuffleKind improveShuffleKindFromMask(TTI::ShuffleKind Kind, ArrayRef< int > Mask, VectorType *SrcTy, int &Index, VectorType *&SubTy) const
bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV, int64_t BaseOffset, bool HasBaseReg, int64_t Scale, unsigned AddrSpace, Instruction *I=nullptr, int64_t ScalableOffset=0) const override
InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask, TTI::TargetCostKind CostKind, int Index, VectorType *SubTp, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const override
InstructionCost getScalarizationOverhead(VectorType *InTy, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Op2Info={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
InstructionCost getCallInstrCost(Function *F, Type *RetTy, ArrayRef< Type * > Tys, TTI::TargetCostKind CostKind) const override
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE, TTI::UnrollingPreferences &UP, OptimizationRemarkEmitter *ORE) const override
void getPeelingPreferences(Loop *L, ScalarEvolution &SE, TTI::PeelingPreferences &PP) const override
InstructionCost getMulAccReductionCost(bool IsUnsigned, unsigned RedOpcode, Type *ResTy, VectorType *Ty, TTI::TargetCostKind CostKind) const override
InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind, const Instruction *I=nullptr) const override
std::pair< InstructionCost, MVT > getTypeLegalizationCost(Type *Ty) const
InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index, const Value *Op0, const Value *Op1, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const override
InstructionCost getExtendedReductionCost(unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind) const override
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const override
InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *, const SCEV *, TTI::TargetCostKind) const override
InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override
InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo={TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I=nullptr) const override
static LLVM_ABI BinaryOperator * Create(BinaryOps Op, Value *S1, Value *S2, const Twine &Name=Twine(), InsertPosition InsertBefore=nullptr)
Construct a binary instruction, given the opcode and the two operands.
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
@ ICMP_SLE
signed less or equal
Definition InstrTypes.h:770
@ ICMP_SGT
signed greater than
Definition InstrTypes.h:767
@ FCMP_UNO
1 0 0 0 True if unordered: isnan(X) | isnan(Y)
Definition InstrTypes.h:750
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:457
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:181
The core instruction combiner logic.
const DataLayout & getDataLayout() const
virtual Instruction * eraseInstFromFunction(Instruction &I)=0
Combiner aware instruction erasure.
DominatorTree & getDominatorTree() const
Instruction * replaceInstUsesWith(Instruction &I, Value *V)
A combiner-aware RAUW-like routine.
virtual bool SimplifyDemandedBits(Instruction *I, unsigned OpNo, const APInt &DemandedMask, KnownBits &Known, const SimplifyQuery &Q, unsigned Depth=0)=0
IRBuilder< TargetFolder, IRBuilderInstCombineInserter > BuilderTy
An IRBuilder that automatically inserts new instructions into the worklist.
Instruction * replaceOperand(Instruction &I, unsigned OpNum, Value *V)
Replace operand of instruction and add old operand to the worklist.
AssumptionCache & getAssumptionCache() const
static InstructionCost getInvalid(CostType Val=0)
Instruction * user_back()
Specialize the methods defined in Value, as we know that an instruction can only be used by other ins...
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isShift() const
const SmallVectorImpl< Type * > & getArgTypes() const
A wrapper class for inspecting calls to intrinsic functions.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Drive the analysis of memory accesses in the loop.
const PredicatedScalarEvolution & getPSE() const
Used to add runtime SCEV checks.
LoopVectorizationLegality checks if it is legal to vectorize a loop, and to what vectorization factor...
AssumptionCache * getAssumptionCache() const
const LoopAccessInfo * getLAI() const
ScalarEvolution * getScalarEvolution() const
Represents a single loop in the control flow graph.
Definition LoopInfo.h:40
Information for memory intrinsic cost model.
const Instruction * getInst() const
The optimization diagnostic interface.
An interface layer with SCEV used to manage how we see SCEV expressions for values in the context of ...
ScalarEvolution * getSE() const
Returns the ScalarEvolution analysis used.
This class represents an analyzed expression in the program.
LLVM_ABI Type * getType() const
Return the LLVM type of this SCEV expression.
The main scalar evolution driver.
LLVM_ABI const SCEV * getBackedgeTakenCount(const Loop *L, ExitCountKind Kind=Exact)
If the specified loop has a predictable backedge-taken count, return it, otherwise return a SCEVCould...
LLVM_ABI const SCEV * getSCEV(Value *V)
Return a SCEV expression for the full generality of the specified expression.
const SCEV * getOne(Type *Ty)
Return a SCEV for the constant 1 of a specific type.
LLVM_ABI bool isLoopInvariant(const SCEV *S, const Loop *L)
Return true if the value of the given SCEV is unchanging in the specified loop.
LLVM_ABI bool hasLoopInvariantBackedgeTakenCount(const Loop *L)
Return true if the specified loop has an analyzable loop-invariant backedge-taken count.
APInt getUnsignedRangeMax(const SCEV *S)
Determine the max of the unsigned range for a particular SCEV.
LLVM_ABI const SCEV * getAddExpr(SmallVectorImpl< SCEVUse > &Ops, SCEV::NoWrapFlags Flags=SCEV::FlagAnyWrap, unsigned Depth=0)
Get a canonical add expression, or something simpler if possible.
static LLVM_ABI bool isDeInterleaveMaskOfFactor(ArrayRef< int > Mask, unsigned Factor, unsigned &Index)
Check if the mask is a DE-interleave mask of the given factor Factor like: <Index,...
static LLVM_ABI bool isInterleaveMask(ArrayRef< int > Mask, unsigned Factor, unsigned NumInputElts, SmallVectorImpl< unsigned > &StartIndexes)
Return true if the mask interleaves one or more input vectors together.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
StackOffset holds a fixed and a scalable offset in bytes.
Definition TypeSize.h:30
static StackOffset getScalable(int64_t Scalable)
Definition TypeSize.h:40
static StackOffset getFixed(int64_t Fixed)
Definition TypeSize.h:39
Provides information about what library functions are available for the current target.
virtual bool isLoweredToCall(const Function *F) const
bool isConstantStridedAccessLessThan(ScalarEvolution *SE, const SCEV *Ptr, int64_t MergeDistance) const
InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TTI::TargetCostKind CostKind) const override
VectorInstrContext
Represents a hint about the context in which an insert/extract is used.
@ None
The insert/extract is not used with a load/store.
MaskKind
Some targets only support masked load/store with a constant mask.
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCK_Latency
The latency of instruction.
static bool requiresOrderedReduction(std::optional< FastMathFlags > FMF)
A helper function to determine the type of reduction algorithm used for a given Opcode and set of Fas...
@ TCC_Expensive
The cost of a 'div' instruction on x86.
AddressingModeKind
Which addressing mode Loop Strength Reduction will try to generate.
@ AMK_PostIndexed
Prefer post-indexed addressing mode.
@ AMK_PreIndexed
Prefer pre-indexed addressing mode.
@ AMK_None
Don't prefer any addressing mode.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_Select
Selects elements from the corresponding lane of either source operand.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_Reverse
Reverse the order of the vector.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
CastContextHint
Represents a hint about the context in which a cast is used.
@ Masked
The cast is used with a masked load/store.
@ Normal
The cast is used with a normal load/store.
This class represents a truncation of integer types.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
bool isArrayTy() const
True if this is an instance of ArrayType.
Definition Type.h:279
LLVM_ABI bool isScalableTy(SmallPtrSetImpl< const Type * > &Visited) const
Return true if this is a type whose size is a known multiple of vscale.
Definition Type.cpp:61
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isIntOrIntVectorTy() const
Return true if this is an integer type or a vector of integer types.
Definition Type.h:263
Type * getArrayElementType() const
Definition Type.h:425
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:313
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
const Use & getOperandUse(unsigned i) const
Definition User.h:220
Value * getOperand(unsigned i) const
Definition User.h:207
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
user_iterator user_begin()
Definition Value.h:402
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:147
Base class of all SIMD vector types.
ElementCount getElementCount() const
Return an ElementCount instance to represent the (possibly scalable) number of elements in the vector...
Type * getElementType() const
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting a 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
ISD namespace - This namespace contains an enum which represents all of the SelectionDAG node types a...
Definition ISDOpcodes.h:24
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< LHS, RHS, Instruction::Xor > m_Xor(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
auto m_Value()
Match an arbitrary value and ignore it.
auto m_Constant()
Match an arbitrary Constant and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
auto m_Undef()
Match an arbitrary undef constant.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
const CostTblEntryT< CostType > * CostTableLookup(ArrayRef< CostTblEntryT< CostType > > Tbl, int ISD, MVT Ty)
Find in cost table.
Definition CostTable.h:35
LLVM_ABI bool getBooleanLoopAttribute(const Loop *TheLoop, StringRef Name)
Returns true if Name is applied to TheLoop and enabled.
InstructionCost Cost
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
TypeConversionCostTblEntryT< unsigned > TypeConversionCostTblEntry
Definition CostTable.h:61
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
@ Runtime
Detect stack use after return unless it is disabled at runtime with (ASAN_OPTIONS=detect_stack_use_after_return=0).
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
Align getKnownAlignment(Value *V, const DataLayout &DL, const Instruction *CxtI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr)
Try to infer an alignment for the specified pointer.
Definition Local.h:253
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI SmallVector< Instruction *, 8 > findDefsUsedOutsideOfLoop(Loop *L)
Returns the instructions that use values defined in the loop.
SelectPatternFlavor
Specific patterns of select instructions we can match.
@ SPF_ABS
Absolute value.
@ SPF_FMAXNUM
Floating point maxnum.
@ SPF_UMIN
Unsigned minimum.
@ SPF_UMAX
Unsigned maximum.
@ SPF_SMAX
Signed maximum.
@ SPF_FMINNUM
Floating point minnum.
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
LLVM_ABI SelectPatternResult matchSelectPattern(Value *V, Value *&LHS, Value *&RHS, Instruction::CastOps *CastOp=nullptr, unsigned Depth=0)
Pattern match integer [SU]MIN, [SU]MAX and ABS idioms, returning the kind and providing the out param...
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
RecurKind
These are the kinds of recurrences that we support.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
DWARFExpression::Operation Op
CostTblEntryT< unsigned > CostTblEntry
Definition CostTable.h:30
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
@ DataWithoutLaneMask
Same as Data, but avoids using the get.active.lane.mask intrinsic to calculate the mask and instead i...
@ Data
Use predicate only to mask operations on data in the loop.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool ShouldCheckWrap=true, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
If the pointer has a constant stride return it in units of the access type size.
const TypeConversionCostTblEntryT< CostType > * ConvertCostTableLookup(ArrayRef< TypeConversionCostTblEntryT< CostType > > Tbl, int ISD, MVT Dst, MVT Src)
Find in type conversion cost table.
Definition CostTable.h:66
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
Extended Value Type.
Definition ValueTypes.h:35
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
Attributes of a target dependent hardware loop.
LLVM_ABI bool canAnalyze(LoopInfo &LI)
LLVM_ABI bool isHardwareLoopCandidate(ScalarEvolution &SE, LoopInfo &LI, DominatorTree &DT, bool ForceNestedLoop=false, bool ForceHardwareLoopPHI=false)
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
static MemOp Set(uint64_t Size, bool DstAlignCanChange, Align DstAlign, bool IsZeroMemset, bool IsVolatile)
static MemOp Copy(uint64_t Size, bool DstAlignCanChange, Align DstAlign, Align SrcAlign, bool IsVolatile, bool MemcpyStrSrc=false)
SelectPatternFlavor Flavor
TargetLibraryInfo * TLI
LoopVectorizationLegality * LVL
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*vscale.
Parameters that control the generic loop unrolling transformation.
bool UpperBound
Allow using trip count upper bound to unroll loops.
bool Force
Apply loop unroll on any kind of loop (mainly to loops that fail runtime unrolling).
unsigned PartialOptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size, like OptSizeThreshold,...
unsigned DefaultUnrollRuntimeCount
Default unroll count for loops with run-time trip count.
unsigned UnrollAndJamInnerLoopThreshold
Threshold for unroll and jam, for inner loop size.
bool UnrollAndJam
Allow unroll and jam. Used to enable unroll and jam for the target.
bool UnrollRemainder
Allow unrolling of all the iterations of the runtime loop remainder.
bool Runtime
Allow runtime unrolling (unrolling of loops to expand the size of the loop body even when the number ...
bool Partial
Allow partial unrolling (unrolling of loops to expand the size of the loop body, not only to eliminat...
unsigned OptSizeThreshold
The cost threshold for the unrolled loop when optimizing for size (set to UINT_MAX to disable).