1//===-- RISCVTargetTransformInfo.cpp - RISC-V specific TTI ----------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
11#include "llvm/ADT/STLExtras.h"
18#include "llvm/IR/IntrinsicsRISCV.h"
20#include <cmath>
21#include <optional>
22using namespace llvm;
23using namespace llvm::PatternMatch;
24
25#define DEBUG_TYPE "riscvtti"
26
28 "riscv-v-register-bit-width-lmul",
30 "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
31 "by autovectorized code. Fractional LMULs are not supported."),
33
35 "riscv-v-slp-max-vf",
37 "Overrides result used for getMaximumVF query which is used "
38 "exclusively by SLP vectorizer."),
40
42 RVVMinTripCount("riscv-v-min-trip-count",
43 cl::desc("Set the lower bound of a trip count to decide on "
44 "vectorization while tail-folding."),
46
48RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
50 // Check if the type is valid for all CostKind
51 if (!VT.isVector())
53 size_t NumInstr = OpCodes.size();
55 return NumInstr;
56 InstructionCost LMULCost = TLI->getLMULCost(VT);
58 return LMULCost * NumInstr;
59 InstructionCost Cost = 0;
60 for (auto Op : OpCodes) {
61 switch (Op) {
62 case RISCV::VRGATHER_VI:
63 Cost += TLI->getVRGatherVICost(VT);
64 break;
65 case RISCV::VRGATHER_VV:
66 Cost += TLI->getVRGatherVVCost(VT);
67 break;
68 case RISCV::VSLIDEUP_VI:
69 case RISCV::VSLIDEDOWN_VI:
70 Cost += TLI->getVSlideVICost(VT);
71 break;
72 case RISCV::VSLIDEUP_VX:
73 case RISCV::VSLIDEDOWN_VX:
74 Cost += TLI->getVSlideVXCost(VT);
75 break;
76 case RISCV::VREDMAX_VS:
77 case RISCV::VREDMIN_VS:
78 case RISCV::VREDMAXU_VS:
79 case RISCV::VREDMINU_VS:
80 case RISCV::VREDSUM_VS:
81 case RISCV::VREDAND_VS:
82 case RISCV::VREDOR_VS:
83 case RISCV::VREDXOR_VS:
84 case RISCV::VFREDMAX_VS:
85 case RISCV::VFREDMIN_VS:
86 case RISCV::VFREDUSUM_VS: {
87 unsigned VL = VT.getVectorMinNumElements();
88 if (!VT.isFixedLengthVector())
89 VL *= *getVScaleForTuning();
90 Cost += Log2_32_Ceil(VL);
91 break;
92 }
93 case RISCV::VFREDOSUM_VS: {
94 unsigned VL = VT.getVectorMinNumElements();
95 if (!VT.isFixedLengthVector())
96 VL *= *getVScaleForTuning();
97 Cost += VL;
98 break;
99 }
100 case RISCV::VMV_X_S:
101 case RISCV::VMV_S_X:
102 case RISCV::VFMV_F_S:
103 case RISCV::VFMV_S_F:
104 case RISCV::VMOR_MM:
105 case RISCV::VMXOR_MM:
106 case RISCV::VMAND_MM:
107 case RISCV::VMANDN_MM:
108 case RISCV::VMNAND_MM:
109 case RISCV::VCPOP_M:
110 case RISCV::VFIRST_M:
111 Cost += 1;
112 break;
113 default:
114 Cost += LMULCost;
115 }
116 }
117 return Cost;
118}
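// Illustrative example (assumes getVScaleForTuning() == 2): costing
// RISCV::VREDSUM_VS on MVT::nxv4i32 uses VL = 4 * 2 = 8 and adds
// Log2_32_Ceil(8) = 3, modelling the log2(VL) depth of a tree reduction,
// while RISCV::VFREDOSUM_VS would add the full VL of 8 for the ordered case.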
119
121 const RISCVSubtarget *ST,
122 const APInt &Imm, Type *Ty,
124 bool FreeZeroes) {
125 assert(Ty->isIntegerTy() &&
126 "getIntImmCost can only estimate cost of materialising integers");
127
128 // We have a Zero register, so 0 is always free.
129 if (Imm == 0)
130 return TTI::TCC_Free;
131
132 // Otherwise, we check how many instructions it will take to materialise.
133 return RISCVMatInt::getIntMatCost(Imm, DL.getTypeSizeInBits(Ty), *ST,
134 /*CompressionCost=*/false, FreeZeroes);
135}
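// Illustrative example: Imm == 0 is free via the zero register, while a
// constant such as 0x12345678 is typically materialised with a lui+addi
// pair, for which RISCVMatInt::getIntMatCost reports a cost of 2.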
136
140 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind, false);
141}
142
143// Look for patterns of shift followed by AND that can be turned into a pair of
144// shifts. We won't need to materialize an immediate for the AND so these can
145// be considered free.
146static bool canUseShiftPair(Instruction *Inst, const APInt &Imm) {
147 uint64_t Mask = Imm.getZExtValue();
148 auto *BO = dyn_cast<BinaryOperator>(Inst->getOperand(0));
149 if (!BO || !BO->hasOneUse())
150 return false;
151
152 if (BO->getOpcode() != Instruction::Shl)
153 return false;
154
155 if (!isa<ConstantInt>(BO->getOperand(1)))
156 return false;
157
158 unsigned ShAmt = cast<ConstantInt>(BO->getOperand(1))->getZExtValue();
159 // (and (shl x, c2), c1) will be matched to (srli (slli x, c2+c3), c3) if c1
160 // is a mask shifted by c2 bits with c3 leading zeros.
161 if (isShiftedMask_64(Mask)) {
162 unsigned Trailing = llvm::countr_zero(Mask);
163 if (ShAmt == Trailing)
164 return true;
165 }
166
167 return false;
168}
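// Illustrative example: for
//   %s = shl i64 %x, 4
//   %a = and i64 %s, 0xFF0
// the mask 0xFF0 is a shifted mask whose trailing zero count (4) matches the
// shift amount, so the AND immediate never needs to be materialised; codegen
// can form (srli (slli x, 56), 52) instead, as described above.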
169
 170// If this is an i64 AND that is part of ((X & -(1 << C1) & 0xffffffff) == C2 << C1),
171// DAGCombiner can convert this to (sraiw X, C1) == sext(C2) for RV64. On RV32,
172// the type will be split so only the lower 32 bits need to be compared using
173// (srai/srli X, C) == C2.
174static bool canUseShiftCmp(Instruction *Inst, const APInt &Imm) {
175 if (!Inst->hasOneUse())
176 return false;
177
178 // Look for equality comparison.
179 auto *Cmp = dyn_cast<ICmpInst>(*Inst->user_begin());
180 if (!Cmp || !Cmp->isEquality())
181 return false;
182
183 // Right hand side of comparison should be a constant.
184 auto *C = dyn_cast<ConstantInt>(Cmp->getOperand(1));
185 if (!C)
186 return false;
187
188 uint64_t Mask = Imm.getZExtValue();
189
190 // Mask should be of the form -(1 << C) in the lower 32 bits.
191 if (!isUInt<32>(Mask) || !isPowerOf2_32(-uint32_t(Mask)))
192 return false;
193
194 // Comparison constant should be a subset of Mask.
195 uint64_t CmpC = C->getZExtValue();
196 if ((CmpC & Mask) != CmpC)
197 return false;
198
199 // We'll need to sign extend the comparison constant and shift it right. Make
200 // sure the new constant can use addi/xori+seqz/snez.
201 unsigned ShiftBits = llvm::countr_zero(Mask);
202 int64_t NewCmpC = SignExtend64<32>(CmpC) >> ShiftBits;
203 return NewCmpC >= -2048 && NewCmpC <= 2048;
204}
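// Illustrative example: for an i64 pattern ((X & 0xFFFFFFF0) == 32), the mask
// 0xFFFFFFF0 is -(1 << 4) in the low 32 bits and 32 is a subset of it, so the
// shifted comparison constant becomes SignExtend64<32>(32) >> 4 == 2, which
// fits the addi/xori range and lets the AND immediate be treated as free.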
205
207 const APInt &Imm, Type *Ty,
209 Instruction *Inst) const {
210 assert(Ty->isIntegerTy() &&
211 "getIntImmCost can only estimate cost of materialising integers");
212
213 // We have a Zero register, so 0 is always free.
214 if (Imm == 0)
215 return TTI::TCC_Free;
216
217 // Some instructions in RISC-V can take a 12-bit immediate. Some of these are
 218 // commutative; in others, the immediate comes from a specific argument index.
219 bool Takes12BitImm = false;
220 unsigned ImmArgIdx = ~0U;
221
222 switch (Opcode) {
223 case Instruction::GetElementPtr:
224 // Never hoist any arguments to a GetElementPtr. CodeGenPrepare will
225 // split up large offsets in GEP into better parts than ConstantHoisting
226 // can.
227 return TTI::TCC_Free;
228 case Instruction::Store: {
 229 // Use the materialization cost regardless of whether it's the address or the
 230 // value that is constant, except when the store is misaligned and
231 // misaligned accesses are not legal (experience shows constant hoisting
232 // can sometimes be harmful in such cases).
233 if (Idx == 1 || !Inst)
234 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
235 /*FreeZeroes=*/true);
236
237 StoreInst *ST = cast<StoreInst>(Inst);
238 if (!getTLI()->allowsMemoryAccessForAlignment(
239 Ty->getContext(), DL, getTLI()->getValueType(DL, Ty),
240 ST->getPointerAddressSpace(), ST->getAlign()))
241 return TTI::TCC_Free;
242
243 return getIntImmCostImpl(getDataLayout(), getST(), Imm, Ty, CostKind,
244 /*FreeZeroes=*/true);
245 }
246 case Instruction::Load:
247 // If the address is a constant, use the materialization cost.
248 return getIntImmCost(Imm, Ty, CostKind);
249 case Instruction::And:
250 // zext.h
251 if (Imm == UINT64_C(0xffff) && ST->hasStdExtZbb())
252 return TTI::TCC_Free;
253 // zext.w
254 if (Imm == UINT64_C(0xffffffff) &&
255 ((ST->hasStdExtZba() && ST->isRV64()) || ST->isRV32()))
256 return TTI::TCC_Free;
257 // bclri
258 if (ST->hasStdExtZbs() && (~Imm).isPowerOf2())
259 return TTI::TCC_Free;
260 if (Inst && Idx == 1 && Imm.getBitWidth() <= ST->getXLen() &&
261 canUseShiftPair(Inst, Imm))
262 return TTI::TCC_Free;
263 if (Inst && Idx == 1 && Imm.getBitWidth() == 64 &&
264 canUseShiftCmp(Inst, Imm))
265 return TTI::TCC_Free;
266 Takes12BitImm = true;
267 break;
268 case Instruction::Add:
269 Takes12BitImm = true;
270 break;
271 case Instruction::Or:
272 case Instruction::Xor:
273 // bseti/binvi
274 if (ST->hasStdExtZbs() && Imm.isPowerOf2())
275 return TTI::TCC_Free;
276 Takes12BitImm = true;
277 break;
278 case Instruction::Mul:
279 // Power of 2 is a shift. Negated power of 2 is a shift and a negate.
280 if (Imm.isPowerOf2() || Imm.isNegatedPowerOf2())
281 return TTI::TCC_Free;
282 // One more or less than a power of 2 can use SLLI+ADD/SUB.
283 if ((Imm + 1).isPowerOf2() || (Imm - 1).isPowerOf2())
284 return TTI::TCC_Free;
285 // FIXME: There is no MULI instruction.
286 Takes12BitImm = true;
287 break;
288 case Instruction::Sub:
289 case Instruction::Shl:
290 case Instruction::LShr:
291 case Instruction::AShr:
292 Takes12BitImm = true;
293 ImmArgIdx = 1;
294 break;
295 default:
296 break;
297 }
298
299 if (Takes12BitImm) {
300 // Check immediate is the correct argument...
301 if (Instruction::isCommutative(Opcode) || Idx == ImmArgIdx) {
302 // ... and fits into the 12-bit immediate.
303 if (Imm.getSignificantBits() <= 64 &&
304 getTLI()->isLegalAddImmediate(Imm.getSExtValue())) {
305 return TTI::TCC_Free;
306 }
307 }
308
309 // Otherwise, use the full materialisation cost.
310 return getIntImmCost(Imm, Ty, CostKind);
311 }
312
313 // By default, prevent hoisting.
314 return TTI::TCC_Free;
315}
316
319 const APInt &Imm, Type *Ty,
321 // Prevent hoisting in unknown cases.
322 return TTI::TCC_Free;
323}
324
326 return ST->hasVInstructions();
327}
328
330RISCVTTIImpl::getPopcntSupport(unsigned TyWidth) const {
331 assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
332 return ST->hasCPOPLike() ? TTI::PSK_FastHardware : TTI::PSK_Software;
333}
334
336 unsigned Opcode, Type *InputTypeA, Type *InputTypeB, Type *AccumType,
338 TTI::PartialReductionExtendKind OpBExtend, std::optional<unsigned> BinOp,
340
341 // zve32x is broken for partial_reduce_umla, but let's make sure we
342 // don't generate them.
343 if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 ||
344 Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul ||
345 InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) ||
346 !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4))
348
349 Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4));
350 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
 351 // Note: Assuming all vqdot* variants have equal cost
352 return LT.first *
353 getRISCVInstructionCost(RISCV::VQDOT_VV, LT.second, CostKind);
354}
355
357 // Currently, the ExpandReductions pass can't expand scalable-vector
358 // reductions, but we still request expansion as RVV doesn't support certain
359 // reductions and the SelectionDAG can't legalize them either.
360 switch (II->getIntrinsicID()) {
361 default:
362 return false;
363 // These reductions have no equivalent in RVV
364 case Intrinsic::vector_reduce_mul:
365 case Intrinsic::vector_reduce_fmul:
366 return true;
367 }
368}
369
370std::optional<unsigned> RISCVTTIImpl::getMaxVScale() const {
371 if (ST->hasVInstructions())
372 return ST->getRealMaxVLen() / RISCV::RVVBitsPerBlock;
373 return BaseT::getMaxVScale();
374}
375
376std::optional<unsigned> RISCVTTIImpl::getVScaleForTuning() const {
377 if (ST->hasVInstructions())
378 if (unsigned MinVLen = ST->getRealMinVLen();
379 MinVLen >= RISCV::RVVBitsPerBlock)
380 return MinVLen / RISCV::RVVBitsPerBlock;
382}
383
386 unsigned LMUL =
387 llvm::bit_floor(std::clamp<unsigned>(RVVRegisterWidthLMUL, 1, 8));
388 switch (K) {
390 return TypeSize::getFixed(ST->getXLen());
392 return TypeSize::getFixed(
393 ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
396 (ST->hasVInstructions() &&
397 ST->getRealMinVLen() >= RISCV::RVVBitsPerBlock)
399 : 0);
400 }
401
402 llvm_unreachable("Unsupported register kind");
403}
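// Illustrative example (assumes riscv-v-register-bit-width-lmul=2 on a
// subtarget whose minimum VLEN is 256): the LMUL override is clamped to
// [1, 8] and rounded down to a power of two, so the fixed-width vector
// register query above reports 2 * 256 = 512 bits.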
404
406RISCVTTIImpl::getConstantPoolLoadCost(Type *Ty,
408 // Add a cost of address generation + the cost of the load. The address
409 // is expected to be a PC relative offset to a constant pool entry
410 // using auipc/addi.
411 return 2 + getMemoryOpCost(Instruction::Load, Ty, DL.getABITypeAlign(Ty),
412 /*AddressSpace=*/0, CostKind);
413}
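// Illustrative example: a vrgather index vector that has to come from the
// constant pool is costed as 2 (auipc+addi address generation) plus the
// normal load cost for its type; the shuffle costing below uses this for
// mask and index constants.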
414
415static bool isRepeatedConcatMask(ArrayRef<int> Mask, int &SubVectorSize) {
416 unsigned Size = Mask.size();
417 if (!isPowerOf2_32(Size))
418 return false;
419 for (unsigned I = 0; I != Size; ++I) {
420 if (static_cast<unsigned>(Mask[I]) == I)
421 continue;
422 if (Mask[I] != 0)
423 return false;
424 if (Size % I != 0)
425 return false;
426 for (unsigned J = I + 1; J != Size; ++J)
427 // Check the pattern is repeated.
428 if (static_cast<unsigned>(Mask[J]) != J % I)
429 return false;
430 SubVectorSize = I;
431 return true;
432 }
433 // That means Mask is <0, 1, 2, 3>. This is not a concatenation.
434 return false;
435}
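// Illustrative example: the mask <0,1,0,1,0,1,0,1> repeats a two-element
// pattern, so SubVectorSize is set to 2 and the shuffle is treated as a
// repeated concatenation, whereas the identity mask <0,1,2,3> reaches the
// fallthrough and returns false.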
436
438 LLVMContext &C) {
439 assert((DataVT.getScalarSizeInBits() != 8 ||
440 DataVT.getVectorNumElements() <= 256) && "unhandled case in lowering");
441 MVT IndexVT = DataVT.changeTypeToInteger();
442 if (IndexVT.getScalarType().bitsGT(ST.getXLenVT()))
443 IndexVT = IndexVT.changeVectorElementType(MVT::i16);
444 return cast<VectorType>(EVT(IndexVT).getTypeForEVT(C));
445}
446
447/// Attempt to approximate the cost of a shuffle which will require splitting
448/// during legalization. Note that processShuffleMasks is not an exact proxy
449/// for the algorithm used in LegalizeVectorTypes, but hopefully it's a
 450 /// reasonably close upper bound.
452 MVT LegalVT, VectorType *Tp,
453 ArrayRef<int> Mask,
455 assert(LegalVT.isFixedLengthVector() && !Mask.empty() &&
456 "Expected fixed vector type and non-empty mask");
457 unsigned LegalNumElts = LegalVT.getVectorNumElements();
458 // Number of destination vectors after legalization:
459 unsigned NumOfDests = divideCeil(Mask.size(), LegalNumElts);
460 // We are going to permute multiple sources and the result will be in
 461 // multiple destinations. We only provide an accurate cost for splits where
462 // the element type remains the same.
463 if (NumOfDests <= 1 ||
465 Tp->getElementType()->getPrimitiveSizeInBits() ||
466 LegalNumElts >= Tp->getElementCount().getFixedValue())
468
469 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
470 unsigned LegalVTSize = LegalVT.getStoreSize();
471 // Number of source vectors after legalization:
472 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
473
474 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(), LegalNumElts);
475
476 unsigned NormalizedVF = LegalNumElts * std::max(NumOfSrcs, NumOfDests);
477 unsigned NumOfSrcRegs = NormalizedVF / LegalNumElts;
478 unsigned NumOfDestRegs = NormalizedVF / LegalNumElts;
479 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
480 assert(NormalizedVF >= Mask.size() &&
481 "Normalized mask expected to be not shorter than original mask.");
482 copy(Mask, NormalizedMask.begin());
483 InstructionCost Cost = 0;
484 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
486 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
487 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
488 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
489 return;
490 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
491 .second)
492 return;
493 Cost += TTI.getShuffleCost(
495 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
496 SingleOpTy, RegMask, CostKind, 0, nullptr);
497 },
498 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
499 Cost += TTI.getShuffleCost(
501 FixedVectorType::get(SingleOpTy->getElementType(), RegMask.size()),
502 SingleOpTy, RegMask, CostKind, 0, nullptr);
503 });
504 return Cost;
505}
506
507/// Try to perform better estimation of the permutation.
508/// 1. Split the source/destination vectors into real registers.
509/// 2. Do the mask analysis to identify which real registers are
 510 /// permuted. If more than one source register is used to build the
 511 /// destination register, the cost for this destination register
 512 /// is (Number_of_source_registers - 1) * Cost_PermuteTwoSrc. If only one
 513 /// source register is used, build the mask and calculate the cost as the cost
514/// of PermuteSingleSrc.
515/// Also, for the single register permute we try to identify if the
516/// destination register is just a copy of the source register or the
517/// copy of the previous destination register (the cost is
518/// TTI::TCC_Basic). If the source register is just reused, the cost for
519/// this operation is 0.
520static InstructionCost
522 std::optional<unsigned> VLen, VectorType *Tp,
524 assert(LegalVT.isFixedLengthVector());
525 if (!VLen || Mask.empty())
527 MVT ElemVT = LegalVT.getVectorElementType();
528 unsigned ElemsPerVReg = *VLen / ElemVT.getFixedSizeInBits();
529 LegalVT = TTI.getTypeLegalizationCost(
530 FixedVectorType::get(Tp->getElementType(), ElemsPerVReg))
531 .second;
532 // Number of destination vectors after legalization:
533 InstructionCost NumOfDests =
534 divideCeil(Mask.size(), LegalVT.getVectorNumElements());
535 if (NumOfDests <= 1 ||
537 Tp->getElementType()->getPrimitiveSizeInBits() ||
538 LegalVT.getVectorNumElements() >= Tp->getElementCount().getFixedValue())
540
541 unsigned VecTySize = TTI.getDataLayout().getTypeStoreSize(Tp);
542 unsigned LegalVTSize = LegalVT.getStoreSize();
543 // Number of source vectors after legalization:
544 unsigned NumOfSrcs = divideCeil(VecTySize, LegalVTSize);
545
546 auto *SingleOpTy = FixedVectorType::get(Tp->getElementType(),
547 LegalVT.getVectorNumElements());
548
549 unsigned E = NumOfDests.getValue();
550 unsigned NormalizedVF =
551 LegalVT.getVectorNumElements() * std::max(NumOfSrcs, E);
552 unsigned NumOfSrcRegs = NormalizedVF / LegalVT.getVectorNumElements();
553 unsigned NumOfDestRegs = NormalizedVF / LegalVT.getVectorNumElements();
554 SmallVector<int> NormalizedMask(NormalizedVF, PoisonMaskElem);
555 assert(NormalizedVF >= Mask.size() &&
556 "Normalized mask expected to be not shorter than original mask.");
557 copy(Mask, NormalizedMask.begin());
558 InstructionCost Cost = 0;
559 int NumShuffles = 0;
560 SmallDenseSet<std::pair<ArrayRef<int>, unsigned>> ReusedSingleSrcShuffles;
562 NormalizedMask, NumOfSrcRegs, NumOfDestRegs, NumOfDestRegs, []() {},
563 [&](ArrayRef<int> RegMask, unsigned SrcReg, unsigned DestReg) {
564 if (ShuffleVectorInst::isIdentityMask(RegMask, RegMask.size()))
565 return;
566 if (!ReusedSingleSrcShuffles.insert(std::make_pair(RegMask, SrcReg))
567 .second)
568 return;
569 ++NumShuffles;
570 Cost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, SingleOpTy,
571 SingleOpTy, RegMask, CostKind, 0, nullptr);
572 },
573 [&](ArrayRef<int> RegMask, unsigned Idx1, unsigned Idx2, bool NewReg) {
574 Cost += TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy,
575 SingleOpTy, RegMask, CostKind, 0, nullptr);
576 NumShuffles += 2;
577 });
578 // Note: check that we do not emit too many shuffles here to prevent code
579 // size explosion.
580 // TODO: investigate, if it can be improved by extra analysis of the masks
581 // to check if the code is more profitable.
582 if ((NumOfDestRegs > 2 && NumShuffles <= static_cast<int>(NumOfDestRegs)) ||
583 (NumOfDestRegs <= 2 && NumShuffles < 4))
584 return Cost;
586}
587
588InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp,
589 ArrayRef<int> Mask,
 592 // Avoid missing masks and length-changing shuffles
592 if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements())
594
595 int NumElts = Tp->getNumElements();
596 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Tp);
597 // Avoid scalarization cases
598 if (!LT.second.isFixedLengthVector())
600
601 // Requires moving elements between parts, which requires additional
602 // unmodeled instructions.
603 if (LT.first != 1)
605
606 auto GetSlideOpcode = [&](int SlideAmt) {
607 assert(SlideAmt != 0);
608 bool IsVI = isUInt<5>(std::abs(SlideAmt));
609 if (SlideAmt < 0)
610 return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX;
611 return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX;
612 };
613
614 std::array<std::pair<int, int>, 2> SrcInfo;
615 if (!isMaskedSlidePair(Mask, NumElts, SrcInfo))
617
618 if (SrcInfo[1].second == 0)
619 std::swap(SrcInfo[0], SrcInfo[1]);
620
621 InstructionCost FirstSlideCost = 0;
622 if (SrcInfo[0].second != 0) {
623 unsigned Opcode = GetSlideOpcode(SrcInfo[0].second);
624 FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
625 }
626
627 if (SrcInfo[1].first == -1)
628 return FirstSlideCost;
629
630 InstructionCost SecondSlideCost = 0;
631 if (SrcInfo[1].second != 0) {
632 unsigned Opcode = GetSlideOpcode(SrcInfo[1].second);
633 SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind);
634 } else {
635 SecondSlideCost =
636 getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind);
637 }
638
639 auto EC = Tp->getElementCount();
640 VectorType *MaskTy =
642 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
643 return FirstSlideCost + SecondSlideCost + MaskCost;
644}
645
648 VectorType *SrcTy, ArrayRef<int> Mask,
649 TTI::TargetCostKind CostKind, int Index,
651 const Instruction *CxtI) const {
652 assert((Mask.empty() || DstTy->isScalableTy() ||
653 Mask.size() == DstTy->getElementCount().getKnownMinValue()) &&
654 "Expected the Mask to match the return size if given");
655 assert(SrcTy->getScalarType() == DstTy->getScalarType() &&
656 "Expected the same scalar types");
657
658 Kind = improveShuffleKindFromMask(Kind, Mask, SrcTy, Index, SubTp);
659 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcTy);
660
661 // First, handle cases where having a fixed length vector enables us to
662 // give a more accurate cost than falling back to generic scalable codegen.
663 // TODO: Each of these cases hints at a modeling gap around scalable vectors.
664 if (auto *FVTp = dyn_cast<FixedVectorType>(SrcTy);
665 FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) {
667 *this, LT.second, ST->getRealVLen(),
668 Kind == TTI::SK_InsertSubvector ? DstTy : SrcTy, Mask, CostKind);
669 if (VRegSplittingCost.isValid())
670 return VRegSplittingCost;
671 switch (Kind) {
672 default:
673 break;
675 if (Mask.size() >= 2) {
676 MVT EltTp = LT.second.getVectorElementType();
677 // If the size of the element is < ELEN then shuffles of interleaves and
678 // deinterleaves of 2 vectors can be lowered into the following
679 // sequences
680 if (EltTp.getScalarSizeInBits() < ST->getELen()) {
681 // Example sequence:
682 // vsetivli zero, 4, e8, mf4, ta, ma (ignored)
683 // vwaddu.vv v10, v8, v9
684 // li a0, -1 (ignored)
685 // vwmaccu.vx v10, a0, v9
686 if (ShuffleVectorInst::isInterleaveMask(Mask, 2, Mask.size()))
687 return 2 * LT.first * TLI->getLMULCost(LT.second);
688
689 if (Mask[0] == 0 || Mask[0] == 1) {
690 auto DeinterleaveMask = createStrideMask(Mask[0], 2, Mask.size());
691 // Example sequence:
692 // vnsrl.wi v10, v8, 0
693 if (equal(DeinterleaveMask, Mask))
694 return LT.first * getRISCVInstructionCost(RISCV::VNSRL_WI,
695 LT.second, CostKind);
696 }
697 }
698 int SubVectorSize;
699 if (LT.second.getScalarSizeInBits() != 1 &&
700 isRepeatedConcatMask(Mask, SubVectorSize)) {
702 unsigned NumSlides = Log2_32(Mask.size() / SubVectorSize);
703 // The cost of extraction from a subvector is 0 if the index is 0.
704 for (unsigned I = 0; I != NumSlides; ++I) {
705 unsigned InsertIndex = SubVectorSize * (1 << I);
706 FixedVectorType *SubTp =
707 FixedVectorType::get(SrcTy->getElementType(), InsertIndex);
708 FixedVectorType *DestTp =
710 std::pair<InstructionCost, MVT> DestLT =
712 // Add the cost of whole vector register move because the
713 // destination vector register group for vslideup cannot overlap the
714 // source.
715 Cost += DestLT.first * TLI->getLMULCost(DestLT.second);
716 Cost += getShuffleCost(TTI::SK_InsertSubvector, DestTp, DestTp, {},
717 CostKind, InsertIndex, SubTp);
718 }
719 return Cost;
720 }
721 }
722
723 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
724 SlideCost.isValid())
725 return SlideCost;
726
727 // vrgather + cost of generating the mask constant.
728 // We model this for an unknown mask with a single vrgather.
729 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
730 LT.second.getVectorNumElements() <= 256)) {
731 VectorType *IdxTy =
732 getVRGatherIndexType(LT.second, *ST, SrcTy->getContext());
733 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
734 return IndexCost +
735 getRISCVInstructionCost(RISCV::VRGATHER_VV, LT.second, CostKind);
736 }
737 break;
738 }
741
742 if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind);
743 SlideCost.isValid())
744 return SlideCost;
745
746 // 2 x (vrgather + cost of generating the mask constant) + cost of mask
747 // register for the second vrgather. We model this for an unknown
748 // (shuffle) mask.
749 if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 ||
750 LT.second.getVectorNumElements() <= 256)) {
751 auto &C = SrcTy->getContext();
752 auto EC = SrcTy->getElementCount();
753 VectorType *IdxTy = getVRGatherIndexType(LT.second, *ST, C);
755 InstructionCost IndexCost = getConstantPoolLoadCost(IdxTy, CostKind);
756 InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind);
757 return 2 * IndexCost +
758 getRISCVInstructionCost({RISCV::VRGATHER_VV, RISCV::VRGATHER_VV},
759 LT.second, CostKind) +
760 MaskCost;
761 }
762 break;
763 }
764 }
765
766 auto shouldSplit = [](TTI::ShuffleKind Kind) {
767 switch (Kind) {
768 default:
769 return false;
773 return true;
774 }
775 };
776
777 if (!Mask.empty() && LT.first.isValid() && LT.first != 1 &&
778 shouldSplit(Kind)) {
779 InstructionCost SplitCost =
780 costShuffleViaSplitting(*this, LT.second, FVTp, Mask, CostKind);
781 if (SplitCost.isValid())
782 return SplitCost;
783 }
784 }
785
786 // Handle scalable vectors (and fixed vectors legalized to scalable vectors).
787 switch (Kind) {
788 default:
789 // Fallthrough to generic handling.
790 // TODO: Most of these cases will return getInvalid in generic code, and
791 // must be implemented here.
792 break;
794 // Extract at zero is always a subregister extract
795 if (Index == 0)
796 return TTI::TCC_Free;
797
798 // If we're extracting a subvector of at most m1 size at a sub-register
799 // boundary - which unfortunately we need exact vlen to identify - this is
800 // a subregister extract at worst and thus won't require a vslidedown.
801 // TODO: Extend for aligned m2, m4 subvector extracts
 802 // TODO: Extend for misaligned (but contained) extracts
803 // TODO: Extend for scalable subvector types
804 if (std::pair<InstructionCost, MVT> SubLT = getTypeLegalizationCost(SubTp);
805 SubLT.second.isValid() && SubLT.second.isFixedLengthVector()) {
806 if (std::optional<unsigned> VLen = ST->getRealVLen();
807 VLen && SubLT.second.getScalarSizeInBits() * Index % *VLen == 0 &&
808 SubLT.second.getSizeInBits() <= *VLen)
809 return TTI::TCC_Free;
810 }
811
812 // Example sequence:
813 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
814 // vslidedown.vi v8, v9, 2
815 return LT.first *
816 getRISCVInstructionCost(RISCV::VSLIDEDOWN_VI, LT.second, CostKind);
818 // Example sequence:
819 // vsetivli zero, 4, e8, mf2, tu, ma (ignored)
820 // vslideup.vi v8, v9, 2
821 LT = getTypeLegalizationCost(DstTy);
822 return LT.first *
823 getRISCVInstructionCost(RISCV::VSLIDEUP_VI, LT.second, CostKind);
824 case TTI::SK_Select: {
825 // Example sequence:
826 // li a0, 90
827 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
828 // vmv.s.x v0, a0
829 // vmerge.vvm v8, v9, v8, v0
830 // We use 2 for the cost of the mask materialization as this is the true
831 // cost for small masks and most shuffles are small. At worst, this cost
832 // should be a very small constant for the constant pool load. As such,
833 // we may bias towards large selects slightly more than truly warranted.
834 return LT.first *
835 (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
836 LT.second, CostKind));
837 }
838 case TTI::SK_Broadcast: {
839 bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
840 Instruction::InsertElement);
841 if (LT.second.getScalarSizeInBits() == 1) {
842 if (HasScalar) {
843 // Example sequence:
844 // andi a0, a0, 1
845 // vsetivli zero, 2, e8, mf8, ta, ma (ignored)
846 // vmv.v.x v8, a0
847 // vmsne.vi v0, v8, 0
848 return LT.first *
849 (1 + getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
850 LT.second, CostKind));
851 }
852 // Example sequence:
853 // vsetivli zero, 2, e8, mf8, ta, mu (ignored)
854 // vmv.v.i v8, 0
855 // vmerge.vim v8, v8, 1, v0
856 // vmv.x.s a0, v8
857 // andi a0, a0, 1
858 // vmv.v.x v8, a0
859 // vmsne.vi v0, v8, 0
860
861 return LT.first *
862 (1 + getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM,
863 RISCV::VMV_X_S, RISCV::VMV_V_X,
864 RISCV::VMSNE_VI},
865 LT.second, CostKind));
866 }
867
868 if (HasScalar) {
869 // Example sequence:
870 // vmv.v.x v8, a0
871 return LT.first *
872 getRISCVInstructionCost(RISCV::VMV_V_X, LT.second, CostKind);
873 }
874
875 // Example sequence:
876 // vrgather.vi v9, v8, 0
877 return LT.first *
878 getRISCVInstructionCost(RISCV::VRGATHER_VI, LT.second, CostKind);
879 }
880 case TTI::SK_Splice: {
881 // vslidedown+vslideup.
882 // TODO: Multiplying by LT.first implies this legalizes into multiple copies
883 // of similar code, but I think we expand through memory.
884 unsigned Opcodes[2] = {RISCV::VSLIDEDOWN_VX, RISCV::VSLIDEUP_VX};
885 if (Index >= 0 && Index < 32)
886 Opcodes[0] = RISCV::VSLIDEDOWN_VI;
887 else if (Index < 0 && Index > -32)
888 Opcodes[1] = RISCV::VSLIDEUP_VI;
889 return LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
890 }
891 case TTI::SK_Reverse: {
892
893 if (!LT.second.isVector())
895
896 // TODO: Cases to improve here:
897 // * Illegal vector types
898 // * i64 on RV32
899 if (SrcTy->getElementType()->isIntegerTy(1)) {
900 VectorType *WideTy =
901 VectorType::get(IntegerType::get(SrcTy->getContext(), 8),
902 cast<VectorType>(SrcTy)->getElementCount());
903 return getCastInstrCost(Instruction::ZExt, WideTy, SrcTy,
905 getShuffleCost(TTI::SK_Reverse, WideTy, WideTy, {}, CostKind, 0,
906 nullptr) +
907 getCastInstrCost(Instruction::Trunc, SrcTy, WideTy,
909 }
910
911 MVT ContainerVT = LT.second;
912 if (LT.second.isFixedLengthVector())
913 ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
914 MVT M1VT = RISCVTargetLowering::getM1VT(ContainerVT);
915 if (ContainerVT.bitsLE(M1VT)) {
916 // Example sequence:
917 // csrr a0, vlenb
918 // srli a0, a0, 3
919 // addi a0, a0, -1
920 // vsetvli a1, zero, e8, mf8, ta, mu (ignored)
921 // vid.v v9
922 // vrsub.vx v10, v9, a0
923 // vrgather.vv v9, v8, v10
924 InstructionCost LenCost = 3;
925 if (LT.second.isFixedLengthVector())
926 // vrsub.vi has a 5 bit immediate field, otherwise an li suffices
927 LenCost = isInt<5>(LT.second.getVectorNumElements() - 1) ? 0 : 1;
928 unsigned Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX, RISCV::VRGATHER_VV};
929 if (LT.second.isFixedLengthVector() &&
930 isInt<5>(LT.second.getVectorNumElements() - 1))
931 Opcodes[1] = RISCV::VRSUB_VI;
932 InstructionCost GatherCost =
933 getRISCVInstructionCost(Opcodes, LT.second, CostKind);
934 return LT.first * (LenCost + GatherCost);
935 }
936
937 // At high LMUL, we split into a series of M1 reverses (see
938 // lowerVECTOR_REVERSE) and then do a single slide at the end to eliminate
939 // the resulting gap at the bottom (for fixed vectors only). The important
940 // bit is that the cost scales linearly, not quadratically with LMUL.
941 unsigned M1Opcodes[] = {RISCV::VID_V, RISCV::VRSUB_VX};
942 InstructionCost FixedCost =
943 getRISCVInstructionCost(M1Opcodes, M1VT, CostKind) + 3;
944 unsigned Ratio =
946 InstructionCost GatherCost =
947 getRISCVInstructionCost({RISCV::VRGATHER_VV}, M1VT, CostKind) * Ratio;
948 InstructionCost SlideCost = !LT.second.isFixedLengthVector() ? 0 :
949 getRISCVInstructionCost({RISCV::VSLIDEDOWN_VX}, LT.second, CostKind);
950 return FixedCost + LT.first * (GatherCost + SlideCost);
951 }
952 }
953 return BaseT::getShuffleCost(Kind, DstTy, SrcTy, Mask, CostKind, Index,
954 SubTp);
955}
956
 957static bool isM1OrSmaller(MVT VT) {
959 return (LMUL == RISCVVType::VLMUL::LMUL_F8 ||
963}
964
966 VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract,
967 TTI::TargetCostKind CostKind, bool ForPoisonSrc,
968 ArrayRef<Value *> VL) const {
971
972 // A build_vector (which is m1 sized or smaller) can be done in no
973 // worse than one vslide1down.vx per element in the type. We could
974 // in theory do an explode_vector in the inverse manner, but our
975 // lowering today does not have a first class node for this pattern.
977 Ty, DemandedElts, Insert, Extract, CostKind);
978 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
979 if (Insert && !Extract && LT.first.isValid() && LT.second.isVector()) {
980 if (Ty->getScalarSizeInBits() == 1) {
981 auto *WideVecTy = cast<VectorType>(Ty->getWithNewBitWidth(8));
982 // Note: Implicit scalar anyextend is assumed to be free since the i1
983 // must be stored in a GPR.
984 return getScalarizationOverhead(WideVecTy, DemandedElts, Insert, Extract,
985 CostKind) +
986 getCastInstrCost(Instruction::Trunc, Ty, WideVecTy,
988 }
989
990 assert(LT.second.isFixedLengthVector());
991 MVT ContainerVT = TLI->getContainerForFixedLengthVector(LT.second);
992 if (isM1OrSmaller(ContainerVT)) {
993 InstructionCost BV =
994 cast<FixedVectorType>(Ty)->getNumElements() *
995 getRISCVInstructionCost(RISCV::VSLIDE1DOWN_VX, LT.second, CostKind);
996 if (BV < Cost)
997 Cost = BV;
998 }
999 }
1000 return Cost;
1001}
1002
1004RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
1005 unsigned AddressSpace,
1007 if (!isLegalMaskedLoadStore(Src, Alignment) ||
1009 return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
1010 CostKind);
1011
1012 return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind);
1013}
1014
1016 unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
1017 Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
1018 bool UseMaskForCond, bool UseMaskForGaps) const {
1019
1020 // The interleaved memory access pass will lower (de)interleave ops combined
1021 // with an adjacent appropriate memory to vlseg/vsseg intrinsics. vlseg/vsseg
1022 // only support masking per-iteration (i.e. condition), not per-segment (i.e.
1023 // gap).
1024 if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
1025 auto *VTy = cast<VectorType>(VecTy);
1026 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(VTy);
 1027 // Need to make sure the type hasn't been scalarized
1028 if (LT.second.isVector()) {
1029 auto *SubVecTy =
1030 VectorType::get(VTy->getElementType(),
1031 VTy->getElementCount().divideCoefficientBy(Factor));
1032 if (VTy->getElementCount().isKnownMultipleOf(Factor) &&
1033 TLI->isLegalInterleavedAccessType(SubVecTy, Factor, Alignment,
1034 AddressSpace, DL)) {
1035
1036 // Some processors optimize segment loads/stores as one wide memory op +
1037 // Factor * LMUL shuffle ops.
1038 if (ST->hasOptimizedSegmentLoadStore(Factor)) {
1040 getMemoryOpCost(Opcode, VTy, Alignment, AddressSpace, CostKind);
1041 MVT SubVecVT = getTLI()->getValueType(DL, SubVecTy).getSimpleVT();
1042 Cost += Factor * TLI->getLMULCost(SubVecVT);
1043 return LT.first * Cost;
1044 }
1045
1046 // Otherwise, the cost is proportional to the number of elements (VL *
1047 // Factor ops).
1048 InstructionCost MemOpCost =
1049 getMemoryOpCost(Opcode, VTy->getElementType(), Alignment, 0,
1050 CostKind, {TTI::OK_AnyValue, TTI::OP_None});
1051 unsigned NumLoads = getEstimatedVLFor(VTy);
1052 return NumLoads * MemOpCost;
1053 }
1054 }
1055 }
1056
 1057 // TODO: Return the cost of interleaved accesses for scalable vectors when
 1058 // unable to convert them to segment access instructions.
1059 if (isa<ScalableVectorType>(VecTy))
1061
1062 auto *FVTy = cast<FixedVectorType>(VecTy);
1063 InstructionCost MemCost =
1064 getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind);
1065 unsigned VF = FVTy->getNumElements() / Factor;
1066
1067 // An interleaved load will look like this for Factor=3:
1068 // %wide.vec = load <12 x i32>, ptr %3, align 4
1069 // %strided.vec = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1070 // %strided.vec1 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1071 // %strided.vec2 = shufflevector %wide.vec, poison, <4 x i32> <stride mask>
1072 if (Opcode == Instruction::Load) {
1073 InstructionCost Cost = MemCost;
1074 for (unsigned Index : Indices) {
1075 FixedVectorType *VecTy =
1076 FixedVectorType::get(FVTy->getElementType(), VF * Factor);
1077 auto Mask = createStrideMask(Index, Factor, VF);
1078 Mask.resize(VF * Factor, -1);
1079 InstructionCost ShuffleCost =
1081 Mask, CostKind, 0, nullptr, {});
1082 Cost += ShuffleCost;
1083 }
1084 return Cost;
1085 }
1086
1087 // TODO: Model for NF > 2
1088 // We'll need to enhance getShuffleCost to model shuffles that are just
1089 // inserts and extracts into subvectors, since they won't have the full cost
1090 // of a vrgather.
1091 // An interleaved store for 3 vectors of 4 lanes will look like
1092 // %11 = shufflevector <4 x i32> %4, <4 x i32> %6, <8 x i32> <0...7>
1093 // %12 = shufflevector <4 x i32> %9, <4 x i32> poison, <8 x i32> <0...3>
1094 // %13 = shufflevector <8 x i32> %11, <8 x i32> %12, <12 x i32> <0...11>
1095 // %interleaved.vec = shufflevector %13, poison, <12 x i32> <interleave mask>
1096 // store <12 x i32> %interleaved.vec, ptr %10, align 4
1097 if (Factor != 2)
1098 return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
1099 Alignment, AddressSpace, CostKind,
1100 UseMaskForCond, UseMaskForGaps);
1101
1102 assert(Opcode == Instruction::Store && "Opcode must be a store");
1103 // For an interleaving store of 2 vectors, we perform one large interleaving
1104 // shuffle that goes into the wide store
1105 auto Mask = createInterleaveMask(VF, Factor);
1106 InstructionCost ShuffleCost =
1108 CostKind, 0, nullptr, {});
1109 return MemCost + ShuffleCost;
1110}
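// Illustrative example: an interleaved (factor 2) store of two <4 x i32>
// operands that cannot become a vsseg is modelled as one <8 x i32>
// interleave shuffle (createInterleaveMask(4, 2)) plus a single wide store,
// i.e. MemCost + ShuffleCost above.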
1111
1113 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1114 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1116 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1117 Alignment, CostKind, I);
1118
1119 if ((Opcode == Instruction::Load &&
1120 !isLegalMaskedGather(DataTy, Align(Alignment))) ||
1121 (Opcode == Instruction::Store &&
1122 !isLegalMaskedScatter(DataTy, Align(Alignment))))
1123 return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
1124 Alignment, CostKind, I);
1125
1126 // Cost is proportional to the number of memory operations implied. For
1127 // scalable vectors, we use an estimate on that number since we don't
1128 // know exactly what VL will be.
1129 auto &VTy = *cast<VectorType>(DataTy);
1130 InstructionCost MemOpCost =
1131 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1132 {TTI::OK_AnyValue, TTI::OP_None}, I);
1133 unsigned NumLoads = getEstimatedVLFor(&VTy);
1134 return NumLoads * MemOpCost;
1135}
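// Illustrative example: a legal masked gather of <8 x i32> is costed as
// NumLoads * MemOpCost, i.e. roughly eight scalar i32 load operations; for
// scalable types NumLoads comes from getEstimatedVLFor's estimate of VL
// rather than a known element count.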
1136
1138 unsigned Opcode, Type *DataTy, bool VariableMask, Align Alignment,
1139 TTI::TargetCostKind CostKind, const Instruction *I) const {
1140 bool IsLegal = (Opcode == Instruction::Store &&
1141 isLegalMaskedCompressStore(DataTy, Alignment)) ||
1142 (Opcode == Instruction::Load &&
1143 isLegalMaskedExpandLoad(DataTy, Alignment));
1144 if (!IsLegal || CostKind != TTI::TCK_RecipThroughput)
1145 return BaseT::getExpandCompressMemoryOpCost(Opcode, DataTy, VariableMask,
1146 Alignment, CostKind, I);
1147 // Example compressstore sequence:
1148 // vsetivli zero, 8, e32, m2, ta, ma (ignored)
1149 // vcompress.vm v10, v8, v0
1150 // vcpop.m a1, v0
1151 // vsetvli zero, a1, e32, m2, ta, ma
1152 // vse32.v v10, (a0)
1153 // Example expandload sequence:
1154 // vsetivli zero, 8, e8, mf2, ta, ma (ignored)
1155 // vcpop.m a1, v0
1156 // vsetvli zero, a1, e32, m2, ta, ma
1157 // vle32.v v10, (a0)
1158 // vsetivli zero, 8, e32, m2, ta, ma
1159 // viota.m v12, v0
1160 // vrgather.vv v8, v10, v12, v0.t
1161 auto MemOpCost =
1162 getMemoryOpCost(Opcode, DataTy, Alignment, /*AddressSpace*/ 0, CostKind);
1163 auto LT = getTypeLegalizationCost(DataTy);
1164 SmallVector<unsigned, 4> Opcodes{RISCV::VSETVLI};
1165 if (VariableMask)
1166 Opcodes.push_back(RISCV::VCPOP_M);
1167 if (Opcode == Instruction::Store)
1168 Opcodes.append({RISCV::VCOMPRESS_VM});
1169 else
1170 Opcodes.append({RISCV::VSETIVLI, RISCV::VIOTA_M, RISCV::VRGATHER_VV});
1171 return MemOpCost +
1172 LT.first * getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1173}
1174
1176 unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
1177 Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
1178 if (((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
1179 !isLegalStridedLoadStore(DataTy, Alignment)) ||
1180 (Opcode != Instruction::Load && Opcode != Instruction::Store))
1181 return BaseT::getStridedMemoryOpCost(Opcode, DataTy, Ptr, VariableMask,
1182 Alignment, CostKind, I);
1183
1185 return TTI::TCC_Basic;
1186
1187 // Cost is proportional to the number of memory operations implied. For
1188 // scalable vectors, we use an estimate on that number since we don't
1189 // know exactly what VL will be.
1190 auto &VTy = *cast<VectorType>(DataTy);
1191 InstructionCost MemOpCost =
1192 getMemoryOpCost(Opcode, VTy.getElementType(), Alignment, 0, CostKind,
1193 {TTI::OK_AnyValue, TTI::OP_None}, I);
1194 unsigned NumLoads = getEstimatedVLFor(&VTy);
1195 return NumLoads * MemOpCost;
1196}
1197
1200 // FIXME: This is a property of the default vector convention, not
1201 // all possible calling conventions. Fixing that will require
1202 // some TTI API and SLP rework.
1205 for (auto *Ty : Tys) {
1206 if (!Ty->isVectorTy())
1207 continue;
1208 Align A = DL.getPrefTypeAlign(Ty);
1209 Cost += getMemoryOpCost(Instruction::Store, Ty, A, 0, CostKind) +
1210 getMemoryOpCost(Instruction::Load, Ty, A, 0, CostKind);
1211 }
1212 return Cost;
1213}
1214
1215// Currently, these represent both throughput and codesize costs
1216// for the respective intrinsics. The costs in this table are simply
1217// instruction counts with the following adjustments made:
1218// * One vsetvli is considered free.
1220 {Intrinsic::floor, MVT::f32, 9},
1221 {Intrinsic::floor, MVT::f64, 9},
1222 {Intrinsic::ceil, MVT::f32, 9},
1223 {Intrinsic::ceil, MVT::f64, 9},
1224 {Intrinsic::trunc, MVT::f32, 7},
1225 {Intrinsic::trunc, MVT::f64, 7},
1226 {Intrinsic::round, MVT::f32, 9},
1227 {Intrinsic::round, MVT::f64, 9},
1228 {Intrinsic::roundeven, MVT::f32, 9},
1229 {Intrinsic::roundeven, MVT::f64, 9},
1230 {Intrinsic::rint, MVT::f32, 7},
1231 {Intrinsic::rint, MVT::f64, 7},
1232 {Intrinsic::nearbyint, MVT::f32, 9},
1233 {Intrinsic::nearbyint, MVT::f64, 9},
1234 {Intrinsic::bswap, MVT::i16, 3},
1235 {Intrinsic::bswap, MVT::i32, 12},
1236 {Intrinsic::bswap, MVT::i64, 31},
1237 {Intrinsic::vp_bswap, MVT::i16, 3},
1238 {Intrinsic::vp_bswap, MVT::i32, 12},
1239 {Intrinsic::vp_bswap, MVT::i64, 31},
1240 {Intrinsic::vp_fshl, MVT::i8, 7},
1241 {Intrinsic::vp_fshl, MVT::i16, 7},
1242 {Intrinsic::vp_fshl, MVT::i32, 7},
1243 {Intrinsic::vp_fshl, MVT::i64, 7},
1244 {Intrinsic::vp_fshr, MVT::i8, 7},
1245 {Intrinsic::vp_fshr, MVT::i16, 7},
1246 {Intrinsic::vp_fshr, MVT::i32, 7},
1247 {Intrinsic::vp_fshr, MVT::i64, 7},
1248 {Intrinsic::bitreverse, MVT::i8, 17},
1249 {Intrinsic::bitreverse, MVT::i16, 24},
1250 {Intrinsic::bitreverse, MVT::i32, 33},
1251 {Intrinsic::bitreverse, MVT::i64, 52},
1252 {Intrinsic::vp_bitreverse, MVT::i8, 17},
1253 {Intrinsic::vp_bitreverse, MVT::i16, 24},
1254 {Intrinsic::vp_bitreverse, MVT::i32, 33},
1255 {Intrinsic::vp_bitreverse, MVT::i64, 52},
1256 {Intrinsic::ctpop, MVT::i8, 12},
1257 {Intrinsic::ctpop, MVT::i16, 19},
1258 {Intrinsic::ctpop, MVT::i32, 20},
1259 {Intrinsic::ctpop, MVT::i64, 21},
1260 {Intrinsic::ctlz, MVT::i8, 19},
1261 {Intrinsic::ctlz, MVT::i16, 28},
1262 {Intrinsic::ctlz, MVT::i32, 31},
1263 {Intrinsic::ctlz, MVT::i64, 35},
1264 {Intrinsic::cttz, MVT::i8, 16},
1265 {Intrinsic::cttz, MVT::i16, 23},
1266 {Intrinsic::cttz, MVT::i32, 24},
1267 {Intrinsic::cttz, MVT::i64, 25},
1268 {Intrinsic::vp_ctpop, MVT::i8, 12},
1269 {Intrinsic::vp_ctpop, MVT::i16, 19},
1270 {Intrinsic::vp_ctpop, MVT::i32, 20},
1271 {Intrinsic::vp_ctpop, MVT::i64, 21},
1272 {Intrinsic::vp_ctlz, MVT::i8, 19},
1273 {Intrinsic::vp_ctlz, MVT::i16, 28},
1274 {Intrinsic::vp_ctlz, MVT::i32, 31},
1275 {Intrinsic::vp_ctlz, MVT::i64, 35},
1276 {Intrinsic::vp_cttz, MVT::i8, 16},
1277 {Intrinsic::vp_cttz, MVT::i16, 23},
1278 {Intrinsic::vp_cttz, MVT::i32, 24},
1279 {Intrinsic::vp_cttz, MVT::i64, 25},
1280};
1281
1285 auto *RetTy = ICA.getReturnType();
1286 switch (ICA.getID()) {
1287 case Intrinsic::lrint:
1288 case Intrinsic::llrint:
1289 case Intrinsic::lround:
1290 case Intrinsic::llround: {
1291 auto LT = getTypeLegalizationCost(RetTy);
1292 Type *SrcTy = ICA.getArgTypes().front();
1293 auto SrcLT = getTypeLegalizationCost(SrcTy);
1294 if (ST->hasVInstructions() && LT.second.isVector()) {
1296 unsigned SrcEltSz = DL.getTypeSizeInBits(SrcTy->getScalarType());
1297 unsigned DstEltSz = DL.getTypeSizeInBits(RetTy->getScalarType());
1298 if (LT.second.getVectorElementType() == MVT::bf16) {
1299 if (!ST->hasVInstructionsBF16Minimal())
1301 if (DstEltSz == 32)
1302 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFCVT_X_F_V};
1303 else
1304 Ops = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVT_X_F_V};
1305 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1306 !ST->hasVInstructionsF16()) {
1307 if (!ST->hasVInstructionsF16Minimal())
1309 if (DstEltSz == 32)
1310 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFCVT_X_F_V};
1311 else
1312 Ops = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_X_F_V};
1313
1314 } else if (SrcEltSz > DstEltSz) {
1315 Ops = {RISCV::VFNCVT_X_F_W};
1316 } else if (SrcEltSz < DstEltSz) {
1317 Ops = {RISCV::VFWCVT_X_F_V};
1318 } else {
1319 Ops = {RISCV::VFCVT_X_F_V};
1320 }
1321
1322 // We need to use the source LMUL in the case of a narrowing op, and the
1323 // destination LMUL otherwise.
1324 if (SrcEltSz > DstEltSz)
1325 return SrcLT.first *
1326 getRISCVInstructionCost(Ops, SrcLT.second, CostKind);
1327 return LT.first * getRISCVInstructionCost(Ops, LT.second, CostKind);
1328 }
1329 break;
1330 }
1331 case Intrinsic::ceil:
1332 case Intrinsic::floor:
1333 case Intrinsic::trunc:
1334 case Intrinsic::rint:
1335 case Intrinsic::round:
1336 case Intrinsic::roundeven: {
1337 // These all use the same code.
1338 auto LT = getTypeLegalizationCost(RetTy);
1339 if (!LT.second.isVector() && TLI->isOperationCustom(ISD::FCEIL, LT.second))
1340 return LT.first * 8;
1341 break;
1342 }
1343 case Intrinsic::umin:
1344 case Intrinsic::umax:
1345 case Intrinsic::smin:
1346 case Intrinsic::smax: {
1347 auto LT = getTypeLegalizationCost(RetTy);
1348 if (LT.second.isScalarInteger() && ST->hasStdExtZbb())
1349 return LT.first;
1350
1351 if (ST->hasVInstructions() && LT.second.isVector()) {
1352 unsigned Op;
1353 switch (ICA.getID()) {
1354 case Intrinsic::umin:
1355 Op = RISCV::VMINU_VV;
1356 break;
1357 case Intrinsic::umax:
1358 Op = RISCV::VMAXU_VV;
1359 break;
1360 case Intrinsic::smin:
1361 Op = RISCV::VMIN_VV;
1362 break;
1363 case Intrinsic::smax:
1364 Op = RISCV::VMAX_VV;
1365 break;
1366 }
1367 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1368 }
1369 break;
1370 }
1371 case Intrinsic::sadd_sat:
1372 case Intrinsic::ssub_sat:
1373 case Intrinsic::uadd_sat:
1374 case Intrinsic::usub_sat: {
1375 auto LT = getTypeLegalizationCost(RetTy);
1376 if (ST->hasVInstructions() && LT.second.isVector()) {
1377 unsigned Op;
1378 switch (ICA.getID()) {
1379 case Intrinsic::sadd_sat:
1380 Op = RISCV::VSADD_VV;
1381 break;
1382 case Intrinsic::ssub_sat:
1383 Op = RISCV::VSSUBU_VV;
1384 break;
1385 case Intrinsic::uadd_sat:
1386 Op = RISCV::VSADDU_VV;
1387 break;
1388 case Intrinsic::usub_sat:
1389 Op = RISCV::VSSUBU_VV;
1390 break;
1391 }
1392 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1393 }
1394 break;
1395 }
1396 case Intrinsic::fma:
1397 case Intrinsic::fmuladd: {
1398 // TODO: handle promotion with f16/bf16 with zvfhmin/zvfbfmin
1399 auto LT = getTypeLegalizationCost(RetTy);
1400 if (ST->hasVInstructions() && LT.second.isVector())
1401 return LT.first *
1402 getRISCVInstructionCost(RISCV::VFMADD_VV, LT.second, CostKind);
1403 break;
1404 }
1405 case Intrinsic::fabs: {
1406 auto LT = getTypeLegalizationCost(RetTy);
1407 if (ST->hasVInstructions() && LT.second.isVector()) {
1408 // lui a0, 8
1409 // addi a0, a0, -1
1410 // vsetvli a1, zero, e16, m1, ta, ma
1411 // vand.vx v8, v8, a0
 1412 // f16 with zvfhmin and bf16 with zvfbfmin
1413 if (LT.second.getVectorElementType() == MVT::bf16 ||
1414 (LT.second.getVectorElementType() == MVT::f16 &&
1415 !ST->hasVInstructionsF16()))
1416 return LT.first * getRISCVInstructionCost(RISCV::VAND_VX, LT.second,
1417 CostKind) +
1418 2;
1419 else
1420 return LT.first *
1421 getRISCVInstructionCost(RISCV::VFSGNJX_VV, LT.second, CostKind);
1422 }
1423 break;
1424 }
1425 case Intrinsic::sqrt: {
1426 auto LT = getTypeLegalizationCost(RetTy);
1427 if (ST->hasVInstructions() && LT.second.isVector()) {
1430 MVT ConvType = LT.second;
1431 MVT FsqrtType = LT.second;
1432 // f16 with zvfhmin and bf16 with zvfbfmin and the type of nxv32[b]f16
 1433 // will be split.
1434 if (LT.second.getVectorElementType() == MVT::bf16) {
1435 if (LT.second == MVT::nxv32bf16) {
1436 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFWCVTBF16_F_F_V,
1437 RISCV::VFNCVTBF16_F_F_W, RISCV::VFNCVTBF16_F_F_W};
1438 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1439 ConvType = MVT::nxv16f16;
1440 FsqrtType = MVT::nxv16f32;
1441 } else {
1442 ConvOp = {RISCV::VFWCVTBF16_F_F_V, RISCV::VFNCVTBF16_F_F_W};
1443 FsqrtOp = {RISCV::VFSQRT_V};
1444 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1445 }
1446 } else if (LT.second.getVectorElementType() == MVT::f16 &&
1447 !ST->hasVInstructionsF16()) {
1448 if (LT.second == MVT::nxv32f16) {
1449 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFWCVT_F_F_V,
1450 RISCV::VFNCVT_F_F_W, RISCV::VFNCVT_F_F_W};
1451 FsqrtOp = {RISCV::VFSQRT_V, RISCV::VFSQRT_V};
1452 ConvType = MVT::nxv16f16;
1453 FsqrtType = MVT::nxv16f32;
1454 } else {
1455 ConvOp = {RISCV::VFWCVT_F_F_V, RISCV::VFNCVT_F_F_W};
1456 FsqrtOp = {RISCV::VFSQRT_V};
1457 FsqrtType = TLI->getTypeToPromoteTo(ISD::FSQRT, FsqrtType);
1458 }
1459 } else {
1460 FsqrtOp = {RISCV::VFSQRT_V};
1461 }
1462
1463 return LT.first * (getRISCVInstructionCost(FsqrtOp, FsqrtType, CostKind) +
1464 getRISCVInstructionCost(ConvOp, ConvType, CostKind));
1465 }
1466 break;
1467 }
1468 case Intrinsic::cttz:
1469 case Intrinsic::ctlz:
1470 case Intrinsic::ctpop: {
1471 auto LT = getTypeLegalizationCost(RetTy);
1472 if (ST->hasStdExtZvbb() && LT.second.isVector()) {
1473 unsigned Op;
1474 switch (ICA.getID()) {
1475 case Intrinsic::cttz:
1476 Op = RISCV::VCTZ_V;
1477 break;
1478 case Intrinsic::ctlz:
1479 Op = RISCV::VCLZ_V;
1480 break;
1481 case Intrinsic::ctpop:
1482 Op = RISCV::VCPOP_V;
1483 break;
1484 }
1485 return LT.first * getRISCVInstructionCost(Op, LT.second, CostKind);
1486 }
1487 break;
1488 }
1489 case Intrinsic::abs: {
1490 auto LT = getTypeLegalizationCost(RetTy);
1491 if (ST->hasVInstructions() && LT.second.isVector()) {
1492 // vrsub.vi v10, v8, 0
1493 // vmax.vv v8, v8, v10
1494 return LT.first *
1495 getRISCVInstructionCost({RISCV::VRSUB_VI, RISCV::VMAX_VV},
1496 LT.second, CostKind);
1497 }
1498 break;
1499 }
1500 case Intrinsic::get_active_lane_mask: {
1501 if (ST->hasVInstructions()) {
1502 Type *ExpRetTy = VectorType::get(
1503 ICA.getArgTypes()[0], cast<VectorType>(RetTy)->getElementCount());
1504 auto LT = getTypeLegalizationCost(ExpRetTy);
1505
1506 // vid.v v8 // considered hoisted
1507 // vsaddu.vx v8, v8, a0
1508 // vmsltu.vx v0, v8, a1
1509 return LT.first *
1510 getRISCVInstructionCost({RISCV::VSADDU_VX, RISCV::VMSLTU_VX},
1511 LT.second, CostKind);
1512 }
1513 break;
1514 }
 1515 // TODO: add more intrinsics
1516 case Intrinsic::stepvector: {
1517 auto LT = getTypeLegalizationCost(RetTy);
1518 // Legalisation of illegal types involves an `index' instruction plus
1519 // (LT.first - 1) vector adds.
1520 if (ST->hasVInstructions())
1521 return getRISCVInstructionCost(RISCV::VID_V, LT.second, CostKind) +
1522 (LT.first - 1) *
1523 getRISCVInstructionCost(RISCV::VADD_VX, LT.second, CostKind);
1524 return 1 + (LT.first - 1);
1525 }
1526 case Intrinsic::experimental_cttz_elts: {
1527 Type *ArgTy = ICA.getArgTypes()[0];
1528 EVT ArgType = TLI->getValueType(DL, ArgTy, true);
1529 if (getTLI()->shouldExpandCttzElements(ArgType))
1530 break;
1531 InstructionCost Cost = getRISCVInstructionCost(
1532 RISCV::VFIRST_M, getTypeLegalizationCost(ArgTy).second, CostKind);
1533
1534 // If zero_is_poison is false, then we will generate additional
1535 // cmp + select instructions to convert -1 to EVL.
1536 Type *BoolTy = Type::getInt1Ty(RetTy->getContext());
1537 if (ICA.getArgs().size() > 1 &&
1538 cast<ConstantInt>(ICA.getArgs()[1])->isZero())
1539 Cost += getCmpSelInstrCost(Instruction::ICmp, BoolTy, RetTy,
1541 getCmpSelInstrCost(Instruction::Select, RetTy, BoolTy,
1543
1544 return Cost;
1545 }
1546 case Intrinsic::experimental_vp_splat: {
1547 auto LT = getTypeLegalizationCost(RetTy);
1548 // TODO: Lower i1 experimental_vp_splat
1549 if (!ST->hasVInstructions() || LT.second.getScalarType() == MVT::i1)
1551 return LT.first * getRISCVInstructionCost(LT.second.isFloatingPoint()
1552 ? RISCV::VFMV_V_F
1553 : RISCV::VMV_V_X,
1554 LT.second, CostKind);
1555 }
1556 case Intrinsic::experimental_vp_splice: {
 1557 // To support type-based queries from the vectorizer, set the index to 0.
 1558 // Note that the index only changes the cost from vslide.vx to vslide.vi, and
 1559 // in the current implementation they have the same cost.
1561 cast<VectorType>(ICA.getArgTypes()[0]), {}, CostKind,
1563 }
1564 case Intrinsic::fptoui_sat:
1565 case Intrinsic::fptosi_sat: {
1567 bool IsSigned = ICA.getID() == Intrinsic::fptosi_sat;
1568 Type *SrcTy = ICA.getArgTypes()[0];
1569
1570 auto SrcLT = getTypeLegalizationCost(SrcTy);
1571 auto DstLT = getTypeLegalizationCost(RetTy);
1572 if (!SrcTy->isVectorTy())
1573 break;
1574
1575 if (!SrcLT.first.isValid() || !DstLT.first.isValid())
1577
1578 Cost +=
1579 getCastInstrCost(IsSigned ? Instruction::FPToSI : Instruction::FPToUI,
1580 RetTy, SrcTy, TTI::CastContextHint::None, CostKind);
1581
1582 // Handle NaN.
1583 // vmfne v0, v8, v8 # If v8[i] is NaN set v0[i] to 1.
1584 // vmerge.vim v8, v8, 0, v0 # Convert NaN to 0.
1585 Type *CondTy = RetTy->getWithNewBitWidth(1);
1586 Cost += getCmpSelInstrCost(BinaryOperator::FCmp, SrcTy, CondTy,
1588 Cost += getCmpSelInstrCost(BinaryOperator::Select, RetTy, CondTy,
1590 return Cost;
1591 }
1592 }
1593
1594 if (ST->hasVInstructions() && RetTy->isVectorTy()) {
1595 if (auto LT = getTypeLegalizationCost(RetTy);
1596 LT.second.isVector()) {
1597 MVT EltTy = LT.second.getVectorElementType();
1598 if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable,
1599 ICA.getID(), EltTy))
1600 return LT.first * Entry->Cost;
1601 }
1602 }
1603
1605}
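// Illustrative example: a vector @llvm.bswap with i32 elements that is not
// handled by the switch above falls through to VectorIntrinsicCostTable,
// which charges 12 instructions per legalized register, scaled by LT.first
// when legalization splits the type.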
1606
1609 const SCEV *Ptr,
1611 // Address computations for vector indexed load/store likely require an offset
1612 // and/or scaling.
1613 if (ST->hasVInstructions() && PtrTy->isVectorTy())
1614 return getArithmeticInstrCost(Instruction::Add, PtrTy, CostKind);
1615
1616 return BaseT::getAddressComputationCost(PtrTy, SE, Ptr, CostKind);
1617}
1618
1620 Type *Src,
1623 const Instruction *I) const {
1624 bool IsVectorType = isa<VectorType>(Dst) && isa<VectorType>(Src);
1625 if (!IsVectorType)
1626 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1627
1628 // FIXME: Need to compute legalizing cost for illegal types. The current
1629 // code handles only legal types and those which can be trivially
1630 // promoted to legal.
1631 if (!ST->hasVInstructions() || Src->getScalarSizeInBits() > ST->getELen() ||
1632 Dst->getScalarSizeInBits() > ST->getELen())
1633 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1634
1635 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1636 assert(ISD && "Invalid opcode");
1637 std::pair<InstructionCost, MVT> SrcLT = getTypeLegalizationCost(Src);
1638 std::pair<InstructionCost, MVT> DstLT = getTypeLegalizationCost(Dst);
1639
1640 // Handle i1 source and dest cases *before* calling logic in BasicTTI.
1641 // The shared implementation doesn't model vector widening during legalization
1642 // and instead assumes scalarization. In order to scalarize an <N x i1>
1643 // vector, we need to extend/trunc to/from i8. If we don't special case
1644 // this, we can get an infinite recursion cycle.
1645 switch (ISD) {
1646 default:
1647 break;
1648 case ISD::SIGN_EXTEND:
1649 case ISD::ZERO_EXTEND:
1650 if (Src->getScalarSizeInBits() == 1) {
1651 // We do not use vsext/vzext to extend from a mask vector.
1652 // Instead we use the following instructions to extend from a mask vector:
1653 // vmv.v.i v8, 0
1654 // vmerge.vim v8, v8, -1, v0 (repeated per split)
1655 return getRISCVInstructionCost(RISCV::VMV_V_I, DstLT.second, CostKind) +
1656 DstLT.first * getRISCVInstructionCost(RISCV::VMERGE_VIM,
1657 DstLT.second, CostKind) +
1658 DstLT.first - 1;
1659 }
1660 break;
1661 case ISD::TRUNCATE:
1662 if (Dst->getScalarSizeInBits() == 1) {
1663 // We do not use several vncvt instructions to truncate to a mask vector,
1664 // so we cannot use PowDiff to calculate the cost.
1665 // Instead we use the following instructions to truncate to a mask vector:
1666 // vand.vi v8, v8, 1
1667 // vmsne.vi v0, v8, 0
1668 return SrcLT.first *
1669 getRISCVInstructionCost({RISCV::VAND_VI, RISCV::VMSNE_VI},
1670 SrcLT.second, CostKind) +
1671 SrcLT.first - 1;
1672 }
1673 break;
1674 };
1675
1676 // Our actual lowering for the case where a wider legal type is available
1677 // uses promotion to the wider type. This is reflected in the result of
1678 // getTypeLegalizationCost, but BasicTTI assumes the widened cases are
1679 // scalarized if the legalized Src and Dst are not equal sized.
1680 const DataLayout &DL = this->getDataLayout();
1681 if (!SrcLT.second.isVector() || !DstLT.second.isVector() ||
1682 !SrcLT.first.isValid() || !DstLT.first.isValid() ||
1683 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Src),
1684 SrcLT.second.getSizeInBits()) ||
1685 !TypeSize::isKnownLE(DL.getTypeSizeInBits(Dst),
1686 DstLT.second.getSizeInBits()))
1687 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1688
1689 // The split cost is handled by the base getCastInstrCost
1690 assert((SrcLT.first == 1) && (DstLT.first == 1) && "Illegal type");
1691
1692 int PowDiff = (int)Log2_32(DstLT.second.getScalarSizeInBits()) -
1693 (int)Log2_32(SrcLT.second.getScalarSizeInBits());
1694 switch (ISD) {
1695 case ISD::SIGN_EXTEND:
1696 case ISD::ZERO_EXTEND: {
1697 if ((PowDiff < 1) || (PowDiff > 3))
1698 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1699 unsigned SExtOp[] = {RISCV::VSEXT_VF2, RISCV::VSEXT_VF4, RISCV::VSEXT_VF8};
1700 unsigned ZExtOp[] = {RISCV::VZEXT_VF2, RISCV::VZEXT_VF4, RISCV::VZEXT_VF8};
1701 unsigned Op =
1702 (ISD == ISD::SIGN_EXTEND) ? SExtOp[PowDiff - 1] : ZExtOp[PowDiff - 1];
1703 return getRISCVInstructionCost(Op, DstLT.second, CostKind);
1704 }
1705 case ISD::TRUNCATE:
1706 case ISD::FP_EXTEND:
1707 case ISD::FP_ROUND: {
1708 // Counts of narrow/widen instructions.
1709 unsigned SrcEltSize = SrcLT.second.getScalarSizeInBits();
1710 unsigned DstEltSize = DstLT.second.getScalarSizeInBits();
1711
1712 unsigned Op = (ISD == ISD::TRUNCATE) ? RISCV::VNSRL_WI
1713 : (ISD == ISD::FP_EXTEND) ? RISCV::VFWCVT_F_F_V
1714 : RISCV::VFNCVT_F_F_W;
1715 InstructionCost Cost = 0;
1716 for (; SrcEltSize != DstEltSize;) {
1717 MVT ElementMVT = (ISD == ISD::TRUNCATE)
1718 ? MVT::getIntegerVT(DstEltSize)
1719 : MVT::getFloatingPointVT(DstEltSize);
1720 MVT DstMVT = DstLT.second.changeVectorElementType(ElementMVT);
1721 DstEltSize =
1722 (DstEltSize > SrcEltSize) ? DstEltSize >> 1 : DstEltSize << 1;
1723 Cost += getRISCVInstructionCost(Op, DstMVT, CostKind);
1724 }
1725 return Cost;
1726 }
1727 case ISD::FP_TO_SINT:
1728 case ISD::FP_TO_UINT: {
1729 unsigned IsSigned = ISD == ISD::FP_TO_SINT;
1730 unsigned FCVT = IsSigned ? RISCV::VFCVT_RTZ_X_F_V : RISCV::VFCVT_RTZ_XU_F_V;
1731 unsigned FWCVT =
1732 IsSigned ? RISCV::VFWCVT_RTZ_X_F_V : RISCV::VFWCVT_RTZ_XU_F_V;
1733 unsigned FNCVT =
1734 IsSigned ? RISCV::VFNCVT_RTZ_X_F_W : RISCV::VFNCVT_RTZ_XU_F_W;
1735 unsigned SrcEltSize = Src->getScalarSizeInBits();
1736 unsigned DstEltSize = Dst->getScalarSizeInBits();
1737 InstructionCost Cost = 0;
1738 if ((SrcEltSize == 16) &&
1739 (!ST->hasVInstructionsF16() || ((DstEltSize / 2) > SrcEltSize))) {
1740 // If the target only supports zvfhmin, or this is an fp16-to-i64 conversion,
1741 // pre-widen to f32 and then convert f32 to the integer type.
1742 VectorType *VecF32Ty =
1743 VectorType::get(Type::getFloatTy(Dst->getContext()),
1744 cast<VectorType>(Dst)->getElementCount());
1745 std::pair<InstructionCost, MVT> VecF32LT =
1746 getTypeLegalizationCost(VecF32Ty);
1747 Cost +=
1748 VecF32LT.first * getRISCVInstructionCost(RISCV::VFWCVT_F_F_V,
1749 VecF32LT.second, CostKind);
1750 Cost += getCastInstrCost(Opcode, Dst, VecF32Ty, CCH, CostKind, I);
1751 return Cost;
1752 }
1753 if (DstEltSize == SrcEltSize)
1754 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1755 else if (DstEltSize > SrcEltSize)
1756 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1757 else { // (SrcEltSize > DstEltSize)
1758 // First do a narrowing conversion to an integer half the size, then
1759 // truncate if needed.
1760 MVT ElementVT = MVT::getIntegerVT(SrcEltSize / 2);
1761 MVT VecVT = DstLT.second.changeVectorElementType(ElementVT);
1762 Cost += getRISCVInstructionCost(FNCVT, VecVT, CostKind);
1763 if ((SrcEltSize / 2) > DstEltSize) {
1764 Type *VecTy = EVT(VecVT).getTypeForEVT(Dst->getContext());
1765 Cost +=
1766 getCastInstrCost(Instruction::Trunc, Dst, VecTy, CCH, CostKind, I);
1767 }
1768 }
1769 return Cost;
1770 }
1771 case ISD::SINT_TO_FP:
1772 case ISD::UINT_TO_FP: {
1773 unsigned IsSigned = ISD == ISD::SINT_TO_FP;
1774 unsigned FCVT = IsSigned ? RISCV::VFCVT_F_X_V : RISCV::VFCVT_F_XU_V;
1775 unsigned FWCVT = IsSigned ? RISCV::VFWCVT_F_X_V : RISCV::VFWCVT_F_XU_V;
1776 unsigned FNCVT = IsSigned ? RISCV::VFNCVT_F_X_W : RISCV::VFNCVT_F_XU_W;
1777 unsigned SrcEltSize = Src->getScalarSizeInBits();
1778 unsigned DstEltSize = Dst->getScalarSizeInBits();
1779
1780 InstructionCost Cost = 0;
1781 if ((DstEltSize == 16) &&
1782 (!ST->hasVInstructionsF16() || ((SrcEltSize / 2) > DstEltSize))) {
1783 // If the target only supports zvfhmin, or this is an i64-to-fp16 conversion,
1784 // the value is converted to f32 and then converted to f16.
1785 VectorType *VecF32Ty =
1786 VectorType::get(Type::getFloatTy(Dst->getContext()),
1787 cast<VectorType>(Dst)->getElementCount());
1788 std::pair<InstructionCost, MVT> VecF32LT =
1789 getTypeLegalizationCost(VecF32Ty);
1790 Cost += getCastInstrCost(Opcode, VecF32Ty, Src, CCH, CostKind, I);
1791 Cost += VecF32LT.first * getRISCVInstructionCost(RISCV::VFNCVT_F_F_W,
1792 DstLT.second, CostKind);
1793 return Cost;
1794 }
1795
1796 if (DstEltSize == SrcEltSize)
1797 Cost += getRISCVInstructionCost(FCVT, DstLT.second, CostKind);
1798 else if (DstEltSize > SrcEltSize) {
1799 if ((DstEltSize / 2) > SrcEltSize) {
1800 VectorType *VecTy =
1801 VectorType::get(IntegerType::get(Dst->getContext(), DstEltSize / 2),
1802 cast<VectorType>(Dst)->getElementCount());
1803 unsigned Op = IsSigned ? Instruction::SExt : Instruction::ZExt;
1804 Cost += getCastInstrCost(Op, VecTy, Src, CCH, CostKind, I);
1805 }
1806 Cost += getRISCVInstructionCost(FWCVT, DstLT.second, CostKind);
1807 } else
1808 Cost += getRISCVInstructionCost(FNCVT, DstLT.second, CostKind);
1809 return Cost;
1810 }
1811 }
1812 return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
1813}
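// Minimal caller-side sketch (hypothetical variable names; assumes a
// TargetTransformInfo `RVTTI` built for an rv64gcv target and an LLVMContext
// `Ctx`):
//   auto *SrcTy = ScalableVectorType::get(Type::getInt8Ty(Ctx), 4);
//   auto *DstTy = ScalableVectorType::get(Type::getInt32Ty(Ctx), 4);
//   InstructionCost C = RVTTI.getCastInstrCost(
//       Instruction::ZExt, DstTy, SrcTy,
//       TargetTransformInfo::CastContextHint::None,
//       TargetTransformInfo::TCK_RecipThroughput);
//   // PowDiff == 2 here, so the cost models a single vzext.vf4 on nxv4i32.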
1814
1815unsigned RISCVTTIImpl::getEstimatedVLFor(VectorType *Ty) const {
1816 if (isa<ScalableVectorType>(Ty)) {
1817 const unsigned EltSize = DL.getTypeSizeInBits(Ty->getElementType());
1818 const unsigned MinSize = DL.getTypeSizeInBits(Ty).getKnownMinValue();
1819 const unsigned VectorBits = *getVScaleForTuning() * RISCV::RVVBitsPerBlock;
1820 return RISCVTargetLowering::computeVLMAX(VectorBits, EltSize, MinSize);
1821 }
1822 return cast<FixedVectorType>(Ty)->getNumElements();
1823}
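// Rough worked example (assuming getVScaleForTuning() == 2, i.e. tuning for
// VLEN=128): for Ty = <vscale x 4 x i32>, VectorBits = 2 * 64 = 128,
// EltSize = 32 and MinSize = 128, so the estimated VL is 8 elements; for a
// fixed <8 x i32> it is simply 8.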
1824
1825InstructionCost
1826RISCVTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty,
1827 FastMathFlags FMF,
1828 TTI::TargetCostKind CostKind) const {
1829 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1830 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1831
1832 // Skip if scalar size of Ty is bigger than ELEN.
1833 if (Ty->getScalarSizeInBits() > ST->getELen())
1834 return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind);
1835
1836 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1837 if (Ty->getElementType()->isIntegerTy(1)) {
1838 // SelectionDAGBuilder does the following transforms:
1839 // vector_reduce_{smin,umax}(<n x i1>) --> vector_reduce_or(<n x i1>)
1840 // vector_reduce_{smax,umin}(<n x i1>) --> vector_reduce_and(<n x i1>)
1841 if (IID == Intrinsic::umax || IID == Intrinsic::smin)
1842 return getArithmeticReductionCost(Instruction::Or, Ty, FMF, CostKind);
1843 else
1844 return getArithmeticReductionCost(Instruction::And, Ty, FMF, CostKind);
1845 }
1846
1847 if (IID == Intrinsic::maximum || IID == Intrinsic::minimum) {
1848 SmallVector<unsigned, 3> Opcodes;
1849 InstructionCost ExtraCost = 0;
1850 switch (IID) {
1851 case Intrinsic::maximum:
1852 if (FMF.noNaNs()) {
1853 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1854 } else {
1855 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMAX_VS,
1856 RISCV::VFMV_F_S};
1857 // Cost of canonical NaN + branch
1858 // lui a0, 523264
1859 // fmv.w.x fa0, a0
1860 Type *DstTy = Ty->getScalarType();
1861 const unsigned EltTyBits = DstTy->getScalarSizeInBits();
1862 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1863 ExtraCost = 1 +
1864 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1865 TTI::CastContextHint::None, CostKind) +
1866 getCFInstrCost(Instruction::Br, CostKind);
1867 }
1868 break;
1869
1870 case Intrinsic::minimum:
1871 if (FMF.noNaNs()) {
1872 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1873 } else {
1874 Opcodes = {RISCV::VMFNE_VV, RISCV::VCPOP_M, RISCV::VFREDMIN_VS,
1875 RISCV::VFMV_F_S};
1876 // Cost of canonical NaN + branch
1877 // lui a0, 523264
1878 // fmv.w.x fa0, a0
1879 Type *DstTy = Ty->getScalarType();
1880 const unsigned EltTyBits = DL.getTypeSizeInBits(DstTy);
1881 Type *SrcTy = IntegerType::getIntNTy(DstTy->getContext(), EltTyBits);
1882 ExtraCost = 1 +
1883 getCastInstrCost(Instruction::UIToFP, DstTy, SrcTy,
1884 TTI::CastContextHint::None, CostKind) +
1885 getCFInstrCost(Instruction::Br, CostKind);
1886 }
1887 break;
1888 }
1889 return ExtraCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1890 }
1891
1892 // An IR reduction is composed of one RVV reduction instruction and a vmv.
1893 unsigned SplitOp;
1894 SmallVector<unsigned, 3> Opcodes;
1895 switch (IID) {
1896 default:
1897 llvm_unreachable("Unsupported intrinsic");
1898 case Intrinsic::smax:
1899 SplitOp = RISCV::VMAX_VV;
1900 Opcodes = {RISCV::VREDMAX_VS, RISCV::VMV_X_S};
1901 break;
1902 case Intrinsic::smin:
1903 SplitOp = RISCV::VMIN_VV;
1904 Opcodes = {RISCV::VREDMIN_VS, RISCV::VMV_X_S};
1905 break;
1906 case Intrinsic::umax:
1907 SplitOp = RISCV::VMAXU_VV;
1908 Opcodes = {RISCV::VREDMAXU_VS, RISCV::VMV_X_S};
1909 break;
1910 case Intrinsic::umin:
1911 SplitOp = RISCV::VMINU_VV;
1912 Opcodes = {RISCV::VREDMINU_VS, RISCV::VMV_X_S};
1913 break;
1914 case Intrinsic::maxnum:
1915 SplitOp = RISCV::VFMAX_VV;
1916 Opcodes = {RISCV::VFREDMAX_VS, RISCV::VFMV_F_S};
1917 break;
1918 case Intrinsic::minnum:
1919 SplitOp = RISCV::VFMIN_VV;
1920 Opcodes = {RISCV::VFREDMIN_VS, RISCV::VFMV_F_S};
1921 break;
1922 }
1923 // Add a cost for data larger than LMUL8
1924 InstructionCost SplitCost =
1925 (LT.first > 1) ? (LT.first - 1) *
1926 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
1927 : 0;
1928 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
1929}
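// Illustrative example: an llvm.umax reduction over <vscale x 8 x i32>
// legalizes to nxv8i32 (LMUL=4) with LT.first == 1, so the cost is the
// vredmaxu.vs + vmv.x.s pair, where the vredmaxu.vs term grows as log2 of the
// estimated VL (see getRISCVInstructionCost above); no extra split cost is
// added.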
1930
1931InstructionCost
1932RISCVTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
1933 std::optional<FastMathFlags> FMF,
1934 TTI::TargetCostKind CostKind) const {
1935 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
1936 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1937
1938 // Skip if scalar size of Ty is bigger than ELEN.
1939 if (Ty->getScalarSizeInBits() > ST->getELen())
1940 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1941
1942 int ISD = TLI->InstructionOpcodeToISD(Opcode);
1943 assert(ISD && "Invalid opcode");
1944
1945 if (ISD != ISD::ADD && ISD != ISD::OR && ISD != ISD::XOR && ISD != ISD::AND &&
1946 ISD != ISD::FADD)
1947 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
1948
1949 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
1950 Type *ElementTy = Ty->getElementType();
1951 if (ElementTy->isIntegerTy(1)) {
1952 // Example sequences:
1953 // vfirst.m a0, v0
1954 // seqz a0, a0
1955 if (LT.second == MVT::v1i1)
1956 return getRISCVInstructionCost(RISCV::VFIRST_M, LT.second, CostKind) +
1957 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1958 CmpInst::ICMP_EQ, CostKind);
1959
1960 if (ISD == ISD::AND) {
1961 // Example sequences:
1962 // vmand.mm v8, v9, v8 ; needed every time type is split
1963 // vmnot.m v8, v0 ; alias for vmnand
1964 // vcpop.m a0, v8
1965 // seqz a0, a0
1966
1967 // See the discussion: https://github.com/llvm/llvm-project/pull/119160
1968 // For LMUL <= 8, there is no splitting,
1969 // the sequences are vmnot, vcpop and seqz.
1970 // When LMUL > 8 and split = 1,
1971 // the sequences are vmnand, vcpop and seqz.
1972 // When LMUL > 8 and split > 1,
1973 // the sequences are (LT.first-2) * vmand, vmnand, vcpop and seqz.
1974 return ((LT.first > 2) ? (LT.first - 2) : 0) *
1975 getRISCVInstructionCost(RISCV::VMAND_MM, LT.second, CostKind) +
1976 getRISCVInstructionCost(RISCV::VMNAND_MM, LT.second, CostKind) +
1977 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1978 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
1979 CmpInst::ICMP_EQ, CostKind);
1980 } else if (ISD == ISD::XOR || ISD == ISD::ADD) {
1981 // Example sequences:
1982 // vsetvli a0, zero, e8, mf8, ta, ma
1983 // vmxor.mm v8, v0, v8 ; needed every time type is split
1984 // vcpop.m a0, v8
1985 // andi a0, a0, 1
1986 return (LT.first - 1) *
1987 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind) +
1988 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) + 1;
1989 } else {
1990 assert(ISD == ISD::OR);
1991 // Example sequences:
1992 // vsetvli a0, zero, e8, mf8, ta, ma
1993 // vmor.mm v8, v9, v8 ; needed every time type is split
1994 // vcpop.m a0, v0
1995 // snez a0, a0
1996 return (LT.first - 1) *
1997 getRISCVInstructionCost(RISCV::VMOR_MM, LT.second, CostKind) +
1998 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind) +
1999 getCmpSelInstrCost(Instruction::ICmp, ElementTy, ElementTy,
2000 CmpInst::ICMP_NE, CostKind);
2001 }
2002 }
2003
2004 // An IR reduction of or/and is composed of one vmv and one RVV reduction
2005 // instruction; the other reductions are composed of two vmv and one RVV
2006 // reduction instruction.
2007 unsigned SplitOp;
2008 SmallVector<unsigned, 3> Opcodes;
2009 switch (ISD) {
2010 case ISD::ADD:
2011 SplitOp = RISCV::VADD_VV;
2012 Opcodes = {RISCV::VMV_S_X, RISCV::VREDSUM_VS, RISCV::VMV_X_S};
2013 break;
2014 case ISD::OR:
2015 SplitOp = RISCV::VOR_VV;
2016 Opcodes = {RISCV::VREDOR_VS, RISCV::VMV_X_S};
2017 break;
2018 case ISD::XOR:
2019 SplitOp = RISCV::VXOR_VV;
2020 Opcodes = {RISCV::VMV_S_X, RISCV::VREDXOR_VS, RISCV::VMV_X_S};
2021 break;
2022 case ISD::AND:
2023 SplitOp = RISCV::VAND_VV;
2024 Opcodes = {RISCV::VREDAND_VS, RISCV::VMV_X_S};
2025 break;
2026 case ISD::FADD:
2027 // We can't promote f16/bf16 fadd reductions.
2028 if ((LT.second.getScalarType() == MVT::f16 && !ST->hasVInstructionsF16()) ||
2029 LT.second.getScalarType() == MVT::bf16)
2030 return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind);
2031 if (TTI::requiresOrderedReduction(FMF)) {
2032 Opcodes.push_back(RISCV::VFMV_S_F);
2033 for (unsigned i = 0; i < LT.first.getValue(); i++)
2034 Opcodes.push_back(RISCV::VFREDOSUM_VS);
2035 Opcodes.push_back(RISCV::VFMV_F_S);
2036 return getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2037 }
2038 SplitOp = RISCV::VFADD_VV;
2039 Opcodes = {RISCV::VFMV_S_F, RISCV::VFREDUSUM_VS, RISCV::VFMV_F_S};
2040 break;
2041 }
2042 // Add a cost for data larger than LMUL8
2043 InstructionCost SplitCost =
2044 (LT.first > 1) ? (LT.first - 1) *
2045 getRISCVInstructionCost(SplitOp, LT.second, CostKind)
2046 : 0;
2047 return SplitCost + getRISCVInstructionCost(Opcodes, LT.second, CostKind);
2048}
2049
2050InstructionCost RISCVTTIImpl::getExtendedReductionCost(
2051 unsigned Opcode, bool IsUnsigned, Type *ResTy, VectorType *ValTy,
2052 std::optional<FastMathFlags> FMF, TTI::TargetCostKind CostKind) const {
2053 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2054 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2055 FMF, CostKind);
2056
2057 // Skip if scalar size of ResTy is bigger than ELEN.
2058 if (ResTy->getScalarSizeInBits() > ST->getELen())
2059 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2060 FMF, CostKind);
2061
2062 if (Opcode != Instruction::Add && Opcode != Instruction::FAdd)
2063 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2064 FMF, CostKind);
2065
2066 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2067
2068 if (IsUnsigned && Opcode == Instruction::Add &&
2069 LT.second.isFixedLengthVector() && LT.second.getScalarType() == MVT::i1) {
2070 // Represent vector_reduce_add(ZExt(<n x i1>)) as
2071 // ZExtOrTrunc(ctpop(bitcast <n x i1> to iN)).
2072 return LT.first *
2073 getRISCVInstructionCost(RISCV::VCPOP_M, LT.second, CostKind);
2074 }
2075
2076 if (ResTy->getScalarSizeInBits() != 2 * LT.second.getScalarSizeInBits())
2077 return BaseT::getExtendedReductionCost(Opcode, IsUnsigned, ResTy, ValTy,
2078 FMF, CostKind);
2079
2080 return (LT.first - 1) +
2081 getArithmeticReductionCost(Opcode, ValTy, FMF, CostKind);
2082}
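// Illustrative example (assuming RVV is enabled for fixed-length vectors):
// vector.reduce.add(zext <8 x i1> %m to <8 x i32>) hits the special case
// above and is costed as a single vcpop.m on the mask rather than as a
// widened add reduction.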
2083
2084InstructionCost RISCVTTIImpl::getStoreImmCost(Type *Ty,
2085 TTI::OperandValueInfo OpInfo,
2086 TTI::TargetCostKind CostKind) const {
2087 assert(OpInfo.isConstant() && "non constant operand?");
2088 if (!isa<VectorType>(Ty))
2089 // FIXME: We need to account for immediate materialization here, but doing
2090 // a decent job requires more knowledge about the immediate than we
2091 // currently have here.
2092 return 0;
2093
2094 if (OpInfo.isUniform())
2095 // vmv.v.i, vmv.v.x, or vfmv.v.f
2096 // We ignore the cost of the scalar constant materialization to be consistent
2097 // with how we treat scalar constants themselves just above.
2098 return 1;
2099
2100 return getConstantPoolLoadCost(Ty, CostKind);
2101}
2102
2103InstructionCost RISCVTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
2104 Align Alignment,
2105 unsigned AddressSpace,
2106 TTI::TargetCostKind CostKind,
2107 TTI::OperandValueInfo OpInfo,
2108 const Instruction *I) const {
2109 EVT VT = TLI->getValueType(DL, Src, true);
2110 // Type legalization can't handle structs
2111 if (VT == MVT::Other)
2112 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2113 CostKind, OpInfo, I);
2114
2115 InstructionCost Cost = 0;
2116 if (Opcode == Instruction::Store && OpInfo.isConstant())
2116 if (Opcode == Instruction::Store && OpInfo.isConstant())
2117 Cost += getStoreImmCost(Src, OpInfo, CostKind);
2118
2119 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Src);
2120
2121 InstructionCost BaseCost = [&]() {
2122 InstructionCost Cost = LT.first;
2123 if (CostKind != TTI::TCK_RecipThroughput)
2124 return Cost;
2125
2126 // Our actual lowering for the case where a wider legal type is available
2127 // uses a VL-predicated load on the wider type. This is reflected in
2128 // the result of getTypeLegalizationCost, but BasicTTI assumes the
2129 // widened cases are scalarized.
2130 const DataLayout &DL = this->getDataLayout();
2131 if (Src->isVectorTy() && LT.second.isVector() &&
2132 TypeSize::isKnownLT(DL.getTypeStoreSizeInBits(Src),
2133 LT.second.getSizeInBits()))
2134 return Cost;
2135
2136 return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
2137 CostKind, OpInfo, I);
2138 }();
2139
2140 // Assume memory ops cost scale with the number of vector registers
2141 // possible accessed by the instruction. Note that BasicTTI already
2142 // handles the LT.first term for us.
2143 if (ST->hasVInstructions() && LT.second.isVector() &&
2144 CostKind != TTI::TCK_CodeSize)
2145 BaseCost *= TLI->getLMULCost(LT.second);
2146 return Cost + BaseCost;
2147}
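// Rough example: a throughput-cost query for a load of <vscale x 8 x i32>
// (LMUL=4) computes the memory-op cost for the legal type and then scales it
// by TLI->getLMULCost(nxv8i32), roughly 4x, reflecting that the access
// touches four vector registers.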
2148
2149InstructionCost RISCVTTIImpl::getCmpSelInstrCost(
2150 unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred,
2151 TTI::TargetCostKind CostKind, TTI::OperandValueInfo Op1Info,
2152 TTI::OperandValueInfo Op2Info, const Instruction *I) const {
2153 if (CostKind != TTI::TCK_RecipThroughput)
2154 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2155 Op1Info, Op2Info, I);
2156
2157 if (isa<FixedVectorType>(ValTy) && !ST->useRVVForFixedLengthVectors())
2158 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2159 Op1Info, Op2Info, I);
2160
2161 // Skip if scalar size of ValTy is bigger than ELEN.
2162 if (ValTy->isVectorTy() && ValTy->getScalarSizeInBits() > ST->getELen())
2163 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2164 Op1Info, Op2Info, I);
2165
2166 auto GetConstantMatCost =
2167 [&](TTI::OperandValueInfo OpInfo) -> InstructionCost {
2168 if (OpInfo.isUniform())
2169 // We return 0 because we currently ignore the cost of materializing scalar
2170 // constants in GPRs.
2171 return 0;
2172
2173 return getConstantPoolLoadCost(ValTy, CostKind);
2174 };
2175
2176 InstructionCost ConstantMatCost;
2177 if (Op1Info.isConstant())
2178 ConstantMatCost += GetConstantMatCost(Op1Info);
2179 if (Op2Info.isConstant())
2180 ConstantMatCost += GetConstantMatCost(Op2Info);
2181
2182 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(ValTy);
2183 if (Opcode == Instruction::Select && ValTy->isVectorTy()) {
2184 if (CondTy->isVectorTy()) {
2185 if (ValTy->getScalarSizeInBits() == 1) {
2186 // vmandn.mm v8, v8, v9
2187 // vmand.mm v9, v0, v9
2188 // vmor.mm v0, v9, v8
2189 return ConstantMatCost +
2190 LT.first *
2191 getRISCVInstructionCost(
2192 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2193 LT.second, CostKind);
2194 }
2195 // vselect and max/min are supported natively.
2196 return ConstantMatCost +
2197 LT.first * getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second,
2198 CostKind);
2199 }
2200
2201 if (ValTy->getScalarSizeInBits() == 1) {
2202 // vmv.v.x v9, a0
2203 // vmsne.vi v9, v9, 0
2204 // vmandn.mm v8, v8, v9
2205 // vmand.mm v9, v0, v9
2206 // vmor.mm v0, v9, v8
2207 MVT InterimVT = LT.second.changeVectorElementType(MVT::i8);
2208 return ConstantMatCost +
2209 LT.first *
2210 getRISCVInstructionCost({RISCV::VMV_V_X, RISCV::VMSNE_VI},
2211 InterimVT, CostKind) +
2212 LT.first * getRISCVInstructionCost(
2213 {RISCV::VMANDN_MM, RISCV::VMAND_MM, RISCV::VMOR_MM},
2214 LT.second, CostKind);
2215 }
2216
2217 // vmv.v.x v10, a0
2218 // vmsne.vi v0, v10, 0
2219 // vmerge.vvm v8, v9, v8, v0
2220 return ConstantMatCost +
2221 LT.first * getRISCVInstructionCost(
2222 {RISCV::VMV_V_X, RISCV::VMSNE_VI, RISCV::VMERGE_VVM},
2223 LT.second, CostKind);
2224 }
2225
2226 if ((Opcode == Instruction::ICmp) && ValTy->isVectorTy() &&
2227 CmpInst::isIntPredicate(VecPred)) {
2228 // Use VMSLT_VV to represent VMSEQ, VMSNE, VMSLTU, VMSLEU, VMSLT, VMSLE
2229 // provided they incur the same cost across all implementations
2230 return ConstantMatCost + LT.first * getRISCVInstructionCost(RISCV::VMSLT_VV,
2231 LT.second,
2232 CostKind);
2233 }
2234
2235 if ((Opcode == Instruction::FCmp) && ValTy->isVectorTy() &&
2236 CmpInst::isFPPredicate(VecPred)) {
2237
2238 // Use VMXOR_MM and VMXNOR_MM to generate all true/false mask
2239 if ((VecPred == CmpInst::FCMP_FALSE) || (VecPred == CmpInst::FCMP_TRUE))
2240 return ConstantMatCost +
2241 getRISCVInstructionCost(RISCV::VMXOR_MM, LT.second, CostKind);
2242
2243 // If we do not support the input floating point vector type, use the base
2244 // one which will calculate as:
2245 // ScalarizeCost + Num * Cost for fixed vector,
2246 // InvalidCost for scalable vector.
2247 if ((ValTy->getScalarSizeInBits() == 16 && !ST->hasVInstructionsF16()) ||
2248 (ValTy->getScalarSizeInBits() == 32 && !ST->hasVInstructionsF32()) ||
2249 (ValTy->getScalarSizeInBits() == 64 && !ST->hasVInstructionsF64()))
2250 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2251 Op1Info, Op2Info, I);
2252
2253 // Assuming vector fp compare and mask instructions are all the same cost
2254 // until a need arises to differentiate them.
2255 switch (VecPred) {
2256 case CmpInst::FCMP_ONE: // vmflt.vv + vmflt.vv + vmor.mm
2257 case CmpInst::FCMP_ORD: // vmfeq.vv + vmfeq.vv + vmand.mm
2258 case CmpInst::FCMP_UNO: // vmfne.vv + vmfne.vv + vmor.mm
2259 case CmpInst::FCMP_UEQ: // vmflt.vv + vmflt.vv + vmnor.mm
2260 return ConstantMatCost +
2261 LT.first * getRISCVInstructionCost(
2262 {RISCV::VMFLT_VV, RISCV::VMFLT_VV, RISCV::VMOR_MM},
2263 LT.second, CostKind);
2264
2265 case CmpInst::FCMP_UGT: // vmfle.vv + vmnot.m
2266 case CmpInst::FCMP_UGE: // vmflt.vv + vmnot.m
2267 case CmpInst::FCMP_ULT: // vmfle.vv + vmnot.m
2268 case CmpInst::FCMP_ULE: // vmflt.vv + vmnot.m
2269 return ConstantMatCost +
2270 LT.first *
2271 getRISCVInstructionCost({RISCV::VMFLT_VV, RISCV::VMNAND_MM},
2272 LT.second, CostKind);
2273
2274 case CmpInst::FCMP_OEQ: // vmfeq.vv
2275 case CmpInst::FCMP_OGT: // vmflt.vv
2276 case CmpInst::FCMP_OGE: // vmfle.vv
2277 case CmpInst::FCMP_OLT: // vmflt.vv
2278 case CmpInst::FCMP_OLE: // vmfle.vv
2279 case CmpInst::FCMP_UNE: // vmfne.vv
2280 return ConstantMatCost +
2281 LT.first *
2282 getRISCVInstructionCost(RISCV::VMFLT_VV, LT.second, CostKind);
2283 default:
2284 break;
2285 }
2286 }
2287
2288 // With ShortForwardBranchOpt or ConditionalMoveFusion, scalar icmp + select
2289 // instructions will lower to SELECT_CC and lower to PseudoCCMOVGPR which will
2290 // generate a conditional branch + mv. The cost of scalar (icmp + select) will
2291 // be (0 + select instr cost).
2292 if (ST->hasConditionalMoveFusion() && I && isa<ICmpInst>(I) &&
2293 ValTy->isIntegerTy() && !I->user_empty()) {
2294 if (all_of(I->users(), [&](const User *U) {
2295 return match(U, m_Select(m_Specific(I), m_Value(), m_Value())) &&
2296 U->getType()->isIntegerTy() &&
2297 !isa<ConstantData>(U->getOperand(1)) &&
2298 !isa<ConstantData>(U->getOperand(2));
2299 }))
2300 return 0;
2301 }
2302
2303 // TODO: Add cost for scalar type.
2304
2305 return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, VecPred, CostKind,
2306 Op1Info, Op2Info, I);
2307}
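// Worked example: a vector fcmp one (ordered-and-unequal) on
// <vscale x 2 x double> is costed as the three mask instructions
// vmflt.vv + vmflt.vv + vmor.mm, scaled by LT.first when the type is split;
// an fcmp oeq costs a single vmfeq.vv-equivalent instead.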
2308
2309InstructionCost RISCVTTIImpl::getCFInstrCost(unsigned Opcode,
2310 TTI::TargetCostKind CostKind,
2311 const Instruction *I) const {
2312 if (CostKind != TTI::TCK_RecipThroughput)
2313 return Opcode == Instruction::PHI ? 0 : 1;
2314 // Branches are assumed to be predicted.
2315 return 0;
2316}
2317
2318InstructionCost RISCVTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
2319 TTI::TargetCostKind CostKind,
2320 unsigned Index,
2321 const Value *Op0,
2322 const Value *Op1) const {
2323 assert(Val->isVectorTy() && "This must be a vector type");
2324
2325 if (Opcode != Instruction::ExtractElement &&
2326 Opcode != Instruction::InsertElement)
2327 return BaseT::getVectorInstrCost(Opcode, Val, CostKind, Index, Op0, Op1);
2328
2329 // Legalize the type.
2330 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Val);
2331
2332 // This type is legalized to a scalar type.
2333 if (!LT.second.isVector()) {
2334 auto *FixedVecTy = cast<FixedVectorType>(Val);
2335 // If Index is a known constant, cost is zero.
2336 if (Index != -1U)
2337 return 0;
2338 // Extract/InsertElement with non-constant index is very costly when
2339 // scalarized; estimate cost of loads/stores sequence via the stack:
2340 // ExtractElement cost: store vector to stack, load scalar;
2341 // InsertElement cost: store vector to stack, store scalar, load vector.
2342 Type *ElemTy = FixedVecTy->getElementType();
2343 auto NumElems = FixedVecTy->getNumElements();
2344 auto Align = DL.getPrefTypeAlign(ElemTy);
2345 InstructionCost LoadCost =
2346 getMemoryOpCost(Instruction::Load, ElemTy, Align, 0, CostKind);
2347 InstructionCost StoreCost =
2348 getMemoryOpCost(Instruction::Store, ElemTy, Align, 0, CostKind);
2349 return Opcode == Instruction::ExtractElement
2350 ? StoreCost * NumElems + LoadCost
2351 : (StoreCost + LoadCost) * NumElems + StoreCost;
2352 }
2353
2354 // For unsupported scalable vector.
2355 if (LT.second.isScalableVector() && !LT.first.isValid())
2356 return LT.first;
2357
2358 // Mask vector extract/insert is expanded via e8.
2359 if (Val->getScalarSizeInBits() == 1) {
2360 VectorType *WideTy =
2361 VectorType::get(IntegerType::get(Val->getContext(), 8),
2362 cast<VectorType>(Val)->getElementCount());
2363 if (Opcode == Instruction::ExtractElement) {
2364 InstructionCost ExtendCost
2365 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2366 TTI::CastContextHint::None, CostKind);
2367 InstructionCost ExtractCost
2368 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2369 return ExtendCost + ExtractCost;
2370 }
2371 InstructionCost ExtendCost
2372 = getCastInstrCost(Instruction::ZExt, WideTy, Val,
2373 TTI::CastContextHint::None, CostKind);
2374 InstructionCost InsertCost
2375 = getVectorInstrCost(Opcode, WideTy, CostKind, Index, nullptr, nullptr);
2376 InstructionCost TruncCost
2377 = getCastInstrCost(Instruction::Trunc, Val, WideTy,
2378 TTI::CastContextHint::None, CostKind);
2379 return ExtendCost + InsertCost + TruncCost;
2380 }
2381
2382
2383 // In RVV, we could use vslidedown + vmv.x.s to extract element from vector
2384 // and vslideup + vmv.s.x to insert element to vector.
2385 unsigned BaseCost = 1;
2386 // For insertelement, an extra addi is needed to compute index+1 as the input of vslideup.
2387 unsigned SlideCost = Opcode == Instruction::InsertElement ? 2 : 1;
2388
2389 if (Index != -1U) {
2390 // The type may be split. For fixed-width vectors we can normalize the
2391 // index to the new type.
2392 if (LT.second.isFixedLengthVector()) {
2393 unsigned Width = LT.second.getVectorNumElements();
2394 Index = Index % Width;
2395 }
2396
2397 // If exact VLEN is known, we will insert/extract into the appropriate
2398 // subvector with no additional subvector insert/extract cost.
2399 if (auto VLEN = ST->getRealVLen()) {
2400 unsigned EltSize = LT.second.getScalarSizeInBits();
2401 unsigned M1Max = *VLEN / EltSize;
2402 Index = Index % M1Max;
2403 }
2404
2405 if (Index == 0)
2406 // We can extract/insert the first element without vslidedown/vslideup.
2407 SlideCost = 0;
2408 else if (ST->hasVendorXRivosVisni() && isUInt<5>(Index) &&
2409 Val->getScalarType()->isIntegerTy())
2410 SlideCost = 0; // With ri.vinsert/ri.vextract there is no slide needed
2411 else if (Opcode == Instruction::InsertElement)
2412 SlideCost = 1; // With a constant index, we do not need to use addi.
2413 }
2414
2415 // When the vector needs to be split into multiple register groups and the
2416 // index exceeds a single vector register group, we need to insert/extract
2417 // the element via the stack.
2418 if (LT.first > 1 &&
2419 ((Index == -1U) || (Index >= LT.second.getVectorMinNumElements() &&
2420 LT.second.isScalableVector()))) {
2421 Type *ScalarType = Val->getScalarType();
2422 Align VecAlign = DL.getPrefTypeAlign(Val);
2423 Align SclAlign = DL.getPrefTypeAlign(ScalarType);
2424 // Extra addi for unknown index.
2425 InstructionCost IdxCost = Index == -1U ? 1 : 0;
2426
2427 // Store all split vectors into stack and load the target element.
2428 if (Opcode == Instruction::ExtractElement)
2429 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2430 getMemoryOpCost(Instruction::Load, ScalarType, SclAlign, 0,
2431 CostKind) +
2432 IdxCost;
2433
2434 // Store all split vectors into stack and store the target element and load
2435 // vectors back.
2436 return getMemoryOpCost(Instruction::Store, Val, VecAlign, 0, CostKind) +
2437 getMemoryOpCost(Instruction::Load, Val, VecAlign, 0, CostKind) +
2438 getMemoryOpCost(Instruction::Store, ScalarType, SclAlign, 0,
2439 CostKind) +
2440 IdxCost;
2441 }
2442
2443 // Extracting an i64 element on a target with XLEN=32 needs more instructions.
2444 if (Val->getScalarType()->isIntegerTy() &&
2445 ST->getXLen() < Val->getScalarSizeInBits()) {
2446 // For extractelement, we need the following instructions:
2447 // vsetivli zero, 1, e64, m1, ta, mu (not count)
2448 // vslidedown.vx v8, v8, a0
2449 // vmv.x.s a0, v8
2450 // li a1, 32
2451 // vsrl.vx v8, v8, a1
2452 // vmv.x.s a1, v8
2453
2454 // For insertelement, we need the following instructions:
2455 // vsetivli zero, 2, e32, m4, ta, mu (not count)
2456 // vmv.v.i v12, 0
2457 // vslide1up.vx v16, v12, a1
2458 // vslide1up.vx v12, v16, a0
2459 // addi a0, a2, 1
2460 // vsetvli zero, a0, e64, m4, tu, mu (not count)
2461 // vslideup.vx v8, v12, a2
2462
2463 // TODO: should we count these special vsetvlis?
2464 BaseCost = Opcode == Instruction::InsertElement ? 3 : 4;
2465 }
2466 return BaseCost + SlideCost;
2467}
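// Worked example (assuming no XRivosVisni and a known constant index):
// extractelement <4 x i32> %v, i32 2 costs BaseCost(1) + SlideCost(1), i.e.
// a vslidedown.vi plus a vmv.x.s, while extracting lane 0 skips the slide
// and costs 1.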
2468
2472 unsigned Index) const {
2473 if (isa<FixedVectorType>(Val))
2475 Index);
2476
2477 // TODO: This code replicates what LoopVectorize.cpp used to do when asking
2478 // for the cost of extracting the last lane of a scalable vector. It probably
2479 // needs a more accurate cost.
2480 ElementCount EC = cast<VectorType>(Val)->getElementCount();
2481 assert(Index < EC.getKnownMinValue() && "Unexpected reverse index");
2482 return getVectorInstrCost(Opcode, Val, CostKind,
2483 EC.getKnownMinValue() - 1 - Index, nullptr,
2484 nullptr);
2485}
2486
2487InstructionCost RISCVTTIImpl::getArithmeticInstrCost(
2488 unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
2489 TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
2490 ArrayRef<const Value *> Args, const Instruction *CxtI) const {
2491
2492 // TODO: Handle more cost kinds.
2493 if (CostKind != TTI::TCK_RecipThroughput)
2494 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2495 Args, CxtI);
2496
2497 if (isa<FixedVectorType>(Ty) && !ST->useRVVForFixedLengthVectors())
2498 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2499 Args, CxtI);
2500
2501 // Skip if scalar size of Ty is bigger than ELEN.
2502 if (isa<VectorType>(Ty) && Ty->getScalarSizeInBits() > ST->getELen())
2503 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2504 Args, CxtI);
2505
2506 // Legalize the type.
2507 std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(Ty);
2508
2509 // TODO: Handle scalar type.
2510 if (!LT.second.isVector())
2511 return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2512 Args, CxtI);
2513
2514 // f16 with zvfhmin and bf16 will be promoted to f32.
2515 // FIXME: nxv32[b]f16 will be custom lowered and split.
2516 unsigned ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
2517 InstructionCost CastCost = 0;
2518 if ((LT.second.getVectorElementType() == MVT::f16 ||
2519 LT.second.getVectorElementType() == MVT::bf16) &&
2520 TLI->getOperationAction(ISDOpcode, LT.second) ==
2521 TargetLoweringBase::LegalizeAction::Promote) {
2522 MVT PromotedVT = TLI->getTypeToPromoteTo(ISDOpcode, LT.second);
2523 Type *PromotedTy = EVT(PromotedVT).getTypeForEVT(Ty->getContext());
2524 Type *LegalTy = EVT(LT.second).getTypeForEVT(Ty->getContext());
2525 // Add cost of extending arguments
2526 CastCost += LT.first * Args.size() *
2527 getCastInstrCost(Instruction::FPExt, PromotedTy, LegalTy,
2528 TTI::CastContextHint::None, CostKind);
2529 // Add cost of truncating result
2530 CastCost +=
2531 LT.first * getCastInstrCost(Instruction::FPTrunc, LegalTy, PromotedTy,
2532 TTI::CastContextHint::None, CostKind);
2533 // Compute cost of op in promoted type
2534 LT.second = PromotedVT;
2535 }
2536
2537 auto getConstantMatCost =
2538 [&](unsigned Operand, TTI::OperandValueInfo OpInfo) -> InstructionCost {
2539 if (OpInfo.isUniform() && canSplatOperand(Opcode, Operand))
2540 // Two sub-cases:
2541 // * Has a 5 bit immediate operand which can be splatted.
2542 // * Has a larger immediate which must be materialized in scalar register
2543 // We return 0 for both as we currently ignore the cost of materializing
2544 // scalar constants in GPRs.
2545 return 0;
2546
2547 return getConstantPoolLoadCost(Ty, CostKind);
2548 };
2549
2550 // Add the cost of materializing any constant vectors required.
2551 InstructionCost ConstantMatCost = 0;
2552 if (Op1Info.isConstant())
2553 ConstantMatCost += getConstantMatCost(0, Op1Info);
2554 if (Op2Info.isConstant())
2555 ConstantMatCost += getConstantMatCost(1, Op2Info);
2556
2557 unsigned Op;
2558 switch (ISDOpcode) {
2559 case ISD::ADD:
2560 case ISD::SUB:
2561 Op = RISCV::VADD_VV;
2562 break;
2563 case ISD::SHL:
2564 case ISD::SRL:
2565 case ISD::SRA:
2566 Op = RISCV::VSLL_VV;
2567 break;
2568 case ISD::AND:
2569 case ISD::OR:
2570 case ISD::XOR:
2571 Op = (Ty->getScalarSizeInBits() == 1) ? RISCV::VMAND_MM : RISCV::VAND_VV;
2572 break;
2573 case ISD::MUL:
2574 case ISD::MULHS:
2575 case ISD::MULHU:
2576 Op = RISCV::VMUL_VV;
2577 break;
2578 case ISD::SDIV:
2579 case ISD::UDIV:
2580 Op = RISCV::VDIV_VV;
2581 break;
2582 case ISD::SREM:
2583 case ISD::UREM:
2584 Op = RISCV::VREM_VV;
2585 break;
2586 case ISD::FADD:
2587 case ISD::FSUB:
2588 Op = RISCV::VFADD_VV;
2589 break;
2590 case ISD::FMUL:
2591 Op = RISCV::VFMUL_VV;
2592 break;
2593 case ISD::FDIV:
2594 Op = RISCV::VFDIV_VV;
2595 break;
2596 case ISD::FNEG:
2597 Op = RISCV::VFSGNJN_VV;
2598 break;
2599 default:
2600 // Assuming all other instructions have the same cost until a need arises to
2601 // differentiate them.
2602 return CastCost + ConstantMatCost +
2603 BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
2604 Args, CxtI);
2605 }
2606
2607 InstructionCost InstrCost = getRISCVInstructionCost(Op, LT.second, CostKind);
2608 // We use BasicTTIImpl to calculate scalar costs, which assumes floating point
2609 // ops are twice as expensive as integer ops. Do the same for vectors so
2610 // scalar floating point ops aren't cheaper than their vector equivalents.
2611 if (Ty->isFPOrFPVectorTy())
2612 InstrCost *= 2;
2613 return CastCost + ConstantMatCost + LT.first * InstrCost;
2614}
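// Illustrative example: an fadd on <vscale x 4 x half> with only Zvfhmin is
// costed in the promoted f32 type: fpext costs for both operands and an
// fptrunc cost for the result are added as CastCost, and the vfadd.vv term
// is doubled to match the 2x floating-point convention described above.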
2615
2616// TODO: Deduplicate from TargetTransformInfoImplCRTPBase.
2617InstructionCost RISCVTTIImpl::getPointersChainCost(
2618 ArrayRef<const Value *> Ptrs, const Value *Base,
2619 const TTI::PointersChainInfo &Info, Type *AccessTy,
2620 TTI::TargetCostKind CostKind) const {
2621 InstructionCost Cost = TTI::TCC_Free;
2622 // In the basic model we take into account GEP instructions only
2623 // (although here can come alloca instruction, a value, constants and/or
2624 // constant expressions, PHIs, bitcasts ... whatever allowed to be used as a
2625 // pointer). Typically, if Base is not a GEP instruction and all the
2626 // pointers are relative to the same base address, all the rest are
2627 // either GEP instructions, PHIs, bitcasts or constants. When we have the
2628 // same base, we just calculate the cost of each non-Base GEP as an ADD
2629 // operation if any of its indices is a non-constant.
2630 // If there are no known dependencies between the pointers, the cost is
2631 // calculated as a sum of the costs of the GEP instructions.
2632 for (auto [I, V] : enumerate(Ptrs)) {
2633 const auto *GEP = dyn_cast<GetElementPtrInst>(V);
2634 if (!GEP)
2635 continue;
2636 if (Info.isSameBase() && V != Base) {
2637 if (GEP->hasAllConstantIndices())
2638 continue;
2639 // If the chain is unit-stride and BaseReg + stride*i is a legal
2640 // addressing mode, then presume the base GEP is sitting around in a
2641 // register somewhere and check if we can fold the offset relative to
2642 // it.
2643 unsigned Stride = DL.getTypeStoreSize(AccessTy);
2644 if (Info.isUnitStride() &&
2645 isLegalAddressingMode(AccessTy,
2646 /* BaseGV */ nullptr,
2647 /* BaseOffset */ Stride * I,
2648 /* HasBaseReg */ true,
2649 /* Scale */ 0,
2650 GEP->getType()->getPointerAddressSpace()))
2651 continue;
2652 Cost += getArithmeticInstrCost(Instruction::Add, GEP->getType(), CostKind,
2653 {TTI::OK_AnyValue, TTI::OP_None},
2654 {TTI::OK_AnyValue, TTI::OP_None}, {});
2655 } else {
2656 SmallVector<const Value *> Indices(GEP->indices());
2657 Cost += getGEPCost(GEP->getSourceElementType(), GEP->getPointerOperand(),
2658 Indices, AccessTy, CostKind);
2659 }
2660 }
2661 return Cost;
2662}
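// Illustrative example: for a same-base, unit-stride chain of i32 accesses
// (p, p+4, p+8, ...) whose GEPs have constant indices or fold into the
// BaseReg+offset addressing mode, every non-base GEP is skipped above and
// the chain contributes no additional cost.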
2663
2664void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
2665 TTI::UnrollingPreferences &UP,
2666 OptimizationRemarkEmitter *ORE) const {
2667 // TODO: More tuning on benchmarks and metrics with changes as needed
2668 // would apply to all settings below to enable performance.
2669
2670
2671 if (ST->enableDefaultUnroll())
2672 return BasicTTIImplBase::getUnrollingPreferences(L, SE, UP, ORE);
2673
2674 // Enable Upper bound unrolling universally, not dependent upon the conditions
2675 // below.
2676 UP.UpperBound = true;
2677
2678 // Disable loop unrolling for Oz and Os.
2679 UP.OptSizeThreshold = 0;
2680 UP.PartialOptSizeThreshold = 0;
2681 if (L->getHeader()->getParent()->hasOptSize())
2682 return;
2683
2684 SmallVector<BasicBlock *, 4> ExitingBlocks;
2685 L->getExitingBlocks(ExitingBlocks);
2686 LLVM_DEBUG(dbgs() << "Loop has:\n"
2687 << "Blocks: " << L->getNumBlocks() << "\n"
2688 << "Exit blocks: " << ExitingBlocks.size() << "\n");
2689
2690 // Only allow another exit other than the latch. This acts as an early exit
2691 // as it mirrors the profitability calculation of the runtime unroller.
2692 if (ExitingBlocks.size() > 2)
2693 return;
2694
2695 // Limit the CFG of the loop body for targets with a branch predictor.
2696 // Allowing 4 blocks permits if-then-else diamonds in the body.
2697 if (L->getNumBlocks() > 4)
2698 return;
2699
2700 // Scan the loop: don't unroll loops with calls as this could prevent
2701 // inlining. Don't unroll auto-vectorized loops either, though do allow
2702 // unrolling of the scalar remainder.
2703 bool IsVectorized = getBooleanLoopAttribute(L, "llvm.loop.isvectorized");
2704 InstructionCost Cost = 0;
2705 for (auto *BB : L->getBlocks()) {
2706 for (auto &I : *BB) {
2707 // Both auto-vectorized loops and the scalar remainder have the
2708 // isvectorized attribute, so differentiate between them by the presence
2709 // of vector instructions.
2710 if (IsVectorized && I.getType()->isVectorTy())
2711 return;
2712
2713 if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
2714 if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
2715 if (!isLoweredToCall(F))
2716 continue;
2717 }
2718 return;
2719 }
2720
2721 SmallVector<const Value *> Operands(I.operand_values());
2722 Cost += getInstructionCost(&I, Operands,
2723 TargetTransformInfo::TCK_SizeAndLatency);
2724 }
2725 }
2726
2727 LLVM_DEBUG(dbgs() << "Cost of loop: " << Cost << "\n");
2728
2729 UP.Partial = true;
2730 UP.Runtime = true;
2731 UP.UnrollRemainder = true;
2732 UP.UnrollAndJam = true;
2733
2734 // Forcing the unrolling of small loops can be very useful because of the
2735 // taken-branch cost of the backedge.
2736 if (Cost < 12)
2737 UP.Force = true;
2738}
2739
2740void RISCVTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
2741 TTI::PeelingPreferences &PP) const {
2742 BaseT::getPeelingPreferences(L, SE, PP);
2743}
2744
2745bool RISCVTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
2746 MemIntrinsicInfo &Info) const {
2747 const DataLayout &DL = getDataLayout();
2748 Intrinsic::ID IID = Inst->getIntrinsicID();
2749 LLVMContext &C = Inst->getContext();
2750 bool HasMask = false;
2751
2752 auto getSegNum = [](const IntrinsicInst *II, unsigned PtrOperandNo,
2753 bool IsWrite) -> int64_t {
2754 if (auto *TarExtTy =
2755 dyn_cast<TargetExtType>(II->getArgOperand(0)->getType()))
2756 return TarExtTy->getIntParameter(0);
2757
2758 return 1;
2759 };
2760
2761 switch (IID) {
2762 case Intrinsic::riscv_vle_mask:
2763 case Intrinsic::riscv_vse_mask:
2764 case Intrinsic::riscv_vlseg2_mask:
2765 case Intrinsic::riscv_vlseg3_mask:
2766 case Intrinsic::riscv_vlseg4_mask:
2767 case Intrinsic::riscv_vlseg5_mask:
2768 case Intrinsic::riscv_vlseg6_mask:
2769 case Intrinsic::riscv_vlseg7_mask:
2770 case Intrinsic::riscv_vlseg8_mask:
2771 case Intrinsic::riscv_vsseg2_mask:
2772 case Intrinsic::riscv_vsseg3_mask:
2773 case Intrinsic::riscv_vsseg4_mask:
2774 case Intrinsic::riscv_vsseg5_mask:
2775 case Intrinsic::riscv_vsseg6_mask:
2776 case Intrinsic::riscv_vsseg7_mask:
2777 case Intrinsic::riscv_vsseg8_mask:
2778 HasMask = true;
2779 [[fallthrough]];
2780 case Intrinsic::riscv_vle:
2781 case Intrinsic::riscv_vse:
2782 case Intrinsic::riscv_vlseg2:
2783 case Intrinsic::riscv_vlseg3:
2784 case Intrinsic::riscv_vlseg4:
2785 case Intrinsic::riscv_vlseg5:
2786 case Intrinsic::riscv_vlseg6:
2787 case Intrinsic::riscv_vlseg7:
2788 case Intrinsic::riscv_vlseg8:
2789 case Intrinsic::riscv_vsseg2:
2790 case Intrinsic::riscv_vsseg3:
2791 case Intrinsic::riscv_vsseg4:
2792 case Intrinsic::riscv_vsseg5:
2793 case Intrinsic::riscv_vsseg6:
2794 case Intrinsic::riscv_vsseg7:
2795 case Intrinsic::riscv_vsseg8: {
2796 // Intrinsic interface:
2797 // riscv_vle(merge, ptr, vl)
2798 // riscv_vle_mask(merge, ptr, mask, vl, policy)
2799 // riscv_vse(val, ptr, vl)
2800 // riscv_vse_mask(val, ptr, mask, vl, policy)
2801 // riscv_vlseg#(merge, ptr, vl, sew)
2802 // riscv_vlseg#_mask(merge, ptr, mask, vl, policy, sew)
2803 // riscv_vsseg#(val, ptr, vl, sew)
2804 // riscv_vsseg#_mask(val, ptr, mask, vl, sew)
2805 bool IsWrite = Inst->getType()->isVoidTy();
2806 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2807 // The results of segment loads are TargetExtType.
2808 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2809 unsigned SEW =
2810 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2811 ->getZExtValue();
2812 Ty = TarExtTy->getTypeParameter(0U);
2813 Ty = ScalableVectorType::get(
2814 IntegerType::get(C, SEW),
2815 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2816 }
2817 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2818 unsigned VLIndex = RVVIInfo->VLOperand;
2819 unsigned PtrOperandNo = VLIndex - 1 - HasMask;
2820 MaybeAlign Alignment =
2821 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2822 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2823 Value *Mask = ConstantInt::getTrue(MaskType);
2824 if (HasMask)
2825 Mask = Inst->getArgOperand(VLIndex - 1);
2826 Value *EVL = Inst->getArgOperand(VLIndex);
2827 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2828 // RVV uses contiguous elements as a segment.
2829 if (SegNum > 1) {
2830 unsigned ElemSize = Ty->getScalarSizeInBits();
2831 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2832 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2833 }
2834 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2835 Alignment, Mask, EVL);
2836 return true;
2837 }
2838 case Intrinsic::riscv_vlse_mask:
2839 case Intrinsic::riscv_vsse_mask:
2840 case Intrinsic::riscv_vlsseg2_mask:
2841 case Intrinsic::riscv_vlsseg3_mask:
2842 case Intrinsic::riscv_vlsseg4_mask:
2843 case Intrinsic::riscv_vlsseg5_mask:
2844 case Intrinsic::riscv_vlsseg6_mask:
2845 case Intrinsic::riscv_vlsseg7_mask:
2846 case Intrinsic::riscv_vlsseg8_mask:
2847 case Intrinsic::riscv_vssseg2_mask:
2848 case Intrinsic::riscv_vssseg3_mask:
2849 case Intrinsic::riscv_vssseg4_mask:
2850 case Intrinsic::riscv_vssseg5_mask:
2851 case Intrinsic::riscv_vssseg6_mask:
2852 case Intrinsic::riscv_vssseg7_mask:
2853 case Intrinsic::riscv_vssseg8_mask:
2854 HasMask = true;
2855 [[fallthrough]];
2856 case Intrinsic::riscv_vlse:
2857 case Intrinsic::riscv_vsse:
2858 case Intrinsic::riscv_vlsseg2:
2859 case Intrinsic::riscv_vlsseg3:
2860 case Intrinsic::riscv_vlsseg4:
2861 case Intrinsic::riscv_vlsseg5:
2862 case Intrinsic::riscv_vlsseg6:
2863 case Intrinsic::riscv_vlsseg7:
2864 case Intrinsic::riscv_vlsseg8:
2865 case Intrinsic::riscv_vssseg2:
2866 case Intrinsic::riscv_vssseg3:
2867 case Intrinsic::riscv_vssseg4:
2868 case Intrinsic::riscv_vssseg5:
2869 case Intrinsic::riscv_vssseg6:
2870 case Intrinsic::riscv_vssseg7:
2871 case Intrinsic::riscv_vssseg8: {
2872 // Intrinsic interface:
2873 // riscv_vlse(merge, ptr, stride, vl)
2874 // riscv_vlse_mask(merge, ptr, stride, mask, vl, policy)
2875 // riscv_vsse(val, ptr, stride, vl)
2876 // riscv_vsse_mask(val, ptr, stride, mask, vl, policy)
2877 // riscv_vlsseg#(merge, ptr, offset, vl, sew)
2878 // riscv_vlsseg#_mask(merge, ptr, offset, mask, vl, policy, sew)
2879 // riscv_vssseg#(val, ptr, offset, vl, sew)
2880 // riscv_vssseg#_mask(val, ptr, offset, mask, vl, sew)
2881 bool IsWrite = Inst->getType()->isVoidTy();
2882 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
2883 // The results of segment loads are TargetExtType.
2884 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
2885 unsigned SEW =
2886 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
2887 ->getZExtValue();
2888 Ty = TarExtTy->getTypeParameter(0U);
2889 Ty = ScalableVectorType::get(
2890 IntegerType::get(C, SEW),
2891 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
2892 }
2893 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
2894 unsigned VLIndex = RVVIInfo->VLOperand;
2895 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
2896 MaybeAlign Alignment =
2897 Inst->getArgOperand(PtrOperandNo)->getPointerAlignment(DL);
2898
2899 Value *Stride = Inst->getArgOperand(PtrOperandNo + 1);
2900 // Use the pointer alignment as the element alignment if the stride is a
2901 // multiple of the pointer alignment. Otherwise, the element alignment
2902 // should be the greatest common divisor of the pointer alignment and the
2903 // stride. For simplicity, just treat the elements as unaligned.
2904 unsigned PointerAlign = Alignment.valueOrOne().value();
2905 if (!isa<ConstantInt>(Stride) ||
2906 cast<ConstantInt>(Stride)->getZExtValue() % PointerAlign != 0)
2907 Alignment = Align(1);
2908
2909 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
2910 Value *Mask = ConstantInt::getTrue(MaskType);
2911 if (HasMask)
2912 Mask = Inst->getArgOperand(VLIndex - 1);
2913 Value *EVL = Inst->getArgOperand(VLIndex);
2914 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
2915 // RVV uses contiguous elements as a segment.
2916 if (SegNum > 1) {
2917 unsigned ElemSize = Ty->getScalarSizeInBits();
2918 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
2919 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
2920 }
2921 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
2922 Alignment, Mask, EVL, Stride);
2923 return true;
2924 }
2925 case Intrinsic::riscv_vloxei_mask:
2926 case Intrinsic::riscv_vluxei_mask:
2927 case Intrinsic::riscv_vsoxei_mask:
2928 case Intrinsic::riscv_vsuxei_mask:
2929 case Intrinsic::riscv_vloxseg2_mask:
2930 case Intrinsic::riscv_vloxseg3_mask:
2931 case Intrinsic::riscv_vloxseg4_mask:
2932 case Intrinsic::riscv_vloxseg5_mask:
2933 case Intrinsic::riscv_vloxseg6_mask:
2934 case Intrinsic::riscv_vloxseg7_mask:
2935 case Intrinsic::riscv_vloxseg8_mask:
2936 case Intrinsic::riscv_vluxseg2_mask:
2937 case Intrinsic::riscv_vluxseg3_mask:
2938 case Intrinsic::riscv_vluxseg4_mask:
2939 case Intrinsic::riscv_vluxseg5_mask:
2940 case Intrinsic::riscv_vluxseg6_mask:
2941 case Intrinsic::riscv_vluxseg7_mask:
2942 case Intrinsic::riscv_vluxseg8_mask:
2943 case Intrinsic::riscv_vsoxseg2_mask:
2944 case Intrinsic::riscv_vsoxseg3_mask:
2945 case Intrinsic::riscv_vsoxseg4_mask:
2946 case Intrinsic::riscv_vsoxseg5_mask:
2947 case Intrinsic::riscv_vsoxseg6_mask:
2948 case Intrinsic::riscv_vsoxseg7_mask:
2949 case Intrinsic::riscv_vsoxseg8_mask:
2950 case Intrinsic::riscv_vsuxseg2_mask:
2951 case Intrinsic::riscv_vsuxseg3_mask:
2952 case Intrinsic::riscv_vsuxseg4_mask:
2953 case Intrinsic::riscv_vsuxseg5_mask:
2954 case Intrinsic::riscv_vsuxseg6_mask:
2955 case Intrinsic::riscv_vsuxseg7_mask:
2956 case Intrinsic::riscv_vsuxseg8_mask:
2957 HasMask = true;
2958 [[fallthrough]];
2959 case Intrinsic::riscv_vloxei:
2960 case Intrinsic::riscv_vluxei:
2961 case Intrinsic::riscv_vsoxei:
2962 case Intrinsic::riscv_vsuxei:
2963 case Intrinsic::riscv_vloxseg2:
2964 case Intrinsic::riscv_vloxseg3:
2965 case Intrinsic::riscv_vloxseg4:
2966 case Intrinsic::riscv_vloxseg5:
2967 case Intrinsic::riscv_vloxseg6:
2968 case Intrinsic::riscv_vloxseg7:
2969 case Intrinsic::riscv_vloxseg8:
2970 case Intrinsic::riscv_vluxseg2:
2971 case Intrinsic::riscv_vluxseg3:
2972 case Intrinsic::riscv_vluxseg4:
2973 case Intrinsic::riscv_vluxseg5:
2974 case Intrinsic::riscv_vluxseg6:
2975 case Intrinsic::riscv_vluxseg7:
2976 case Intrinsic::riscv_vluxseg8:
2977 case Intrinsic::riscv_vsoxseg2:
2978 case Intrinsic::riscv_vsoxseg3:
2979 case Intrinsic::riscv_vsoxseg4:
2980 case Intrinsic::riscv_vsoxseg5:
2981 case Intrinsic::riscv_vsoxseg6:
2982 case Intrinsic::riscv_vsoxseg7:
2983 case Intrinsic::riscv_vsoxseg8:
2984 case Intrinsic::riscv_vsuxseg2:
2985 case Intrinsic::riscv_vsuxseg3:
2986 case Intrinsic::riscv_vsuxseg4:
2987 case Intrinsic::riscv_vsuxseg5:
2988 case Intrinsic::riscv_vsuxseg6:
2989 case Intrinsic::riscv_vsuxseg7:
2990 case Intrinsic::riscv_vsuxseg8: {
2991 // Intrinsic interface (only listed ordered version):
2992 // riscv_vloxei(merge, ptr, index, vl)
2993 // riscv_vloxei_mask(merge, ptr, index, mask, vl, policy)
2994 // riscv_vsoxei(val, ptr, index, vl)
2995 // riscv_vsoxei_mask(val, ptr, index, mask, vl, policy)
2996 // riscv_vloxseg#(merge, ptr, index, vl, sew)
2997 // riscv_vloxseg#_mask(merge, ptr, index, mask, vl, policy, sew)
2998 // riscv_vsoxseg#(val, ptr, index, vl, sew)
2999 // riscv_vsoxseg#_mask(val, ptr, index, mask, vl, sew)
3000 bool IsWrite = Inst->getType()->isVoidTy();
3001 Type *Ty = IsWrite ? Inst->getArgOperand(0)->getType() : Inst->getType();
3002 // The results of segment loads are TargetExtType.
3003 if (auto *TarExtTy = dyn_cast<TargetExtType>(Ty)) {
3004 unsigned SEW =
3005 1 << cast<ConstantInt>(Inst->getArgOperand(Inst->arg_size() - 1))
3006 ->getZExtValue();
3007 Ty = TarExtTy->getTypeParameter(0U);
3008 Ty = ScalableVectorType::get(
3009 IntegerType::get(C, SEW),
3010 cast<ScalableVectorType>(Ty)->getMinNumElements() * 8 / SEW);
3011 }
3012 const auto *RVVIInfo = RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IID);
3013 unsigned VLIndex = RVVIInfo->VLOperand;
3014 unsigned PtrOperandNo = VLIndex - 2 - HasMask;
3015 Value *Mask;
3016 if (HasMask) {
3017 Mask = Inst->getArgOperand(VLIndex - 1);
3018 } else {
3019 // Mask cannot be nullptr here: vector GEP produces <vscale x N x ptr>,
3020 // and casting that to scalar i64 triggers a vector/scalar mismatch
3021 // assertion in CreatePointerCast. Use an all-true mask so ASan lowers it
3022 // via extractelement instead.
3023 Type *MaskType = Ty->getWithNewType(Type::getInt1Ty(C));
3024 Mask = ConstantInt::getTrue(MaskType);
3025 }
3026 Value *EVL = Inst->getArgOperand(VLIndex);
3027 unsigned SegNum = getSegNum(Inst, PtrOperandNo, IsWrite);
3028 // RVV uses contiguous elements as a segment.
3029 if (SegNum > 1) {
3030 unsigned ElemSize = Ty->getScalarSizeInBits();
3031 auto *SegTy = IntegerType::get(C, ElemSize * SegNum);
3032 Ty = VectorType::get(SegTy, cast<VectorType>(Ty));
3033 }
3034 Value *OffsetOp = Inst->getArgOperand(PtrOperandNo + 1);
3035 Info.InterestingOperands.emplace_back(Inst, PtrOperandNo, IsWrite, Ty,
3036 Align(1), Mask, EVL,
3037 /* Stride */ nullptr, OffsetOp);
3038 return true;
3039 }
3040 }
3041 return false;
3042}
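// Illustrative example: for riscv.vlse.mask(merge, ptr, stride, mask, vl,
// policy) the pointer resolves to the second argument (VLOperand - 2 -
// HasMask), the element alignment is dropped to 1 whenever the stride is not
// a constant multiple of the pointer alignment, and the recorded operand
// carries the mask, EVL and stride for the instrumentation that consumes
// InterestingOperands.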
3043
3044unsigned RISCVTTIImpl::getRegUsageForType(Type *Ty) const {
3045 if (Ty->isVectorTy()) {
3046 // f16 with only zvfhmin and bf16 will be promoted to f32
3047 Type *EltTy = cast<VectorType>(Ty)->getElementType();
3048 if ((EltTy->isHalfTy() && !ST->hasVInstructionsF16()) ||
3049 EltTy->isBFloatTy())
3050 Ty = VectorType::get(Type::getFloatTy(Ty->getContext()),
3051 cast<VectorType>(Ty));
3052
3053 TypeSize Size = DL.getTypeSizeInBits(Ty);
3054 if (Size.isScalable() && ST->hasVInstructions())
3055 return divideCeil(Size.getKnownMinValue(), RISCV::RVVBitsPerBlock);
3056
3057 if (ST->useRVVForFixedLengthVectors())
3058 return divideCeil(Size, ST->getRealMinVLen());
3059 }
3060
3061 return BaseT::getRegUsageForType(Ty);
3062}
3063
3064unsigned RISCVTTIImpl::getMaximumVF(unsigned ElemWidth, unsigned Opcode) const {
3065 if (SLPMaxVF.getNumOccurrences())
3066 return SLPMaxVF;
3067
3068 // Return how many elements can fit in getRegisterBitWidth. This is the
3069 // same routine as used in LoopVectorizer. We should probably be
3070 // accounting for whether we actually have instructions with the right
3071 // lane type, but we don't have enough information to do that without
3072 // some additional plumbing which hasn't been justified yet.
3073 TypeSize RegWidth =
3074 getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector);
3075 // If no vector registers, or absurd element widths, disable
3076 // vectorization by returning 1.
3077 return std::max<unsigned>(1U, RegWidth.getFixedValue() / ElemWidth);
3078}
3079
3083
3085 return ST->enableUnalignedVectorMem();
3086}
3087
3088TTI::AddressingModeKind
3089RISCVTTIImpl::getPreferredAddressingMode(const Loop *L,
3090 ScalarEvolution *SE) const {
3091 if (ST->hasVendorXCVmem() && !ST->is64Bit())
3092 return TTI::AMK_PostIndexed;
3093
3094 return BasicTTIImplBase::getPreferredAddressingMode(L, SE);
3095}
3096
3097bool RISCVTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
3098 const TargetTransformInfo::LSRCost &C2) const {
3099 // The RISC-V-specific policy here is "instruction count first priority".
3100 // If we need to emit adds inside the loop to add up base registers, then
3101 // we need at least one extra temporary register.
3102 unsigned C1NumRegs = C1.NumRegs + (C1.NumBaseAdds != 0);
3103 unsigned C2NumRegs = C2.NumRegs + (C2.NumBaseAdds != 0);
3104 return std::tie(C1.Insns, C1NumRegs, C1.AddRecCost,
3105 C1.NumIVMuls, C1.NumBaseAdds,
3106 C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
3107 std::tie(C2.Insns, C2NumRegs, C2.AddRecCost,
3108 C2.NumIVMuls, C2.NumBaseAdds,
3109 C2.ScaleCost, C2.ImmCost, C2.SetupCost);
3110}
3111
3112bool RISCVTTIImpl::isLegalMaskedExpandLoad(Type *DataTy,
3113 Align Alignment) const {
3114 auto *VTy = dyn_cast<VectorType>(DataTy);
3115 if (!VTy || VTy->isScalableTy())
3116 return false;
3117
3118 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3119 return false;
3120
3121 // FIXME: If it is an i8 vector and the element count exceeds 256, we should
3122 // scalarize these types with LMUL >= maximum fixed-length LMUL.
3123 if (VTy->getElementType()->isIntegerTy(8))
3124 if (VTy->getElementCount().getFixedValue() > 256)
3125 return VTy->getPrimitiveSizeInBits() / ST->getRealMinVLen() <
3126 ST->getMaxLMULForFixedLengthVectors();
3127 return true;
3128}
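// Illustrative example (assuming zvl128b and the default fixed-length LMUL
// limit of 8): a <512 x i8> data type needs 4096 / 128 = 32 vector
// registers, which is not below the LMUL limit, so the operation is reported
// as not legal and will be scalarized instead.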
3129
3130bool RISCVTTIImpl::isLegalMaskedCompressStore(Type *DataTy,
3131 Align Alignment) const {
3132 auto *VTy = dyn_cast<VectorType>(DataTy);
3133 if (!VTy || VTy->isScalableTy())
3134 return false;
3135
3136 if (!isLegalMaskedLoadStore(DataTy, Alignment))
3137 return false;
3138 return true;
3139}
3140
3141/// See if \p I should be considered for address type promotion. We check if \p
3142/// I is a sext with the right type that is used in memory accesses. If it is used in a
3143/// "complex" getelementptr, we allow it to be promoted without finding other
3144/// sext instructions that sign extended the same initial value. A getelementptr
3145/// is considered as "complex" if it has more than 2 operands.
3146bool RISCVTTIImpl::shouldConsiderAddressTypePromotion(
3147 const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
3148 bool Considerable = false;
3149 AllowPromotionWithoutCommonHeader = false;
3150 if (!isa<SExtInst>(&I))
3151 return false;
3152 Type *ConsideredSExtType =
3153 Type::getInt64Ty(I.getParent()->getParent()->getContext());
3154 if (I.getType() != ConsideredSExtType)
3155 return false;
3156 // See if the sext is the one with the right type and used in at least one
3157 // GetElementPtrInst.
3158 for (const User *U : I.users()) {
3159 if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
3160 Considerable = true;
3161 // A getelementptr is considered "complex" if it has more than 2
3162 // operands. We will promote a SExt used in such a complex GEP, as we
3163 // expect some of the computation to be merged when it is done on 64 bits.
3164 if (GEPInst->getNumOperands() > 2) {
3165 AllowPromotionWithoutCommonHeader = true;
3166 break;
3167 }
3168 }
3169 }
3170 return Considerable;
3171}
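// Illustrative example of the "complex GEP" rule above, assuming the LLVM
// headers are available: a GEP with more than two operands, e.g.
//   %idx = sext i32 %i to i64
//   %p   = getelementptr [16 x i32], ptr %base, i64 %idx, i64 %j
// is enough on its own to make the feeding sext worth promoting, because the
// multi-dimensional index arithmetic can then be folded at 64 bits.
#include "llvm/IR/Instructions.h"

static bool isComplexGEPSketch(const llvm::GetElementPtrInst &GEP) {
  return GEP.getNumOperands() > 2;
}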
3172
3173bool RISCVTTIImpl::canSplatOperand(unsigned Opcode, int Operand) const {
3174 switch (Opcode) {
3175 case Instruction::Add:
3176 case Instruction::Sub:
3177 case Instruction::Mul:
3178 case Instruction::And:
3179 case Instruction::Or:
3180 case Instruction::Xor:
3181 case Instruction::FAdd:
3182 case Instruction::FSub:
3183 case Instruction::FMul:
3184 case Instruction::FDiv:
3185 case Instruction::ICmp:
3186 case Instruction::FCmp:
3187 return true;
3188 case Instruction::Shl:
3189 case Instruction::LShr:
3190 case Instruction::AShr:
3191 case Instruction::UDiv:
3192 case Instruction::SDiv:
3193 case Instruction::URem:
3194 case Instruction::SRem:
3195 case Instruction::Select:
3196 return Operand == 1;
3197 default:
3198 return false;
3199 }
3200}
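// Illustrative note on the Operand == 1 cases above: commutative arithmetic
// can take a splatted scalar on either side (vadd.vx and friends), but
// shifts, divisions, remainders and selects only accept the scalar in one
// fixed position (operand 1 at the IR level; e.g. vsll.vx vd, vs2, rs1 shifts
// the vector vs2 by the scalar rs1, never the reverse). A standalone scalar
// reference for the vsll.vx-style case:
#include <cstdint>
#include <vector>

std::vector<uint32_t> shiftAllBySplatSketch(const std::vector<uint32_t> &V,
                                            uint32_t Amount) {
  std::vector<uint32_t> R;
  R.reserve(V.size());
  for (uint32_t X : V)
    R.push_back(X << Amount); // the splat operand is the shift amount (operand 1)
  return R;
}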
3201
3202bool RISCVTTIImpl::canSplatOperand(Instruction *I, int Operand) const {
3203 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3204 return false;
3205
3206 if (canSplatOperand(I->getOpcode(), Operand))
3207 return true;
3208
3209 auto *II = dyn_cast<IntrinsicInst>(I);
3210 if (!II)
3211 return false;
3212
3213 switch (II->getIntrinsicID()) {
3214 case Intrinsic::fma:
3215 case Intrinsic::vp_fma:
3216 case Intrinsic::fmuladd:
3217 case Intrinsic::vp_fmuladd:
3218 return Operand == 0 || Operand == 1;
3219 case Intrinsic::vp_shl:
3220 case Intrinsic::vp_lshr:
3221 case Intrinsic::vp_ashr:
3222 case Intrinsic::vp_udiv:
3223 case Intrinsic::vp_sdiv:
3224 case Intrinsic::vp_urem:
3225 case Intrinsic::vp_srem:
3226 case Intrinsic::ssub_sat:
3227 case Intrinsic::vp_ssub_sat:
3228 case Intrinsic::usub_sat:
3229 case Intrinsic::vp_usub_sat:
3230 case Intrinsic::vp_select:
3231 return Operand == 1;
3232 // These intrinsics are commutative.
3233 case Intrinsic::vp_add:
3234 case Intrinsic::vp_mul:
3235 case Intrinsic::vp_and:
3236 case Intrinsic::vp_or:
3237 case Intrinsic::vp_xor:
3238 case Intrinsic::vp_fadd:
3239 case Intrinsic::vp_fmul:
3240 case Intrinsic::vp_icmp:
3241 case Intrinsic::vp_fcmp:
3242 case Intrinsic::smin:
3243 case Intrinsic::vp_smin:
3244 case Intrinsic::umin:
3245 case Intrinsic::vp_umin:
3246 case Intrinsic::smax:
3247 case Intrinsic::vp_smax:
3248 case Intrinsic::umax:
3249 case Intrinsic::vp_umax:
3250 case Intrinsic::sadd_sat:
3251 case Intrinsic::vp_sadd_sat:
3252 case Intrinsic::uadd_sat:
3253 case Intrinsic::vp_uadd_sat:
3254 // These intrinsics have 'vr' versions.
3255 case Intrinsic::vp_sub:
3256 case Intrinsic::vp_fsub:
3257 case Intrinsic::vp_fdiv:
3258 return Operand == 0 || Operand == 1;
3259 default:
3260 return false;
3261 }
3262}
3263
3264/// Check if sinking \p I's operands to I's basic block is profitable, because
3265/// the operands can be folded into a target instruction, e.g.
3266/// splats of scalars can fold into vector instructions.
3267bool RISCVTTIImpl::isProfitableToSinkOperands(
3268 Instruction *I, SmallVectorImpl<Use *> &Ops) const {
3269 using namespace llvm::PatternMatch;
3270
3271 if (I->isBitwiseLogicOp()) {
3272 if (!I->getType()->isVectorTy()) {
3273 if (ST->hasStdExtZbb() || ST->hasStdExtZbkb()) {
3274 for (auto &Op : I->operands()) {
3275 // (and/or/xor X, (not Y)) -> (andn/orn/xnor X, Y)
3276 if (match(Op.get(), m_Not(m_Value()))) {
3277 Ops.push_back(&Op);
3278 return true;
3279 }
3280 }
3281 }
3282 } else if (I->getOpcode() == Instruction::And && ST->hasStdExtZvkb()) {
3283 for (auto &Op : I->operands()) {
3284 // (and X, (not Y)) -> (vandn.vv X, Y)
3285 if (match(Op.get(), m_Not(m_Value()))) {
3286 Ops.push_back(&Op);
3287 return true;
3288 }
3289 // (and X, (splat (not Y))) -> (vandn.vx X, Y)
3291 m_ZeroInt()),
3292 m_Value(), m_ZeroMask()))) {
3293 Use &InsertElt = cast<Instruction>(Op)->getOperandUse(0);
3294 Use &Not = cast<Instruction>(InsertElt)->getOperandUse(1);
3295 Ops.push_back(&Not);
3296 Ops.push_back(&InsertElt);
3297 Ops.push_back(&Op);
3298 return true;
3299 }
3300 }
3301 }
3302 }
3303
3304 if (!I->getType()->isVectorTy() || !ST->hasVInstructions())
3305 return false;
3306
3307 // Don't sink splat operands if the target prefers not to. Some targets require
3308 // S2V (scalar-to-vector) transfer buffers and can run out of them when copying
3309 // the same value repeatedly.
3310 // FIXME: It could still be worth doing if it would improve vector register
3311 // pressure and prevent a vector spill.
3312 if (!ST->sinkSplatOperands())
3313 return false;
3314
3315 for (auto OpIdx : enumerate(I->operands())) {
3316 if (!canSplatOperand(I, OpIdx.index()))
3317 continue;
3318
3319 Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
3320 // Make sure we are not already sinking this operand
3321 if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
3322 continue;
3323
3324 // We are looking for a splat/vp.splat that can be sunk.
3325 bool IsVPSplat = match(Op, m_Intrinsic<Intrinsic::experimental_vp_splat>(
3326 m_Value(), m_Value(), m_Value()));
3327 if (!IsVPSplat &&
3328 !match(Op, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
3329 m_Value(), m_ZeroMask())))
3330 continue;
3331
3332 // Don't sink i1 splats.
3333 if (cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(1))
3334 continue;
3335
3336 // All uses of the shuffle should be sunk to avoid duplicating it across GPRs
3337 // and vector registers.
3338 for (Use &U : Op->uses()) {
3339 Instruction *Insn = cast<Instruction>(U.getUser());
3340 if (!canSplatOperand(Insn, U.getOperandNo()))
3341 return false;
3342 }
3343
3344 // Sink any fpexts since they might be used in a widening fp pattern.
3345 if (IsVPSplat) {
3346 if (isa<FPExtInst>(Op->getOperand(0)))
3347 Ops.push_back(&Op->getOperandUse(0));
3348 } else {
3349 Use *InsertEltUse = &Op->getOperandUse(0);
3350 auto *InsertElt = cast<InsertElementInst>(InsertEltUse);
3351 if (isa<FPExtInst>(InsertElt->getOperand(1)))
3352 Ops.push_back(&InsertElt->getOperandUse(1));
3353 Ops.push_back(InsertEltUse);
3354 }
3355 Ops.push_back(&OpIdx.value());
3356 }
3357 return true;
3358}
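// Minimal sketch of the splat idiom being matched above, assuming the same
// LLVM PatternMatch helpers used in this file: a scalar broadcast appears in
// IR as an insertelement into lane 0 followed by a shufflevector with an
// all-zero mask, e.g.
//   %ins   = insertelement <4 x i32> poison, i32 %s, i64 0
//   %splat = shufflevector <4 x i32> %ins, <4 x i32> poison, <4 x i32> zeroinitializer
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

static bool looksLikeSplatSketch(llvm::Value *V) {
  using namespace llvm::PatternMatch;
  return match(V, m_Shuffle(m_InsertElt(m_Value(), m_Value(), m_ZeroInt()),
                            m_Value(), m_ZeroMask()));
}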
3359
3360TTI::MemCmpExpansionOptions
3361 RISCVTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
3362 TTI::MemCmpExpansionOptions Options;
3363 // TODO: Enable expansion when unaligned access is not supported after we fix
3364 // issues in ExpandMemcmp.
3365 if (!ST->enableUnalignedScalarMem())
3366 return Options;
3367
3368 if (!ST->hasStdExtZbb() && !ST->hasStdExtZbkb() && !IsZeroCmp)
3369 return Options;
3370
3371 Options.AllowOverlappingLoads = true;
3372 Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
3373 Options.NumLoadsPerBlock = Options.MaxNumLoads;
3374 if (ST->is64Bit()) {
3375 Options.LoadSizes = {8, 4, 2, 1};
3376 Options.AllowedTailExpansions = {3, 5, 6};
3377 } else {
3378 Options.LoadSizes = {4, 2, 1};
3379 Options.AllowedTailExpansions = {3};
3380 }
3381
3382 if (IsZeroCmp && ST->hasVInstructions()) {
3383 unsigned VLenB = ST->getRealMinVLen() / 8;
3384 // The minimum size should be `XLen / 8 + 1`, and the maximum size should be
3385 // `VLenB * MaxLMUL` so that it fits in a single register group.
3386 unsigned MinSize = ST->getXLen() / 8 + 1;
3387 unsigned MaxSize = VLenB * ST->getMaxLMULForFixedLengthVectors();
3388 for (unsigned Size = MinSize; Size <= MaxSize; Size++)
3389 Options.LoadSizes.insert(Options.LoadSizes.begin(), Size);
3390 }
3391 return Options;
3392}
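// Worked example for the zero-compare sizes above under one assumed
// configuration (RV64: XLen = 64, RealMinVLen = 128, maximum fixed-length
// LMUL = 8): VLenB = 16, so every size from XLen/8 + 1 = 9 bytes up to
// 16 * 8 = 128 bytes is prepended ahead of the scalar sizes {8, 4, 2, 1},
// letting memcmp(a, b, n) == 0 with 9 <= n <= 128 use vector loads that fit
// in a single register group.
#include <vector>

std::vector<unsigned> zeroCmpLoadSizesSketch(unsigned XLen, unsigned MinVLenBits,
                                             unsigned MaxLMUL) {
  std::vector<unsigned> Sizes = {8, 4, 2, 1}; // RV64 scalar tail
  unsigned VLenB = MinVLenBits / 8;
  for (unsigned Size = XLen / 8 + 1; Size <= VLenB * MaxLMUL; ++Size)
    Sizes.insert(Sizes.begin(), Size);
  return Sizes; // {128, 127, ..., 9, 8, 4, 2, 1} for (64, 128, 8)
}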