LLVM 23.0.0git
VectorUtils.cpp
Go to the documentation of this file.
1//===----------- VectorUtils.cpp - Vectorizer utility functions -----------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines vectorizer utilities.
10//
11//===----------------------------------------------------------------------===//
12
23#include "llvm/IR/Constants.h"
25#include "llvm/IR/IRBuilder.h"
28#include "llvm/IR/Value.h"
30
31#define DEBUG_TYPE "vectorutils"
32
33using namespace llvm;
34using namespace llvm::PatternMatch;
35
36/// Maximum factor for an interleaved memory access.
38 "max-interleave-group-factor", cl::Hidden,
39 cl::desc("Maximum factor for an interleaved access group (default = 8)"),
40 cl::init(8));
41
42/// Return true if all of the intrinsic's arguments and return type are scalars
43/// for the scalar form of the intrinsic, and vectors for the vector form of the
44/// intrinsic (except operands that are marked as always being scalar by
45/// isVectorIntrinsicWithScalarOpAtArg).
47 switch (ID) {
48 case Intrinsic::abs: // Begin integer bit-manipulation.
49 case Intrinsic::bswap:
50 case Intrinsic::bitreverse:
51 case Intrinsic::ctpop:
52 case Intrinsic::ctlz:
53 case Intrinsic::cttz:
54 case Intrinsic::fshl:
55 case Intrinsic::fshr:
56 case Intrinsic::smax:
57 case Intrinsic::smin:
58 case Intrinsic::umax:
59 case Intrinsic::umin:
60 case Intrinsic::sadd_sat:
61 case Intrinsic::ssub_sat:
62 case Intrinsic::uadd_sat:
63 case Intrinsic::usub_sat:
64 case Intrinsic::smul_fix:
65 case Intrinsic::smul_fix_sat:
66 case Intrinsic::umul_fix:
67 case Intrinsic::umul_fix_sat:
68 case Intrinsic::uadd_with_overflow:
69 case Intrinsic::sadd_with_overflow:
70 case Intrinsic::usub_with_overflow:
71 case Intrinsic::ssub_with_overflow:
72 case Intrinsic::umul_with_overflow:
73 case Intrinsic::smul_with_overflow:
74 case Intrinsic::sqrt: // Begin floating-point.
75 case Intrinsic::asin:
76 case Intrinsic::acos:
77 case Intrinsic::atan:
78 case Intrinsic::atan2:
79 case Intrinsic::sin:
80 case Intrinsic::cos:
81 case Intrinsic::sincos:
82 case Intrinsic::sincospi:
83 case Intrinsic::tan:
84 case Intrinsic::sinh:
85 case Intrinsic::cosh:
86 case Intrinsic::tanh:
87 case Intrinsic::exp:
88 case Intrinsic::exp10:
89 case Intrinsic::exp2:
90 case Intrinsic::frexp:
91 case Intrinsic::ldexp:
92 case Intrinsic::log:
93 case Intrinsic::log10:
94 case Intrinsic::log2:
95 case Intrinsic::fabs:
96 case Intrinsic::minnum:
97 case Intrinsic::maxnum:
98 case Intrinsic::minimum:
99 case Intrinsic::maximum:
100 case Intrinsic::minimumnum:
101 case Intrinsic::maximumnum:
102 case Intrinsic::modf:
103 case Intrinsic::copysign:
104 case Intrinsic::floor:
105 case Intrinsic::ceil:
106 case Intrinsic::trunc:
107 case Intrinsic::rint:
108 case Intrinsic::nearbyint:
109 case Intrinsic::round:
110 case Intrinsic::roundeven:
111 case Intrinsic::pow:
112 case Intrinsic::fma:
113 case Intrinsic::fmuladd:
114 case Intrinsic::is_fpclass:
115 case Intrinsic::powi:
116 case Intrinsic::canonicalize:
117 case Intrinsic::fptosi_sat:
118 case Intrinsic::fptoui_sat:
119 case Intrinsic::lround:
120 case Intrinsic::llround:
121 case Intrinsic::lrint:
122 case Intrinsic::llrint:
123 case Intrinsic::ucmp:
124 case Intrinsic::scmp:
125 case Intrinsic::clmul:
126 return true;
127 default:
128 return false;
129 }
130}
131
138
139/// Identifies if the vector form of the intrinsic has a scalar operand.
141 unsigned ScalarOpdIdx,
142 const TargetTransformInfo *TTI) {
143
145 return TTI->isTargetIntrinsicWithScalarOpAtArg(ID, ScalarOpdIdx);
146
147 // Vector predication intrinsics have the EVL as the last operand.
148 if (VPIntrinsic::getVectorLengthParamPos(ID) == ScalarOpdIdx)
149 return true;
150
151 switch (ID) {
152 case Intrinsic::abs:
153 case Intrinsic::vp_abs:
154 case Intrinsic::ctlz:
155 case Intrinsic::vp_ctlz:
156 case Intrinsic::cttz:
157 case Intrinsic::vp_cttz:
158 case Intrinsic::is_fpclass:
159 case Intrinsic::vp_is_fpclass:
160 case Intrinsic::powi:
161 case Intrinsic::vector_extract:
162 return (ScalarOpdIdx == 1);
163 case Intrinsic::smul_fix:
164 case Intrinsic::smul_fix_sat:
165 case Intrinsic::umul_fix:
166 case Intrinsic::umul_fix_sat:
167 case Intrinsic::vector_splice_left:
168 case Intrinsic::vector_splice_right:
169 return (ScalarOpdIdx == 2);
170 case Intrinsic::experimental_vp_splice:
171 return ScalarOpdIdx == 2 || ScalarOpdIdx == 4;
172 case Intrinsic::experimental_vp_strided_load:
173 return ScalarOpdIdx == 0 || ScalarOpdIdx == 1;
174 case Intrinsic::loop_dependence_war_mask:
175 return true;
176 default:
177 return false;
178 }
179}
180
182 Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI) {
183 assert(ID != Intrinsic::not_intrinsic && "Not an intrinsic!");
184
186 return TTI->isTargetIntrinsicWithOverloadTypeAtArg(ID, OpdIdx);
187
189 return OpdIdx == -1 || OpdIdx == 0;
190
191 switch (ID) {
192 case Intrinsic::fptosi_sat:
193 case Intrinsic::fptoui_sat:
194 case Intrinsic::lround:
195 case Intrinsic::llround:
196 case Intrinsic::lrint:
197 case Intrinsic::llrint:
198 case Intrinsic::vp_lrint:
199 case Intrinsic::vp_llrint:
200 case Intrinsic::ucmp:
201 case Intrinsic::scmp:
202 case Intrinsic::vector_extract:
203 case Intrinsic::loop_dependence_war_mask:
204 return OpdIdx == -1 || OpdIdx == 0;
205 case Intrinsic::modf:
206 case Intrinsic::sincos:
207 case Intrinsic::sincospi:
208 case Intrinsic::is_fpclass:
209 case Intrinsic::vp_is_fpclass:
210 return OpdIdx == 0;
211 case Intrinsic::powi:
212 case Intrinsic::ldexp:
213 return OpdIdx == -1 || OpdIdx == 1;
214 case Intrinsic::experimental_vp_strided_load:
215 return OpdIdx == -1 || OpdIdx == 0 || OpdIdx == 1;
216 default:
217 return OpdIdx == -1;
218 }
219}
220
222 Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI) {
223
225 return TTI->isTargetIntrinsicWithStructReturnOverloadAtField(ID, RetIdx);
226
227 switch (ID) {
228 case Intrinsic::frexp:
229 return RetIdx == 0 || RetIdx == 1;
230 default:
231 return RetIdx == 0;
232 }
233}
234
235/// Returns intrinsic ID for call.
236/// For the input call instruction it finds mapping intrinsic and returns
237/// its ID, in case it does not found it return not_intrinsic.
239 const TargetLibraryInfo *TLI) {
243
244 if (isTriviallyVectorizable(ID) || ID == Intrinsic::lifetime_start ||
245 ID == Intrinsic::lifetime_end || ID == Intrinsic::assume ||
246 ID == Intrinsic::experimental_noalias_scope_decl ||
247 ID == Intrinsic::sideeffect || ID == Intrinsic::pseudoprobe)
248 return ID;
250}
251
253 switch (ID) {
254 case Intrinsic::vector_interleave2:
255 return 2;
256 case Intrinsic::vector_interleave3:
257 return 3;
258 case Intrinsic::vector_interleave4:
259 return 4;
260 case Intrinsic::vector_interleave5:
261 return 5;
262 case Intrinsic::vector_interleave6:
263 return 6;
264 case Intrinsic::vector_interleave7:
265 return 7;
266 case Intrinsic::vector_interleave8:
267 return 8;
268 default:
269 return 0;
270 }
271}
272
274 switch (ID) {
275 case Intrinsic::vector_deinterleave2:
276 return 2;
277 case Intrinsic::vector_deinterleave3:
278 return 3;
279 case Intrinsic::vector_deinterleave4:
280 return 4;
281 case Intrinsic::vector_deinterleave5:
282 return 5;
283 case Intrinsic::vector_deinterleave6:
284 return 6;
285 case Intrinsic::vector_deinterleave7:
286 return 7;
287 case Intrinsic::vector_deinterleave8:
288 return 8;
289 default:
290 return 0;
291 }
292}
293
295 [[maybe_unused]] unsigned Factor =
297 ArrayRef<Type *> DISubtypes = DI->getType()->subtypes();
298 assert(Factor && Factor == DISubtypes.size() &&
299 "unexpected deinterleave factor or result type");
300 return cast<VectorType>(DISubtypes[0]);
301}
302
303/// Given a vector and an element number, see if the scalar value is
304/// already around as a register, for example if it were inserted then extracted
305/// from the vector.
306Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
307 assert(V->getType()->isVectorTy() && "Not looking at a vector?");
308 VectorType *VTy = cast<VectorType>(V->getType());
309 // For fixed-length vector, return poison for out of range access.
310 if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
311 unsigned Width = FVTy->getNumElements();
312 if (EltNo >= Width)
313 return PoisonValue::get(FVTy->getElementType());
314 }
315
316 if (Constant *C = dyn_cast<Constant>(V))
317 return C->getAggregateElement(EltNo);
318
320 // If this is an insert to a variable element, we don't know what it is.
321 uint64_t IIElt;
322 if (!match(III->getOperand(2), m_ConstantInt(IIElt)))
323 return nullptr;
324
325 // If this is an insert to the element we are looking for, return the
326 // inserted value.
327 if (EltNo == IIElt)
328 return III->getOperand(1);
329
330 // Guard against infinite loop on malformed, unreachable IR.
331 if (III == III->getOperand(0))
332 return nullptr;
333
334 // Otherwise, the insertelement doesn't modify the value, recurse on its
335 // vector input.
336 return findScalarElement(III->getOperand(0), EltNo);
337 }
338
340 // Restrict the following transformation to fixed-length vector.
341 if (SVI && isa<FixedVectorType>(SVI->getType())) {
342 unsigned LHSWidth =
343 cast<FixedVectorType>(SVI->getOperand(0)->getType())->getNumElements();
344 int InEl = SVI->getMaskValue(EltNo);
345 if (InEl < 0)
346 return PoisonValue::get(VTy->getElementType());
347 if (InEl < (int)LHSWidth)
348 return findScalarElement(SVI->getOperand(0), InEl);
349 return findScalarElement(SVI->getOperand(1), InEl - LHSWidth);
350 }
351
352 // Extract a value from a vector add operation with a constant zero.
353 // TODO: Use getBinOpIdentity() to generalize this.
354 Value *Val; Constant *C;
355 if (match(V, m_Add(m_Value(Val), m_Constant(C))))
356 if (Constant *Elt = C->getAggregateElement(EltNo))
357 if (Elt->isNullValue())
358 return findScalarElement(Val, EltNo);
359
360 // If the vector is a splat then we can trivially find the scalar element.
362 if (Value *Splat = getSplatValue(V))
363 if (EltNo < VTy->getElementCount().getKnownMinValue())
364 return Splat;
365
366 // Otherwise, we don't know.
367 return nullptr;
368}
369
371 int SplatIndex = -1;
372 for (int M : Mask) {
373 // Ignore invalid (undefined) mask elements.
374 if (M < 0)
375 continue;
376
377 // There can be only 1 non-negative mask element value if this is a splat.
378 if (SplatIndex != -1 && SplatIndex != M)
379 return -1;
380
381 // Initialize the splat index to the 1st non-negative mask element.
382 SplatIndex = M;
383 }
384 assert((SplatIndex == -1 || SplatIndex >= 0) && "Negative index?");
385 return SplatIndex;
386}
387
388/// Get splat value if the input is a splat vector or return nullptr.
389/// This function is not fully general. It checks only 2 cases:
390/// the input value is (1) a splat constant vector or (2) a sequence
391/// of instructions that broadcasts a scalar at element 0.
393 if (isa<VectorType>(V->getType()))
394 if (auto *C = dyn_cast<Constant>(V))
395 return C->getSplatValue();
396
397 // shuf (inselt ?, Splat, 0), ?, <0, undef, 0, ...>
398 Value *Splat;
399 if (match(V,
401 m_Value(), m_ZeroMask())))
402 return Splat;
403
404 return nullptr;
405}
406
407bool llvm::isSplatValue(const Value *V, int Index, unsigned Depth) {
408 assert(Depth <= MaxAnalysisRecursionDepth && "Limit Search Depth");
409
410 if (isa<VectorType>(V->getType())) {
411 if (isa<UndefValue>(V))
412 return true;
413 // FIXME: We can allow undefs, but if Index was specified, we may want to
414 // check that the constant is defined at that index.
415 if (auto *C = dyn_cast<Constant>(V))
416 return C->getSplatValue() != nullptr;
417 }
418
419 if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) {
420 // FIXME: We can safely allow undefs here. If Index was specified, we will
421 // check that the mask elt is defined at the required index.
422 if (!all_equal(Shuf->getShuffleMask()))
423 return false;
424
425 // Match any index.
426 if (Index == -1)
427 return true;
428
429 // Match a specific element. The mask should be defined at and match the
430 // specified index.
431 return Shuf->getMaskValue(Index) == Index;
432 }
433
434 // The remaining tests are all recursive, so bail out if we hit the limit.
436 return false;
437
438 // If both operands of a binop are splats, the result is a splat.
439 Value *X, *Y, *Z;
440 if (match(V, m_BinOp(m_Value(X), m_Value(Y))))
441 return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth);
442
443 // If all operands of a select are splats, the result is a splat.
444 if (match(V, m_Select(m_Value(X), m_Value(Y), m_Value(Z))))
445 return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth) &&
446 isSplatValue(Z, Index, Depth);
447
448 // TODO: Add support for unary ops (fneg), casts, intrinsics (overflow ops).
449
450 return false;
451}
452
454 const APInt &DemandedElts, APInt &DemandedLHS,
455 APInt &DemandedRHS, bool AllowUndefElts) {
456 DemandedLHS = DemandedRHS = APInt::getZero(SrcWidth);
457
458 // Early out if we don't demand any elements.
459 if (DemandedElts.isZero())
460 return true;
461
462 // Simple case of a shuffle with zeroinitializer.
463 if (all_of(Mask, equal_to(0))) {
464 DemandedLHS.setBit(0);
465 return true;
466 }
467
468 for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
469 int M = Mask[I];
470 assert((-1 <= M) && (M < (SrcWidth * 2)) &&
471 "Invalid shuffle mask constant");
472
473 if (!DemandedElts[I] || (AllowUndefElts && (M < 0)))
474 continue;
475
476 // For undef elements, we don't know anything about the common state of
477 // the shuffle result.
478 if (M < 0)
479 return false;
480
481 if (M < SrcWidth)
482 DemandedLHS.setBit(M);
483 else
484 DemandedRHS.setBit(M - SrcWidth);
485 }
486
487 return true;
488}
489
491 std::array<std::pair<int, int>, 2> &SrcInfo) {
492 const int SignalValue = NumElts * 2;
493 SrcInfo[0] = {-1, SignalValue};
494 SrcInfo[1] = {-1, SignalValue};
495 for (auto [i, M] : enumerate(Mask)) {
496 if (M < 0)
497 continue;
498 int Src = M >= NumElts;
499 int Diff = (int)i - (M % NumElts);
500 bool Match = false;
501 for (int j = 0; j < 2; j++) {
502 auto &[SrcE, DiffE] = SrcInfo[j];
503 if (SrcE == -1) {
504 assert(DiffE == SignalValue);
505 SrcE = Src;
506 DiffE = Diff;
507 }
508 if (SrcE == Src && DiffE == Diff) {
509 Match = true;
510 break;
511 }
512 }
513 if (!Match)
514 return false;
515 }
516 // Avoid all undef masks
517 return SrcInfo[0].first != -1;
518}
519
521 SmallVectorImpl<int> &ScaledMask) {
522 assert(Scale > 0 && "Unexpected scaling factor");
523
524 // Fast-path: if no scaling, then it is just a copy.
525 if (Scale == 1) {
526 ScaledMask.assign(Mask.begin(), Mask.end());
527 return;
528 }
529
530 ScaledMask.clear();
531 for (int MaskElt : Mask) {
532 if (MaskElt >= 0) {
533 assert(((uint64_t)Scale * MaskElt + (Scale - 1)) <= INT32_MAX &&
534 "Overflowed 32-bits");
535 }
536 for (int SliceElt = 0; SliceElt != Scale; ++SliceElt)
537 ScaledMask.push_back(MaskElt < 0 ? MaskElt : Scale * MaskElt + SliceElt);
538 }
539}
540
542 SmallVectorImpl<int> &ScaledMask) {
543 assert(Scale > 0 && "Unexpected scaling factor");
544
545 // Fast-path: if no scaling, then it is just a copy.
546 if (Scale == 1) {
547 ScaledMask.assign(Mask.begin(), Mask.end());
548 return true;
549 }
550
551 // We must map the original elements down evenly to a type with less elements.
552 int NumElts = Mask.size();
553 if (NumElts % Scale != 0)
554 return false;
555
556 ScaledMask.clear();
557 ScaledMask.reserve(NumElts / Scale);
558
559 // Step through the input mask by splitting into Scale-sized slices.
560 do {
561 ArrayRef<int> MaskSlice = Mask.take_front(Scale);
562 assert((int)MaskSlice.size() == Scale && "Expected Scale-sized slice.");
563
564 // The first element of the slice determines how we evaluate this slice.
565 int SliceFront = MaskSlice.front();
566 if (SliceFront < 0) {
567 // Negative values (undef or other "sentinel" values) must be equal across
568 // the entire slice.
569 if (!all_equal(MaskSlice))
570 return false;
571 ScaledMask.push_back(SliceFront);
572 } else {
573 // A positive mask element must be cleanly divisible.
574 if (SliceFront % Scale != 0)
575 return false;
576 // Elements of the slice must be consecutive.
577 for (int i = 1; i < Scale; ++i)
578 if (MaskSlice[i] != SliceFront + i)
579 return false;
580 ScaledMask.push_back(SliceFront / Scale);
581 }
582 Mask = Mask.drop_front(Scale);
583 } while (!Mask.empty());
584
585 assert((int)ScaledMask.size() * Scale == NumElts && "Unexpected scaled mask");
586
587 // All elements of the original mask can be scaled down to map to the elements
588 // of a mask with wider elements.
589 return true;
590}
591
593 SmallVectorImpl<int> &NewMask) {
594 unsigned NumElts = M.size();
595 if (NumElts % 2 != 0)
596 return false;
597
598 NewMask.clear();
599 for (unsigned i = 0; i < NumElts; i += 2) {
600 int M0 = M[i];
601 int M1 = M[i + 1];
602
603 // If both elements are undef, new mask is undef too.
604 if (M0 == -1 && M1 == -1) {
605 NewMask.push_back(-1);
606 continue;
607 }
608
609 if (M0 == -1 && M1 != -1 && (M1 % 2) == 1) {
610 NewMask.push_back(M1 / 2);
611 continue;
612 }
613
614 if (M0 != -1 && (M0 % 2) == 0 && ((M0 + 1) == M1 || M1 == -1)) {
615 NewMask.push_back(M0 / 2);
616 continue;
617 }
618
619 NewMask.clear();
620 return false;
621 }
622
623 assert(NewMask.size() == NumElts / 2 && "Incorrect size for mask!");
624 return true;
625}
626
627bool llvm::scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef<int> Mask,
628 SmallVectorImpl<int> &ScaledMask) {
629 unsigned NumSrcElts = Mask.size();
630 assert(NumSrcElts > 0 && NumDstElts > 0 && "Unexpected scaling factor");
631
632 // Fast-path: if no scaling, then it is just a copy.
633 if (NumSrcElts == NumDstElts) {
634 ScaledMask.assign(Mask.begin(), Mask.end());
635 return true;
636 }
637
638 // Ensure we can find a whole scale factor.
639 assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
640 "Unexpected scaling factor");
641
642 if (NumSrcElts > NumDstElts) {
643 int Scale = NumSrcElts / NumDstElts;
644 return widenShuffleMaskElts(Scale, Mask, ScaledMask);
645 }
646
647 int Scale = NumDstElts / NumSrcElts;
648 narrowShuffleMaskElts(Scale, Mask, ScaledMask);
649 return true;
650}
651
653 SmallVectorImpl<int> &ScaledMask) {
654 std::array<SmallVector<int, 16>, 2> TmpMasks;
655 SmallVectorImpl<int> *Output = &TmpMasks[0], *Tmp = &TmpMasks[1];
656 ArrayRef<int> InputMask = Mask;
657 for (unsigned Scale = 2; Scale <= InputMask.size(); ++Scale) {
658 while (widenShuffleMaskElts(Scale, InputMask, *Output)) {
659 InputMask = *Output;
660 std::swap(Output, Tmp);
661 }
662 }
663 ScaledMask.assign(InputMask.begin(), InputMask.end());
664}
665
667 ArrayRef<int> Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs,
668 unsigned NumOfUsedRegs, function_ref<void()> NoInputAction,
669 function_ref<void(ArrayRef<int>, unsigned, unsigned)> SingleInputAction,
670 function_ref<void(ArrayRef<int>, unsigned, unsigned, bool)>
671 ManyInputsAction) {
672 SmallVector<SmallVector<SmallVector<int>>> Res(NumOfDestRegs);
673 // Try to perform better estimation of the permutation.
674 // 1. Split the source/destination vectors into real registers.
675 // 2. Do the mask analysis to identify which real registers are
676 // permuted.
677 int Sz = Mask.size();
678 unsigned SzDest = Sz / NumOfDestRegs;
679 unsigned SzSrc = Sz / NumOfSrcRegs;
680 for (unsigned I = 0; I < NumOfDestRegs; ++I) {
681 auto &RegMasks = Res[I];
682 RegMasks.assign(2 * NumOfSrcRegs, {});
683 // Check that the values in dest registers are in the one src
684 // register.
685 for (unsigned K = 0; K < SzDest; ++K) {
686 int Idx = I * SzDest + K;
687 if (Idx == Sz)
688 break;
689 if (Mask[Idx] >= 2 * Sz || Mask[Idx] == PoisonMaskElem)
690 continue;
691 int MaskIdx = Mask[Idx] % Sz;
692 int SrcRegIdx = MaskIdx / SzSrc + (Mask[Idx] >= Sz ? NumOfSrcRegs : 0);
693 // Add a cost of PermuteTwoSrc for each new source register permute,
694 // if we have more than one source registers.
695 if (RegMasks[SrcRegIdx].empty())
696 RegMasks[SrcRegIdx].assign(SzDest, PoisonMaskElem);
697 RegMasks[SrcRegIdx][K] = MaskIdx % SzSrc;
698 }
699 }
700 // Process split mask.
701 for (unsigned I : seq<unsigned>(NumOfUsedRegs)) {
702 auto &Dest = Res[I];
703 int NumSrcRegs =
704 count_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
705 switch (NumSrcRegs) {
706 case 0:
707 // No input vectors were used!
708 NoInputAction();
709 break;
710 case 1: {
711 // Find the only mask with at least single undef mask elem.
712 auto *It =
713 find_if(Dest, [](ArrayRef<int> Mask) { return !Mask.empty(); });
714 unsigned SrcReg = std::distance(Dest.begin(), It);
715 SingleInputAction(*It, SrcReg, I);
716 break;
717 }
718 default: {
719 // The first mask is a permutation of a single register. Since we have >2
720 // input registers to shuffle, we merge the masks for 2 first registers
721 // and generate a shuffle of 2 registers rather than the reordering of the
722 // first register and then shuffle with the second register. Next,
723 // generate the shuffles of the resulting register + the remaining
724 // registers from the list.
725 auto &&CombineMasks = [](MutableArrayRef<int> FirstMask,
726 ArrayRef<int> SecondMask) {
727 for (int Idx = 0, VF = FirstMask.size(); Idx < VF; ++Idx) {
728 if (SecondMask[Idx] != PoisonMaskElem) {
729 assert(FirstMask[Idx] == PoisonMaskElem &&
730 "Expected undefined mask element.");
731 FirstMask[Idx] = SecondMask[Idx] + VF;
732 }
733 }
734 };
735 auto &&NormalizeMask = [](MutableArrayRef<int> Mask) {
736 for (int Idx = 0, VF = Mask.size(); Idx < VF; ++Idx) {
737 if (Mask[Idx] != PoisonMaskElem)
738 Mask[Idx] = Idx;
739 }
740 };
741 int SecondIdx;
742 bool NewReg = true;
743 do {
744 int FirstIdx = -1;
745 SecondIdx = -1;
746 MutableArrayRef<int> FirstMask, SecondMask;
747 for (unsigned I : seq<unsigned>(2 * NumOfSrcRegs)) {
748 SmallVectorImpl<int> &RegMask = Dest[I];
749 if (RegMask.empty())
750 continue;
751
752 if (FirstIdx == SecondIdx) {
753 FirstIdx = I;
754 FirstMask = RegMask;
755 continue;
756 }
757 SecondIdx = I;
758 SecondMask = RegMask;
759 CombineMasks(FirstMask, SecondMask);
760 ManyInputsAction(FirstMask, FirstIdx, SecondIdx, NewReg);
761 NewReg = false;
762 NormalizeMask(FirstMask);
763 RegMask.clear();
764 SecondMask = FirstMask;
765 SecondIdx = FirstIdx;
766 }
767 if (FirstIdx != SecondIdx && SecondIdx >= 0) {
768 CombineMasks(SecondMask, FirstMask);
769 ManyInputsAction(SecondMask, SecondIdx, FirstIdx, NewReg);
770 NewReg = false;
771 Dest[FirstIdx].clear();
772 NormalizeMask(SecondMask);
773 }
774 } while (SecondIdx >= 0);
775 break;
776 }
777 }
778 }
779}
780
781void llvm::getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth,
782 const APInt &DemandedElts,
783 APInt &DemandedLHS,
784 APInt &DemandedRHS) {
785 assert(VectorBitWidth >= 128 && "Vectors smaller than 128 bit not supported");
786 int NumLanes = VectorBitWidth / 128;
787 int NumElts = DemandedElts.getBitWidth();
788 int NumEltsPerLane = NumElts / NumLanes;
789 int HalfEltsPerLane = NumEltsPerLane / 2;
790
791 DemandedLHS = APInt::getZero(NumElts);
792 DemandedRHS = APInt::getZero(NumElts);
793
794 // Map DemandedElts to the horizontal operands.
795 for (int Idx = 0; Idx != NumElts; ++Idx) {
796 if (!DemandedElts[Idx])
797 continue;
798 int LaneIdx = (Idx / NumEltsPerLane) * NumEltsPerLane;
799 int LocalIdx = Idx % NumEltsPerLane;
800 if (LocalIdx < HalfEltsPerLane) {
801 DemandedLHS.setBit(LaneIdx + 2 * LocalIdx);
802 } else {
803 LocalIdx -= HalfEltsPerLane;
804 DemandedRHS.setBit(LaneIdx + 2 * LocalIdx);
805 }
806 }
807}
808
811 const TargetTransformInfo *TTI) {
812
813 // DemandedBits will give us every value's live-out bits. But we want
814 // to ensure no extra casts would need to be inserted, so every DAG
815 // of connected values must have the same minimum bitwidth.
821 SmallPtrSet<Instruction *, 4> InstructionSet;
823
824 // Determine the roots. We work bottom-up, from truncs or icmps.
825 bool SeenExtFromIllegalType = false;
826 for (auto *BB : Blocks)
827 for (auto &I : *BB) {
828 InstructionSet.insert(&I);
829
830 if (TTI && (isa<ZExtInst>(&I) || isa<SExtInst>(&I)) &&
831 !TTI->isTypeLegal(I.getOperand(0)->getType()))
832 SeenExtFromIllegalType = true;
833
834 // Only deal with non-vector integers up to 64-bits wide.
835 if ((isa<TruncInst>(&I) || isa<ICmpInst>(&I)) &&
836 !I.getType()->isVectorTy() &&
837 I.getOperand(0)->getType()->getScalarSizeInBits() <= 64) {
838 // Don't make work for ourselves. If we know the loaded type is legal,
839 // don't add it to the worklist.
840 if (TTI && isa<TruncInst>(&I) && TTI->isTypeLegal(I.getType()))
841 continue;
842
843 Worklist.push_back(&I);
844 Roots.insert(&I);
845 }
846 }
847 // Early exit.
848 if (Worklist.empty() || (TTI && !SeenExtFromIllegalType))
849 return MinBWs;
850
851 // Now proceed breadth-first, unioning values together.
852 while (!Worklist.empty()) {
853 Instruction *I = Worklist.pop_back_val();
854 Value *Leader = ECs.getOrInsertLeaderValue(I);
855
856 if (!Visited.insert(I).second)
857 continue;
858
859 // If we encounter a type that is larger than 64 bits, we can't represent
860 // it so bail out.
861 if (DB.getDemandedBits(I).getBitWidth() > 64)
863
864 uint64_t V = DB.getDemandedBits(I).getZExtValue();
865 DBits[Leader] |= V;
866 DBits[I] = V;
867
868 // Casts, loads and instructions outside of our range terminate a chain
869 // successfully.
871 !InstructionSet.count(I))
872 continue;
873
874 // Unsafe casts terminate a chain unsuccessfully. We can't do anything
875 // useful with bitcasts, ptrtoints or inttoptrs and it'd be unsafe to
876 // transform anything that relies on them.
878 !I->getType()->isIntegerTy()) {
879 DBits[Leader] |= ~0ULL;
880 continue;
881 }
882
883 // We don't modify the types of PHIs. Reductions will already have been
884 // truncated if possible, and inductions' sizes will have been chosen by
885 // indvars.
886 if (isa<PHINode>(I))
887 continue;
888
889 // Don't modify the types of operands of a call, as doing that would cause a
890 // signature mismatch.
891 if (isa<CallBase>(I))
892 continue;
893
894 if (DBits[Leader] == ~0ULL)
895 // All bits demanded, no point continuing.
896 continue;
897
898 for (Value *O : I->operands()) {
899 ECs.unionSets(Leader, O);
900 if (auto *OI = dyn_cast<Instruction>(O))
901 Worklist.push_back(OI);
902 }
903 }
904
905 // Now we've discovered all values, walk them to see if there are
906 // any users we didn't see. If there are, we can't optimize that
907 // chain.
908 for (auto &I : DBits)
909 for (auto *U : I.first->users())
910 if (U->getType()->isIntegerTy() && DBits.count(U) == 0)
911 DBits[ECs.getOrInsertLeaderValue(I.first)] |= ~0ULL;
912
913 for (const auto &E : ECs) {
914 if (!E->isLeader())
915 continue;
916 uint64_t LeaderDemandedBits = 0;
917 for (Value *M : ECs.members(*E))
918 LeaderDemandedBits |= DBits[M];
919
920 uint64_t MinBW = llvm::bit_width(LeaderDemandedBits);
921 // Round up to a power of 2
922 MinBW = llvm::bit_ceil(MinBW);
923
924 // We don't modify the types of PHIs. Reductions will already have been
925 // truncated if possible, and inductions' sizes will have been chosen by
926 // indvars.
927 // If we are required to shrink a PHI, abandon this entire equivalence class.
928 bool Abort = false;
929 for (Value *M : ECs.members(*E))
930 if (isa<PHINode>(M) && MinBW < M->getType()->getScalarSizeInBits()) {
931 Abort = true;
932 break;
933 }
934 if (Abort)
935 continue;
936
937 for (Value *M : ECs.members(*E)) {
938 auto *MI = dyn_cast<Instruction>(M);
939 if (!MI)
940 continue;
941 Type *Ty = M->getType();
942 if (Roots.count(MI))
943 Ty = MI->getOperand(0)->getType();
944
945 if (MinBW >= Ty->getScalarSizeInBits())
946 continue;
947
948 // If any of M's operands demand more bits than MinBW then M cannot be
949 // performed safely in MinBW.
950 auto *Call = dyn_cast<CallBase>(MI);
951 auto Ops = Call ? Call->args() : MI->operands();
952 if (any_of(Ops, [&DB, MinBW](Use &U) {
953 auto *CI = dyn_cast<ConstantInt>(U);
954 // For constants shift amounts, check if the shift would result in
955 // poison.
956 if (CI &&
958 U.getOperandNo() == 1)
959 return CI->uge(MinBW);
960 uint64_t BW = bit_width(DB.getDemandedBits(&U).getZExtValue());
961 return bit_ceil(BW) > MinBW;
962 }))
963 continue;
964
965 MinBWs[MI] = MinBW;
966 }
967 }
968
969 return MinBWs;
970}
971
972/// Add all access groups in @p AccGroups to @p List.
973template <typename ListT>
974static void addToAccessGroupList(ListT &List, MDNode *AccGroups) {
975 // Interpret an access group as a list containing itself.
976 if (AccGroups->getNumOperands() == 0) {
977 assert(isValidAsAccessGroup(AccGroups) && "Node must be an access group");
978 List.insert(AccGroups);
979 return;
980 }
981
982 for (const auto &AccGroupListOp : AccGroups->operands()) {
983 auto *Item = cast<MDNode>(AccGroupListOp.get());
984 assert(isValidAsAccessGroup(Item) && "List item must be an access group");
985 List.insert(Item);
986 }
987}
988
989MDNode *llvm::uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2) {
990 if (!AccGroups1)
991 return AccGroups2;
992 if (!AccGroups2)
993 return AccGroups1;
994 if (AccGroups1 == AccGroups2)
995 return AccGroups1;
996
998 addToAccessGroupList(Union, AccGroups1);
999 addToAccessGroupList(Union, AccGroups2);
1000
1001 if (Union.size() == 0)
1002 return nullptr;
1003 if (Union.size() == 1)
1004 return cast<MDNode>(Union.front());
1005
1006 LLVMContext &Ctx = AccGroups1->getContext();
1007 return MDNode::get(Ctx, Union.getArrayRef());
1008}
1009
1011 const Instruction *Inst2) {
1012 bool MayAccessMem1 = Inst1->mayReadOrWriteMemory();
1013 bool MayAccessMem2 = Inst2->mayReadOrWriteMemory();
1014
1015 if (!MayAccessMem1 && !MayAccessMem2)
1016 return nullptr;
1017 if (!MayAccessMem1)
1018 return Inst2->getMetadata(LLVMContext::MD_access_group);
1019 if (!MayAccessMem2)
1020 return Inst1->getMetadata(LLVMContext::MD_access_group);
1021
1022 MDNode *MD1 = Inst1->getMetadata(LLVMContext::MD_access_group);
1023 MDNode *MD2 = Inst2->getMetadata(LLVMContext::MD_access_group);
1024 if (!MD1 || !MD2)
1025 return nullptr;
1026 if (MD1 == MD2)
1027 return MD1;
1028
1029 // Use set for scalable 'contains' check.
1030 SmallPtrSet<Metadata *, 4> AccGroupSet2;
1031 addToAccessGroupList(AccGroupSet2, MD2);
1032
1033 SmallVector<Metadata *, 4> Intersection;
1034 if (MD1->getNumOperands() == 0) {
1035 assert(isValidAsAccessGroup(MD1) && "Node must be an access group");
1036 if (AccGroupSet2.count(MD1))
1037 Intersection.push_back(MD1);
1038 } else {
1039 for (const MDOperand &Node : MD1->operands()) {
1040 auto *Item = cast<MDNode>(Node.get());
1041 assert(isValidAsAccessGroup(Item) && "List item must be an access group");
1042 if (AccGroupSet2.count(Item))
1043 Intersection.push_back(Item);
1044 }
1045 }
1046
1047 if (Intersection.size() == 0)
1048 return nullptr;
1049 if (Intersection.size() == 1)
1050 return cast<MDNode>(Intersection.front());
1051
1052 LLVMContext &Ctx = Inst1->getContext();
1053 return MDNode::get(Ctx, Intersection);
1054}
1055
1056/// Add metadata from \p Inst to \p Metadata, if it can be preserved after
1057/// vectorization.
1059 Instruction *Inst,
1060 SmallVectorImpl<std::pair<unsigned, MDNode *>> &Metadata) {
1062 static const unsigned SupportedIDs[] = {
1063 LLVMContext::MD_tbaa, LLVMContext::MD_alias_scope,
1064 LLVMContext::MD_noalias, LLVMContext::MD_fpmath,
1065 LLVMContext::MD_nontemporal, LLVMContext::MD_invariant_load,
1066 LLVMContext::MD_access_group, LLVMContext::MD_mmra};
1067
1068 // Remove any unsupported metadata kinds from Metadata.
1069 for (unsigned Idx = 0; Idx != Metadata.size();) {
1070 if (is_contained(SupportedIDs, Metadata[Idx].first)) {
1071 ++Idx;
1072 } else {
1073 // Swap element to end and remove it.
1074 std::swap(Metadata[Idx], Metadata.back());
1075 Metadata.pop_back();
1076 }
1077 }
1078}
1079
1080/// \returns \p I after propagating metadata from \p VL.
1082 if (VL.empty())
1083 return Inst;
1086
1087 for (auto &[Kind, MD] : Metadata) {
1088 // Skip MMRA metadata if the instruction cannot have it.
1089 if (Kind == LLVMContext::MD_mmra && !canInstructionHaveMMRAs(*Inst))
1090 continue;
1091
1092 for (int J = 1, E = VL.size(); MD && J != E; ++J) {
1093 const Instruction *IJ = cast<Instruction>(VL[J]);
1094 MDNode *IMD = IJ->getMetadata(Kind);
1095
1096 switch (Kind) {
1097 case LLVMContext::MD_mmra: {
1098 MD = MMRAMetadata::combine(Inst->getContext(), MD, IMD);
1099 break;
1100 }
1101 case LLVMContext::MD_tbaa:
1102 MD = MDNode::getMostGenericTBAA(MD, IMD);
1103 break;
1104 case LLVMContext::MD_alias_scope:
1106 break;
1107 case LLVMContext::MD_fpmath:
1108 MD = MDNode::getMostGenericFPMath(MD, IMD);
1109 break;
1110 case LLVMContext::MD_noalias:
1111 case LLVMContext::MD_nontemporal:
1112 case LLVMContext::MD_invariant_load:
1113 MD = MDNode::intersect(MD, IMD);
1114 break;
1115 case LLVMContext::MD_access_group:
1116 MD = intersectAccessGroups(Inst, IJ);
1117 break;
1118 default:
1119 llvm_unreachable("unhandled metadata");
1120 }
1121 }
1122
1123 Inst->setMetadata(Kind, MD);
1124 }
1125
1126 return Inst;
1127}
1128
1129Constant *
1131 const InterleaveGroup<Instruction> &Group) {
1132 // All 1's means mask is not needed.
1133 if (Group.isFull())
1134 return nullptr;
1135
1136 // TODO: support reversed access.
1137 assert(!Group.isReverse() && "Reversed group not supported.");
1138
1140 for (unsigned i = 0; i < VF; i++)
1141 for (unsigned j = 0; j < Group.getFactor(); ++j) {
1142 unsigned HasMember = Group.getMember(j) ? 1 : 0;
1143 Mask.push_back(Builder.getInt1(HasMember));
1144 }
1145
1146 return ConstantVector::get(Mask);
1147}
1148
1150llvm::createReplicatedMask(unsigned ReplicationFactor, unsigned VF) {
1151 SmallVector<int, 16> MaskVec;
1152 for (unsigned i = 0; i < VF; i++)
1153 for (unsigned j = 0; j < ReplicationFactor; j++)
1154 MaskVec.push_back(i);
1155
1156 return MaskVec;
1157}
1158
1160 unsigned NumVecs) {
1162 for (unsigned i = 0; i < VF; i++)
1163 for (unsigned j = 0; j < NumVecs; j++)
1164 Mask.push_back(j * VF + i);
1165
1166 return Mask;
1167}
1168
1170llvm::createStrideMask(unsigned Start, unsigned Stride, unsigned VF) {
1172 for (unsigned i = 0; i < VF; i++)
1173 Mask.push_back(Start + i * Stride);
1174
1175 return Mask;
1176}
1177
1179 unsigned NumInts,
1180 unsigned NumUndefs) {
1182 for (unsigned i = 0; i < NumInts; i++)
1183 Mask.push_back(Start + i);
1184
1185 for (unsigned i = 0; i < NumUndefs; i++)
1186 Mask.push_back(-1);
1187
1188 return Mask;
1189}
1190
1192 unsigned NumElts) {
1193 // Avoid casts in the loop and make sure we have a reasonable number.
1194 int NumEltsSigned = NumElts;
1195 assert(NumEltsSigned > 0 && "Expected smaller or non-zero element count");
1196
1197 // If the mask chooses an element from operand 1, reduce it to choose from the
1198 // corresponding element of operand 0. Undef mask elements are unchanged.
1199 SmallVector<int, 16> UnaryMask;
1200 for (int MaskElt : Mask) {
1201 assert((MaskElt < NumEltsSigned * 2) && "Expected valid shuffle mask");
1202 int UnaryElt = MaskElt >= NumEltsSigned ? MaskElt - NumEltsSigned : MaskElt;
1203 UnaryMask.push_back(UnaryElt);
1204 }
1205 return UnaryMask;
1206}
1207
1208/// A helper function for concatenating vectors. This function concatenates two
1209/// vectors having the same element type. If the second vector has fewer
1210/// elements than the first, it is padded with undefs.
1212 Value *V2) {
1213 VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
1214 VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
1215 assert(VecTy1 && VecTy2 &&
1216 VecTy1->getScalarType() == VecTy2->getScalarType() &&
1217 "Expect two vectors with the same element type");
1218
1219 unsigned NumElts1 = cast<FixedVectorType>(VecTy1)->getNumElements();
1220 unsigned NumElts2 = cast<FixedVectorType>(VecTy2)->getNumElements();
1221 assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
1222
1223 if (NumElts1 > NumElts2) {
1224 // Extend with UNDEFs.
1225 V2 = Builder.CreateShuffleVector(
1226 V2, createSequentialMask(0, NumElts2, NumElts1 - NumElts2));
1227 }
1228
1229 return Builder.CreateShuffleVector(
1230 V1, V2, createSequentialMask(0, NumElts1 + NumElts2, 0));
1231}
1232
1234 ArrayRef<Value *> Vecs) {
1235 unsigned NumVecs = Vecs.size();
1236 assert(NumVecs > 1 && "Should be at least two vectors");
1237
1239 ResList.append(Vecs.begin(), Vecs.end());
1240 do {
1242 for (unsigned i = 0; i < NumVecs - 1; i += 2) {
1243 Value *V0 = ResList[i], *V1 = ResList[i + 1];
1244 assert((V0->getType() == V1->getType() || i == NumVecs - 2) &&
1245 "Only the last vector may have a different type");
1246
1247 TmpList.push_back(concatenateTwoVectors(Builder, V0, V1));
1248 }
1249
1250 // Push the last vector if the total number of vectors is odd.
1251 if (NumVecs % 2 != 0)
1252 TmpList.push_back(ResList[NumVecs - 1]);
1253
1254 ResList = TmpList;
1255 NumVecs = ResList.size();
1256 } while (NumVecs > 1);
1257
1258 return ResList[0];
1259}
1260
1262 assert(isa<VectorType>(Mask->getType()) &&
1263 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1264 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1265 1 &&
1266 "Mask must be a vector of i1");
1267
1268 auto AllOneOrUndef = m_CombineOr(m_AllOnes(), m_UndefValue());
1269 return match(Mask, m_CombineOr(AllOneOrUndef, m_ContainsMatchingVectorElement(
1270 AllOneOrUndef)));
1271}
1272
1273/// TODO: This is a lot like known bits, but for
1274/// vectors. Is there something we can common this with?
1276 assert(isa<FixedVectorType>(Mask->getType()) &&
1277 isa<IntegerType>(Mask->getType()->getScalarType()) &&
1278 cast<IntegerType>(Mask->getType()->getScalarType())->getBitWidth() ==
1279 1 &&
1280 "Mask must be a fixed width vector of i1");
1281
1282 const unsigned VWidth =
1283 cast<FixedVectorType>(Mask->getType())->getNumElements();
1284 APInt DemandedElts = APInt::getAllOnes(VWidth);
1285 if (auto *CV = dyn_cast<ConstantVector>(Mask))
1286 for (unsigned i = 0; i < VWidth; i++)
1287 if (CV->getAggregateElement(i)->isNullValue())
1288 DemandedElts.clearBit(i);
1289 return DemandedElts;
1290}
1291
1292bool InterleavedAccessInfo::isStrided(int Stride) {
1293 unsigned Factor = std::abs(Stride);
1294 return Factor >= 2 && Factor <= MaxInterleaveGroupFactor;
1295}
1296
1297void InterleavedAccessInfo::collectConstStrideAccesses(
1299 const DenseMap<Value *, const SCEV *> &Strides,
1301 auto &DL = TheLoop->getHeader()->getDataLayout();
1302
1303 // Since it's desired that the load/store instructions be maintained in
1304 // "program order" for the interleaved access analysis, we have to visit the
1305 // blocks in the loop in reverse postorder (i.e., in a topological order).
1306 // Such an ordering will ensure that any load/store that may be executed
1307 // before a second load/store will precede the second load/store in
1308 // AccessStrideInfo.
1309 LoopBlocksDFS DFS(TheLoop);
1310 DFS.perform(LI);
1311 for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
1312 for (auto &I : *BB) {
1314 if (!Ptr)
1315 continue;
1316 Type *ElementTy = getLoadStoreType(&I);
1317
1318 // Currently, codegen doesn't support cases where the type size doesn't
1319 // match the alloc size. Skip them for now.
1320 uint64_t Size = DL.getTypeAllocSize(ElementTy);
1321 if (Size * 8 != DL.getTypeSizeInBits(ElementTy))
1322 continue;
1323
1324 // We don't check wrapping here because we don't know yet if Ptr will be
1325 // part of a full group or a group with gaps. Checking wrapping for all
1326 // pointers (even those that end up in groups with no gaps) will be overly
1327 // conservative. For full groups, wrapping should be ok since if we would
1328 // wrap around the address space we would do a memory access at nullptr
1329 // even without the transformation. The wrapping checks are therefore
1330 // deferred until after we've formed the interleaved groups.
1331 int64_t Stride = getPtrStride(PSE, ElementTy, Ptr, TheLoop, *DT, Strides,
1332 /*ShouldCheckWrap=*/false, Predicates)
1333 .value_or(0);
1334
1335 const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
1336 AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
1338 }
1339}
1340
1341// Analyze interleaved accesses and collect them into interleaved load and
1342// store groups.
1343//
1344// When generating code for an interleaved load group, we effectively hoist all
1345// loads in the group to the location of the first load in program order. When
1346// generating code for an interleaved store group, we sink all stores to the
1347// location of the last store. This code motion can change the order of load
1348// and store instructions and may break dependences.
1349//
1350// The code generation strategy mentioned above ensures that we won't violate
1351// any write-after-read (WAR) dependences.
1352//
1353// E.g., for the WAR dependence: a = A[i]; // (1)
1354// A[i] = b; // (2)
1355//
1356// The store group of (2) is always inserted at or below (2), and the load
1357// group of (1) is always inserted at or above (1). Thus, the instructions will
1358// never be reordered. All other dependences are checked to ensure the
1359// correctness of the instruction reordering.
1360//
1361// The algorithm visits all memory accesses in the loop in bottom-up program
1362// order. Program order is established by traversing the blocks in the loop in
1363// reverse postorder when collecting the accesses.
1364//
1365// We visit the memory accesses in bottom-up order because it can simplify the
1366// construction of store groups in the presence of write-after-write (WAW)
1367// dependences.
1368//
1369// E.g., for the WAW dependence: A[i] = a; // (1)
1370// A[i] = b; // (2)
1371// A[i + 1] = c; // (3)
1372//
1373// We will first create a store group with (3) and (2). (1) can't be added to
1374// this group because it and (2) are dependent. However, (1) can be grouped
1375// with other accesses that may precede it in program order. Note that a
1376// bottom-up order does not imply that WAW dependences should not be checked.
1378 bool EnablePredicatedInterleavedMemAccesses) {
1379 LLVM_DEBUG(dbgs() << "LV: Analyzing interleaved accesses...\n");
1380 const auto &Strides = LAI->getSymbolicStrides();
1381
1382 // Holds all accesses with a constant stride.
1385 collectConstStrideAccesses(AccessStrideInfo, Strides,
1386 OptForSize ? nullptr : &Predicates);
1387
1388 if (AccessStrideInfo.empty())
1389 return;
1390
1391 // Collect the dependences in the loop.
1392 collectDependences();
1393
1394 // Holds all interleaved store groups temporarily.
1396 // Holds all interleaved load groups temporarily.
1398 // Groups added to this set cannot have new members added.
1399 SmallPtrSet<InterleaveGroup<Instruction> *, 4> CompletedLoadGroups;
1400
1401 // Search in bottom-up program order for pairs of accesses (A and B) that can
1402 // form interleaved load or store groups. In the algorithm below, access A
1403 // precedes access B in program order. We initialize a group for B in the
1404 // outer loop of the algorithm, and then in the inner loop, we attempt to
1405 // insert each A into B's group if:
1406 //
1407 // 1. A and B have the same stride,
1408 // 2. A and B have the same memory object size, and
1409 // 3. A belongs in B's group according to its distance from B.
1410 //
1411 // Special care is taken to ensure group formation will not break any
1412 // dependences.
1413 for (auto BI = AccessStrideInfo.rbegin(), E = AccessStrideInfo.rend();
1414 BI != E; ++BI) {
1415 Instruction *B = BI->first;
1416 StrideDescriptor DesB = BI->second;
1417
1418 // Initialize a group for B if it has an allowable stride. Even if we don't
1419 // create a group for B, we continue with the bottom-up algorithm to ensure
1420 // we don't break any of B's dependences.
1421 InterleaveGroup<Instruction> *GroupB = nullptr;
1422 if (isStrided(DesB.Stride) &&
1423 (!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
1424 GroupB = getInterleaveGroup(B);
1425 if (!GroupB) {
1426 LLVM_DEBUG(dbgs() << "LV: Creating an interleave group with:" << *B
1427 << '\n');
1428 GroupB = createInterleaveGroup(B, DesB.Stride, DesB.Alignment);
1429 if (B->mayWriteToMemory())
1430 StoreGroups.insert(GroupB);
1431 else
1432 LoadGroups.insert(GroupB);
1433 }
1434 }
1435
1436 for (auto AI = std::next(BI); AI != E; ++AI) {
1437 Instruction *A = AI->first;
1438 StrideDescriptor DesA = AI->second;
1439
1440 // Our code motion strategy implies that we can't have dependences
1441 // between accesses in an interleaved group and other accesses located
1442 // between the first and last member of the group. Note that this also
1443 // means that a group can't have more than one member at a given offset.
1444 // The accesses in a group can have dependences with other accesses, but
1445 // we must ensure we don't extend the boundaries of the group such that
1446 // we encompass those dependent accesses.
1447 //
1448 // For example, assume we have the sequence of accesses shown below in a
1449 // stride-2 loop:
1450 //
1451 // (1, 2) is a group | A[i] = a; // (1)
1452 // | A[i-1] = b; // (2) |
1453 // A[i-3] = c; // (3)
1454 // A[i] = d; // (4) | (2, 4) is not a group
1455 //
1456 // Because accesses (2) and (3) are dependent, we can group (2) with (1)
1457 // but not with (4). If we did, the dependent access (3) would be within
1458 // the boundaries of the (2, 4) group.
1459 auto DependentMember = [&](InterleaveGroup<Instruction> *Group,
1460 StrideEntry *A) -> Instruction * {
1461 for (uint32_t Index = 0; Index < Group->getFactor(); ++Index) {
1462 Instruction *MemberOfGroupB = Group->getMember(Index);
1463 if (MemberOfGroupB && !canReorderMemAccessesForInterleavedGroups(
1464 A, &*AccessStrideInfo.find(MemberOfGroupB)))
1465 return MemberOfGroupB;
1466 }
1467 return nullptr;
1468 };
1469
1470 auto GroupA = getInterleaveGroup(A);
1471 // If A is a load, dependencies are tolerable, there's nothing to do here.
1472 // If both A and B belong to the same (store) group, they are independent,
1473 // even if dependencies have not been recorded.
1474 // If both GroupA and GroupB are null, there's nothing to do here.
1475 if (A->mayWriteToMemory() && GroupA != GroupB) {
1476 Instruction *DependentInst = nullptr;
1477 // If GroupB is a load group, we have to compare AI against all
1478 // members of GroupB because if any load within GroupB has a dependency
1479 // on AI, we need to mark GroupB as complete and also release the
1480 // store GroupA (if A belongs to one). The former prevents incorrect
1481 // hoisting of load B above store A while the latter prevents incorrect
1482 // sinking of store A below load B.
1483 if (GroupB && LoadGroups.contains(GroupB))
1484 DependentInst = DependentMember(GroupB, &*AI);
1485 else if (!canReorderMemAccessesForInterleavedGroups(&*AI, &*BI))
1486 DependentInst = B;
1487
1488 if (DependentInst) {
1489 // A has a store dependence on B (or on some load within GroupB) and
1490 // is part of a store group. Release A's group to prevent illegal
1491 // sinking of A below B. A will then be free to form another group
1492 // with instructions that precede it.
1493 if (GroupA && StoreGroups.contains(GroupA)) {
1494 LLVM_DEBUG(dbgs() << "LV: Invalidated store group due to "
1495 "dependence between "
1496 << *A << " and " << *DependentInst << '\n');
1497 StoreGroups.remove(GroupA);
1498 releaseGroup(GroupA);
1499 }
1500 // If B is a load and part of an interleave group, no earlier loads
1501 // can be added to B's interleave group, because this would mean the
1502 // DependentInst would move across store A. Mark the interleave group
1503 // as complete.
1504 if (GroupB && LoadGroups.contains(GroupB)) {
1505 LLVM_DEBUG(dbgs() << "LV: Marking interleave group for " << *B
1506 << " as complete.\n");
1507 CompletedLoadGroups.insert(GroupB);
1508 }
1509 }
1510 }
1511 if (CompletedLoadGroups.contains(GroupB)) {
1512 // Skip trying to add A to B, continue to look for other conflicting A's
1513 // in groups to be released.
1514 continue;
1515 }
1516
1517 // At this point, we've checked for illegal code motion. If either A or B
1518 // isn't strided, there's nothing left to do.
1519 if (!isStrided(DesA.Stride) || !isStrided(DesB.Stride))
1520 continue;
1521
1522 // Ignore A if it's already in a group or isn't the same kind of memory
1523 // operation as B.
1524 // Note that mayReadFromMemory() isn't mutually exclusive to
1525 // mayWriteToMemory in the case of atomic loads. We shouldn't see those
1526 // here, canVectorizeMemory() should have returned false - except for the
1527 // case we asked for optimization remarks.
1528 if (isInterleaved(A) ||
1529 (A->mayReadFromMemory() != B->mayReadFromMemory()) ||
1530 (A->mayWriteToMemory() != B->mayWriteToMemory()))
1531 continue;
1532
1533 // Check rules 1 and 2. Ignore A if its stride or size is different from
1534 // that of B.
1535 if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
1536 continue;
1537
1538 // Ignore A if the memory object of A and B don't belong to the same
1539 // address space
1541 continue;
1542
1543 // Calculate the distance from A to B.
1544 const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
1545 PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
1546 if (!DistToB)
1547 continue;
1548 int64_t DistanceToB = DistToB->getAPInt().getSExtValue();
1549
1550 // Check rule 3. Ignore A if its distance to B is not a multiple of the
1551 // size.
1552 if (DistanceToB % static_cast<int64_t>(DesB.Size))
1553 continue;
1554
1555 // All members of a predicated interleave-group must have the same predicate,
1556 // and currently must reside in the same BB.
1557 BasicBlock *BlockA = A->getParent();
1558 BasicBlock *BlockB = B->getParent();
1559 if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
1560 (!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
1561 continue;
1562
1563 // The index of A is the index of B plus A's distance to B in multiples
1564 // of the size.
1565 int IndexA =
1566 GroupB->getIndex(B) + DistanceToB / static_cast<int64_t>(DesB.Size);
1567
1568 // Try to insert A into B's group.
1569 if (GroupB->insertMember(A, IndexA, DesA.Alignment)) {
1570 LLVM_DEBUG(dbgs() << "LV: Inserted:" << *A << '\n'
1571 << " into the interleave group with" << *B
1572 << '\n');
1573 InterleaveGroupMap[A] = GroupB;
1574
1575 // Set the first load in program order as the insert position.
1576 if (A->mayReadFromMemory())
1577 GroupB->setInsertPos(A);
1578 }
1579 } // Iteration over A accesses.
1580 } // Iteration over B accesses.
1581
1582 // Commit the collected predicates to PSE if any candidate group was formed.
1583 if (!LoadGroups.empty() || !StoreGroups.empty())
1584 PSE.addPredicates(Predicates);
1585
1586 auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
1587 int Index,
1588 const char *FirstOrLast) -> bool {
1589 Instruction *Member = Group->getMember(Index);
1590 assert(Member && "Group member does not exist");
1591 Value *MemberPtr = getLoadStorePointerOperand(Member);
1592 Type *AccessTy = getLoadStoreType(Member);
1593 if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, *DT, Strides,
1594 /*Assume=*/false, /*ShouldCheckWrap=*/true)
1595 .value_or(0))
1596 return false;
1597 LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
1598 << FirstOrLast
1599 << " group member potentially pointer-wrapping.\n");
1600 releaseGroup(Group);
1601 return true;
1602 };
1603
1604 // Remove interleaved groups with gaps whose memory
1605 // accesses may wrap around. We have to revisit the getPtrStride analysis,
1606 // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
1607 // not check wrapping (see documentation there).
1608 // FORNOW we use Assume=false;
1609 // TODO: Change to Assume=true but making sure we don't exceed the threshold
1610 // of runtime SCEV assumptions checks (thereby potentially failing to
1611 // vectorize altogether).
1612 // Additional optional optimizations:
1613 // TODO: If we are peeling the loop and we know that the first pointer doesn't
1614 // wrap then we can deduce that all pointers in the group don't wrap.
1615 // This means that we can forcefully peel the loop in order to only have to
1616 // check the first pointer for no-wrap. When we'll change to use Assume=true
1617 // we'll only need at most one runtime check per interleaved group.
1618 for (auto *Group : LoadGroups) {
1619 // Case 1: A full group. Can Skip the checks; For full groups, if the wide
1620 // load would wrap around the address space we would do a memory access at
1621 // nullptr even without the transformation.
1622 if (Group->isFull())
1623 continue;
1624
1625 // Case 2: If first and last members of the group don't wrap this implies
1626 // that all the pointers in the group don't wrap.
1627 // So we check only group member 0 (which is always guaranteed to exist),
1628 // and group member Factor - 1; If the latter doesn't exist we rely on
1629 // peeling (if it is a non-reversed access -- see Case 3).
1630 if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
1631 continue;
1632 if (Group->getMember(Group->getFactor() - 1))
1633 InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, "last");
1634 else {
1635 // Case 3: A non-reversed interleaved load group with gaps: We need
1636 // to execute at least one scalar epilogue iteration. This will ensure
1637 // we don't speculatively access memory out-of-bounds. We only need
1638 // to look for a member at index factor - 1, since every group must have
1639 // a member at index zero.
1640 if (Group->isReverse()) {
1641 LLVM_DEBUG(
1642 dbgs() << "LV: Invalidate candidate interleaved group due to "
1643 "a reverse access with gaps.\n");
1644 releaseGroup(Group);
1645 continue;
1646 }
1647 LLVM_DEBUG(
1648 dbgs() << "LV: Interleaved group requires epilogue iteration.\n");
1649 RequiresScalarEpilogue = true;
1650 }
1651 }
1652
1653 for (auto *Group : StoreGroups) {
1654 // Case 1: A full group. Can Skip the checks; For full groups, if the wide
1655 // store would wrap around the address space we would do a memory access at
1656 // nullptr even without the transformation.
1657 if (Group->isFull())
1658 continue;
1659
1660 // Interleave-store-group with gaps is implemented using masked wide store.
1661 // Remove interleaved store groups with gaps if
1662 // masked-interleaved-accesses are not enabled by the target.
1663 if (!EnablePredicatedInterleavedMemAccesses) {
1664 LLVM_DEBUG(
1665 dbgs() << "LV: Invalidate candidate interleaved store group due "
1666 "to gaps.\n");
1667 releaseGroup(Group);
1668 continue;
1669 }
1670
1671 // Case 2: If first and last members of the group don't wrap this implies
1672 // that all the pointers in the group don't wrap.
1673 // So we check only group member 0 (which is always guaranteed to exist),
1674 // and the last group member. Case 3 (scalar epilog) is not relevant for
1675 // stores with gaps, which are implemented with masked-store (rather than
1676 // speculative access, as in loads).
1677 if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
1678 continue;
1679 for (int Index = Group->getFactor() - 1; Index > 0; Index--)
1680 if (Group->getMember(Index)) {
1681 InvalidateGroupIfMemberMayWrap(Group, Index, "last");
1682 break;
1683 }
1684 }
1685}
1686
1688 // If no group had triggered the requirement to create an epilogue loop,
1689 // there is nothing to do.
1691 return;
1692
1693 // Release groups requiring scalar epilogues. Note that this also removes them
1694 // from InterleaveGroups.
1695 bool ReleasedGroup = InterleaveGroups.remove_if([&](auto *Group) {
1696 if (!Group->requiresScalarEpilogue())
1697 return false;
1698 LLVM_DEBUG(
1699 dbgs()
1700 << "LV: Invalidate candidate interleaved group due to gaps that "
1701 "require a scalar epilogue (not allowed under optsize) and cannot "
1702 "be masked (not enabled). \n");
1703 releaseGroupWithoutRemovingFromSet(Group);
1704 return true;
1705 });
1706 assert(ReleasedGroup && "At least one group must be invalidated, as a "
1707 "scalar epilogue was required");
1708 (void)ReleasedGroup;
1709 RequiresScalarEpilogue = false;
1710}
1711
1712template <typename InstT>
1713void InterleaveGroup<InstT>::addMetadata(InstT *NewInst) const {
1714 llvm_unreachable("addMetadata can only be used for Instruction");
1715}
1716
1717namespace llvm {
1718template <>
1723} // namespace llvm
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
This file contains the declarations for the subclasses of Constant, which represent the different fla...
Generic implementation of equivalence classes through the use Tarjan's efficient union-find algorithm...
IRTranslator LLVM IR MI
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define I(x, y, z)
Definition MD5.cpp:57
This file provides utility for Memory Model Relaxation Annotations (MMRAs).
This file defines the SmallVector class.
#define LLVM_DEBUG(...)
Definition Debug.h:119
static unsigned getScalarSizeInBits(Type *Ty)
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static Value * concatenateTwoVectors(IRBuilderBase &Builder, Value *V1, Value *V2)
A helper function for concatenating vectors.
static cl::opt< unsigned > MaxInterleaveGroupFactor("max-interleave-group-factor", cl::Hidden, cl::desc("Maximum factor for an interleaved access group (default = 8)"), cl::init(8))
Maximum factor for an interleaved memory access.
static void addToAccessGroupList(ListT &List, MDNode *AccGroups)
Add all access groups in AccGroups to List.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
void clearBit(unsigned BitPosition)
Set a given bit to 0.
Definition APInt.h:1429
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1353
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
iterator end() const
Definition ArrayRef.h:130
size_t size() const
Get the array size.
Definition ArrayRef.h:141
iterator begin() const
Definition ArrayRef.h:129
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
LLVM Basic Block Representation.
Definition BasicBlock.h:62
This class represents a function call, abstracting a target machine's calling convention.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
This is an important base class in LLVM.
Definition Constant.h:43
size_type count(const_arg_type_t< KeyT > Val) const
Return 1 if the specified key is in the map, 0 otherwise.
Definition DenseMap.h:221
This represents a collection of equivalence classes and supports three efficient operations: insert a...
iterator_range< member_iterator > members(const ECValue &ECV) const
const ElemTy & getOrInsertLeaderValue(const ElemTy &V)
Return the leader for the specified value that is in the set.
member_iterator unionSets(const ElemTy &V1, const ElemTy &V2)
Merge the two equivalence sets for the specified values, inserting them if they do not already exist ...
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This instruction inserts a single (scalar) element into a VectorType value.
bool mayReadOrWriteMemory() const
Return true if this instruction may read or write memory.
MDNode * getMetadata(unsigned KindID) const
Get the metadata of given kind attached to this Instruction.
LLVM_ABI void setMetadata(unsigned KindID, MDNode *Node)
Set the metadata of the specified kind to the specified node.
void getAllMetadataOtherThanDebugLoc(SmallVectorImpl< std::pair< unsigned, MDNode * > > &MDs) const
This does the same thing as getAllMetadata, except that it filters out the debug location.
The group of interleaved loads/stores sharing the same stride and close to each other.
uint32_t getFactor() const
InstTy * getMember(uint32_t Index) const
Get the member with the given index Index.
bool isFull() const
Return true if this group is full, i.e. it has no gaps.
uint32_t getIndex(const InstTy *Instr) const
Get the index for the given member.
void setInsertPos(InstTy *Inst)
bool isReverse() const
void addMetadata(InstTy *NewInst) const
Add metadata (e.g.
bool insertMember(InstTy *Instr, int32_t Index, Align NewAlign)
Try to insert a new member Instr with index Index and alignment NewAlign.
InterleaveGroup< Instruction > * getInterleaveGroup(const Instruction *Instr) const
Get the interleave group that Instr belongs to.
bool requiresScalarEpilogue() const
Returns true if an interleaved group that may access memory out-of-bounds requires a scalar epilogue ...
bool isInterleaved(Instruction *Instr) const
Check if Instr belongs to any interleave group.
LLVM_ABI void analyzeInterleaving(bool EnableMaskedInterleavedGroup)
Analyze the interleaved accesses and collect them in interleave groups.
LLVM_ABI void invalidateGroupsRequiringScalarEpilogue()
Invalidate groups that require a scalar epilogue (due to gaps).
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
Metadata node.
Definition Metadata.h:1069
static LLVM_ABI MDNode * getMostGenericAliasScope(MDNode *A, MDNode *B)
static LLVM_ABI MDNode * getMostGenericTBAA(MDNode *A, MDNode *B)
ArrayRef< MDOperand > operands() const
Definition Metadata.h:1424
static MDTuple * get(LLVMContext &Context, ArrayRef< Metadata * > MDs)
Definition Metadata.h:1554
static LLVM_ABI MDNode * getMostGenericFPMath(MDNode *A, MDNode *B)
unsigned getNumOperands() const
Return number of MDNode operands.
Definition Metadata.h:1432
static LLVM_ABI MDNode * intersect(MDNode *A, MDNode *B)
LLVMContext & getContext() const
Definition Metadata.h:1233
Tracking metadata reference owned by Metadata.
Definition Metadata.h:891
static LLVM_ABI MDNode * combine(LLVMContext &Ctx, const MMRAMetadata &A, const MMRAMetadata &B)
Combines A and B according to MMRA semantics.
This class implements a map that also provides access to all stored values in a deterministic order.
Definition MapVector.h:38
iterator find(const KeyT &Key)
Definition MapVector.h:156
bool empty() const
Definition MapVector.h:79
reverse_iterator rend()
Definition MapVector.h:76
reverse_iterator rbegin()
Definition MapVector.h:72
Root of the metadata hierarchy.
Definition Metadata.h:64
Represent a mutable reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:294
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
This class represents a constant integer value.
const APInt & getAPInt() const
bool remove(const value_type &X)
Remove an item from the set vector.
Definition SetVector.h:181
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
size_type count(ConstPtrType Ptr) const
count - Return 1 if the specified pointer is in the set, 0 otherwise.
bool remove_if(UnaryPredicate P)
Remove elements that match the given predicate.
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
A SetVector that performs no allocations if smaller than a certain size.
Definition SetVector.h:339
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void assign(size_type NumElts, ValueParamT Elt)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
Provides information about what library functions are available for the current target.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
ArrayRef< Type * > subtypes() const
Definition Type.h:381
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPCast(Intrinsic::ID ID)
static LLVM_ABI std::optional< unsigned > getVectorLengthParamPos(Intrinsic::ID IntrinsicID)
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:258
Base class of all SIMD vector types.
Type * getElementType() const
An efficient, type-erasing, non-owning reference to a callable.
CallInst * Call
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
LLVM_ABI bool isTriviallyScalarizable(ID id)
Returns true if the intrinsic is trivially scalarizable.
LLVM_ABI bool isTargetIntrinsic(ID IID)
isTargetIntrinsic - Returns true if IID is an intrinsic specific to a certain target.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
match_combine_or< Ty... > m_CombineOr(const Ty &...Ps)
Combine pattern matchers matching any of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
bool match(Val *V, const Pattern &P)
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
auto m_UndefValue()
Match an arbitrary UndefValue constant.
auto m_Constant()
Match an arbitrary Constant and ignore it.
ContainsMatchingVectorElement_match< SPTy > m_ContainsMatchingVectorElement(const SPTy &SubPattern)
Match a vector constant where at least one of its elements matches the subpattern.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
initializer< Ty > init(const Ty &Val)
This is an optimization pass for GlobalISel generic memory operations.
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
unsigned getLoadStoreAddressSpace(const Value *I)
A helper function that returns the address space of the pointer operand of load or store instruction.
LLVM_ABI Intrinsic::ID getVectorIntrinsicIDForCall(const CallInst *CI, const TargetLibraryInfo *TLI)
Returns intrinsic ID for call.
LLVM_ABI bool canInstructionHaveMMRAs(const Instruction &I)
LLVM_ABI APInt possiblyDemandedEltsInMask(Value *Mask)
Given a mask vector of the form <Y x i1>, return an APInt (of bitwidth Y) for each lane which may be ...
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
const Value * getLoadStorePointerOperand(const Value *V)
A helper function that returns the pointer operand of a load or store instruction.
LLVM_ABI llvm::SmallVector< int, 16 > createUnaryMask(ArrayRef< int > Mask, unsigned NumElts)
Given a shuffle mask for a binary shuffle, create the equivalent shuffle mask assuming both operands ...
LLVM_ABI void getMetadataToPropagate(Instruction *Inst, SmallVectorImpl< std::pair< unsigned, MDNode * > > &Metadata)
Add metadata from Inst to Metadata, if it can be preserved after vectorization.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
Align getLoadStoreAlignment(const Value *I)
A helper function that returns the alignment of load or store instruction.
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI Instruction * propagateMetadata(Instruction *I, ArrayRef< Value * > VL)
Specifically, let Kinds = [MD_tbaa, MD_alias_scope, MD_noalias, MD_fpmath, MD_nontemporal,...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
T bit_ceil(T Value)
Returns the smallest integral power of two no smaller than Value if Value is nonzero.
Definition bit.h:362
constexpr auto equal_to(T &&Arg)
Functor variant of std::equal_to that can be used as a UnaryPredicate in functional algorithms like a...
Definition STLExtras.h:2173
LLVM_ABI MDNode * intersectAccessGroups(const Instruction *Inst1, const Instruction *Inst2)
Compute the access-group list of access groups that Inst1 and Inst2 are both in.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool getShuffleDemandedElts(int SrcWidth, ArrayRef< int > Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts=false)
Transform a shuffle mask's output demanded element mask into demanded element masks for the 2 operand...
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
LLVM_ABI Constant * createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF, const InterleaveGroup< Instruction > &Group)
Create a mask that filters the members of an interleave group where there are gaps.
constexpr unsigned MaxAnalysisRecursionDepth
LLVM_ABI llvm::SmallVector< int, 16 > createStrideMask(unsigned Start, unsigned Stride, unsigned VF)
Create a stride shuffle mask.
LLVM_ABI void getHorizDemandedEltsForFirstOperand(unsigned VectorBitWidth, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS)
Compute the demanded elements mask of horizontal binary operations.
LLVM_ABI llvm::SmallVector< int, 16 > createReplicatedMask(unsigned ReplicationFactor, unsigned VF)
Create a mask with replicated elements.
LLVM_ABI unsigned getDeinterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.deinterleaveN intrinsics.
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
LLVM_ABI unsigned getInterleaveIntrinsicFactor(Intrinsic::ID ID)
Returns the corresponding factor of llvm.vector.interleaveN intrinsics.
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
constexpr int PoisonMaskElem
LLVM_ABI bool isTriviallyScalarizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially scalarizable.
LLVM_ABI bool isValidAsAccessGroup(MDNode *AccGroup)
Return whether an MDNode might represent an access group.
LLVM_ABI Intrinsic::ID getIntrinsicForCallSite(const CallBase &CB, const TargetLibraryInfo *TLI)
Map a call instruction to an intrinsic ID.
LLVM_ABI bool isVectorIntrinsicWithStructReturnOverloadAtField(Intrinsic::ID ID, int RetIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic that returns a struct is overloaded at the struct elem...
TargetTransformInfo TTI
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI bool isMaskedSlidePair(ArrayRef< int > Mask, int NumElts, std::array< std::pair< int, int >, 2 > &SrcInfo)
Does this shuffle mask represent either one slide shuffle or a pair of two slide shuffles,...
LLVM_ABI VectorType * getDeinterleavedVectorType(IntrinsicInst *DI)
Given a deinterleaveN intrinsic, return the (narrow) vector type of each factor.
LLVM_ABI llvm::SmallVector< int, 16 > createInterleaveMask(unsigned VF, unsigned NumVecs)
Create an interleave shuffle mask.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
LLVM_ABI const SCEV * replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE, const DenseMap< Value *, const SCEV * > &PtrToStride, Value *Ptr)
Return the SCEV corresponding to a pointer with the symbolic stride replaced with constant one,...
LLVM_ABI Value * findScalarElement(Value *V, unsigned EltNo)
Given a vector and an element number, see if the scalar value is already around as a register,...
LLVM_ABI MDNode * uniteAccessGroups(MDNode *AccGroups1, MDNode *AccGroups2)
Compute the union of two access-group lists.
unsigned M0(unsigned Val)
Definition VE.h:376
auto make_second_range(ContainerTy &&c)
Given a container of pairs, return a range over the second elements.
Definition STLExtras.h:1409
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI void getShuffleMaskWithWidestElts(ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Repetitively apply widenShuffleMaskElts() for as long as it succeeds, to get the shuffle mask with wi...
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Type * getLoadStoreType(const Value *I)
A helper function that returns the type of a load or store instruction.
LLVM_ABI void processShuffleMasks(ArrayRef< int > Mask, unsigned NumOfSrcRegs, unsigned NumOfDestRegs, unsigned NumOfUsedRegs, function_ref< void()> NoInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned)> SingleInputAction, function_ref< void(ArrayRef< int >, unsigned, unsigned, bool)> ManyInputsAction)
Splits and processes shuffle mask depending on the number of input and output registers.
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list.
Definition STLExtras.h:2166
LLVM_ABI bool maskContainsAllOneOrUndef(Value *Mask)
Given a mask vector of i1, Return true if any of the elements of this predicate mask are known to be ...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
LLVM_ABI std::optional< int64_t > getPtrStride(PredicatedScalarEvolution &PSE, Type *AccessTy, Value *Ptr, const Loop *Lp, const DominatorTree &DT, const DenseMap< Value *, const SCEV * > &StridesMap=DenseMap< Value *, const SCEV * >(), bool ShouldCheckWrap=true, SmallVectorImpl< const SCEVPredicate * > *Predicates=nullptr)
If the pointer has a constant stride return it in units of the access type size.
LLVM_ABI bool isVectorIntrinsicWithOverloadTypeAtArg(Intrinsic::ID ID, int OpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic is overloaded on the type of the operand at index OpdI...
LLVM_ABI MapVector< Instruction *, uint64_t > computeMinimumValueSizes(ArrayRef< BasicBlock * > Blocks, DemandedBits &DB, const TargetTransformInfo *TTI=nullptr)
Compute a map of integer instructions to their minimum legal type size.
LLVM_ABI bool scaleShuffleMaskElts(unsigned NumDstElts, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Attempt to narrow/widen the Mask shuffle mask to the NumDstElts target width.
LLVM_ABI int getSplatIndex(ArrayRef< int > Mask)
If all non-negative Mask elements are the same value, return that value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862