LLVM 22.0.0git
VectorCombine.cpp
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
44#include "llvm/Transforms/Utils/InstructionWorklist.h"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoad(Instruction &I);
133 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
134 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
135 bool scalarizeExtExtract(Instruction &I);
136 bool foldConcatOfBoolMasks(Instruction &I);
137 bool foldPermuteOfBinops(Instruction &I);
138 bool foldShuffleOfBinops(Instruction &I);
139 bool foldShuffleOfSelects(Instruction &I);
140 bool foldShuffleOfCastops(Instruction &I);
141 bool foldShuffleOfShuffles(Instruction &I);
142 bool foldPermuteOfIntrinsic(Instruction &I);
143 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
144 bool foldShuffleOfIntrinsics(Instruction &I);
145 bool foldShuffleToIdentity(Instruction &I);
146 bool foldShuffleFromReductions(Instruction &I);
147 bool foldShuffleChainsToReduce(Instruction &I);
148 bool foldCastFromReductions(Instruction &I);
149 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
150 bool foldInterleaveIntrinsics(Instruction &I);
151 bool shrinkType(Instruction &I);
152 bool shrinkLoadForShuffles(Instruction &I);
153 bool shrinkPhiOfShuffles(Instruction &I);
154
155 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
156 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
157 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
158 Old.replaceAllUsesWith(&New);
159 if (auto *NewI = dyn_cast<Instruction>(&New)) {
160 New.takeName(&Old);
161 Worklist.pushUsersToWorkList(*NewI);
162 Worklist.pushValue(NewI);
163 }
164 if (Erase && isInstructionTriviallyDead(&Old)) {
165 eraseInstruction(Old);
166 } else {
167 Worklist.push(&Old);
168 }
169 }
170
171 void eraseInstruction(Instruction &I) {
172 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
173 SmallVector<Value *> Ops(I.operands());
174 Worklist.remove(&I);
175 I.eraseFromParent();
176
177 // Push remaining users of the operands and then the operand itself - allows
178 // further folds that were hindered by OneUse limits.
179 SmallPtrSet<Value *, 4> Visited;
180 for (Value *Op : Ops) {
181 if (!Visited.contains(Op)) {
182 if (auto *OpI = dyn_cast<Instruction>(Op)) {
183 if (RecursivelyDeleteTriviallyDeadInstructions(
184 OpI, nullptr, nullptr, [&](Value *V) {
185 if (auto *I = dyn_cast<Instruction>(V)) {
186 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
187 Worklist.remove(I);
188 if (I == NextInst)
189 NextInst = NextInst->getNextNode();
190 Visited.insert(I);
191 }
192 }))
193 continue;
194 Worklist.pushUsersToWorkList(*OpI);
195 Worklist.pushValue(OpI);
196 }
197 }
198 }
199 }
200};
201} // namespace
202
203/// Return the source operand of a potentially bitcasted value. If there is no
204/// bitcast, return the input value itself.
205static Value *peekThroughBitcasts(Value *V) {
206 while (auto *BitCast = dyn_cast<BitCastInst>(V))
207 V = BitCast->getOperand(0);
208 return V;
209}
210
211static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
212 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
213 // The widened load may load data from dirty regions or create data races
214 // non-existent in the source.
215 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
216 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
217 mustSuppressSpeculation(*Load))
218 return false;
219
220 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
221 // sure we have all of our type-based constraints in place for this target.
222 Type *ScalarTy = Load->getType()->getScalarType();
223 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
224 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
225 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
226 ScalarSize % 8 != 0)
227 return false;
228
229 return true;
230}
231
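/// Illustrative IR for the fold implemented below (hand-written example, not
/// taken from the LLVM tests; assumes a target whose minimum vector register
/// width covers <4 x float>):
///   %s = load float, ptr %p, align 4
///   %i = insertelement <4 x float> undef, float %s, i64 0
/// may be rewritten, when the wider access is known dereferenceable and the
/// cost model agrees, as:
///   %w = load <4 x float>, ptr %p, align 4
///   %i = shufflevector <4 x float> %w, <4 x float> poison,
///        <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>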
232bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
233 // Match insert into fixed vector of scalar value.
234 // TODO: Handle non-zero insert index.
235 Value *Scalar;
236 if (!match(&I,
237 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
238 return false;
239
240 // Optionally match an extract from another vector.
241 Value *X;
242 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
243 if (!HasExtract)
244 X = Scalar;
245
246 auto *Load = dyn_cast<LoadInst>(X);
247 if (!canWidenLoad(Load, TTI))
248 return false;
249
250 Type *ScalarTy = Scalar->getType();
251 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
252 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
253
254 // Check safety of replacing the scalar load with a larger vector load.
255 // We use minimal alignment (maximum flexibility) because we only care about
256 // the dereferenceable region. When calculating cost and creating a new op,
257 // we may use a larger value based on alignment attributes.
258 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
259 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
260
261 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
262 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
263 unsigned OffsetEltIndex = 0;
264 Align Alignment = Load->getAlign();
265 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
266 &DT)) {
267 // It is not safe to load directly from the pointer, but we can still peek
267 // through gep offsets and check if it is safe to load from a base address with
269 // updated alignment. If it is, we can shuffle the element(s) into place
270 // after loading.
271 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
272 APInt Offset(OffsetBitWidth, 0);
273 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
274
275 // We want to shuffle the result down from a high element of a vector, so
276 // the offset must be positive.
277 if (Offset.isNegative())
278 return false;
279
280 // The offset must be a multiple of the scalar element to shuffle cleanly
281 // in the element's size.
282 uint64_t ScalarSizeInBytes = ScalarSize / 8;
283 if (Offset.urem(ScalarSizeInBytes) != 0)
284 return false;
285
286 // If we load MinVecNumElts, will our target element still be loaded?
287 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
288 if (OffsetEltIndex >= MinVecNumElts)
289 return false;
290
291 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
292 &DT))
293 return false;
294
295 // Update alignment with offset value. Note that the offset could be negated
296 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
297 // negation does not change the result of the alignment calculation.
298 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
299 }
300
301 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
302 // Use the greater of the alignment on the load or its source pointer.
303 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
304 Type *LoadTy = Load->getType();
305 unsigned AS = Load->getPointerAddressSpace();
306 InstructionCost OldCost =
307 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
308 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
309 OldCost +=
310 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
311 /* Insert */ true, HasExtract, CostKind);
312
313 // New pattern: load VecPtr
314 InstructionCost NewCost =
315 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
316 // Optionally, we are shuffling the loaded vector element(s) into place.
317 // For the mask set everything but element 0 to undef to prevent poison from
318 // propagating from the extra loaded memory. This will also optionally
319 // shrink/grow the vector from the loaded size to the output size.
320 // We assume this operation has no cost in codegen if there was no offset.
321 // Note that we could use freeze to avoid poison problems, but then we might
322 // still need a shuffle to change the vector size.
323 auto *Ty = cast<FixedVectorType>(I.getType());
324 unsigned OutputNumElts = Ty->getNumElements();
325 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
326 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
327 Mask[0] = OffsetEltIndex;
328 if (OffsetEltIndex)
329 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
330 CostKind);
331
332 // We can aggressively convert to the vector form because the backend can
333 // invert this transform if it does not result in a performance win.
334 if (OldCost < NewCost || !NewCost.isValid())
335 return false;
336
337 // It is safe and potentially profitable to load a vector directly:
338 // inselt undef, load Scalar, 0 --> load VecPtr
339 IRBuilder<> Builder(Load);
340 Value *CastedPtr =
341 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
342 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
343 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
344
345 replaceValue(I, *VecLd);
346 ++NumVecLoad;
347 return true;
348}
349
350/// If we are loading a vector and then inserting it into a larger vector with
351/// undefined elements, try to load the larger vector and eliminate the insert.
352/// This removes a shuffle in IR and may allow combining of other loaded values.
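/// A possible instance (illustrative only, assuming the wider load is safe):
///   %v2 = load <2 x i32>, ptr %p, align 8
///   %v4 = shufflevector <2 x i32> %v2, <2 x i32> poison,
///         <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
/// -->
///   %v4 = load <4 x i32>, ptr %p, align 8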
353bool VectorCombine::widenSubvectorLoad(Instruction &I) {
354 // Match subvector insert of fixed vector.
355 auto *Shuf = cast<ShuffleVectorInst>(&I);
356 if (!Shuf->isIdentityWithPadding())
357 return false;
358
359 // Allow a non-canonical shuffle mask that is choosing elements from op1.
360 unsigned NumOpElts =
361 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
362 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
363 return M >= (int)(NumOpElts);
364 });
365
366 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
367 if (!canWidenLoad(Load, TTI))
368 return false;
369
370 // We use minimal alignment (maximum flexibility) because we only care about
371 // the dereferenceable region. When calculating cost and creating a new op,
372 // we may use a larger value based on alignment attributes.
373 auto *Ty = cast<FixedVectorType>(I.getType());
374 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
375 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
376 Align Alignment = Load->getAlign();
377 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
378 return false;
379
380 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
381 Type *LoadTy = Load->getType();
382 unsigned AS = Load->getPointerAddressSpace();
383
384 // Original pattern: insert_subvector (load PtrOp)
385 // This conservatively assumes that the cost of a subvector insert into an
386 // undef value is 0. We could add that cost if the cost model accurately
387 // reflects the real cost of that operation.
388 InstructionCost OldCost =
389 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
390
391 // New pattern: load PtrOp
392 InstructionCost NewCost =
393 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
394
395 // We can aggressively convert to the vector form because the backend can
396 // invert this transform if it does not result in a performance win.
397 if (OldCost < NewCost || !NewCost.isValid())
398 return false;
399
400 IRBuilder<> Builder(Load);
401 Value *CastedPtr =
402 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
403 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
404 replaceValue(I, *VecLd);
405 ++NumVecLoad;
406 return true;
407}
408
409/// Determine which, if any, of the inputs should be replaced by a shuffle
410/// followed by extract from a different index.
411ExtractElementInst *VectorCombine::getShuffleExtract(
412 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
413 unsigned PreferredExtractIndex = InvalidIndex) const {
414 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
415 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
416 assert(Index0C && Index1C && "Expected constant extract indexes");
417
418 unsigned Index0 = Index0C->getZExtValue();
419 unsigned Index1 = Index1C->getZExtValue();
420
421 // If the extract indexes are identical, no shuffle is needed.
422 if (Index0 == Index1)
423 return nullptr;
424
425 Type *VecTy = Ext0->getVectorOperand()->getType();
426 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
427 InstructionCost Cost0 =
428 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
429 InstructionCost Cost1 =
430 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
431
432 // If both costs are invalid, no shuffle is needed.
433 if (!Cost0.isValid() && !Cost1.isValid())
434 return nullptr;
435
436 // We are extracting from 2 different indexes, so one operand must be shuffled
437 // before performing a vector operation and/or extract. The more expensive
438 // extract will be replaced by a shuffle.
439 if (Cost0 > Cost1)
440 return Ext0;
441 if (Cost1 > Cost0)
442 return Ext1;
443
444 // If the costs are equal and there is a preferred extract index, shuffle the
445 // opposite operand.
446 if (PreferredExtractIndex == Index0)
447 return Ext1;
448 if (PreferredExtractIndex == Index1)
449 return Ext0;
450
451 // Otherwise, replace the extract with the higher index.
452 return Index0 > Index1 ? Ext0 : Ext1;
453}
454
455/// Compare the relative costs of 2 extracts followed by scalar operation vs.
456/// vector operation(s) followed by extract. Return true if the existing
457/// instructions are cheaper than a vector alternative. Otherwise, return false
458/// and if one of the extracts should be transformed to a shufflevector, set
459/// \p ConvertToShuffle to that extract instruction.
460bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
461 ExtractElementInst *Ext1,
462 const Instruction &I,
463 ExtractElementInst *&ConvertToShuffle,
464 unsigned PreferredExtractIndex) {
465 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
466 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
467 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
468
469 unsigned Opcode = I.getOpcode();
470 Value *Ext0Src = Ext0->getVectorOperand();
471 Value *Ext1Src = Ext1->getVectorOperand();
472 Type *ScalarTy = Ext0->getType();
473 auto *VecTy = cast<VectorType>(Ext0Src->getType());
474 InstructionCost ScalarOpCost, VectorOpCost;
475
476 // Get cost estimates for scalar and vector versions of the operation.
477 bool IsBinOp = Instruction::isBinaryOp(Opcode);
478 if (IsBinOp) {
479 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
480 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
481 } else {
482 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
483 "Expected a compare");
484 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
485 ScalarOpCost = TTI.getCmpSelInstrCost(
486 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
487 VectorOpCost = TTI.getCmpSelInstrCost(
488 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
489 }
490
491 // Get cost estimates for the extract elements. These costs will factor into
492 // both sequences.
493 unsigned Ext0Index = Ext0IndexC->getZExtValue();
494 unsigned Ext1Index = Ext1IndexC->getZExtValue();
495
496 InstructionCost Extract0Cost =
497 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
498 InstructionCost Extract1Cost =
499 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
500
501 // A more expensive extract will always be replaced by a splat shuffle.
502 // For example, if Ext0 is more expensive:
503 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
504 // extelt (opcode (splat V0, Ext0), V1), Ext1
505 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
506 // check the cost of creating a broadcast shuffle and shuffling both
507 // operands to element 0.
508 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
509 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
510 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
511
512 // Extra uses of the extracts mean that we include those costs in the
513 // vector total because those instructions will not be eliminated.
514 InstructionCost OldCost, NewCost;
515 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
516 // Handle a special case. If the 2 extracts are identical, adjust the
517 // formulas to account for that. The extra use charge allows for either the
518 // CSE'd pattern or an unoptimized form with identical values:
519 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
520 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
521 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
522 OldCost = CheapExtractCost + ScalarOpCost;
523 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
524 } else {
525 // Handle the general case. Each extract is actually a different value:
526 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
527 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
528 NewCost = VectorOpCost + CheapExtractCost +
529 !Ext0->hasOneUse() * Extract0Cost +
530 !Ext1->hasOneUse() * Extract1Cost;
531 }
532
533 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
534 if (ConvertToShuffle) {
535 if (IsBinOp && DisableBinopExtractShuffle)
536 return true;
537
538 // If we are extracting from 2 different indexes, then one operand must be
539 // shuffled before performing the vector operation. The shuffle mask is
540 // poison except for 1 lane that is being translated to the remaining
541 // extraction lane. Therefore, it is a splat shuffle. Ex:
542 // ShufMask = { poison, poison, 0, poison }
543 // TODO: The cost model has an option for a "broadcast" shuffle
544 // (splat-from-element-0), but no option for a more general splat.
545 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
546 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
547 PoisonMaskElem);
548 ShuffleMask[BestInsIndex] = BestExtIndex;
549 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
550 VecTy, VecTy, ShuffleMask, CostKind, 0,
551 nullptr, {ConvertToShuffle});
552 } else {
553 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
554 VecTy, VecTy, {}, CostKind, 0, nullptr,
555 {ConvertToShuffle});
556 }
557 }
558
559 // Aggressively form a vector op if the cost is equal because the transform
560 // may enable further optimization.
561 // Codegen can reverse this transform (scalarize) if it was not profitable.
562 return OldCost < NewCost;
563}
564
565/// Create a shuffle that translates (shifts) 1 element from the input vector
566/// to a new element location.
567static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
568 unsigned NewIndex, IRBuilderBase &Builder) {
569 // The shuffle mask is poison except for 1 lane that is being translated
570 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
571 // ShufMask = { 2, poison, poison, poison }
572 auto *VecTy = cast<FixedVectorType>(Vec->getType());
573 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
574 ShufMask[NewIndex] = OldIndex;
575 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
576}
577
578/// Given an extract element instruction with constant index operand, shuffle
579/// the source vector (shift the scalar element) to a NewIndex for extraction.
580/// Return null if the input can be constant folded, so that we are not creating
581/// unnecessary instructions.
582static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
583 IRBuilderBase &Builder) {
584 // Shufflevectors can only be created for fixed-width vectors.
585 Value *X = ExtElt->getVectorOperand();
586 if (!isa<FixedVectorType>(X->getType()))
587 return nullptr;
588
589 // If the extract can be constant-folded, this code is unsimplified. Defer
590 // to other passes to handle that.
591 Value *C = ExtElt->getIndexOperand();
592 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
593 if (isa<Constant>(X))
594 return nullptr;
595
596 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
597 NewIndex, Builder);
598 return Shuf;
599}
600
601/// Try to reduce extract element costs by converting scalar compares to vector
602/// compares followed by extract.
603/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
604Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
605 Instruction &I) {
606 assert(isa<CmpInst>(&I) && "Expected a compare");
607
608 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
609 // --> extelt (cmp Pred V0, V1), ExtIndex
610 ++NumVecCmp;
611 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
612 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
613 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
614}
615
616/// Try to reduce extract element costs by converting scalar binops to vector
617/// binops followed by extract.
618/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
619Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
620 Instruction &I) {
621 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
622
623 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
624 // --> extelt (bo V0, V1), ExtIndex
625 ++NumVecBO;
626 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
627 V1, "foldExtExtBinop");
628
629 // All IR flags are safe to back-propagate because any potential poison
630 // created in unused vector elements is discarded by the extract.
631 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
632 VecBOInst->copyIRFlags(&I);
633
634 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
635}
636
637/// Match an instruction with extracted vector operands.
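/// For instance (hand-written example; applies only when the cost model
/// favors the vector form):
///   %e0 = extractelement <4 x i32> %x, i64 0
///   %e1 = extractelement <4 x i32> %y, i64 0
///   %r  = add i32 %e0, %e1
/// -->
///   %v  = add <4 x i32> %x, %y
///   %r  = extractelement <4 x i32> %v, i64 0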
638bool VectorCombine::foldExtractExtract(Instruction &I) {
639 // It is not safe to transform things like div, urem, etc. because we may
640 // create undefined behavior when executing those on unknown vector elements.
641 if (!isSafeToSpeculativelyExecute(&I))
642 return false;
643
644 Instruction *I0, *I1;
645 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
646 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
647 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
648 return false;
649
650 Value *V0, *V1;
651 uint64_t C0, C1;
652 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
653 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
654 V0->getType() != V1->getType())
655 return false;
656
657 // If the scalar value 'I' is going to be re-inserted into a vector, then try
658 // to create an extract to that same element. The extract/insert can be
659 // reduced to a "select shuffle".
660 // TODO: If we add a larger pattern match that starts from an insert, this
661 // probably becomes unnecessary.
662 auto *Ext0 = cast<ExtractElementInst>(I0);
663 auto *Ext1 = cast<ExtractElementInst>(I1);
664 uint64_t InsertIndex = InvalidIndex;
665 if (I.hasOneUse())
666 match(I.user_back(),
667 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
668
669 ExtractElementInst *ExtractToChange;
670 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
671 return false;
672
673 Value *ExtOp0 = Ext0->getVectorOperand();
674 Value *ExtOp1 = Ext1->getVectorOperand();
675
676 if (ExtractToChange) {
677 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
678 Value *NewExtOp =
679 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
680 if (!NewExtOp)
681 return false;
682 if (ExtractToChange == Ext0)
683 ExtOp0 = NewExtOp;
684 else
685 ExtOp1 = NewExtOp;
686 }
687
688 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
689 : Ext0->getIndexOperand();
690 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
691 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
692 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
693 Worklist.push(Ext0);
694 Worklist.push(Ext1);
695 replaceValue(I, *NewExt);
696 return true;
697}
698
699/// Try to replace an extract + scalar fneg + insert with a vector fneg +
700/// shuffle.
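/// Sketch of the handled pattern (example lane/width chosen arbitrarily):
///   %e = extractelement <4 x float> %src, i64 1
///   %n = fneg float %e
///   %r = insertelement <4 x float> %dst, float %n, i64 1
/// -->
///   %nv = fneg <4 x float> %src
///   %r  = shufflevector <4 x float> %dst, <4 x float> %nv,
///         <4 x i32> <i32 0, i32 5, i32 2, i32 3>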
701bool VectorCombine::foldInsExtFNeg(Instruction &I) {
702 // Match an insert (op (extract)) pattern.
703 Value *DstVec;
704 uint64_t ExtIdx, InsIdx;
705 Instruction *FNeg;
706 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
707 m_ConstantInt(InsIdx))))
708 return false;
709
710 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
711 Value *SrcVec;
712 Instruction *Extract;
713 if (!match(FNeg, m_FNeg(m_CombineAnd(
714 m_Instruction(Extract),
715 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
716 return false;
717
718 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
719 auto *DstVecScalarTy = DstVecTy->getScalarType();
720 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
721 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
722 return false;
723
724 // Ignore if insert/extract index is out of bounds or destination vector has
725 // one element
726 unsigned NumDstElts = DstVecTy->getNumElements();
727 unsigned NumSrcElts = SrcVecTy->getNumElements();
728 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
729 return false;
730
731 // We are inserting the negated element into the same lane that we extracted
732 // from. This is equivalent to a select-shuffle that chooses all but the
733 // negated element from the destination vector.
734 SmallVector<int> Mask(NumDstElts);
735 std::iota(Mask.begin(), Mask.end(), 0);
736 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
737 InstructionCost OldCost =
738 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
739 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
740
741 // If the extract has one use, it will be eliminated, so count it in the
742 // original cost. If it has more than one use, ignore the cost because it will
743 // be the same before/after.
744 if (Extract->hasOneUse())
745 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
746
747 InstructionCost NewCost =
748 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
749 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
750 DstVecTy, Mask, CostKind);
751
752 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
753 // If the lengths of the two vectors are not equal,
754 // we need to add a length-change vector. Add this cost.
755 SmallVector<int> SrcMask;
756 if (NeedLenChg) {
757 SrcMask.assign(NumDstElts, PoisonMaskElem);
758 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
759 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
760 DstVecTy, SrcVecTy, SrcMask, CostKind);
761 }
762
763 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
764 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
765 << "\n");
766 if (NewCost > OldCost)
767 return false;
768
769 Value *NewShuf, *LenChgShuf = nullptr;
770 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
771 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
772 if (NeedLenChg) {
773 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
774 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
775 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
776 Worklist.pushValue(LenChgShuf);
777 } else {
778 // shuffle DstVec, (fneg SrcVec), Mask
779 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
780 }
781
782 Worklist.pushValue(VecFNeg);
783 replaceValue(I, *NewShuf);
784 return true;
785}
786
787/// Try to fold insert(binop(x,y),binop(a,b),idx)
788/// --> binop(insert(x,a,idx),insert(y,b,idx))
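/// Concrete shape of the fold (illustrative example):
///   %vb = add <4 x i32> %x, %y
///   %sb = add i32 %a, %b
///   %r  = insertelement <4 x i32> %vb, i32 %sb, i64 2
/// -->
///   %i0 = insertelement <4 x i32> %x, i32 %a, i64 2
///   %i1 = insertelement <4 x i32> %y, i32 %b, i64 2
///   %r  = add <4 x i32> %i0, %i1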
789bool VectorCombine::foldInsExtBinop(Instruction &I) {
790 BinaryOperator *VecBinOp, *SclBinOp;
791 uint64_t Index;
792 if (!match(&I,
793 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
794 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
795 return false;
796
797 // TODO: Add support for addlike etc.
798 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
799 if (BinOpcode != SclBinOp->getOpcode())
800 return false;
801
802 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
803 if (!ResultTy)
804 return false;
805
806 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
807 // shuffle?
808
809 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
810 TTI.getInstructionCost(VecBinOp, CostKind) +
811 TTI.getInstructionCost(SclBinOp, CostKind);
812 InstructionCost NewCost =
813 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
814 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
815 Index, VecBinOp->getOperand(0),
816 SclBinOp->getOperand(0)) +
817 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
818 Index, VecBinOp->getOperand(1),
819 SclBinOp->getOperand(1));
820
821 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
822 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
823 << "\n");
824 if (NewCost > OldCost)
825 return false;
826
827 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
828 SclBinOp->getOperand(0), Index);
829 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
830 SclBinOp->getOperand(1), Index);
831 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
832
833 // Intersect flags from the old binops.
834 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
835 NewInst->copyIRFlags(VecBinOp);
836 NewInst->andIRFlags(SclBinOp);
837 }
838
839 Worklist.pushValue(NewIns0);
840 Worklist.pushValue(NewIns1);
841 replaceValue(I, *NewBO);
842 return true;
843}
844
845/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
846/// Supports: bitcast, trunc, sext, zext
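/// Example with zext (illustrative; the same shape applies to the other
/// supported casts):
///   %zx = zext <4 x i16> %x to <4 x i32>
///   %zy = zext <4 x i16> %y to <4 x i32>
///   %r  = and <4 x i32> %zx, %zy
/// -->
///   %a  = and <4 x i16> %x, %y
///   %r  = zext <4 x i16> %a to <4 x i32>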
847bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
848 // Check if this is a bitwise logic operation
849 auto *BinOp = dyn_cast<BinaryOperator>(&I);
850 if (!BinOp || !BinOp->isBitwiseLogicOp())
851 return false;
852
853 // Get the cast instructions
854 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
855 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
856 if (!LHSCast || !RHSCast) {
857 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
858 return false;
859 }
860
861 // Both casts must be the same type
862 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
863 if (CastOpcode != RHSCast->getOpcode())
864 return false;
865
866 // Only handle supported cast operations
867 switch (CastOpcode) {
868 case Instruction::BitCast:
869 case Instruction::Trunc:
870 case Instruction::SExt:
871 case Instruction::ZExt:
872 break;
873 default:
874 return false;
875 }
876
877 Value *LHSSrc = LHSCast->getOperand(0);
878 Value *RHSSrc = RHSCast->getOperand(0);
879
880 // Source types must match
881 if (LHSSrc->getType() != RHSSrc->getType())
882 return false;
883
884 auto *SrcTy = LHSSrc->getType();
885 auto *DstTy = I.getType();
886 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
887 // Other casts only handle vector types with integer elements.
888 if (CastOpcode != Instruction::BitCast &&
889 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
890 return false;
891
892 // Only integer scalar/vector values are legal for bitwise logic operations.
893 if (!SrcTy->getScalarType()->isIntegerTy() ||
894 !DstTy->getScalarType()->isIntegerTy())
895 return false;
896
897 // Cost Check :
898 // OldCost = bitlogic + 2*casts
899 // NewCost = bitlogic + cast
900
901 // Calculate specific costs for each cast with instruction context
902 InstructionCost LHSCastCost = TTI.getCastInstrCost(
903 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
904 InstructionCost RHSCastCost = TTI.getCastInstrCost(
905 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
906
907 InstructionCost OldCost =
908 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
909 LHSCastCost + RHSCastCost;
910
911 // For new cost, we can't provide an instruction (it doesn't exist yet)
912 InstructionCost GenericCastCost = TTI.getCastInstrCost(
913 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
914
915 InstructionCost NewCost =
916 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
917 GenericCastCost;
918
919 // Account for multi-use casts using specific costs
920 if (!LHSCast->hasOneUse())
921 NewCost += LHSCastCost;
922 if (!RHSCast->hasOneUse())
923 NewCost += RHSCastCost;
924
925 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
926 << " NewCost=" << NewCost << "\n");
927
928 if (NewCost > OldCost)
929 return false;
930
931 // Create the operation on the source type
932 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
933 BinOp->getName() + ".inner");
934 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
935 NewBinOp->copyIRFlags(BinOp);
936
937 Worklist.pushValue(NewOp);
938
939 // Create the cast operation directly to ensure we get a new instruction
940 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
941
942 // Preserve cast instruction flags
943 NewCast->copyIRFlags(LHSCast);
944 NewCast->andIRFlags(RHSCast);
945
946 // Insert the new instruction
947 Value *Result = Builder.Insert(NewCast);
948
949 replaceValue(I, *Result);
950 return true;
951}
952
953/// Match:
954// bitop(castop(x), C) ->
955// bitop(castop(x), castop(InvC)) ->
956// castop(bitop(x, InvC))
957// Supports: bitcast, zext, sext, trunc
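// Example with zext (illustrative; InvC must cast back to C losslessly):
//   %zx = zext <4 x i8> %x to <4 x i32>
//   %r  = and <4 x i32> %zx, <i32 1, i32 2, i32 3, i32 4>
// -->
//   %a  = and <4 x i8> %x, <i8 1, i8 2, i8 3, i8 4>
//   %r  = zext <4 x i8> %a to <4 x i32>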
958bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
959 Value *LHS;
960 Constant *C;
961
962 // Check if this is a bitwise logic operation
963 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
964 return false;
965
966 // Get the cast instructions
967 auto *LHSCast = dyn_cast<CastInst>(LHS);
968 if (!LHSCast)
969 return false;
970
971 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
972
973 // Only handle supported cast operations
974 switch (CastOpcode) {
975 case Instruction::BitCast:
976 case Instruction::ZExt:
977 case Instruction::SExt:
978 case Instruction::Trunc:
979 break;
980 default:
981 return false;
982 }
983
984 Value *LHSSrc = LHSCast->getOperand(0);
985
986 auto *SrcTy = LHSSrc->getType();
987 auto *DstTy = I.getType();
988 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
989 // Other casts only handle vector types with integer elements.
990 if (CastOpcode != Instruction::BitCast &&
991 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
992 return false;
993
994 // Only integer scalar/vector values are legal for bitwise logic operations.
995 if (!SrcTy->getScalarType()->isIntegerTy() ||
996 !DstTy->getScalarType()->isIntegerTy())
997 return false;
998
999 // Find the constant InvC, such that castop(InvC) equals C.
1000 PreservedCastFlags RHSFlags;
1001 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1002 if (!InvC)
1003 return false;
1004
1005 // Cost Check :
1006 // OldCost = bitlogic + cast
1007 // NewCost = bitlogic + cast
1008
1009 // Calculate specific costs for each cast with instruction context
1010 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1011 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1012
1013 InstructionCost OldCost =
1014 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1015
1016 // For new cost, we can't provide an instruction (it doesn't exist yet)
1017 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1018 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1019
1020 InstructionCost NewCost =
1021 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1022 GenericCastCost;
1023
1024 // Account for multi-use casts using specific costs
1025 if (!LHSCast->hasOneUse())
1026 NewCost += LHSCastCost;
1027
1028 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1029 << " NewCost=" << NewCost << "\n");
1030
1031 if (NewCost > OldCost)
1032 return false;
1033
1034 // Create the operation on the source type
1035 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1036 LHSSrc, InvC, I.getName() + ".inner");
1037 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1038 NewBinOp->copyIRFlags(&I);
1039
1040 Worklist.pushValue(NewOp);
1041
1042 // Create the cast operation directly to ensure we get a new instruction
1043 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1044
1045 // Preserve cast instruction flags
1046 if (RHSFlags.NNeg)
1047 NewCast->setNonNeg();
1048 if (RHSFlags.NUW)
1049 NewCast->setHasNoUnsignedWrap();
1050 if (RHSFlags.NSW)
1051 NewCast->setHasNoSignedWrap();
1052
1053 NewCast->andIRFlags(LHSCast);
1054
1055 // Insert the new instruction
1056 Value *Result = Builder.Insert(NewCast);
1057
1058 replaceValue(I, *Result);
1059 return true;
1060}
1061
1062/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1063/// destination type followed by shuffle. This can enable further transforms by
1064/// moving bitcasts or shuffles together.
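/// Example of narrowing the mask across a bitcast (illustrative widths):
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison,
///        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %b = bitcast <4 x i32> %s to <8 x i16>
/// -->
///   %c = bitcast <4 x i32> %v to <8 x i16>
///   %b = shufflevector <8 x i16> %c, <8 x i16> poison,
///        <8 x i32> <i32 6, i32 7, i32 4, i32 5, i32 2, i32 3, i32 0, i32 1>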
1065bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1066 Value *V0, *V1;
1067 ArrayRef<int> Mask;
1068 if (!match(&I, m_BitCast(m_OneUse(
1069 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1070 return false;
1071
1072 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1073 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1074 // mask for scalable type is a splat or not.
1075 // 2) Disallow non-vector casts.
1076 // TODO: We could allow any shuffle.
1077 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1078 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1079 if (!DestTy || !SrcTy)
1080 return false;
1081
1082 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1083 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1084 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1085 return false;
1086
1087 bool IsUnary = isa<UndefValue>(V1);
1088
1089 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1090 // if it won't increase the number of bitcasts.
1091 if (!IsUnary) {
1092 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1093 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1094 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1095 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1096 return false;
1097 }
1098
1099 SmallVector<int, 16> NewMask;
1100 if (DestEltSize <= SrcEltSize) {
1101 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1102 // always be expanded to the equivalent form choosing narrower elements.
1103 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1104 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1105 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1106 } else {
1107 // The bitcast is from narrow elements to wide elements. The shuffle mask
1108 // must choose consecutive elements to allow casting first.
1109 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1110 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1111 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1112 return false;
1113 }
1114
1115 // Bitcast the shuffle src - keep its original width but using the destination
1116 // scalar type.
1117 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1118 auto *NewShuffleTy =
1119 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1120 auto *OldShuffleTy =
1121 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1122 unsigned NumOps = IsUnary ? 1 : 2;
1123
1124 // The new shuffle must not cost more than the old shuffle.
1125 TargetTransformInfo::ShuffleKind SK =
1126 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1127 : TargetTransformInfo::SK_PermuteTwoSrc;
1128
1129 InstructionCost NewCost =
1130 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1131 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1132 TargetTransformInfo::CastContextHint::None,
1133 CostKind));
1134 InstructionCost OldCost =
1135 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1136 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1137 TargetTransformInfo::CastContextHint::None,
1138 CostKind);
1139
1140 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1141 << OldCost << " vs NewCost: " << NewCost << "\n");
1142
1143 if (NewCost > OldCost || !NewCost.isValid())
1144 return false;
1145
1146 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1147 ++NumShufOfBitcast;
1148 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1149 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1150 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1151 replaceValue(I, *Shuf);
1152 return true;
1153}
1154
1155/// VP Intrinsics whose vector operands are both splat values may be simplified
1156/// into the scalar version of the operation and the result splatted. This
1157/// can lead to scalarization down the line.
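/// Sketch with llvm.vp.add (illustrative; %va/%vb stand for splats of %a/%b
/// and the mask must be all-true):
///   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %va, <4 x i32> %vb,
///          <4 x i1> <i1 true, i1 true, i1 true, i1 true>, i32 %evl)
/// -->
///   %s = add i32 %a, %b
///   %r = <splat of %s>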
1158bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1159 if (!isa<VPIntrinsic>(I))
1160 return false;
1161 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1162 Value *Op0 = VPI.getArgOperand(0);
1163 Value *Op1 = VPI.getArgOperand(1);
1164
1165 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1166 return false;
1167
1168 // Check getSplatValue early in this function, to avoid doing unnecessary
1169 // work.
1170 Value *ScalarOp0 = getSplatValue(Op0);
1171 Value *ScalarOp1 = getSplatValue(Op1);
1172 if (!ScalarOp0 || !ScalarOp1)
1173 return false;
1174
1175 // For the binary VP intrinsics supported here, the result on disabled lanes
1176 // is a poison value. For now, only do this simplification if all lanes
1177 // are active.
1178 // TODO: Relax the condition that all lanes are active by using insertelement
1179 // on inactive lanes.
1180 auto IsAllTrueMask = [](Value *MaskVal) {
1181 if (Value *SplattedVal = getSplatValue(MaskVal))
1182 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1183 return ConstValue->isAllOnesValue();
1184 return false;
1185 };
1186 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1187 return false;
1188
1189 // Check to make sure we support scalarization of the intrinsic
1190 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1191 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1192 return false;
1193
1194 // Calculate cost of splatting both operands into vectors and the vector
1195 // intrinsic
1196 VectorType *VecTy = cast<VectorType>(VPI.getType());
1197 SmallVector<int> Mask;
1198 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1199 Mask.resize(FVTy->getNumElements(), 0);
1200 InstructionCost SplatCost =
1201 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1202 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1203 CostKind);
1204
1205 // Calculate the cost of the VP Intrinsic
1206 SmallVector<Type *> Args;
1207 for (Value *V : VPI.args())
1208 Args.push_back(V->getType());
1209 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1210 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1211 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1212
1213 // Determine scalar opcode
1214 std::optional<unsigned> FunctionalOpcode =
1215 VPI.getFunctionalOpcode();
1216 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1217 if (!FunctionalOpcode) {
1218 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1219 if (!ScalarIntrID)
1220 return false;
1221 }
1222
1223 // Calculate cost of scalarizing
1224 InstructionCost ScalarOpCost = 0;
1225 if (ScalarIntrID) {
1226 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1227 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1228 } else {
1229 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1230 VecTy->getScalarType(), CostKind);
1231 }
1232
1233 // The existing splats may be kept around if other instructions use them.
1234 InstructionCost CostToKeepSplats =
1235 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1236 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1237
1238 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1239 << "\n");
1240 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1241 << ", Cost of scalarizing:" << NewCost << "\n");
1242
1243 // We want to scalarize unless the vector variant actually has lower cost.
1244 if (OldCost < NewCost || !NewCost.isValid())
1245 return false;
1246
1247 // Scalarize the intrinsic
1248 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1249 Value *EVL = VPI.getArgOperand(3);
1250
1251 // If the VP op might introduce UB or poison, we can scalarize it provided
1252 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1253 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1254 // scalarizing it.
1255 bool SafeToSpeculate;
1256 if (ScalarIntrID)
1257 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1258 .hasAttribute(Attribute::AttrKind::Speculatable);
1259 else
1260 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1261 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1262 if (!SafeToSpeculate &&
1263 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1264 return false;
1265
1266 Value *ScalarVal =
1267 ScalarIntrID
1268 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1269 {ScalarOp0, ScalarOp1})
1270 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1271 ScalarOp0, ScalarOp1);
1272
1273 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1274 return true;
1275}
1276
1277/// Match a vector op/compare/intrinsic with at least one
1278/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1279/// by insertelement.
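/// Typical shape (illustrative constants):
///   %i0 = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %x, i64 0
///   %i1 = insertelement <4 x i32> <i32 5, i32 6, i32 7, i32 8>, i32 %y, i64 0
///   %r  = add <4 x i32> %i0, %i1
/// -->
///   %s = add i32 %x, %y
///   %r = insertelement <4 x i32> <i32 6, i32 8, i32 10, i32 12>, i32 %s, i64 0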
1280bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1281 auto *UO = dyn_cast<UnaryOperator>(&I);
1282 auto *BO = dyn_cast<BinaryOperator>(&I);
1283 auto *CI = dyn_cast<CmpInst>(&I);
1284 auto *II = dyn_cast<IntrinsicInst>(&I);
1285 if (!UO && !BO && !CI && !II)
1286 return false;
1287
1288 // TODO: Allow intrinsics with different argument types
1289 if (II) {
1290 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1291 return false;
1292 for (auto [Idx, Arg] : enumerate(II->args()))
1293 if (Arg->getType() != II->getType() &&
1294 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1295 return false;
1296 }
1297
1298 // Do not convert the vector condition of a vector select into a scalar
1299 // condition. That may cause problems for codegen because of differences in
1300 // boolean formats and register-file transfers.
1301 // TODO: Can we account for that in the cost model?
1302 if (CI)
1303 for (User *U : I.users())
1304 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1305 return false;
1306
1307 // Match constant vectors or scalars being inserted into constant vectors:
1308 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1309 SmallVector<Value *> VecCs, ScalarOps;
1310 std::optional<uint64_t> Index;
1311
1312 auto Ops = II ? II->args() : I.operands();
1313 for (auto [OpNum, Op] : enumerate(Ops)) {
1314 Constant *VecC;
1315 Value *V;
1316 uint64_t InsIdx = 0;
1317 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1318 m_ConstantInt(InsIdx)))) {
1319 // Bail if any inserts are out of bounds.
1320 VectorType *OpTy = cast<VectorType>(Op->getType());
1321 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1322 return false;
1323 // All inserts must have the same index.
1324 // TODO: Deal with mismatched index constants and variable indexes?
1325 if (!Index)
1326 Index = InsIdx;
1327 else if (InsIdx != *Index)
1328 return false;
1329 VecCs.push_back(VecC);
1330 ScalarOps.push_back(V);
1331 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1332 OpNum, &TTI)) {
1333 VecCs.push_back(Op.get());
1334 ScalarOps.push_back(Op.get());
1335 } else if (match(Op.get(), m_Constant(VecC))) {
1336 VecCs.push_back(VecC);
1337 ScalarOps.push_back(nullptr);
1338 } else {
1339 return false;
1340 }
1341 }
1342
1343 // Bail if all operands are constant.
1344 if (!Index.has_value())
1345 return false;
1346
1347 VectorType *VecTy = cast<VectorType>(I.getType());
1348 Type *ScalarTy = VecTy->getScalarType();
1349 assert(VecTy->isVectorTy() &&
1350 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1351 ScalarTy->isPointerTy()) &&
1352 "Unexpected types for insert element into binop or cmp");
1353
1354 unsigned Opcode = I.getOpcode();
1355 InstructionCost ScalarOpCost, VectorOpCost;
1356 if (CI) {
1357 CmpInst::Predicate Pred = CI->getPredicate();
1358 ScalarOpCost = TTI.getCmpSelInstrCost(
1359 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1360 VectorOpCost = TTI.getCmpSelInstrCost(
1361 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1362 } else if (UO || BO) {
1363 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1364 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1365 } else {
1366 IntrinsicCostAttributes ScalarICA(
1367 II->getIntrinsicID(), ScalarTy,
1368 SmallVector<Type *>(II->arg_size(), ScalarTy));
1369 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1370 IntrinsicCostAttributes VectorICA(
1371 II->getIntrinsicID(), VecTy,
1372 SmallVector<Type *>(II->arg_size(), VecTy));
1373 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1374 }
1375
1376 // Fold the vector constants in the original vectors into a new base vector to
1377 // get more accurate cost modelling.
1378 Value *NewVecC = nullptr;
1379 if (CI)
1380 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1381 else if (UO)
1382 NewVecC =
1383 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1384 else if (BO)
1385 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1386 else if (II)
1387 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1388
1389 if (!NewVecC)
1390 return false;
1391
1392 // Get cost estimate for the insert element. This cost will factor into
1393 // both sequences.
1394 InstructionCost OldCost = VectorOpCost;
1395 InstructionCost NewCost =
1396 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1397 CostKind, *Index, NewVecC);
1398
1399 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1400 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1401 II->getIntrinsicID(), Idx, &TTI)))
1402 continue;
1403 InstructionCost InsertCost = TTI.getVectorInstrCost(
1404 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1405 OldCost += InsertCost;
1406 NewCost += !Op->hasOneUse() * InsertCost;
1407 }
1408
1409 // We want to scalarize unless the vector variant actually has lower cost.
1410 if (OldCost < NewCost || !NewCost.isValid())
1411 return false;
1412
1413 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1414 // inselt NewVecC, (scalar_op V0, V1), Index
1415 if (CI)
1416 ++NumScalarCmp;
1417 else if (UO || BO)
1418 ++NumScalarOps;
1419 else
1420 ++NumScalarIntrinsic;
1421
1422 // For constant cases, extract the scalar element, this should constant fold.
1423 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1424 if (!Scalar)
1425 ScalarOps[OpIdx] = ConstantExpr::getExtractElement(
1426 cast<Constant>(VecC), Builder.getInt64(*Index));
1427
1428 Value *Scalar;
1429 if (CI)
1430 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1431 else if (UO || BO)
1432 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1433 else
1434 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1435
1436 Scalar->setName(I.getName() + ".scalar");
1437
1438 // All IR flags are safe to back-propagate. There is no potential for extra
1439 // poison to be created by the scalar instruction.
1440 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1441 ScalarInst->copyIRFlags(&I);
1442
1443 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1444 replaceValue(I, *Insert);
1445 return true;
1446}
1447
1448/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1449/// a vector into vector operations followed by extract. Note: The SLP pass
1450/// may miss this pattern because of implementation problems.
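/// Example (illustrative; assumes extracting lane 3 is the more expensive
/// extract, so it is the one replaced by a shuffle):
///   %e0 = extractelement <4 x i32> %x, i64 0
///   %e1 = extractelement <4 x i32> %x, i64 3
///   %c0 = icmp sgt i32 %e0, 42
///   %c1 = icmp sgt i32 %e1, 7
///   %r  = and i1 %c0, %c1
/// -->
///   %vc = icmp sgt <4 x i32> %x, <i32 42, i32 poison, i32 poison, i32 7>
///   %sh = shufflevector <4 x i1> %vc, <4 x i1> poison,
///         <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
///   %vb = and <4 x i1> %vc, %sh
///   %r  = extractelement <4 x i1> %vb, i64 0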
1451bool VectorCombine::foldExtractedCmps(Instruction &I) {
1452 auto *BI = dyn_cast<BinaryOperator>(&I);
1453
1454 // We are looking for a scalar binop of booleans.
1455 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1456 if (!BI || !I.getType()->isIntegerTy(1))
1457 return false;
1458
1459 // The compare predicates should match, and each compare should have a
1460 // constant operand.
1461 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1462 Instruction *I0, *I1;
1463 Constant *C0, *C1;
1464 CmpPredicate P0, P1;
1465 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1466 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1467 return false;
1468
1469 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1470 if (!MatchingPred)
1471 return false;
1472
1473 // The compare operands must be extracts of the same vector with constant
1474 // extract indexes.
1475 Value *X;
1476 uint64_t Index0, Index1;
1477 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1478 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1479 return false;
1480
1481 auto *Ext0 = cast<ExtractElementInst>(I0);
1482 auto *Ext1 = cast<ExtractElementInst>(I1);
1483 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1484 if (!ConvertToShuf)
1485 return false;
1486 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1487 "Unknown ExtractElementInst");
1488
1489 // The original scalar pattern is:
1490 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1491 CmpInst::Predicate Pred = *MatchingPred;
1492 unsigned CmpOpcode =
1493 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1494 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1495 if (!VecTy)
1496 return false;
1497
1498 InstructionCost Ext0Cost =
1499 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1500 InstructionCost Ext1Cost =
1501 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1502   InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1503       CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1504 CostKind);
1505
1506 InstructionCost OldCost =
1507 Ext0Cost + Ext1Cost + CmpCost * 2 +
1508 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1509
1510 // The proposed vector pattern is:
1511 // vcmp = cmp Pred X, VecC
1512 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1513 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1514 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1515   auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(VecTy));
1516   InstructionCost NewCost = TTI.getCmpSelInstrCost(
1517       CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1518 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1519 ShufMask[CheapIndex] = ExpensiveIndex;
1520   NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1521                                 CmpTy, ShufMask, CostKind);
1522 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1523 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1524 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1525 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1526
1527 // Aggressively form vector ops if the cost is equal because the transform
1528 // may enable further optimization.
1529 // Codegen can reverse this transform (scalarize) if it was not profitable.
1530 if (OldCost < NewCost || !NewCost.isValid())
1531 return false;
1532
1533 // Create a vector constant from the 2 scalar constants.
1534 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1535 PoisonValue::get(VecTy->getElementType()));
1536 CmpC[Index0] = C0;
1537 CmpC[Index1] = C1;
1538 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1539 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1540 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1541 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1542 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1543 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1544 replaceValue(I, *NewExt);
1545 ++NumVecCmpBO;
1546 return true;
1547}
1548
1549 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1550                                       TTI::TargetCostKind CostKind,
1551                                       const TargetTransformInfo &TTI,
1552 InstructionCost &CostBeforeReduction,
1553 InstructionCost &CostAfterReduction) {
1554 Instruction *Op0, *Op1;
1555 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1556 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1557 unsigned ReductionOpc =
1558 getArithmeticReductionInstruction(II.getIntrinsicID());
1559 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1560 bool IsUnsigned = isa<ZExtInst>(RedOp);
1561 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1562
1563 CostBeforeReduction =
1564         TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1565                              TTI::CastContextHint::None, CostKind);
1566 CostAfterReduction =
1567 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1568 ExtType, FastMathFlags(), CostKind);
1569 return;
1570 }
1571 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1572 match(RedOp,
1573             m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1574       match(Op0, m_ZExtOrSExt(m_Value())) &&
1575 Op0->getOpcode() == Op1->getOpcode() &&
1576 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1577 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1578     // Matched reduce.add(ext(mul(ext(A), ext(B))))
1579 bool IsUnsigned = isa<ZExtInst>(Op0);
1580 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1581 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1582
1583 InstructionCost ExtCost =
1584         TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1585                              TTI::CastContextHint::None, CostKind);
1586 InstructionCost MulCost =
1587 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1588 InstructionCost Ext2Cost =
1589         TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1590                              TTI::CastContextHint::None, CostKind);
1591
1592 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1593 CostAfterReduction = TTI.getMulAccReductionCost(
1594 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1595 return;
1596 }
1597 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1598 std::nullopt, CostKind);
1599}
1600
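// For illustration only (hypothetical IR, not taken from this file),
// foldBinopOfReductions may rewrite:
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v0)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v1)
//   %r  = add i32 %r0, %r1
// into:
//   %v  = add <4 x i32> %v0, %v1
//   %r  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
// when the cost model considers the single reduction cheaper.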
1601bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1602 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1603 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1604 if (BinOpOpc == Instruction::Sub)
1605 ReductionIID = Intrinsic::vector_reduce_add;
1606 if (ReductionIID == Intrinsic::not_intrinsic)
1607 return false;
1608
1609 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1610 Intrinsic::ID IID) -> Value * {
1611 auto *II = dyn_cast<IntrinsicInst>(V);
1612 if (!II)
1613 return nullptr;
1614 if (II->getIntrinsicID() == IID && II->hasOneUse())
1615 return II->getArgOperand(0);
1616 return nullptr;
1617 };
1618
1619 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1620 if (!V0)
1621 return false;
1622 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1623 if (!V1)
1624 return false;
1625
1626 auto *VTy = cast<VectorType>(V0->getType());
1627 if (V1->getType() != VTy)
1628 return false;
1629 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1630 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1631 unsigned ReductionOpc =
1632 getArithmeticReductionInstruction(II0.getIntrinsicID());
1633
1634 InstructionCost OldCost = 0;
1635 InstructionCost NewCost = 0;
1636 InstructionCost CostOfRedOperand0 = 0;
1637 InstructionCost CostOfRed0 = 0;
1638 InstructionCost CostOfRedOperand1 = 0;
1639 InstructionCost CostOfRed1 = 0;
1640 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1641 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1642 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1643 NewCost =
1644 CostOfRedOperand0 + CostOfRedOperand1 +
1645 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1646 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1647 if (NewCost >= OldCost || !NewCost.isValid())
1648 return false;
1649
1650 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1651 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1652 << "\n");
1653 Value *VectorBO;
1654 if (BinOpOpc == Instruction::Or)
1655 VectorBO = Builder.CreateOr(V0, V1, "",
1656 cast<PossiblyDisjointInst>(I).isDisjoint());
1657 else
1658 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1659
1660 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1661 replaceValue(I, *Rdx);
1662 return true;
1663}
1664
1665 // Check if the memory location is modified between two instructions in the same BB.
1666 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1667                                  BasicBlock::iterator End,
1668                                  const MemoryLocation &Loc, AAResults &AA) {
1669 unsigned NumScanned = 0;
1670 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1671 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1672 ++NumScanned > MaxInstrsToScan;
1673 });
1674}
1675
1676namespace {
1677/// Helper class to indicate whether a vector index can be safely scalarized and
1678/// if a freeze needs to be inserted.
1679class ScalarizationResult {
1680 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1681
1682 StatusTy Status;
1683 Value *ToFreeze;
1684
1685 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1686 : Status(Status), ToFreeze(ToFreeze) {}
1687
1688public:
1689 ScalarizationResult(const ScalarizationResult &Other) = default;
1690 ~ScalarizationResult() {
1691 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1692 }
1693
1694 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1695 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1696 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1697 return {StatusTy::SafeWithFreeze, ToFreeze};
1698 }
1699
1700   /// Returns true if the index can be scalarized without requiring a freeze.
1701 bool isSafe() const { return Status == StatusTy::Safe; }
1702 /// Returns true if the index cannot be scalarized.
1703 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1704   /// Returns true if the index can be scalarized, but requires inserting a
1705   /// freeze.
1706 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1707
1708   /// Reset the state to Unsafe and clear ToFreeze if set.
1709 void discard() {
1710 ToFreeze = nullptr;
1711 Status = StatusTy::Unsafe;
1712 }
1713
1714 /// Freeze the ToFreeze and update the use in \p User to use it.
1715 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1716 assert(isSafeWithFreeze() &&
1717 "should only be used when freezing is required");
1718 assert(is_contained(ToFreeze->users(), &UserI) &&
1719 "UserI must be a user of ToFreeze");
1720 IRBuilder<>::InsertPointGuard Guard(Builder);
1721 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1722 Value *Frozen =
1723 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1724 for (Use &U : make_early_inc_range((UserI.operands())))
1725 if (U.get() == ToFreeze)
1726 U.set(Frozen);
1727
1728 ToFreeze = nullptr;
1729 }
1730};
1731} // namespace
1732
1733/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1734/// Idx. \p Idx must access a valid vector element.
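/// For illustration (hypothetical IR): with a <4 x i32> vector, an index such
/// as
///   %m = and i64 %i, 3
/// is always in range, but %i itself may be poison, so the access is reported
/// as safe-with-freeze on %i rather than unconditionally safe.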
1735static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1736 Instruction *CtxI,
1737 AssumptionCache &AC,
1738 const DominatorTree &DT) {
1739 // We do checks for both fixed vector types and scalable vector types.
1740 // This is the number of elements of fixed vector types,
1741 // or the minimum number of elements of scalable vector types.
1742 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1743 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1744
1745 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1746 if (C->getValue().ult(NumElements))
1747 return ScalarizationResult::safe();
1748 return ScalarizationResult::unsafe();
1749 }
1750
1751 // Always unsafe if the index type can't handle all inbound values.
1752 if (!llvm::isUIntN(IntWidth, NumElements))
1753 return ScalarizationResult::unsafe();
1754
1755 APInt Zero(IntWidth, 0);
1756 APInt MaxElts(IntWidth, NumElements);
1757 ConstantRange ValidIndices(Zero, MaxElts);
1758 ConstantRange IdxRange(IntWidth, true);
1759
1760 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1761 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1762 true, &AC, CtxI, &DT)))
1763 return ScalarizationResult::safe();
1764 return ScalarizationResult::unsafe();
1765 }
1766
1767 // If the index may be poison, check if we can insert a freeze before the
1768 // range of the index is restricted.
1769 Value *IdxBase;
1770 ConstantInt *CI;
1771 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1772 IdxRange = IdxRange.binaryAnd(CI->getValue());
1773 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1774 IdxRange = IdxRange.urem(CI->getValue());
1775 }
1776
1777 if (ValidIndices.contains(IdxRange))
1778 return ScalarizationResult::safeWithFreeze(IdxBase);
1779 return ScalarizationResult::unsafe();
1780}
1781
1782/// The memory operation on a vector of \p ScalarType had alignment of
1783/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1784/// alignment that will be valid for the memory operation on a single scalar
1785/// element of the same type with index \p Idx.
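/// For illustration (hypothetical values): a <4 x i32> access with alignment
/// 16 and constant index 2 yields commonAlignment(16, 2 * 4) == 8 for the
/// scalar element access, while an unknown index conservatively yields
/// commonAlignment(16, 4) == 4.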
1786 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1787                                                 Type *ScalarType, Value *Idx,
1788 const DataLayout &DL) {
1789 if (auto *C = dyn_cast<ConstantInt>(Idx))
1790 return commonAlignment(VectorAlignment,
1791 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1792 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1793}
1794
1795// Combine patterns like:
1796// %0 = load <4 x i32>, <4 x i32>* %a
1797// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1798// store <4 x i32> %1, <4 x i32>* %a
1799// to:
1800// %0 = bitcast <4 x i32>* %a to i32*
1801// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1802// store i32 %b, i32* %1
1803bool VectorCombine::foldSingleElementStore(Instruction &I) {
1804   if (!TTI.allowVectorElementIndexingUsingGEP())
1805     return false;
1806 auto *SI = cast<StoreInst>(&I);
1807 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1808 return false;
1809
1810 // TODO: Combine more complicated patterns (multiple insert) by referencing
1811 // TargetTransformInfo.
1812   Instruction *Source;
1813   Value *NewElement;
1814 Value *Idx;
1815 if (!match(SI->getValueOperand(),
1816 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1817 m_Value(Idx))))
1818 return false;
1819
1820 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1821 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1822 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1823     // Don't optimize atomic/volatile loads or stores. Ensure memory is not
1824     // modified in between, the element size matches its store size, and the index is inbounds.
1825 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1826 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1827 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1828 return false;
1829
1830 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1831 if (ScalarizableIdx.isUnsafe() ||
1832 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1833 MemoryLocation::get(SI), AA))
1834 return false;
1835
1836     // Ensure we add the load back to the worklist BEFORE its users so they can
1837     // be erased in the correct order.
1838 Worklist.push(Load);
1839
1840 if (ScalarizableIdx.isSafeWithFreeze())
1841 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1842 Value *GEP = Builder.CreateInBoundsGEP(
1843 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1844 {ConstantInt::get(Idx->getType(), 0), Idx});
1845 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1846 NSI->copyMetadata(*SI);
1847 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1848 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1849 *DL);
1850 NSI->setAlignment(ScalarOpAlignment);
1851 replaceValue(I, *NSI);
1852     eraseInstruction(I);
1853     return true;
1854 }
1855
1856 return false;
1857}
1858
1859/// Try to scalarize vector loads feeding extractelement or bitcast
1860/// instructions.
1861bool VectorCombine::scalarizeLoad(Instruction &I) {
1862 Value *Ptr;
1863 if (!match(&I, m_Load(m_Value(Ptr))))
1864 return false;
1865
1866 auto *LI = cast<LoadInst>(&I);
1867 auto *VecTy = cast<VectorType>(LI->getType());
1868 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1869 return false;
1870
1871 bool AllExtracts = true;
1872 bool AllBitcasts = true;
1873 Instruction *LastCheckedInst = LI;
1874 unsigned NumInstChecked = 0;
1875
1876   // Check what type of users we have (they must all be either extracts or
1877   // bitcasts) and ensure no memory modifications between the load and
1878   // its users.
1879 for (User *U : LI->users()) {
1880 auto *UI = dyn_cast<Instruction>(U);
1881 if (!UI || UI->getParent() != LI->getParent())
1882 return false;
1883
1884 // If any user is waiting to be erased, then bail out as this will
1885 // distort the cost calculation and possibly lead to infinite loops.
1886 if (UI->use_empty())
1887 return false;
1888
1889 if (!isa<ExtractElementInst>(UI))
1890 AllExtracts = false;
1891 if (!isa<BitCastInst>(UI))
1892 AllBitcasts = false;
1893
1894 // Check if any instruction between the load and the user may modify memory.
1895 if (LastCheckedInst->comesBefore(UI)) {
1896 for (Instruction &I :
1897 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1898 // Bail out if we reached the check limit or the instruction may write
1899 // to memory.
1900 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1901 return false;
1902 NumInstChecked++;
1903 }
1904 LastCheckedInst = UI;
1905 }
1906 }
1907
1908 if (AllExtracts)
1909 return scalarizeLoadExtract(LI, VecTy, Ptr);
1910 if (AllBitcasts)
1911 return scalarizeLoadBitcast(LI, VecTy, Ptr);
1912 return false;
1913}
1914
1915/// Try to scalarize vector loads feeding extractelement instructions.
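/// For illustration (hypothetical IR), a load whose only users are extracts:
///   %v = load <4 x i32>, ptr %p
///   %a = extractelement <4 x i32> %v, i64 0
///   %b = extractelement <4 x i32> %v, i64 %i
/// may become, if the cost model agrees:
///   %gep0 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 0
///   %a    = load i32, ptr %gep0
///   %gep1 = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 %i
///   %b    = load i32, ptr %gep1
/// (with a freeze of %i inserted first if %i might be poison).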
1916bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
1917 Value *Ptr) {
1918   if (!TTI.allowVectorElementIndexingUsingGEP())
1919     return false;
1920
1921 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1922 auto FailureGuard = make_scope_exit([&]() {
1923 // If the transform is aborted, discard the ScalarizationResults.
1924 for (auto &Pair : NeedFreeze)
1925 Pair.second.discard();
1926 });
1927
1928 InstructionCost OriginalCost =
1929       TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1930                           LI->getPointerAddressSpace(), CostKind);
1931 InstructionCost ScalarizedCost = 0;
1932
1933 for (User *U : LI->users()) {
1934 auto *UI = cast<ExtractElementInst>(U);
1935
1936 auto ScalarIdx =
1937 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1938 if (ScalarIdx.isUnsafe())
1939 return false;
1940 if (ScalarIdx.isSafeWithFreeze()) {
1941 NeedFreeze.try_emplace(UI, ScalarIdx);
1942 ScalarIdx.discard();
1943 }
1944
1945 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1946 OriginalCost +=
1947 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1948 Index ? Index->getZExtValue() : -1);
1949 ScalarizedCost +=
1950         TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1951                             Align(1), LI->getPointerAddressSpace(), CostKind);
1952 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1953 nullptr, nullptr, CostKind);
1954 }
1955
1956 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
1957 << "\n LoadExtractCost: " << OriginalCost
1958 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1959
1960 if (ScalarizedCost >= OriginalCost)
1961 return false;
1962
1963   // Ensure we add the load back to the worklist BEFORE its users so they can
1964   // be erased in the correct order.
1965 Worklist.push(LI);
1966
1967 Type *ElemType = VecTy->getElementType();
1968
1969 // Replace extracts with narrow scalar loads.
1970 for (User *U : LI->users()) {
1971 auto *EI = cast<ExtractElementInst>(U);
1972 Value *Idx = EI->getIndexOperand();
1973
1974 // Insert 'freeze' for poison indexes.
1975 auto It = NeedFreeze.find(EI);
1976 if (It != NeedFreeze.end())
1977 It->second.freeze(Builder, *cast<Instruction>(Idx));
1978
1979 Builder.SetInsertPoint(EI);
1980 Value *GEP =
1981 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1982 auto *NewLoad = cast<LoadInst>(
1983 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1984
1985 Align ScalarOpAlignment =
1986 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1987 NewLoad->setAlignment(ScalarOpAlignment);
1988
1989 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1990 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1991 AAMDNodes OldAAMD = LI->getAAMetadata();
1992 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1993 }
1994
1995 replaceValue(*EI, *NewLoad, false);
1996 }
1997
1998 FailureGuard.release();
1999 return true;
2000}
2001
2002/// Try to scalarize vector loads feeding bitcast instructions.
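/// For illustration (hypothetical IR), a vector load used only by same-width
/// scalar bitcasts:
///   %v = load <4 x i8>, ptr %p
///   %s = bitcast <4 x i8> %v to i32
/// may be replaced, when profitable, by a single scalar load:
///   %s = load i32, ptr %p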
2003bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2004 Value *Ptr) {
2005 InstructionCost OriginalCost =
2006       TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2007                           LI->getPointerAddressSpace(), CostKind);
2008
2009 Type *TargetScalarType = nullptr;
2010 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2011
2012 for (User *U : LI->users()) {
2013 auto *BC = cast<BitCastInst>(U);
2014
2015 Type *DestTy = BC->getDestTy();
2016 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2017 return false;
2018
2019 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2020 if (DestBitWidth != VecBitWidth)
2021 return false;
2022
2023 // All bitcasts must target the same scalar type.
2024 if (!TargetScalarType)
2025 TargetScalarType = DestTy;
2026 else if (TargetScalarType != DestTy)
2027 return false;
2028
2029 OriginalCost +=
2030         TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2031                              TTI::CastContextHint::None, CostKind);
2032 }
2033
2034 if (!TargetScalarType)
2035 return false;
2036
2037 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2038 InstructionCost ScalarizedCost =
2039       TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2040                           LI->getPointerAddressSpace(), CostKind);
2041
2042 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2043 << "\n OriginalCost: " << OriginalCost
2044 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2045
2046 if (ScalarizedCost >= OriginalCost)
2047 return false;
2048
2049   // Ensure we add the load back to the worklist BEFORE its users so they can
2050   // be erased in the correct order.
2051 Worklist.push(LI);
2052
2053 Builder.SetInsertPoint(LI);
2054 auto *ScalarLoad =
2055 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2056 ScalarLoad->setAlignment(LI->getAlign());
2057 ScalarLoad->copyMetadata(*LI);
2058
2059 // Replace all bitcast users with the scalar load.
2060 for (User *U : LI->users()) {
2061 auto *BC = cast<BitCastInst>(U);
2062 replaceValue(*BC, *ScalarLoad, false);
2063 }
2064
2065 return true;
2066}
2067
2068bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2070 return false;
2071 auto *Ext = dyn_cast<ZExtInst>(&I);
2072 if (!Ext)
2073 return false;
2074
2075   // Try to convert a vector zext feeding only extracts to a set of scalar
2076   //   (Src >> (ExtIdx * EltBitWidth)) & ((1 << EltBitWidth) - 1)
2077   // operations, if profitable.
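  // For illustration (hypothetical IR, little-endian), with %src : <4 x i8>:
  //   %z  = zext <4 x i8> %src to <4 x i32>
  //   %e1 = extractelement <4 x i32> %z, i64 1
  // may become:
  //   %s         = bitcast <4 x i8> %src to i32
  //   %sh        = lshr i32 %s, 8
  //   %e1.scalar = and i32 %sh, 255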
2078 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2079 if (!SrcTy)
2080 return false;
2081 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2082
2083 Type *ScalarDstTy = DstTy->getElementType();
2084 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2085 return false;
2086
2087 InstructionCost VectorCost =
2088       TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2089                            TTI::CastContextHint::None, CostKind);
2090 unsigned ExtCnt = 0;
2091 bool ExtLane0 = false;
2092 for (User *U : Ext->users()) {
2093 uint64_t Idx;
2094 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2095 return false;
2096 if (cast<Instruction>(U)->use_empty())
2097 continue;
2098 ExtCnt += 1;
2099 ExtLane0 |= !Idx;
2100 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2101 CostKind, Idx, U);
2102 }
2103
2104   InstructionCost ScalarCost =
2105       ExtCnt * TTI.getArithmeticInstrCost(
2106                    Instruction::And, ScalarDstTy, CostKind,
2107                    {TTI::OK_AnyValue, TTI::OP_None},
2108                    {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2109       (ExtCnt - ExtLane0) *
2110           TTI.getArithmeticInstrCost(
2111               Instruction::LShr, ScalarDstTy, CostKind,
2112               {TTI::OK_AnyValue, TTI::OP_None},
2113               {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2114   if (ScalarCost > VectorCost)
2115 return false;
2116
2117 Value *ScalarV = Ext->getOperand(0);
2118 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2119 &DT)) {
2120     // Check whether all lanes are extracted, all extracts trigger UB
2121 // on poison, and the last extract (and hence all previous ones)
2122 // are guaranteed to execute if Ext executes. If so, we do not
2123 // need to insert a freeze.
2124 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2125 bool AllExtractsTriggerUB = true;
2126 ExtractElementInst *LastExtract = nullptr;
2127 BasicBlock *ExtBB = Ext->getParent();
2128 for (User *U : Ext->users()) {
2129 auto *Extract = cast<ExtractElementInst>(U);
2130 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2131 AllExtractsTriggerUB = false;
2132 break;
2133 }
2134 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2135 if (!LastExtract || LastExtract->comesBefore(Extract))
2136 LastExtract = Extract;
2137 }
2138 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2139 !AllExtractsTriggerUB ||
2140         !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2141                                                     LastExtract->getIterator()))
2142 ScalarV = Builder.CreateFreeze(ScalarV);
2143 }
2144 ScalarV = Builder.CreateBitCast(
2145 ScalarV,
2146 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2147 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2148 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2149 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2150 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2151 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2152 for (User *U : Ext->users()) {
2153 auto *Extract = cast<ExtractElementInst>(U);
2154 uint64_t Idx =
2155 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2156 uint64_t ShiftAmt =
2157 DL->isBigEndian()
2158 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2159 : (Idx * SrcEltSizeInBits);
2160 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2161 Value *And = Builder.CreateAnd(LShr, Mask);
2162 U->replaceAllUsesWith(And);
2163 }
2164 return true;
2165}
2166
2167/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2168/// to "(bitcast (concat X, Y))"
2169/// where X/Y are bitcasted from i1 mask vectors.
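/// For illustration (hypothetical IR, little-endian), with two <4 x i1> masks:
///   %xi = bitcast <4 x i1> %a to i4
///   %zx = zext i4 %xi to i8
///   %yi = bitcast <4 x i1> %b to i4
///   %zy = zext i4 %yi to i8
///   %sh = shl i8 %zy, 4
///   %r  = or disjoint i8 %zx, %sh
/// may become:
///   %cc = shufflevector <4 x i1> %a, <4 x i1> %b,
///                       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
///                                  i32 4, i32 5, i32 6, i32 7>
///   %r  = bitcast <8 x i1> %cc to i8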
2170bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2171 Type *Ty = I.getType();
2172 if (!Ty->isIntegerTy())
2173 return false;
2174
2175 // TODO: Add big endian test coverage
2176 if (DL->isBigEndian())
2177 return false;
2178
2179 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2180 Instruction *X, *Y;
2181   if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2182     return false;
2183
2184 // Allow both sources to contain shl, to handle more generic pattern:
2185 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2186 Value *SrcX;
2187 uint64_t ShAmtX = 0;
2188 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2189 !match(X, m_OneUse(
2190                     m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2191                           m_ConstantInt(ShAmtX)))))
2192 return false;
2193
2194 Value *SrcY;
2195 uint64_t ShAmtY = 0;
2196 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2197 !match(Y, m_OneUse(
2198                     m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2199                           m_ConstantInt(ShAmtY)))))
2200 return false;
2201
2202 // Canonicalize larger shift to the RHS.
2203 if (ShAmtX > ShAmtY) {
2204 std::swap(X, Y);
2205 std::swap(SrcX, SrcY);
2206 std::swap(ShAmtX, ShAmtY);
2207 }
2208
2209 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2210 // difference is the mask width so they can be easily concatenated together.
2211 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2212 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2213 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2214 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2215 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2216 !MaskTy->getElementType()->isIntegerTy(1) ||
2217 MaskTy->getNumElements() != ShAmtDiff ||
2218 MaskTy->getNumElements() > (BitWidth / 2))
2219 return false;
2220
2221 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2222 auto *ConcatIntTy =
2223 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2224 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2225
2226 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2227 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2228
2229 // TODO: Is it worth supporting multi use cases?
2230 InstructionCost OldCost = 0;
2231 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2232 OldCost +=
2233 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2234 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2235                                        TTI::CastContextHint::None, CostKind);
2236   OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2237                                        TTI::CastContextHint::None, CostKind);
2238
2239 InstructionCost NewCost = 0;
2240   NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2241                                 MaskTy, ConcatMask, CostKind);
2242   NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2243                                   TTI::CastContextHint::None, CostKind);
2244 if (Ty != ConcatIntTy)
2245     NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2246                                     TTI::CastContextHint::None, CostKind);
2247 if (ShAmtX > 0)
2248 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2249
2250 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2251 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2252 << "\n");
2253
2254 if (NewCost > OldCost)
2255 return false;
2256
2257 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2258 // any residual zero-extension or shifting.
2259 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2260 Worklist.pushValue(Concat);
2261
2262 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2263
2264 if (Ty != ConcatIntTy) {
2265 Worklist.pushValue(Result);
2266 Result = Builder.CreateZExt(Result, Ty);
2267 }
2268
2269 if (ShAmtX > 0) {
2270 Worklist.pushValue(Result);
2271 Result = Builder.CreateShl(Result, ShAmtX);
2272 }
2273
2274 replaceValue(I, *Result);
2275 return true;
2276}
2277
2278/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2279/// --> "binop (shuffle), (shuffle)".
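/// For illustration (hypothetical IR):
///   %s = shufflevector <4 x i32> %x, <4 x i32> %y,
///                      <4 x i32> <i32 1, i32 0, i32 3, i32 2>
///   %b = add <4 x i32> %s, %z
///   %p = shufflevector <4 x i32> %b, <4 x i32> poison,
///                      <4 x i32> <i32 3, i32 2, i32 1, i32 0>
/// may become:
///   %ns0 = shufflevector <4 x i32> %x, <4 x i32> %y,
///                        <4 x i32> <i32 2, i32 3, i32 0, i32 1>
///   %ns1 = shufflevector <4 x i32> %z, <4 x i32> poison,
///                        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %p   = add <4 x i32> %ns0, %ns1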
2280bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2281 BinaryOperator *BinOp;
2282 ArrayRef<int> OuterMask;
2283 if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2284 return false;
2285
2286 // Don't introduce poison into div/rem.
2287 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2288 return false;
2289
2290 Value *Op00, *Op01, *Op10, *Op11;
2291 ArrayRef<int> Mask0, Mask1;
2292 bool Match0 = match(BinOp->getOperand(0),
2293 m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2294 bool Match1 = match(BinOp->getOperand(1),
2295 m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2296 if (!Match0 && !Match1)
2297 return false;
2298
2299 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2300 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2301 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2302 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2303
2304 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2305 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2306 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2307 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2308 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2309 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2310 return false;
2311
2312 unsigned NumSrcElts = BinOpTy->getNumElements();
2313
2314   // Don't accept shuffles that reference the second operand in
2315   // div/rem or if it's an undef arg.
2316 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2317 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2318 return false;
2319
2320 // Merge outer / inner (or identity if no match) shuffles.
2321 SmallVector<int> NewMask0, NewMask1;
2322 for (int M : OuterMask) {
2323 if (M < 0 || M >= (int)NumSrcElts) {
2324 NewMask0.push_back(PoisonMaskElem);
2325 NewMask1.push_back(PoisonMaskElem);
2326 } else {
2327 NewMask0.push_back(Match0 ? Mask0[M] : M);
2328 NewMask1.push_back(Match1 ? Mask1[M] : M);
2329 }
2330 }
2331
2332 unsigned NumOpElts = Op0Ty->getNumElements();
2333 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2334 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2335 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2336 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2337 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2338 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2339
2340 InstructionCost NewCost = 0;
2341 // Try to merge shuffles across the binop if the new shuffles are not costly.
2342 InstructionCost BinOpCost =
2343 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2344 InstructionCost OldCost =
2345       BinOpCost + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
2346                                      ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2347 0, nullptr, {BinOp}, &I);
2348 if (!BinOp->hasOneUse())
2349 NewCost += BinOpCost;
2350
2351 if (Match0) {
2352     InstructionCost Shuf0Cost = TTI.getShuffleCost(
2353         TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2354 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2355 OldCost += Shuf0Cost;
2356 if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2357 NewCost += Shuf0Cost;
2358 }
2359 if (Match1) {
2360     InstructionCost Shuf1Cost = TTI.getShuffleCost(
2361         TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2362 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2363 OldCost += Shuf1Cost;
2364 if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2365 NewCost += Shuf1Cost;
2366 }
2367
2368 NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2369
2370 if (!IsIdentity0)
2371 NewCost +=
2372         TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2373                            Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2374 if (!IsIdentity1)
2375 NewCost +=
2376         TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2377                            Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2378
2379 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2380 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2381 << "\n");
2382
2383 // If costs are equal, still fold as we reduce instruction count.
2384 if (NewCost > OldCost)
2385 return false;
2386
2387 Value *LHS =
2388 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2389 Value *RHS =
2390 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2391 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2392
2393 // Intersect flags from the old binops.
2394 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2395 NewInst->copyIRFlags(BinOp);
2396
2397 Worklist.pushValue(LHS);
2398 Worklist.pushValue(RHS);
2399 replaceValue(I, *NewBO);
2400 return true;
2401}
2402
2403/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2404/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
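/// For illustration (hypothetical IR):
///   %a = add <4 x i32> %x, %y
///   %b = add <4 x i32> %z, %w
///   %s = shufflevector <4 x i32> %a, <4 x i32> %b,
///                      <4 x i32> <i32 0, i32 4, i32 1, i32 5>
/// may become:
///   %s0 = shufflevector <4 x i32> %x, <4 x i32> %z,
///                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %s1 = shufflevector <4 x i32> %y, <4 x i32> %w,
///                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %s  = add <4 x i32> %s0, %s1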
2405bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2406 ArrayRef<int> OldMask;
2407 Instruction *LHS, *RHS;
2408   if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2409                            m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2410 return false;
2411
2412 // TODO: Add support for addlike etc.
2413 if (LHS->getOpcode() != RHS->getOpcode())
2414 return false;
2415
2416 Value *X, *Y, *Z, *W;
2417 bool IsCommutative = false;
2418 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2419 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2420 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2421 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2422 auto *BO = cast<BinaryOperator>(LHS);
2423 // Don't introduce poison into div/rem.
2424 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2425 return false;
2426 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2427 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2428 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2429 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2430 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2431 } else
2432 return false;
2433
2434 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2435 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2436 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2437 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2438 return false;
2439
2440 unsigned NumSrcElts = BinOpTy->getNumElements();
2441
2442 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2443 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2444 std::swap(X, Y);
2445
2446 auto ConvertToUnary = [NumSrcElts](int &M) {
2447 if (M >= (int)NumSrcElts)
2448 M -= NumSrcElts;
2449 };
2450
2451 SmallVector<int> NewMask0(OldMask);
2452   TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2453   if (X == Z) {
2454 llvm::for_each(NewMask0, ConvertToUnary);
2455     SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2456     Z = PoisonValue::get(BinOpTy);
2457 }
2458
2459 SmallVector<int> NewMask1(OldMask);
2460   TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2461   if (Y == W) {
2462 llvm::for_each(NewMask1, ConvertToUnary);
2463     SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2464     W = PoisonValue::get(BinOpTy);
2465 }
2466
2467 // Try to replace a binop with a shuffle if the shuffle is not costly.
2468 InstructionCost OldCost =
2469       TTI.getInstructionCost(LHS, CostKind) +
2470       TTI.getInstructionCost(RHS, CostKind) +
2471       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2472                          BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2473 &I);
2474
2475 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2476 // where one use shuffles have gotten split across the binop/cmp. These
2477 // often allow a major reduction in total cost that wouldn't happen as
2478 // individual folds.
2479 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2480 TTI::TargetCostKind CostKind) -> bool {
2481 Value *InnerOp;
2482 ArrayRef<int> InnerMask;
2483 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2484 m_Mask(InnerMask)))) &&
2485 InnerOp->getType() == Op->getType() &&
2486 all_of(InnerMask,
2487 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2488 for (int &M : Mask)
2489 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2490 M = InnerMask[M - Offset];
2491 M = 0 <= M ? M + Offset : M;
2492 }
2493       OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2494       Op = InnerOp;
2495 return true;
2496 }
2497 return false;
2498 };
2499 bool ReducedInstCount = false;
2500 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2501 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2502 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2503 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2504 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2505 ReducedInstCount |= SingleSrcBinOp;
2506
2507 auto *ShuffleCmpTy =
2508 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2509   InstructionCost NewCost = TTI.getShuffleCost(
2510       SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2511 if (!SingleSrcBinOp)
2512 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2513 CostKind, 0, nullptr, {Y, W});
2514
2515 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2516 NewCost +=
2517 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2518 } else {
2519 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2520 ShuffleDstTy, PredLHS, CostKind);
2521 }
2522
2523 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2524 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2525 << "\n");
2526
2527 // If either shuffle will constant fold away, then fold for the same cost as
2528 // we will reduce the instruction count.
2529 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2530 (isa<Constant>(Y) && isa<Constant>(W));
2531 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2532 return false;
2533
2534 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2535 Value *Shuf1 =
2536 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2537 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2538 ? Builder.CreateBinOp(
2539 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2540 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2541
2542 // Intersect flags from the old binops.
2543 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2544 NewInst->copyIRFlags(LHS);
2545 NewInst->andIRFlags(RHS);
2546 }
2547
2548 Worklist.pushValue(Shuf0);
2549 Worklist.pushValue(Shuf1);
2550 replaceValue(I, *NewBO);
2551 return true;
2552}
2553
2554/// Try to convert,
2555/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2556/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2557bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2558 ArrayRef<int> Mask;
2559 Value *C1, *T1, *F1, *C2, *T2, *F2;
2560 if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2561 m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2562 m_Mask(Mask))))
2563 return false;
2564
2565 auto *Sel1 = cast<Instruction>(I.getOperand(0));
2566 auto *Sel2 = cast<Instruction>(I.getOperand(1));
2567
2568 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2569 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2570 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2571 return false;
2572
2573 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2574 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2575 // SelectInsts must have the same FMF.
2576 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2577 ((SI0FOp != nullptr) &&
2578 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2579 return false;
2580
2581 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2582 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2583   auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2584   auto SelOp = Instruction::Select;
2585
2586   InstructionCost CostSel1 = TTI.getCmpSelInstrCost(
2587       SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2588   InstructionCost CostSel2 = TTI.getCmpSelInstrCost(
2589       SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2590
2591 InstructionCost OldCost =
2592 CostSel1 + CostSel2 +
2593 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2594 {I.getOperand(0), I.getOperand(1)}, &I);
2595
2596   InstructionCost NewCost = TTI.getShuffleCost(
2597       SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2598 Mask, CostKind, 0, nullptr, {C1, C2});
2599 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2600 nullptr, {T1, T2});
2601 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2602 nullptr, {F1, F2});
2603 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2604 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2605   NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2606                                     CmpInst::BAD_ICMP_PREDICATE, CostKind);
2607
2608 if (!Sel1->hasOneUse())
2609 NewCost += CostSel1;
2610 if (!Sel2->hasOneUse())
2611 NewCost += CostSel2;
2612
2613 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2614 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2615 << "\n");
2616 if (NewCost > OldCost)
2617 return false;
2618
2619 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2620 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2621 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2622 Value *NewSel;
2623 // We presuppose that the SelectInsts have the same FMF.
2624 if (SI0FOp)
2625 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2626 SI0FOp->getFastMathFlags());
2627 else
2628 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2629
2630 Worklist.pushValue(ShuffleCmp);
2631 Worklist.pushValue(ShuffleTrue);
2632 Worklist.pushValue(ShuffleFalse);
2633 replaceValue(I, *NewSel);
2634 return true;
2635}
2636
2637 /// Try to convert "shuffle (castop), (castop)", where the casts share the same
2638 /// source type, into "castop (shuffle)".
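/// For illustration (hypothetical IR):
///   %ex = zext <4 x i16> %x to <4 x i32>
///   %ey = zext <4 x i16> %y to <4 x i32>
///   %s  = shufflevector <4 x i32> %ex, <4 x i32> %ey,
///                       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
///                                  i32 4, i32 5, i32 6, i32 7>
/// may become:
///   %sh = shufflevector <4 x i16> %x, <4 x i16> %y,
///                       <8 x i32> <i32 0, i32 1, i32 2, i32 3,
///                                  i32 4, i32 5, i32 6, i32 7>
///   %s  = zext <8 x i16> %sh to <8 x i32>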
2639bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2640 Value *V0, *V1;
2641 ArrayRef<int> OldMask;
2642 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2643 return false;
2644
2645 // Check whether this is a binary shuffle.
2646 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2647
2648 auto *C0 = dyn_cast<CastInst>(V0);
2649 auto *C1 = dyn_cast<CastInst>(V1);
2650 if (!C0 || (IsBinaryShuffle && !C1))
2651 return false;
2652
2653 Instruction::CastOps Opcode = C0->getOpcode();
2654
2655 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2656 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2657 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2658 return false;
2659
2660 if (IsBinaryShuffle) {
2661 if (C0->getSrcTy() != C1->getSrcTy())
2662 return false;
2663 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2664 if (Opcode != C1->getOpcode()) {
2665 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2666 Opcode = Instruction::SExt;
2667 else
2668 return false;
2669 }
2670 }
2671
2672 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2673 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2674 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2675 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2676 return false;
2677
2678 unsigned NumSrcElts = CastSrcTy->getNumElements();
2679 unsigned NumDstElts = CastDstTy->getNumElements();
2680 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2681 "Only bitcasts expected to alter src/dst element counts");
2682
2683   // Bail on bitcasts where neither element count evenly divides the other,
2684   // e.g. <32 x i40> -> <40 x i32>
2685 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2686 (NumDstElts % NumSrcElts) != 0)
2687 return false;
2688
2689 SmallVector<int, 16> NewMask;
2690 if (NumSrcElts >= NumDstElts) {
2691 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2692 // always be expanded to the equivalent form choosing narrower elements.
2693 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2694 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2695 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2696 } else {
2697 // The bitcast is from narrow elements to wide elements. The shuffle mask
2698 // must choose consecutive elements to allow casting first.
2699 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2700 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2701 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2702 return false;
2703 }
2704
2705 auto *NewShuffleDstTy =
2706 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2707
2708 // Try to replace a castop with a shuffle if the shuffle is not costly.
2709 InstructionCost CostC0 =
2710       TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2711                            TTI::CastContextHint::None, CostKind);
2712 
2713   TargetTransformInfo::ShuffleKind ShuffleKind;
2714   if (IsBinaryShuffle)
2715     ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2716   else
2717     ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2718 
2719 InstructionCost OldCost = CostC0;
2720 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2721 CostKind, 0, nullptr, {}, &I);
2722
2723 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2724 CastSrcTy, NewMask, CostKind);
2725   NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2726                                   TTI::CastContextHint::None, CostKind);
2727 if (!C0->hasOneUse())
2728 NewCost += CostC0;
2729 if (IsBinaryShuffle) {
2730 InstructionCost CostC1 =
2731         TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2732                              TTI::CastContextHint::None, CostKind);
2733 OldCost += CostC1;
2734 if (!C1->hasOneUse())
2735 NewCost += CostC1;
2736 }
2737
2738 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2739 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2740 << "\n");
2741 if (NewCost > OldCost)
2742 return false;
2743
2744 Value *Shuf;
2745 if (IsBinaryShuffle)
2746 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2747 NewMask);
2748 else
2749 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2750
2751 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2752
2753 // Intersect flags from the old casts.
2754 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2755 NewInst->copyIRFlags(C0);
2756 if (IsBinaryShuffle)
2757 NewInst->andIRFlags(C1);
2758 }
2759
2760 Worklist.pushValue(Shuf);
2761 replaceValue(I, *Cast);
2762 return true;
2763}
2764
2765/// Try to convert any of:
2766/// "shuffle (shuffle x, y), (shuffle y, x)"
2767/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2768/// "shuffle (shuffle x, undef), y"
2769/// "shuffle x, (shuffle y, undef)"
2770/// into "shuffle x, y".
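/// For illustration (hypothetical IR):
///   %i0 = shufflevector <4 x i32> %x, <4 x i32> %y,
///                       <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %i1 = shufflevector <4 x i32> %y, <4 x i32> %x,
///                       <4 x i32> <i32 2, i32 6, i32 3, i32 7>
///   %o  = shufflevector <4 x i32> %i0, <4 x i32> %i1,
///                       <4 x i32> <i32 0, i32 2, i32 4, i32 6>
/// may fold to a single shuffle of the original sources:
///   %o  = shufflevector <4 x i32> %x, <4 x i32> %y,
///                       <4 x i32> <i32 0, i32 1, i32 6, i32 7>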
2771bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2772 ArrayRef<int> OuterMask;
2773 Value *OuterV0, *OuterV1;
2774 if (!match(&I,
2775 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2776 return false;
2777
2778 ArrayRef<int> InnerMask0, InnerMask1;
2779 Value *X0, *X1, *Y0, *Y1;
2780 bool Match0 =
2781 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2782 bool Match1 =
2783 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2784 if (!Match0 && !Match1)
2785 return false;
2786
2787 // If the outer shuffle is a permute, then create a fake inner all-poison
2788 // shuffle. This is easier than accounting for length-changing shuffles below.
2789 SmallVector<int, 16> PoisonMask1;
2790 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2791 X1 = X0;
2792 Y1 = Y0;
2793 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2794 InnerMask1 = PoisonMask1;
2795 Match1 = true; // fake match
2796 }
2797
2798 X0 = Match0 ? X0 : OuterV0;
2799 Y0 = Match0 ? Y0 : OuterV0;
2800 X1 = Match1 ? X1 : OuterV1;
2801 Y1 = Match1 ? Y1 : OuterV1;
2802 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2803 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2804 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2805 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2806 X0->getType() != X1->getType())
2807 return false;
2808
2809 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2810 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2811
2812   // Attempt to merge shuffles, matching up to 2 source operands.
2813   // Replace an index into a poison arg with PoisonMaskElem.
2814   // Bail if either inner mask references an undef arg.
2815 SmallVector<int, 16> NewMask(OuterMask);
2816 Value *NewX = nullptr, *NewY = nullptr;
2817 for (int &M : NewMask) {
2818 Value *Src = nullptr;
2819 if (0 <= M && M < (int)NumImmElts) {
2820 Src = OuterV0;
2821 if (Match0) {
2822 M = InnerMask0[M];
2823 Src = M >= (int)NumSrcElts ? Y0 : X0;
2824 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2825 }
2826 } else if (M >= (int)NumImmElts) {
2827 Src = OuterV1;
2828 M -= NumImmElts;
2829 if (Match1) {
2830 M = InnerMask1[M];
2831 Src = M >= (int)NumSrcElts ? Y1 : X1;
2832 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2833 }
2834 }
2835 if (Src && M != PoisonMaskElem) {
2836 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2837 if (isa<UndefValue>(Src)) {
2838         // We've referenced an undef element - if it's poison, update the shuffle
2839         // mask, else bail.
2840 if (!isa<PoisonValue>(Src))
2841 return false;
2842 M = PoisonMaskElem;
2843 continue;
2844 }
2845 if (!NewX || NewX == Src) {
2846 NewX = Src;
2847 continue;
2848 }
2849 if (!NewY || NewY == Src) {
2850 M += NumSrcElts;
2851 NewY = Src;
2852 continue;
2853 }
2854 return false;
2855 }
2856 }
2857
2858 if (!NewX)
2859 return PoisonValue::get(ShuffleDstTy);
2860 if (!NewY)
2861 NewY = PoisonValue::get(ShuffleSrcTy);
2862
2863 // Have we folded to an Identity shuffle?
2864 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2865 replaceValue(I, *NewX);
2866 return true;
2867 }
2868
2869 // Try to merge the shuffles if the new shuffle is not costly.
2870 InstructionCost InnerCost0 = 0;
2871 if (Match0)
2872 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2873
2874 InstructionCost InnerCost1 = 0;
2875 if (Match1)
2876 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2877
2878   InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
2879 
2880 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2881
2882 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2883   TargetTransformInfo::ShuffleKind SK =
2884       IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2885               : TargetTransformInfo::SK_PermuteTwoSrc;
2886   InstructionCost NewCost =
2887 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2888 nullptr, {NewX, NewY});
2889 if (!OuterV0->hasOneUse())
2890 NewCost += InnerCost0;
2891 if (!OuterV1->hasOneUse())
2892 NewCost += InnerCost1;
2893
2894 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2895 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2896 << "\n");
2897 if (NewCost > OldCost)
2898 return false;
2899
2900 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2901 replaceValue(I, *Shuf);
2902 return true;
2903}
2904
2905/// Try to convert a chain of length-preserving shuffles that are fed by
2906/// length-changing shuffles from the same source, e.g. a chain of length 3:
2907///
2908/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
2909/// (shuffle y, undef)),
2910 ///          (shuffle y, undef)"
2911///
2912/// into a single shuffle fed by a length-changing shuffle:
2913///
2914/// "shuffle x, (shuffle y, undef)"
2915///
2916/// Such chains arise e.g. from folding extract/insert sequences.
2917bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
2918 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
2919 if (!TrunkType)
2920 return false;
2921
2922 unsigned ChainLength = 0;
2923 SmallVector<int> Mask;
2924 SmallVector<int> YMask;
2925 InstructionCost OldCost = 0;
2926 InstructionCost NewCost = 0;
2927 Value *Trunk = &I;
2928 unsigned NumTrunkElts = TrunkType->getNumElements();
2929 Value *Y = nullptr;
2930
2931 for (;;) {
2932 // Match the current trunk against (commutations of) the pattern
2933 // "shuffle trunk', (shuffle y, undef)"
2934 ArrayRef<int> OuterMask;
2935 Value *OuterV0, *OuterV1;
2936 if (ChainLength != 0 && !Trunk->hasOneUse())
2937 break;
2938 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
2939 m_Mask(OuterMask))))
2940 break;
2941 if (OuterV0->getType() != TrunkType) {
2942 // This shuffle is not length-preserving, so it cannot be part of the
2943 // chain.
2944 break;
2945 }
2946
2947 ArrayRef<int> InnerMask0, InnerMask1;
2948 Value *A0, *A1, *B0, *B1;
2949 bool Match0 =
2950 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
2951 bool Match1 =
2952 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
2953 bool Match0Leaf = Match0 && A0->getType() != I.getType();
2954 bool Match1Leaf = Match1 && A1->getType() != I.getType();
2955 if (Match0Leaf == Match1Leaf) {
2956 // Only handle the case of exactly one leaf in each step. The "two leaves"
2957 // case is handled by foldShuffleOfShuffles.
2958 break;
2959 }
2960
2961 SmallVector<int> CommutedOuterMask;
2962 if (Match0Leaf) {
2963 std::swap(OuterV0, OuterV1);
2964 std::swap(InnerMask0, InnerMask1);
2965 std::swap(A0, A1);
2966 std::swap(B0, B1);
2967 llvm::append_range(CommutedOuterMask, OuterMask);
2968 for (int &M : CommutedOuterMask) {
2969 if (M == PoisonMaskElem)
2970 continue;
2971 if (M < (int)NumTrunkElts)
2972 M += NumTrunkElts;
2973 else
2974 M -= NumTrunkElts;
2975 }
2976 OuterMask = CommutedOuterMask;
2977 }
2978 if (!OuterV1->hasOneUse())
2979 break;
2980
2981 if (!isa<UndefValue>(A1)) {
2982 if (!Y)
2983 Y = A1;
2984 else if (Y != A1)
2985 break;
2986 }
2987 if (!isa<UndefValue>(B1)) {
2988 if (!Y)
2989 Y = B1;
2990 else if (Y != B1)
2991 break;
2992 }
2993
2994 auto *YType = cast<FixedVectorType>(A1->getType());
2995 int NumLeafElts = YType->getNumElements();
2996 SmallVector<int> LocalYMask(InnerMask1);
2997 for (int &M : LocalYMask) {
2998 if (M >= NumLeafElts)
2999 M -= NumLeafElts;
3000 }
3001
3002 InstructionCost LocalOldCost =
3005
3006 // Handle the initial (start of chain) case.
3007 if (!ChainLength) {
3008 Mask.assign(OuterMask);
3009 YMask.assign(LocalYMask);
3010 OldCost = NewCost = LocalOldCost;
3011 Trunk = OuterV0;
3012 ChainLength++;
3013 continue;
3014 }
3015
3016 // For the non-root case, first attempt to combine masks.
3017 SmallVector<int> NewYMask(YMask);
3018 bool Valid = true;
3019 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3020 if (LeafM == -1 || CombinedM == LeafM)
3021 continue;
3022 if (CombinedM == -1) {
3023 CombinedM = LeafM;
3024 } else {
3025 Valid = false;
3026 break;
3027 }
3028 }
3029 if (!Valid)
3030 break;
3031
3032 SmallVector<int> NewMask;
3033 NewMask.reserve(NumTrunkElts);
3034 for (int M : Mask) {
3035 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3036 NewMask.push_back(M);
3037 else
3038 NewMask.push_back(OuterMask[M]);
3039 }
3040
3041 // Break the chain if adding this new step complicates the shuffles such
3042 // that it would increase the new cost by more than the old cost of this
3043 // step.
3044     InstructionCost LocalNewCost =
3045         TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, TrunkType,
3046                            YType, NewYMask, CostKind) +
3047         TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, TrunkType,
3048                            TrunkType, NewMask, CostKind);
3049
3050 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3051 break;
3052
3053 LLVM_DEBUG({
3054 if (ChainLength == 1) {
3055 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3056 << I << '\n';
3057 }
3058 dbgs() << " next chain link: " << *Trunk << '\n'
3059 << " old cost: " << (OldCost + LocalOldCost)
3060 << " new cost: " << LocalNewCost << '\n';
3061 });
3062
3063 Mask = NewMask;
3064 YMask = NewYMask;
3065 OldCost += LocalOldCost;
3066 NewCost = LocalNewCost;
3067 Trunk = OuterV0;
3068 ChainLength++;
3069 }
3070 if (ChainLength <= 1)
3071 return false;
3072
3073 if (llvm::all_of(Mask, [&](int M) {
3074 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3075 })) {
3076 // Produce a canonical simplified form if all elements are sourced from Y.
3077 for (int &M : Mask) {
3078 if (M >= static_cast<int>(NumTrunkElts))
3079 M = YMask[M - NumTrunkElts];
3080 }
3081 Value *Root =
3082 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3083 replaceValue(I, *Root);
3084 return true;
3085 }
3086
3087 Value *Leaf =
3088 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3089 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3090 replaceValue(I, *Root);
3091 return true;
3092}
3093
3094/// Try to convert
3095/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
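/// Illustrative sketch (added for exposition; the value names, types and mask
/// below are invented, and the rewrite only fires when the cost model agrees):
/// ```
///   %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
///   %b = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
///   %r = shufflevector <4 x float> %a, <4 x float> %b,
///        <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   ; may become:
///   %s = shufflevector <4 x float> %x, <4 x float> %y,
///        <4 x i32> <i32 0, i32 4, i32 1, i32 5>
///   %r = call <4 x float> @llvm.fabs.v4f32(<4 x float> %s)
/// ```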
3096bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3097 Value *V0, *V1;
3098 ArrayRef<int> OldMask;
3099 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3100 return false;
3101
3102 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3103 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3104 if (!II0 || !II1)
3105 return false;
3106
3107 Intrinsic::ID IID = II0->getIntrinsicID();
3108 if (IID != II1->getIntrinsicID())
3109 return false;
3110 InstructionCost CostII0 =
3111 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3112 InstructionCost CostII1 =
3113 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3114
3115 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3116 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3117 if (!ShuffleDstTy || !II0Ty)
3118 return false;
3119
3120 if (!isTriviallyVectorizable(IID))
3121 return false;
3122
3123 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3124    if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
3125        II0->getArgOperand(I) != II1->getArgOperand(I))
3126 return false;
3127
3128 InstructionCost OldCost =
3129 CostII0 + CostII1 +
3130      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
3131                         II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3132
3133 SmallVector<Type *> NewArgsTy;
3134 InstructionCost NewCost = 0;
3135 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3136 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3137    if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3138      NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3139 } else {
3140 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3141 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3142 ShuffleDstTy->getNumElements());
3143 NewArgsTy.push_back(ArgTy);
3144 std::pair<Value *, Value *> OperandPair =
3145 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3146 if (!SeenOperandPairs.insert(OperandPair).second) {
3147 // We've already computed the cost for this operand pair.
3148 continue;
3149 }
3150 NewCost += TTI.getShuffleCost(
3151 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3152 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3153 }
3154 }
3155 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3156
3157 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3158 if (!II0->hasOneUse())
3159 NewCost += CostII0;
3160 if (II1 != II0 && !II1->hasOneUse())
3161 NewCost += CostII1;
3162
3163 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3164 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3165 << "\n");
3166
3167 if (NewCost > OldCost)
3168 return false;
3169
3170 SmallVector<Value *> NewArgs;
3171 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3172 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3173    if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3174      NewArgs.push_back(II0->getArgOperand(I));
3175 } else {
3176 std::pair<Value *, Value *> OperandPair =
3177 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3178 auto It = ShuffleCache.find(OperandPair);
3179 if (It != ShuffleCache.end()) {
3180 // Reuse previously created shuffle for this operand pair.
3181 NewArgs.push_back(It->second);
3182 continue;
3183 }
3184 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3185 II1->getArgOperand(I), OldMask);
3186 ShuffleCache[OperandPair] = Shuf;
3187 NewArgs.push_back(Shuf);
3188 Worklist.pushValue(Shuf);
3189 }
3190 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3191
3192 // Intersect flags from the old intrinsics.
3193 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3194 NewInst->copyIRFlags(II0);
3195 NewInst->andIRFlags(II1);
3196 }
3197
3198 replaceValue(I, *NewIntrinsic);
3199 return true;
3200}
3201
3202/// Try to convert
3203/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
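/// Illustrative sketch (added for exposition; names, types and mask are
/// invented, and the fold is only done when the cost model below agrees):
/// ```
///   %a = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %x, <4 x i32> %y)
///   %r = shufflevector <4 x i32> %a, <4 x i32> poison,
///        <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   ; may become:
///   %sx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %sy = shufflevector <4 x i32> %y, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r  = call <4 x i32> @llvm.smax.v4i32(<4 x i32> %sx, <4 x i32> %sy)
/// ```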
3204bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3205 Value *V0;
3206 ArrayRef<int> Mask;
3207 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_Undef(), m_Mask(Mask))))
3208 return false;
3209
3210 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3211 if (!II0)
3212 return false;
3213
3214 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3215 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3216 if (!ShuffleDstTy || !IntrinsicSrcTy)
3217 return false;
3218
3219  // Validate it's a pure permute: the mask may only reference the first vector.
3220 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3221 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3222 return false;
3223
3224 Intrinsic::ID IID = II0->getIntrinsicID();
3225 if (!isTriviallyVectorizable(IID))
3226 return false;
3227
3228 // Cost analysis
3229 InstructionCost OldCost =
3230 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
3231      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
3232                         IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3233
3234 SmallVector<Type *> NewArgsTy;
3235 InstructionCost NewCost = 0;
3236 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3237    if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3238      NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3239 } else {
3240 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3241 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3242 ShuffleDstTy->getNumElements());
3243 NewArgsTy.push_back(ArgTy);
3244      NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3245                                    ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3246 {II0->getArgOperand(I)});
3247 }
3248 }
3249 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3250 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3251
3252 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3253 << OldCost << " vs NewCost: " << NewCost << "\n");
3254
3255 if (NewCost > OldCost)
3256 return false;
3257
3258 // Transform
3259 SmallVector<Value *> NewArgs;
3260 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3261    if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
3262      NewArgs.push_back(II0->getArgOperand(I));
3263 } else {
3264 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3265 NewArgs.push_back(Shuf);
3266 Worklist.pushValue(Shuf);
3267 }
3268 }
3269
3270 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3271
3272 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3273 NewInst->copyIRFlags(II0);
3274
3275 replaceValue(I, *NewIntrinsic);
3276 return true;
3277}
3278
3279using InstLane = std::pair<Use *, int>;
3280
3281static InstLane lookThroughShuffles(Use *U, int Lane) {
3282 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
3283 unsigned NumElts =
3284 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3285 int M = SV->getMaskValue(Lane);
3286 if (M < 0)
3287 return {nullptr, PoisonMaskElem};
3288 if (static_cast<unsigned>(M) < NumElts) {
3289 U = &SV->getOperandUse(0);
3290 Lane = M;
3291 } else {
3292 U = &SV->getOperandUse(1);
3293 Lane = M - NumElts;
3294 }
3295 }
3296 return InstLane{U, Lane};
3297}
3298
3298
3299static SmallVector<InstLane>
3300generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
3301  SmallVector<InstLane> NItem;
3302  for (InstLane IL : Item) {
3303 auto [U, Lane] = IL;
3304 InstLane OpLane =
3305 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
3306 Lane)
3307 : InstLane{nullptr, PoisonMaskElem};
3308 NItem.emplace_back(OpLane);
3309 }
3310 return NItem;
3311}
3312
3313/// Detect concat of multiple values into a vector
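/// An item counts as a free concat when its lanes are simply the lanes of
/// several narrower vectors laid end to end. For illustration (invented
/// example): an 8-lane item whose lanes 0..3 are lanes 0..3 of %a and whose
/// lanes 4..7 are lanes 0..3 of %b corresponds to the concat
/// ```
///   %wide = shufflevector <4 x i32> %a, <4 x i32> %b,
///           <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
/// ```
/// which is free when the wide type would be split by legalization anyway.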
3314static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
3315                         const TargetTransformInfo &TTI) {
3316 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
3317 unsigned NumElts = Ty->getNumElements();
3318 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3319 return false;
3320
3321 // Check that the concat is free, usually meaning that the type will be split
3322 // during legalization.
3323 SmallVector<int, 16> ConcatMask(NumElts * 2);
3324 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3325 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3326 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3327 Ty, ConcatMask, CostKind) != 0)
3328 return false;
3329
3330 unsigned NumSlices = Item.size() / NumElts;
3331 // Currently we generate a tree of shuffles for the concats, which limits us
3332  // to a power of 2.
3333 if (!isPowerOf2_32(NumSlices))
3334 return false;
3335 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3336 Use *SliceV = Item[Slice * NumElts].first;
3337 if (!SliceV || SliceV->get()->getType() != Ty)
3338 return false;
3339 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3340 auto [V, Lane] = Item[Slice * NumElts + Elt];
3341 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
3342 return false;
3343 }
3344 }
3345 return true;
3346}
3347
3348static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
3349                                  const SmallPtrSet<Use *, 4> &IdentityLeafs,
3350 const SmallPtrSet<Use *, 4> &SplatLeafs,
3351 const SmallPtrSet<Use *, 4> &ConcatLeafs,
3352 IRBuilderBase &Builder,
3353 const TargetTransformInfo *TTI) {
3354 auto [FrontU, FrontLane] = Item.front();
3355
3356 if (IdentityLeafs.contains(FrontU)) {
3357 return FrontU->get();
3358 }
3359 if (SplatLeafs.contains(FrontU)) {
3360 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3361 return Builder.CreateShuffleVector(FrontU->get(), Mask);
3362 }
3363 if (ConcatLeafs.contains(FrontU)) {
3364 unsigned NumElts =
3365 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
3366 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3367 for (unsigned S = 0; S < Values.size(); ++S)
3368 Values[S] = Item[S * NumElts].first->get();
3369
3370 while (Values.size() > 1) {
3371 NumElts *= 2;
3372 SmallVector<int, 16> Mask(NumElts, 0);
3373 std::iota(Mask.begin(), Mask.end(), 0);
3374 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3375 for (unsigned S = 0; S < NewValues.size(); ++S)
3376 NewValues[S] =
3377 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3378 Values = NewValues;
3379 }
3380 return Values[0];
3381 }
3382
3383 auto *I = cast<Instruction>(FrontU->get());
3384 auto *II = dyn_cast<IntrinsicInst>(I);
3385 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
3386  SmallVector<Value *> Ops(NumOps);
3387  for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3388 if (II &&
3389 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
3390 Ops[Idx] = II->getOperand(Idx);
3391 continue;
3392 }
3393    Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
3394                                   Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
3395 Builder, TTI);
3396 }
3397
3398 SmallVector<Value *, 8> ValueList;
3399 for (const auto &Lane : Item)
3400 if (Lane.first)
3401 ValueList.push_back(Lane.first->get());
3402
3403 Type *DstTy =
3404 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3405 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3406 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3407 Ops[0], Ops[1]);
3408 propagateIRFlags(Value, ValueList);
3409 return Value;
3410 }
3411 if (auto *CI = dyn_cast<CmpInst>(I)) {
3412 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3413 propagateIRFlags(Value, ValueList);
3414 return Value;
3415 }
3416 if (auto *SI = dyn_cast<SelectInst>(I)) {
3417 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3418 propagateIRFlags(Value, ValueList);
3419 return Value;
3420 }
3421 if (auto *CI = dyn_cast<CastInst>(I)) {
3422 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3423 propagateIRFlags(Value, ValueList);
3424 return Value;
3425 }
3426 if (II) {
3427 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3428 propagateIRFlags(Value, ValueList);
3429 return Value;
3430 }
3431 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3432 auto *Value =
3433 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3434 propagateIRFlags(Value, ValueList);
3435 return Value;
3436}
3437
3438// Starting from a shuffle, look up through operands tracking the shuffled index
3439// of each lane. If we can simplify away the shuffles to identities then
3440// do so.
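// Illustrative sketch (added for exposition; names are invented). The lane
// reversals below cancel out, so every lane traces back to an identity leaf
// and the shuffles can be dropped:
//   %rx = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %ry = shufflevector <4 x i32> %y, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %s  = add <4 x i32> %rx, %ry
//   %r  = shufflevector <4 x i32> %s, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// may become:
//   %r  = add <4 x i32> %x, %y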
3441bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3442 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3443 if (!Ty || I.use_empty())
3444 return false;
3445
3446 SmallVector<InstLane> Start(Ty->getNumElements());
3447 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3448 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3449
3450  SmallVector<SmallVector<InstLane>> Worklist;
3451  Worklist.push_back(Start);
3452 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3453 unsigned NumVisited = 0;
3454
3455 while (!Worklist.empty()) {
3456 if (++NumVisited > MaxInstrsToScan)
3457 return false;
3458
3459 SmallVector<InstLane> Item = Worklist.pop_back_val();
3460 auto [FrontU, FrontLane] = Item.front();
3461
3462 // If we found an undef first lane then bail out to keep things simple.
3463 if (!FrontU)
3464 return false;
3465
3466 // Helper to peek through bitcasts to the same value.
3467 auto IsEquiv = [&](Value *X, Value *Y) {
3468 return X->getType() == Y->getType() &&
3469             peekThroughBitcasts(X) == peekThroughBitcasts(Y);
3470    };
3471
3472 // Look for an identity value.
3473 if (FrontLane == 0 &&
3474 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3475 Ty->getNumElements() &&
3476 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3477 Value *FrontV = Item.front().first->get();
3478 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3479 E.value().second == (int)E.index());
3480 })) {
3481 IdentityLeafs.insert(FrontU);
3482 continue;
3483 }
3484 // Look for constants, for the moment only supporting constant splats.
3485 if (auto *C = dyn_cast<Constant>(FrontU);
3486 C && C->getSplatValue() &&
3487 all_of(drop_begin(Item), [Item](InstLane &IL) {
3488 Value *FrontV = Item.front().first->get();
3489 Use *U = IL.first;
3490 return !U || (isa<Constant>(U->get()) &&
3491 cast<Constant>(U->get())->getSplatValue() ==
3492 cast<Constant>(FrontV)->getSplatValue());
3493 })) {
3494 SplatLeafs.insert(FrontU);
3495 continue;
3496 }
3497 // Look for a splat value.
3498 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3499 auto [FrontU, FrontLane] = Item.front();
3500 auto [U, Lane] = IL;
3501 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3502 })) {
3503 SplatLeafs.insert(FrontU);
3504 continue;
3505 }
3506
3507 // We need each element to be the same type of value, and check that each
3508 // element has a single use.
3509 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3510 Value *FrontV = Item.front().first->get();
3511 if (!IL.first)
3512 return true;
3513 Value *V = IL.first->get();
3514 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3515 return false;
3516 if (V->getValueID() != FrontV->getValueID())
3517 return false;
3518 if (auto *CI = dyn_cast<CmpInst>(V))
3519 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3520 return false;
3521 if (auto *CI = dyn_cast<CastInst>(V))
3522 if (CI->getSrcTy()->getScalarType() !=
3523 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3524 return false;
3525 if (auto *SI = dyn_cast<SelectInst>(V))
3526 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3527 SI->getOperand(0)->getType() !=
3528 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3529 return false;
3530 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3531 return false;
3532 auto *II = dyn_cast<IntrinsicInst>(V);
3533 return !II || (isa<IntrinsicInst>(FrontV) &&
3534 II->getIntrinsicID() ==
3535 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3536 !II->hasOperandBundles());
3537 };
3538 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3539 // Check the operator is one that we support.
3540 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3541 // We exclude div/rem in case they hit UB from poison lanes.
3542 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3543 BO && BO->isIntDivRem())
3544 return false;
3545        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3546        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3547        continue;
3548 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3549 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3550        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3551        continue;
3552 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3553 // TODO: Handle vector widening/narrowing bitcasts.
3554 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3555 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3556 if (DstTy && SrcTy &&
3557 SrcTy->getNumElements() == DstTy->getNumElements()) {
3558          Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3559          continue;
3560 }
3561 } else if (isa<SelectInst>(FrontU)) {
3562        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3563        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3564        Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3565        continue;
3566 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3567 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3568 !II->hasOperandBundles()) {
3569 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3570 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3571 &TTI)) {
3572 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3573 Value *FrontV = Item.front().first->get();
3574 Use *U = IL.first;
3575 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3576 cast<Instruction>(FrontV)->getOperand(Op));
3577 }))
3578 return false;
3579 continue;
3580 }
3581          Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3582        }
3583 continue;
3584 }
3585 }
3586
3587 if (isFreeConcat(Item, CostKind, TTI)) {
3588 ConcatLeafs.insert(FrontU);
3589 continue;
3590 }
3591
3592 return false;
3593 }
3594
3595 if (NumVisited <= 1)
3596 return false;
3597
3598 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3599
3600 // If we got this far, we know the shuffles are superfluous and can be
3601 // removed. Scan through again and generate the new tree of instructions.
3602 Builder.SetInsertPoint(&I);
3603 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3604 ConcatLeafs, Builder, &TTI);
3605 replaceValue(I, *V);
3606 return true;
3607}
3608
3609/// Given a commutative reduction, the order of the input lanes does not alter
3610/// the results. We can use this to remove certain shuffles feeding the
3611/// reduction, removing the need to shuffle at all.
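/// Illustrative sketch (added for exposition; names are invented). Sorting the
/// mask yields the identity here, after which the shuffle folds away:
/// ```
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
///   ; may become:
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
/// ```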
3612bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3613 auto *II = dyn_cast<IntrinsicInst>(&I);
3614 if (!II)
3615 return false;
3616 switch (II->getIntrinsicID()) {
3617 case Intrinsic::vector_reduce_add:
3618 case Intrinsic::vector_reduce_mul:
3619 case Intrinsic::vector_reduce_and:
3620 case Intrinsic::vector_reduce_or:
3621 case Intrinsic::vector_reduce_xor:
3622 case Intrinsic::vector_reduce_smin:
3623 case Intrinsic::vector_reduce_smax:
3624 case Intrinsic::vector_reduce_umin:
3625 case Intrinsic::vector_reduce_umax:
3626 break;
3627 default:
3628 return false;
3629 }
3630
3631 // Find all the inputs when looking through operations that do not alter the
3632 // lane order (binops, for example). Currently we look for a single shuffle,
3633 // and can ignore splat values.
3634 std::queue<Value *> Worklist;
3635 SmallPtrSet<Value *, 4> Visited;
3636 ShuffleVectorInst *Shuffle = nullptr;
3637 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3638 Worklist.push(Op);
3639
3640 while (!Worklist.empty()) {
3641 Value *CV = Worklist.front();
3642 Worklist.pop();
3643 if (Visited.contains(CV))
3644 continue;
3645
3646 // Splats don't change the order, so can be safely ignored.
3647 if (isSplatValue(CV))
3648 continue;
3649
3650 Visited.insert(CV);
3651
3652 if (auto *CI = dyn_cast<Instruction>(CV)) {
3653 if (CI->isBinaryOp()) {
3654 for (auto *Op : CI->operand_values())
3655 Worklist.push(Op);
3656 continue;
3657 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3658 if (Shuffle && Shuffle != SV)
3659 return false;
3660 Shuffle = SV;
3661 continue;
3662 }
3663 }
3664
3665 // Anything else is currently an unknown node.
3666 return false;
3667 }
3668
3669 if (!Shuffle)
3670 return false;
3671
3672 // Check all uses of the binary ops and shuffles are also included in the
3673 // lane-invariant operations (Visited should be the list of lanewise
3674 // instructions, including the shuffle that we found).
3675 for (auto *V : Visited)
3676 for (auto *U : V->users())
3677 if (!Visited.contains(U) && U != &I)
3678 return false;
3679
3680 FixedVectorType *VecType =
3681 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3682 if (!VecType)
3683 return false;
3684 FixedVectorType *ShuffleInputType =
3685      dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3686  if (!ShuffleInputType)
3687 return false;
3688 unsigned NumInputElts = ShuffleInputType->getNumElements();
3689
3690 // Find the mask from sorting the lanes into order. This is most likely to
3691  // become an identity or concat mask. Undef elements are pushed to the end.
3692 SmallVector<int> ConcatMask;
3693 Shuffle->getShuffleMask(ConcatMask);
3694 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3695 bool UsesSecondVec =
3696 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3697
3698  InstructionCost OldCost = TTI.getShuffleCost(
3699      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3700 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3701  InstructionCost NewCost = TTI.getShuffleCost(
3702      UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3703 ShuffleInputType, ConcatMask, CostKind);
3704
3705 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3706 << "\n");
3707 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3708 << "\n");
3709 bool MadeChanges = false;
3710 if (NewCost < OldCost) {
3711 Builder.SetInsertPoint(Shuffle);
3712 Value *NewShuffle = Builder.CreateShuffleVector(
3713 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3714 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3715 replaceValue(*Shuffle, *NewShuffle);
3716 return true;
3717 }
3718
3719 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3720 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3721 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3722 return MadeChanges;
3723}
3724
3725/// For a given chain of patterns of the following form:
3726///
3727/// ```
3728/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3729///
3730/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3731/// ty1> %1)
3732/// OR
3733/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3734///
3735/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3736/// ...
3737/// ...
3738/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3739/// 3), <n x ty1> %(i - 2)
3740/// OR
3741/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3742///
3743/// %(i) = extractelement <n x ty1> %(i - 1), 0
3744/// ```
3745///
3746/// Where:
3747/// `mask` follows a partition pattern:
3748///
3749/// Ex:
3750/// [n = 8, p = poison]
3751///
3752/// 4 5 6 7 | p p p p
3753/// 2 3 | p p p p p p
3754/// 1 | p p p p p p p
3755///
3756/// For powers of 2, there's a consistent pattern, but for other cases
3757/// the parity of the current half value at each step decides the
3758/// next partition half (see `ExpectedParityMask` for more logical details
3759/// in generalising this).
3760///
3761/// Ex:
3762/// [n = 6]
3763///
3764/// 3 4 5 | p p p
3765/// 1 2 | p p p p
3766/// 1 | p p p p p
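///
/// Illustrative sketch for n = 4 (added for exposition; names are invented,
/// and the rewrite only fires when the cost model below agrees):
/// ```
///   %s1 = shufflevector <4 x i32> %v, <4 x i32> poison,
///         <4 x i32> <i32 2, i32 3, i32 poison, i32 poison>
///   %a1 = add <4 x i32> %v, %s1
///   %s2 = shufflevector <4 x i32> %a1, <4 x i32> poison,
///         <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
///   %a2 = add <4 x i32> %a1, %s2
///   %r  = extractelement <4 x i32> %a2, i64 0
///   ; may become:
///   %r  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
/// ```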
3767bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3768 // Going bottom-up for the pattern.
3769 std::queue<Value *> InstWorklist;
3770 InstructionCost OrigCost = 0;
3771
3772 // Common instruction operation after each shuffle op.
3773 std::optional<unsigned int> CommonCallOp = std::nullopt;
3774 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3775
3776 bool IsFirstCallOrBinInst = true;
3777 bool ShouldBeCallOrBinInst = true;
3778
3779 // This stores the last used instructions for shuffle/common op.
3780 //
3781 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3782 // instructions from either shuffle/common op.
3783 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3784
3785 Value *VecOpEE;
3786 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3787 return false;
3788
3789 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3790 if (!FVT)
3791 return false;
3792
3793 int64_t VecSize = FVT->getNumElements();
3794 if (VecSize < 2)
3795 return false;
3796
3797 // Number of levels would be ~log2(n), considering we always partition
3798 // by half for this fold pattern.
3799 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3800 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3801
3802 // This is how we generalise for all element sizes.
3803 // At each step, if vector size is odd, we need non-poison
3804 // values to cover the dominant half so we don't miss out on any element.
3805 //
3806 // This mask will help us retrieve this as we go from bottom to top:
3807 //
3808 // Mask Set -> N = N * 2 - 1
3809 // Mask Unset -> N = N * 2
3810 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3811 Cur = (Cur + 1) / 2, --Mask) {
3812 if (Cur & 1)
3813 ExpectedParityMask |= (1ll << Mask);
3814 }
3815
3816 InstWorklist.push(VecOpEE);
3817
3818 while (!InstWorklist.empty()) {
3819 Value *CI = InstWorklist.front();
3820 InstWorklist.pop();
3821
3822 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3823 if (!ShouldBeCallOrBinInst)
3824 return false;
3825
3826 if (!IsFirstCallOrBinInst &&
3827 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3828 return false;
3829
3830 // For the first found call/bin op, the vector has to come from the
3831 // extract element op.
3832 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3833 return false;
3834 IsFirstCallOrBinInst = false;
3835
3836 if (!CommonCallOp)
3837 CommonCallOp = II->getIntrinsicID();
3838 if (II->getIntrinsicID() != *CommonCallOp)
3839 return false;
3840
3841 switch (II->getIntrinsicID()) {
3842 case Intrinsic::umin:
3843 case Intrinsic::umax:
3844 case Intrinsic::smin:
3845 case Intrinsic::smax: {
3846 auto *Op0 = II->getOperand(0);
3847 auto *Op1 = II->getOperand(1);
3848 PrevVecV[0] = Op0;
3849 PrevVecV[1] = Op1;
3850 break;
3851 }
3852 default:
3853 return false;
3854 }
3855 ShouldBeCallOrBinInst ^= 1;
3856
3857 IntrinsicCostAttributes ICA(
3858 *CommonCallOp, II->getType(),
3859 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3860 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3861
3862 // We may need a swap here since it can be (a, b) or (b, a)
3863 // and accordingly change as we go up.
3864 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3865 std::swap(PrevVecV[0], PrevVecV[1]);
3866 InstWorklist.push(PrevVecV[1]);
3867 InstWorklist.push(PrevVecV[0]);
3868 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3869 // Similar logic for bin ops.
3870
3871 if (!ShouldBeCallOrBinInst)
3872 return false;
3873
3874 if (!IsFirstCallOrBinInst &&
3875 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3876 return false;
3877
3878 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3879 return false;
3880 IsFirstCallOrBinInst = false;
3881
3882 if (!CommonBinOp)
3883 CommonBinOp = BinOp->getOpcode();
3884
3885 if (BinOp->getOpcode() != *CommonBinOp)
3886 return false;
3887
3888 switch (*CommonBinOp) {
3889 case BinaryOperator::Add:
3890 case BinaryOperator::Mul:
3891 case BinaryOperator::Or:
3892 case BinaryOperator::And:
3893 case BinaryOperator::Xor: {
3894 auto *Op0 = BinOp->getOperand(0);
3895 auto *Op1 = BinOp->getOperand(1);
3896 PrevVecV[0] = Op0;
3897 PrevVecV[1] = Op1;
3898 break;
3899 }
3900 default:
3901 return false;
3902 }
3903 ShouldBeCallOrBinInst ^= 1;
3904
3905 OrigCost +=
3906 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3907
3908 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3909 std::swap(PrevVecV[0], PrevVecV[1]);
3910 InstWorklist.push(PrevVecV[1]);
3911 InstWorklist.push(PrevVecV[0]);
3912 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3913      // We shouldn't have any null values in the previous vectors;
3914      // if there are, the pattern did not match.
3915 if (ShouldBeCallOrBinInst ||
3916 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3917 return false;
3918
3919 if (SVInst != PrevVecV[1])
3920 return false;
3921
3922 ArrayRef<int> CurMask;
3923 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3924 m_Mask(CurMask))))
3925 return false;
3926
3927 // Subtract the parity mask when checking the condition.
3928 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3929 if (Mask < ShuffleMaskHalf &&
3930 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3931 return false;
3932 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3933 return false;
3934 }
3935
3936 // Update mask values.
3937 ShuffleMaskHalf *= 2;
3938 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3939 ExpectedParityMask >>= 1;
3940
3941      OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3942                                     SVInst->getType(), SVInst->getType(),
3943 CurMask, CostKind);
3944
3945 VisitedCnt += 1;
3946 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3947 break;
3948
3949 ShouldBeCallOrBinInst ^= 1;
3950 } else {
3951 return false;
3952 }
3953 }
3954
3955 // Pattern should end with a shuffle op.
3956 if (ShouldBeCallOrBinInst)
3957 return false;
3958
3959 assert(VecSize != -1 && "Expected Match for Vector Size");
3960
3961 Value *FinalVecV = PrevVecV[0];
3962 if (!FinalVecV)
3963 return false;
3964
3965 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3966
3967 Intrinsic::ID ReducedOp =
3968 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3969 : getReductionForBinop(*CommonBinOp));
3970 if (!ReducedOp)
3971 return false;
3972
3973 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3974  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3975
3976 if (NewCost >= OrigCost)
3977 return false;
3978
3979 auto *ReducedResult =
3980 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3981 replaceValue(I, *ReducedResult);
3982
3983 return true;
3984}
3985
3986/// Determine if it's more efficient to fold:
3987/// reduce(trunc(x)) -> trunc(reduce(x)).
3988/// reduce(sext(x)) -> sext(reduce(x)).
3989/// reduce(zext(x)) -> zext(reduce(x)).
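/// Illustrative sketch (added for exposition; names and types are invented;
/// note that for add/mul only the trunc form is handled below):
/// ```
///   %t = trunc <8 x i16> %x to <8 x i8>
///   %r = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %t)
///   ; may become:
///   %w = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
///   %r = trunc i16 %w to i8
/// ```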
3990bool VectorCombine::foldCastFromReductions(Instruction &I) {
3991 auto *II = dyn_cast<IntrinsicInst>(&I);
3992 if (!II)
3993 return false;
3994
3995 bool TruncOnly = false;
3996 Intrinsic::ID IID = II->getIntrinsicID();
3997 switch (IID) {
3998 case Intrinsic::vector_reduce_add:
3999 case Intrinsic::vector_reduce_mul:
4000 TruncOnly = true;
4001 break;
4002 case Intrinsic::vector_reduce_and:
4003 case Intrinsic::vector_reduce_or:
4004 case Intrinsic::vector_reduce_xor:
4005 break;
4006 default:
4007 return false;
4008 }
4009
4010 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4011 Value *ReductionSrc = I.getOperand(0);
4012
4013 Value *Src;
4014 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
4015 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
4016 return false;
4017
4018 auto CastOpc =
4019 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
4020
4021 auto *SrcTy = cast<VectorType>(Src->getType());
4022 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
4023 Type *ResultTy = I.getType();
4024
4025  InstructionCost OldCost = TTI.getArithmeticReductionCost(
4026      ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4027  OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
4028                                  TTI::CastContextHint::None, CostKind,
4029                                  cast<CastInst>(ReductionSrc));
4030 InstructionCost NewCost =
4031 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4032 CostKind) +
4033 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4034                           TTI::CastContextHint::None, CostKind);
4035
4036 if (OldCost <= NewCost || !NewCost.isValid())
4037 return false;
4038
4039 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4040 II->getIntrinsicID(), {Src});
4041 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4042 replaceValue(I, *NewCast);
4043 return true;
4044}
4045
4046/// Returns true if this ShuffleVectorInst eventually feeds into a
4047/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
4048/// chains of shuffles and binary operators (in any combination/order).
4049/// The search is capped at a fixed number of visited instructions.
4050static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
4051  constexpr unsigned MaxVisited = 32;
4052  SmallPtrSet<Instruction *, 8> Visited;
4053  SmallVector<Instruction *, 4> WorkList;
4054  bool FoundReduction = false;
4055
4056 WorkList.push_back(SVI);
4057 while (!WorkList.empty()) {
4058 Instruction *I = WorkList.pop_back_val();
4059 for (User *U : I->users()) {
4060 auto *UI = cast<Instruction>(U);
4061 if (!UI || !Visited.insert(UI).second)
4062 continue;
4063 if (Visited.size() > MaxVisited)
4064 return false;
4065 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
4066 // More than one reduction reached
4067 if (FoundReduction)
4068 return false;
4069 switch (II->getIntrinsicID()) {
4070 case Intrinsic::vector_reduce_add:
4071 case Intrinsic::vector_reduce_mul:
4072 case Intrinsic::vector_reduce_and:
4073 case Intrinsic::vector_reduce_or:
4074 case Intrinsic::vector_reduce_xor:
4075 case Intrinsic::vector_reduce_smin:
4076 case Intrinsic::vector_reduce_smax:
4077 case Intrinsic::vector_reduce_umin:
4078 case Intrinsic::vector_reduce_umax:
4079 FoundReduction = true;
4080 continue;
4081 default:
4082 return false;
4083 }
4084 }
4085
4086      if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
4087        return false;
4088
4089 WorkList.emplace_back(UI);
4090 }
4091 }
4092 return FoundReduction;
4093}
4094
4095/// This method looks for groups of shuffles acting on binops, of the form:
4096/// %x = shuffle ...
4097/// %y = shuffle ...
4098/// %a = binop %x, %y
4099/// %b = binop %x, %y
4100/// shuffle %a, %b, selectmask
4101/// We may, especially if the shuffle is wider than legal, be able to convert
4102/// the shuffle to a form where only parts of a and b need to be computed. On
4103/// architectures with no obvious "select" shuffle, this can reduce the total
4104/// number of operations if the target reports them as cheaper.
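/// Illustrative sketch (added for exposition; names are invented). The select
/// mask <0,1,6,7> only needs the low half of %a and the high half of %b, so
/// narrower binops may suffice:
/// ```
///   %x = shufflevector <4 x i32> %p, <4 x i32> %q, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
///   %y = shufflevector <4 x i32> %p, <4 x i32> %q, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
///   %a = add <4 x i32> %x, %y
///   %b = mul <4 x i32> %x, %y
///   %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
/// ```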
4105bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
4106 auto *SVI = cast<ShuffleVectorInst>(&I);
4107 auto *VT = cast<FixedVectorType>(I.getType());
4108 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
4109 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
4110 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
4111 VT != Op0->getType())
4112 return false;
4113
4114 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
4115 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
4116 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
4117 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
4118 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
4119 auto checkSVNonOpUses = [&](Instruction *I) {
4120 if (!I || I->getOperand(0)->getType() != VT)
4121 return true;
4122 return any_of(I->users(), [&](User *U) {
4123 return U != Op0 && U != Op1 &&
4124 !(isa<ShuffleVectorInst>(U) &&
4125 (InputShuffles.contains(cast<Instruction>(U)) ||
4126 isInstructionTriviallyDead(cast<Instruction>(U))));
4127 });
4128 };
4129 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
4130 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
4131 return false;
4132
4133 // Collect all the uses that are shuffles that we can transform together. We
4134 // may not have a single shuffle, but a group that can all be transformed
4135 // together profitably.
4136  SmallVector<ShuffleVectorInst *> Shuffles;
4137  auto collectShuffles = [&](Instruction *I) {
4138 for (auto *U : I->users()) {
4139 auto *SV = dyn_cast<ShuffleVectorInst>(U);
4140 if (!SV || SV->getType() != VT)
4141 return false;
4142 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
4143 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
4144 return false;
4145 if (!llvm::is_contained(Shuffles, SV))
4146 Shuffles.push_back(SV);
4147 }
4148 return true;
4149 };
4150 if (!collectShuffles(Op0) || !collectShuffles(Op1))
4151 return false;
4152 // From a reduction, we need to be processing a single shuffle, otherwise the
4153 // other uses will not be lane-invariant.
4154 if (FromReduction && Shuffles.size() > 1)
4155 return false;
4156
4157 // Add any shuffle uses for the shuffles we have found, to include them in our
4158 // cost calculations.
4159 if (!FromReduction) {
4160 for (ShuffleVectorInst *SV : Shuffles) {
4161 for (auto *U : SV->users()) {
4162 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
4163 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
4164 Shuffles.push_back(SSV);
4165 }
4166 }
4167 }
4168
4169 // For each of the output shuffles, we try to sort all the first vector
4170 // elements to the beginning, followed by the second array elements at the
4171 // end. If the binops are legalized to smaller vectors, this may reduce total
4172 // number of binops. We compute the ReconstructMask mask needed to convert
4173 // back to the original lane order.
4174  SmallVector<std::pair<int, int>> V1, V2;
4175  SmallVector<SmallVector<int>> OrigReconstructMasks;
4176 int MaxV1Elt = 0, MaxV2Elt = 0;
4177 unsigned NumElts = VT->getNumElements();
4178 for (ShuffleVectorInst *SVN : Shuffles) {
4179 SmallVector<int> Mask;
4180 SVN->getShuffleMask(Mask);
4181
4182 // Check the operands are the same as the original, or reversed (in which
4183 // case we need to commute the mask).
4184 Value *SVOp0 = SVN->getOperand(0);
4185 Value *SVOp1 = SVN->getOperand(1);
4186 if (isa<UndefValue>(SVOp1)) {
4187 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
4188 SVOp0 = SSV->getOperand(0);
4189 SVOp1 = SSV->getOperand(1);
4190 for (int &Elem : Mask) {
4191 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
4192 return false;
4193 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
4194 }
4195 }
4196 if (SVOp0 == Op1 && SVOp1 == Op0) {
4197 std::swap(SVOp0, SVOp1);
4198      ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
4199    }
4200 if (SVOp0 != Op0 || SVOp1 != Op1)
4201 return false;
4202
4203 // Calculate the reconstruction mask for this shuffle, as the mask needed to
4204    // take the packed values from Op0/Op1 and reconstruct the original
4205    // order.
4206 SmallVector<int> ReconstructMask;
4207 for (unsigned I = 0; I < Mask.size(); I++) {
4208 if (Mask[I] < 0) {
4209 ReconstructMask.push_back(-1);
4210 } else if (Mask[I] < static_cast<int>(NumElts)) {
4211 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
4212 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
4213 return Mask[I] == A.first;
4214 });
4215 if (It != V1.end())
4216 ReconstructMask.push_back(It - V1.begin());
4217 else {
4218 ReconstructMask.push_back(V1.size());
4219 V1.emplace_back(Mask[I], V1.size());
4220 }
4221 } else {
4222 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
4223 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
4224 return Mask[I] - static_cast<int>(NumElts) == A.first;
4225 });
4226 if (It != V2.end())
4227 ReconstructMask.push_back(NumElts + It - V2.begin());
4228 else {
4229 ReconstructMask.push_back(NumElts + V2.size());
4230 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
4231 }
4232 }
4233 }
4234
4235 // For reductions, we know that the lane ordering out doesn't alter the
4236 // result. In-order can help simplify the shuffle away.
4237 if (FromReduction)
4238 sort(ReconstructMask);
4239 OrigReconstructMasks.push_back(std::move(ReconstructMask));
4240 }
4241
4242  // If the maximum elements used from V1 and V2 are not larger than the new
4243  // vectors, the vectors are already packed and performing the optimization
4244 // again will likely not help any further. This also prevents us from getting
4245 // stuck in a cycle in case the costs do not also rule it out.
4246 if (V1.empty() || V2.empty() ||
4247 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
4248 MaxV2Elt == static_cast<int>(V2.size()) - 1))
4249 return false;
4250
4251 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
4252 // shuffle of another shuffle, or not a shuffle (that is treated like a
4253 // identity shuffle).
4254 auto GetBaseMaskValue = [&](Instruction *I, int M) {
4255 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4256 if (!SV)
4257 return M;
4258 if (isa<UndefValue>(SV->getOperand(1)))
4259 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4260 if (InputShuffles.contains(SSV))
4261 return SSV->getMaskValue(SV->getMaskValue(M));
4262 return SV->getMaskValue(M);
4263 };
4264
4265  // Attempt to sort the inputs by ascending mask values to make simpler input
4266 // shuffles and push complex shuffles down to the uses. We sort on the first
4267 // of the two input shuffle orders, to try and get at least one input into a
4268 // nice order.
4269 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
4270 std::pair<int, int> Y) {
4271 int MXA = GetBaseMaskValue(A, X.first);
4272 int MYA = GetBaseMaskValue(A, Y.first);
4273 return MXA < MYA;
4274 };
4275 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
4276 return SortBase(SVI0A, A, B);
4277 });
4278 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
4279 return SortBase(SVI1A, A, B);
4280 });
4281 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
4282 // modified order of the input shuffles.
4283 SmallVector<SmallVector<int>> ReconstructMasks;
4284 for (const auto &Mask : OrigReconstructMasks) {
4285 SmallVector<int> ReconstructMask;
4286 for (int M : Mask) {
4287 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
4288 auto It = find_if(V, [M](auto A) { return A.second == M; });
4289 assert(It != V.end() && "Expected all entries in Mask");
4290 return std::distance(V.begin(), It);
4291 };
4292 if (M < 0)
4293 ReconstructMask.push_back(-1);
4294 else if (M < static_cast<int>(NumElts)) {
4295 ReconstructMask.push_back(FindIndex(V1, M));
4296 } else {
4297 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
4298 }
4299 }
4300 ReconstructMasks.push_back(std::move(ReconstructMask));
4301 }
4302
4303 // Calculate the masks needed for the new input shuffles, which get padded
4304 // with undef
4305 SmallVector<int> V1A, V1B, V2A, V2B;
4306 for (unsigned I = 0; I < V1.size(); I++) {
4307 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
4308 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
4309 }
4310 for (unsigned I = 0; I < V2.size(); I++) {
4311 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
4312 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
4313 }
4314 while (V1A.size() < NumElts) {
4315    V1A.push_back(PoisonMaskElem);
4316    V1B.push_back(PoisonMaskElem);
4317  }
4318 while (V2A.size() < NumElts) {
4319    V2A.push_back(PoisonMaskElem);
4320    V2B.push_back(PoisonMaskElem);
4321  }
4322
4323 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
4324 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4325 if (!SV)
4326 return C;
4327 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
4328                                      ? TTI::SK_PermuteSingleSrc
4329                                      : TTI::SK_PermuteTwoSrc,
4330                                  VT, VT, SV->getShuffleMask(), CostKind);
4331 };
4332 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4333 return C +
4334           TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4335  };
4336
4337 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
4338 unsigned MaxVectorSize =
4339      TTI.getRegisterBitWidth(TTI::RGK_FixedWidthVector).getFixedValue();
4340  unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
4341 if (MaxElementsInVector == 0)
4342 return false;
4343 // When there are multiple shufflevector operations on the same input,
4344 // especially when the vector length is larger than the register size,
4345 // identical shuffle patterns may occur across different groups of elements.
4346 // To avoid overestimating the cost by counting these repeated shuffles more
4347 // than once, we only account for unique shuffle patterns. This adjustment
4348 // prevents inflated costs in the cost model for wide vectors split into
4349 // several register-sized groups.
4350 std::set<SmallVector<int, 4>> UniqueShuffles;
4351 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
4352 // Compute the cost for performing the shuffle over the full vector.
4353 auto ShuffleCost =
4354        TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
4355    unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
4356 if (NumFullVectors < 2)
4357 return C + ShuffleCost;
4358 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
4359 unsigned NumUniqueGroups = 0;
4360 unsigned NumGroups = Mask.size() / MaxElementsInVector;
4361 // For each group of MaxElementsInVector contiguous elements,
4362 // collect their shuffle pattern and insert into the set of unique patterns.
4363 for (unsigned I = 0; I < NumFullVectors; ++I) {
4364 for (unsigned J = 0; J < MaxElementsInVector; ++J)
4365 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
4366 if (UniqueShuffles.insert(SubShuffle).second)
4367 NumUniqueGroups += 1;
4368 }
4369 return C + ShuffleCost * NumUniqueGroups / NumGroups;
4370 };
4371 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
4372 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4373 if (!SV)
4374 return C;
4375 SmallVector<int, 16> Mask;
4376 SV->getShuffleMask(Mask);
4377 return AddShuffleMaskAdjustedCost(C, Mask);
4378 };
4379 // Check that input consists of ShuffleVectors applied to the same input
4380 auto AllShufflesHaveSameOperands =
4381 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
4382 if (InputShuffles.size() < 2)
4383 return false;
4384 ShuffleVectorInst *FirstSV =
4385 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
4386 if (!FirstSV)
4387 return false;
4388
4389 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
4390 return std::all_of(
4391 std::next(InputShuffles.begin()), InputShuffles.end(),
4392 [&](Instruction *I) {
4393 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
4394 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
4395 });
4396 };
4397
4398 // Get the costs of the shuffles + binops before and after with the new
4399 // shuffle masks.
4400 InstructionCost CostBefore =
4401 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
4402 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
4403 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
4404 InstructionCost(0), AddShuffleCost);
4405 if (AllShufflesHaveSameOperands(InputShuffles)) {
4406 UniqueShuffles.clear();
4407 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4408 InstructionCost(0), AddShuffleAdjustedCost);
4409 } else {
4410 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4411 InstructionCost(0), AddShuffleCost);
4412 }
4413
4414 // The new binops will be unused for lanes past the used shuffle lengths.
4415 // These types attempt to get the correct cost for that from the target.
4416 FixedVectorType *Op0SmallVT =
4417 FixedVectorType::get(VT->getScalarType(), V1.size());
4418 FixedVectorType *Op1SmallVT =
4419 FixedVectorType::get(VT->getScalarType(), V2.size());
4420 InstructionCost CostAfter =
4421 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4422 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4423 UniqueShuffles.clear();
4424 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4425 InstructionCost(0), AddShuffleMaskAdjustedCost);
4426 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4427 CostAfter +=
4428 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4429 InstructionCost(0), AddShuffleMaskCost);
4430
4431 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4432 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4433 << " vs CostAfter: " << CostAfter << "\n");
4434 if (CostBefore < CostAfter ||
4435 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4436 return false;
4437
4438 // The cost model has passed, create the new instructions.
4439 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4440 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4441 if (!SV)
4442 return I;
4443 if (isa<UndefValue>(SV->getOperand(1)))
4444 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4445 if (InputShuffles.contains(SSV))
4446 return SSV->getOperand(Op);
4447 return SV->getOperand(Op);
4448 };
4449 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4450 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4451 GetShuffleOperand(SVI0A, 1), V1A);
4452 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4453 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4454 GetShuffleOperand(SVI0B, 1), V1B);
4455 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4456 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4457 GetShuffleOperand(SVI1A, 1), V2A);
4458 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4459 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4460 GetShuffleOperand(SVI1B, 1), V2B);
4461 Builder.SetInsertPoint(Op0);
4462 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4463 NSV0A, NSV0B);
4464 if (auto *I = dyn_cast<Instruction>(NOp0))
4465 I->copyIRFlags(Op0, true);
4466 Builder.SetInsertPoint(Op1);
4467 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4468 NSV1A, NSV1B);
4469 if (auto *I = dyn_cast<Instruction>(NOp1))
4470 I->copyIRFlags(Op1, true);
4471
4472 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4473 Builder.SetInsertPoint(Shuffles[S]);
4474 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4475 replaceValue(*Shuffles[S], *NSV, false);
4476 }
4477
4478 Worklist.pushValue(NSV0A);
4479 Worklist.pushValue(NSV0B);
4480 Worklist.pushValue(NSV1A);
4481 Worklist.pushValue(NSV1B);
4482 return true;
4483}
4484
4485/// Check if instruction depends on ZExt and this ZExt can be moved after the
4486/// instruction. Move ZExt if it is profitable. For example:
4487/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4488/// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
4489/// Cost model calculations take into account if zext(x) has other users and
4490/// whether it can be propagated through them too.
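/// Illustrative sketch (added for exposition; names and types are invented,
/// and the rewrite is only done when the known-bits and cost checks pass):
/// ```
///   %zx = zext <4 x i16> %x to <4 x i32>
///   %r  = and <4 x i32> %zx, %y
///   ; may become:
///   %ty = trunc <4 x i32> %y to <4 x i16>
///   %na = and <4 x i16> %x, %ty
///   %r  = zext <4 x i16> %na to <4 x i32>
/// ```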
4491bool VectorCombine::shrinkType(Instruction &I) {
4492 Value *ZExted, *OtherOperand;
4493 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4494 m_Value(OtherOperand))) &&
4495 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4496 return false;
4497
4498 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4499
4500 auto *BigTy = cast<FixedVectorType>(I.getType());
4501 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4502 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4503
4504 if (I.getOpcode() == Instruction::LShr) {
4505 // Check that the shift amount is less than the number of bits in the
4506 // smaller type. Otherwise, the smaller lshr will return a poison value.
4507 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4508 if (ShAmtKB.getMaxValue().uge(BW))
4509 return false;
4510 } else {
4511 // Check that the expression overall uses at most the same number of bits as
4512 // ZExted
4513 KnownBits KB = computeKnownBits(&I, *DL);
4514 if (KB.countMaxActiveBits() > BW)
4515 return false;
4516 }
4517
4518 // Calculate costs of leaving current IR as it is and moving ZExt operation
4519 // later, along with adding truncates if needed
4520  InstructionCost ZExtCost = TTI.getCastInstrCost(
4521      Instruction::ZExt, BigTy, SmallTy,
4522 TargetTransformInfo::CastContextHint::None, CostKind);
4523 InstructionCost CurrentCost = ZExtCost;
4524 InstructionCost ShrinkCost = 0;
4525
4526 // Calculate total cost and check that we can propagate through all ZExt users
4527 for (User *U : ZExtOperand->users()) {
4528 auto *UI = cast<Instruction>(U);
4529 if (UI == &I) {
4530 CurrentCost +=
4531 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4532 ShrinkCost +=
4533 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4534 ShrinkCost += ZExtCost;
4535 continue;
4536 }
4537
4538 if (!Instruction::isBinaryOp(UI->getOpcode()))
4539 return false;
4540
4541 // Check if we can propagate ZExt through its other users
4542 KnownBits KB = computeKnownBits(UI, *DL);
4543 if (KB.countMaxActiveBits() > BW)
4544 return false;
4545
4546 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4547 ShrinkCost +=
4548 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4549 ShrinkCost += ZExtCost;
4550 }
4551
4552 // If the other instruction operand is not a constant, we'll need to
4553  // generate a truncate instruction, so we have to adjust the cost.
4554 if (!isa<Constant>(OtherOperand))
4555 ShrinkCost += TTI.getCastInstrCost(
4556 Instruction::Trunc, SmallTy, BigTy,
4557 TargetTransformInfo::CastContextHint::None, CostKind);
4558
4559 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4560 // towards modifying the IR because shrinking opens opportunities for other
4561 // shrinking optimisations.
4562 if (ShrinkCost > CurrentCost)
4563 return false;
4564
4565 Builder.SetInsertPoint(&I);
4566 Value *Op0 = ZExted;
4567 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4568 // Keep the order of operands the same
4569 if (I.getOperand(0) == OtherOperand)
4570 std::swap(Op0, Op1);
4571 Value *NewBinOp =
4572 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4573 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4574 cast<Instruction>(NewBinOp)->copyMetadata(I);
4575 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4576 replaceValue(I, *NewZExtr);
4577 return true;
4578}
4579
4580/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4581/// shuffle (DstVec, SrcVec, Mask)
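/// Illustrative sketch (added for exposition; names are invented). With equal
/// element counts the mask is an identity apart from the inserted lane:
/// ```
///   %e = extractelement <4 x i32> %src, i64 2
///   %r = insertelement <4 x i32> %dst, i32 %e, i64 0
///   ; may become:
///   %r = shufflevector <4 x i32> %dst, <4 x i32> %src,
///        <4 x i32> <i32 6, i32 1, i32 2, i32 3>
/// ```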
4582bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4583 Value *DstVec, *SrcVec;
4584 uint64_t ExtIdx, InsIdx;
4585 if (!match(&I,
4586 m_InsertElt(m_Value(DstVec),
4587 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4588 m_ConstantInt(InsIdx))))
4589 return false;
4590
4591 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4592 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4593  // We can try combining vectors with different element counts.
4594 if (!DstVecTy || !SrcVecTy ||
4595 SrcVecTy->getElementType() != DstVecTy->getElementType())
4596 return false;
4597
4598 unsigned NumDstElts = DstVecTy->getNumElements();
4599 unsigned NumSrcElts = SrcVecTy->getNumElements();
4600 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4601 return false;
4602
4603 // Insertion into poison is a cheaper single operand shuffle.
4604  TargetTransformInfo::ShuffleKind SK = TargetTransformInfo::SK_PermuteTwoSrc;
4605  SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4606
4607 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4608 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4609 if (NeedDstSrcSwap) {
4610    SK = TargetTransformInfo::SK_PermuteSingleSrc;
4611    Mask[InsIdx] = ExtIdx % NumDstElts;
4612 std::swap(DstVec, SrcVec);
4613 } else {
4615 std::iota(Mask.begin(), Mask.end(), 0);
4616 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
4617 }
4618
4619 // Cost
4620 auto *Ins = cast<InsertElementInst>(&I);
4621 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4622 InstructionCost InsCost =
4623 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4624 InstructionCost ExtCost =
4625 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4626 InstructionCost OldCost = ExtCost + InsCost;
4627
4628 InstructionCost NewCost = 0;
4629 SmallVector<int> ExtToVecMask;
4630 if (!NeedExpOrNarrow) {
4631 // Ignore 'free' identity insertion shuffle.
4632 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4633 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4634 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4635 nullptr, {DstVec, SrcVec});
4636 } else {
4637 // When creating a length-changing-vector, always try to keep the relevant
4638 // element in an equivalent position, so that bulk shuffles are more likely
4639 // to be useful.
4640 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4641 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
4642 // Add cost for expanding or narrowing
4643    NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4644                                  DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4645 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4646 }
4647
4648 if (!Ext->hasOneUse())
4649 NewCost += ExtCost;
4650
4651 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
4652 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4653 << "\n");
4654
4655 if (OldCost < NewCost)
4656 return false;
4657
4658 if (NeedExpOrNarrow) {
4659 if (!NeedDstSrcSwap)
4660 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4661 else
4662 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4663 }
4664
4665 // Canonicalize undef param to RHS to help further folds.
4666 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4667 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4668 std::swap(DstVec, SrcVec);
4669 }
4670
4671 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4672 replaceValue(I, *Shuf);
4673
4674 return true;
4675}
4676
4677/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4678/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4679/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4680/// before casting it back into `<vscale x 16 x i32>`.
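/// Illustrative IR sketch of the example above (added for exposition; the
/// packing assumes the first operand lands in the low half of each i64, as
/// stated above, and (777 << 32) | 666 == 3337189589658):
/// ```
///   %r = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(
///            <vscale x 8 x i32> splat (i32 666), <vscale x 8 x i32> splat (i32 777))
///   ; may become:
///   %r = bitcast <vscale x 8 x i64> splat (i64 3337189589658) to <vscale x 16 x i32>
/// ```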
4681bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4682 const APInt *SplatVal0, *SplatVal1;
4683  if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4684                     m_APInt(SplatVal0), m_APInt(SplatVal1))))
4685 return false;
4686
4687 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4688 << "\n");
4689
4690 auto *VTy =
4691 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4692 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4693 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4694
4695 // We use <= rather than < here so that we also bail out when the costs of
4696 // the interleave2 intrinsic and the bitcast are both invalid. Even if they
4697 // both have valid and equal costs, it's probably not a good idea to emit a
4698 // high-cost constant splat.
4699 if (TTI.getInstructionCost(&I, CostKind) <=
4700 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4701 TTI::CastContextHint::None, CostKind)) {
4702 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4703 << *I.getType() << " is too high.\n");
4704 return false;
4705 }
4706
4707 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4708 NewSplatVal <<= Width;
4709 NewSplatVal |= SplatVal0->zext(Width * 2);
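// Worked instance of the function comment's example (values illustrative):
// with Width == 32, SplatVal0 == 666 and SplatVal1 == 777, NewSplatVal is the
// 64-bit value (777 << 32) | 666, which is splatted across ExtVTy below and
// bitcast back to the original interleaved type.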
4710 auto *NewSplat = ConstantVector::getSplat(
4711 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4712
4713 IRBuilder<> Builder(&I);
4714 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4715 return true;
4716}
4717
4718// Attempt to shrink loads that are only used by shufflevector instructions.
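// For example (illustrative only): a load of <8 x i32> whose only users are
// shufflevectors that reference source lanes 0..3 can be replaced by a
// <4 x i32> load, with each shuffle rewritten to read from the narrower
// vector, subject to the cost check below.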
4719bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4720 auto *OldLoad = dyn_cast<LoadInst>(&I);
4721 if (!OldLoad || !OldLoad->isSimple())
4722 return false;
4723
4724 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4725 if (!OldLoadTy)
4726 return false;
4727
4728 unsigned const OldNumElements = OldLoadTy->getNumElements();
4729
4730 // Search all uses of load. If all uses are shufflevector instructions, and
4731 // the second operands are all poison values, find the minimum and maximum
4732 // indices of the vector elements referenced by all shuffle masks.
4733 // Otherwise return `std::nullopt`.
4734 using IndexRange = std::pair<int, int>;
4735 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4736 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4737 for (llvm::Use &Use : I.uses()) {
4738 // Ensure all uses match the required pattern.
4739 User *Shuffle = Use.getUser();
4740 ArrayRef<int> Mask;
4741
4742 if (!match(Shuffle,
4743 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4744 return std::nullopt;
4745
4746 // Ignore shufflevector instructions that have no uses.
4747 if (Shuffle->use_empty())
4748 continue;
4749
4750 // Find the min and max indices used by the shufflevector instruction.
4751 for (int Index : Mask) {
4752 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4753 OutputRange.first = std::min(Index, OutputRange.first);
4754 OutputRange.second = std::max(Index, OutputRange.second);
4755 }
4756 }
4757 }
4758
4759 if (OutputRange.second < OutputRange.first)
4760 return std::nullopt;
4761
4762 return OutputRange;
4763 };
4764
4765 // Get the range of vector elements used by shufflevector instructions.
4766 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4767 unsigned const NewNumElements = Indices->second + 1u;
4768
4769 // If the range of vector elements is smaller than the full load, attempt
4770 // to create a smaller load.
4771 if (NewNumElements < OldNumElements) {
4772 IRBuilder Builder(&I);
4773 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4774
4775 // Calculate costs of old and new ops.
4776 Type *ElemTy = OldLoadTy->getElementType();
4777 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4778 Value *PtrOp = OldLoad->getPointerOperand();
4779
4780 InstructionCost OldCost = TTI.getMemoryOpCost(
4781 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4782 OldLoad->getPointerAddressSpace(), CostKind);
4783 InstructionCost NewCost =
4784 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4785 OldLoad->getPointerAddressSpace(), CostKind);
4786
4787 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4788 SmallVector<UseEntry, 4u> NewUses;
4789 unsigned const MaxIndex = NewNumElements * 2u;
4790
4791 for (llvm::Use &Use : I.uses()) {
4792 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4793 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4794
4795 // Create entry for new use.
4796 NewUses.push_back({Shuffle, OldMask});
4797
4798 // Validate mask indices.
4799 for (int Index : OldMask) {
4800 if (Index >= static_cast<int>(MaxIndex))
4801 return false;
4802 }
4803
4804 // Update costs.
4805 OldCost +=
4806 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4807 OldLoadTy, OldMask, CostKind);
4808 NewCost +=
4809 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4810 NewLoadTy, OldMask, CostKind);
4811 }
4812
4813 LLVM_DEBUG(
4814 dbgs() << "Found a load used only by shufflevector instructions: "
4815 << I << "\n OldCost: " << OldCost
4816 << " vs NewCost: " << NewCost << "\n");
4817
4818 if (OldCost < NewCost || !NewCost.isValid())
4819 return false;
4820
4821 // Create new load of smaller vector.
4822 auto *NewLoad = cast<LoadInst>(
4823 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4824 NewLoad->copyMetadata(I);
4825
4826 // Replace all uses.
4827 for (UseEntry &Use : NewUses) {
4828 ShuffleVectorInst *Shuffle = Use.first;
4829 std::vector<int> &NewMask = Use.second;
4830
4831 Builder.SetInsertPoint(Shuffle);
4832 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4833 Value *NewShuffle = Builder.CreateShuffleVector(
4834 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4835
4836 replaceValue(*Shuffle, *NewShuffle, false);
4837 }
4838
4839 return true;
4840 }
4841 }
4842 return false;
4843}
4844
4845// Attempt to narrow a phi of shufflevector instructions where the two incoming
4846// values have the same operands but different masks. If the two shuffle masks
4847// are offsets of one another we can use one branch to rotate the incoming
4848// vector and perform one larger shuffle after the phi.
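// Illustrative example (assumed IR, not taken from the source): for a
// <4 x i32> %v with incoming values
//   %a = shufflevector <4 x i32> %v, poison, mask <1,2,3,1,2,3,1,2>
//   %b = shufflevector <4 x i32> %v, poison, mask <0,1,2,0,1,2,0,1>
// the two masks differ by a constant offset of 1, so the phi can instead take
// a <4 x i32> rotate of %v (mask <1,2,3,0>) on the first edge and %v itself on
// the second edge, followed by a single widening shuffle with %b's mask.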
4849bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4850 auto *Phi = dyn_cast<PHINode>(&I);
4851 if (!Phi || Phi->getNumIncomingValues() != 2u)
4852 return false;
4853
4854 Value *Op = nullptr;
4855 ArrayRef<int> Mask0;
4856 ArrayRef<int> Mask1;
4857
4858 if (!match(Phi->getOperand(0u),
4859 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4860 !match(Phi->getOperand(1u),
4861 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4862 return false;
4863
4864 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4865
4866 // Ensure result vectors are wider than the argument vector.
4867 auto *InputVT = cast<FixedVectorType>(Op->getType());
4868 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4869 auto const InputNumElements = InputVT->getNumElements();
4870
4871 if (InputNumElements >= ResultVT->getNumElements())
4872 return false;
4873
4874 // Take the difference of the two shuffle masks at each index. Ignore poison
4875 // values at the same index in both masks.
4876 SmallVector<int, 16> NewMask;
4877 NewMask.reserve(Mask0.size());
4878
4879 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4880 if (M0 >= 0 && M1 >= 0)
4881 NewMask.push_back(M0 - M1);
4882 else if (M0 == -1 && M1 == -1)
4883 continue;
4884 else
4885 return false;
4886 }
4887
4888 // Ensure all elements of the new mask are equal. If the difference between
4889 // the incoming mask elements is the same, the two must be constant offsets
4890 // of one another.
4891 if (NewMask.empty() || !all_equal(NewMask))
4892 return false;
4893
4894 // Create new mask using difference of the two incoming masks.
4895 int MaskOffset = NewMask[0u];
4896 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
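// E.g. MaskOffset == -1 with 4 input elements gives Index == 3, i.e. a rotate
// by one lane in the opposite direction (values chosen for illustration).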
4897 NewMask.clear();
4898
4899 for (unsigned I = 0u; I < InputNumElements; ++I) {
4900 NewMask.push_back(Index);
4901 Index = (Index + 1u) % InputNumElements;
4902 }
4903
4904 // Calculate costs for worst cases and compare.
4905 auto const Kind = TTI::SK_PermuteSingleSrc;
4906 auto OldCost =
4907 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4908 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4909 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4910 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4911
4912 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4913 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4914 << "\n");
4915
4916 if (NewCost > OldCost)
4917 return false;
4918
4919 // Create new shuffles and narrowed phi.
4920 auto Builder = IRBuilder(Shuf);
4921 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4922 auto *PoisonVal = PoisonValue::get(InputVT);
4923 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4924 Worklist.push(cast<Instruction>(NewShuf0));
4925
4926 Builder.SetInsertPoint(Phi);
4927 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4928 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4929 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4930 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4931
4932 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4933 PoisonVal = PoisonValue::get(NewPhi->getType());
4934 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4935
4936 replaceValue(*Phi, *NewShuf1);
4937 return true;
4938}
4939
4940/// This is the entry point for all transforms. Pass manager differences are
4941/// handled in the callers of this function.
4942 bool VectorCombine::run() {
4943 if (DisableVectorCombine)
4944 return false;
4945
4946 // Don't attempt vectorization if the target does not support vectors.
4947 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4948 return false;
4949
4950 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4951
4952 auto FoldInst = [this](Instruction &I) {
4953 Builder.SetInsertPoint(&I);
4954 bool IsVectorType = isa<VectorType>(I.getType());
4955 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4956 auto Opcode = I.getOpcode();
4957
4958 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4959
4960 // These folds should be beneficial regardless of when this pass is run
4961 // in the optimization pipeline.
4962 // The type checking is for run-time efficiency. We can avoid wasting time
4963 // dispatching to folding functions if there's no chance of matching.
4964 if (IsFixedVectorType) {
4965 switch (Opcode) {
4966 case Instruction::InsertElement:
4967 if (vectorizeLoadInsert(I))
4968 return true;
4969 break;
4970 case Instruction::ShuffleVector:
4971 if (widenSubvectorLoad(I))
4972 return true;
4973 break;
4974 default:
4975 break;
4976 }
4977 }
4978
4979 // These transforms work with both scalable and fixed vectors.
4980 // TODO: Identify and allow other scalable transforms
4981 if (IsVectorType) {
4982 if (scalarizeOpOrCmp(I))
4983 return true;
4984 if (scalarizeLoad(I))
4985 return true;
4986 if (scalarizeExtExtract(I))
4987 return true;
4988 if (scalarizeVPIntrinsic(I))
4989 return true;
4990 if (foldInterleaveIntrinsics(I))
4991 return true;
4992 }
4993
4994 if (Opcode == Instruction::Store)
4995 if (foldSingleElementStore(I))
4996 return true;
4997
4998 // If this is an early pipeline invocation of this pass, we are done.
4999 if (TryEarlyFoldsOnly)
5000 return false;
5001
5002 // Otherwise, try folds that improve codegen but may interfere with
5003 // early IR canonicalizations.
5004 // The type checking is for run-time efficiency. We can avoid wasting time
5005 // dispatching to folding functions if there's no chance of matching.
5006 if (IsFixedVectorType) {
5007 switch (Opcode) {
5008 case Instruction::InsertElement:
5009 if (foldInsExtFNeg(I))
5010 return true;
5011 if (foldInsExtBinop(I))
5012 return true;
5013 if (foldInsExtVectorToShuffle(I))
5014 return true;
5015 break;
5016 case Instruction::ShuffleVector:
5017 if (foldPermuteOfBinops(I))
5018 return true;
5019 if (foldShuffleOfBinops(I))
5020 return true;
5021 if (foldShuffleOfSelects(I))
5022 return true;
5023 if (foldShuffleOfCastops(I))
5024 return true;
5025 if (foldShuffleOfShuffles(I))
5026 return true;
5027 if (foldPermuteOfIntrinsic(I))
5028 return true;
5029 if (foldShufflesOfLengthChangingShuffles(I))
5030 return true;
5031 if (foldShuffleOfIntrinsics(I))
5032 return true;
5033 if (foldSelectShuffle(I))
5034 return true;
5035 if (foldShuffleToIdentity(I))
5036 return true;
5037 break;
5038 case Instruction::Load:
5039 if (shrinkLoadForShuffles(I))
5040 return true;
5041 break;
5042 case Instruction::BitCast:
5043 if (foldBitcastShuffle(I))
5044 return true;
5045 break;
5046 case Instruction::And:
5047 case Instruction::Or:
5048 case Instruction::Xor:
5049 if (foldBitOpOfCastops(I))
5050 return true;
5051 if (foldBitOpOfCastConstant(I))
5052 return true;
5053 break;
5054 case Instruction::PHI:
5055 if (shrinkPhiOfShuffles(I))
5056 return true;
5057 break;
5058 default:
5059 if (shrinkType(I))
5060 return true;
5061 break;
5062 }
5063 } else {
5064 switch (Opcode) {
5065 case Instruction::Call:
5066 if (foldShuffleFromReductions(I))
5067 return true;
5068 if (foldCastFromReductions(I))
5069 return true;
5070 break;
5071 case Instruction::ExtractElement:
5072 if (foldShuffleChainsToReduce(I))
5073 return true;
5074 break;
5075 case Instruction::ICmp:
5076 case Instruction::FCmp:
5077 if (foldExtractExtract(I))
5078 return true;
5079 break;
5080 case Instruction::Or:
5081 if (foldConcatOfBoolMasks(I))
5082 return true;
5083 [[fallthrough]];
5084 default:
5085 if (Instruction::isBinaryOp(Opcode)) {
5086 if (foldExtractExtract(I))
5087 return true;
5088 if (foldExtractedCmps(I))
5089 return true;
5090 if (foldBinopOfReductions(I))
5091 return true;
5092 }
5093 break;
5094 }
5095 }
5096 return false;
5097 };
5098
5099 bool MadeChange = false;
5100 for (BasicBlock &BB : F) {
5101 // Ignore unreachable basic blocks.
5102 if (!DT.isReachableFromEntry(&BB))
5103 continue;
5104 // Use early increment range so that we can erase instructions in loop.
5105 // make_early_inc_range is not applicable here, as the next iterator may
5106 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
5107 // We manually maintain the next instruction and update it when it is about
5108 // to be deleted.
5109 Instruction *I = &BB.front();
5110 while (I) {
5111 NextInst = I->getNextNode();
5112 if (!I->isDebugOrPseudoInst())
5113 MadeChange |= FoldInst(*I);
5114 I = NextInst;
5115 }
5116 }
5117
5118 NextInst = nullptr;
5119
5120 while (!Worklist.isEmpty()) {
5121 Instruction *I = Worklist.removeOne();
5122 if (!I)
5123 continue;
5124
5125 if (isInstructionTriviallyDead(I)) {
5126 eraseInstruction(*I);
5127 continue;
5128 }
5129
5130 MadeChange |= FoldInst(*I);
5131 }
5132
5133 return MadeChange;
5134}
5135
5136 PreservedAnalyses VectorCombinePass::run(Function &F,
5137 FunctionAnalysisManager &FAM) {
5138 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
5139 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
5140 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
5141 AAResults &AA = FAM.getResult<AAManager>(F);
5142 const DataLayout *DL = &F.getDataLayout();
5143 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
5144 TryEarlyFoldsOnly);
5145 if (!Combiner.run())
5146 return PreservedAnalyses::all();
5147 PreservedAnalyses PA;
5148 PA.preserveSet<CFGAnalyses>();
5149 return PA;
5150}