LLVM 23.0.0git
VectorCombine.cpp
Go to the documentation of this file.
1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
25#include "llvm/Analysis/Loads.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
40#include <numeric>
41#include <optional>
42#include <queue>
43#include <set>
44
45#define DEBUG_TYPE "vector-combine"
47
48using namespace llvm;
49using namespace llvm::PatternMatch;
50
51STATISTIC(NumVecLoad, "Number of vector loads formed");
52STATISTIC(NumVecCmp, "Number of vector compares formed");
53STATISTIC(NumVecBO, "Number of vector binops formed");
54STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
55STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
56STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
57STATISTIC(NumScalarCmp, "Number of scalar compares formed");
58STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
59
61 "disable-vector-combine", cl::init(false), cl::Hidden,
62 cl::desc("Disable all vector combine transforms"));
63
65 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
66 cl::desc("Disable binop extract to shuffle transforms"));
67
69 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
70 cl::desc("Max number of instructions to scan for vector combining."));
71
72static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
73
74namespace {
75class VectorCombine {
76public:
77 VectorCombine(Function &F, const TargetTransformInfo &TTI,
80 bool TryEarlyFoldsOnly)
81 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
82 DT(DT), AA(AA), DL(DL), CostKind(CostKind),
83 SQ(*DL, /*TLI=*/nullptr, &DT, &AC),
84 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
85
86 bool run();
87
88private:
89 Function &F;
91 const TargetTransformInfo &TTI;
92 const DominatorTree &DT;
93 AAResults &AA;
94 const DataLayout *DL;
95 TTI::TargetCostKind CostKind;
96 const SimplifyQuery SQ;
97
98 /// If true, only perform beneficial early IR transforms. Do not introduce new
99 /// vector operations.
100 bool TryEarlyFoldsOnly;
101
102 InstructionWorklist Worklist;
103
104 /// Next instruction to iterate. It will be updated when it is erased by
105 /// RecursivelyDeleteTriviallyDeadInstructions.
106 Instruction *NextInst;
107
108 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
109 // parameter. That should be updated to specific sub-classes because the
110 // run loop was changed to dispatch on opcode.
111 bool vectorizeLoadInsert(Instruction &I);
112 bool widenSubvectorLoad(Instruction &I);
113 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
114 ExtractElementInst *Ext1,
115 unsigned PreferredExtractIndex) const;
116 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
117 const Instruction &I,
118 ExtractElementInst *&ConvertToShuffle,
119 unsigned PreferredExtractIndex);
120 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
121 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
122 bool foldExtractExtract(Instruction &I);
123 bool foldInsExtFNeg(Instruction &I);
124 bool foldInsExtBinop(Instruction &I);
125 bool foldInsExtVectorToShuffle(Instruction &I);
126 bool foldBitOpOfCastops(Instruction &I);
127 bool foldBitOpOfCastConstant(Instruction &I);
128 bool foldBitcastShuffle(Instruction &I);
129 bool scalarizeOpOrCmp(Instruction &I);
130 bool scalarizeVPIntrinsic(Instruction &I);
131 bool foldExtractedCmps(Instruction &I);
132 bool foldSelectsFromBitcast(Instruction &I);
133 bool foldBinopOfReductions(Instruction &I);
134 bool foldSingleElementStore(Instruction &I);
135 bool scalarizeLoad(Instruction &I);
136 bool scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy, Value *Ptr);
137 bool scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy, Value *Ptr);
138 bool scalarizeExtExtract(Instruction &I);
139 bool foldConcatOfBoolMasks(Instruction &I);
140 bool foldPermuteOfBinops(Instruction &I);
141 bool foldShuffleOfBinops(Instruction &I);
142 bool foldShuffleOfSelects(Instruction &I);
143 bool foldShuffleOfCastops(Instruction &I);
144 bool foldShuffleOfShuffles(Instruction &I);
145 bool foldPermuteOfIntrinsic(Instruction &I);
146 bool foldShufflesOfLengthChangingShuffles(Instruction &I);
147 bool foldShuffleOfIntrinsics(Instruction &I);
148 bool foldShuffleToIdentity(Instruction &I);
149 bool foldShuffleFromReductions(Instruction &I);
150 bool foldShuffleChainsToReduce(Instruction &I);
151 bool foldCastFromReductions(Instruction &I);
152 bool foldSignBitReductionCmp(Instruction &I);
153 bool foldReductionZeroTest(Instruction &I);
154 bool foldICmpEqZeroVectorReduce(Instruction &I);
155 bool foldEquivalentReductionCmp(Instruction &I);
156 bool foldReduceAddCmpZero(Instruction &I);
157 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
158 bool foldInterleaveIntrinsics(Instruction &I);
159 bool foldDeinterleaveIntrinsics(Instruction &I);
160 bool foldBitcastOfVPLoad(Instruction &I);
161 bool foldBitOrderReverseAndSwap(Instruction &I);
162 bool shrinkType(Instruction &I);
163 bool shrinkLoadForShuffles(Instruction &I);
164 bool shrinkPhiOfShuffles(Instruction &I);
165
166 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
167 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
168 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
169 Old.replaceAllUsesWith(&New);
170 if (auto *NewI = dyn_cast<Instruction>(&New)) {
171 New.takeName(&Old);
172 Worklist.pushUsersToWorkList(*NewI);
173 Worklist.pushValue(NewI);
174 }
175 if (Erase && isInstructionTriviallyDead(&Old)) {
176 eraseInstruction(Old);
177 } else {
178 Worklist.push(&Old);
179 }
180 }
181
182 void eraseInstruction(Instruction &I) {
183 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
184 SmallVector<Value *> Ops(I.operands());
185 Worklist.remove(&I);
186 I.eraseFromParent();
187
188 // Push remaining users of the operands and then the operand itself - allows
189 // further folds that were hindered by OneUse limits.
190 SmallPtrSet<Value *, 4> Visited;
191 for (Value *Op : Ops) {
192 if (!Visited.contains(Op)) {
193 if (auto *OpI = dyn_cast<Instruction>(Op)) {
195 OpI, nullptr, nullptr, [&](Value *V) {
196 if (auto *I = dyn_cast<Instruction>(V)) {
197 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
198 Worklist.remove(I);
199 if (I == NextInst)
200 NextInst = NextInst->getNextNode();
201 Visited.insert(I);
202 }
203 }))
204 continue;
205 Worklist.pushUsersToWorkList(*OpI);
206 Worklist.pushValue(OpI);
207 }
208 }
209 }
210 }
211};
212} // namespace
213
214/// Return the source operand of a potentially bitcasted value. If there is no
215/// bitcast, return the input value itself.
217 while (auto *BitCast = dyn_cast<BitCastInst>(V))
218 V = BitCast->getOperand(0);
219 return V;
220}
221
222static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
223 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
224 // The widened load may load data from dirty regions or create data races
225 // non-existent in the source.
226 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
227 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
229 return false;
230
231 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
232 // sure we have all of our type-based constraints in place for this target.
233 Type *ScalarTy = Load->getType()->getScalarType();
234 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
235 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
236 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
237 ScalarSize % 8 != 0)
238 return false;
239
240 return true;
241}
242
243bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
244 // Match insert into fixed vector of scalar value.
245 // TODO: Handle non-zero insert index.
246 Value *Scalar;
247 if (!match(&I,
249 return false;
250
251 // Optionally match an extract from another vector.
252 Value *X;
253 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
254 if (!HasExtract)
255 X = Scalar;
256
257 auto *Load = dyn_cast<LoadInst>(X);
258 if (!canWidenLoad(Load, TTI))
259 return false;
260
261 Type *ScalarTy = Scalar->getType();
262 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
263 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
264
265 // Check safety of replacing the scalar load with a larger vector load.
266 // We use minimal alignment (maximum flexibility) because we only care about
267 // the dereferenceable region. When calculating cost and creating a new op,
268 // we may use a larger value based on alignment attributes.
269 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
270 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
271
272 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
273 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
274 unsigned OffsetEltIndex = 0;
275 Align Alignment = Load->getAlign();
276 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, SQ.AC,
277 SQ.DT)) {
278 // It is not safe to load directly from the pointer, but we can still peek
279 // through gep offsets and check if it safe to load from a base address with
280 // updated alignment. If it is, we can shuffle the element(s) into place
281 // after loading.
282 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
283 APInt Offset(OffsetBitWidth, 0);
285
286 // We want to shuffle the result down from a high element of a vector, so
287 // the offset must be positive.
288 if (Offset.isNegative())
289 return false;
290
291 // The offset must be a multiple of the scalar element to shuffle cleanly
292 // in the element's size.
293 uint64_t ScalarSizeInBytes = ScalarSize / 8;
294 if (Offset.urem(ScalarSizeInBytes) != 0)
295 return false;
296
297 // If we load MinVecNumElts, will our target element still be loaded?
298 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
299 if (OffsetEltIndex >= MinVecNumElts)
300 return false;
301
302 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load,
303 SQ.AC, SQ.DT))
304 return false;
305
306 // Update alignment with offset value. Note that the offset could be negated
307 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
308 // negation does not change the result of the alignment calculation.
309 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
310 }
311
312 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
313 // Use the greater of the alignment on the load or its source pointer.
314 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
315 Type *LoadTy = Load->getType();
316 unsigned AS = Load->getPointerAddressSpace();
317 InstructionCost OldCost =
318 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
319 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
320 OldCost +=
321 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
322 /* Insert */ true, HasExtract, CostKind);
323
324 // New pattern: load VecPtr
325 InstructionCost NewCost =
326 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
327 // Optionally, we are shuffling the loaded vector element(s) into place.
328 // For the mask set everything but element 0 to undef to prevent poison from
329 // propagating from the extra loaded memory. This will also optionally
330 // shrink/grow the vector from the loaded size to the output size.
331 // We assume this operation has no cost in codegen if there was no offset.
332 // Note that we could use freeze to avoid poison problems, but then we might
333 // still need a shuffle to change the vector size.
334 auto *Ty = cast<FixedVectorType>(I.getType());
335 unsigned OutputNumElts = Ty->getNumElements();
336 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
337 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
338 Mask[0] = OffsetEltIndex;
339 if (OffsetEltIndex)
340 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
341 CostKind);
342
343 // We can aggressively convert to the vector form because the backend can
344 // invert this transform if it does not result in a performance win.
345 if (OldCost < NewCost || !NewCost.isValid())
346 return false;
347
348 // It is safe and potentially profitable to load a vector directly:
349 // inselt undef, load Scalar, 0 --> load VecPtr
350 IRBuilder<> Builder(Load);
351 Value *CastedPtr =
352 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
353 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
354 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
355
356 replaceValue(I, *VecLd);
357 ++NumVecLoad;
358 return true;
359}
360
361/// If we are loading a vector and then inserting it into a larger vector with
362/// undefined elements, try to load the larger vector and eliminate the insert.
363/// This removes a shuffle in IR and may allow combining of other loaded values.
364bool VectorCombine::widenSubvectorLoad(Instruction &I) {
365 // Match subvector insert of fixed vector.
366 auto *Shuf = cast<ShuffleVectorInst>(&I);
367 if (!Shuf->isIdentityWithPadding())
368 return false;
369
370 // Allow a non-canonical shuffle mask that is choosing elements from op1.
371 unsigned NumOpElts =
372 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
373 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
374 return M >= (int)(NumOpElts);
375 });
376
377 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
378 if (!canWidenLoad(Load, TTI))
379 return false;
380
381 // We use minimal alignment (maximum flexibility) because we only care about
382 // the dereferenceable region. When calculating cost and creating a new op,
383 // we may use a larger value based on alignment attributes.
384 auto *Ty = cast<FixedVectorType>(I.getType());
385 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
386 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
387 Align Alignment = Load->getAlign();
388 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, SQ.AC,
389 SQ.DT))
390 return false;
391
392 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
393 Type *LoadTy = Load->getType();
394 unsigned AS = Load->getPointerAddressSpace();
395
396 // Original pattern: insert_subvector (load PtrOp)
397 // This conservatively assumes that the cost of a subvector insert into an
398 // undef value is 0. We could add that cost if the cost model accurately
399 // reflects the real cost of that operation.
400 InstructionCost OldCost =
401 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
402
403 // New pattern: load PtrOp
404 InstructionCost NewCost =
405 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
406
407 // We can aggressively convert to the vector form because the backend can
408 // invert this transform if it does not result in a performance win.
409 if (OldCost < NewCost || !NewCost.isValid())
410 return false;
411
412 IRBuilder<> Builder(Load);
413 Value *CastedPtr =
414 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
415 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
416 replaceValue(I, *VecLd);
417 ++NumVecLoad;
418 return true;
419}
420
421/// Determine which, if any, of the inputs should be replaced by a shuffle
422/// followed by extract from a different index.
423ExtractElementInst *VectorCombine::getShuffleExtract(
424 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
425 unsigned PreferredExtractIndex = InvalidIndex) const {
426 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
427 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
428 assert(Index0C && Index1C && "Expected constant extract indexes");
429
430 unsigned Index0 = Index0C->getZExtValue();
431 unsigned Index1 = Index1C->getZExtValue();
432
433 // If the extract indexes are identical, no shuffle is needed.
434 if (Index0 == Index1)
435 return nullptr;
436
437 Type *VecTy = Ext0->getVectorOperand()->getType();
438 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
439 InstructionCost Cost0 =
440 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
441 InstructionCost Cost1 =
442 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
443
444 // If both costs are invalid no shuffle is needed
445 if (!Cost0.isValid() && !Cost1.isValid())
446 return nullptr;
447
448 // We are extracting from 2 different indexes, so one operand must be shuffled
449 // before performing a vector operation and/or extract. The more expensive
450 // extract will be replaced by a shuffle.
451 if (Cost0 > Cost1)
452 return Ext0;
453 if (Cost1 > Cost0)
454 return Ext1;
455
456 // If the costs are equal and there is a preferred extract index, shuffle the
457 // opposite operand.
458 if (PreferredExtractIndex == Index0)
459 return Ext1;
460 if (PreferredExtractIndex == Index1)
461 return Ext0;
462
463 // Otherwise, replace the extract with the higher index.
464 return Index0 > Index1 ? Ext0 : Ext1;
465}
466
467/// Compare the relative costs of 2 extracts followed by scalar operation vs.
468/// vector operation(s) followed by extract. Return true if the existing
469/// instructions are cheaper than a vector alternative. Otherwise, return false
470/// and if one of the extracts should be transformed to a shufflevector, set
471/// \p ConvertToShuffle to that extract instruction.
472bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
473 ExtractElementInst *Ext1,
474 const Instruction &I,
475 ExtractElementInst *&ConvertToShuffle,
476 unsigned PreferredExtractIndex) {
477 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
478 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
479 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
480
481 unsigned Opcode = I.getOpcode();
482 Value *Ext0Src = Ext0->getVectorOperand();
483 Value *Ext1Src = Ext1->getVectorOperand();
484 Type *ScalarTy = Ext0->getType();
485 auto *VecTy = cast<VectorType>(Ext0Src->getType());
486 InstructionCost ScalarOpCost, VectorOpCost;
487
488 // Get cost estimates for scalar and vector versions of the operation.
489 bool IsBinOp = Instruction::isBinaryOp(Opcode);
490 if (IsBinOp) {
491 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
492 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
493 } else {
494 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
495 "Expected a compare");
496 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
497 ScalarOpCost = TTI.getCmpSelInstrCost(
498 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
499 VectorOpCost = TTI.getCmpSelInstrCost(
500 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
501 }
502
503 // Get cost estimates for the extract elements. These costs will factor into
504 // both sequences.
505 unsigned Ext0Index = Ext0IndexC->getZExtValue();
506 unsigned Ext1Index = Ext1IndexC->getZExtValue();
507
508 InstructionCost Extract0Cost =
509 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
510 InstructionCost Extract1Cost =
511 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
512
513 // A more expensive extract will always be replaced by a splat shuffle.
514 // For example, if Ext0 is more expensive:
515 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
516 // extelt (opcode (splat V0, Ext0), V1), Ext1
517 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
518 // check the cost of creating a broadcast shuffle and shuffling both
519 // operands to element 0.
520 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
521 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
522 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
523
524 // Extra uses of the extracts mean that we include those costs in the
525 // vector total because those instructions will not be eliminated.
526 InstructionCost OldCost, NewCost;
527 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
528 // Handle a special case. If the 2 extracts are identical, adjust the
529 // formulas to account for that. The extra use charge allows for either the
530 // CSE'd pattern or an unoptimized form with identical values:
531 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
532 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
533 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
534 OldCost = CheapExtractCost + ScalarOpCost;
535 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
536 } else {
537 // Handle the general case. Each extract is actually a different value:
538 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
539 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
540 NewCost = VectorOpCost + CheapExtractCost +
541 !Ext0->hasOneUse() * Extract0Cost +
542 !Ext1->hasOneUse() * Extract1Cost;
543 }
544
545 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
546 if (ConvertToShuffle) {
547 if (IsBinOp && DisableBinopExtractShuffle)
548 return true;
549
550 // If we are extracting from 2 different indexes, then one operand must be
551 // shuffled before performing the vector operation. The shuffle mask is
552 // poison except for 1 lane that is being translated to the remaining
553 // extraction lane. Therefore, it is a splat shuffle. Ex:
554 // ShufMask = { poison, poison, 0, poison }
555 // TODO: The cost model has an option for a "broadcast" shuffle
556 // (splat-from-element-0), but no option for a more general splat.
557 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
558 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
560 ShuffleMask[BestInsIndex] = BestExtIndex;
562 VecTy, VecTy, ShuffleMask, CostKind, 0,
563 nullptr, {ConvertToShuffle});
564 } else {
566 VecTy, VecTy, {}, CostKind, 0, nullptr,
567 {ConvertToShuffle});
568 }
569 }
570
571 // Aggressively form a vector op if the cost is equal because the transform
572 // may enable further optimization.
573 // Codegen can reverse this transform (scalarize) if it was not profitable.
574 return OldCost < NewCost;
575}
576
577/// Create a shuffle that translates (shifts) 1 element from the input vector
578/// to a new element location.
579static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
580 unsigned NewIndex, IRBuilderBase &Builder) {
581 // The shuffle mask is poison except for 1 lane that is being translated
582 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
583 // ShufMask = { 2, poison, poison, poison }
584 auto *VecTy = cast<FixedVectorType>(Vec->getType());
585 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
586 ShufMask[NewIndex] = OldIndex;
587 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
588}
589
590/// Given an extract element instruction with constant index operand, shuffle
591/// the source vector (shift the scalar element) to a NewIndex for extraction.
592/// Return null if the input can be constant folded, so that we are not creating
593/// unnecessary instructions.
594static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
595 IRBuilderBase &Builder) {
596 // Shufflevectors can only be created for fixed-width vectors.
597 Value *X = ExtElt->getVectorOperand();
598 if (!isa<FixedVectorType>(X->getType()))
599 return nullptr;
600
601 // If the extract can be constant-folded, this code is unsimplified. Defer
602 // to other passes to handle that.
603 Value *C = ExtElt->getIndexOperand();
604 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
605 if (isa<Constant>(X))
606 return nullptr;
607
608 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
609 NewIndex, Builder);
610 return Shuf;
611}
612
613/// Try to reduce extract element costs by converting scalar compares to vector
614/// compares followed by extract.
615/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
616Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
617 Instruction &I) {
618 assert(isa<CmpInst>(&I) && "Expected a compare");
619
620 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
621 // --> extelt (cmp Pred V0, V1), ExtIndex
622 ++NumVecCmp;
623 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
624 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
625 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
626}
627
628/// Try to reduce extract element costs by converting scalar binops to vector
629/// binops followed by extract.
630/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
631Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
632 Instruction &I) {
633 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
634
635 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
636 // --> extelt (bo V0, V1), ExtIndex
637 ++NumVecBO;
638 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
639 V1, "foldExtExtBinop");
640
641 // All IR flags are safe to back-propagate because any potential poison
642 // created in unused vector elements is discarded by the extract.
643 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
644 VecBOInst->copyIRFlags(&I);
645
646 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
647}
648
649/// Match an instruction with extracted vector operands.
650bool VectorCombine::foldExtractExtract(Instruction &I) {
651 // It is not safe to transform things like div, urem, etc. because we may
652 // create undefined behavior when executing those on unknown vector elements.
654 return false;
655
656 Instruction *I0, *I1;
657 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
658 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
660 return false;
661
662 Value *V0, *V1;
663 uint64_t C0, C1;
664 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
666 V0->getType() != V1->getType())
667 return false;
668
669 // For fixed-width vectors, reject out-of-bounds extract indexes
670 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(V0->getType())) {
671 unsigned NumElts = FixedVecTy->getNumElements();
672 if (C0 >= NumElts || C1 >= NumElts)
673 return false;
674 }
675
676 // If the scalar value 'I' is going to be re-inserted into a vector, then try
677 // to create an extract to that same element. The extract/insert can be
678 // reduced to a "select shuffle".
679 // TODO: If we add a larger pattern match that starts from an insert, this
680 // probably becomes unnecessary.
681 auto *Ext0 = cast<ExtractElementInst>(I0);
682 auto *Ext1 = cast<ExtractElementInst>(I1);
683 uint64_t InsertIndex = InvalidIndex;
684 if (I.hasOneUse())
685 match(I.user_back(),
686 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
687
688 ExtractElementInst *ExtractToChange;
689 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
690 return false;
691
692 Value *ExtOp0 = Ext0->getVectorOperand();
693 Value *ExtOp1 = Ext1->getVectorOperand();
694
695 if (ExtractToChange) {
696 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
697 Value *NewExtOp =
698 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
699 if (!NewExtOp)
700 return false;
701 if (ExtractToChange == Ext0)
702 ExtOp0 = NewExtOp;
703 else
704 ExtOp1 = NewExtOp;
705 }
706
707 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
708 : Ext0->getIndexOperand();
709 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
710 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
711 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
712 Worklist.push(Ext0);
713 Worklist.push(Ext1);
714 replaceValue(I, *NewExt);
715 return true;
716}
717
718/// Try to replace an extract + scalar fneg + insert with a vector fneg +
719/// shuffle.
720bool VectorCombine::foldInsExtFNeg(Instruction &I) {
721 // Match an insert (op (extract)) pattern.
722 Value *DstVec;
723 uint64_t ExtIdx, InsIdx;
724 Instruction *FNeg;
725 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
726 m_ConstantInt(InsIdx))))
727 return false;
728
729 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
730 Value *SrcVec;
731 Instruction *Extract;
732 if (!match(FNeg, m_FNeg(m_CombineAnd(
733 m_Instruction(Extract),
734 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
735 return false;
736
737 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
738 auto *DstVecScalarTy = DstVecTy->getScalarType();
739 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
740 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
741 return false;
742
743 // Ignore if insert/extract index is out of bounds or destination vector has
744 // one element
745 unsigned NumDstElts = DstVecTy->getNumElements();
746 unsigned NumSrcElts = SrcVecTy->getNumElements();
747 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
748 return false;
749
750 // We are inserting the negated element into the same lane that we extracted
751 // from. This is equivalent to a select-shuffle that chooses all but the
752 // negated element from the destination vector.
753 SmallVector<int> Mask(NumDstElts);
754 std::iota(Mask.begin(), Mask.end(), 0);
755 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
756 InstructionCost OldCost =
757 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
758 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
759
760 // If the extract has one use, it will be eliminated, so count it in the
761 // original cost. If it has more than one use, ignore the cost because it will
762 // be the same before/after.
763 if (Extract->hasOneUse())
764 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
765
766 InstructionCost NewCost =
767 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
769 DstVecTy, Mask, CostKind);
770
771 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
772 // If the lengths of the two vectors are not equal,
773 // we need to add a length-change vector. Add this cost.
774 SmallVector<int> SrcMask;
775 if (NeedLenChg) {
776 SrcMask.assign(NumDstElts, PoisonMaskElem);
777 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
779 DstVecTy, SrcVecTy, SrcMask, CostKind);
780 }
781
782 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
783 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
784 << "\n");
785 if (NewCost > OldCost)
786 return false;
787
788 Value *NewShuf, *LenChgShuf = nullptr;
789 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
790 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
791 if (NeedLenChg) {
792 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
793 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
794 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
795 Worklist.pushValue(LenChgShuf);
796 } else {
797 // shuffle DstVec, (fneg SrcVec), Mask
798 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
799 }
800
801 Worklist.pushValue(VecFNeg);
802 replaceValue(I, *NewShuf);
803 return true;
804}
805
806/// Try to fold insert(binop(x,y),binop(a,b),idx)
807/// --> binop(insert(x,a,idx),insert(y,b,idx))
///
/// Returns true when \p I was replaced by the new vector binop, and false when
/// the pattern does not match or the cost comparison rejects the rewrite.
808bool VectorCombine::foldInsExtBinop(Instruction &I) {
809 BinaryOperator *VecBinOp, *SclBinOp;
810 uint64_t Index;
// The vector being inserted into and the scalar being inserted must each be a
// one-use binop, and the insertion index must be a constant integer.
811 if (!match(&I,
812 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
813 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
814 return false;
815
// Both binops must share one opcode so that a single vector binop can stand in
// for the pair.
816 // TODO: Add support for addlike etc.
817 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
818 if (BinOpcode != SclBinOp->getOpcode())
819 return false;
820
// Only fixed-width vector results are handled.
821 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
822 if (!ResultTy)
823 return false;
824
825 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
826 // shuffle?
827
// NOTE(review): this listing skips its own lines 828 and 830, so the head of
// the OldCost definition and whatever is added after the VecBinOp cost are not
// shown here. Restore them from the upstream file before relying on this block.
829 TTI.getInstructionCost(VecBinOp, CostKind) +
// New sequence: one vector binop plus one insertelement per binop operand.
831 InstructionCost NewCost =
832 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
833 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
834 Index, VecBinOp->getOperand(0),
835 SclBinOp->getOperand(0)) +
836 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
837 Index, VecBinOp->getOperand(1),
838 SclBinOp->getOperand(1));
839
840 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
841 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
842 << "\n");
// Equal cost is accepted; only a strictly more expensive rewrite is rejected.
843 if (NewCost > OldCost)
844 return false;
845
// Insert each scalar operand into the matching vector operand at Index, then
// apply the shared opcode to the two widened operands.
846 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
847 SclBinOp->getOperand(0), Index);
848 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
849 SclBinOp->getOperand(1), Index);
850 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
851
852 // Intersect flags from the old binops.
// The dyn_cast guards against CreateBinOp returning something other than an
// Instruction.
853 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
854 NewInst->copyIRFlags(VecBinOp);
855 NewInst->andIRFlags(SclBinOp);
856 }
857
// Push the two new inserts onto the worklist before replacing I.
858 Worklist.pushValue(NewIns0);
859 Worklist.pushValue(NewIns1);
860 replaceValue(I, *NewBO);
861 return true;
862}
863
864/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
865/// Supports: bitcast, trunc, sext, zext
///
/// Returns true when \p I was replaced by a single cast of the narrower/wider
/// bitwise op, and false when the pattern does not match or the rewrite is
/// costed as strictly more expensive.
866bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
867 // Check if this is a bitwise logic operation
868 auto *BinOp = dyn_cast<BinaryOperator>(&I);
869 if (!BinOp || !BinOp->isBitwiseLogicOp())
870 return false;
871
872 // Get the cast instructions
873 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
874 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
875 if (!LHSCast || !RHSCast) {
876 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
877 return false;
878 }
879
880 // Both casts must use the same cast opcode
881 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
882 if (CastOpcode != RHSCast->getOpcode())
883 return false;
884
885 // Only handle supported cast operations
886 switch (CastOpcode) {
887 case Instruction::BitCast:
888 case Instruction::Trunc:
889 case Instruction::SExt:
890 case Instruction::ZExt:
891 break;
892 default:
893 return false;
894 }
895
896 Value *LHSSrc = LHSCast->getOperand(0);
897 Value *RHSSrc = RHSCast->getOperand(0);
898
899 // Source types must match
// (the new bitwise op is built directly on LHSSrc and RHSSrc below).
900 if (LHSSrc->getType() != RHSSrc->getType())
901 return false;
902
903 auto *SrcTy = LHSSrc->getType();
904 auto *DstTy = I.getType();
905 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
906 // Other casts only handle vector types with integer elements.
907 if (CastOpcode != Instruction::BitCast &&
908 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
909 return false;
910
911 // Only integer scalar/vector values are legal for bitwise logic operations.
912 if (!SrcTy->getScalarType()->isIntegerTy() ||
913 !DstTy->getScalarType()->isIntegerTy())
914 return false;
915
916 // Cost Check :
917 // OldCost = bitlogic + 2*casts
918 // NewCost = bitlogic + cast
919
920 // Calculate specific costs for each cast with instruction context
// NOTE(review): this listing skips its own lines 921 and 923, so the heads of
// the LHSCastCost and RHSCastCost definitions are not shown; only their
// argument lists remain below. Restore them from the upstream file.
922 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
924 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
925
// The old bitwise op is costed on the destination type, the new one (below) on
// the source type.
926 InstructionCost OldCost =
927 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
928 LHSCastCost + RHSCastCost;
929
930 // For new cost, we can't provide an instruction (it doesn't exist yet)
931 InstructionCost GenericCastCost = TTI.getCastInstrCost(
932 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
933
934 InstructionCost NewCost =
935 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
936 GenericCastCost;
937
938 // Account for multi-use casts using specific costs
// A cast with other users is not removed by this fold, so its cost is charged
// to the new sequence as well.
939 if (!LHSCast->hasOneUse())
940 NewCost += LHSCastCost;
941 if (!RHSCast->hasOneUse())
942 NewCost += RHSCastCost;
943
944 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
945 << " NewCost=" << NewCost << "\n");
946
// Equal cost is accepted; only a strictly more expensive rewrite is rejected.
947 if (NewCost > OldCost)
948 return false;
949
950 // Create the operation on the source type
951 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
952 BinOp->getName() + ".inner");
953 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
954 NewBinOp->copyIRFlags(BinOp);
955
956 Worklist.pushValue(NewOp);
957
958 // Create the cast operation directly to ensure we get a new instruction
959 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
960
961 // Preserve cast instruction flags
// Start from the LHS cast's flags, then and them with the RHS cast's flags.
962 NewCast->copyIRFlags(LHSCast);
963 NewCast->andIRFlags(RHSCast);
964
965 // Insert the new instruction
966 Value *Result = Builder.Insert(NewCast);
967
968 replaceValue(I, *Result);
969 return true;
970}
971
972/// Match:
973/// bitop(castop(x), C) ->
974/// bitop(castop(x), castop(InvC)) ->
975/// castop(bitop(x, InvC))
976/// Supports: bitcast, zext, sext, trunc (the cast opcodes accepted by the
/// switch below), provided getLosslessInvCast can produce InvC.
///
/// Returns true when \p I was replaced, and false when the pattern does not
/// match, no InvC exists, or the rewrite is costed as strictly more expensive.
977bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
// NOTE(review): this listing skips its own line 978; LHS is used below but its
// declaration is not shown here. Restore it from the upstream file.
979 Constant *C;
980
981 // Check if this is a bitwise logic operation
// NOTE(review): this listing skips its own line 982, so the condition guarding
// the early return below (and whatever binds LHS and C) is not shown here.
983 return false;
984
985 // Get the cast instructions
986 auto *LHSCast = dyn_cast<CastInst>(LHS);
987 if (!LHSCast)
988 return false;
989
990 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
991
992 // Only handle supported cast operations
993 switch (CastOpcode) {
994 case Instruction::BitCast:
995 case Instruction::ZExt:
996 case Instruction::SExt:
997 case Instruction::Trunc:
998 break;
999 default:
1000 return false;
1001 }
1002
1003 Value *LHSSrc = LHSCast->getOperand(0);
1004
1005 auto *SrcTy = LHSSrc->getType();
1006 auto *DstTy = I.getType();
1007 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
1008 // Other casts only handle vector types with integer elements.
1009 if (CastOpcode != Instruction::BitCast &&
1010 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
1011 return false;
1012
1013 // Only integer scalar/vector values are legal for bitwise logic operations.
1014 if (!SrcTy->getScalarType()->isIntegerTy() ||
1015 !DstTy->getScalarType()->isIntegerTy())
1016 return false;
1017
1018 // Find the constant InvC, such that castop(InvC) equals to C.
// RHSFlags is passed as an out-parameter; its NNeg/NUW/NSW fields are read
// further down when the flags of the new cast are set.
1019 PreservedCastFlags RHSFlags;
1020 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
1021 if (!InvC)
1022 return false;
1023
1024 // Cost Check :
1025 // OldCost = bitlogic + cast
1026 // NewCost = bitlogic + cast
// The two differ in the type the bitwise op is costed on (DstTy vs. SrcTy) and
// in whether the cast is costed with the existing instruction as context.
1027
1028 // Calculate specific costs for each cast with instruction context
1029 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1030 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1031
1032 InstructionCost OldCost =
1033 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1034
1035 // For new cost, we can't provide an instruction (it doesn't exist yet)
1036 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1037 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1038
1039 InstructionCost NewCost =
1040 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1041 GenericCastCost;
1042
1043 // Account for multi-use casts using specific costs
// A cast with other users is not removed by this fold, so its cost is charged
// to the new sequence as well.
1044 if (!LHSCast->hasOneUse())
1045 NewCost += LHSCastCost;
1046
1047 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1048 << " NewCost=" << NewCost << "\n");
1049
// Equal cost is accepted; only a strictly more expensive rewrite is rejected.
1050 if (NewCost > OldCost)
1051 return false;
1052
1053 // Create the operation on the source type
1054 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1055 LHSSrc, InvC, I.getName() + ".inner");
1056 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1057 NewBinOp->copyIRFlags(&I);
1058
1059 Worklist.pushValue(NewOp);
1060
1061 // Create the cast operation directly to ensure we get a new instruction
1062 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1063
1064 // Preserve cast instruction flags
// First set the flags reported for the inverted constant, then and the result
// with the flags of the original cast.
1065 if (RHSFlags.NNeg)
1066 NewCast->setNonNeg();
1067 if (RHSFlags.NUW)
1068 NewCast->setHasNoUnsignedWrap();
1069 if (RHSFlags.NSW)
1070 NewCast->setHasNoSignedWrap();
1071
1072 NewCast->andIRFlags(LHSCast);
1073
1074 // Insert the new instruction
1075 Value *Result = Builder.Insert(NewCast);
1076
1077 replaceValue(I, *Result);
1078 return true;
1079}
1080
1081/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1082/// destination type followed by shuffle. This can enable further transforms by
1083/// moving bitcasts or shuffles together.
///
/// Returns true when \p I was replaced by a shuffle of bitcasts, and false when
/// the pattern does not match, the mask cannot be rescaled, or the new cost is
/// invalid or strictly higher than the old cost.
1084bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1085 Value *V0, *V1;
1086 ArrayRef<int> Mask;
// The shuffle feeding the bitcast must have no other users.
1087 if (!match(&I, m_BitCast(m_OneUse(
1088 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1089 return false;
1090
1091 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1092 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1093 // mask for scalable type is a splat or not.
1094 // 2) Disallow non-vector casts.
1095 // TODO: We could allow any shuffle.
1096 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1097 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1098 if (!DestTy || !SrcTy)
1099 return false;
1100
1101 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1102 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
// The whole source vector must split evenly into destination-sized elements;
// NumSrcElts below is computed from the same quotient.
1103 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1104 return false;
1105
1106 bool IsUnary = isa<UndefValue>(V1);
1107
1108 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1109 // if it won't increase the number of bitcasts.
1110 if (!IsUnary) {
// NOTE(review): this listing skips its own lines 1111-1112; BCTy0 and BCTy1
// are used below but their definitions are not shown here. Restore them from
// the upstream file.
1113 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1114 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1115 return false;
1116 }
1117
1118 SmallVector<int, 16> NewMask;
1119 if (DestEltSize <= SrcEltSize) {
1120 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1121 // always be expanded to the equivalent form choosing narrower elements.
1122 if (SrcEltSize % DestEltSize != 0)
1123 return false;
1124 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1125 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1126 } else {
1127 // The bitcast is from narrow elements to wide elements. The shuffle mask
1128 // must choose consecutive elements to allow casting first.
1129 if (DestEltSize % SrcEltSize != 0)
1130 return false;
1131 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1132 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1133 return false;
1134 }
1135
1136 // Bitcast the shuffle src - keep its original width but using the destination
1137 // scalar type.
1138 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1139 auto *NewShuffleTy =
1140 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
// Result type of the existing shuffle: source element type, one lane per mask
// entry.
1141 auto *OldShuffleTy =
1142 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
// Number of input bitcasts charged to the new sequence.
1143 unsigned NumOps = IsUnary ? 1 : 2;
1144
1145 // The new shuffle must not cost more than the old shuffle.
// NOTE(review): this listing skips its own lines 1146-1148; SK is used in both
// getShuffleCost calls below but its definition is not shown here.
1149
1150 InstructionCost NewCost =
1151 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1152 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1153 TargetTransformInfo::CastContextHint::None,
1154 CostKind));
1155 InstructionCost OldCost =
1156 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1157 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1158 TargetTransformInfo::CastContextHint::None,
1159 CostKind);
1160
1161 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1162 << OldCost << " vs NewCost: " << NewCost << "\n");
1163
// Equal cost is accepted; an invalid new cost is always rejected.
1164 if (NewCost > OldCost || !NewCost.isValid())
1165 return false;
1166
1167 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1168 ++NumShufOfBitcast;
// Both inputs are bitcast here regardless of IsUnary; each is first passed
// through peekThroughBitcasts.
1169 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1170 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1171 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1172 replaceValue(I, *Shuf);
1173 return true;
1174}
1175
1176/// VP Intrinsics whose vector operands are both splat values may be simplified
1177/// into the scalar version of the operation and the result splatted. This
1178/// can lead to scalarization down the line.
///
/// Returns true when \p I was replaced by a splat of the scalar result, and
/// false when any precondition below fails or the scalar form is costed as
/// more expensive.
1179bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1180 if (!isa<VPIntrinsic>(I))
1181 return false;
1182 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
// Arguments 0 and 1 are the two vector inputs; argument 2 is used as the mask
// and argument 3 as the EVL further down.
1183 Value *Op0 = VPI.getArgOperand(0);
1184 Value *Op1 = VPI.getArgOperand(1);
1185
1186 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1187 return false;
1188
1189 // Check getSplatValue early in this function, to avoid doing unnecessary
1190 // work.
1191 Value *ScalarOp0 = getSplatValue(Op0);
1192 Value *ScalarOp1 = getSplatValue(Op1);
1193 if (!ScalarOp0 || !ScalarOp1)
1194 return false;
1195
1196 // For the binary VP intrinsics supported here, the result on disabled lanes
1197 // is a poison value. For now, only do this simplification if all lanes
1198 // are active.
1199 // TODO: Relax the condition that all lanes are active by using insertelement
1200 // on inactive lanes.
// True only when the mask is a splat of a constant that is all-ones.
1201 auto IsAllTrueMask = [](Value *MaskVal) {
1202 if (Value *SplattedVal = getSplatValue(MaskVal))
1203 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1204 return ConstValue->isAllOnesValue();
1205 return false;
1206 };
1207 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1208 return false;
1209
1210 // Check to make sure we support scalarization of the intrinsic
1211 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1212 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1213 return false;
1214
1215 // Calculate cost of splatting both operands into vectors and the vector
1216 // intrinsic
1217 VectorType *VecTy = cast<VectorType>(VPI.getType());
// Mask is only populated (all zeros) for fixed-width vectors; it stays empty
// otherwise.
1218 SmallVector<int> Mask;
1219 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1220 Mask.resize(FVTy->getNumElements(), 0);
1221 InstructionCost SplatCost =
1222 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
// NOTE(review): this listing skips its own line 1223, so the head of the second
// SplatCost addend is not shown; only its trailing argument remains below.
1224 CostKind);
1225
1226 // Calculate the cost of the VP Intrinsic
// NOTE(review): this listing skips its own line 1227; Args is used below but
// its declaration is not shown here.
1228 for (Value *V : VPI.args())
1229 Args.push_back(V->getType());
1230 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1231 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
// Old sequence: two operand splats plus the vector VP intrinsic.
1232 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1233
1234 // Determine scalar opcode
// Prefer a plain instruction opcode; fall back to a functional intrinsic ID and
// give up if neither is available.
1235 std::optional<unsigned> FunctionalOpcode =
1236 VPI.getFunctionalOpcode();
1237 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1238 if (!FunctionalOpcode) {
1239 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1240 if (!ScalarIntrID)
1241 return false;
1242 }
1243
1244 // Calculate cost of scalarizing
1245 InstructionCost ScalarOpCost = 0;
1246 if (ScalarIntrID) {
// NOTE(review): Args was filled above with the VP call's own argument types and
// is reused here for the scalar intrinsic query; confirm that is intended.
1247 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1248 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1249 } else {
1250 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1251 VecTy->getScalarType(), CostKind);
1252 }
1253
1254 // The existing splats may be kept around if other instructions use them.
// Each bool converts to 0 or 1, so a splat's cost is added only when that
// operand has more than one use.
1255 InstructionCost CostToKeepSplats =
1256 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
// New sequence: the scalar op, one splat of its result, and any retained
// operand splats.
1257 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1258
1259 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1260 << "\n");
1261 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1262 << ", Cost of scalarizing:" << NewCost << "\n");
1263
1264 // We want to scalarize unless the vector variant actually has lower cost.
1265 if (OldCost < NewCost || !NewCost.isValid())
1266 return false;
1267
1268 // Scalarize the intrinsic
1269 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1270 Value *EVL = VPI.getArgOperand(3);
1271
1272 // If the VP op might introduce UB or poison, we can scalarize it provided
1273 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1274 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1275 // scalarizing it.
1276 bool SafeToSpeculate;
1277 if (ScalarIntrID)
1278 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1279 .hasAttribute(Attribute::AttrKind::Speculatable);
1280 else
// NOTE(review): this listing skips its own line 1281, so the head of the
// assignment in this else branch is not shown; only its arguments remain.
1282 *FunctionalOpcode, &VPI, nullptr, SQ.AC, SQ.DT);
1283 if (!SafeToSpeculate &&
1284 !isKnownNonZero(EVL, SimplifyQuery(*DL, SQ.DT, SQ.AC, &VPI)))
1285 return false;
1286
// Emit either the scalar intrinsic call or the scalar binop on the two splatted
// values, then splat the result back to the original element count.
1287 Value *ScalarVal =
1288 ScalarIntrID
1289 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1290 {ScalarOp0, ScalarOp1})
1291 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1292 ScalarOp0, ScalarOp1);
1293
1294 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1295 return true;
1296}
1297
1298/// Match a vector op/compare/intrinsic with at least one
1299/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1300/// by insertelement.
///
/// Returns true when \p I was replaced by an insertelement of the scalar
/// result, and false when the operands do not fit the pattern, the constant
/// base vector cannot be simplified, or the scalar form is costed as more
/// expensive.
1301bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1302 auto *UO = dyn_cast<UnaryOperator>(&I);
1303 auto *BO = dyn_cast<BinaryOperator>(&I);
1304 auto *CI = dyn_cast<CmpInst>(&I);
1305 auto *II = dyn_cast<IntrinsicInst>(&I);
1306 if (!UO && !BO && !CI && !II)
1307 return false;
1308
1309 // TODO: Allow intrinsics with different argument types
// Every intrinsic argument must either have the call's result type or sit in a
// position the target reports as a scalar operand.
1310 if (II) {
1311 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1312 return false;
1313 for (auto [Idx, Arg] : enumerate(II->args()))
1314 if (Arg->getType() != II->getType() &&
1315 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1316 return false;
1317 }
1318
1319 // Do not convert the vector condition of a vector select into a scalar
1320 // condition. That may cause problems for codegen because of differences in
1321 // boolean formats and register-file transfers.
1322 // TODO: Can we account for that in the cost model?
1323 if (CI)
1324 for (User *U : I.users())
1325 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1326 return false;
1327
1328 // Match constant vectors or scalars being inserted into constant vectors:
1329 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
// VecCs and ScalarOps are filled in lock-step, one entry per operand; a null
// ScalarOps entry marks an operand that is a plain constant vector.
1330 SmallVector<Value *> VecCs, ScalarOps;
// Stays empty until the first insertelement operand is seen.
1331 std::optional<uint64_t> Index;
1332
1333 auto Ops = II ? II->args() : I.operands();
1334 for (auto [OpNum, Op] : enumerate(Ops)) {
1335 Constant *VecC;
1336 Value *V;
1337 uint64_t InsIdx = 0;
1338 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1339 m_ConstantInt(InsIdx)))) {
1340 // Bail if any inserts are out of bounds.
1341 VectorType *OpTy = cast<VectorType>(Op->getType());
1342 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1343 return false;
1344 // All inserts must have the same index.
1345 // TODO: Deal with mismatched index constants and variable indexes?
1346 if (!Index)
1347 Index = InsIdx;
1348 else if (InsIdx != *Index)
1349 return false;
1350 VecCs.push_back(VecC);
1351 ScalarOps.push_back(V);
1352 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1353 OpNum, &TTI)) {
// A scalar intrinsic argument is recorded unchanged in both lists.
1354 VecCs.push_back(Op.get());
1355 ScalarOps.push_back(Op.get());
1356 } else if (match(Op.get(), m_Constant(VecC))) {
1357 VecCs.push_back(VecC);
1358 ScalarOps.push_back(nullptr);
1359 } else {
1360 return false;
1361 }
1362 }
1363
1364 // Bail if all operands are constant.
// (Index is only set by an insertelement operand in the loop above.)
1365 if (!Index.has_value())
1366 return false;
1367
1368 VectorType *VecTy = cast<VectorType>(I.getType());
1369 Type *ScalarTy = VecTy->getScalarType();
1370 assert(VecTy->isVectorTy() &&
1371 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1372 ScalarTy->isPointerTy()) &&
1373 "Unexpected types for insert element into binop or cmp");
1374
// Cost the operation itself in both its scalar and vector forms, using the
// query that matches the instruction kind.
1375 unsigned Opcode = I.getOpcode();
1376 InstructionCost ScalarOpCost, VectorOpCost;
1377 if (CI) {
1378 CmpInst::Predicate Pred = CI->getPredicate();
1379 ScalarOpCost = TTI.getCmpSelInstrCost(
1380 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1381 VectorOpCost = TTI.getCmpSelInstrCost(
1382 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1383 } else if (UO || BO) {
1384 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1385 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1386 } else {
// Intrinsic case: every argument type is approximated by the result type
// (ScalarTy or VecTy) in the cost query.
1387 IntrinsicCostAttributes ScalarICA(
1388 II->getIntrinsicID(), ScalarTy,
1389 SmallVector<Type *>(II->arg_size(), ScalarTy));
1390 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1391 IntrinsicCostAttributes VectorICA(
1392 II->getIntrinsicID(), VecTy,
1393 SmallVector<Type *>(II->arg_size(), VecTy));
1394 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1395 }
1396
1397 // Fold the vector constants in the original vectors into a new base vector to
1398 // get more accurate cost modelling.
1399 Value *NewVecC = nullptr;
1400 if (CI)
1401 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1402 else if (UO)
1403 NewVecC =
1404 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1405 else if (BO)
1406 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1407 else if (II)
1408 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1409
// Without a simplified base vector there is nothing to insert the scalar result
// into.
1410 if (!NewVecC)
1411 return false;
1412
1413 // Get cost estimate for the insert element. This cost will factor into
1414 // both sequences.
1415 InstructionCost OldCost = VectorOpCost;
1416 InstructionCost NewCost =
1417 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1418 CostKind, *Index, NewVecC);
1419
// Operands with no inserted scalar, and scalar intrinsic arguments, contribute
// no insertelement cost.
1420 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1421 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1422 II->getIntrinsicID(), Idx, &TTI)))
1423 continue;
// NOTE(review): this listing skips its own line 1424, so the head of the
// InsertCost definition is not shown; only its arguments remain below.
1425 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
// The old sequence always pays for the insert; the new one pays only when the
// insert has other users.
1426 OldCost += InsertCost;
1427 NewCost += !Op->hasOneUse() * InsertCost;
1428 }
1429
1430 // We want to scalarize unless the vector variant actually has lower cost.
1431 if (OldCost < NewCost || !NewCost.isValid())
1432 return false;
1433
1434 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1435 // inselt NewVecC, (scalar_op V0, V1), Index
1436 if (CI)
1437 ++NumScalarCmp;
1438 else if (UO || BO)
1439 ++NumScalarOps;
1440 else
1441 ++NumScalarIntrinsic;
1442
1443 // For constant cases, extract the scalar element, this should constant fold.
1444 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1445 if (!Scalar)
// NOTE(review): this listing skips its own line 1446, so the head of the
// statement executed for null entries is not shown; only its arguments remain.
1447 cast<Constant>(VecC), Builder.getInt64(*Index));
1448
// Build the scalar counterpart of I from the collected scalar operands.
1449 Value *Scalar;
1450 if (CI)
1451 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1452 else if (UO || BO)
1453 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1454 else
1455 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1456
1457 Scalar->setName(I.getName() + ".scalar");
1458
1459 // All IR flags are safe to back-propagate. There is no potential for extra
1460 // poison to be created by the scalar instruction.
1461 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1462 ScalarInst->copyIRFlags(&I);
1463
1464 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1465 replaceValue(I, *Insert);
1466 return true;
1467}
1468
1469/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1470/// a vector into vector operations followed by extract. Note: The SLP pass
1471/// may miss this pattern because of implementation problems.
///
/// Returns true when \p I was replaced by an extract from the vector logic op,
/// and false when the pattern does not match or the vector form is costed as
/// more expensive.
1472bool VectorCombine::foldExtractedCmps(Instruction &I) {
1473 auto *BI = dyn_cast<BinaryOperator>(&I);
1474
1475 // We are looking for a scalar binop of booleans.
1476 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1477 if (!BI || !I.getType()->isIntegerTy(1))
1478 return false;
1479
1480 // The compare predicates should match, and each compare should have a
1481 // constant operand.
1482 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1483 Instruction *I0, *I1;
1484 Constant *C0, *C1;
1485 CmpPredicate P0, P1;
1486 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1487 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1488 return false;
1489
// Bail when getMatching yields no common predicate for the two compares.
1490 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1491 if (!MatchingPred)
1492 return false;
1493
1494 // The compare operands must be extracts of the same vector with constant
1495 // extract indexes.
1496 Value *X;
1497 uint64_t Index0, Index1;
1498 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1499 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1500 return false;
1501
1502 auto *Ext0 = cast<ExtractElementInst>(I0);
1503 auto *Ext1 = cast<ExtractElementInst>(I1);
// getShuffleExtract names the extract to be replaced by a shuffle; a null
// result ends the fold.
1504 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1505 if (!ConvertToShuf)
1506 return false;
1507 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1508 "Unknown ExtractElementInst");
1509
1510 // The original scalar pattern is:
1511 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1512 CmpInst::Predicate Pred = *MatchingPred;
1513 unsigned CmpOpcode =
1514 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1515 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1516 if (!VecTy)
1517 return false;
1518
// Both indexes are later used to subscript arrays sized by the element count,
// so out-of-range extracts are rejected here.
1519 if (Index0 >= VecTy->getNumElements() || Index1 >= VecTy->getNumElements())
1520 return false;
1521
1522 InstructionCost Ext0Cost =
1523 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1524 InstructionCost Ext1Cost =
1525 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
// NOTE(review): this listing skips its own line 1526, so the head of the
// CmpCost definition is not shown; only its arguments remain below.
1527 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1528 CostKind);
1529
// Old sequence: two extracts, two scalar compares and the scalar binop.
1530 InstructionCost OldCost =
1531 Ext0Cost + Ext1Cost + CmpCost * 2 +
1532 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1533
1534 // The proposed vector pattern is:
1535 // vcmp = cmp Pred X, VecC
1536 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
// ExpensiveIndex is the lane of the extract chosen for conversion to a shuffle;
// CheapIndex is the other lane, where the final extract reads its result.
1537 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1538 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
// NOTE(review): this listing skips its own lines 1539-1540; CmpTy and NewCost
// are used below but their definitions are not shown here.
1541 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
// Mask that moves lane ExpensiveIndex into lane CheapIndex; every other lane is
// poison.
1542 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1543 ShufMask[CheapIndex] = ExpensiveIndex;
// NOTE(review): this listing skips its own line 1544, so the head of the
// statement that consumes ShufMask is not shown; only its arguments remain.
1545 CmpTy, ShufMask, CostKind);
1546 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1547 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
// An extract with other users is not removed, so its cost is still paid.
1548 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1549 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1550
1551 // Aggressively form vector ops if the cost is equal because the transform
1552 // may enable further optimization.
1553 // Codegen can reverse this transform (scalarize) if it was not profitable.
1554 if (OldCost < NewCost || !NewCost.isValid())
1555 return false;
1556
1557 // Create a vector constant from the 2 scalar constants.
// All other lanes are poison. If Index0 == Index1, the second store overwrites
// the first.
1558 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1559 PoisonValue::get(VecTy->getElementType()));
1560 CmpC[Index0] = C0;
1561 CmpC[Index1] = C1;
1562 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1563 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
// Keep the operand order of the original binop: the shuffled value takes the
// place of whichever compare used the converted extract.
1564 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1565 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1566 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1567 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1568 replaceValue(I, *NewExt);
1569 ++NumVecCmpBO;
1570 return true;
1571}
1572
1573/// Try to fold scalar selects that select between extracted elements and zero
1574/// into extracting from a vector select. This is rooted at the bitcast.
1575///
1576/// This pattern arises when a vector is bitcast to a smaller element type,
1577/// elements are extracted, and then conditionally selected with zero:
1578///
1579/// %bc = bitcast <4 x i32> %src to <16 x i8>
1580/// %e0 = extractelement <16 x i8> %bc, i32 0
1581/// %s0 = select i1 %cond, i8 %e0, i8 0
1582/// %e1 = extractelement <16 x i8> %bc, i32 1
1583/// %s1 = select i1 %cond, i8 %e1, i8 0
1584/// ...
1585///
1586/// Transforms to:
1587/// %sel = select i1 %cond, <4 x i32> %src, <4 x i32> zeroinitializer
1588/// %bc = bitcast <4 x i32> %sel to <16 x i8>
1589/// %e0 = extractelement <16 x i8> %bc, i32 0
1590/// %e1 = extractelement <16 x i8> %bc, i32 1
1591/// ...
1592///
1593/// This is profitable because vector select on wider types produces fewer
1594/// select/cndmask instructions than scalar selects on each element.
///
/// Returns true when at least one scalar select was replaced, false otherwise.
1595bool VectorCombine::foldSelectsFromBitcast(Instruction &I) {
1596 auto *BC = dyn_cast<BitCastInst>(&I);
1597 if (!BC)
1598 return false;
1599
1600 FixedVectorType *SrcVecTy = dyn_cast<FixedVectorType>(BC->getSrcTy());
1601 FixedVectorType *DstVecTy = dyn_cast<FixedVectorType>(BC->getDestTy());
1602 if (!SrcVecTy || !DstVecTy)
1603 return false;
1604
1605 // Source must be 32-bit or 64-bit elements, destination must be smaller
1606 // integer elements. Zero in all these types is all-bits-zero.
1607 Type *SrcEltTy = SrcVecTy->getElementType();
1608 Type *DstEltTy = DstVecTy->getElementType();
1609 unsigned SrcEltBits = SrcEltTy->getPrimitiveSizeInBits();
1610 unsigned DstEltBits = DstEltTy->getPrimitiveSizeInBits();
1611
1612 if (SrcEltBits != 32 && SrcEltBits != 64)
1613 return false;
1614
1615 if (!DstEltTy->isIntegerTy() || DstEltBits >= SrcEltBits)
1616 return false;
1617
1618 // Check profitability using TTI before collecting users.
1619 Type *CondTy = CmpInst::makeCmpResultType(DstEltTy);
1620 Type *VecCondTy = CmpInst::makeCmpResultType(SrcVecTy);
1621
// NOTE(review): this listing skips its own lines 1624 and 1627, so the trailing
// arguments of both getCmpSelInstrCost calls below are not shown here. Restore
// them from the upstream file.
1622 InstructionCost ScalarSelCost =
1623 TTI.getCmpSelInstrCost(Instruction::Select, DstEltTy, CondTy,
1625 InstructionCost VecSelCost =
1626 TTI.getCmpSelInstrCost(Instruction::Select, SrcVecTy, VecCondTy,
1628
1629 // We need at least this many selects for vectorization to be profitable.
1630 // VecSelCost < ScalarSelCost * NumSelects => NumSelects > VecSelCost /
1631 // ScalarSelCost
// The zero test also protects the division below.
1632 if (!ScalarSelCost.isValid() || ScalarSelCost == 0)
1633 return false;
1634
// NOTE(review): VecSelCost.getValue() is called without a visible isValid()
// check on VecSelCost; confirm whether an invalid vector cost can reach here.
1635 unsigned MinSelects = (VecSelCost.getValue() / ScalarSelCost.getValue()) + 1;
1636
1637 // Quick check: if bitcast doesn't have enough users, bail early.
1638 if (!BC->hasNUsesOrMore(MinSelects))
1639 return false;
1640
1641 // Collect all select users that match the pattern, grouped by condition.
1642 // Pattern: select i1 %cond, (extractelement %bc, idx), 0
1643 DenseMap<Value *, SmallVector<SelectInst *, 8>> CondToSelects;
1644
1645 for (User *U : BC->users()) {
1646 auto *Ext = dyn_cast<ExtractElementInst>(U);
1647 if (!Ext)
1648 continue;
1649
1650 for (User *ExtUser : Ext->users()) {
1651 Value *Cond;
1652 // Match: select i1 %cond, %ext, 0
// Only scalar i1 conditions are collected; the extract must be the true value
// and zero the false value.
1653 if (match(ExtUser, m_Select(m_Value(Cond), m_Specific(Ext), m_Zero())) &&
1654 Cond->getType()->isIntegerTy(1))
1655 CondToSelects[Cond].push_back(cast<SelectInst>(ExtUser));
1656 }
1657 }
1658
1659 if (CondToSelects.empty())
1660 return false;
1661
1662 bool MadeChange = false;
1663 Value *SrcVec = BC->getOperand(0);
1664
1665 // Process each group of selects with the same condition.
// NOTE(review): the by-value structured binding copies each map entry,
// including its SmallVector; a reference binding would avoid that. The
// iteration order of a pointer-keyed DenseMap is presumably not stable across
// runs; confirm whether the order of the emitted instructions matters.
1666 for (auto [Cond, Selects] : CondToSelects) {
1667 // Only profitable if vector select cost < total scalar select cost.
1668 if (Selects.size() < MinSelects) {
1669 LLVM_DEBUG(dbgs() << "VectorCombine: foldSelectsFromBitcast not "
1670 << "profitable (VecCost=" << VecSelCost
1671 << ", ScalarCost=" << ScalarSelCost
1672 << ", NumSelects=" << Selects.size() << ")\n");
1673 continue;
1674 }
1675
1676 // Create the vector select and bitcast once for this condition.
// Default insertion point: immediately after the original bitcast.
1677 auto InsertPt = std::next(BC->getIterator());
1678
// When the bitcast dominates the instruction defining the condition, insert
// right after that instruction instead.
1679 if (auto *CondInst = dyn_cast<Instruction>(Cond))
1680 if (DT.dominates(BC, CondInst))
1681 InsertPt = std::next(CondInst->getIterator());
1682
1683 Builder.SetInsertPoint(InsertPt);
1684 Value *VecSel =
1685 Builder.CreateSelect(Cond, SrcVec, Constant::getNullValue(SrcVecTy));
1686 Value *NewBC = Builder.CreateBitCast(VecSel, DstVecTy);
1687
1688 // Replace each scalar select with an extract from the new bitcast.
1689 for (SelectInst *Sel : Selects) {
// The true value was matched as an extract when the select was collected, so
// the cast is expected to succeed.
1690 auto *Ext = cast<ExtractElementInst>(Sel->getTrueValue());
1691 Value *Idx = Ext->getIndexOperand();
1692
// Emit the new extract at the position of the select it replaces.
1693 Builder.SetInsertPoint(Sel);
1694 Value *NewExt = Builder.CreateExtractElement(NewBC, Idx);
1695 replaceValue(*Sel, *NewExt);
1696 MadeChange = true;
1697 }
1698
1699 LLVM_DEBUG(dbgs() << "VectorCombine: folded " << Selects.size()
1700 << " selects into vector select\n");
1701 }
1702
1703 return MadeChange;
1704}
1705
// Cost-model helper for a vector reduction intrinsic call II. Judging by the
// call sites in foldBinopOfReductions it is invoked as
//   analyzeCostOfVecReduction(II, CostKind, TTI, CostBefore, CostAfter).
// CostAfterReduction is written on every path. CostBeforeReduction is written
// only on the two extend-matching paths below and is otherwise left untouched,
// so the caller has to initialize it.
// NOTE(review): the opening of the signature (embedded listing lines
// 1706-1707) is missing from this listing, and so are the continuation lines
// 1722, 1730, 1742 and 1747 of calls further down. Restore them from the
// upstream file before building.
1708 const TargetTransformInfo &TTI,
1709 InstructionCost &CostBeforeReduction,
1710 InstructionCost &CostAfterReduction) {
1711 Instruction *Op0, *Op1;
  // RedOp is null when the reduced operand is not an instruction; both
  // pattern checks below are then skipped.
1712 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1713 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1714 unsigned ReductionOpc =
1715 getArithmeticReductionInstruction(II.getIntrinsicID());
  // Case 1: the reduced value is itself a zext/sext. Report the cost of that
  // extend as the "before" cost and query an extended-reduction cost over the
  // narrow source type as the "after" cost.
1716 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1717 bool IsUnsigned = isa<ZExtInst>(RedOp);
1718 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1719
1720 CostBeforeReduction =
1721 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1723 CostAfterReduction =
1724 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1725 ExtType, FastMathFlags(), CostKind);
1726 return;
1727 }
  // Case 2 (add reductions only).
  // NOTE(review): this statement is placed after an unconditional return for
  // any RedOp that matches m_ZExtOrSExt, and the listing line that binds
  // Op0/Op1 (1730) is missing, so whether and when this path is taken cannot
  // be judged from here; confirm against upstream.
  // The visible conditions require Op0 to be a zext/sext, Op0 and Op1 to
  // have the same opcode and the same source type, and Op0 to have the same
  // opcode as RedOp unless Op0 and Op1 are the same instruction.
1728 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1729 match(RedOp,
1731 match(Op0, m_ZExtOrSExt(m_Value())) &&
1732 Op0->getOpcode() == Op1->getOpcode() &&
1733 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1734 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1735 // Matched reduce.add(ext(mul(ext(A), ext(B))))
1736 bool IsUnsigned = isa<ZExtInst>(Op0);
1737 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1738 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1739
1740 InstructionCost ExtCost =
1741 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1743 InstructionCost MulCost =
1744 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1745 InstructionCost Ext2Cost =
1746 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1748
  // "Before" cost: the two inner extends, the multiply and the outer extend.
1749 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1750 CostAfterReduction = TTI.getMulAccReductionCost(
1751 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1752 return;
1753 }
  // Fallback: cost a plain arithmetic reduction over the reduced vector type.
  // CostBeforeReduction is not assigned on this path.
1754 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1755 std::nullopt, CostKind);
1756}
1757
1758bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1759 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1760 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1761 if (BinOpOpc == Instruction::Sub)
1762 ReductionIID = Intrinsic::vector_reduce_add;
1763 if (ReductionIID == Intrinsic::not_intrinsic)
1764 return false;
1765 // FP reductions have a start-value operand that this fold doesn't handle.
1766 if (ReductionIID == Intrinsic::vector_reduce_fadd ||
1767 ReductionIID == Intrinsic::vector_reduce_fmul)
1768 return false;
1769
1770 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1771 Intrinsic::ID IID) -> Value * {
1772 auto *II = dyn_cast<IntrinsicInst>(V);
1773 if (!II)
1774 return nullptr;
1775 if (II->getIntrinsicID() == IID && II->hasOneUse())
1776 return II->getArgOperand(0);
1777 return nullptr;
1778 };
1779
1780 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1781 if (!V0)
1782 return false;
1783 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1784 if (!V1)
1785 return false;
1786
1787 auto *VTy = cast<VectorType>(V0->getType());
1788 if (V1->getType() != VTy)
1789 return false;
1790 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1791 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1792 unsigned ReductionOpc =
1793 getArithmeticReductionInstruction(II0.getIntrinsicID());
1794
1795 InstructionCost OldCost = 0;
1796 InstructionCost NewCost = 0;
1797 InstructionCost CostOfRedOperand0 = 0;
1798 InstructionCost CostOfRed0 = 0;
1799 InstructionCost CostOfRedOperand1 = 0;
1800 InstructionCost CostOfRed1 = 0;
1801 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1802 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1803 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1804 NewCost =
1805 CostOfRedOperand0 + CostOfRedOperand1 +
1806 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1807 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1808 if (NewCost >= OldCost || !NewCost.isValid())
1809 return false;
1810
1811 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1812 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1813 << "\n");
1814 Value *VectorBO;
1815 if (BinOpOpc == Instruction::Or)
1816 VectorBO = Builder.CreateOr(V0, V1, "",
1817 cast<PossiblyDisjointInst>(I).isDisjoint());
1818 else
1819 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1820
1821 Value *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1822 replaceValue(I, *Rdx);
1823 return true;
1824}
1825
1826// Check whether the memory location Loc may be modified by any instruction in
// the iterator range [Begin, End). The one visible caller,
// foldSingleElementStore, passes two iterators into the same basic block and
// refers to this helper as isMemModifiedBetween.
// Returns true if alias analysis reports a Mod effect on Loc for some
// instruction, and also (conservatively) once more than MaxInstrsToScan
// instructions have been looked at.
// NOTE(review): the opening of the signature (embedded listing lines
// 1827-1828) is missing from this listing; restore it from upstream.
1829 const MemoryLocation &Loc, AAResults &AA) {
1830 unsigned NumScanned = 0;
1831 return std::any_of(Begin, End, [&](const Instruction &Instr) {
  // The scan counter is only bumped when the instruction does not modify
  // Loc, because of the short-circuiting ||.
1832 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1833 ++NumScanned > MaxInstrsToScan;
1834 });
1835}
1836
1837namespace {
1838/// Helper class to indicate whether a vector index can be safely scalarized and
1839/// if a freeze needs to be inserted.
1840class ScalarizationResult {
1841 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1842
1843 StatusTy Status;
1844 Value *ToFreeze;
1845
1846 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1847 : Status(Status), ToFreeze(ToFreeze) {}
1848
1849public:
1850 ScalarizationResult(const ScalarizationResult &Other) = default;
1851 ~ScalarizationResult() {
1852 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1853 }
1854
1855 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1856 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1857 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1858 return {StatusTy::SafeWithFreeze, ToFreeze};
1859 }
1860
1861 /// Returns true if the index can be scalarize without requiring a freeze.
1862 bool isSafe() const { return Status == StatusTy::Safe; }
1863 /// Returns true if the index cannot be scalarized.
1864 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1865 /// Returns true if the index can be scalarize, but requires inserting a
1866 /// freeze.
1867 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1868
1869 /// Reset the state of Unsafe and clear ToFreze if set.
1870 void discard() {
1871 ToFreeze = nullptr;
1872 Status = StatusTy::Unsafe;
1873 }
1874
1875 /// Freeze the ToFreeze and update the use in \p User to use it.
1876 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1877 assert(isSafeWithFreeze() &&
1878 "should only be used when freezing is required");
1879 assert(is_contained(ToFreeze->users(), &UserI) &&
1880 "UserI must be a user of ToFreeze");
1881 IRBuilder<>::InsertPointGuard Guard(Builder);
1882 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1883 Value *Frozen =
1884 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1885 for (Use &U : make_early_inc_range((UserI.operands())))
1886 if (U.get() == ToFreeze)
1887 U.set(Frozen);
1888
1889 ToFreeze = nullptr;
1890 }
1891};
1892} // namespace
1893
1894/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1895/// Idx. \p Idx must access a valid vector element.
1896static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1897 const SimplifyQuery &SQ) {
1898 // We do checks for both fixed vector types and scalable vector types.
1899 // This is the number of elements of fixed vector types,
1900 // or the minimum number of elements of scalable vector types.
1901 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1902 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1903
1904 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1905 if (C->getValue().ult(NumElements))
1906 return ScalarizationResult::safe();
1907 return ScalarizationResult::unsafe();
1908 }
1909
1910 // Always unsafe if the index type can't handle all inbound values.
1911 if (!llvm::isUIntN(IntWidth, NumElements))
1912 return ScalarizationResult::unsafe();
1913
1914 APInt Zero(IntWidth, 0);
1915 APInt MaxElts(IntWidth, NumElements);
1916 ConstantRange ValidIndices(Zero, MaxElts);
1917 ConstantRange IdxRange(IntWidth, true);
1918
1919 if (isGuaranteedNotToBePoison(Idx, SQ.AC, SQ.CxtI, SQ.DT)) {
1920 if (ValidIndices.contains(
1921 computeConstantRange(Idx, /*ForSigned=*/false, SQ)))
1922 return ScalarizationResult::safe();
1923 return ScalarizationResult::unsafe();
1924 }
1925
1926 // If the index may be poison, check if we can insert a freeze before the
1927 // range of the index is restricted.
1928 Value *IdxBase;
1929 ConstantInt *CI;
1930 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1931 IdxRange = IdxRange.binaryAnd(CI->getValue());
1932 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1933 IdxRange = IdxRange.urem(CI->getValue());
1934 }
1935
1936 if (ValidIndices.contains(IdxRange))
1937 return ScalarizationResult::safeWithFreeze(IdxBase);
1938 return ScalarizationResult::unsafe();
1939}
1940
1941/// The memory operation on a vector of \p ScalarType had alignment of
1942/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1943/// alignment that will be valid for the memory operation on a single scalar
1944/// element of the same type with index \p Idx.
// Callers in this file refer to this helper as
// computeAlignmentAfterScalarization(VectorAlignment, ScalarType, Idx, DL).
// NOTE(review): the opening of the signature (embedded listing line 1945) is
// missing from this listing; restore it from upstream.
1946 Type *ScalarType, Value *Idx,
1947 const DataLayout &DL) {
  // Constant index: the element sits at byte offset Idx * store-size, so use
  // the alignment common to the vector alignment and that offset.
1948 if (auto *C = dyn_cast<ConstantInt>(Idx))
1949 return commonAlignment(VectorAlignment,
1950 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
  // Variable index: fall back to the alignment common to the vector alignment
  // and a single element's store size.
1951 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1952}
1953
1954// Combine patterns like:
1955// %0 = load <4 x i32>, ptr %a
1956// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1957// store <4 x i32> %1, ptr %a
1958// to:
1959// %gep = getelementptr inbounds <4 x i32>, ptr %a, i32 0, i32 1
1960// store i32 %b, ptr %gep
1961// (both GEP indices take the type of the insertelement index operand).
// NOTE(review): embedded listing lines 1963 (the condition guarding the first
// `return false`), 1971 (presumably the declaration of `Source`) and 2012
// (a statement between replaceValue and `return true`) are missing from this
// listing; restore them from upstream.
1962bool VectorCombine::foldSingleElementStore(Instruction &I) {
1964 return false;
1965 auto *SI = cast<StoreInst>(&I);
  // Only simple (non-atomic, non-volatile) stores of a vector value qualify.
1966 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1967 return false;
1968
1969 // TODO: Combine more complicated patterns (multiple insert) by referencing
1970 // TargetTransformInfo.
1972 Value *NewElement;
1973 Value *Idx;
  // The stored value must be an insertelement into some instruction Source.
1974 if (!match(SI->getValueOperand(),
1975 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1976 m_Value(Idx))))
1977 return false;
1978
1979 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1980 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1981 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1982 // Don't optimize for atomic/volatile load or store. Ensure the load and
1983 // store are in the same block, the element type size equals its store
  // size, and both access the same address (ignoring pointer casts).
1984 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1985 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1986 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1987 return false;
1988
  // The index must address a valid element, and nothing between the load and
  // the store may modify the stored-to location.
  // NOTE(review): if the index result is SafeWithFreeze and
  // isMemModifiedBetween returns true, this returns while ScalarizableIdx
  // still holds its ToFreeze value, which ScalarizationResult's destructor
  // asserts against. A discard() before returning looks necessary; confirm.
1989 auto ScalarizableIdx =
1990 canScalarizeAccess(VecTy, Idx, SQ.getWithInstruction(Load));
1991 if (ScalarizableIdx.isUnsafe() ||
1992 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1993 MemoryLocation::get(SI), AA))
1994 return false;
1995
1996 // Ensure we add the load back to the worklist BEFORE its users so they can
1997 // be erased in the correct order.
1998 Worklist.push(Load);
1999
  // SafeWithFreeze is only produced for an index that is an `and`/`urem`
  // instruction, so the cast to Instruction holds; the freeze is applied to
  // that instruction's base operand.
2000 if (ScalarizableIdx.isSafeWithFreeze())
2001 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
  // Address the single element through a GEP over the vector type.
2002 Value *GEP = Builder.CreateInBoundsGEP(
2003 SI->getValueOperand()->getType(), SI->getPointerOperand(),
2004 {ConstantInt::get(Idx->getType(), 0), Idx});
2005 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
2006 NSI->copyMetadata(*SI);
  // Start from the larger of the load and store alignments, then reduce it to
  // what is valid for the single element.
2007 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
2008 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
2009 *DL);
2010 NSI->setAlignment(ScalarOpAlignment);
2011 replaceValue(I, *NSI);
2013 return true;
2014 }
2015
2016 return false;
2017}
2018
2019/// Try to scalarize vector loads feeding extractelement or bitcast
2020/// instructions.
2021bool VectorCombine::scalarizeLoad(Instruction &I) {
2022 Value *Ptr;
2023 if (!match(&I, m_Load(m_Value(Ptr))))
2024 return false;
2025
2026 auto *LI = cast<LoadInst>(&I);
2027 auto *VecTy = cast<VectorType>(LI->getType());
2028
2029 // The isSimple() check could be isUnordered(), but for now we cowardly
2030 // refuse to handle even unordered atomics.
2031 if (!LI->isSimple() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
2032 return false;
2033
2034 bool AllExtracts = true;
2035 bool AllBitcasts = true;
2036 Instruction *LastCheckedInst = LI;
2037 unsigned NumInstChecked = 0;
2038
2039 // Check what type of users we have (must either all be extracts or
2040 // bitcasts) and ensure no memory modifications between the load and
2041 // its users.
2042 for (User *U : LI->users()) {
2043 auto *UI = dyn_cast<Instruction>(U);
2044 if (!UI || UI->getParent() != LI->getParent())
2045 return false;
2046
2047 // If any user is waiting to be erased, then bail out as this will
2048 // distort the cost calculation and possibly lead to infinite loops.
2049 if (UI->use_empty())
2050 return false;
2051
2052 if (!isa<ExtractElementInst>(UI))
2053 AllExtracts = false;
2054 if (!isa<BitCastInst>(UI))
2055 AllBitcasts = false;
2056
2057 // Check if any instruction between the load and the user may modify memory.
2058 if (LastCheckedInst->comesBefore(UI)) {
2059 for (Instruction &I :
2060 make_range(std::next(LI->getIterator()), UI->getIterator())) {
2061 // Bail out if we reached the check limit or the instruction may write
2062 // to memory.
2063 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
2064 return false;
2065 NumInstChecked++;
2066 }
2067 LastCheckedInst = UI;
2068 }
2069 }
2070
2071 if (AllExtracts)
2072 return scalarizeLoadExtract(LI, VecTy, Ptr);
2073 if (AllBitcasts)
2074 return scalarizeLoadBitcast(LI, VecTy, Ptr);
2075 return false;
2076}
2077
2078/// Try to scalarize vector loads feeding extractelement instructions.
/// Every user of \p LI is cast to ExtractElementInst, so the caller must have
/// established that all users are extracts. Each extract is replaced by a
/// scalar load through a GEP on \p Ptr when the summed scalar cost is
/// strictly lower than the vector load plus the extracts.
// NOTE(review): embedded listing lines 2081 (the condition guarding the first
// `return false`), 2093 and 2114 (trailing arguments of getMemoryOpCost
// calls) are missing from this listing; restore them from upstream.
2079bool VectorCombine::scalarizeLoadExtract(LoadInst *LI, VectorType *VecTy,
2080 Value *Ptr) {
2082 return false;
2083
  // Extracts whose index needs a freeze before it can be used for addressing.
2084 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
2085 llvm::scope_exit FailureGuard([&]() {
2086 // If the transform is aborted, discard the ScalarizationResults.
2087 for (auto &Pair : NeedFreeze)
2088 Pair.second.discard();
2089 });
2090
2091 InstructionCost OriginalCost =
2092 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2094 InstructionCost ScalarizedCost = 0;
2095
2096 for (User *U : LI->users()) {
2097 auto *UI = cast<ExtractElementInst>(U);
2098
2099 auto ScalarIdx = canScalarizeAccess(VecTy, UI->getIndexOperand(),
2100 SQ.getWithInstruction(LI));
2101 if (ScalarIdx.isUnsafe())
2102 return false;
  // Store a copy in the map, then discard the local so its destructor does
  // not trip over a still-set ToFreeze.
2103 if (ScalarIdx.isSafeWithFreeze()) {
2104 NeedFreeze.try_emplace(UI, ScalarIdx);
2105 ScalarIdx.discard();
2106 }
2107
  // Old form pays for the extract (index -1 when it is not a constant); new
  // form pays for a scalar load plus an address computation.
2108 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
2109 OriginalCost +=
2110 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
2111 Index ? Index->getZExtValue() : -1);
2112 ScalarizedCost +=
2113 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
2115 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
2116 nullptr, nullptr, CostKind);
2117 }
2118
2119 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << *LI
2120 << "\n LoadExtractCost: " << OriginalCost
2121 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2122
  // Equal cost is not enough; the scalar form must be strictly cheaper.
2123 if (ScalarizedCost >= OriginalCost)
2124 return false;
2125
2126 // Ensure we add the load back to the worklist BEFORE its users so they can
2127 // be erased in the correct order.
2128 Worklist.push(LI);
2129
2130 Type *ElemType = VecTy->getElementType();
2131
2132 // Replace extracts with narrow scalar loads.
2133 for (User *U : LI->users()) {
2134 auto *EI = cast<ExtractElementInst>(U);
2135 Value *Idx = EI->getIndexOperand();
2136
2137 // Insert 'freeze' for poison indexes.
2138 auto It = NeedFreeze.find(EI);
2139 if (It != NeedFreeze.end())
2140 It->second.freeze(Builder, *cast<Instruction>(Idx));
2141
2142 Builder.SetInsertPoint(EI);
2143 Value *GEP =
2144 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
2145 auto *NewLoad = cast<LoadInst>(
2146 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
2147
2148 Align ScalarOpAlignment =
2149 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
2150 NewLoad->setAlignment(ScalarOpAlignment);
2151
  // AA metadata is only carried over for constant indices, adjusted to the
  // element's byte offset inside the vector.
2152 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
2153 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
2154 AAMDNodes OldAAMD = LI->getAAMetadata();
2155 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
2156 }
2157
2158 replaceValue(*EI, *NewLoad, false);
2159 }
2160
  // Success: every pending freeze was consumed above, so disarm the guard.
2161 FailureGuard.release();
2162 return true;
2163}
2164
2165/// Try to scalarize vector loads feeding bitcast instructions.
/// Every user of \p LI is cast to BitCastInst, so the caller must have
/// established that all users are bitcasts. All bitcasts must produce the
/// same integer or floating-point type, whose bit width equals that of
/// \p VecTy. If a single scalar load of that type is strictly cheaper, the
/// bitcasts are replaced by it.
// NOTE(review): embedded listing lines 2170, 2194 and 2203 (trailing
// arguments of TTI cost calls) are missing from this listing; restore them
// from upstream.
2166bool VectorCombine::scalarizeLoadBitcast(LoadInst *LI, VectorType *VecTy,
2167 Value *Ptr) {
2168 InstructionCost OriginalCost =
2169 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
2171
2172 Type *TargetScalarType = nullptr;
2173 unsigned VecBitWidth = DL->getTypeSizeInBits(VecTy);
2174
2175 for (User *U : LI->users()) {
2176 auto *BC = cast<BitCastInst>(U);
2177
  // Only bitcasts to a scalar integer or floating-point type are handled.
2178 Type *DestTy = BC->getDestTy();
2179 if (!DestTy->isIntegerTy() && !DestTy->isFloatingPointTy())
2180 return false;
2181
2182 unsigned DestBitWidth = DL->getTypeSizeInBits(DestTy);
2183 if (DestBitWidth != VecBitWidth)
2184 return false;
2185
2186 // All bitcasts must target the same scalar type.
2187 if (!TargetScalarType)
2188 TargetScalarType = DestTy;
2189 else if (TargetScalarType != DestTy)
2190 return false;
2191
  // Each bitcast adds to the cost of the original sequence.
2192 OriginalCost +=
2193 TTI.getCastInstrCost(Instruction::BitCast, TargetScalarType, VecTy,
2195 }
2196
  // Still null when the load has no users at all.
2197 if (!TargetScalarType)
2198 return false;
2199
2200 assert(!LI->user_empty() && "Unexpected load without bitcast users");
2201 InstructionCost ScalarizedCost =
2202 TTI.getMemoryOpCost(Instruction::Load, TargetScalarType, LI->getAlign(),
2204
2205 LLVM_DEBUG(dbgs() << "Found vector load feeding only bitcasts: " << *LI
2206 << "\n OriginalCost: " << OriginalCost
2207 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
2208
  // Equal cost is not enough; the scalar form must be strictly cheaper.
2209 if (ScalarizedCost >= OriginalCost)
2210 return false;
2211
2212 // Ensure we add the load back to the worklist BEFORE its users so they can
2213 // be erased in the correct order.
2214 Worklist.push(LI);
2215
  // Emit one scalar load at the position of the vector load, keeping its
  // alignment and metadata.
2216 Builder.SetInsertPoint(LI);
2217 auto *ScalarLoad =
2218 Builder.CreateLoad(TargetScalarType, Ptr, LI->getName() + ".scalar");
2219 ScalarLoad->setAlignment(LI->getAlign());
2220 ScalarLoad->copyMetadata(*LI);
2221
2222 // Replace all bitcast users with the scalar load.
2223 for (User *U : LI->users()) {
2224 auto *BC = cast<BitCastInst>(U);
2225 replaceValue(*BC, *ScalarLoad, false);
2226 }
2227
2228 return true;
2229}
2230
// Scalarize a vector zext whose users are all constant-index extracts: the
// source vector is bitcast to one wide integer and each extract becomes a
// logical shift right followed by a mask.
// NOTE(review): embedded listing lines 2232 (the condition guarding the first
// `return false`), 2252, 2270-2271, 2273, 2275-2276 (parts of the cost
// expressions) and 2303 (part of the freeze condition) are missing from this
// listing; restore them from upstream.
2231bool VectorCombine::scalarizeExtExtract(Instruction &I) {
2233 return false;
2234 auto *Ext = dyn_cast<ZExtInst>(&I);
2235 if (!Ext)
2236 return false;
2237
2238 // Try to convert a vector zext feeding only extracts to a set of scalar
2239 // (bitcast(Src) >> (ExtIdx * SrcEltBits)) & SrcEltMask operations
2240 // if profitable.
2241 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
2242 if (!SrcTy)
2243 return false;
2244 auto *DstTy = cast<FixedVectorType>(Ext->getType());
2245
  // The whole source vector must be exactly as wide as one destination
  // element, so the packed integer already has the result type's width.
2246 Type *ScalarDstTy = DstTy->getElementType();
2247 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
2248 return false;
2249
2250 InstructionCost VectorCost =
2251 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
2253 unsigned ExtCnt = 0;
2254 bool ExtLane0 = false;
  // Every user must be an extract with a constant index. Extracts that have
  // no uses themselves are not counted towards the cost.
2255 for (User *U : Ext->users()) {
2256 uint64_t Idx;
2257 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2258 return false;
2259 if (cast<Instruction>(U)->use_empty())
2260 continue;
2261 ExtCnt += 1;
2262 ExtLane0 |= !Idx;
2263 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2264 CostKind, Idx, U);
2265 }
2266
  // Scalar cost: one `and` per counted extract plus one `lshr` per counted
  // extract, minus one if lane 0 is extracted.
  // NOTE(review): the big-endian shift computed below for lane 0 is not zero,
  // so the lane-0 discount looks little-endian specific; confirm.
2267 InstructionCost ScalarCost =
2268 ExtCnt * TTI.getArithmeticInstrCost(
2269 Instruction::And, ScalarDstTy, CostKind,
2272 (ExtCnt - ExtLane0) *
2274 Instruction::LShr, ScalarDstTy, CostKind,
  // Ties go to the scalar form.
2277 if (ScalarCost > VectorCost)
2278 return false;
2279
2280 Value *ScalarV = Ext->getOperand(0);
2281 if (!isGuaranteedNotToBePoison(ScalarV, SQ.AC, dyn_cast<Instruction>(ScalarV),
2282 SQ.DT)) {
2283 // Check whether all lanes are extracted, all extracts trigger UB
2284 // on poison, and the last extract (and hence all previous ones)
2285 // are guaranteed to execute if Ext executes. If so, we do not
2286 // need to insert a freeze.
2287 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2288 bool AllExtractsTriggerUB = true;
2289 ExtractElementInst *LastExtract = nullptr;
2290 BasicBlock *ExtBB = Ext->getParent();
2291 for (User *U : Ext->users()) {
2292 auto *Extract = cast<ExtractElementInst>(U);
2293 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2294 AllExtractsTriggerUB = false;
2295 break;
2296 }
2297 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2298 if (!LastExtract || LastExtract->comesBefore(Extract))
2299 LastExtract = Extract;
2300 }
2301 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2302 !AllExtractsTriggerUB ||
2304 LastExtract->getIterator()))
2305 ScalarV = Builder.CreateFreeze(ScalarV);
2306 }
  // Reinterpret the whole source vector as one integer of the same bit width.
2307 ScalarV = Builder.CreateBitCast(
2308 ScalarV,
2309 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2310 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2311 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
  // Mask that keeps the low SrcEltSizeInBits bits, i.e. one source element.
2312 APInt EltBitMask = APInt::getLowBitsSet(TotalBits, SrcEltSizeInBits);
2313 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2314 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2315 for (User *U : Ext->users()) {
2316 auto *Extract = cast<ExtractElementInst>(U);
2317 uint64_t Idx =
2318 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
  // Lane Idx sits Idx elements above bit 0 on little-endian targets and is
  // counted down from the top on big-endian targets.
2319 uint64_t ShiftAmt =
2320 DL->isBigEndian()
2321 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2322 : (Idx * SrcEltSizeInBits);
2323 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2324 Value *And = Builder.CreateAnd(LShr, Mask);
2325 U->replaceAllUsesWith(And);
2326 }
2327 return true;
2328}
2329
2330/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2331/// to "(bitcast (concat X, Y))"
2332/// where X/Y are bitcasted from i1 mask vectors.
/// Both sides may carry a shift; any shift remaining on the low half, and a
/// zext to the result type, are re-applied after the concatenation.
// NOTE(review): embedded listing lines 2344 (the match that binds X and Y),
// 2353 and 2361 (the shl patterns), and 2398, 2400, 2403, 2406, 2409 (parts
// of the cost calls) are missing from this listing; restore them from
// upstream.
2333bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2334 Type *Ty = I.getType();
2335 if (!Ty->isIntegerTy())
2336 return false;
2337
2338 // TODO: Add big endian test coverage
2339 if (DL->isBigEndian())
2340 return false;
2341
2342 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2343 Instruction *X, *Y;
2345 return false;
2346
2347 // Allow both sources to contain shl, to handle more generic pattern:
2348 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
  // ShAmtX stays 0 when X matches the plain zext(bitcast) form.
2349 Value *SrcX;
2350 uint64_t ShAmtX = 0;
2351 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2352 !match(X, m_OneUse(
2354 m_ConstantInt(ShAmtX)))))
2355 return false;
2356
  // Same for Y.
2357 Value *SrcY;
2358 uint64_t ShAmtY = 0;
2359 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2360 !match(Y, m_OneUse(
2362 m_ConstantInt(ShAmtY)))))
2363 return false;
2364
2365 // Canonicalize larger shift to the RHS.
2366 if (ShAmtX > ShAmtY) {
2367 std::swap(X, Y);
2368 std::swap(SrcX, SrcY);
2369 std::swap(ShAmtX, ShAmtY);
2370 }
2371
2372 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2373 // difference is the mask width so they can be easily concatenated together.
  // The mask may also have at most half as many elements as the result has
  // bits.
2374 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2375 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2376 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2377 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2378 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2379 !MaskTy->getElementType()->isIntegerTy(1) ||
2380 MaskTy->getNumElements() != ShAmtDiff ||
2381 MaskTy->getNumElements() > (BitWidth / 2))
2382 return false;
2383
  // Types of the concatenated mask and of its integer image, plus the integer
  // type one original mask was bitcast to (used for costing only).
2384 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2385 auto *ConcatIntTy =
2386 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2387 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2388
  // Mask 0, 1, ..., 2N-1 picks all of SrcX followed by all of SrcY.
2389 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2390 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2391
2392 // TODO: Is it worth supporting multi use cases?
  // Old sequence: the or, the shifts present, two zexts and two bitcasts.
2393 InstructionCost OldCost = 0;
2394 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2395 OldCost +=
2396 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2397 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2399 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2401
  // New sequence: a term built from MaskTy and ConcatMask (its call line is
  // missing), one bitcast, and an optional zext and shl.
2402 InstructionCost NewCost = 0;
2404 MaskTy, ConcatMask, CostKind);
2405 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2407 if (Ty != ConcatIntTy)
2408 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2410 if (ShAmtX > 0)
2411 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2412
2413 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2414 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2415 << "\n");
2416
  // Ties go to the new form.
2417 if (NewCost > OldCost)
2418 return false;
2419
2420 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2421 // any residual zero-extension or shifting.
2422 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2423 Worklist.pushValue(Concat);
2424
2425 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2426
2427 if (Ty != ConcatIntTy) {
2428 Worklist.pushValue(Result);
2429 Result = Builder.CreateZExt(Result, Ty);
2430 }
2431
2432 if (ShAmtX > 0) {
2433 Worklist.pushValue(Result);
2434 Result = Builder.CreateShl(Result, ShAmtX);
2435 }
2436
2437 replaceValue(I, *Result);
2438 return true;
2439}
2440
2441/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2442/// --> "binop (shuffle), (shuffle)".
/// Only one of the two binop operands has to be a shuffle; the other is then
/// treated as an identity shuffle of itself.
// NOTE(review): embedded listing lines 2508, 2515, 2523, 2535 and 2539 (the
// openings of shuffle-cost calls, including the declarations of Shuf0Cost and
// Shuf1Cost) are missing from this listing; restore them from upstream.
2443bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2444 BinaryOperator *BinOp;
2445 ArrayRef<int> OuterMask;
2446 if (!match(&I, m_Shuffle(m_BinOp(BinOp), m_Undef(), m_Mask(OuterMask))))
2447 return false;
2448
2449 // Don't introduce poison into div/rem.
2450 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2451 return false;
2452
2453 Value *Op00, *Op01, *Op10, *Op11;
2454 ArrayRef<int> Mask0, Mask1;
2455 bool Match0 = match(BinOp->getOperand(0),
2456 m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0)));
2457 bool Match1 = match(BinOp->getOperand(1),
2458 m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1)));
2459 if (!Match0 && !Match1)
2460 return false;
2461
  // For an operand that is not a shuffle, use the operand itself as both
  // shuffle sources.
2462 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2463 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2464 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2465 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2466
  // All vector types involved must be fixed-width.
2467 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2468 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2469 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2470 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2471 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2472 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2473 return false;
2474
2475 unsigned NumSrcElts = BinOpTy->getNumElements();
2476
2477 // Don't accept shuffles that reference the second operand in
2478 // div/rem or if it's an undef (non-poison) arg.
2479 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2480 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2481 return false;
2482
2483 // Merge outer / inner (or identity if no match) shuffles.
  // Outer lanes that are poison or select from the second outer operand
  // become poison in both merged masks.
2484 SmallVector<int> NewMask0, NewMask1;
2485 for (int M : OuterMask) {
2486 if (M < 0 || M >= (int)NumSrcElts) {
2487 NewMask0.push_back(PoisonMaskElem);
2488 NewMask1.push_back(PoisonMaskElem);
2489 } else {
2490 NewMask0.push_back(Match0 ? Mask0[M] : M);
2491 NewMask1.push_back(Match1 ? Mask1[M] : M);
2492 }
2493 }
2494
  // A merged shuffle that is an identity on its first source is not emitted.
  // NOTE(review): IsIdentity1 is checked against the element count of Op0Ty,
  // not Op1Ty. If the two differ this appears to just make the check fail
  // (conservative) rather than miscompile; confirm whether Op1Ty was intended.
2495 unsigned NumOpElts = Op0Ty->getNumElements();
2496 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2497 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2498 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2499 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2500 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2501 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2502
2503 InstructionCost NewCost = 0;
2504 // Try to merge shuffles across the binop if the new shuffles are not costly.
2505 InstructionCost BinOpCost =
2506 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind);
2507 InstructionCost OldCost =
2509 ShuffleDstTy, BinOpTy, OuterMask, CostKind,
2510 0, nullptr, {BinOp}, &I);
  // Instructions with other users stay behind after the fold, so their cost
  // is charged to the new sequence as well.
2511 if (!BinOp->hasOneUse())
2512 NewCost += BinOpCost;
2513
2514 if (Match0) {
2516 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2517 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2518 OldCost += Shuf0Cost;
2519 if (!BinOp->hasOneUse() || !BinOp->getOperand(0)->hasOneUse())
2520 NewCost += Shuf0Cost;
2521 }
2522 if (Match1) {
2524 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2525 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2526 OldCost += Shuf1Cost;
2527 if (!BinOp->hasOneUse() || !BinOp->getOperand(1)->hasOneUse())
2528 NewCost += Shuf1Cost;
2529 }
2530
  // The new binop operates on the shuffle's result type.
2531 NewCost += TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2532
2533 if (!IsIdentity0)
2534 NewCost +=
2536 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2537 if (!IsIdentity1)
2538 NewCost +=
2540 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2541
2542 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2543 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2544 << "\n");
2545
2546 // If costs are equal, still fold as we reduce instruction count.
2547 if (NewCost > OldCost)
2548 return false;
2549
2550 Value *LHS =
2551 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2552 Value *RHS =
2553 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2554 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2555
2556 // Copy the IR flags from the old binop; the builder may have folded the
  // new binop to a non-instruction, hence the dyn_cast.
2557 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2558 NewInst->copyIRFlags(BinOp);
2559
2560 Worklist.pushValue(LHS);
2561 Worklist.pushValue(RHS);
2562 replaceValue(I, *NewBO);
2563 return true;
2564}
2565
2566/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2567/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
///
/// Both shuffle operands must use the same opcode (and, for compares, the same
/// predicate), and the operand vectors must be fixed-width and of one type.
/// The fold is gated on the TTI cost model: the new sequence must be strictly
/// cheaper, unless it is known to reduce the instruction count, in which case
/// an equal cost is accepted as well.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new binop/compare.
2568bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2569 ArrayRef<int> OldMask;
2570 Instruction *LHS, *RHS;
2572 m_Mask(OldMask))))
2573 return false;
2574
2575 // TODO: Add support for addlike etc.
2576 if (LHS->getOpcode() != RHS->getOpcode())
2577 return false;
2578
2579 Value *X, *Y, *Z, *W;
2580 bool IsCommutative = false;
// The predicates stay at BAD_ICMP_PREDICATE on the binop path; the code below
// tests PredLHS against that value to pick binop vs. compare handling.
2581 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2582 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2583 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2584 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2585 auto *BO = cast<BinaryOperator>(LHS);
2586 // Don't introduce poison into div/rem.
2587 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2588 return false;
2589 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2590 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2591 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2592 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2593 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2594 } else
2595 return false;
2596
// ShuffleDstTy: type of the outer shuffle. BinResTy: result type of the
// binop/compare. BinOpTy: type of the binop/compare operands.
2597 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2598 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2599 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2600 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2601 return false;
2602
// True when both shuffle operands are the very same instruction.
2603 bool SameBinOp = LHS == RHS;
2604 unsigned NumSrcElts = BinOpTy->getNumElements();
2605
2606 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2607 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2608 std::swap(X, Y);
2609
// Maps a mask index that selects from the second shuffle source onto the same
// lane of the first source.
2610 auto ConvertToUnary = [NumSrcElts](int &M) {
2611 if (M >= (int)NumSrcElts)
2612 M -= NumSrcElts;
2613 };
2614
// NewMask0 shuffles the first operands (X, Z) of the two binops/compares.
2615 SmallVector<int> NewMask0(OldMask);
2617 TTI::OperandValueInfo Op0Info = TTI.commonOperandInfo(X, Z);
2618 if (X == Z) {
// Both sides share this operand, so every index can refer to X alone and the
// second shuffle source becomes poison.
2619 llvm::for_each(NewMask0, ConvertToUnary);
2621 Z = PoisonValue::get(BinOpTy);
2622 }
2623
// NewMask1 shuffles the second operands (Y, W); same treatment as above.
2624 SmallVector<int> NewMask1(OldMask);
2626 TTI::OperandValueInfo Op1Info = TTI.commonOperandInfo(Y, W);
2627 if (Y == W) {
2628 llvm::for_each(NewMask1, ConvertToUnary);
2630 W = PoisonValue::get(BinOpTy);
2631 }
2632
2633 // Try to replace a binop with a shuffle if the shuffle is not costly.
2634 // When SameBinOp, only count the binop cost once.
2637
2638 InstructionCost OldCost = LHSCost;
2639 if (!SameBinOp) {
2640 OldCost += RHSCost;
2641 }
2643 ShuffleDstTy, BinResTy, OldMask, CostKind, 0,
2644 nullptr, {LHS, RHS}, &I);
2645
2646 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2647 // where one use shuffles have gotten split across the binop/cmp. These
2648 // often allow a major reduction in total cost that wouldn't happen as
2649 // individual folds.
// MergeInner: if Op is a one-use shuffle of a single same-typed source whose
// mask only reads that source, compose its mask into the entries of Mask that
// fall in [Offset, Offset + NumSrcElts) and replace Op with the inner source.
// Returns true when such a merge happened.
2650 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2651 TTI::TargetCostKind CostKind) -> bool {
2652 Value *InnerOp;
2653 ArrayRef<int> InnerMask;
2654 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2655 m_Mask(InnerMask)))) &&
2656 InnerOp->getType() == Op->getType() &&
2657 all_of(InnerMask,
2658 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2659 for (int &M : Mask)
2660 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
// Look the lane up in the inner mask; negative (poison) results are kept
// as-is, everything else is shifted back into this operand's index range.
2661 M = InnerMask[M - Offset];
2662 M = 0 <= M ? M + Offset : M;
2663 }
2665 Op = InnerOp;
2666 return true;
2667 }
2668 return false;
2669 };
// X and Y occupy the first-source index range of their masks, Z and W the
// second-source range (offset NumSrcElts).
2670 bool ReducedInstCount = false;
2671 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2672 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2673 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2674 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
// When both operand shuffles are identical a single shuffle feeds both
// operands of the new instruction.
2675 bool SingleSrcBinOp = (X == Y) && (Z == W) && (NewMask0 == NewMask1);
2676 // SingleSrcBinOp only reduces instruction count if we also eliminate the
2677 // original binop(s). If binops have multiple uses, they won't be eliminated.
2678 ReducedInstCount |= SingleSrcBinOp && LHS->hasOneUser() && RHS->hasOneUser();
2679
// Type of the shuffled operands: operand element type with the outer
// shuffle's element count.
2680 auto *ShuffleCmpTy =
2681 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2683 SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0, nullptr, {X, Z});
2684 if (!SingleSrcBinOp)
2685 NewCost += TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1,
2686 CostKind, 0, nullptr, {Y, W});
2687
2688 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2689 NewCost += TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy,
2690 CostKind, Op0Info, Op1Info);
2691 } else {
2692 NewCost +=
2693 TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy, ShuffleDstTy,
2694 PredLHS, CostKind, Op0Info, Op1Info);
2695 }
2696 // If LHS/RHS have other uses, we need to account for the cost of keeping
2697 // the original instructions. When SameBinOp, only add the cost once.
2698 if (!LHS->hasOneUser())
2699 NewCost += LHSCost;
2700 if (!SameBinOp && !RHS->hasOneUser())
2701 NewCost += RHSCost;
2702
2703 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2704 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2705 << "\n");
2706
2707 // If either shuffle will constant fold away, then fold for the same cost as
2708 // we will reduce the instruction count.
2709 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2710 (isa<Constant>(Y) && isa<Constant>(W));
// Equal cost is only acceptable when the instruction count goes down.
2711 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2712 return false;
2713
2714 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2715 Value *Shuf1 =
2716 SingleSrcBinOp ? Shuf0 : Builder.CreateShuffleVector(Y, W, NewMask1);
2717 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2718 ? Builder.CreateBinOp(
2719 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2720 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2721
2722 // Intersect flags from the old binops.
2723 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2724 NewInst->copyIRFlags(LHS);
2725 NewInst->andIRFlags(RHS);
2726 }
2727
// Queue the new shuffles so they are revisited for further combines.
2728 Worklist.pushValue(Shuf0);
2729 Worklist.pushValue(Shuf1);
2730 replaceValue(I, *NewBO);
2731 return true;
2732}
2733
2734/// Try to convert,
2735/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2736/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
///
/// Both select conditions must be fixed-width vectors of the same type, and
/// the two selects must agree on their fast-math flags. The fold is performed
/// when the TTI cost of the new sequence does not exceed the old one.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new select.
2737bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2738 ArrayRef<int> Mask;
2739 Value *C1, *T1, *F1, *C2, *T2, *F2;
2740 if (!match(&I, m_Shuffle(m_Select(m_Value(C1), m_Value(T1), m_Value(F1)),
2741 m_Select(m_Value(C2), m_Value(T2), m_Value(F2)),
2742 m_Mask(Mask))))
2743 return false;
2744
2745 auto *Sel1 = cast<Instruction>(I.getOperand(0));
2746 auto *Sel2 = cast<Instruction>(I.getOperand(1));
2747
// A condition that is not a fixed-width vector fails the dyn_cast and is
// rejected here.
2748 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2749 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2750 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2751 return false;
2752
2753 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2754 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2755 // SelectInsts must have the same FMF.
// Either both selects are FP math operators with equal flags, or neither is.
2756 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2757 ((SI0FOp != nullptr) &&
2758 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2759 return false;
2760
2761 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2762 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2764 auto SelOp = Instruction::Select;
2765
2767 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2769 SelOp, SrcVecTy, C2VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2770
// Old sequence: the two selects plus the outer shuffle.
2771 InstructionCost OldCost =
2772 CostSel1 + CostSel2 +
2773 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2774 {I.getOperand(0), I.getOperand(1)}, &I);
2775
// New sequence: one shuffle each for the conditions, the true values and the
// false values, plus a single select on the shuffled operands.
2777 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2778 Mask, CostKind, 0, nullptr, {C1, C2});
2779 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2780 nullptr, {T1, T2});
2781 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2782 nullptr, {F1, F2});
2783 auto *C1C2ShuffledVecTy = FixedVectorType::get(
2784 Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements());
2785 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2787
// A select with other uses stays alive after the fold, so its cost is charged
// to the new sequence as well.
2788 if (!Sel1->hasOneUse())
2789 NewCost += CostSel1;
2790 if (!Sel2->hasOneUse())
2791 NewCost += CostSel2;
2792
2793 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2794 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2795 << "\n");
// Equal cost still folds.
2796 if (NewCost > OldCost)
2797 return false;
2798
2799 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2800 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2801 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2802 Value *NewSel;
2803 // We presuppose that the SelectInsts have the same FMF.
2804 if (SI0FOp)
2805 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2806 SI0FOp->getFastMathFlags());
2807 else
2808 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2809
// Queue the new shuffles so they are revisited for further combines.
2810 Worklist.pushValue(ShuffleCmp);
2811 Worklist.pushValue(ShuffleTrue);
2812 Worklist.pushValue(ShuffleFalse);
2813 replaceValue(I, *NewSel);
2814 return true;
2815}
2816
2817/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2818/// into "castop (shuffle)".
///
/// Handles both the binary form (two casts with the same source type) and the
/// unary form (one cast shuffled with an undef/poison second operand). The
/// fold is performed when the TTI cost of the new sequence does not exceed the
/// old one.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new cast.
2819bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2820 Value *V0, *V1;
2821 ArrayRef<int> OldMask;
2822 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2823 return false;
2824
2825 // Check whether this is a binary shuffle.
2826 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2827
// The first operand must be a cast; the second only needs to be one when the
// shuffle is binary.
2828 auto *C0 = dyn_cast<CastInst>(V0);
2829 auto *C1 = dyn_cast<CastInst>(V1);
2830 if (!C0 || (IsBinaryShuffle && !C1))
2831 return false;
2832
2833 Instruction::CastOps Opcode = C0->getOpcode();
2834
2835 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2836 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2837 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2838 return false;
2839
2840 if (IsBinaryShuffle) {
2841 if (C0->getSrcTy() != C1->getSrcTy())
2842 return false;
2843 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
// Differing opcodes are only accepted when both casts match m_SExtLike; the
// merged cast is then emitted as a sext.
2844 if (Opcode != C1->getOpcode()) {
2845 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2846 Opcode = Instruction::SExt;
2847 else
2848 return false;
2849 }
2850 }
2851
2852 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2853 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2854 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2855 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2856 return false;
2857
2858 unsigned NumSrcElts = CastSrcTy->getNumElements();
2859 unsigned NumDstElts = CastDstTy->getNumElements();
2860 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2861 "Only bitcasts expected to alter src/dst element counts");
2862
2863 // Check for bitcasts whose element counts are not integer multiples of one
2864 // another (e.g. <32 x i40> -> <40 x i32>); the mask cannot be rescaled then.
2865 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2866 (NumDstElts % NumSrcElts) != 0)
2867 return false;
2868
// NewMask is OldMask re-expressed in terms of the cast source elements.
2869 SmallVector<int, 16> NewMask;
2870 if (NumSrcElts >= NumDstElts) {
2871 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2872 // always be expanded to the equivalent form choosing narrower elements.
2873 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2874 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2875 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2876 } else {
2877 // The bitcast is from narrow elements to wide elements. The shuffle mask
2878 // must choose consecutive elements to allow casting first.
2879 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2880 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2881 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2882 return false;
2883 }
2884
// Result type of the new shuffle: cast source element type, one element per
// entry of NewMask.
2885 auto *NewShuffleDstTy =
2886 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2887
2888 // Try to replace a castop with a shuffle if the shuffle is not costly.
2889 InstructionCost CostC0 =
2890 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2892
2894 if (IsBinaryShuffle)
2896 else
2898
2899 InstructionCost OldCost = CostC0;
2900 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2901 CostKind, 0, nullptr, {}, &I);
2902
2903 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2904 CastSrcTy, NewMask, CostKind);
2905 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
// A cast with other uses stays alive after the fold, so its cost is charged
// to the new sequence as well.
2907 if (!C0->hasOneUse())
2908 NewCost += CostC0;
2909 if (IsBinaryShuffle) {
2910 InstructionCost CostC1 =
2911 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2913 OldCost += CostC1;
2914 if (!C1->hasOneUse())
2915 NewCost += CostC1;
2916 }
2917
2918 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2919 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2920 << "\n");
// Equal cost still folds.
2921 if (NewCost > OldCost)
2922 return false;
2923
2924 Value *Shuf;
2925 if (IsBinaryShuffle)
2926 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2927 NewMask);
2928 else
2929 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2930
2931 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2932
2933 // Intersect flags from the old casts.
2934 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2935 NewInst->copyIRFlags(C0);
2936 if (IsBinaryShuffle)
2937 NewInst->andIRFlags(C1);
2938 }
2939
// Queue the new shuffle so it is revisited for further combines.
2940 Worklist.pushValue(Shuf);
2941 replaceValue(I, *Cast);
2942 return true;
2943}
2944
2945/// Try to convert any of:
2946/// "shuffle (shuffle x, y), (shuffle y, x)"
2947/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2948/// "shuffle (shuffle x, undef), y"
2949/// "shuffle x, (shuffle y, undef)"
2950/// into "shuffle x, y".
///
/// The outer mask is composed with the inner masks; the fold only applies when
/// the composed mask reads from at most two distinct non-undef sources. A
/// result that reads no source at all is replaced by poison, and an identity
/// result by its source, without consulting the cost model.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced.
2951bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2952 ArrayRef<int> OuterMask;
2953 Value *OuterV0, *OuterV1;
2954 if (!match(&I,
2955 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2956 return false;
2957
2958 ArrayRef<int> InnerMask0, InnerMask1;
2959 Value *X0, *X1, *Y0, *Y1;
2960 bool Match0 =
2961 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2962 bool Match1 =
2963 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2964 if (!Match0 && !Match1)
2965 return false;
2966
2967 // If the outer shuffle is a permute, then create a fake inner all-poison
2968 // shuffle. This is easier than accounting for length-changing shuffles below.
2969 SmallVector<int, 16> PoisonMask1;
2970 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2971 X1 = X0;
2972 Y1 = Y0;
2973 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2974 InnerMask1 = PoisonMask1;
2975 Match1 = true; // fake match
2976 }
2977
// An outer operand that is not a shuffle acts as its own source for both the
// X and Y slots.
2978 X0 = Match0 ? X0 : OuterV0;
2979 Y0 = Match0 ? Y0 : OuterV0;
2980 X1 = Match1 ? X1 : OuterV1;
2981 Y1 = Match1 ? Y1 : OuterV1;
// ShuffleSrcTy: type of the innermost sources. ShuffleImmTy: type of the
// outer shuffle's operands (the intermediate values).
2982 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2983 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2984 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2985 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2986 X0->getType() != X1->getType())
2987 return false;
2988
2989 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2990 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2991
2992 // Attempt to merge shuffles, matching up to 2 source operands.
2993 // Replace index to a poison arg with PoisonMaskElem.
2994 // Bail if either inner masks reference an undef arg.
2995 SmallVector<int, 16> NewMask(OuterMask);
2996 Value *NewX = nullptr, *NewY = nullptr;
2997 for (int &M : NewMask) {
// Resolve the outer index to a (source value, lane) pair, looking through
// the matching inner shuffle when there is one.
2998 Value *Src = nullptr;
2999 if (0 <= M && M < (int)NumImmElts) {
3000 Src = OuterV0;
3001 if (Match0) {
3002 M = InnerMask0[M];
3003 Src = M >= (int)NumSrcElts ? Y0 : X0;
3004 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
3005 }
3006 } else if (M >= (int)NumImmElts) {
3007 Src = OuterV1;
3008 M -= NumImmElts;
3009 if (Match1) {
3010 M = InnerMask1[M];
3011 Src = M >= (int)NumSrcElts ? Y1 : X1;
3012 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
3013 }
3014 }
3015 if (Src && M != PoisonMaskElem) {
3016 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
3017 if (isa<UndefValue>(Src)) {
3018 // We've referenced an undef element - if it is poison, update the shuffle
3019 // mask, else bail.
3020 if (!isa<PoisonValue>(Src))
3021 return false;
3022 M = PoisonMaskElem;
3023 continue;
3024 }
// Assign the source to the first free (or already matching) slot; lanes of
// the second slot are offset by NumSrcElts.
3025 if (!NewX || NewX == Src) {
3026 NewX = Src;
3027 continue;
3028 }
3029 if (!NewY || NewY == Src) {
3030 M += NumSrcElts;
3031 NewY = Src;
3032 continue;
3033 }
// A third distinct source would be needed: give up.
3034 return false;
3035 }
3036 }
3037
// No mask element ended up reading a real source, so the result is poison.
3038 if (!NewX) {
3039 replaceValue(I, *PoisonValue::get(ShuffleDstTy));
3040 return true;
3041 }
3042
3043 if (!NewY)
3044 NewY = PoisonValue::get(ShuffleSrcTy);
3045
3046 // Have we folded to an Identity shuffle?
3047 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
3048 replaceValue(I, *NewX);
3049 return true;
3050 }
3051
3052 // Try to merge the shuffles if the new shuffle is not costly.
3053 InstructionCost InnerCost0 = 0;
3054 if (Match0)
3055 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
3056
3057 InstructionCost InnerCost1 = 0;
3058 if (Match1)
3059 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
3060
3062
3063 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
3064
// The merged shuffle is unary when no element selects from the second source.
3065 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
3069 InstructionCost NewCost =
3070 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
3071 nullptr, {NewX, NewY});
// Outer operands with other uses stay alive after the fold, so their cost is
// charged to the new sequence as well.
3072 if (!OuterV0->hasOneUse())
3073 NewCost += InnerCost0;
3074 if (!OuterV1->hasOneUse())
3075 NewCost += InnerCost1;
3076
3077 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
3078 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3079 << "\n");
// Equal cost still folds.
3080 if (NewCost > OldCost)
3081 return false;
3082
3083 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
3084 replaceValue(I, *Shuf);
3085 return true;
3086}
3087
3088/// Try to convert a chain of length-preserving shuffles that are fed by
3089/// length-changing shuffles from the same source, e.g. a chain of length 3:
3090///
3091/// "shuffle (shuffle (shuffle x, (shuffle y, undef)),
3092/// (shuffle y, undef)),
3093/// (shuffle y, undef)"
3094///
3095/// into a single shuffle fed by a length-changing shuffle:
3096///
3097/// "shuffle x, (shuffle y, undef)"
3098///
3099/// Such chains arise e.g. from folding extract/insert sequences.
///
/// The chain is walked from \p I towards its trunk operand; each step must
/// have exactly one length-changing "leaf" shuffle operand, and all leaves
/// must read from the same value. At least two steps are required.
///
/// \param I The root (last) shuffle of the candidate chain.
/// \returns true if \p I was replaced.
3100bool VectorCombine::foldShufflesOfLengthChangingShuffles(Instruction &I) {
3101 FixedVectorType *TrunkType = dyn_cast<FixedVectorType>(I.getType());
3102 if (!TrunkType)
3103 return false;
3104
// Mask: combined mask of the chain processed so far (trunk lanes first, leaf
// lanes offset by NumTrunkElts). YMask: combined mask of the merged leaf.
3105 unsigned ChainLength = 0;
3106 SmallVector<int> Mask;
3107 SmallVector<int> YMask;
3108 InstructionCost OldCost = 0;
3109 InstructionCost NewCost = 0;
3110 Value *Trunk = &I;
3111 unsigned NumTrunkElts = TrunkType->getNumElements();
3112 Value *Y = nullptr;
3113
3114 for (;;) {
3115 // Match the current trunk against (commutations of) the pattern
3116 // "shuffle trunk', (shuffle y, undef)"
3117 ArrayRef<int> OuterMask;
3118 Value *OuterV0, *OuterV1;
// Interior chain links must have no other users; the root itself is exempt.
3119 if (ChainLength != 0 && !Trunk->hasOneUse())
3120 break;
3121 if (!match(Trunk, m_Shuffle(m_Value(OuterV0), m_Value(OuterV1),
3122 m_Mask(OuterMask))))
3123 break;
3124 if (OuterV0->getType() != TrunkType) {
3125 // This shuffle is not length-preserving, so it cannot be part of the
3126 // chain.
3127 break;
3128 }
3129
3130 ArrayRef<int> InnerMask0, InnerMask1;
3131 Value *A0, *A1, *B0, *B1;
3132 bool Match0 =
3133 match(OuterV0, m_Shuffle(m_Value(A0), m_Value(B0), m_Mask(InnerMask0)));
3134 bool Match1 =
3135 match(OuterV1, m_Shuffle(m_Value(A1), m_Value(B1), m_Mask(InnerMask1)));
// A leaf is an operand shuffle whose source type differs from the type of I,
// i.e. a length-changing shuffle.
3136 bool Match0Leaf = Match0 && A0->getType() != I.getType();
3137 bool Match1Leaf = Match1 && A1->getType() != I.getType();
3138 if (Match0Leaf == Match1Leaf) {
3139 // Only handle the case of exactly one leaf in each step. The "two leaves"
3140 // case is handled by foldShuffleOfShuffles.
3141 break;
3142 }
3143
// Normalize so that the leaf is always the second operand: swap the operands
// and move every non-poison mask index to the other half.
3144 SmallVector<int> CommutedOuterMask;
3145 if (Match0Leaf) {
3146 std::swap(OuterV0, OuterV1);
3147 std::swap(InnerMask0, InnerMask1);
3148 std::swap(A0, A1);
3149 std::swap(B0, B1);
3150 llvm::append_range(CommutedOuterMask, OuterMask);
3151 for (int &M : CommutedOuterMask) {
3152 if (M == PoisonMaskElem)
3153 continue;
3154 if (M < (int)NumTrunkElts)
3155 M += NumTrunkElts;
3156 else
3157 M -= NumTrunkElts;
3158 }
3159 OuterMask = CommutedOuterMask;
3160 }
3161 if (!OuterV1->hasOneUse())
3162 break;
3163
// Every non-undef leaf operand seen along the chain must be the same value Y.
3164 if (!isa<UndefValue>(A1)) {
3165 if (!Y)
3166 Y = A1;
3167 else if (Y != A1)
3168 break;
3169 }
3170 if (!isa<UndefValue>(B1)) {
3171 if (!Y)
3172 Y = B1;
3173 else if (Y != B1)
3174 break;
3175 }
3176
// Rewrite the leaf mask so that second-operand indices refer to the same lane
// of the first operand.
3177 auto *YType = cast<FixedVectorType>(A1->getType());
3178 int NumLeafElts = YType->getNumElements();
3179 SmallVector<int> LocalYMask(InnerMask1);
3180 for (int &M : LocalYMask) {
3181 if (M >= NumLeafElts)
3182 M -= NumLeafElts;
3183 }
3184
3185 InstructionCost LocalOldCost =
3188
3189 // Handle the initial (start of chain) case.
3190 if (!ChainLength) {
3191 Mask.assign(OuterMask);
3192 YMask.assign(LocalYMask);
3193 OldCost = NewCost = LocalOldCost;
3194 Trunk = OuterV0;
3195 ChainLength++;
3196 continue;
3197 }
3198
3199 // For the non-root case, first attempt to combine masks.
// Each lane of the merged leaf may take its index from either mask, but two
// different non-poison indices for the same lane cannot be merged.
3200 SmallVector<int> NewYMask(YMask);
3201 bool Valid = true;
3202 for (auto [CombinedM, LeafM] : llvm::zip(NewYMask, LocalYMask)) {
3203 if (LeafM == -1 || CombinedM == LeafM)
3204 continue;
3205 if (CombinedM == -1) {
3206 CombinedM = LeafM;
3207 } else {
3208 Valid = false;
3209 break;
3210 }
3211 }
3212 if (!Valid)
3213 break;
3214
// Compose the accumulated mask with this link: lanes that read the trunk are
// redirected through OuterMask, lanes that are poison or read the leaf are
// kept unchanged.
3215 SmallVector<int> NewMask;
3216 NewMask.reserve(NumTrunkElts);
3217 for (int M : Mask) {
3218 if (M < 0 || M >= static_cast<int>(NumTrunkElts))
3219 NewMask.push_back(M);
3220 else
3221 NewMask.push_back(OuterMask[M]);
3222 }
3223
3224 // Break the chain if adding this new step complicates the shuffles such
3225 // that it would increase the new cost by more than the old cost of this
3226 // step.
3227 InstructionCost LocalNewCost =
3229 YType, NewYMask, CostKind) +
3231 TrunkType, NewMask, CostKind);
3232
3233 if (LocalNewCost >= NewCost && LocalOldCost < LocalNewCost - NewCost)
3234 break;
3235
3236 LLVM_DEBUG({
3237 if (ChainLength == 1) {
3238 dbgs() << "Found chain of shuffles fed by length-changing shuffles: "
3239 << I << '\n';
3240 }
3241 dbgs() << " next chain link: " << *Trunk << '\n'
3242 << " old cost: " << (OldCost + LocalOldCost)
3243 << " new cost: " << LocalNewCost << '\n';
3244 });
3245
// Accept this link and continue with the next trunk operand.
3246 Mask = NewMask;
3247 YMask = NewYMask;
3248 OldCost += LocalOldCost;
3249 NewCost = LocalNewCost;
3250 Trunk = OuterV0;
3251 ChainLength++;
3252 }
// A chain of a single link is not rewritten here.
3253 if (ChainLength <= 1)
3254 return false;
3255
3256 // Bail out if all leaves were poison.
3257 if (!Y)
3258 return false;
3259
3260 if (llvm::all_of(Mask, [&](int M) {
3261 return M < 0 || M >= static_cast<int>(NumTrunkElts);
3262 })) {
3263 // Produce a canonical simplified form if all elements are sourced from Y.
3264 for (int &M : Mask) {
3265 if (M >= static_cast<int>(NumTrunkElts))
3266 M = YMask[M - NumTrunkElts];
3267 }
3268 Value *Root =
3269 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), Mask);
3270 replaceValue(I, *Root);
3271 return true;
3272 }
3273
// General case: one merged leaf shuffle of Y feeding one shuffle with the
// remaining trunk.
3274 Value *Leaf =
3275 Builder.CreateShuffleVector(Y, PoisonValue::get(Y->getType()), YMask);
3276 Value *Root = Builder.CreateShuffleVector(Trunk, Leaf, Mask);
3277 replaceValue(I, *Root);
3278 return true;
3279}
3280
3281/// Try to convert
3282/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
///
/// Both shuffle operands must be calls to the same trivially vectorizable
/// intrinsic. The fold is performed when the TTI cost of the new sequence does
/// not exceed the old one.
///
/// \param I The candidate outer shuffle.
/// \returns true if \p I was replaced by the new intrinsic call.
3283bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
3284 Value *V0, *V1;
3285 ArrayRef<int> OldMask;
3286 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
3287 return false;
3288
3289 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3290 auto *II1 = dyn_cast<IntrinsicInst>(V1);
3291 if (!II0 || !II1)
3292 return false;
3293
3294 Intrinsic::ID IID = II0->getIntrinsicID();
3295 if (IID != II1->getIntrinsicID())
3296 return false;
3297 InstructionCost CostII0 =
3298 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3299 InstructionCost CostII1 =
3300 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind);
3301
3302 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3303 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
3304 if (!ShuffleDstTy || !II0Ty)
3305 return false;
3306
3307 if (!isTriviallyVectorizable(IID))
3308 return false;
3309
// Note: the loop index I shadows the Instruction parameter I inside each of
// the argument loops in this function.
3310 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3311 Value *Arg0 = II0->getArgOperand(I);
3312 Value *Arg1 = II1->getArgOperand(I);
3314 // Scalar operands must be identical.
3315 if (Arg0 != Arg1)
3316 return false;
3317 } else if (Arg0->getType() != Arg1->getType()) {
3318 // The corresponding vector operands are shuffled together, so they must
3319 // share the same type. For intrinsics overloaded on their operand type
3320 // (e.g. llvm.fptosi.sat), two calls can produce the same result type
3321 // from different operand types; shuffling those would be invalid.
3322 return false;
3323 }
3324 }
3325
// Old sequence: both intrinsic calls plus the outer shuffle.
3326 InstructionCost OldCost =
3327 CostII0 + CostII1 +
3329 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
3330
// New sequence: one shuffle per distinct pair of vector arguments plus a
// single intrinsic call on the shuffled arguments.
3331 SmallVector<Type *> NewArgsTy;
3332 InstructionCost NewCost = 0;
3333 SmallDenseSet<std::pair<Value *, Value *>> SeenOperandPairs;
3334 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3336 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3337 } else {
// Shuffled argument type: same element type, element count of the outer
// shuffle result.
3338 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3339 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3340 ShuffleDstTy->getNumElements());
3341 NewArgsTy.push_back(ArgTy);
3342 std::pair<Value *, Value *> OperandPair =
3343 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3344 if (!SeenOperandPairs.insert(OperandPair).second) {
3345 // We've already computed the cost for this operand pair.
3346 continue;
3347 }
3348 NewCost += TTI.getShuffleCost(
3349 TargetTransformInfo::SK_PermuteTwoSrc, ArgTy, VecTy, OldMask,
3350 CostKind, 0, nullptr, {II0->getArgOperand(I), II1->getArgOperand(I)});
3351 }
3352 }
3353 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3354
3355 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
// Calls with other uses stay alive after the fold. When both shuffle operands
// are the same call, its cost is charged only once.
3356 if (!II0->hasOneUse())
3357 NewCost += CostII0;
3358 if (II1 != II0 && !II1->hasOneUse())
3359 NewCost += CostII1;
3360
3361 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
3362 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
3363 << "\n");
3364
// Equal cost still folds.
3365 if (NewCost > OldCost)
3366 return false;
3367
// Build the new argument list, creating one shuffle per distinct operand pair
// (mirroring the de-duplication used for the cost above).
3368 SmallVector<Value *> NewArgs;
3369 SmallDenseMap<std::pair<Value *, Value *>, Value *> ShuffleCache;
3370 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
3372 NewArgs.push_back(II0->getArgOperand(I));
3373 } else {
3374 std::pair<Value *, Value *> OperandPair =
3375 std::make_pair(II0->getArgOperand(I), II1->getArgOperand(I));
3376 auto It = ShuffleCache.find(OperandPair);
3377 if (It != ShuffleCache.end()) {
3378 // Reuse previously created shuffle for this operand pair.
3379 NewArgs.push_back(It->second);
3380 continue;
3381 }
3382 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
3383 II1->getArgOperand(I), OldMask);
3384 ShuffleCache[OperandPair] = Shuf;
3385 NewArgs.push_back(Shuf);
3386 Worklist.pushValue(Shuf);
3387 }
3388 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3389
3390 // Intersect flags from the old intrinsics.
3391 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
3392 NewInst->copyIRFlags(II0);
3393 NewInst->andIRFlags(II1);
3394 }
3395
3396 replaceValue(I, *NewIntrinsic);
3397 return true;
3398}
3399
3400/// Try to convert
3401/// "shuffle (intrinsic), (poison/undef)" into "intrinsic (shuffle)".
///
/// The intrinsic must be trivially vectorizable and the mask must only read
/// from the first shuffle operand. The fold is performed when the TTI cost of
/// the new sequence does not exceed the old one.
///
/// \param I The candidate shuffle.
/// \returns true if \p I was replaced by the new intrinsic call.
3402bool VectorCombine::foldPermuteOfIntrinsic(Instruction &I) {
3403 Value *V0;
3404 ArrayRef<int> Mask;
3405 if (!match(&I, m_Shuffle(m_Value(V0), m_Undef(), m_Mask(Mask))))
3406 return false;
3407
3408 auto *II0 = dyn_cast<IntrinsicInst>(V0);
3409 if (!II0)
3410 return false;
3411
3412 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
3413 auto *IntrinsicSrcTy = dyn_cast<FixedVectorType>(II0->getType());
3414 if (!ShuffleDstTy || !IntrinsicSrcTy)
3415 return false;
3416
3417 // Validate it's a pure permute, mask should only reference the first vector
3418 unsigned NumSrcElts = IntrinsicSrcTy->getNumElements();
3419 if (any_of(Mask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
3420 return false;
3421
3422 Intrinsic::ID IID = II0->getIntrinsicID();
3423 if (!isTriviallyVectorizable(IID))
3424 return false;
3425
3426 // Cost analysis
3428 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind);
3429 InstructionCost OldCost =
3432 IntrinsicSrcTy, Mask, CostKind, 0, nullptr, {V0}, &I);
3433
// New sequence: the intrinsic is called on arguments whose types are collected
// in NewArgsTy. Note: the loop index I shadows the Instruction parameter I
// inside each of the argument loops in this function.
3434 SmallVector<Type *> NewArgsTy;
3435 InstructionCost NewCost = 0;
3436 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3438 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
3439 } else {
// Shuffled argument type: same element type, element count of the shuffle
// result.
3440 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
3441 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
3442 ShuffleDstTy->getNumElements());
3443 NewArgsTy.push_back(ArgTy);
3445 ArgTy, VecTy, Mask, CostKind, 0, nullptr,
3446 {II0->getArgOperand(I)});
3447 }
3448 }
3449 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
3450 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
3451
3452 // If the intrinsic has multiple uses, we need to account for the cost of
3453 // keeping the original intrinsic around.
3454 if (!II0->hasOneUse())
3455 NewCost += IntrinsicCost;
3456
3457 LLVM_DEBUG(dbgs() << "Found a permute of intrinsic: " << I << "\n OldCost: "
3458 << OldCost << " vs NewCost: " << NewCost << "\n");
3459
// Equal cost still folds.
3460 if (NewCost > OldCost)
3461 return false;
3462
3463 // Transform
3464 SmallVector<Value *> NewArgs;
3465 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
3467 NewArgs.push_back(II0->getArgOperand(I));
3468 } else {
// Permute this argument with the original mask and queue the new shuffle for
// further combines.
3469 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I), Mask);
3470 NewArgs.push_back(Shuf);
3471 Worklist.pushValue(Shuf);
3472 }
3473 }
3474
3475 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
3476
// Carry the IR flags of the original call over to the new one.
3477 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic))
3478 NewInst->copyIRFlags(II0);
3479
3480 replaceValue(I, *NewIntrinsic);
3481 return true;
3482}
3483
3484using InstLane = std::pair<Value *, int>;
3485
3486static InstLane lookThroughShuffles(Value *V, int Lane) {
3487 while (auto *SV = dyn_cast<ShuffleVectorInst>(V)) {
3488 unsigned NumElts =
3489 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
3490 int M = SV->getMaskValue(Lane);
3491 if (M < 0)
3492 return {nullptr, PoisonMaskElem};
3493 if (static_cast<unsigned>(M) < NumElts) {
3494 V = SV->getOperand(0);
3495 Lane = M;
3496 } else {
3497 V = SV->getOperand(1);
3498 Lane = M - NumElts;
3499 }
3500 }
3501 return InstLane{V, Lane};
3502}
3503
// NOTE(review): the header of this helper (its signature and the declaration
// of NItem) is not present in this listing; only the body below is visible.
// From the body: for every (value, lane) entry of Item it produces the entry
// that feeds operand Op of that value, looking through any shuffles on the
// way, and returns the resulting list in NItem.
3507 for (InstLane IL : Item) {
3508 auto [U, Lane] = IL;
// A null value is mapped to {nullptr, PoisonMaskElem}; everything else is
// assumed to be an Instruction (enforced by the cast).
3509 InstLane OpLane =
3510 U ? lookThroughShuffles(cast<Instruction>(U)->getOperand(Op), Lane)
3511 : InstLane{nullptr, PoisonMaskElem};
3512 NItem.emplace_back(OpLane);
3513 }
3514 return NItem;
3515 }
3516
3517/// Detect concat of multiple values into a vector
3519 const TargetTransformInfo &TTI) {
3520 auto *Ty = cast<FixedVectorType>(Item.front().first->getType());
3521 unsigned NumElts = Ty->getNumElements();
3522 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
3523 return false;
3524
3525 // Check that the concat is free, usually meaning that the type will be split
3526 // during legalization.
3527 SmallVector<int, 16> ConcatMask(NumElts * 2);
3528 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
3529 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
3530 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
3531 Ty, ConcatMask, CostKind) != 0)
3532 return false;
3533
3534 unsigned NumSlices = Item.size() / NumElts;
3535 // Currently we generate a tree of shuffles for the concats, which limits us
3536 // to a power2.
3537 if (!isPowerOf2_32(NumSlices))
3538 return false;
3539 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
3540 Value *SliceV = Item[Slice * NumElts].first;
3541 if (!SliceV || SliceV->getType() != Ty)
3542 return false;
3543 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
3544 auto [V, Lane] = Item[Slice * NumElts + Elt];
3545 if (Lane != static_cast<int>(Elt) || SliceV != V)
3546 return false;
3547 }
3548 }
3549 return true;
3550}
3551
// Rebuild, at Builder's insertion point, the value described by Item (one
// (value, lane) entry per output lane) as reached through the use From.
// Leaves are looked up by the pair (first lane's value, From):
//  - in IdentityLeafs the first lane's value is returned unchanged;
//  - in SplatLeafs a single-source shuffle broadcasting FrontLane is created;
//  - in ConcatLeafs the slice vectors are joined pairwise by shuffles.
// Anything else is treated as an instruction and re-created from its
// operands, with IR flags propagated from every non-null lane.
3552 static Value *
// NOTE(review): the line carrying the function name and its first parameters
// is absent from this listing. The body uses Item, From and Ty, which are not
// declared on any visible line.
3554 const DenseSet<std::pair<Value *, Use *>> &IdentityLeafs,
3555 const DenseSet<std::pair<Value *, Use *>> &SplatLeafs,
3556 const DenseSet<std::pair<Value *, Use *>> &ConcatLeafs,
3557 IRBuilderBase &Builder, const TargetTransformInfo *TTI) {
3558 auto [FrontV, FrontLane] = Item.front();
3559
3560 if (IdentityLeafs.contains(std::make_pair(FrontV, From))) {
3561 return FrontV;
3562 }
3563 if (SplatLeafs.contains(std::make_pair(FrontV, From))) {
// Every output lane reads FrontLane of FrontV.
3564 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
3565 return Builder.CreateShuffleVector(FrontV, Mask);
3566 }
3567 if (ConcatLeafs.contains(std::make_pair(FrontV, From))) {
3568 unsigned NumElts =
3569 cast<FixedVectorType>(FrontV->getType())->getNumElements();
// Collect the vector that starts each NumElts-wide slice of Item.
3570 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
3571 for (unsigned S = 0; S < Values.size(); ++S)
3572 Values[S] = Item[S * NumElts].first;
3573
// Join neighbouring vectors with an iota-mask two-source shuffle, halving the
// number of values and doubling their width on every round.
3574 while (Values.size() > 1) {
3575 NumElts *= 2;
3576 SmallVector<int, 16> Mask(NumElts, 0);
3577 std::iota(Mask.begin(), Mask.end(), 0);
3578 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
3579 for (unsigned S = 0; S < NewValues.size(); ++S)
3580 NewValues[S] =
3581 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
3582 Values = NewValues;
3583 }
3584 return Values[0];
3585 }
3586
3587 auto *I = cast<Instruction>(FrontV);
3588 auto *II = dyn_cast<IntrinsicInst>(I);
// For intrinsic calls the last operand is excluded from the operands that are
// regenerated.
3589 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
// NOTE(review): the declaration of Ops is absent from this listing.
3591 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
3592 if (II &&
3593 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
// Scalar intrinsic arguments are reused as they are.
3594 Ops[Idx] = II->getOperand(Idx);
3595 continue;
3596 }
// NOTE(review): the head of the statement continued on the next two lines is
// absent from this listing; judging by the trailing arguments it is
// presumably the recursive call that fills Ops[Idx] -- confirm against the
// full source.
3598 &I->getOperandUse(Idx), Ty, IdentityLeafs,
3599 SplatLeafs, ConcatLeafs, Builder, TTI);
3600 }
3601
// Gather every non-null lane value so IR flags can be propagated from all of
// them onto the newly created instruction.
3602 SmallVector<Value *, 8> ValueList;
3603 for (const auto &Lane : Item)
3604 if (Lane.first)
3605 ValueList.push_back(Lane.first);
3606
// Result type used by the cast and intrinsic cases: I's scalar type with
// Ty's element count.
3607 Type *DstTy =
3608 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
3609 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3610 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3611 Ops[0], Ops[1]);
3612 propagateIRFlags(Value, ValueList);
3613 return Value;
3614 }
3615 if (auto *CI = dyn_cast<CmpInst>(I)) {
3616 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3617 propagateIRFlags(Value, ValueList);
3618 return Value;
3619 }
3620 if (auto *SI = dyn_cast<SelectInst>(I)) {
3621 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3622 propagateIRFlags(Value, ValueList);
3623 return Value;
3624 }
3625 if (auto *CI = dyn_cast<CastInst>(I)) {
3626 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3627 propagateIRFlags(Value, ValueList);
3628 return Value;
3629 }
3630 if (II) {
3631 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3632 propagateIRFlags(Value, ValueList);
3633 return Value;
3634 }
// Only unary instructions are expected to reach this point.
3635 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3636 auto *Value =
3637 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3638 propagateIRFlags(Value, ValueList);
3639 return Value;
3640 }
3641
3642 // Starting from a shuffle, look up through operands tracking the shuffled index
3643 // of each lane. If we can simplify away the shuffles to identities then
3644 // do so.
//
// Outline, as far as it is visible here: for a fixed-vector I with at least
// one use, compute for each result lane the (value, lane) reached through any
// shuffles, then process a worklist of such lane lists. Each list is either
// classified as an identity, splat or free-concat leaf, or is an operation
// this function knows how to look through. Anything else, or visiting more
// than MaxInstrsToScan lists, returns false. On success the tree is
// regenerated in front of I and I is replaced.
//
// NOTE(review): several lines of this function are absent from this listing
// (the declaration of Worklist, part of IsEquiv, and the heads of the
// statements that end in "...getOperandUse(N));"). The notes below mark each
// gap.
3645 bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3646 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3647 if (!Ty || I.use_empty())
3648 return false;
3649
// One entry per result lane of I, already traced through any shuffles.
3650 SmallVector<InstLane> Start(Ty->getNumElements());
3651 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3652 Start[M] = lookThroughShuffles(&I, M);
3653
// NOTE(review): the declaration of Worklist is absent from this listing; from
// its use it holds pairs of (lane list, Use *).
3655 Worklist.push_back(std::make_pair(Start, &*I.use_begin()));
3656 DenseSet<std::pair<Value *, Use *>> IdentityLeafs, SplatLeafs, ConcatLeafs;
3657 unsigned NumVisited = 0;
3658
3659 while (!Worklist.empty()) {
3660 if (++NumVisited > MaxInstrsToScan)
3661 return false;
3662
3663 auto ItemFrom = Worklist.pop_back_val();
3664 auto Item = ItemFrom.first;
3665 auto From = ItemFrom.second;
3666 auto [FrontV, FrontLane] = Item.front();
3667
3668 // If we found an undef first lane then bail out to keep things simple.
3669 if (!FrontV)
3670 return false;
3671
3672 // Helper to peek through bitcasts to the same value.
3673 auto IsEquiv = [&](Value *X, Value *Y) {
3674 return X->getType() == Y->getType() &&
// NOTE(review): the right-hand side of this && is absent from this listing.
3676 };
3677
3678 // Look for an identity value.
// Lane 0 of a value as wide as the result, with every other non-null lane N
// reading lane N of an equivalent value.
3679 if (FrontLane == 0 &&
3680 cast<FixedVectorType>(FrontV->getType())->getNumElements() ==
3681 Ty->getNumElements() &&
3682 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3683 Value *FrontV = Item.front().first;
3684 return !E.value().first || (IsEquiv(E.value().first, FrontV) &&
3685 E.value().second == (int)E.index());
3686 })) {
3687 IdentityLeafs.insert(std::make_pair(FrontV, From));
3688 continue;
3689 }
3690 // Look for constants, for the moment only supporting constant splats.
3691 if (auto *C = dyn_cast<Constant>(FrontV);
3692 C && C->getSplatValue() &&
3693 all_of(drop_begin(Item), [Item](InstLane &IL) {
3694 Value *FrontV = Item.front().first;
3695 Value *V = IL.first;
3696 return !V || (isa<Constant>(V) &&
3697 cast<Constant>(V)->getSplatValue() ==
3698 cast<Constant>(FrontV)->getSplatValue());
3699 })) {
3700 SplatLeafs.insert(std::make_pair(FrontV, From));
3701 continue;
3702 }
3703 // Look for a splat value.
// Every non-null lane reads the same lane of the same value as the first one.
3704 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3705 auto [FrontV, FrontLane] = Item.front();
3706 auto [V, Lane] = IL;
3707 return !V || (V == FrontV && Lane == FrontLane);
3708 })) {
3709 SplatLeafs.insert(std::make_pair(FrontV, From));
3710 continue;
3711 }
3712
3713 // We need each element to be the same type of value, and check that each
3714 // element has a single use.
3715 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3716 Value *FrontV = Item.front().first;
3717 if (!IL.first)
3718 return true;
3719 Value *V = IL.first;
3720 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3721 return false;
3722 if (V->getValueID() != FrontV->getValueID())
3723 return false;
// Compares must agree on predicate, casts on scalar source type, and selects
// must have a vector condition of the same type as the first lane's select.
3724 if (auto *CI = dyn_cast<CmpInst>(V))
3725 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3726 return false;
3727 if (auto *CI = dyn_cast<CastInst>(V))
3728 if (CI->getSrcTy()->getScalarType() !=
3729 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3730 return false;
3731 if (auto *SI = dyn_cast<SelectInst>(V))
3732 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3733 SI->getOperand(0)->getType() !=
3734 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3735 return false;
// Calls are only accepted when they are intrinsics with the same ID and no
// operand bundles.
3736 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3737 return false;
3738 auto *II = dyn_cast<IntrinsicInst>(V);
3739 return !II || (isa<IntrinsicInst>(FrontV) &&
3740 II->getIntrinsicID() ==
3741 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3742 !II->hasOperandBundles());
3743 };
3744 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3745 // Check the operator is one that we support.
3746 if (isa<BinaryOperator, CmpInst>(FrontV)) {
3747 // We exclude div/rem in case they hit UB from poison lanes.
3748 if (auto *BO = dyn_cast<BinaryOperator>(FrontV);
3749 BO && BO->isIntDivRem())
3750 return false;
// NOTE(review): here and in the branches below, the head of each statement
// that ends in "...getOperandUse(N));" is absent from this listing.
// Presumably each one queues the lane list for operand N on Worklist --
// confirm against the full source.
3752 &cast<Instruction>(FrontV)->getOperandUse(0));
3754 &cast<Instruction>(FrontV)->getOperandUse(1));
3755 continue;
3756 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3757 FPToUIInst, SIToFPInst, UIToFPInst>(FrontV)) {
3759 &cast<Instruction>(FrontV)->getOperandUse(0));
3760 continue;
3761 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontV)) {
3762 // TODO: Handle vector widening/narrowing bitcasts.
3763 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3764 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3765 if (DstTy && SrcTy &&
3766 SrcTy->getNumElements() == DstTy->getNumElements()) {
3768 &BitCast->getOperandUse(0));
3769 continue;
3770 }
3771 } else if (auto *Sel = dyn_cast<SelectInst>(FrontV)) {
3773 &Sel->getOperandUse(0));
3775 &Sel->getOperandUse(1));
3777 &Sel->getOperandUse(2));
3778 continue;
3779 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontV);
3780 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3781 !II->hasOperandBundles()) {
// The last operand of the intrinsic call is not visited.
3782 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3783 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3784 &TTI)) {
// A scalar argument must be the very same value on every non-null lane.
3785 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3786 Value *FrontV = Item.front().first;
3787 Value *V = IL.first;
3788 return !V || (cast<Instruction>(V)->getOperand(Op) ==
3789 cast<Instruction>(FrontV)->getOperand(Op));
3790 }))
3791 return false;
3792 continue;
3793 }
3795 &cast<Instruction>(FrontV)->getOperandUse(Op));
3796 }
3797 continue;
3798 }
3799 }
3800
3801 if (isFreeConcat(Item, CostKind, TTI)) {
3802 ConcatLeafs.insert(std::make_pair(FrontV, From));
3803 continue;
3804 }
3805
3806 return false;
3807 }
3808
// Only the starting lane list was processed, so no operation was looked
// through; bail out in that case.
3809 if (NumVisited <= 1)
3810 return false;
3811
3812 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3813
3814 // If we got this far, we know the shuffles are superfluous and can be
3815 // removed. Scan through again and generate the new tree of instructions.
3816 Builder.SetInsertPoint(&I);
3817 Value *V = generateNewInstTree(Start, &*I.use_begin(), Ty, IdentityLeafs,
3818 SplatLeafs, ConcatLeafs, Builder, &TTI);
3819 replaceValue(I, *V);
3820 return true;
3821 }
3822
3823/// Given a commutative reduction, the order of the input lanes does not alter
3824/// the results. We can use this to remove certain shuffles feeding the
3825/// reduction, removing the need to shuffle at all.
3826bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3827 auto *II = dyn_cast<IntrinsicInst>(&I);
3828 if (!II)
3829 return false;
3830 switch (II->getIntrinsicID()) {
3831 case Intrinsic::vector_reduce_add:
3832 case Intrinsic::vector_reduce_mul:
3833 case Intrinsic::vector_reduce_and:
3834 case Intrinsic::vector_reduce_or:
3835 case Intrinsic::vector_reduce_xor:
3836 case Intrinsic::vector_reduce_smin:
3837 case Intrinsic::vector_reduce_smax:
3838 case Intrinsic::vector_reduce_umin:
3839 case Intrinsic::vector_reduce_umax:
3840 break;
3841 default:
3842 return false;
3843 }
3844
3845 // Find all the inputs when looking through operations that do not alter the
3846 // lane order (binops, for example). Currently we look for a single shuffle,
3847 // and can ignore splat values.
3848 std::queue<Value *> Worklist;
3849 SmallPtrSet<Value *, 4> Visited;
3850 ShuffleVectorInst *Shuffle = nullptr;
3851 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3852 Worklist.push(Op);
3853
3854 while (!Worklist.empty()) {
3855 Value *CV = Worklist.front();
3856 Worklist.pop();
3857 if (Visited.contains(CV))
3858 continue;
3859
3860 // Splats don't change the order, so can be safely ignored.
3861 if (isSplatValue(CV))
3862 continue;
3863
3864 Visited.insert(CV);
3865
3866 if (auto *CI = dyn_cast<Instruction>(CV)) {
3867 if (CI->isBinaryOp()) {
3868 for (auto *Op : CI->operand_values())
3869 Worklist.push(Op);
3870 continue;
3871 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3872 if (Shuffle && Shuffle != SV)
3873 return false;
3874 Shuffle = SV;
3875 continue;
3876 }
3877 }
3878
3879 // Anything else is currently an unknown node.
3880 return false;
3881 }
3882
3883 if (!Shuffle)
3884 return false;
3885
3886 // Check all uses of the binary ops and shuffles are also included in the
3887 // lane-invariant operations (Visited should be the list of lanewise
3888 // instructions, including the shuffle that we found).
3889 for (auto *V : Visited)
3890 for (auto *U : V->users())
3891 if (!Visited.contains(U) && U != &I)
3892 return false;
3893
3894 FixedVectorType *VecType =
3895 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3896 if (!VecType)
3897 return false;
3898 FixedVectorType *ShuffleInputType =
3900 if (!ShuffleInputType)
3901 return false;
3902 unsigned NumInputElts = ShuffleInputType->getNumElements();
3903
3904 // Find the mask from sorting the lanes into order. This is most likely to
3905 // become a identity or concat mask. Undef elements are pushed to the end.
3906 SmallVector<int> ConcatMask;
3907 Shuffle->getShuffleMask(ConcatMask);
3908 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3909 bool UsesSecondVec =
3910 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3911
3913 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3914 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3916 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3917 ShuffleInputType, ConcatMask, CostKind);
3918
3919 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3920 << "\n");
3921 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3922 << "\n");
3923 bool MadeChanges = false;
3924 if (NewCost < OldCost) {
3925 Builder.SetInsertPoint(Shuffle);
3926 Value *NewShuffle = Builder.CreateShuffleVector(
3927 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3928 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3929 replaceValue(*Shuffle, *NewShuffle);
3930 return true;
3931 }
3932
3933 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3934 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3935 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3936 return MadeChanges;
3937}
3938
3939 /// Try to fold a chain of shuffles and ops feeding extractelement(..., 0)
3940 /// into llvm.vector.reduce.*, by tracking which lanes contribute to the
3941 /// extracted lane and reducing the widest vector whose lanes each contribute
3942 /// once.
3943 ///
3944 /// For example:
3945 ///
3946 /// %lo = shufflevector <4 x i32> %a, poison, <2 x i32> <i32 0, i32 1>
3947 /// %hi = shufflevector <4 x i32> %a, poison, <2 x i32> <i32 2, i32 3>
3948 /// %s = add <2 x i32> %lo, %hi
3949 /// %sh = shufflevector <2 x i32> %s, poison, <2 x i32> <i32 1, i32 poison>
3950 /// %r = add <2 x i32> %s, %sh
3951 /// %e = extractelement <2 x i32> %r, i64 0
3952 ///
3953 /// transforms to:
3954 ///
3955 /// %e = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
// NOTE(review): a few lines of this function are absent from this listing;
// each gap is marked with a note below.
3956 bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3957 Value *VecOpEE;
3958 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3959 return false;
3960
3961 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3962 if (!FVT)
3963 return false;
3964
3965 if (FVT->getNumElements() < 2)
3966 return false;
3967
// Exactly one of these is set below: the root is either a binary operator
// that has a matching reduction, or a min/max intrinsic.
3968 std::optional<Instruction::BinaryOps> CommonBinOp;
3969 std::optional<Intrinsic::ID> CommonCallOp;
3970
3971 if (auto *BO = dyn_cast<BinaryOperator>(VecOpEE)) {
3972 if (!getReductionForBinop(BO->getOpcode()))
3973 return false;
3974 CommonBinOp = BO->getOpcode();
3975 } else if (auto *MMI = dyn_cast<MinMaxIntrinsic>(VecOpEE)) {
3976 CommonCallOp = MMI->getIntrinsicID();
3977 } else {
3978 return false;
3979 }
3980
3981 // For floating-point reductions, track FMF intersection across all binops.
3982 FastMathFlags CommonFMF;
3983 bool IsFloatReduction = false;
3984
3985 // A chain node is one we walk through, either a matching-opcode binop/min-max
3986 // or a single-source shuffle. Anything else is a leaf source.
3987 auto IsChainNode = [&](Value *V) {
3988 if (auto *BO = dyn_cast<BinaryOperator>(V))
3989 return CommonBinOp && BO->getOpcode() == *CommonBinOp;
3990 if (auto *MMI = dyn_cast<MinMaxIntrinsic>(V))
3991 return CommonCallOp && MMI->getIntrinsicID() == *CommonCallOp;
3992 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V))
3993 return isa<PoisonValue>(SVI->getOperand(1));
3994 return false;
3995 };
3996
3997 // Collect the chain, building Nodes in postorder. Bail if the chain is empty
3998 // or exceeds MaxChainNodes.
3999 constexpr unsigned MaxChainNodes = 32;
4000 SmallSetVector<Value *, 16> Nodes;
4001 SmallSetVector<Value *, 4> Sources;
4002 unsigned NumVisited = 0;
// Records V as a leaf source; fails for anything that is not a fixed vector.
4003 auto AddSource = [&](Value *V) {
4004 if (!isa<FixedVectorType>(V->getType()))
4005 return false;
4006 Sources.insert(V);
4007 return true;
4008 };
// Recursive walk; the lambda receives itself as its second argument so that
// it can recurse.
4009 auto Walk = [&](Value *V, auto &&Walk) -> bool {
4010 if (Nodes.contains(V) || Sources.contains(V))
4011 return true;
4012 if (++NumVisited > MaxChainNodes)
4013 return false;
4014 if (!IsChainNode(V))
4015 return AddSource(V);
4016 // Chain shuffles always have poison as op1, so only op0 matters.
4017 auto *U = cast<Instruction>(V);
4018 unsigned NumOps = isa<ShuffleVectorInst>(U) ? 1 : 2;
4019 for (unsigned I = 0; I != NumOps; ++I)
4020 if (!Walk(U->getOperand(I), Walk))
4021 return false;
// Operands are inserted before V itself, which is what yields the postorder.
4022 if (isa<ShuffleVectorInst>(U) || Nodes.contains(U->getOperand(0)) ||
4023 Nodes.contains(U->getOperand(1))) {
4024 Nodes.insert(V);
4025 return true;
4026 }
4027 // Both operands are leaves so treat this binop as a source rather than
4028 // walking into it.
4029 return AddSource(V);
4030 };
4031 if (!Walk(VecOpEE, Walk) || Nodes.empty())
4032 return false;
4033
// Min/max calls, and binops reported idempotent by Instruction::isIdempotent,
// tolerate a lane contributing more than once.
4034 bool IsIdempotent =
4035 CommonCallOp || (CommonBinOp && Instruction::isIdempotent(*CommonBinOp));
4036
4037 // For FP reductions, require reassoc on every binop and collect FMF.
4038 for (Value *V : Nodes) {
4039 auto *BinOp = dyn_cast<BinaryOperator>(V);
4040 if (!BinOp || !BinOp->getType()->isFPOrFPVectorTy())
4041 continue;
4042 if (!BinOp->hasAllowReassoc())
4043 return false;
4044 if (!IsFloatReduction) {
4045 CommonFMF = BinOp->getFastMathFlags();
4046 IsFloatReduction = true;
4047 } else {
4048 CommonFMF &= BinOp->getFastMathFlags();
4049 }
4050 }
4051
4052 // Top-down demanded elements. For each chain value, track which lanes feed
4053 // the extracted lane 0 and which feed it more than once. Reverse postorder
4054 // visits every use before its value. A binop forwards its demand to both
4055 // operands and a shuffle follows its mask back to the source lane.
4056 struct Demand {
4057 APInt Lanes;
4058 APInt Duplicates;
4059 };
4060 DenseMap<Value *, Demand> Demands;
// Returns the entry for V, (re)initialising both masks to zero when their
// width does not match V's element count (as for a freshly created entry).
4061 auto DemandOf = [&](Value *V) -> Demand & {
4062 unsigned N = cast<FixedVectorType>(V->getType())->getNumElements();
4063 Demand &D = Demands[V];
4064 if (D.Lanes.getBitWidth() != N)
4065 D.Lanes = D.Duplicates = APInt::getZero(N);
4066 return D;
4067 };
// Seed: only lane 0 of the extracted vector is demanded.
4068 DemandOf(VecOpEE).Lanes.setBit(0);
4069 for (Value *V : reverse(Nodes)) {
4070 Demand DV = Demands.lookup(V);
4071 if (DV.Lanes.isZero())
4072 continue;
4073 if (auto *SVI = dyn_cast<ShuffleVectorInst>(V)) {
4074 ArrayRef<int> Mask = SVI->getShuffleMask();
4075 Demand &DS = DemandOf(SVI->getOperand(0));
4076 for (unsigned I = 0, E = Mask.size(); I != E; ++I) {
4077 // Skip lanes that are undemanded or map to poison.
4078 if (!DV.Lanes[I] || Mask[I] < 0 ||
4079 (unsigned)Mask[I] >= DS.Lanes.getBitWidth())
4080 continue;
// A source lane becomes a duplicate when it was already demanded or when the
// demanding lane is itself a duplicate.
4081 if (DS.Lanes[Mask[I]] || DV.Duplicates[I])
4082 DS.Duplicates.setBit(Mask[I]);
4083 DS.Lanes.setBit(Mask[I]);
4084 }
4085 } else {
4086 auto *U = cast<User>(V);
4087 for (Value *Op : {U->getOperand(0), U->getOperand(1)}) {
4088 Demand &DOp = DemandOf(Op);
4089 // Lanes demanded through more than one path accumulate in Duplicates.
4090 DOp.Duplicates |= DV.Duplicates | (DOp.Lanes & DV.Lanes);
4091 DOp.Lanes |= DV.Lanes;
4092 }
4093 }
4094 }
4095
4096 // Reducing V replaces the entire chain, so every contribution to the result
4097 // must flow through V. Reject if anything above V reads outside the chain.
4098 auto CoversChain = [&](Value *V) {
4099 SmallVector<Value *, 8> Worklist(1, VecOpEE);
4100 SmallPtrSet<Value *, 8> Seen;
4101 Seen.insert(VecOpEE);
4102 while (!Worklist.empty()) {
4103 auto *U = cast<Instruction>(Worklist.pop_back_val());
4104 unsigned NumOps = isa<ShuffleVectorInst>(U) ? 1 : 2;
4105 for (unsigned I = 0; I != NumOps; ++I) {
4106 Value *Op = U->getOperand(I);
// The walk stops at V and never revisits an operand.
4107 if (Op == V || !Seen.insert(Op).second)
4108 continue;
4109 if (!Nodes.contains(Op))
4110 return false;
4111 Worklist.push_back(Op);
4112 }
4113 }
4114 return true;
4115 };
4116
4117 // Reduce a single cleanly demanded source if there is one, otherwise the
4118 // deepest intermediate that covers the chain.
4119 struct ReductionCut {
4120 Value *Src;
4121 APInt Elts;
4122 };
4123 std::optional<ReductionCut> Cut;
4124 for (Value *S : Sources) {
4125 auto It = Demands.find(S);
4126 if (It == Demands.end() || It->second.Lanes.isZero())
4127 continue;
// A second demanded source, or duplicated lanes for a non-idempotent op,
// rules out cutting at a source.
4128 if (Cut || (!IsIdempotent && !It->second.Duplicates.isZero())) {
4129 Cut.reset();
4130 break;
4131 }
4132 Cut = ReductionCut{S, It->second.Lanes};
4133 }
4134 if (!Cut) {
4135 for (Value *V : Nodes) {
// NOTE(review): the condition guarding the following `continue` is absent
// from this listing.
4137 continue;
4138 auto It = Demands.find(V);
4139 if (It == Demands.end() || !It->second.Lanes.isAllOnes())
4140 continue;
4141 if (!IsIdempotent && !It->second.Duplicates.isZero())
4142 continue;
4143 if (!CoversChain(V))
4144 continue;
4145 Cut = ReductionCut{V, It->second.Lanes};
4146 break;
4147 }
4148 }
4149 // Reducing one lane is just an extract and can refold forever.
4150 if (!Cut || Cut->Elts.popcount() < 2)
4151 return false;
4152
4153 Intrinsic::ID ReducedOp =
4154 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
4155 : getReductionForBinop(*CommonBinOp));
4156 if (!ReducedOp)
4157 return false;
4158
4159 InstructionCost OrigCost = 0;
4160 for (Value *V : Nodes)
// NOTE(review): the body of this loop is absent from this listing; presumably
// it accumulates the cost of each chain node into OrigCost -- confirm against
// the full source.
4162
4163 auto *SrcVT = cast<FixedVectorType>(Cut->Src->getType());
// When only some lanes of the source are demanded, they are first extracted
// into a narrower vector of popcount(Elts) elements.
4164 bool IsPartialReduction = !Cut->Elts.isAllOnes();
4165 FixedVectorType *ReduceVecTy =
4166 IsPartialReduction
4167 ? FixedVectorType::get(FVT->getElementType(), Cut->Elts.popcount())
4168 : SrcVT;
4169
4170 SmallVector<int> ExtractMask;
4171 InstructionCost NewCost = 0;
4172 if (IsPartialReduction) {
4173 for (unsigned I = 0, E = Cut->Elts.getBitWidth(); I != E; ++I)
4174 if (Cut->Elts[I])
4175 ExtractMask.push_back(I);
4176 unsigned SubIdx = 0, SubLen;
4177 auto SK = Cut->Elts.isShiftedMask(SubIdx, SubLen)
// NOTE(review): the two arms of this conditional expression (the shuffle
// kinds chosen for contiguous and non-contiguous lane sets) are absent from
// this listing.
4180 NewCost += TTI.getShuffleCost(SK, ReduceVecTy, SrcVT, ExtractMask, CostKind,
4181 SubIdx, ReduceVecTy);
4182 }
4183
// Floating-point reductions take an extra scalar start-value argument, hence
// the two-element argument type list.
4184 IntrinsicCostAttributes ICA(
4185 ReducedOp, ReduceVecTy->getElementType(),
4186 IsFloatReduction
4187 ? SmallVector<Type *, 2>{ReduceVecTy->getElementType(), ReduceVecTy}
4188 : SmallVector<Type *, 2>{ReduceVecTy},
4189 IsFloatReduction ? CommonFMF : FastMathFlags());
4190 NewCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
4191
4192 LLVM_DEBUG(dbgs() << "Found reduction shuffle chain: " << I << "\n OldCost : "
4193 << OrigCost << " vs NewCost: " << NewCost << "\n");
4194
4195 if (!OrigCost.isValid() || !NewCost.isValid())
4196 return false;
4197
// With a single use of the root a tie is accepted; with further uses the new
// form must be strictly cheaper.
4198 if (VecOpEE->hasOneUse() ? (NewCost > OrigCost) : (NewCost >= OrigCost))
4199 return false;
4200
4201 Value *ReduceInput = Cut->Src;
4202 if (IsPartialReduction)
4203 ReduceInput = Builder.CreateShuffleVector(Cut->Src, ExtractMask);
4204
4205 Value *ReducedResult;
4206 if (IsFloatReduction) {
// NOTE(review): the head of the statement that initialises `Identity` (used
// as the start value two lines below) is absent from this listing.
4208 *CommonBinOp, ReduceVecTy->getElementType(), /*AllowRHSConstant=*/false,
4209 CommonFMF.noSignedZeros());
4210 ReducedResult = Builder.CreateIntrinsic(ReducedOp, {ReduceVecTy},
4211 {Identity, ReduceInput}, CommonFMF);
4212 } else {
4213 ReducedResult =
4214 Builder.CreateIntrinsic(ReducedOp, {ReduceVecTy}, {ReduceInput});
4215 }
4216 replaceValue(I, *ReducedResult);
4217
4218 return true;
4219 }
4220
4221/// Determine if its more efficient to fold:
4222/// reduce(trunc(x)) -> trunc(reduce(x)).
4223/// reduce(sext(x)) -> sext(reduce(x)).
4224/// reduce(zext(x)) -> zext(reduce(x)).
4225bool VectorCombine::foldCastFromReductions(Instruction &I) {
4226 auto *II = dyn_cast<IntrinsicInst>(&I);
4227 if (!II)
4228 return false;
4229
4230 bool TruncOnly = false;
4231 Intrinsic::ID IID = II->getIntrinsicID();
4232 switch (IID) {
4233 case Intrinsic::vector_reduce_add:
4234 case Intrinsic::vector_reduce_mul:
4235 TruncOnly = true;
4236 break;
4237 case Intrinsic::vector_reduce_and:
4238 case Intrinsic::vector_reduce_or:
4239 case Intrinsic::vector_reduce_xor:
4240 break;
4241 default:
4242 return false;
4243 }
4244
4245 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4246 Value *ReductionSrc = I.getOperand(0);
4247
4248 Value *Src;
4249 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
4250 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
4251 return false;
4252
4253 auto CastOpc =
4254 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
4255
4256 auto *SrcTy = cast<VectorType>(Src->getType());
4257 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
4258 Type *ResultTy = I.getType();
4259
4261 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
4262 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
4264 cast<CastInst>(ReductionSrc));
4265 InstructionCost NewCost =
4266 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
4267 CostKind) +
4268 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
4270
4271 if (OldCost <= NewCost || !NewCost.isValid())
4272 return false;
4273
4274 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
4275 II->getIntrinsicID(), {Src});
4276 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
4277 replaceValue(I, *NewCast);
4278 return true;
4279}
4280
4281/// Fold:
4282/// icmp pred (reduce.{add,or,and,umax,umin}(signbit_extract(x))), C
4283/// into:
4284/// icmp sgt/slt (reduce.{or,umax,and,umin}(x)), -1/0
4285///
4286/// Sign-bit reductions produce values with known semantics:
4287/// - reduce.{or,umax}: 0 if no element is negative, 1 if any is
4288/// - reduce.{and,umin}: 1 if all elements are negative, 0 if any isn't
4289/// - reduce.add: count of negative elements (0 to NumElts)
4290///
4291/// Both lshr and ashr are supported:
4292/// - lshr produces 0 or 1, so reduce.add range is [0, N]
4293/// - ashr produces 0 or -1, so reduce.add range is [-N, 0]
4294///
4295/// The fold generalizes to multiple source vectors combined with the same
4296/// operation as the reduction. For example:
4297/// reduce.or(or(shr A, shr B)) conceptually extends the vector
4298/// For reduce.add, this changes the count to M*N where M is the number of
4299/// source vectors.
4300///
4301/// We transform to a direct sign check on the original vector using
4302/// reduce.{or,umax} or reduce.{and,umin}.
4303///
4304/// In spirit, it's similar to foldSignBitCheck in InstCombine.
4305bool VectorCombine::foldSignBitReductionCmp(Instruction &I) {
4306 CmpPredicate Pred;
4307 IntrinsicInst *ReduceOp;
4308 const APInt *CmpVal;
4309 if (!match(&I,
4310 m_ICmp(Pred, m_OneUse(m_AnyIntrinsic(ReduceOp)), m_APInt(CmpVal))))
4311 return false;
4312
4313 Intrinsic::ID OrigIID = ReduceOp->getIntrinsicID();
4314 switch (OrigIID) {
4315 case Intrinsic::vector_reduce_or:
4316 case Intrinsic::vector_reduce_umax:
4317 case Intrinsic::vector_reduce_and:
4318 case Intrinsic::vector_reduce_umin:
4319 case Intrinsic::vector_reduce_add:
4320 break;
4321 default:
4322 return false;
4323 }
4324
4325 Value *ReductionSrc = ReduceOp->getArgOperand(0);
4326 auto *VecTy = dyn_cast<FixedVectorType>(ReductionSrc->getType());
4327 if (!VecTy)
4328 return false;
4329
4330 unsigned BitWidth = VecTy->getScalarSizeInBits();
4331 if (BitWidth == 1)
4332 return false;
4333
4334 unsigned NumElts = VecTy->getNumElements();
4335
4336 // Determine the expected tree opcode for multi-vector patterns.
4337 // The tree opcode must match the reduction's underlying operation.
4338 //
4339 // TODO: for pairs of equivalent operators, we should match both,
4340 // not only the most common.
4341 Instruction::BinaryOps TreeOpcode;
4342 switch (OrigIID) {
4343 case Intrinsic::vector_reduce_or:
4344 case Intrinsic::vector_reduce_umax:
4345 TreeOpcode = Instruction::Or;
4346 break;
4347 case Intrinsic::vector_reduce_and:
4348 case Intrinsic::vector_reduce_umin:
4349 TreeOpcode = Instruction::And;
4350 break;
4351 case Intrinsic::vector_reduce_add:
4352 TreeOpcode = Instruction::Add;
4353 break;
4354 default:
4355 llvm_unreachable("Unexpected intrinsic");
4356 }
4357
4358 // Collect sign-bit extraction leaves from an associative tree of TreeOpcode.
4359 // The tree conceptually extends the vector being reduced.
4360 SmallVector<Value *, 8> Worklist;
4361 SmallVector<Value *, 8> Sources; // Original vectors (X in shr X, BW-1)
4362 Worklist.push_back(ReductionSrc);
4363 std::optional<bool> IsAShr;
4364 constexpr unsigned MaxSources = 8;
4365
4366 // Calculate old cost: all shifts + tree ops + reduction
4367 InstructionCost OldCost = TTI.getInstructionCost(ReduceOp, CostKind);
4368
4369 while (!Worklist.empty() && Worklist.size() <= MaxSources &&
4370 Sources.size() <= MaxSources) {
4371 Value *V = Worklist.pop_back_val();
4372
4373 // Try to match sign-bit extraction: shr X, (bitwidth-1)
4374 Value *X;
4375 if (match(V, m_OneUse(m_Shr(m_Value(X), m_SpecificInt(BitWidth - 1))))) {
4376 auto *Shr = cast<Instruction>(V);
4377
4378 // All shifts must be the same type (all lshr or all ashr)
4379 bool ThisIsAShr = Shr->getOpcode() == Instruction::AShr;
4380 if (!IsAShr)
4381 IsAShr = ThisIsAShr;
4382 else if (*IsAShr != ThisIsAShr)
4383 return false;
4384
4385 Sources.push_back(X);
4386
4387 // As part of the fold, we remove all of the shifts, so we need to keep
4388 // track of their costs.
4389 OldCost += TTI.getInstructionCost(Shr, CostKind);
4390
4391 continue;
4392 }
4393
4394 // Try to extend through a tree node of the expected opcode
4395 Value *A, *B;
4396 if (!match(V, m_OneUse(m_BinOp(TreeOpcode, m_Value(A), m_Value(B)))))
4397 return false;
4398
4399 // We are potentially replacing these operations as well, so we add them
4400 // to the costs.
4402
4403 Worklist.push_back(A);
4404 Worklist.push_back(B);
4405 }
4406
4407 // Must have at least one source and not exceed limit
4408 if (Sources.empty() || Sources.size() > MaxSources ||
4409 Worklist.size() > MaxSources || !IsAShr)
4410 return false;
4411
4412 unsigned NumSources = Sources.size();
4413
4414 // For reduce.add, the total count must fit as a signed integer.
4415 // Range is [0, M*N] for lshr or [-M*N, 0] for ashr.
4416 if (OrigIID == Intrinsic::vector_reduce_add &&
4417 !isIntN(BitWidth, NumSources * NumElts))
4418 return false;
4419
4420 // Compute the boundary value when all elements are negative:
4421 // - Per-element contribution: 1 for lshr, -1 for ashr
4422 // - For add: M*N (total elements across all sources); for others: just 1
4423 unsigned Count =
4424 (OrigIID == Intrinsic::vector_reduce_add) ? NumSources * NumElts : 1;
4425 APInt NegativeVal(CmpVal->getBitWidth(), Count);
4426 if (*IsAShr)
4427 NegativeVal.negate();
4428
4429 // Range is [min(0, AllNegVal), max(0, AllNegVal)]
4430 APInt Zero = APInt::getZero(CmpVal->getBitWidth());
4431 APInt RangeLow = APIntOps::smin(Zero, NegativeVal);
4432 APInt RangeHigh = APIntOps::smax(Zero, NegativeVal);
4433
4434 // Determine comparison semantics:
4435 // - IsEq: true for equality test, false for inequality
4436 // - TestsNegative: true if testing against AllNegVal, false for zero
4437 //
4438 // In addition to EQ/NE against 0 or AllNegVal, we support inequalities
4439 // that fold to boundary tests given the narrow value range:
4440 // < RangeHigh -> != RangeHigh
4441 // > RangeHigh-1 -> == RangeHigh
4442 // > RangeLow -> != RangeLow
4443 // < RangeLow+1 -> == RangeLow
4444 //
4445 // For inequalities, we work with signed predicates only. Unsigned predicates
4446 // are canonicalized to signed when the range is non-negative (where they are
4447 // equivalent). When the range includes negative values, unsigned predicates
4448 // would have different semantics due to wrap-around, so we reject them.
4449 if (!ICmpInst::isEquality(Pred) && !ICmpInst::isSigned(Pred)) {
4450 if (RangeLow.isNegative())
4451 return false;
4452 Pred = ICmpInst::getSignedPredicate(Pred);
4453 }
4454
4455 bool IsEq;
4456 bool TestsNegative;
4457 if (ICmpInst::isEquality(Pred)) {
4458 if (CmpVal->isZero()) {
4459 TestsNegative = false;
4460 } else if (*CmpVal == NegativeVal) {
4461 TestsNegative = true;
4462 } else {
4463 return false;
4464 }
4465 IsEq = Pred == ICmpInst::ICMP_EQ;
4466 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeHigh) {
4467 IsEq = false;
4468 TestsNegative = (RangeHigh == NegativeVal);
4469 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeHigh - 1) {
4470 IsEq = true;
4471 TestsNegative = (RangeHigh == NegativeVal);
4472 } else if (Pred == ICmpInst::ICMP_SGT && *CmpVal == RangeLow) {
4473 IsEq = false;
4474 TestsNegative = (RangeLow == NegativeVal);
4475 } else if (Pred == ICmpInst::ICMP_SLT && *CmpVal == RangeLow + 1) {
4476 IsEq = true;
4477 TestsNegative = (RangeLow == NegativeVal);
4478 } else {
4479 return false;
4480 }
4481
4482 // For this fold we support four types of checks:
4483 //
4484 // 1. All lanes are negative - AllNeg
4485 // 2. All lanes are non-negative - AllNonNeg
4486 // 3. At least one negative lane - AnyNeg
4487 // 4. At least one non-negative lane - AnyNonNeg
4488 //
4489 // For each case, we can generate the following code:
4490 //
4491 // 1. AllNeg - reduce.and/umin(X) < 0
4492 // 2. AllNonNeg - reduce.or/umax(X) > -1
4493 // 3. AnyNeg - reduce.or/umax(X) < 0
4494 // 4. AnyNonNeg - reduce.and/umin(X) > -1
4495 //
4496 // The table below shows the aggregation of all supported cases
4497 // using these four cases.
4498 //
4499 // Reduction | == 0 | != 0 | == MAX | != MAX
4500 // ------------+-----------+-----------+-----------+-----------
4501 // or/umax | AllNonNeg | AnyNeg | AnyNeg | AllNonNeg
4502 // and/umin | AnyNonNeg | AllNeg | AllNeg | AnyNonNeg
4503 // add | AllNonNeg | AnyNeg | AllNeg | AnyNonNeg
4504 //
4505 // NOTE: MAX = 1 for or/and/umax/umin, and the vector size N for add
4506 //
4507 // For easier codegen and check inversion, we use the following encoding:
4508 //
4509 // 1. Bit-3 === requires or/umax (1) or and/umin (0) check
4510 // 2. Bit-2 === checks < 0 (1) or > -1 (0)
4511 // 3. Bit-1 === universal (1) or existential (0) check
4512 //
4513 // AnyNeg = 0b110: uses or/umax, checks negative, any-check
4514 // AllNonNeg = 0b101: uses or/umax, checks non-neg, all-check
4515 // AnyNonNeg = 0b000: uses and/umin, checks non-neg, any-check
4516 // AllNeg = 0b011: uses and/umin, checks negative, all-check
4517 //
4518 // XOR with 0b011 inverts the check (swaps all/any and neg/non-neg).
4519 //
4520 enum CheckKind : unsigned {
4521 AnyNonNeg = 0b000,
4522 AllNeg = 0b011,
4523 AllNonNeg = 0b101,
4524 AnyNeg = 0b110,
4525 };
4526 // Return true if we fold this check into or/umax and false for and/umin
4527 auto RequiresOr = [](CheckKind C) -> bool { return C & 0b100; };
4528 // Return true if we should check if result is negative and false otherwise
4529 auto IsNegativeCheck = [](CheckKind C) -> bool { return C & 0b010; };
4530 // Logically invert the check
4531 auto Invert = [](CheckKind C) { return CheckKind(C ^ 0b011); };
4532
4533 CheckKind Base;
4534 switch (OrigIID) {
4535 case Intrinsic::vector_reduce_or:
4536 case Intrinsic::vector_reduce_umax:
4537 Base = TestsNegative ? AnyNeg : AllNonNeg;
4538 break;
4539 case Intrinsic::vector_reduce_and:
4540 case Intrinsic::vector_reduce_umin:
4541 Base = TestsNegative ? AllNeg : AnyNonNeg;
4542 break;
4543 case Intrinsic::vector_reduce_add:
4544 Base = TestsNegative ? AllNeg : AllNonNeg;
4545 break;
4546 default:
4547 llvm_unreachable("Unexpected intrinsic");
4548 }
4549
4550 CheckKind Check = IsEq ? Base : Invert(Base);
4551
4552 auto PickCheaper = [&](Intrinsic::ID Arith, Intrinsic::ID MinMax) {
4553 InstructionCost ArithCost =
4555 VecTy, std::nullopt, CostKind);
4556 InstructionCost MinMaxCost =
4558 FastMathFlags(), CostKind);
4559 return ArithCost <= MinMaxCost ? std::make_pair(Arith, ArithCost)
4560 : std::make_pair(MinMax, MinMaxCost);
4561 };
4562
4563 // Choose output reduction based on encoding's MSB
4564 auto [NewIID, NewCost] = RequiresOr(Check)
4565 ? PickCheaper(Intrinsic::vector_reduce_or,
4566 Intrinsic::vector_reduce_umax)
4567 : PickCheaper(Intrinsic::vector_reduce_and,
4568 Intrinsic::vector_reduce_umin);
4569
4570 // Add cost of combining multiple sources with or/and
4571 if (NumSources > 1) {
4572 unsigned CombineOpc =
4573 RequiresOr(Check) ? Instruction::Or : Instruction::And;
4574 NewCost += TTI.getArithmeticInstrCost(CombineOpc, VecTy, CostKind) *
4575 (NumSources - 1);
4576 }
4577
4578 LLVM_DEBUG(dbgs() << "Found sign-bit reduction cmp: " << I << "\n OldCost: "
4579 << OldCost << " vs NewCost: " << NewCost << "\n");
4580
4581 if (NewCost > OldCost)
4582 return false;
4583
4584 // Generate the combined input and reduction
4585 Builder.SetInsertPoint(&I);
4586 Type *ScalarTy = VecTy->getScalarType();
4587
4588 Value *Input;
4589 if (NumSources == 1) {
4590 Input = Sources[0];
4591 } else {
4592 // Combine sources with or/and based on check type
4593 Input = RequiresOr(Check) ? Builder.CreateOr(Sources)
4594 : Builder.CreateAnd(Sources);
4595 }
4596
4597 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, NewIID, {Input});
4598 Value *NewCmp = IsNegativeCheck(Check) ? Builder.CreateIsNeg(NewReduce)
4599 : Builder.CreateIsNotNeg(NewReduce);
4600 replaceValue(I, *NewCmp);
4601 return true;
4602}
4603
4604/// Fold a zero test of reduce.or or reduce.umax into a boolean reduction.
4605///
4606/// Vectorization may produce IR that compares the result of a scalar reduction
4607/// with zero. Depending on the target, lowering a reduction and a scalar
4608/// comparison separately can cost more than reducing lane-wise comparison
4609/// results. This fold creates the latter form only when it is not costlier.
4610///
4611/// Before:
4612/// %r = call iT @llvm.vector.reduce.or.vNiT(<N x iT> %x)
4613/// %cmp = icmp ne iT %r, 0
4614///
4615/// After:
4616/// %lane.cmp = icmp ne <N x iT> %x, zeroinitializer
4617/// %cmp = call i1 @llvm.vector.reduce.or.vNi1(<N x i1> %lane.cmp)
4618///
4619/// `reduce.or` and `reduce.umax` are non-zero when at least one lane is
4620/// non-zero. Therefore, `icmp ne` uses the existential `reduce.or` test.
4621/// Conversely, `icmp eq` must check that every lane is zero, so it uses the
4622/// universal `reduce.and` test.
4623///
4624/// Before:
4625/// %r = call iT @llvm.vector.reduce.umax.vNiT(<N x iT> %x)
4626/// %cmp = icmp eq iT %r, 0
4627///
4628/// After:
4629/// %lane.cmp = icmp eq <N x iT> %x, zeroinitializer
4630/// %cmp = call i1 @llvm.vector.reduce.and.vNi1(<N x i1> %lane.cmp)
///
/// Returns true when the compare \p I has been replaced, and false when the
/// pattern does not match or the cost comparison rejects the replacement.
4631bool VectorCombine::foldReductionZeroTest(Instruction &I) {
4632 CmpPredicate Pred;
4633 Value *Op;
4634
  // Only equality tests against zero are handled. m_c_ICmp is the commutative
  // form of the matcher, so the zero may presumably sit on either side.
4635 if (!match(&I, m_c_ICmp(Pred, m_Value(Op), m_Zero())) ||
4636 !ICmpInst::isEquality(Pred))
4637 return false;
4638
  // The compared value must be an intrinsic call whose only use is this
  // compare.
4639 auto *II = dyn_cast<IntrinsicInst>(Op);
4640 if (!II || !II->hasOneUse())
4641 return false;
4642
4643 auto ReduceID = II->getIntrinsicID();
4644 if (ReduceID != Intrinsic::vector_reduce_or &&
4645 ReduceID != Intrinsic::vector_reduce_umax)
4646 return false;
4647
  // Restricted to fixed-width vectors with integer elements.
4648 Value *Vec = II->getArgOperand(0);
4649 auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
4650 if (!VecTy || !VecTy->getElementType()->isIntegerTy())
4651 return false;
4652
4653 // Map the scalar zero test to an any-lane or all-lane boolean reduction.
4654 Intrinsic::ID NewIID = (Pred == ICmpInst::ICMP_NE)
4655 ? Intrinsic::vector_reduce_or
4656 : Intrinsic::vector_reduce_and;
4657
4658 // This is not an unconditional canonicalization: compare the cost of the
4659 // original scalar reduction and compare with the vector compare and i1
4660 // reduction replacement for both reduce.or and reduce.umax.
  // NOTE(review): OldCost is read below but its definition is not visible in
  // this listing (embedded lines 4661-4662 are absent) - restore it from the
  // upstream file before relying on this block.
4663
  // New cost starts with the lane-wise vector compare producing CmpTy.
4664 auto *CmpTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
4665 InstructionCost NewCost =
4666 TTI.getCmpSelInstrCost(Instruction::ICmp, VecTy, CmpTy, Pred, CostKind);
  // NOTE(review): the opening of the statement that ends on the next line is
  // not visible (embedded line 4667 is absent); presumably it adds the cost of
  // the i1 reduction to NewCost - confirm.
4668 getArithmeticReductionInstruction(NewIID), CmpTy, std::nullopt, CostKind);
4669
4670 LLVM_DEBUG(dbgs() << "Found a reduction zero test: " << I << "\n OldCost: "
4671 << OldCost << " vs NewCost: " << NewCost << "\n");
4672
  // Reject when either cost is invalid or the replacement is strictly more
  // expensive; equal cost is accepted.
4673 if (!OldCost.isValid() || !NewCost.isValid() || NewCost > OldCost)
4674 return false;
4675
  // Emit the lane-wise compare against zero followed by the i1 reduction; the
  // reduction result takes the place of the original scalar compare.
4676 Builder.SetInsertPoint(&I);
4677 Value *NewCmp = Builder.CreateICmp(Pred, Vec, Constant::getNullValue(VecTy));
4678 Value *NewReduce = Builder.CreateIntrinsic(NewIID, {CmpTy}, {NewCmp});
4679 replaceValue(I, *NewReduce);
4680 return true;
4681}
4682
4683/// vector.reduce.OP f(X_i) == 0 -> vector.reduce.OP X_i == 0
4684///
4685/// We can prove it for cases when:
4686///
4687/// 1. OP X_i == 0 <=> \forall i \in [1, N] X_i == 0
4688/// 1'. OP X_i == 0 <=> \exists j \in [1, N] X_j == 0
4689/// 2. f(x) == 0 <=> x == 0
4690///
4691/// From 1 and 2 (or 1' and 2), we can infer that
4692///
4693/// OP f(X_i) == 0 <=> OP X_i == 0.
4694///
4695/// (1)
4696/// OP f(X_i) == 0 <=> \forall i \in [1, N] f(X_i) == 0
4697/// (2)
4698/// <=> \forall i \in [1, N] X_i == 0
4699/// (1)
4700/// <=> OP(X_i) == 0
4701///
4702/// For some of the OP's and f's, we need to have domain constraints on X
4703/// to ensure properties 1 (or 1') and 2.
///
/// Returns true when the compare \p I has been replaced by a compare of a
/// reduction over X, and false otherwise.
4704bool VectorCombine::foldICmpEqZeroVectorReduce(Instruction &I) {
4705 CmpPredicate Pred;
4706 Value *Op;
  // Only eq/ne against zero is handled. m_ICmp is the non-commutative matcher,
  // so the zero is matched as the second operand only.
4707 if (!match(&I, m_ICmp(Pred, m_Value(Op), m_Zero())) ||
4708 !ICmpInst::isEquality(Pred))
4709 return false;
4710
4711 auto *II = dyn_cast<IntrinsicInst>(Op);
4712 if (!II)
4713 return false;
4714
  // Reductions for which the argument in the header comment is attempted;
  // anything else is rejected.
4715 switch (II->getIntrinsicID()) {
4716 case Intrinsic::vector_reduce_add:
4717 case Intrinsic::vector_reduce_or:
4718 case Intrinsic::vector_reduce_umin:
4719 case Intrinsic::vector_reduce_umax:
4720 case Intrinsic::vector_reduce_smin:
4721 case Intrinsic::vector_reduce_smax:
4722 break;
4723 default:
4724 return false;
4725 }
4726
4727 Value *InnerOp = II->getArgOperand(0);
4728
4729 // TODO: fixed vector type might be too restrictive
4730 if (!II->hasOneUse() || !isa<FixedVectorType>(InnerOp->getType()))
4731 return false;
4732
4733 Value *X = nullptr;
4734
4735 // Check for zero-preserving operations where f(x) = 0 <=> x = 0
4736 //
4737 // 1. f(x) = shl nuw x, y for arbitrary y
4738 // 2. f(x) = mul nuw x, c for defined c != 0
4739 // 3. f(x) = zext x
4740 // 4. f(x) = sext x
4741 // 5. f(x) = neg x
4742 //
4743 if (!(match(InnerOp, m_NUWShl(m_Value(X), m_Value())) || // Case 1
4744 match(InnerOp, m_NUWMul(m_Value(X), m_NonZeroInt())) || // Case 2
4745 match(InnerOp, m_ZExt(m_Value(X))) || // Case 3
4746 match(InnerOp, m_SExt(m_Value(X))) || // Case 4
4747 match(InnerOp, m_Neg(m_Value(X))) // Case 5
4748 ))
4749 return false;
4750
  // Value-tracking queries below are made in the context of the compare I.
4751 SimplifyQuery S = SQ.getWithInstruction(&I);
4752 auto *XTy = cast<FixedVectorType>(X->getType());
4753
4754 // Check for domain constraints for all supported reductions.
4755 //
4756 // a. OR X_i - has property 1 for every X
4757 // b. UMAX X_i - has property 1 for every X
4758 // c. UMIN X_i - has property 1' for every X
4759 // d. SMAX X_i - has property 1 for X >= 0
4760 // e. SMIN X_i - has property 1' for X >= 0
4761 // f. ADD X_i - has property 1 for X >= 0 && ADD X_i doesn't sign wrap
4762 //
4763 // In order for the proof to work, we need 1 (or 1') to be true for both
4764 // OP f(X_i) and OP X_i and that's why below we check constraints twice.
4765 //
4766 // NOTE: ADD X_i holds property 1 for a mirror case as well, i.e. when
4767 // X <= 0 && ADD X_i doesn't sign wrap. However, due to the nature
4768 // of known bits, we can't reasonably hold knowledge of "either 0
4769 // or negative".
4770 switch (II->getIntrinsicID()) {
4771 case Intrinsic::vector_reduce_add: {
4772 // We need to check that both X_i and f(X_i) have enough leading
4773 // zeros to not overflow.
4774 KnownBits KnownX = computeKnownBits(X, S);
4775 KnownBits KnownFX = computeKnownBits(InnerOp, S);
4776 unsigned NumElems = XTy->getNumElements();
4777 // Adding N elements loses at most ceil(log2(N)) leading bits.
4778 unsigned LostBits = Log2_32_Ceil(NumElems);
4779 unsigned LeadingZerosX = KnownX.countMinLeadingZeros();
4780 unsigned LeadingZerosFX = KnownFX.countMinLeadingZeros();
4781 // Need at least one leading zero left after summation to ensure no overflow
4782 if (LeadingZerosX <= LostBits || LeadingZerosFX <= LostBits)
4783 return false;
4784
4785 // We are not checking whether X or f(X) are positive explicitly because
4786 // we implicitly checked for it when we checked if both cases have enough
4787 // leading zeros to not wrap addition.
4788 break;
4789 }
4790 case Intrinsic::vector_reduce_smin:
4791 case Intrinsic::vector_reduce_smax:
4792 // Check whether X >= 0 and f(X) >= 0
4793 if (!isKnownNonNegative(InnerOp, S) || !isKnownNonNegative(X, S))
4794 return false;
4795
4796 break;
4797 default:
  // or/umin/umax need no domain constraint (cases a-c above).
4798 break;
4799 };
4800
4801 LLVM_DEBUG(dbgs() << "Found a reduction to 0 comparison with removable op: "
4802 << *II << "\n");
4803
4804 // For zext/sext, check if the transform is profitable using cost model.
4805 // For other operations (shl, mul, neg), we're removing an instruction
4806 // while keeping the same reduction type, so it's always profitable.
4807 if (isa<ZExtInst>(InnerOp) || isa<SExtInst>(InnerOp)) {
4808 auto *FXTy = cast<FixedVectorType>(InnerOp->getType());
4809 Intrinsic::ID IID = II->getIntrinsicID();
4810
  // NOTE(review): ExtCost is read below but the statement defining it is only
  // partly visible (embedded lines 4811 and 4813 are absent); presumably it is
  // the TTI cost of the zext/sext itself - restore from the upstream file.
4812 cast<CastInst>(InnerOp)->getOpcode(), FXTy, XTy,
4814
  // Reduction cost over the extended type (old) versus over the narrower
  // source type (new), using the arithmetic or min/max query as appropriate.
4815 InstructionCost OldReduceCost, NewReduceCost;
4816 switch (IID) {
4817 case Intrinsic::vector_reduce_add:
4818 case Intrinsic::vector_reduce_or:
4819 OldReduceCost = TTI.getArithmeticReductionCost(
4820 getArithmeticReductionInstruction(IID), FXTy, std::nullopt, CostKind);
4821 NewReduceCost = TTI.getArithmeticReductionCost(
4822 getArithmeticReductionInstruction(IID), XTy, std::nullopt, CostKind);
4823 break;
4824 case Intrinsic::vector_reduce_umin:
4825 case Intrinsic::vector_reduce_umax:
4826 case Intrinsic::vector_reduce_smin:
4827 case Intrinsic::vector_reduce_smax:
4828 OldReduceCost = TTI.getMinMaxReductionCost(
4829 getMinMaxReductionIntrinsicOp(IID), FXTy, FastMathFlags(), CostKind);
4830 NewReduceCost = TTI.getMinMaxReductionCost(
4831 getMinMaxReductionIntrinsicOp(IID), XTy, FastMathFlags(), CostKind);
4832 break;
4833 default:
4834 llvm_unreachable("Unexpected reduction");
4835 }
4836
  // The extension's cost is only saved when this reduction is its sole user;
  // with other users it is still charged to the new form.
4837 InstructionCost OldCost = OldReduceCost + ExtCost;
4838 InstructionCost NewCost =
4839 NewReduceCost + (InnerOp->hasOneUse() ? 0 : ExtCost);
4840
4841 LLVM_DEBUG(dbgs() << "Found a removable extension before reduction: "
4842 << *InnerOp << "\n OldCost: " << OldCost
4843 << " vs NewCost: " << NewCost << "\n");
4844
4845 // We consider transformation to still be potentially beneficial even
4846 // when the costs are the same because we might remove a use from f(X)
4847 // and unlock other optimizations. Equal costs would just mean that we
4848 // didn't make it worse in the worst case.
4849 if (NewCost > OldCost)
4850 return false;
4851 }
4852
4853 // Since we support zext and sext as f, we might change the scalar type
4854 // of the intrinsic.
  // NOTE(review): unlike the neighbouring folds, no Builder.SetInsertPoint is
  // issued here; presumably the caller has already positioned the builder at
  // I - confirm.
4855 Type *Ty = XTy->getScalarType();
4856 Value *NewReduce = Builder.CreateIntrinsic(Ty, II->getIntrinsicID(), {X});
4857 Value *NewCmp =
4858 Builder.CreateICmp(Pred, NewReduce, ConstantInt::getNullValue(Ty));
4859 replaceValue(I, *NewCmp);
4860 return true;
4861}
4862
4863/// Fold comparisons of reduce.or/reduce.and with reduce.umax/reduce.umin
4864/// based on cost, preserving the comparison semantics.
4865///
4866/// We use three fundamental properties for each pair:
4867///
4868/// 1. or(X) == 0 <=> umax(X) == 0
4869/// 2. or(X) == 1 <=> umax(X) == 1
4870/// 3. sign(or(X)) == sign(umax(X))
4871///
4872/// 1. and(X) == -1 <=> umin(X) == -1
4873/// 2. and(X) == -2 <=> umin(X) == -2
4874/// 3. sign(and(X)) == sign(umin(X))
4875///
4876/// From these we can infer the following transformations:
4877/// a. or(X) ==/!= 0 <-> umax(X) ==/!= 0
4878/// b. or(X) s< 0 <-> umax(X) s< 0
4879/// c. or(X) s> -1 <-> umax(X) s> -1
4880/// d. or(X) s< 1 <-> umax(X) s< 1
4881/// e. or(X) ==/!= 1 <-> umax(X) ==/!= 1
4882/// f. or(X) s< 2 <-> umax(X) s< 2
4883/// g. and(X) ==/!= -1 <-> umin(X) ==/!= -1
4884/// h. and(X) s< 0 <-> umin(X) s< 0
4885/// i. and(X) s> -1 <-> umin(X) s> -1
4886/// j. and(X) s> -2 <-> umin(X) s> -2
4887/// k. and(X) ==/!= -2 <-> umin(X) ==/!= -2
4888/// l. and(X) s> -3 <-> umin(X) s> -3
4889///
/// Returns true when the compare \p I has been re-emitted on top of the
/// alternative reduction, and false otherwise.
4890bool VectorCombine::foldEquivalentReductionCmp(Instruction &I) {
4891 CmpPredicate Pred;
4892 Value *ReduceOp;
4893 const APInt *CmpVal;
  // The compare must have a constant (APInt) right-hand side.
4894 if (!match(&I, m_ICmp(Pred, m_Value(ReduceOp), m_APInt(CmpVal))))
4895 return false;
4896
  // The compared value must be an intrinsic call used only by this compare.
4897 auto *II = dyn_cast<IntrinsicInst>(ReduceOp);
4898 if (!II || !II->hasOneUse())
4899 return false;
4900
  // True when (Pred, CmpVal) is one of the or/umax cases a-f listed above.
4901 const auto IsValidOrUmaxCmp = [&]() {
4902 // or === umax for i1
4903 if (CmpVal->getBitWidth() == 1)
4904 return true;
4905
4906 // Cases a and e
4907 bool IsEquality =
4908 (CmpVal->isZero() || CmpVal->isOne()) && ICmpInst::isEquality(Pred);
4909 // Case c
4910 bool IsPositive = CmpVal->isAllOnes() && Pred == ICmpInst::ICMP_SGT;
4911 // Cases b, d, and f
4912 bool IsNegative = (CmpVal->isZero() || CmpVal->isOne() || *CmpVal == 2) &&
4913 Pred == ICmpInst::ICMP_SLT;
4914 return IsEquality || IsPositive || IsNegative;
4915 };
4916
  // True when (Pred, CmpVal) is one of the and/umin cases g-l listed above.
4917 const auto IsValidAndUminCmp = [&]() {
4918 // and === umin for i1
4919 if (CmpVal->getBitWidth() == 1)
4920 return true;
4921
4922 const auto LeadingOnes = CmpVal->countl_one();
4923
4924 // Cases g and k
4925 bool IsEquality =
4926 (CmpVal->isAllOnes() || LeadingOnes + 1 == CmpVal->getBitWidth()) &&
  // NOTE(review): the right-hand operand of the && above is not visible
  // (embedded line 4927 is absent); by symmetry with IsValidOrUmaxCmp it is
  // presumably ICmpInst::isEquality(Pred) - confirm against upstream.
4928 // Case h
4929 bool IsNegative = CmpVal->isZero() && Pred == ICmpInst::ICMP_SLT;
4930 // Cases i, j, and l
4931 bool IsPositive =
4932 // if the number has at least N - 2 leading ones
4933 // and the two LSBs are:
4934 // - 1 x 1 -> -1
4935 // - 1 x 0 -> -2
4936 // - 0 x 1 -> -3
4937 LeadingOnes + 2 >= CmpVal->getBitWidth() &&
4938 ((*CmpVal)[0] || (*CmpVal)[1]) && Pred == ICmpInst::ICMP_SGT;
4939 return IsEquality || IsNegative || IsPositive;
4940 };
4941
4942 Intrinsic::ID OriginalIID = II->getIntrinsicID();
4943 Intrinsic::ID AlternativeIID;
4944
4945 // Check if this is a valid comparison pattern and determine the alternate
4946 // reduction intrinsic.
4947 switch (OriginalIID) {
4948 case Intrinsic::vector_reduce_or:
4949 if (!IsValidOrUmaxCmp())
4950 return false;
4951 AlternativeIID = Intrinsic::vector_reduce_umax;
4952 break;
4953 case Intrinsic::vector_reduce_umax:
4954 if (!IsValidOrUmaxCmp())
4955 return false;
4956 AlternativeIID = Intrinsic::vector_reduce_or;
4957 break;
4958 case Intrinsic::vector_reduce_and:
4959 if (!IsValidAndUminCmp())
4960 return false;
4961 AlternativeIID = Intrinsic::vector_reduce_umin;
4962 break;
4963 case Intrinsic::vector_reduce_umin:
4964 if (!IsValidAndUminCmp())
4965 return false;
4966 AlternativeIID = Intrinsic::vector_reduce_and;
4967 break;
4968 default:
4969 return false;
4970 }
4971
4972 Value *X = II->getArgOperand(0);
4973 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
4974 if (!VecTy)
4975 return false;
4976
  // Cost of reducing VecTy with the given intrinsic. Anything whose mapped
  // opcode is not ICmp is priced as an arithmetic reduction.
4977 const auto GetReductionCost = [&](Intrinsic::ID IID) -> InstructionCost {
4978 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
4979 if (ReductionOpc != Instruction::ICmp)
4980 return TTI.getArithmeticReductionCost(ReductionOpc, VecTy, std::nullopt,
4981 CostKind);
  // NOTE(review): the start of the fallback return statement is not visible
  // (embedded line 4982 is absent); the trailing arguments suggest a min/max
  // reduction cost query - confirm against upstream.
4983 FastMathFlags(), CostKind);
4984 };
4985
4986 InstructionCost OrigCost = GetReductionCost(OriginalIID);
4987 InstructionCost AltCost = GetReductionCost(AlternativeIID);
4988
4989 LLVM_DEBUG(dbgs() << "Found equivalent reduction cmp: " << I
4990 << "\n OrigCost: " << OrigCost
4991 << " vs AltCost: " << AltCost << "\n");
4992
  // Switch only for a strict improvement; on a tie the original intrinsic is
  // kept (presumably so the fold cannot flip back and forth between the two
  // equivalent forms).
4993 if (AltCost >= OrigCost)
4994 return false;
4995
  // Re-emit the same predicate and constant on top of the cheaper reduction.
4996 Builder.SetInsertPoint(&I);
4997 Type *ScalarTy = VecTy->getScalarType();
4998 Value *NewReduce = Builder.CreateIntrinsic(ScalarTy, AlternativeIID, {X});
4999 Value *NewCmp =
5000 Builder.CreateICmp(Pred, NewReduce, ConstantInt::get(ScalarTy, *CmpVal));
5001
5002 replaceValue(I, *NewCmp);
5003 return true;
5004}
5005
5006/// Used by foldReduceAddCmpZero to check if we can prove that a value is
5007/// non-positive.
5008/// KnownBits cannot see sext <? x i1> as non-positive: each top bit equals a
5009/// single unknown input bit, which a per-bit lattice cannot track. The fold's
5010/// target shape is popcount-style sums of <N x i1> valid/invalid masks (e.g.
5011/// ray-intersection hits) tested for any-hit.
5012/// Previous attempts to approximate the known bits of such expressions were
5013/// using a fully recursive value tracking approach to infer a constant range
5014/// but ultimately turned to be too expensive in compile time.
5015static bool isKnownNonPositive(const Value *V, const SimplifyQuery &SQ,
5016 unsigned Depth = 0) {
5017 constexpr unsigned MaxLocalDepth = 2;
5018 if (Depth > MaxLocalDepth)
5019 return false;
5020
5021 auto NumSignBits = [&](const Value *X) {
5022 return ComputeNumSignBits(X, SQ.DL, SQ.AC, SQ.CxtI, SQ.DT);
5023 };
5024 if (NumSignBits(V) == V->getType()->getScalarSizeInBits())
5025 return true;
5026
5027 Value *A, *B;
5028 if (match(V, m_Add(m_Value(A), m_Value(B))))
5029 return NumSignBits(A) >= 2 && NumSignBits(B) >= 2 &&
5030 isKnownNonPositive(A, SQ, Depth + 1) &&
5031 isKnownNonPositive(B, SQ, Depth + 1);
5032
5033 return computeKnownBits(V, SQ).isNonPositive();
5034}
5035
5036/// Fold (icmp pred (reduce.add X), 0) to (icmp pred' (reduce.or X), 0) when X
5037/// has lanes known to all be non-negative or all non-positive, so that
5038/// sum == 0 iff every lane is 0. Falls back to reduce.umax if reduce.or is
5039/// more expensive on the target.
///
/// Returns true when the compare \p I has been replaced, and false when the
/// pattern, the sign/headroom requirements or the cost comparison reject it.
5040bool VectorCombine::foldReduceAddCmpZero(Instruction &I) {
5041 CmpPredicate Pred;
5042 Value *Vec;
5043 if (!match(&I, m_ICmp(Pred,
  // NOTE(review): the matcher that wraps m_Value(Vec) is not visible (embedded
  // line 5044 is absent); per the header comment it presumably matches a
  // reduce.add intrinsic call - restore from the upstream file.
5045 m_Value(Vec))),
5046 m_Zero())))
5047 return false;
5048
  // Fixed-width vectors with at least two lanes only.
5049 auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
5050 if (!VecTy || VecTy->getNumElements() < 2)
5051 return false;
5052
  // The non-positive query is only made when the non-negative one fails, so
  // at most one of the two flags is set.
5053 SimplifyQuery Q = SQ.getWithInstruction(&I);
5054 bool IsNonNegative = isKnownNonNegative(Vec, Q);
5055 bool IsNonPositive = !IsNonNegative && isKnownNonPositive(Vec, Q);
5056 if (!IsNonNegative && !IsNonPositive)
5057 return false;
5058
5059 // Summing NumElts lanes can consume up to log2(NumElts) sign bits. Require
5060 // strictly more headroom than that so the sum cannot wrap to zero.
5061 unsigned NumElts = VecTy->getNumElements();
5062 unsigned NumSignBits = ComputeNumSignBits(Vec, *DL, SQ.AC, &I, &DT);
5063 if (Log2_32(NumElts) >= NumSignBits)
5064 return false;
5065
  // Collapse the supported predicates to an equality or inequality test of
  // the new reduction against zero; which signed forms are actually allowed
  // is decided by the checks that follow the switch.
5066 ICmpInst::Predicate NewPred;
5067 switch (Pred) {
5068 case ICmpInst::ICMP_EQ:
5069 case ICmpInst::ICMP_ULE:
5070 case ICmpInst::ICMP_SLE:
5071 case ICmpInst::ICMP_SGE:
5072 NewPred = ICmpInst::ICMP_EQ;
5073 break;
5074 case ICmpInst::ICMP_NE:
5075 case ICmpInst::ICMP_UGT:
5076 case ICmpInst::ICMP_SGT:
5077 case ICmpInst::ICMP_SLT:
5078 NewPred = ICmpInst::ICMP_NE;
5079 break;
5080 default:
5081 return false;
5082 }
5083
5084 // SGT and SLE on a non-positive tree, and SLT and SGE on a non-negative
5085 // tree, are tautologies (always true or always false). Leave those to
5086 // InstCombine rather than mapping them here. Remaining signed inequalities
5087 // also need one extra sign bit so the sum cannot flip sign.
5088 if (!IsNonNegative &&
5089 (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE))
5090 return false;
5091 if (!IsNonPositive &&
5092 (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE))
5093 return false;
5094 if ((Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE ||
5095 Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) &&
5096 Log2_32(NumElts) >= NumSignBits - 1)
5097 return false;
5098
  // NOTE(review): OrigCost, OrCost and UmaxCost are read below, but the lines
  // that open their definitions are not visible (embedded lines 5099, 5101 and
  // 5103 are absent). The surviving argument lists suggest reduction-cost
  // queries for Add, Or and umax over VecTy respectively - restore from the
  // upstream file.
5100 Instruction::Add, VecTy, std::nullopt, CostKind);
5102 Instruction::Or, VecTy, std::nullopt, CostKind);
5104 Intrinsic::umax, VecTy, FastMathFlags(), CostKind);
  // At least one alternative needs a valid cost. reduce.or is chosen when it
  // is valid and umax is either invalid or not cheaper (ties go to or).
5105 if (!OrCost.isValid() && !UmaxCost.isValid())
5106 return false;
5107 bool UseOr = OrCost.isValid() && (!UmaxCost.isValid() || OrCost <= UmaxCost);
5108 InstructionCost AltCost = UseOr ? OrCost : UmaxCost;
  // Equal cost is accepted; only a strictly more expensive form is rejected.
5109 if (AltCost > OrigCost)
5110 return false;
5111
5112 Builder.SetInsertPoint(&I);
5113 Value *NewReduce = UseOr ? Builder.CreateOrReduce(Vec)
5114 : Builder.CreateIntrinsic(
5115 Intrinsic::vector_reduce_umax, {VecTy}, {Vec});
  // Queue the new reduction on the pass worklist, presumably so that other
  // folds get a chance to revisit it.
5116 Worklist.pushValue(NewReduce);
5117 Value *NewCmp = Builder.CreateICmp(
5118 NewPred, NewReduce, ConstantInt::getNullValue(VecTy->getScalarType()));
5119 replaceValue(I, *NewCmp);
5120 return true;
5121}
5122
5123/// Returns true if this ShuffleVectorInst eventually feeds into a
5124/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
5125/// chains of shuffles and binary operators (in any combination/order).
5126/// The search does not go deeper than the given Depth.
///
/// NOTE(review): no Depth value is consulted in the visible body; the bound
/// that is actually enforced there is MaxVisited (at most 32 distinct users
/// visited). Confirm against the signature and update the sentence above.
/// NOTE(review): the signature line of this function is not visible in this
/// listing (embedded line 5127 is absent) - restore from the upstream file.
5128 constexpr unsigned MaxVisited = 32;
  // NOTE(review): the declarations of Visited and WorkList are not visible
  // (embedded lines 5129-5130 are absent); from their use below, Visited is a
  // set of already-seen users and WorkList a stack of instructions to expand.
5131 bool FoundReduction = false;
5132
  // Walk the transitive users of SVI.
5133 WorkList.push_back(SVI);
5134 while (!WorkList.empty()) {
5135 Instruction *I = WorkList.pop_back_val();
5136 for (User *U : I->users()) {
  // NOTE(review): cast<> asserts on a type mismatch instead of returning
  // null, so the !UI test below cannot fire; only the Visited check matters.
5137 auto *UI = cast<Instruction>(U);
5138 if (!UI || !Visited.insert(UI).second)
5139 continue;
  // Give up (answer "no") once more than MaxVisited users have been seen.
5140 if (Visited.size() > MaxVisited)
5141 return false;
5142 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
5143 // More than one reduction reached
5144 if (FoundReduction)
5145 return false;
  // Only the integer reductions listed here count; reaching any other
  // intrinsic fails the whole query.
5146 switch (II->getIntrinsicID()) {
5147 case Intrinsic::vector_reduce_add:
5148 case Intrinsic::vector_reduce_mul:
5149 case Intrinsic::vector_reduce_and:
5150 case Intrinsic::vector_reduce_or:
5151 case Intrinsic::vector_reduce_xor:
5152 case Intrinsic::vector_reduce_smin:
5153 case Intrinsic::vector_reduce_smax:
5154 case Intrinsic::vector_reduce_umin:
5155 case Intrinsic::vector_reduce_umax:
5156 FoundReduction = true;
5157 continue;
5158 default:
5159 return false;
5160 }
5161 }
5162
  // NOTE(review): the condition guarding the return below is not visible
  // (embedded line 5163 is absent); per the header comment it presumably
  // rejects users that are neither shuffles nor binary operators - confirm.
5164 return false;
5165
5166 WorkList.emplace_back(UI);
5167 }
5168 }
  // True only if exactly one reduction was reached and nothing disqualifying
  // was encountered on the way.
5169 return FoundReduction;
5170}
5171
5172/// This method looks for groups of shuffles acting on binops, of the form:
5173/// %x = shuffle ...
5174/// %y = shuffle ...
5175/// %a = binop %x, %y
5176/// %b = binop %x, %y
5177/// shuffle %a, %b, selectmask
5178/// We may, especially if the shuffle is wider than legal, be able to convert
5179/// the shuffle to a form where only parts of a and b need to be computed. On
5180/// architectures with no obvious "select" shuffle, this can reduce the total
5181/// number of operations if the target reports them as cheaper.
5182bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
5183 auto *SVI = cast<ShuffleVectorInst>(&I);
5184 auto *VT = cast<FixedVectorType>(I.getType());
5185 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
5186 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
5187 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
5188 VT != Op0->getType())
5189 return false;
5190
5191 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
5192 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
5193 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
5194 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
5195 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
5196 auto checkSVNonOpUses = [&](Instruction *I) {
5197 if (!I || I->getOperand(0)->getType() != VT)
5198 return true;
5199 return any_of(I->users(), [&](User *U) {
5200 return U != Op0 && U != Op1 &&
5201 !(isa<ShuffleVectorInst>(U) &&
5202 (InputShuffles.contains(cast<Instruction>(U)) ||
5203 isInstructionTriviallyDead(cast<Instruction>(U))));
5204 });
5205 };
5206 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
5207 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
5208 return false;
5209
5210 // Collect all the uses that are shuffles that we can transform together. We
5211 // may not have a single shuffle, but a group that can all be transformed
5212 // together profitably.
5214 auto collectShuffles = [&](Instruction *I) {
5215 for (auto *U : I->users()) {
5216 auto *SV = dyn_cast<ShuffleVectorInst>(U);
5217 if (!SV || SV->getType() != VT)
5218 return false;
5219 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
5220 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
5221 return false;
5222 if (!llvm::is_contained(Shuffles, SV))
5223 Shuffles.push_back(SV);
5224 }
5225 return true;
5226 };
5227 if (!collectShuffles(Op0) || !collectShuffles(Op1))
5228 return false;
5229 // From a reduction, we need to be processing a single shuffle, otherwise the
5230 // other uses will not be lane-invariant.
5231 if (FromReduction && Shuffles.size() > 1)
5232 return false;
5233
5234 // Add any shuffle uses for the shuffles we have found, to include them in our
5235 // cost calculations.
5236 if (!FromReduction) {
5237 for (size_t Idx = 0, E = Shuffles.size(); Idx != E; ++Idx) {
5238 for (auto *U : Shuffles[Idx]->users()) {
5239 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
5240 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
5241 Shuffles.push_back(SSV);
5242 }
5243 }
5244 }
5245
5246 // For each of the output shuffles, we try to sort all the first vector
5247 // elements to the beginning, followed by the second array elements at the
5248 // end. If the binops are legalized to smaller vectors, this may reduce total
5249 // number of binops. We compute the ReconstructMask mask needed to convert
5250 // back to the original lane order.
5252 SmallVector<SmallVector<int>> OrigReconstructMasks;
5253 int MaxV1Elt = 0, MaxV2Elt = 0;
5254 unsigned NumElts = VT->getNumElements();
5255 for (ShuffleVectorInst *SVN : Shuffles) {
5256 SmallVector<int> Mask;
5257 SVN->getShuffleMask(Mask);
5258
5259 // Check the operands are the same as the original, or reversed (in which
5260 // case we need to commute the mask).
5261 Value *SVOp0 = SVN->getOperand(0);
5262 Value *SVOp1 = SVN->getOperand(1);
5263 if (isa<UndefValue>(SVOp1)) {
5264 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
5265 SVOp0 = SSV->getOperand(0);
5266 SVOp1 = SSV->getOperand(1);
5267 for (int &Elem : Mask) {
5268 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
5269 return false;
5270 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
5271 }
5272 }
5273 if (SVOp0 == Op1 && SVOp1 == Op0) {
5274 std::swap(SVOp0, SVOp1);
5276 }
5277 if (SVOp0 != Op0 || SVOp1 != Op1)
5278 return false;
5279
5280 // Calculate the reconstruction mask for this shuffle, as the mask needed to
5281 // take the packed values from Op0/Op1 and reconstructing to the original
5282 // order.
5283 SmallVector<int> ReconstructMask;
5284 for (unsigned I = 0; I < Mask.size(); I++) {
5285 if (Mask[I] < 0) {
5286 ReconstructMask.push_back(-1);
5287 } else if (Mask[I] < static_cast<int>(NumElts)) {
5288 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
5289 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
5290 return Mask[I] == A.first;
5291 });
5292 if (It != V1.end())
5293 ReconstructMask.push_back(It - V1.begin());
5294 else {
5295 ReconstructMask.push_back(V1.size());
5296 V1.emplace_back(Mask[I], V1.size());
5297 }
5298 } else {
5299 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
5300 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
5301 return Mask[I] - static_cast<int>(NumElts) == A.first;
5302 });
5303 if (It != V2.end())
5304 ReconstructMask.push_back(NumElts + It - V2.begin());
5305 else {
5306 ReconstructMask.push_back(NumElts + V2.size());
5307 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
5308 }
5309 }
5310 }
5311
5312 // For reductions, we know that the lane ordering out doesn't alter the
5313 // result. In-order can help simplify the shuffle away.
5314 if (FromReduction)
5315 sort(ReconstructMask);
5316 OrigReconstructMasks.push_back(std::move(ReconstructMask));
5317 }
5318
5319 // If the maximum elements used from V1 and V2 are not larger than the new
5320 // vectors, the vectors are already packed and performing the optimization
5321 // again will likely not help any further. This also prevents us from getting
5322 // stuck in a cycle in case the costs do not also rule it out.
5323 if (V1.empty() || V2.empty() ||
5324 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
5325 MaxV2Elt == static_cast<int>(V2.size()) - 1))
5326 return false;
5327
5328 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
5329 // shuffle of another shuffle, or not a shuffle (that is treated like an
5330 // identity shuffle).
5331 auto GetBaseMaskValue = [&](Instruction *I, int M) {
5332 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5333 if (!SV)
5334 return M;
5335 if (isa<UndefValue>(SV->getOperand(1)))
5336 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5337 if (InputShuffles.contains(SSV))
5338 return SSV->getMaskValue(SV->getMaskValue(M));
5339 return SV->getMaskValue(M);
5340 };
5341
5342 // Attempt to sort the inputs by ascending mask values to make simpler input
5343 // shuffles and push complex shuffles down to the uses. We sort on the first
5344 // of the two input shuffle orders, to try and get at least one input into a
5345 // nice order.
5346 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
5347 std::pair<int, int> Y) {
5348 int MXA = GetBaseMaskValue(A, X.first);
5349 int MYA = GetBaseMaskValue(A, Y.first);
5350 return MXA < MYA;
5351 };
5352 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
5353 return SortBase(SVI0A, A, B);
5354 });
5355 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
5356 return SortBase(SVI1A, A, B);
5357 });
5358 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
5359 // modified order of the input shuffles.
5360 SmallVector<SmallVector<int>> ReconstructMasks;
5361 for (const auto &Mask : OrigReconstructMasks) {
5362 SmallVector<int> ReconstructMask;
5363 for (int M : Mask) {
5364 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
5365 auto It = find_if(V, [M](auto A) { return A.second == M; });
5366 assert(It != V.end() && "Expected all entries in Mask");
5367 return std::distance(V.begin(), It);
5368 };
5369 if (M < 0)
5370 ReconstructMask.push_back(-1);
5371 else if (M < static_cast<int>(NumElts)) {
5372 ReconstructMask.push_back(FindIndex(V1, M));
5373 } else {
5374 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
5375 }
5376 }
5377 ReconstructMasks.push_back(std::move(ReconstructMask));
5378 }
5379
5380 // Calculate the masks needed for the new input shuffles, which get padded
5381 // with undef
5382 SmallVector<int> V1A, V1B, V2A, V2B;
5383 for (unsigned I = 0; I < V1.size(); I++) {
5384 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
5385 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
5386 }
5387 for (unsigned I = 0; I < V2.size(); I++) {
5388 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
5389 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
5390 }
5391 while (V1A.size() < NumElts) {
5394 }
5395 while (V2A.size() < NumElts) {
5398 }
5399
5400 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
5401 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5402 if (!SV)
5403 return C;
5404 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
5407 VT, VT, SV->getShuffleMask(), CostKind);
5408 };
5409 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5410 return C +
5412 };
5413
5414 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
5415 unsigned MaxVectorSize =
5417 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
5418 if (MaxElementsInVector == 0)
5419 return false;
5420 // When there are multiple shufflevector operations on the same input,
5421 // especially when the vector length is larger than the register size,
5422 // identical shuffle patterns may occur across different groups of elements.
5423 // To avoid overestimating the cost by counting these repeated shuffles more
5424 // than once, we only account for unique shuffle patterns. This adjustment
5425 // prevents inflated costs in the cost model for wide vectors split into
5426 // several register-sized groups.
5427 std::set<SmallVector<int, 4>> UniqueShuffles;
5428 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
5429 // Compute the cost for performing the shuffle over the full vector.
5430 auto ShuffleCost =
5432 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
5433 if (NumFullVectors < 2)
5434 return C + ShuffleCost;
5435 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
5436 unsigned NumUniqueGroups = 0;
5437 unsigned NumGroups = Mask.size() / MaxElementsInVector;
5438 // For each group of MaxElementsInVector contiguous elements,
5439 // collect their shuffle pattern and insert into the set of unique patterns.
5440 for (unsigned I = 0; I < NumFullVectors; ++I) {
5441 for (unsigned J = 0; J < MaxElementsInVector; ++J)
5442 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
5443 if (UniqueShuffles.insert(SubShuffle).second)
5444 NumUniqueGroups += 1;
5445 }
5446 return C + ShuffleCost * NumUniqueGroups / NumGroups;
5447 };
5448 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
5449 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5450 if (!SV)
5451 return C;
5452 SmallVector<int, 16> Mask;
5453 SV->getShuffleMask(Mask);
5454 return AddShuffleMaskAdjustedCost(C, Mask);
5455 };
5456 // Check that input consists of ShuffleVectors applied to the same input
5457 auto AllShufflesHaveSameOperands =
5458 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
5459 if (InputShuffles.size() < 2)
5460 return false;
5461 ShuffleVectorInst *FirstSV =
5462 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
5463 if (!FirstSV)
5464 return false;
5465
5466 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
5467 return std::all_of(
5468 std::next(InputShuffles.begin()), InputShuffles.end(),
5469 [&](Instruction *I) {
5470 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
5471 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
5472 });
5473 };
5474
5475 // Get the costs of the shuffles + binops before and after with the new
5476 // shuffle masks.
5477 InstructionCost CostBefore =
5478 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
5479 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
5480 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
5481 InstructionCost(0), AddShuffleCost);
5482 if (AllShufflesHaveSameOperands(InputShuffles)) {
5483 UniqueShuffles.clear();
5484 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5485 InstructionCost(0), AddShuffleAdjustedCost);
5486 } else {
5487 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
5488 InstructionCost(0), AddShuffleCost);
5489 }
5490
5491 // The new binops will be unused for lanes past the used shuffle lengths.
5492 // These types attempt to get the correct cost for that from the target.
5493 FixedVectorType *Op0SmallVT =
5494 FixedVectorType::get(VT->getScalarType(), V1.size());
5495 FixedVectorType *Op1SmallVT =
5496 FixedVectorType::get(VT->getScalarType(), V2.size());
5497 InstructionCost CostAfter =
5498 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
5499 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
5500 UniqueShuffles.clear();
5501 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
5502 InstructionCost(0), AddShuffleMaskAdjustedCost);
5503 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
5504 CostAfter +=
5505 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
5506 InstructionCost(0), AddShuffleMaskCost);
5507
5508 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
5509 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
5510 << " vs CostAfter: " << CostAfter << "\n");
5511 if (CostBefore < CostAfter ||
5512 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
5513 return false;
5514
5515 // The cost model has passed, create the new instructions.
5516 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
5517 auto *SV = dyn_cast<ShuffleVectorInst>(I);
5518 if (!SV)
5519 return I;
5520 if (isa<UndefValue>(SV->getOperand(1)))
5521 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
5522 if (InputShuffles.contains(SSV))
5523 return SSV->getOperand(Op);
5524 return SV->getOperand(Op);
5525 };
5526 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
5527 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
5528 GetShuffleOperand(SVI0A, 1), V1A);
5529 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
5530 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
5531 GetShuffleOperand(SVI0B, 1), V1B);
5532 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
5533 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
5534 GetShuffleOperand(SVI1A, 1), V2A);
5535 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
5536 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
5537 GetShuffleOperand(SVI1B, 1), V2B);
5538 Builder.SetInsertPoint(Op0);
5539 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
5540 NSV0A, NSV0B);
5541 if (auto *I = dyn_cast<Instruction>(NOp0))
5542 I->copyIRFlags(Op0, true);
5543 Builder.SetInsertPoint(Op1);
5544 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
5545 NSV1A, NSV1B);
5546 if (auto *I = dyn_cast<Instruction>(NOp1))
5547 I->copyIRFlags(Op1, true);
5548
5549 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
5550 Builder.SetInsertPoint(Shuffles[S]);
5551 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
5552 replaceValue(*Shuffles[S], *NSV, false);
5553 }
5554
5555 Worklist.pushValue(NSV0A);
5556 Worklist.pushValue(NSV0B);
5557 Worklist.pushValue(NSV1A);
5558 Worklist.pushValue(NSV1B);
5559 return true;
5560}
5561
5562/// Check if instruction depends on ZExt and this ZExt can be moved after the
5563/// instruction. Move ZExt if it is profitable. For example:
5564/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
5565/// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
5566/// Cost model calculations take into account if zext(x) has other users and
5567/// whether it can be propagated through them too.
///
/// Only \p I itself is rewritten here; the other users of the zext are merely
/// checked and costed.
///
/// \param I The bitwise-logic or lshr instruction to narrow.
/// \returns true if \p I was replaced by a zext of the narrow operation, false
/// if the pattern did not match, narrowing is not known to be safe, or the
/// cost model rejected it.
5568bool VectorCombine::shrinkType(Instruction &I) {
5569 Value *ZExted, *OtherOperand;
 // Bitwise logic is matched commutatively; for lshr the zext must be the
 // value being shifted (operand 0).
5570 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
5571 m_Value(OtherOperand))) &&
5572 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
5573 return false;
5574
 // The operand of I that is not OtherOperand, i.e. the zext itself.
5575 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
5576
 // NOTE(review): cast<> assumes both types are fixed vectors; presumably the
 // caller only dispatches fixed-vector instructions here - confirm.
5577 auto *BigTy = cast<FixedVectorType>(I.getType());
5578 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
 // BW is the element bit width of the narrow (pre-zext) type.
5579 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
5580
5581 if (I.getOpcode() == Instruction::LShr) {
5582 // Check that the shift amount is less than the number of bits in the
5583 // smaller type. Otherwise, the smaller lshr will return a poison value.
5584 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
5585 if (ShAmtKB.getMaxValue().uge(BW))
5586 return false;
5587 } else {
5588 // Check that the expression overall uses at most the same number of bits as
5589 // ZExted
5590 KnownBits KB = computeKnownBits(&I, *DL);
5591 if (KB.countMaxActiveBits() > BW)
5592 return false;
5593 }
5594
5595 // Calculate costs of leaving current IR as it is and moving ZExt operation
5596 // later, along with adding truncates if needed
 // NOTE(review): the line that opens this statement (and declares ZExtCost)
 // is missing from this listing; the visible arguments describe the cost of a
 // ZExt cast from SmallTy to BigTy.
5598 Instruction::ZExt, BigTy, SmallTy,
5599 TargetTransformInfo::CastContextHint::None, CostKind);
 // CurrentCost models the IR as it is: one zext plus every user at BigTy.
 // ShrinkCost models every user at SmallTy followed by its own zext.
5600 InstructionCost CurrentCost = ZExtCost;
5601 InstructionCost ShrinkCost = 0;
5602
5603 // Calculate total cost and check that we can propagate through all ZExt users
5604 for (User *U : ZExtOperand->users()) {
5605 auto *UI = cast<Instruction>(U);
 // I was already validated above, so only its costs are accumulated.
5606 if (UI == &I) {
5607 CurrentCost +=
5608 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5609 ShrinkCost +=
5610 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5611 ShrinkCost += ZExtCost;
5612 continue;
5613 }
5614
 // Any user that is not a binary operator blocks the transform.
5615 if (!Instruction::isBinaryOp(UI->getOpcode()))
5616 return false;
5617
5618 // Check if we can propagate ZExt through its other users
5619 KnownBits KB = computeKnownBits(UI, *DL);
5620 if (KB.countMaxActiveBits() > BW)
5621 return false;
5622
5623 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
5624 ShrinkCost +=
5625 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
5626 ShrinkCost += ZExtCost;
5627 }
5628
5629 // If the other instruction operand is not a constant, we'll need to
5630 // generate a truncate instruction. So we have to adjust cost
5631 if (!isa<Constant>(OtherOperand))
5632 ShrinkCost += TTI.getCastInstrCost(
5633 Instruction::Trunc, SmallTy, BigTy,
5634 TargetTransformInfo::CastContextHint::None, CostKind);
5635
5636 // If the cost of shrinking types and leaving the IR is the same, we'll lean
5637 // towards modifying the IR because shrinking opens opportunities for other
5638 // shrinking optimisations.
5639 if (ShrinkCost > CurrentCost)
5640 return false;
5641
 // Build the narrow operation in front of I, then zext its result back to
 // the original type.
5642 Builder.SetInsertPoint(&I);
5643 Value *Op0 = ZExted;
 // The trunc is requested unconditionally, although a cost was only charged
 // for it when OtherOperand is not a constant.
5644 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
5645 // Keep the order of operands the same
5646 if (I.getOperand(0) == OtherOperand)
5647 std::swap(Op0, Op1);
5648 Value *NewBinOp =
5649 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
 // NOTE(review): cast<Instruction> assumes the builder returned an
 // instruction rather than a folded constant - confirm.
5650 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
5651 cast<Instruction>(NewBinOp)->copyMetadata(I);
5652 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
5653 replaceValue(I, *NewZExtr);
5654 return true;
5655}
5656
5657/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
5658/// shuffle (DstVec, SrcVec, Mask)
5659bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
5660 Value *DstVec, *SrcVec;
5661 uint64_t ExtIdx, InsIdx;
5662 if (!match(&I,
5663 m_InsertElt(m_Value(DstVec),
5664 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
5665 m_ConstantInt(InsIdx))))
5666 return false;
5667
5668 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
5669 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
5670 // We can try combining vectors with different element sizes.
5671 if (!DstVecTy || !SrcVecTy ||
5672 SrcVecTy->getElementType() != DstVecTy->getElementType())
5673 return false;
5674
5675 unsigned NumDstElts = DstVecTy->getNumElements();
5676 unsigned NumSrcElts = SrcVecTy->getNumElements();
5677 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
5678 return false;
5679
5680 // Insertion into poison is a cheaper single operand shuffle.
5682 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
5683
5684 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
5685 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
5686 if (NeedDstSrcSwap) {
5688 Mask[InsIdx] = ExtIdx % NumDstElts;
5689 std::swap(DstVec, SrcVec);
5690 } else {
5692 std::iota(Mask.begin(), Mask.end(), 0);
5693 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
5694 }
5695
5696 // Cost
5697 auto *Ins = cast<InsertElementInst>(&I);
5698 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
5699 InstructionCost InsCost =
5700 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
5701 InstructionCost ExtCost =
5702 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
5703 InstructionCost OldCost = ExtCost + InsCost;
5704
5705 InstructionCost NewCost = 0;
5706 SmallVector<int> ExtToVecMask;
5707 if (!NeedExpOrNarrow) {
5708 // Ignore 'free' identity insertion shuffle.
5709 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
5710 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
5711 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
5712 nullptr, {DstVec, SrcVec});
5713 } else {
5714 // When creating a length-changing-vector, always try to keep the relevant
5715 // element in an equivalent position, so that bulk shuffles are more likely
5716 // to be useful.
5717 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
5718 ExtToVecMask[ExtIdx % NumDstElts] = ExtIdx;
5719 // Add cost for expanding or narrowing
5721 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
5722 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
5723 }
5724
5725 if (!Ext->hasOneUse())
5726 NewCost += ExtCost;
5727
5728 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
5729 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
5730 << "\n");
5731
5732 if (OldCost < NewCost)
5733 return false;
5734
5735 if (NeedExpOrNarrow) {
5736 if (!NeedDstSrcSwap)
5737 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
5738 else
5739 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
5740 }
5741
5742 // Canonicalize undef param to RHS to help further folds.
5743 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
5744 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
5745 std::swap(DstVec, SrcVec);
5746 }
5747
5748 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
5749 replaceValue(I, *Shuf);
5750
5751 return true;
5752}
5753
5754/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
5755/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
5756/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
5757/// before casting it back into `<vscale x 16 x i32>`.
5758bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
5759 const APInt *SplatVal0, *SplatVal1;
5761 m_APInt(SplatVal0), m_APInt(SplatVal1))))
5762 return false;
5763
5764 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
5765 << "\n");
5766
5767 auto *VTy =
5768 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
5769 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
5770 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
5771
5772 // Just in case the cost of interleave2 intrinsic and bitcast are both
5773 // invalid, in which case we want to bail out, we use <= rather
5774 // than < here. Even they both have valid and equal costs, it's probably
5775 // not a good idea to emit a high-cost constant splat.
5777 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
5779 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
5780 << *I.getType() << " is too high.\n");
5781 return false;
5782 }
5783
5784 APInt NewSplatVal = SplatVal1->zext(Width * 2);
5785 NewSplatVal <<= Width;
5786 NewSplatVal |= SplatVal0->zext(Width * 2);
5787 auto *NewSplat = ConstantVector::getSplat(
5788 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
5789
5790 IRBuilder<> Builder(&I);
5791 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
5792 return true;
5793}
5794
5795/// Given this sequence:
5796/// ```
5797/// %d = llvm.vector.deinterleave2 <vscale x 16 x i32> %v
5798/// %f0 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d, 0
5799/// %f1 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %d, 1
5800///
5801/// %low0 = and <vscale x 8 x i32> %f0, splat (i32 65535)
5802/// %low1 = shl <vscale x 8 x i32> %f1, splat (i32 16)
5803/// %merge0 = or disjoint <vscale x 8 x i32> %low0, %low1
5804///
5805/// %high0 = and <vscale x 8 x i32> %f1, splat (i32 -65536)
5806/// %high1 = lshr <vscale x 8 x i32> %f0, splat (i32 16)
5807/// %merge1 = or disjoint <vscale x 8 x i32> %high0, %high1
5808/// ```
5809/// It is actually just de-interleaving a 16-bit vector with double the
5810/// vector length. More generally speaking, it's de-interleaving on a vector
5811/// with half the element width as the original vector.
5812///
5813/// Therefore, we can turn it into:
5814/// ```
5815/// %narrow.v = bitcast <vscale x 16 x i32> %v to <vscale x 32 x i16>
5816/// %d = llvm.vector.deinterleave2 <vscale x 32 x i16> %narrow.v
5817/// %f0 = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %d, 0
5818/// %f1 = extractvalue { <vscale x 16 x i16>, <vscale x 16 x i16> } %d, 1
5819///
5820/// %merge0 = bitcast <vscale x 16 x i16> %f0 to <vscale x 8 x i32>
5821/// %merge1 = bitcast <vscale x 16 x i16> %f1 to <vscale x 8 x i32>
5822/// ```
///
/// \param I Candidate instruction; only a deinterleave2 whose two fields feed
/// exactly the merge pattern above is rewritten.
/// \returns true if both merge instructions were replaced by bitcasts of the
/// fields of a new, narrower deinterleave2.
5823bool VectorCombine::foldDeinterleaveIntrinsics(Instruction &I) {
5824 // This pattern involves bitcast that is not compatible with big endian.
5825 if (DL->isBigEndian())
5826 return false;
5827
5828 using namespace PatternMatch;
5829 Value *DeinterleavedVal;
5830 if (!match(&I, m_Deinterleave2(m_Value(DeinterleavedVal))))
5831 return false;
5832
 // Only integer elements with a power-of-two width of at least 2 bits can be
 // split into two equal halves.
5833 VectorType *VecTy = cast<VectorType>(DeinterleavedVal->getType());
5834 IntegerType *ElementTy = dyn_cast<IntegerType>(VecTy->getElementType());
5835 if (!ElementTy)
5836 return false;
5837 unsigned ElementWidth = ElementTy->getBitWidth();
5838 if (ElementWidth < 2 || !isPowerOf2_32(ElementWidth))
5839 return false;
5840 unsigned HalfElementWidth = ElementWidth / 2;
5841
 // The result must have exactly two uses: one extractvalue per field.
5842 if (!I.hasNUses(2))
5843 return false;
 // OrigFields[K] is the extractvalue of field K; null until it is found.
5844 std::array<ExtractValueInst *, 2> OrigFields{};
5845 for (User *Usr : I.users()) {
5846 auto *E = dyn_cast<ExtractValueInst>(Usr);
5847 // The deinterleave result can only be used by extractions.
5848 if (!E || E->getNumIndices() != 1)
5849 return false;
5850 unsigned Idx = *E->idx_begin();
5851 // A single field cannot be extracted more than once.
 // Each field must also have exactly two uses itself.
5852 if (Idx >= 2 || OrigFields[Idx] || !E->hasNUses(2))
5853 return false;
5854 OrigFields[Idx] = E;
5855 }
5856
5857 // Find the merge instruction (i.e. OR) first.
 // Each user of field 0 must have a single use; that use's user is taken as
 // a merge candidate and verified by MatchMerge below.
5858 SmallVector<Instruction *, 2> MergeInsts;
5859 for (auto *FieldUsr : OrigFields[0]->users()) {
5860 if (!FieldUsr->hasOneUse() || !isa<Instruction>(FieldUsr->user_back()))
5861 return false;
5862 MergeInsts.push_back(cast<Instruction>(FieldUsr->user_back()));
5863 }
5864 assert(MergeInsts.size() == 2);
5865
5866 // Pattern match bottom-up from the merge instructions.
 // MergeInsts[0] must be or(and(f0, LoMask), shl(f1, Half)) and
 // MergeInsts[1] must be or(and(f1, HiMask), lshr(f0, Half)), with the `or`
 // operands in either order.
5867 auto MatchMerge = [&](void) -> bool {
5868 APInt LoMask = APInt::getLowBitsSet(ElementWidth, HalfElementWidth);
5869 APInt HiMask = APInt::getHighBitsSet(ElementWidth, HalfElementWidth);
5870 return match(MergeInsts[0],
5871 m_c_Or(m_And(m_Specific(OrigFields[0]), m_SpecificInt(LoMask)),
5872 m_Shl(m_Specific(OrigFields[1]),
5873 m_SpecificInt(HalfElementWidth)))) &&
5874 match(MergeInsts[1],
5875 m_c_Or(m_And(m_Specific(OrigFields[1]), m_SpecificInt(HiMask)),
5876 m_LShr(m_Specific(OrigFields[0]),
5877 m_SpecificInt(HalfElementWidth))));
5878 };
 // The candidates may have been collected in the opposite order, so retry
 // once with them swapped. On success MergeInsts[K] corresponds to field K.
5879 if (!MatchMerge()) {
5880 std::swap(MergeInsts[0], MergeInsts[1]);
5881 if (!MatchMerge())
5882 return false;
5883 }
5884
5885 // Profitability check.
 // Old cost: one merge `or` plus the two instructions feeding it.
5886 InstructionCost OldCost =
5887 TTI.getInstructionCost(MergeInsts[0], CostKind) +
5888 TTI.getInstructionCost(cast<Instruction>(MergeInsts[0]->getOperand(0)),
5889 CostKind) +
5890 TTI.getInstructionCost(cast<Instruction>(MergeInsts[0]->getOperand(1)),
5891 CostKind);
5892 // There are two fields (assuming SHL has the same cost as LSHR).
5893 OldCost *= 2;
5894
 // NewFieldTy keeps VecTy's element count at half the element width;
 // NewVecTy has twice as many of those narrow elements.
5895 auto *NewFieldTy = VecTy->getWithNewBitWidth(HalfElementWidth);
5896 auto *NewVecTy =
5897 VectorType::getDoubleElementsVectorType(cast<VectorType>(NewFieldTy));
 // New cost: the bitcast of the input plus one bitcast per field.
 // NOTE(review): the line that closes the first getCastInstrCost call is
 // missing from this listing.
5898 InstructionCost NewCost =
5899 TTI.getCastInstrCost(Instruction::BitCast, VecTy, NewVecTy,
5901 TTI.getCastInstrCost(Instruction::BitCast, NewFieldTy,
5902 MergeInsts[0]->getType(), TTI::CastContextHint::None,
5903 CostKind) *
5904 2;
 // NOTE(review): this also bails out when the costs are equal, although the
 // debug message only says "higher".
5905 if (OldCost <= NewCost || !NewCost.isValid()) {
5906 LLVM_DEBUG(
5907 dbgs() << "VC: New deinterleave2 sequence cost (" << NewCost << ")"
5908 << " is higher than that of the old one (" << OldCost << ")\n");
5909 return false;
5910 }
5911
5912 // Do the replacement.
 // Everything is built in front of the original deinterleave2 using a
 // builder local to this function.
5913 IRBuilder<> Builder(&I);
5914 Value *NewVecCast = Builder.CreateBitCast(DeinterleavedVal, NewVecTy);
5915 Value *NewDeinterleave = Builder.CreateIntrinsic(
5916 Intrinsic::vector_deinterleave2, {NewVecTy}, {NewVecCast});
 // Field Idx of the new deinterleave, cast back to the merge's type,
 // replaces MergeInsts[Idx].
5917 for (auto [Idx, MergeInst] : enumerate(MergeInsts)) {
5918 Value *NewField = Builder.CreateExtractValue(NewDeinterleave, Idx);
5919 NewField = Builder.CreateBitCast(NewField, MergeInst->getType());
5920 replaceValue(*MergeInst, *NewField);
5921 }
5922
5923 return true;
5924}
5925
/// Fold a no-op vector cast of a vp.load whose mask is all-ones into a
/// vp.load that produces the cast's destination type directly. The explicit
/// vector length is multiplied by the ratio between the new and the original
/// element counts.
///
/// \param I Candidate cast instruction.
/// \returns true if \p I was replaced by a new vp.load, false if the pattern
/// did not match, the element counts are not supported, or the new load is
/// more expensive (or has an invalid cost).
5926bool VectorCombine::foldBitcastOfVPLoad(Instruction &I) {
5927 const DataLayout &DL = I.getDataLayout();
5928 auto *Cast = dyn_cast<CastInst>(&I);
5929 if (!Cast || !Cast->isNoopCast(DL) || !isa<VectorType>(Cast->getDestTy()))
5930 return false;
5931
5932 // Fold away bit casts of the loaded value by loading the desired type,
5933 // if the mask is all-ones.
5934 Value *EVL;
5935 auto *II = dyn_cast<VPIntrinsic>(I.getOperand(0));
 // NOTE(review): the line that opens this condition is missing from this
 // listing; the visible tail matches an all-ones mask operand and captures
 // the EVL operand.
5937 m_Value(), m_AllOnes(), m_Value(EVL)))))
5938 return false;
5939
5940 VectorType *OrigVecTy = cast<VectorType>(II->getType());
 // Use the load's own pointer alignment, or the ABI alignment of the
 // original vector type if none is specified.
5941 Align OrigAlign =
5942 DL.getValueOrABITypeAlignment(II->getPointerAlignment(), OrigVecTy);
5943 ElementCount OrigVecCnt = OrigVecTy->getElementCount();
5944 VectorType *NewVecTy = cast<VectorType>(Cast->getDestTy());
5945 ElementCount NewVecCnt = NewVecTy->getElementCount();
5946
5947 // Right now we only support cases where the NewVec is longer, because for
5948 // cases where it's shorter, we have to be sure that EVL can be exactly
5949 // divided, otherwise it might yield incorrect results or even page faults
5950 // (if we round-up during the division).
 // Both counts must agree on scalability, and the new count must be a known
 // scalar multiple of the original one.
5951 if (!(OrigVecCnt.isScalable() == NewVecCnt.isScalable() &&
5952 NewVecCnt.hasKnownScalarFactor(OrigVecCnt)))
5953 return false;
5954
 // Old cost: the original vp.load plus the cast. New cost: a vp.load of the
 // destination type at the same alignment.
 // NOTE(review): the end of the OldCost expression and the start of the
 // NewCost declaration are missing from this listing.
5955 InstructionCost OldCost =
5956 TTI.getMemIntrinsicInstrCost({Intrinsic::vp_load, OrigVecTy,
5957 II->getMemoryPointerParam(), false,
5958 OrigAlign},
5959 CostKind) +
5960 TTI.getCastInstrCost(Instruction::BitCast, Cast->getType(), OrigVecTy,
5963 {Intrinsic::vp_load, NewVecTy, II->getMemoryPointerParam(), false,
5964 OrigAlign},
5965 CostKind);
5966 LLVM_DEBUG(dbgs() << "foldBitcastOfVPLoad: OldCost=" << OldCost
5967 << " NewCost=" << NewCost << "\n");
 // Equal cost is accepted; only a strictly higher or invalid cost bails.
5968 if (NewCost > OldCost || !NewCost.isValid())
5969 return false;
5970
 // Scale the EVL by the element-count factor so the new load covers the
 // same number of bits as the original.
 // NOTE(review): getInt32 presumes EVL is an i32 - confirm.
5971 unsigned Factor = NewVecCnt.getKnownScalarFactor(OrigVecCnt);
5972 Value *NewEVL = Builder.CreateNUWMul(EVL, Builder.getInt32(Factor));
5973 Value *NewMask = Builder.CreateVectorSplat(NewVecCnt, Builder.getTrue());
5974 CallInst *NewVP = Builder.CreateIntrinsicWithoutFolding(
5975 NewVecTy, Intrinsic::vp_load,
5976 {II->getMemoryPointerParam(), NewMask, NewEVL});
5977 // Preserve the original alignment.
 // Parameter 0 is the pointer operand of the new vp.load.
5978 NewVP->addParamAttrs(
5979 0, AttrBuilder(II->getContext()).addAlignmentAttr(OrigAlign));
5980 replaceValue(*Cast, *NewVP);
5981 return true;
5982}
5983
5984/// Fold a bswap composed with a bitreverse (in either order) into a single
5985/// bitreverse performed on a vector of bytes:
5986/// bswap(bitreverse(x)) --> bitcast(bitreverse(bitcast(x)))
5987/// bitreverse(bswap(x)) --> bitcast(bitreverse(bitcast(x)))
///
/// \param I Candidate outer intrinsic call.
/// \returns true if \p I was replaced, false if the pattern did not match or
/// the byte-vector form is not strictly cheaper (or has an invalid cost).
5988bool VectorCombine::foldBitOrderReverseAndSwap(Instruction &I) {
5989 Value *X;
 // NOTE(review): the second alternative of this condition is on a line
 // missing from this listing; per the header comment it is presumably
 // bswap(bitreverse(x)).
5990 if (!match(&I, m_BitReverse(m_BSwap(m_Value(X)))) &&
5992 return false;
5993
 // Build an i8 vector type with one element per byte of Ty's store size,
 // keeping the scalable flag.
5994 Type *Ty = I.getType();
5995 Type *I8Ty = Builder.getInt8Ty();
5996 TypeSize ElementSize = DL->getTypeStoreSize(Ty);
5997 ElementCount NewVecCnt = ElementCount::get(ElementSize.getKnownMinValue(),
5998 ElementSize.isScalable());
5999 Type *NewVecTy = VectorType::get(I8Ty, NewVecCnt);
6000
6001 auto *II = cast<IntrinsicInst>(&I);
6002 auto *InnerII = cast<IntrinsicInst>(II->getArgOperand(0));
6003 // OldCost = cost of bitreverse/bswap + cost of bswap/bitreverse
 // NOTE(review): the statement that defines OldCost is missing from this
 // listing.
6006
6007 // NewCost = cost of bitcast to byte vector +
6008 // cost of bitreverse/bswap on byte vector +
6009 // cost of bitcast back to original type
6010 InstructionCost CastToVecCost = TTI.getCastInstrCost(
6011 Instruction::BitCast, NewVecTy, Ty, TTI::CastContextHint::None, CostKind);
6012 InstructionCost CastToOrigCost = TTI.getCastInstrCost(
6013 Instruction::BitCast, Ty, NewVecTy, TTI::CastContextHint::None, CostKind);
6014
6015 IntrinsicCostAttributes ICANew(Intrinsic::bitreverse, NewVecTy, {NewVecTy});
 // NOTE(review): the initializer of NewIntrinsicCost is missing from this
 // listing; presumably it is the cost of ICANew.
6016 InstructionCost NewIntrinsicCost =
6018 InstructionCost NewCost = CastToVecCost + NewIntrinsicCost + CastToOrigCost;
6019
 // An inner intrinsic with other users is charged to the new sequence too.
6020 if (!InnerII->hasOneUse())
6021 NewCost += TTI.getInstructionCost(InnerII, CostKind);
6022
6023 LLVM_DEBUG(dbgs() << "Found bitorder reverse and swap: " << I
6024 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
6025 << "\n");
 // Require a strict improvement: equal cost keeps the original IR.
6026 if (!NewCost.isValid() || NewCost >= OldCost)
6027 return false;
6028
6029 // Perform transform: bitcast(arg, <N x i8>), bitreverse, bitcast back
6030 Builder.SetInsertPoint(II);
6031 Value *CastToVec = Builder.CreateBitCast(X, NewVecTy);
6032 Value *NewCall =
6033 Builder.CreateUnaryIntrinsic(Intrinsic::bitreverse, CastToVec);
6034 Value *CastToOrig = Builder.CreateBitCast(NewCall, Ty);
6035 replaceValue(I, *CastToOrig);
6036 return true;
6037}
6038
6039// Attempt to shrink loads that are only used by shufflevector instructions.
6040bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
6041 auto *OldLoad = dyn_cast<LoadInst>(&I);
6042 if (!OldLoad || !OldLoad->isSimple())
6043 return false;
6044
6045 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
6046 if (!OldLoadTy)
6047 return false;
6048
6049 unsigned const OldNumElements = OldLoadTy->getNumElements();
6050
6051 // Search all uses of load. If all uses are shufflevector instructions, and
6052 // the second operands are all poison values, find the minimum and maximum
6053 // indices of the vector elements referenced by all shuffle masks.
6054 // Otherwise return `std::nullopt`.
6055 using IndexRange = std::pair<int, int>;
6056 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
6057 IndexRange OutputRange = IndexRange(OldNumElements, -1);
6058 for (llvm::Use &Use : I.uses()) {
6059 // Ensure all uses match the required pattern.
6060 User *Shuffle = Use.getUser();
6061 ArrayRef<int> Mask;
6062
6063 if (!match(Shuffle,
6064 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
6065 return std::nullopt;
6066
6067 // Ignore shufflevector instructions that have no uses.
6068 if (Shuffle->use_empty())
6069 continue;
6070
6071 // Find the min and max indices used by the shufflevector instruction.
6072 for (int Index : Mask) {
6073 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
6074 OutputRange.first = std::min(Index, OutputRange.first);
6075 OutputRange.second = std::max(Index, OutputRange.second);
6076 }
6077 }
6078 }
6079
6080 if (OutputRange.second < OutputRange.first)
6081 return std::nullopt;
6082
6083 return OutputRange;
6084 };
6085
6086 // Get the range of vector elements used by shufflevector instructions.
6087 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
6088 unsigned const NewNumElements = Indices->second + 1u;
6089
6090 // If the range of vector elements is smaller than the full load, attempt
6091 // to create a smaller load.
6092 if (NewNumElements < OldNumElements) {
6093 IRBuilder Builder(&I);
6094 Builder.SetCurrentDebugLocation(I.getDebugLoc());
6095
6096 // Calculate costs of old and new ops.
6097 Type *ElemTy = OldLoadTy->getElementType();
6098 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
6099 Value *PtrOp = OldLoad->getPointerOperand();
6100
6102 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
6103 OldLoad->getPointerAddressSpace(), CostKind);
6104 InstructionCost NewCost =
6105 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
6106 OldLoad->getPointerAddressSpace(), CostKind);
6107
6108 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
6110 unsigned const MaxIndex = NewNumElements * 2u;
6111
6112 for (llvm::Use &Use : I.uses()) {
6113 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
6114
6115 // Ignore shufflevector instructions that have no uses.
6116 if (Shuffle->use_empty())
6117 continue;
6118
6119 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
6120
6121 // Create entry for new use.
6122 NewUses.push_back({Shuffle, OldMask});
6123
6124 // Validate mask indices.
6125 for (int Index : OldMask) {
6126 if (Index >= static_cast<int>(MaxIndex))
6127 return false;
6128 }
6129
6130 // Update costs.
6131 OldCost +=
6133 OldLoadTy, OldMask, CostKind);
6134 NewCost +=
6136 NewLoadTy, OldMask, CostKind);
6137 }
6138
6139 LLVM_DEBUG(
6140 dbgs() << "Found a load used only by shufflevector instructions: "
6141 << I << "\n OldCost: " << OldCost
6142 << " vs NewCost: " << NewCost << "\n");
6143
6144 if (OldCost < NewCost || !NewCost.isValid())
6145 return false;
6146
6147 // Create new load of smaller vector.
6148 auto *NewLoad = cast<LoadInst>(
6149 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
6150 NewLoad->copyMetadata(I);
6151
6152 // Replace all uses.
6153 for (UseEntry &Use : NewUses) {
6154 ShuffleVectorInst *Shuffle = Use.first;
6155 std::vector<int> &NewMask = Use.second;
6156
6157 Builder.SetInsertPoint(Shuffle);
6158 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
6159 Value *NewShuffle = Builder.CreateShuffleVector(
6160 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
6161
6162 replaceValue(*Shuffle, *NewShuffle, false);
6163 }
6164
6165 return true;
6166 }
6167 }
6168 return false;
6169}
6170
6171// Attempt to narrow a phi of shufflevector instructions where the two incoming
6172// values have the same operands but different masks. If the two shuffle masks
6173// are offsets of one another we can use one branch to rotate the incoming
6174// vector and perform one larger shuffle after the phi.
6175bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
6176 auto *Phi = dyn_cast<PHINode>(&I);
6177 if (!Phi || Phi->getNumIncomingValues() != 2u)
6178 return false;
6179
6180 Value *Op = nullptr;
6181 ArrayRef<int> Mask0;
6182 ArrayRef<int> Mask1;
6183
6184 if (!match(Phi->getOperand(0u),
6185 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
6186 !match(Phi->getOperand(1u),
6187 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
6188 return false;
6189
6190 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
6191
6192 // Ensure result vectors are wider than the argument vector.
6193 auto *InputVT = cast<FixedVectorType>(Op->getType());
6194 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
6195 auto const InputNumElements = InputVT->getNumElements();
6196
6197 if (InputNumElements >= ResultVT->getNumElements())
6198 return false;
6199
6200 // Take the difference of the two shuffle masks at each index. Ignore poison
6201 // values at the same index in both masks.
6202 SmallVector<int, 16> NewMask;
6203 NewMask.reserve(Mask0.size());
6204
6205 for (auto [M0, M1] : zip(Mask0, Mask1)) {
6206 if (M0 >= 0 && M1 >= 0)
6207 NewMask.push_back(M0 - M1);
6208 else if (M0 == -1 && M1 == -1)
6209 continue;
6210 else
6211 return false;
6212 }
6213
6214 // Ensure all elements of the new mask are equal. If the difference between
6215 // the incoming mask elements is the same, the two must be constant offsets
6216 // of one another.
6217 if (NewMask.empty() || !all_equal(NewMask))
6218 return false;
6219
6220 // Create new mask using difference of the two incoming masks.
6221 int MaskOffset = NewMask[0u];
6222 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
6223 NewMask.clear();
6224
6225 for (unsigned I = 0u; I < InputNumElements; ++I) {
6226 NewMask.push_back(Index);
6227 Index = (Index + 1u) % InputNumElements;
6228 }
6229
6230 // Calculate costs for worst cases and compare.
6231 auto const Kind = TTI::SK_PermuteSingleSrc;
6232 auto OldCost =
6233 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
6234 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
6235 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
6236 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
6237
6238 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
6239 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
6240 << "\n");
6241
6242 if (NewCost > OldCost)
6243 return false;
6244
6245 // Create new shuffles and narrowed phi.
6246 auto Builder = IRBuilder(Shuf);
6247 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
6248 auto *PoisonVal = PoisonValue::get(InputVT);
6249 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
6250 Worklist.push(cast<Instruction>(NewShuf0));
6251
6252 Builder.SetInsertPoint(Phi);
6253 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
6254 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
6255 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
6256 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
6257
6258 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
6259 PoisonVal = PoisonValue::get(NewPhi->getType());
6260 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
6261
6262 replaceValue(*Phi, *NewShuf1);
6263 return true;
6264}
6265
6266/// This is the entry point for all transforms. Pass manager differences are
6267/// handled in the callers of this function.
6268bool VectorCombine::run() {
6270 return false;
6271
6272 // Don't attempt vectorization if the target does not support vectors.
6273 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
6274 return false;
6275
6276 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
6277
6278 auto FoldInst = [this](Instruction &I) {
6279 Builder.SetInsertPoint(&I);
6280 bool IsVectorType = isa<VectorType>(I.getType());
6281 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
6282 auto Opcode = I.getOpcode();
6283
6284 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
6285
6286 // These folds should be beneficial regardless of when this pass is run
6287 // in the optimization pipeline.
6288 // The type checking is for run-time efficiency. We can avoid wasting time
6289 // dispatching to folding functions if there's no chance of matching.
6290 if (IsFixedVectorType) {
6291 switch (Opcode) {
6292 case Instruction::InsertElement:
6293 if (vectorizeLoadInsert(I))
6294 return true;
6295 break;
6296 case Instruction::ShuffleVector:
6297 if (widenSubvectorLoad(I))
6298 return true;
6299 break;
6300 default:
6301 break;
6302 }
6303 }
6304
6305 // This transform works with scalable and fixed vectors
6306 // TODO: Identify and allow other scalable transforms
6307 if (IsVectorType) {
6308 if (scalarizeOpOrCmp(I))
6309 return true;
6310 if (scalarizeLoad(I))
6311 return true;
6312 if (scalarizeExtExtract(I))
6313 return true;
6314 if (scalarizeVPIntrinsic(I))
6315 return true;
6316 if (foldInterleaveIntrinsics(I))
6317 return true;
6318 if (foldBitcastOfVPLoad(I))
6319 return true;
6320 }
6321
6322 if (foldDeinterleaveIntrinsics(I))
6323 return true;
6324
6325 if (Opcode == Instruction::Store)
6326 if (foldSingleElementStore(I))
6327 return true;
6328
6329 // If this is an early pipeline invocation of this pass, we are done.
6330 if (TryEarlyFoldsOnly)
6331 return false;
6332
6333 if (Opcode == Instruction::Call)
6334 if (foldBitOrderReverseAndSwap(I))
6335 return true;
6336
6337 // Otherwise, try folds that improve codegen but may interfere with
6338 // early IR canonicalizations.
6339 // The type checking is for run-time efficiency. We can avoid wasting time
6340 // dispatching to folding functions if there's no chance of matching.
6341 if (IsFixedVectorType) {
6342 switch (Opcode) {
6343 case Instruction::InsertElement:
6344 if (foldInsExtFNeg(I))
6345 return true;
6346 if (foldInsExtBinop(I))
6347 return true;
6348 if (foldInsExtVectorToShuffle(I))
6349 return true;
6350 break;
6351 case Instruction::ShuffleVector:
6352 if (foldPermuteOfBinops(I))
6353 return true;
6354 if (foldShuffleOfBinops(I))
6355 return true;
6356 if (foldShuffleOfSelects(I))
6357 return true;
6358 if (foldShuffleOfCastops(I))
6359 return true;
6360 if (foldShuffleOfShuffles(I))
6361 return true;
6362 if (foldPermuteOfIntrinsic(I))
6363 return true;
6364 if (foldShufflesOfLengthChangingShuffles(I))
6365 return true;
6366 if (foldShuffleOfIntrinsics(I))
6367 return true;
6368 if (foldSelectShuffle(I))
6369 return true;
6370 if (foldShuffleToIdentity(I))
6371 return true;
6372 break;
6373 case Instruction::Load:
6374 if (shrinkLoadForShuffles(I))
6375 return true;
6376 break;
6377 case Instruction::BitCast:
6378 if (foldBitcastShuffle(I))
6379 return true;
6380 if (foldSelectsFromBitcast(I))
6381 return true;
6382 break;
6383 case Instruction::And:
6384 case Instruction::Or:
6385 case Instruction::Xor:
6386 if (foldBitOpOfCastops(I))
6387 return true;
6388 if (foldBitOpOfCastConstant(I))
6389 return true;
6390 break;
6391 case Instruction::PHI:
6392 if (shrinkPhiOfShuffles(I))
6393 return true;
6394 break;
6395 default:
6396 if (shrinkType(I))
6397 return true;
6398 break;
6399 }
6400 } else {
6401 switch (Opcode) {
6402 case Instruction::Call:
6403 if (foldShuffleFromReductions(I))
6404 return true;
6405 if (foldCastFromReductions(I))
6406 return true;
6407 break;
6408 case Instruction::ExtractElement:
6409 if (foldShuffleChainsToReduce(I))
6410 return true;
6411 break;
6412 case Instruction::ICmp:
6413 if (foldSignBitReductionCmp(I))
6414 return true;
6415 if (foldICmpEqZeroVectorReduce(I))
6416 return true;
6417 if (foldReductionZeroTest(I))
6418 return true;
6419 if (foldEquivalentReductionCmp(I))
6420 return true;
6421 if (foldReduceAddCmpZero(I))
6422 return true;
6423 [[fallthrough]];
6424 case Instruction::FCmp:
6425 if (foldExtractExtract(I))
6426 return true;
6427 break;
6428 case Instruction::Or:
6429 if (foldConcatOfBoolMasks(I))
6430 return true;
6431 [[fallthrough]];
6432 default:
6433 if (Instruction::isBinaryOp(Opcode)) {
6434 if (foldExtractExtract(I))
6435 return true;
6436 if (foldExtractedCmps(I))
6437 return true;
6438 if (foldBinopOfReductions(I))
6439 return true;
6440 }
6441 break;
6442 }
6443 }
6444 return false;
6445 };
6446
6447 bool MadeChange = false;
6448 for (BasicBlock &BB : F) {
6449 // Ignore unreachable basic blocks.
6450 if (!DT.isReachableFromEntry(&BB))
6451 continue;
6452 // Use early increment range so that we can erase instructions in loop.
6453 // make_early_inc_range is not applicable here, as the next iterator may
6454 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
6455 // We manually maintain the next instruction and update it when it is about
6456 // to be deleted.
6457 Instruction *I = &BB.front();
6458 while (I) {
6459 NextInst = I->getNextNode();
6460 if (!I->isDebugOrPseudoInst())
6461 MadeChange |= FoldInst(*I);
6462 I = NextInst;
6463 }
6464 }
6465
6466 NextInst = nullptr;
6467
6468 while (!Worklist.isEmpty()) {
6469 Instruction *I = Worklist.removeOne();
6470 if (!I)
6471 continue;
6472
6475 continue;
6476 }
6477
6478 MadeChange |= FoldInst(*I);
6479 }
6480
6481 return MadeChange;
6482}
6483
6486 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
6488 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
6489 AAResults &AA = FAM.getResult<AAManager>(F);
6490 const DataLayout *DL = &F.getDataLayout();
6493 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, CostKind, TryEarlyFoldsOnly);
6494 if (!Combiner.run())
6495 return PreservedAnalyses::all();
6498 return PA;
6499}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
This is the interface for LLVM's primary stateless and local alias analysis.
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< StatepointGC > D("statepoint-example", "an example strategy for statepoint")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static cl::opt< OutputCostKind > CostKind("cost-kind", cl::desc("Target cost kind"), cl::init(OutputCostKind::RecipThroughput), cl::values(clEnumValN(OutputCostKind::RecipThroughput, "throughput", "Reciprocal throughput"), clEnumValN(OutputCostKind::Latency, "latency", "Instruction latency"), clEnumValN(OutputCostKind::CodeSize, "code-size", "Code size"), clEnumValN(OutputCostKind::SizeAndLatency, "size-latency", "Code size and latency"), clEnumValN(OutputCostKind::All, "all", "Print all cost kinds")))
static cl::opt< IntrinsicCostStrategy > IntrinsicCost("intrinsic-cost-strategy", cl::desc("Costing strategy for intrinsic instructions"), cl::init(IntrinsicCostStrategy::InstructionCost), cl::values(clEnumValN(IntrinsicCostStrategy::InstructionCost, "instruction-cost", "Use TargetTransformInfo::getInstructionCost"), clEnumValN(IntrinsicCostStrategy::IntrinsicCost, "intrinsic-cost", "Use TargetTransformInfo::getIntrinsicInstrCost"), clEnumValN(IntrinsicCostStrategy::TypeBasedIntrinsicCost, "type-based-intrinsic-cost", "Calculate the intrinsic cost based only on argument types")))
This file defines the DenseMap class.
#define Check(C,...)
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
iv users
Definition IVUsers.cpp:48
const size_t AbstractManglingParser< Derived, Alloc >::NumOps
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
static void eraseInstruction(Instruction &I, ICFLoopSafetyInfo &SafetyInfo, MemorySSAUpdater &MSSAU)
Definition LICM.cpp:1457
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define T1
MachineInstr unsigned OpIdx
uint64_t IntrinsicInst * II
FunctionAnalysisManager FAM
if(PassOpts->AAPipeline)
const SmallVectorImpl< MachineOperand > & Cond
unsigned OpIndex
This file contains some templates that are useful if you are working with the STL at all.
This file defines the make_scope_exit function, which executes user-defined cleanup logic at scope ex...
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This pass exposes codegen information to IR-level passes.
static bool isFreeConcat(ArrayRef< InstLane > Item, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI)
Detect concat of multiple values into a vector.
static void analyzeCostOfVecReduction(const IntrinsicInst &II, TTI::TargetCostKind CostKind, const TargetTransformInfo &TTI, InstructionCost &CostBeforeReduction, InstructionCost &CostAfterReduction)
static SmallVector< InstLane > generateInstLaneVectorFromOperand(ArrayRef< InstLane > Item, int Op)
static Value * createShiftShuffle(Value *Vec, unsigned OldIndex, unsigned NewIndex, IRBuilderBase &Builder)
Create a shuffle that translates (shifts) 1 element from the input vector to a new element location.
static Value * generateNewInstTree(ArrayRef< InstLane > Item, Use *From, FixedVectorType *Ty, const DenseSet< std::pair< Value *, Use * > > &IdentityLeafs, const DenseSet< std::pair< Value *, Use * > > &SplatLeafs, const DenseSet< std::pair< Value *, Use * > > &ConcatLeafs, IRBuilderBase &Builder, const TargetTransformInfo *TTI)
std::pair< Value *, int > InstLane
static bool isKnownNonPositive(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Used by foldReduceAddCmpZero to check if we can prove that a value is non-positive.
static Align computeAlignmentAfterScalarization(Align VectorAlignment, Type *ScalarType, Value *Idx, const DataLayout &DL)
The memory operation on a vector of ScalarType had alignment of VectorAlignment.
static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI)
Returns true if this ShuffleVectorInst eventually feeds into a vector reduction intrinsic (e....
static cl::opt< bool > DisableVectorCombine("disable-vector-combine", cl::init(false), cl::Hidden, cl::desc("Disable all vector combine transforms"))
static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI)
static const unsigned InvalidIndex
static Value * translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex, IRBuilderBase &Builder)
Given an extract element instruction with constant index operand, shuffle the source vector (shift th...
static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx, const SimplifyQuery &SQ)
Check if it is legal to scalarize a memory access to VecTy at index Idx.
static cl::opt< unsigned > MaxInstrsToScan("vector-combine-max-scan-instrs", cl::init(30), cl::Hidden, cl::desc("Max number of instructions to scan for vector combining."))
static cl::opt< bool > DisableBinopExtractShuffle("disable-binop-extract-shuffle", cl::init(false), cl::Hidden, cl::desc("Disable binop extract to shuffle transforms"))
static InstLane lookThroughShuffles(Value *V, int Lane)
static bool isMemModifiedBetween(BasicBlock::iterator Begin, BasicBlock::iterator End, const MemoryLocation &Loc, AAResults &AA)
static constexpr int Concat[]
Value * RHS
Value * LHS
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
LLVM_ABI APInt zext(unsigned width) const
Zero extend to a new width.
Definition APInt.cpp:1055
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
unsigned countl_one() const
Count the number of leading one bits.
Definition APInt.h:1638
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
static APInt getZero(unsigned numBits)
Get the '0' value for the specified bit-width.
Definition APInt.h:201
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
Get the first element.
Definition ArrayRef.h:144
size_t size() const
Get the array size.
Definition ArrayRef.h:141
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM_ABI bool hasAttribute(Attribute::AttrKind Kind) const
Return true if the attribute exists in this set.
InstListType::iterator iterator
Instruction iterators...
Definition BasicBlock.h:170
BinaryOps getOpcode() const
Definition InstrTypes.h:409
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
iterator_range< User::op_iterator > args()
Iteration adapter for range-for loops.
void addParamAttrs(unsigned ArgNo, const AttrBuilder &B)
Adds attributes to the indicated argument.
static LLVM_ABI CastInst * Create(Instruction::CastOps, Value *S, Type *Ty, const Twine &Name="", InsertPosition InsertBefore=nullptr)
Provides a way to construct any of the CastInst subclasses using an opcode instead of the subclass's ...
static Type * makeCmpResultType(Type *opnd_type)
Create a result type for fcmp/icmp.
Predicate
This enumeration lists the possible predicates for CmpInst subclasses.
Definition InstrTypes.h:740
bool isFPPredicate() const
Definition InstrTypes.h:845
static LLVM_ABI std::optional< CmpPredicate > getMatching(CmpPredicate A, CmpPredicate B)
Compares two CmpPredicates taking samesign into account and returns the canonicalized CmpPredicate if...
Combiner implementation.
Definition Combiner.h:33
static LLVM_ABI Constant * getExtractElement(Constant *Vec, Constant *Idx, Type *OnlyIfReducedTy=nullptr)
static LLVM_ABI Constant * getBinOpIdentity(unsigned Opcode, Type *Ty, bool AllowRHSConstant=false, bool NSZ=false)
Return the identity constant for a binary opcode.
This is the shared class of boolean and integer constants.
Definition Constants.h:87
const APInt & getValue() const
Return the constant as an APInt value reference.
Definition Constants.h:159
This class represents a range of values.
LLVM_ABI ConstantRange urem(const ConstantRange &Other) const
Return a new range representing the possible values resulting from an unsigned remainder operation of...
LLVM_ABI ConstantRange binaryAnd(const ConstantRange &Other) const
Return a new range representing the possible values resulting from a binary-and of a value in this ra...
LLVM_ABI bool contains(const APInt &Val) const
Return true if the specified value is in the set.
static LLVM_ABI Constant * getSplat(ElementCount EC, Constant *Elt)
Return a ConstantVector with the specified constant in each element.
static LLVM_ABI Constant * get(ArrayRef< Constant * > V)
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
ValueT lookup(const_arg_type_t< KeyT > Val) const
Return the entry for the specified key, or a default constructed value if no such entry exists.
Definition DenseMap.h:252
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
bool empty() const
Definition DenseMap.h:173
iterator end()
Definition DenseMap.h:143
Implements a dense probed hash-table based set.
Definition DenseSet.h:289
Analysis pass which computes a DominatorTree.
Definition Dominators.h:270
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:151
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
static constexpr ElementCount get(ScalarTy MinVal, bool Scalable)
Definition TypeSize.h:315
This instruction extracts a single (scalar) element from a VectorType value.
Convenience struct for specifying and reasoning about fast-math flags.
Definition FMF.h:23
bool noSignedZeros() const
Definition FMF.h:67
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static FixedVectorType * getDoubleElementsVectorType(FixedVectorType *VTy)
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
Predicate getSignedPredicate() const
For example, EQ->EQ, SLE->SLE, UGT->SGT, etc.
bool isEquality() const
Return true if this predicate is either EQ or NE.
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
LLVM_ABI CallInst * CreateIntrinsicWithoutFolding(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={})
Create a call to intrinsic ID with Args, mangled using OverloadTypes.
Value * CreateNUWMul(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1469
Value * CreateInsertElement(Type *VecTy, Value *NewElt, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2617
Value * CreateExtractElement(Value *Vec, Value *Idx, const Twine &Name="")
Definition IRBuilder.h:2605
LoadInst * CreateAlignedLoad(Type *Ty, Value *Ptr, MaybeAlign Align, const char *Name)
Definition IRBuilder.h:1923
LLVM_ABI Value * CreateSelectFMF(Value *C, Value *True, Value *False, FMFSource FMFSource, const Twine &Name="", Instruction *MDFrom=nullptr)
LLVM_ABI Value * CreateVectorSplat(unsigned NumElts, Value *V, const Twine &Name="")
Return a vector value that contains.
Value * CreateExtractValue(Value *Agg, ArrayRef< unsigned > Idxs, const Twine &Name="")
Definition IRBuilder.h:2664
ConstantInt * getTrue()
Get the constant value for i1 true.
Definition IRBuilder.h:457
LLVM_ABI Value * CreateSelect(Value *C, Value *True, Value *False, const Twine &Name="", Instruction *MDFrom=nullptr)
Value * CreateFreeze(Value *V, const Twine &Name="")
Definition IRBuilder.h:2683
void SetCurrentDebugLocation(const DebugLoc &L)
Set location information used by debugging information.
Definition IRBuilder.h:221
Value * CreateLShr(Value *LHS, Value *RHS, const Twine &Name="", bool isExact=false)
Definition IRBuilder.h:1532
Value * CreateCast(Instruction::CastOps Op, Value *V, Type *DestTy, const Twine &Name="", MDNode *FPMathTag=nullptr, FMFSource FMFSource={})
Definition IRBuilder.h:2266
Value * CreateIsNotNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg > -1.
Definition IRBuilder.h:2707
Value * CreateInBoundsGEP(Type *Ty, Value *Ptr, ArrayRef< Value * > IdxList, const Twine &Name="")
Definition IRBuilder.h:2008
Value * CreatePointerBitCastOrAddrSpaceCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2291
ConstantInt * getInt64(uint64_t C)
Get a constant 64-bit value.
Definition IRBuilder.h:482
LLVM_ABI Value * CreateOrReduce(Value *Src)
Create a vector int OR reduction intrinsic of the source vector.
ConstantInt * getInt32(uint32_t C)
Get a constant 32-bit value.
Definition IRBuilder.h:477
Value * CreateCmp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:2498
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2529
InstTy * Insert(InstTy *I, const Twine &Name="") const
Insert and return the specified instruction.
Definition IRBuilder.h:146
Value * CreateIsNeg(Value *Arg, const Twine &Name="")
Return a boolean value testing if Arg < 0.
Definition IRBuilder.h:2702
Value * CreateBitCast(Value *V, Type *DestTy, const Twine &Name="")
Definition IRBuilder.h:2232
LoadInst * CreateLoad(Type *Ty, Value *Ptr, const char *Name)
Provided to resolve 'CreateLoad(Ty, Ptr, "...")' correctly, instead of converting the string to 'bool...
Definition IRBuilder.h:1906
Value * CreateShl(Value *LHS, Value *RHS, const Twine &Name="", bool HasNUW=false, bool HasNSW=false)
Definition IRBuilder.h:1511
LLVM_ABI Value * CreateNAryOp(unsigned Opc, ArrayRef< Value * > Ops, const Twine &Name="", MDNode *FPMathTag=nullptr)
Create either a UnaryOperator or BinaryOperator depending on Opc.
Value * CreateZExt(Value *V, Type *DestTy, const Twine &Name="", bool IsNonNeg=false)
Definition IRBuilder.h:2110
Value * CreateShuffleVector(Value *V1, Value *V2, Value *Mask, const Twine &Name="")
Definition IRBuilder.h:2639
Value * CreateAnd(Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:1570
LLVM_ABI Value * CreateIntrinsic(Intrinsic::ID ID, ArrayRef< Type * > OverloadTypes, ArrayRef< Value * > Args, FMFSource FMFSource={}, const Twine &Name="", ArrayRef< OperandBundleDef > OpBundles={}, function_ref< void(CallInst *)> SetFn=[](CallInst *) {})
Variant to create a possibly constant-folded intrinsic.
StoreInst * CreateStore(Value *Val, Value *Ptr, bool isVolatile=false)
Definition IRBuilder.h:1919
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2096
PointerType * getPtrTy(unsigned AddrSpace=0)
Fetch the type representing a pointer.
Definition IRBuilder.h:577
Value * CreateBinOp(Instruction::BinaryOps Opc, Value *LHS, Value *RHS, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1731
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:181
Value * CreateFNegFMF(Value *V, FMFSource FMFSource, const Twine &Name="", MDNode *FPMathTag=nullptr)
Definition IRBuilder.h:1844
Value * CreateICmp(CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name="")
Definition IRBuilder.h:2474
Value * CreateOr(Value *LHS, Value *RHS, const Twine &Name="", bool IsDisjoint=false)
Definition IRBuilder.h:1592
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:524
LLVM_ABI Value * CreateUnaryIntrinsic(Intrinsic::ID ID, Value *Op, FMFSource FMFSource={}, const Twine &Name="")
Create a call to intrinsic ID with 1 operand which is mangled on its type.
InstSimplifyFolder - Use InstructionSimplify to fold operations to existing values.
CostType getValue() const
This function is intended to be used as sparingly as possible, since the class provides the full rang...
void push(Instruction *I)
Push the instruction onto the worklist stack.
LLVM_ABI void setHasNoUnsignedWrap(bool b=true)
Set or clear the nuw flag on this instruction, which must be an operator which supports this flag.
LLVM_ABI void copyIRFlags(const Value *V, bool IncludeWrapFlags=true)
Convenience method to copy supported exact, fast-math, and (optionally) wrapping flags from V to this...
LLVM_ABI void setHasNoSignedWrap(bool b=true)
Set or clear the nsw flag on this instruction, which must be an operator which supports this flag.
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void andIRFlags(const Value *V)
Logical 'and' of any supported wrapping, exact, and fast-math flags of V and this instruction.
bool isBinaryOp() const
LLVM_ABI void setNonNeg(bool b=true)
Set or clear the nneg flag on this instruction, which must be a zext instruction.
LLVM_ABI bool comesBefore(const Instruction *Other) const
Given an instruction Other in the same basic block as this instruction, return true if this instructi...
LLVM_ABI FastMathFlags getFastMathFlags() const LLVM_READONLY
Convenience function for getting all the fast-math flags, which must be an operator which supports th...
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
unsigned getOpcode() const
Returns a member of one of the enums like Instruction::Add.
bool isIdempotent() const
Return true if the instruction is idempotent:
LLVM_ABI void copyMetadata(const Instruction &SrcInst, ArrayRef< unsigned > WL=ArrayRef< unsigned >())
Copy metadata from SrcInst to this instruction.
LLVM_ABI bool hasAllowReassoc() const LLVM_READONLY
Determine whether the allow-reassociation flag is set.
bool isIntDivRem() const
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:348
unsigned getBitWidth() const
Get the number of bits in this IntegerType.
A wrapper class for inspecting calls to intrinsic functions.
Intrinsic::ID getIntrinsicID() const
Return the intrinsic ID of this intrinsic.
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
void setAlignment(Align Align)
Type * getPointerOperandType() const
Align getAlign() const
Return the alignment of the access that is being performed.
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return a 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
const SDValue & getOperand(unsigned Num) const
bool contains(const_arg_type key) const
Check if the SetVector contains the given key.
Definition SetVector.h:252
bool empty() const
Determine if the SetVector is empty or not.
Definition SetVector.h:100
bool insert(const value_type &X)
Insert a new element into the SetVector.
Definition SetVector.h:151
This instruction constructs a fixed permutation of two input vectors.
int getMaskValue(unsigned Elt) const
Return the shuffle mask value of this instruction for the given element index.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
static void commuteShuffleMask(MutableArrayRef< int > Mask, unsigned InVecNumElts)
Change values in a shuffle permute mask assuming the two vector operands of length InVecNumElts have ...
size_type size() const
Definition SmallPtrSet.h:99
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
bool contains(ConstPtrType Ptr) const
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
void assign(size_type NumElts, ValueParamT Elt)
reference emplace_back(ArgTypes &&... Args)
void reserve(size_type N)
void append(ItTy in_start, ItTy in_end)
Add the specified range to the end of the SmallVector.
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
void setAlignment(Align Align)
Analysis pass providing the TargetTransformInfo.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
static LLVM_ABI CastContextHint getCastContextHint(const Instruction *I)
Calculates a CastContextHint from I.
@ None
The insert/extract is not used with a load/store.
LLVM_ABI InstructionCost getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, CmpInst::Predicate VecPred, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo Op1Info={OK_AnyValue, OP_None}, OperandValueInfo Op2Info={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI TypeSize getRegisterBitWidth(RegisterKind K) const
static LLVM_ABI OperandValueInfo commonOperandInfo(const Value *X, const Value *Y)
Collect common data between two OperandValueInfo inputs.
LLVM_ABI InstructionCost getMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, OperandValueInfo OpdInfo={OK_AnyValue, OP_None}, const Instruction *I=nullptr) const
LLVM_ABI bool allowVectorElementIndexingUsingGEP() const
Returns true if GEP should not be used to index into vectors for this target.
LLVM_ABI InstructionCost getShuffleCost(ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, ArrayRef< int > Mask={}, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, int Index=0, VectorType *SubTp=nullptr, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr) const
LLVM_ABI InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) const
LLVM_ABI InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, std::optional< FastMathFlags > FMF, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
Calculate the cost of vector reduction intrinsics.
LLVM_ABI InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, TTI::CastContextHint CCH, TTI::TargetCostKind CostKind=TTI::TCK_SizeAndLatency, const Instruction *I=nullptr) const
LLVM_ABI InstructionCost getVectorInstrCost(unsigned Opcode, Type *Val, TTI::TargetCostKind CostKind, unsigned Index=-1, const Value *Op0=nullptr, const Value *Op1=nullptr, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
LLVM_ABI unsigned getRegisterClassForType(bool Vector, Type *Ty=nullptr) const
LLVM_ABI InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF=FastMathFlags(), TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput) const
TargetCostKind
The kind of cost model.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_CodeSize
Instruction code size.
LLVM_ABI InstructionCost getArithmeticInstrCost(unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind=TTI::TCK_RecipThroughput, TTI::OperandValueInfo Opd1Info={TTI::OK_AnyValue, TTI::OP_None}, TTI::OperandValueInfo Opd2Info={TTI::OK_AnyValue, TTI::OP_None}, ArrayRef< const Value * > Args={}, const Instruction *CxtI=nullptr, const TargetLibraryInfo *TLibInfo=nullptr) const
This is an approximation of reciprocal throughput of a math/logic op.
LLVM_ABI InstructionCost getMemIntrinsicInstrCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getMinVectorRegisterBitWidth() const
LLVM_ABI InstructionCost getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE, const SCEV *Ptr, TTI::TargetCostKind CostKind) const
LLVM_ABI unsigned getNumberOfRegisters(unsigned ClassID) const
LLVM_ABI InstructionCost getInstructionCost(const User *U, ArrayRef< const Value * > Operands, TargetCostKind CostKind) const
Estimate the cost of a given IR user when lowered.
LLVM_ABI InstructionCost getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts, bool Insert, bool Extract, TTI::TargetCostKind CostKind, bool ForPoisonSrc=true, ArrayRef< Value * > VL={}, TTI::VectorInstrContext VIC=TTI::VectorInstrContext::None) const
Estimate the overhead of scalarizing an instruction.
ShuffleKind
The various kinds of shuffle patterns for vector queries.
@ SK_PermuteSingleSrc
Shuffle elements of single source vector with any shuffle mask.
@ SK_Broadcast
Broadcast element 0 to all other elements.
@ SK_PermuteTwoSrc
Merge elements from two source vectors into one with any shuffle mask.
@ SK_ExtractSubvector
ExtractSubvector Index indicates start offset.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
op_range operands()
Definition User.h:267
Value * getOperand(unsigned i) const
Definition User.h:207
static LLVM_ABI bool isVPBinOp(Intrinsic::ID ID)
std::optional< unsigned > getFunctionalIntrinsicID() const
std::optional< unsigned > getFunctionalOpcode() const
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
const Value * stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL, APInt &Offset) const
This is a wrapper around stripAndAccumulateConstantOffsets with the in-bounds requirement set to fals...
Definition Value.h:727
LLVM_ABI bool hasOneUser() const
Return true if there is exactly one user of this value.
Definition Value.cpp:163
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
iterator_range< user_iterator > users()
Definition Value.h:426
LLVM_ABI Align getPointerAlignment(const DataLayout &DL) const
Returns an alignment of the pointer value.
Definition Value.cpp:993
unsigned getValueID() const
Return an ID for the concrete type of this object.
Definition Value.h:543
LLVM_ABI bool hasNUses(unsigned N) const
Return true if this Value has exactly N uses.
Definition Value.cpp:147
bool use_empty() const
Definition Value.h:346
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:319
bool user_empty() const
Definition Value.h:389
LLVM_ABI PreservedAnalyses run(Function &F, FunctionAnalysisManager &)
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct a VectorType.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
size_type size() const
Definition DenseSet.h:87
constexpr bool hasKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns true if there exists a value X where RHS.multiplyCoefficientBy(X) will result in a value whos...
Definition TypeSize.h:269
constexpr ScalarTy getKnownScalarFactor(const FixedOrScalableQuantity &RHS) const
Returns a value X where RHS.multiplyCoefficientBy(X) will result in a value whose quantity matches ou...
Definition TypeSize.h:277
constexpr bool isScalable() const
Returns whether the quantity is scaled by a runtime quantity (vscale).
Definition TypeSize.h:168
constexpr ScalarTy getKnownMinValue() const
Returns the minimum value this quantity can represent.
Definition TypeSize.h:165
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
NodeTy * getNextNode()
Get the next node, or nullptr for the list tail.
Definition ilist_node.h:348
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
Abstract Attribute helper functions.
Definition Attributor.h:165
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
constexpr char Attrs[]
Key for Kernel::Metadata::mAttrs.
const APInt & smin(const APInt &A, const APInt &B)
Determine the smaller of two APInts considered to be signed.
Definition APInt.h:2277
const APInt & smax(const APInt &A, const APInt &B)
Determine the larger of two APInts considered to be signed.
Definition APInt.h:2282
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
LLVM_ABI AttributeSet getFnAttributes(LLVMContext &C, ID id)
Return the function attributes for an intrinsic.
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
match_combine_and< Ty... > m_CombineAnd(const Ty &...Ps)
Combine pattern matchers matching all of Ps patterns.
cst_pred_ty< is_all_ones > m_AllOnes()
Match an integer or vector with all bits set.
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
auto m_Cmp()
Matches any compare instruction and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
m_Intrinsic_Ty< Opnd0 >::Ty m_BitReverse(const Opnd0 &Op0)
BinaryOp_match< LHS, RHS, Instruction::URem > m_URem(const LHS &L, const RHS &R)
auto m_Poison()
Match an arbitrary poison constant.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
match_bind< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
DisjointOr_match< LHS, RHS > m_DisjointOr(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_right_shift_op > m_Shr(const LHS &L, const RHS &R)
Matches logical shift operations.
CmpClass_match< LHS, RHS, ICmpInst, true > m_c_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
Matches an ICmp with a predicate over LHS and RHS in either order.
TwoOps_match< Val_t, Idx_t, Instruction::ExtractElement > m_ExtractElt(const Val_t &Val, const Idx_t &Idx)
Matches ExtractElementInst.
IntrinsicID_match m_Intrinsic()
Match intrinsic calls like this: m_Intrinsic<Intrinsic::fabs>(m_Value(X))
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
auto m_BinOp()
Match an arbitrary binary operation and ignore it.
auto m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
auto m_Constant()
Match an arbitrary Constant and ignore it.
TwoOps_match< V1_t, V2_t, Instruction::ShuffleVector > m_Shuffle(const V1_t &v1, const V2_t &v2)
Matches ShuffleVectorInst independently of mask value.
cst_pred_ty< is_non_zero_int > m_NonZeroInt()
Match a non-zero integer or a vector with all non-zero elements.
OneOps_match< OpTy, Instruction::Load > m_Load(const OpTy &Op)
Matches LoadInst.
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
auto m_AnyIntrinsic()
Matches any intrinsic call and ignore it.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Mul, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWMul(const LHS &L, const RHS &R)
BinOpPred_match< LHS, RHS, is_bitwiselogic_op, true > m_c_BitwiseLogic(const LHS &L, const RHS &R)
Matches bitwise logic operations in either order.
CastOperator_match< OpTy, Instruction::BitCast > m_BitCast(const OpTy &Op)
Matches BitCast.
match_combine_or< CastInst_match< OpTy, SExtInst >, NNegZExt_match< OpTy > > m_SExtLike(const OpTy &Op)
Match either "sext" or "zext nneg".
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, ZExtInst >, CastInst_match< OpTy, SExtInst > > m_ZExtOrSExt(const OpTy &Op)
FNeg_match< OpTy > m_FNeg(const OpTy &X)
Match 'fneg X' as 'fsub -0.0, X'.
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
auto m_Undef()
Match an arbitrary undef constant.
m_Intrinsic_Ty< Opnd0 >::Ty m_BSwap(const Opnd0 &Op0)
CastInst_match< OpTy, SExtInst > m_SExt(const OpTy &Op)
Matches SExt.
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
ThreeOps_match< Val_t, Elt_t, Idx_t, Instruction::InsertElement > m_InsertElt(const Val_t &Val, const Elt_t &Elt, const Idx_t &Idx)
Matches InsertElementInst.
m_Intrinsic_Ty< Opnd >::Ty m_Deinterleave2(const Opnd &Op)
auto m_ConstantInt()
Match an arbitrary ConstantInt and ignore it.
@ Valid
The data is already valid.
initializer< Ty > init(const Ty &Val)
DXILDebugInfoMap run(Module &M)
@ User
could "use" a pointer
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
NodeAddr< UseNode * > Use
Definition RDFGraph.h:385
friend class Instruction
Iterator for Instructions in a `BasicBlock`.
Definition BasicBlock.h:73
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
unsigned Log2_32_Ceil(uint32_t Value)
Return the ceil log base 2 of the specified value, 32 if the value is zero.
Definition MathExtras.h:344
@ Offset
Definition DWP.cpp:573
detail::zippy< detail::zip_shortest, T, U, Args... > zip(T &&t, U &&u, Args &&...args)
zip iterator for two or more iterable types.
Definition STLExtras.h:830
void stable_sort(R &&Range)
Definition STLExtras.h:2116
UnaryFunction for_each(R &&Range, UnaryFunction F)
Provide wrappers to std::for_each which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1732
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicOp(Intrinsic::ID RdxID)
Returns the min/max intrinsic used when expanding a min/max reduction.
LLVM_ABI bool RecursivelyDeleteTriviallyDeadInstructions(Value *V, const TargetLibraryInfo *TLI=nullptr, MemorySSAUpdater *MSSAU=nullptr, std::function< void(Value *)> AboutToDeleteCallback=std::function< void(Value *)>())
If the specified value is a trivially dead instruction, delete it.
Definition Local.cpp:535
RelativeUniformCounterPtr Values
Definition InstrProf.h:91
LLVM_ABI SDValue peekThroughBitcasts(SDValue V)
Return the non-bitcasted source operand of V if it exists.
auto enumerate(FirstRange &&First, RestRanges &&...Rest)
Given two or more input ranges, returns a new range whose values are tuples (A, B,...
Definition STLExtras.h:2554
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI Value * simplifyUnOp(unsigned Opcode, Value *Op, const SimplifyQuery &Q)
Given operand for a UnaryOperator, fold the result or return null.
scope_exit(Callable) -> scope_exit< Callable >
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID)
Returns the arithmetic instruction opcode used when expanding a reduction.
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
LLVM_ABI Value * simplifyCall(CallBase *Call, Value *Callee, ArrayRef< Value * > Args, const SimplifyQuery &Q)
Given a callsite, callee, and arguments, fold the result or return null.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:633
LLVM_ABI bool mustSuppressSpeculation(const LoadInst &LI)
Return true if speculation of the given load must be suppressed to avoid ordering or interfering with...
Definition Loads.cpp:445
LLVM_ABI bool widenShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Try to transform a shuffle mask by replacing elements with the scaled index for an equivalent mask of...
LLVM_ABI bool isSafeToSpeculativelyExecute(const Instruction *I, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
Return true if the instruction does not have any effects besides calculating the result and does not ...
LLVM_ABI Value * getSplatValue(const Value *V)
Get splat value if the input is a splat vector or return nullptr.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
unsigned M1(unsigned Val)
Definition VE.h:377
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
LLVM_ABI bool isInstructionTriviallyDead(Instruction *I, const TargetLibraryInfo *TLI=nullptr)
Return true if the result produced by the instruction is not used, and the instruction will return.
Definition Local.cpp:403
LLVM_ABI bool isSplatValue(const Value *V, int Index=-1, unsigned Depth=0)
Return true if each element of the vector value V is poisoned or equal to every other non-poisoned el...
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:407
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI void computeKnownBits(const Value *V, KnownBits &Known, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Determine which bits of V are known to be either zero or one and return them in the KnownZero/KnownOn...
LLVM_ABI bool programUndefinedIfPoison(const Instruction *Inst)
LLVM_ABI bool isSafeToLoadUnconditionally(Value *V, Align Alignment, const APInt &Size, const DataLayout &DL, Instruction *ScanFrom, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr)
Return true if we know that executing a load from this value cannot trap.
Definition Loads.cpp:449
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI void propagateIRFlags(Value *I, ArrayRef< Value * > VL, Value *OpValue=nullptr, bool IncludeWrapFlags=true)
Get the intersection (logical and) of all of the potential IR flags of each scalar operation (VL) tha...
LLVM_ABI bool isKnownNonZero(const Value *V, const SimplifyQuery &Q, unsigned Depth=0)
Return true if the given value is known to be non-zero when defined.
MutableArrayRef(T &OneElt) -> MutableArrayRef< T >
constexpr int PoisonMaskElem
LLVM_ABI bool isSafeToSpeculativelyExecuteWithOpcode(unsigned Opcode, const Instruction *Inst, const Instruction *CtxI=nullptr, AssumptionCache *AC=nullptr, const DominatorTree *DT=nullptr, const TargetLibraryInfo *TLI=nullptr, bool UseVariableInfo=true, bool IgnoreUBImplyingAttrs=true)
This returns the same result as isSafeToSpeculativelyExecute if Opcode is the actual opcode of Inst.
@ Other
Any other memory.
Definition ModRef.h:68
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
LLVM_ABI Value * simplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a BinaryOperator, fold the result or return null.
LLVM_ABI void narrowShuffleMaskElts(int Scale, ArrayRef< int > Mask, SmallVectorImpl< int > &ScaledMask)
Replace each shuffle mask index with the scaled sequential indices for an equivalent mask of narrowed...
LLVM_ABI Intrinsic::ID getReductionForBinop(Instruction::BinaryOps Opc)
Returns the reduction intrinsic id corresponding to the binary operation.
@ And
Bitwise or logical AND of integers.
LLVM_ABI bool isVectorIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx, const TargetTransformInfo *TTI)
Identifies if the vector form of the intrinsic has a scalar operand.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145
DWARFExpression::Operation Op
unsigned M0(unsigned Val)
Definition VE.h:376
LLVM_ABI unsigned ComputeNumSignBits(const Value *Op, const DataLayout &DL, AssumptionCache *AC=nullptr, const Instruction *CxtI=nullptr, const DominatorTree *DT=nullptr, bool UseInstrInfo=true, unsigned Depth=0)
Return the number of times the sign bit of the register is replicated into the other bits.
constexpr unsigned BitWidth
LLVM_ABI bool isGuaranteedToTransferExecutionToSuccessor(const Instruction *I)
Return true if this function can prove that the instruction I will always transfer execution to one o...
LLVM_ABI Constant * getLosslessInvCast(Constant *C, Type *InvCastTo, unsigned CastOp, const DataLayout &DL, PreservedCastFlags *Flags=nullptr)
Try to cast C to InvC losslessly, satisfying CastOp(InvC) equals C, or CastOp(InvC) is a refined valu...
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
bool is_contained(R &&Range, const E &Element)
Returns true if Element is found in Range.
Definition STLExtras.h:1947
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
bool all_equal(std::initializer_list< T > Values)
Returns true if all Values in the initializer lists are equal or the list is empty.
Definition STLExtras.h:2166
LLVM_ABI Value * simplifyCmpInst(CmpPredicate Predicate, Value *LHS, Value *RHS, const SimplifyQuery &Q)
Given operands for a CmpInst, fold the result or return null.
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
LLVM_ABI bool isKnownNonNegative(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Returns true if the give value is known to be non-negative.
LLVM_ABI bool isTriviallyVectorizable(Intrinsic::ID ID)
Identify if the intrinsic is trivially vectorizable.
LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID)
Returns the llvm.vector.reduce min/max intrinsic that corresponds to the intrinsic op.
LLVM_ABI ConstantRange computeConstantRange(const Value *V, bool ForSigned, const SimplifyQuery &SQ, unsigned Depth=0)
Determine the possible constant range of an integer or vector of integer value.
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
LLVM_ABI AAMDNodes adjustForAccess(unsigned AccessSize)
Create a new AAMDNode for accessing AccessSize bytes of this AAMDNode.
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
unsigned countMaxActiveBits() const
Returns the maximum number of bits needed to represent all possible unsigned values with these known ...
Definition KnownBits.h:310
unsigned countMinLeadingZeros() const
Returns the minimum number of leading zero bits.
Definition KnownBits.h:262
APInt getMaxValue() const
Return the maximal unsigned value possible given these KnownBits.
Definition KnownBits.h:146
const DataLayout & DL
const Instruction * CxtI
const DominatorTree * DT
SimplifyQuery getWithInstruction(const Instruction *I) const
AssumptionCache * AC