1//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This pass optimizes scalar/vector interactions using target cost models. The
10// transforms implemented here may not fit in traditional loop-based or SLP
11// vectorization passes.
12//
13//===----------------------------------------------------------------------===//
14
15#include "llvm/Transforms/Vectorize/VectorCombine.h"
16#include "llvm/ADT/DenseMap.h"
17#include "llvm/ADT/STLExtras.h"
18#include "llvm/ADT/ScopeExit.h"
20#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstSimplifyFolder.h"
25#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
30#include "llvm/IR/Dominators.h"
31#include "llvm/IR/Function.h"
32#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
38#include <numeric>
39#include <optional>
40#include <queue>
41#include <set>
42
43#define DEBUG_TYPE "vector-combine"
44#include "llvm/Transforms/Utils/InstructionWorklist.h"
45
46using namespace llvm;
47using namespace llvm::PatternMatch;
48
49STATISTIC(NumVecLoad, "Number of vector loads formed");
50STATISTIC(NumVecCmp, "Number of vector compares formed");
51STATISTIC(NumVecBO, "Number of vector binops formed");
52STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
53STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
54STATISTIC(NumScalarOps, "Number of scalar unary + binary ops formed");
55STATISTIC(NumScalarCmp, "Number of scalar compares formed");
56STATISTIC(NumScalarIntrinsic, "Number of scalar intrinsic calls formed");
57
59 "disable-vector-combine", cl::init(false), cl::Hidden,
60 cl::desc("Disable all vector combine transforms"));
61
63 "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
64 cl::desc("Disable binop extract to shuffle transforms"));
65
67 "vector-combine-max-scan-instrs", cl::init(30), cl::Hidden,
68 cl::desc("Max number of instructions to scan for vector combining."));
69
70static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
71
72namespace {
73class VectorCombine {
74public:
75 VectorCombine(Function &F, const TargetTransformInfo &TTI,
76 const DominatorTree &DT, AAResults &AA, AssumptionCache &AC,
77 const DataLayout *DL, TTI::TargetCostKind CostKind,
78 bool TryEarlyFoldsOnly)
79 : F(F), Builder(F.getContext(), InstSimplifyFolder(*DL)), TTI(TTI),
80 DT(DT), AA(AA), AC(AC), DL(DL), CostKind(CostKind), SQ(*DL),
81 TryEarlyFoldsOnly(TryEarlyFoldsOnly) {}
82
83 bool run();
84
85private:
86 Function &F;
87 IRBuilder<InstSimplifyFolder> Builder;
88 const TargetTransformInfo &TTI;
89 const DominatorTree &DT;
90 AAResults &AA;
91 AssumptionCache &AC;
92 const DataLayout *DL;
93 TTI::TargetCostKind CostKind;
94 const SimplifyQuery SQ;
95
96 /// If true, only perform beneficial early IR transforms. Do not introduce new
97 /// vector operations.
98 bool TryEarlyFoldsOnly;
99
100 InstructionWorklist Worklist;
101
102 /// Next instruction to iterate. It will be updated when it is erased by
103 /// RecursivelyDeleteTriviallyDeadInstructions.
104 Instruction *NextInst;
105
106 // TODO: Direct calls from the top-level "run" loop use a plain "Instruction"
107 // parameter. That should be updated to specific sub-classes because the
108 // run loop was changed to dispatch on opcode.
109 bool vectorizeLoadInsert(Instruction &I);
110 bool widenSubvectorLoad(Instruction &I);
111 ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
112 ExtractElementInst *Ext1,
113 unsigned PreferredExtractIndex) const;
114 bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
115 const Instruction &I,
116 ExtractElementInst *&ConvertToShuffle,
117 unsigned PreferredExtractIndex);
118 Value *foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
119 Value *foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex, Instruction &I);
120 bool foldExtractExtract(Instruction &I);
121 bool foldInsExtFNeg(Instruction &I);
122 bool foldInsExtBinop(Instruction &I);
123 bool foldInsExtVectorToShuffle(Instruction &I);
124 bool foldBitOpOfCastops(Instruction &I);
125 bool foldBitOpOfCastConstant(Instruction &I);
126 bool foldBitcastShuffle(Instruction &I);
127 bool scalarizeOpOrCmp(Instruction &I);
128 bool scalarizeVPIntrinsic(Instruction &I);
129 bool foldExtractedCmps(Instruction &I);
130 bool foldBinopOfReductions(Instruction &I);
131 bool foldSingleElementStore(Instruction &I);
132 bool scalarizeLoadExtract(Instruction &I);
133 bool scalarizeExtExtract(Instruction &I);
134 bool foldConcatOfBoolMasks(Instruction &I);
135 bool foldPermuteOfBinops(Instruction &I);
136 bool foldShuffleOfBinops(Instruction &I);
137 bool foldShuffleOfSelects(Instruction &I);
138 bool foldShuffleOfCastops(Instruction &I);
139 bool foldShuffleOfShuffles(Instruction &I);
140 bool foldShuffleOfIntrinsics(Instruction &I);
141 bool foldShuffleToIdentity(Instruction &I);
142 bool foldShuffleFromReductions(Instruction &I);
143 bool foldShuffleChainsToReduce(Instruction &I);
144 bool foldCastFromReductions(Instruction &I);
145 bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
146 bool foldInterleaveIntrinsics(Instruction &I);
147 bool shrinkType(Instruction &I);
148 bool shrinkLoadForShuffles(Instruction &I);
149 bool shrinkPhiOfShuffles(Instruction &I);
150
151 void replaceValue(Instruction &Old, Value &New, bool Erase = true) {
152 LLVM_DEBUG(dbgs() << "VC: Replacing: " << Old << '\n');
153 LLVM_DEBUG(dbgs() << " With: " << New << '\n');
154 Old.replaceAllUsesWith(&New);
155 if (auto *NewI = dyn_cast<Instruction>(&New)) {
156 New.takeName(&Old);
157 Worklist.pushUsersToWorkList(*NewI);
158 Worklist.pushValue(NewI);
159 }
160 if (Erase && isInstructionTriviallyDead(&Old)) {
161 eraseInstruction(Old);
162 } else {
163 Worklist.push(&Old);
164 }
165 }
166
167 void eraseInstruction(Instruction &I) {
168 LLVM_DEBUG(dbgs() << "VC: Erasing: " << I << '\n');
169 SmallVector<Value *> Ops(I.operands());
170 Worklist.remove(&I);
171 I.eraseFromParent();
172
173 // Push remaining users of the operands and then the operand itself - allows
174 // further folds that were hindered by OneUse limits.
175 SmallPtrSet<Value *, 4> Visited;
176 for (Value *Op : Ops) {
177 if (!Visited.contains(Op)) {
178 if (auto *OpI = dyn_cast<Instruction>(Op)) {
179 if (RecursivelyDeleteTriviallyDeadInstructions(
180 OpI, nullptr, nullptr, [&](Value *V) {
181 if (auto *I = dyn_cast<Instruction>(V)) {
182 LLVM_DEBUG(dbgs() << "VC: Erased: " << *I << '\n');
183 Worklist.remove(I);
184 if (I == NextInst)
185 NextInst = NextInst->getNextNode();
186 Visited.insert(I);
187 }
188 }))
189 continue;
190 Worklist.pushUsersToWorkList(*OpI);
191 Worklist.pushValue(OpI);
192 }
193 }
194 }
195 }
196};
197} // namespace
198
199/// Return the source operand of a potentially bitcasted value. If there is no
200/// bitcast, return the input value itself.
201static Value *peekThroughBitcasts(Value *V) {
202 while (auto *BitCast = dyn_cast<BitCastInst>(V))
203 V = BitCast->getOperand(0);
204 return V;
205}
206
207static bool canWidenLoad(LoadInst *Load, const TargetTransformInfo &TTI) {
208 // Do not widen load if atomic/volatile or under asan/hwasan/memtag/tsan.
209 // The widened load may load data from dirty regions or create data races
210 // non-existent in the source.
211 if (!Load || !Load->isSimple() || !Load->hasOneUse() ||
212 Load->getFunction()->hasFnAttribute(Attribute::SanitizeMemTag) ||
213 mustSuppressSpeculation(*Load))
214 return false;
215
216 // We are potentially transforming byte-sized (8-bit) memory accesses, so make
217 // sure we have all of our type-based constraints in place for this target.
218 Type *ScalarTy = Load->getType()->getScalarType();
219 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
220 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
221 if (!ScalarSize || !MinVectorSize || MinVectorSize % ScalarSize != 0 ||
222 ScalarSize % 8 != 0)
223 return false;
224
225 return true;
226}
227
228bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
229 // Match insert into fixed vector of scalar value.
230 // TODO: Handle non-zero insert index.
231 Value *Scalar;
232 if (!match(&I,
233 m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())))
234 return false;
235
236 // Optionally match an extract from another vector.
237 Value *X;
238 bool HasExtract = match(Scalar, m_ExtractElt(m_Value(X), m_ZeroInt()));
239 if (!HasExtract)
240 X = Scalar;
241
242 auto *Load = dyn_cast<LoadInst>(X);
243 if (!canWidenLoad(Load, TTI))
244 return false;
245
246 Type *ScalarTy = Scalar->getType();
247 uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
248 unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
249
250 // Check safety of replacing the scalar load with a larger vector load.
251 // We use minimal alignment (maximum flexibility) because we only care about
252 // the dereferenceable region. When calculating cost and creating a new op,
253 // we may use a larger value based on alignment attributes.
254 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
255 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
256
257 unsigned MinVecNumElts = MinVectorSize / ScalarSize;
258 auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
259 unsigned OffsetEltIndex = 0;
260 Align Alignment = Load->getAlign();
261 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
262 &DT)) {
263 // It is not safe to load directly from the pointer, but we can still peek
264 // through gep offsets and check if it is safe to load from a base address with
265 // updated alignment. If it is, we can shuffle the element(s) into place
266 // after loading.
267 unsigned OffsetBitWidth = DL->getIndexTypeSizeInBits(SrcPtr->getType());
268 APInt Offset(OffsetBitWidth, 0);
269 SrcPtr = SrcPtr->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
270
271 // We want to shuffle the result down from a high element of a vector, so
272 // the offset must be positive.
273 if (Offset.isNegative())
274 return false;
275
276 // The offset must be a multiple of the scalar element to shuffle cleanly
277 // in the element's size.
278 uint64_t ScalarSizeInBytes = ScalarSize / 8;
279 if (Offset.urem(ScalarSizeInBytes) != 0)
280 return false;
281
282 // If we load MinVecNumElts, will our target element still be loaded?
283 OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
284 if (OffsetEltIndex >= MinVecNumElts)
285 return false;
286
287 if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
288 &DT))
289 return false;
290
291 // Update alignment with offset value. Note that the offset could be negated
292 // to more accurately represent "(new) SrcPtr - Offset = (old) SrcPtr", but
293 // negation does not change the result of the alignment calculation.
294 Alignment = commonAlignment(Alignment, Offset.getZExtValue());
295 }
296
297 // Original pattern: insertelt undef, load [free casts of] PtrOp, 0
298 // Use the greater of the alignment on the load or its source pointer.
299 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
300 Type *LoadTy = Load->getType();
301 unsigned AS = Load->getPointerAddressSpace();
302 InstructionCost OldCost =
303 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
304 APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
305 OldCost +=
306 TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
307 /* Insert */ true, HasExtract, CostKind);
308
309 // New pattern: load VecPtr
310 InstructionCost NewCost =
311 TTI.getMemoryOpCost(Instruction::Load, MinVecTy, Alignment, AS, CostKind);
312 // Optionally, we are shuffling the loaded vector element(s) into place.
313 // For the mask set everything but element 0 to undef to prevent poison from
314 // propagating from the extra loaded memory. This will also optionally
315 // shrink/grow the vector from the loaded size to the output size.
316 // We assume this operation has no cost in codegen if there was no offset.
317 // Note that we could use freeze to avoid poison problems, but then we might
318 // still need a shuffle to change the vector size.
319 auto *Ty = cast<FixedVectorType>(I.getType());
320 unsigned OutputNumElts = Ty->getNumElements();
321 SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
322 assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
323 Mask[0] = OffsetEltIndex;
324 if (OffsetEltIndex)
325 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, MinVecTy, Mask,
326 CostKind);
327
328 // We can aggressively convert to the vector form because the backend can
329 // invert this transform if it does not result in a performance win.
330 if (OldCost < NewCost || !NewCost.isValid())
331 return false;
332
333 // It is safe and potentially profitable to load a vector directly:
334 // inselt undef, load Scalar, 0 --> load VecPtr
335 IRBuilder<> Builder(Load);
336 Value *CastedPtr =
337 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
338 Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
339 VecLd = Builder.CreateShuffleVector(VecLd, Mask);
340
341 replaceValue(I, *VecLd);
342 ++NumVecLoad;
343 return true;
344}
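// Illustrative IR for the fold above (types and names are hypothetical, and the
// vector width actually comes from TTI.getMinVectorRegisterBitWidth()):
//   %s = load float, ptr %p, align 4
//   %v = insertelement <4 x float> poison, float %s, i64 0
// -->
//   %v = load <4 x float>, ptr %p, align 4
// If the safely loadable address is offset from %p, a single-source shuffle
// moves the target element back into lane 0 afterwards.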
345
346/// If we are loading a vector and then inserting it into a larger vector with
347/// undefined elements, try to load the larger vector and eliminate the insert.
348/// This removes a shuffle in IR and may allow combining of other loaded values.
349bool VectorCombine::widenSubvectorLoad(Instruction &I) {
350 // Match subvector insert of fixed vector.
351 auto *Shuf = cast<ShuffleVectorInst>(&I);
352 if (!Shuf->isIdentityWithPadding())
353 return false;
354
355 // Allow a non-canonical shuffle mask that is choosing elements from op1.
356 unsigned NumOpElts =
357 cast<FixedVectorType>(Shuf->getOperand(0)->getType())->getNumElements();
358 unsigned OpIndex = any_of(Shuf->getShuffleMask(), [&NumOpElts](int M) {
359 return M >= (int)(NumOpElts);
360 });
361
362 auto *Load = dyn_cast<LoadInst>(Shuf->getOperand(OpIndex));
363 if (!canWidenLoad(Load, TTI))
364 return false;
365
366 // We use minimal alignment (maximum flexibility) because we only care about
367 // the dereferenceable region. When calculating cost and creating a new op,
368 // we may use a larger value based on alignment attributes.
369 auto *Ty = cast<FixedVectorType>(I.getType());
370 Value *SrcPtr = Load->getPointerOperand()->stripPointerCasts();
371 assert(isa<PointerType>(SrcPtr->getType()) && "Expected a pointer type");
372 Align Alignment = Load->getAlign();
373 if (!isSafeToLoadUnconditionally(SrcPtr, Ty, Align(1), *DL, Load, &AC, &DT))
374 return false;
375
376 Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
377 Type *LoadTy = Load->getType();
378 unsigned AS = Load->getPointerAddressSpace();
379
380 // Original pattern: insert_subvector (load PtrOp)
381 // This conservatively assumes that the cost of a subvector insert into an
382 // undef value is 0. We could add that cost if the cost model accurately
383 // reflects the real cost of that operation.
384 InstructionCost OldCost =
385 TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS, CostKind);
386
387 // New pattern: load PtrOp
388 InstructionCost NewCost =
389 TTI.getMemoryOpCost(Instruction::Load, Ty, Alignment, AS, CostKind);
390
391 // We can aggressively convert to the vector form because the backend can
392 // invert this transform if it does not result in a performance win.
393 if (OldCost < NewCost || !NewCost.isValid())
394 return false;
395
396 IRBuilder<> Builder(Load);
397 Value *CastedPtr =
398 Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
399 Value *VecLd = Builder.CreateAlignedLoad(Ty, CastedPtr, Alignment);
400 replaceValue(I, *VecLd);
401 ++NumVecLoad;
402 return true;
403}
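// Illustrative IR for the subvector-load widening (hypothetical types; assumes
// the wider load is dereferenceable and not more expensive):
//   %v = load <2 x float>, ptr %p, align 8
//   %w = shufflevector <2 x float> %v, <2 x float> poison,
//                      <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
// -->
//   %w = load <4 x float>, ptr %p, align 8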
404
405/// Determine which, if any, of the inputs should be replaced by a shuffle
406/// followed by extract from a different index.
407ExtractElementInst *VectorCombine::getShuffleExtract(
408 ExtractElementInst *Ext0, ExtractElementInst *Ext1,
409 unsigned PreferredExtractIndex = InvalidIndex) const {
410 auto *Index0C = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
411 auto *Index1C = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
412 assert(Index0C && Index1C && "Expected constant extract indexes");
413
414 unsigned Index0 = Index0C->getZExtValue();
415 unsigned Index1 = Index1C->getZExtValue();
416
417 // If the extract indexes are identical, no shuffle is needed.
418 if (Index0 == Index1)
419 return nullptr;
420
421 Type *VecTy = Ext0->getVectorOperand()->getType();
422 assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
423 InstructionCost Cost0 =
424 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
425 InstructionCost Cost1 =
426 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
427
428 // If both costs are invalid no shuffle is needed
429 if (!Cost0.isValid() && !Cost1.isValid())
430 return nullptr;
431
432 // We are extracting from 2 different indexes, so one operand must be shuffled
433 // before performing a vector operation and/or extract. The more expensive
434 // extract will be replaced by a shuffle.
435 if (Cost0 > Cost1)
436 return Ext0;
437 if (Cost1 > Cost0)
438 return Ext1;
439
440 // If the costs are equal and there is a preferred extract index, shuffle the
441 // opposite operand.
442 if (PreferredExtractIndex == Index0)
443 return Ext1;
444 if (PreferredExtractIndex == Index1)
445 return Ext0;
446
447 // Otherwise, replace the extract with the higher index.
448 return Index0 > Index1 ? Ext0 : Ext1;
449}
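// Worked example (assumed costs): for (extractelement %v, 0) and
// (extractelement %v, 3) on a target where lane 0 is cheaper, this returns the
// lane-3 extract; callers then rewrite that operand as a splat-style shuffle so
// both values can be extracted from the same (cheaper or preferred) lane.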
450
451/// Compare the relative costs of 2 extracts followed by scalar operation vs.
452/// vector operation(s) followed by extract. Return true if the existing
453/// instructions are cheaper than a vector alternative. Otherwise, return false
454/// and if one of the extracts should be transformed to a shufflevector, set
455/// \p ConvertToShuffle to that extract instruction.
456bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
457 ExtractElementInst *Ext1,
458 const Instruction &I,
459 ExtractElementInst *&ConvertToShuffle,
460 unsigned PreferredExtractIndex) {
461 auto *Ext0IndexC = dyn_cast<ConstantInt>(Ext0->getIndexOperand());
462 auto *Ext1IndexC = dyn_cast<ConstantInt>(Ext1->getIndexOperand());
463 assert(Ext0IndexC && Ext1IndexC && "Expected constant extract indexes");
464
465 unsigned Opcode = I.getOpcode();
466 Value *Ext0Src = Ext0->getVectorOperand();
467 Value *Ext1Src = Ext1->getVectorOperand();
468 Type *ScalarTy = Ext0->getType();
469 auto *VecTy = cast<VectorType>(Ext0Src->getType());
470 InstructionCost ScalarOpCost, VectorOpCost;
471
472 // Get cost estimates for scalar and vector versions of the operation.
473 bool IsBinOp = Instruction::isBinaryOp(Opcode);
474 if (IsBinOp) {
475 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
476 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
477 } else {
478 assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
479 "Expected a compare");
480 CmpInst::Predicate Pred = cast<CmpInst>(I).getPredicate();
481 ScalarOpCost = TTI.getCmpSelInstrCost(
482 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
483 VectorOpCost = TTI.getCmpSelInstrCost(
484 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
485 }
486
487 // Get cost estimates for the extract elements. These costs will factor into
488 // both sequences.
489 unsigned Ext0Index = Ext0IndexC->getZExtValue();
490 unsigned Ext1Index = Ext1IndexC->getZExtValue();
491
492 InstructionCost Extract0Cost =
493 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Ext0Index);
494 InstructionCost Extract1Cost =
495 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Ext1Index);
496
497 // A more expensive extract will always be replaced by a splat shuffle.
498 // For example, if Ext0 is more expensive:
499 // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
500 // extelt (opcode (splat V0, Ext0), V1), Ext1
501 // TODO: Evaluate whether that always results in lowest cost. Alternatively,
502 // check the cost of creating a broadcast shuffle and shuffling both
503 // operands to element 0.
504 unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index;
505 unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index;
506 InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
507
508 // Extra uses of the extracts mean that we include those costs in the
509 // vector total because those instructions will not be eliminated.
510 InstructionCost OldCost, NewCost;
511 if (Ext0Src == Ext1Src && Ext0Index == Ext1Index) {
512 // Handle a special case. If the 2 extracts are identical, adjust the
513 // formulas to account for that. The extra use charge allows for either the
514 // CSE'd pattern or an unoptimized form with identical values:
515 // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
516 bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
517 : !Ext0->hasOneUse() || !Ext1->hasOneUse();
518 OldCost = CheapExtractCost + ScalarOpCost;
519 NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
520 } else {
521 // Handle the general case. Each extract is actually a different value:
522 // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
523 OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
524 NewCost = VectorOpCost + CheapExtractCost +
525 !Ext0->hasOneUse() * Extract0Cost +
526 !Ext1->hasOneUse() * Extract1Cost;
527 }
528
529 ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
530 if (ConvertToShuffle) {
531 if (IsBinOp && DisableBinopExtractShuffle)
532 return true;
533
534 // If we are extracting from 2 different indexes, then one operand must be
535 // shuffled before performing the vector operation. The shuffle mask is
536 // poison except for 1 lane that is being translated to the remaining
537 // extraction lane. Therefore, it is a splat shuffle. Ex:
538 // ShufMask = { poison, poison, 0, poison }
539 // TODO: The cost model has an option for a "broadcast" shuffle
540 // (splat-from-element-0), but no option for a more general splat.
541 if (auto *FixedVecTy = dyn_cast<FixedVectorType>(VecTy)) {
542 SmallVector<int> ShuffleMask(FixedVecTy->getNumElements(),
543 PoisonMaskElem);
544 ShuffleMask[BestInsIndex] = BestExtIndex;
545 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
546 VecTy, VecTy, ShuffleMask, CostKind, 0,
547 nullptr, {ConvertToShuffle});
548 } else {
549 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
550 VecTy, VecTy, {}, CostKind, 0, nullptr,
551 {ConvertToShuffle});
552 }
553 }
554
555 // Aggressively form a vector op if the cost is equal because the transform
556 // may enable further optimization.
557 // Codegen can reverse this transform (scalarize) if it was not profitable.
558 return OldCost < NewCost;
559}
560
561/// Create a shuffle that translates (shifts) 1 element from the input vector
562/// to a new element location.
563static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
564 unsigned NewIndex, IRBuilderBase &Builder) {
565 // The shuffle mask is poison except for 1 lane that is being translated
566 // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
567 // ShufMask = { 2, poison, poison, poison }
568 auto *VecTy = cast<FixedVectorType>(Vec->getType());
569 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
570 ShufMask[NewIndex] = OldIndex;
571 return Builder.CreateShuffleVector(Vec, ShufMask, "shift");
572}
573
574/// Given an extract element instruction with constant index operand, shuffle
575/// the source vector (shift the scalar element) to a NewIndex for extraction.
576/// Return null if the input can be constant folded, so that we are not creating
577/// unnecessary instructions.
578static Value *translateExtract(ExtractElementInst *ExtElt, unsigned NewIndex,
579 IRBuilderBase &Builder) {
580 // Shufflevectors can only be created for fixed-width vectors.
581 Value *X = ExtElt->getVectorOperand();
582 if (!isa<FixedVectorType>(X->getType()))
583 return nullptr;
584
585 // If the extract can be constant-folded, this code is unsimplified. Defer
586 // to other passes to handle that.
587 Value *C = ExtElt->getIndexOperand();
588 assert(isa<ConstantInt>(C) && "Expected a constant index operand");
589 if (isa<Constant>(X))
590 return nullptr;
591
592 Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
593 NewIndex, Builder);
594 return Shuf;
595}
596
597/// Try to reduce extract element costs by converting scalar compares to vector
598/// compares followed by extract.
599/// cmp (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
600Value *VectorCombine::foldExtExtCmp(Value *V0, Value *V1, Value *ExtIndex,
601 Instruction &I) {
602 assert(isa<CmpInst>(&I) && "Expected a compare");
603
604 // cmp Pred (extelt V0, ExtIndex), (extelt V1, ExtIndex)
605 // --> extelt (cmp Pred V0, V1), ExtIndex
606 ++NumVecCmp;
607 CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
608 Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
609 return Builder.CreateExtractElement(VecCmp, ExtIndex, "foldExtExtCmp");
610}
611
612/// Try to reduce extract element costs by converting scalar binops to vector
613/// binops followed by extract.
614/// bo (ext0 V0, ExtIndex), (ext1 V1, ExtIndex)
615Value *VectorCombine::foldExtExtBinop(Value *V0, Value *V1, Value *ExtIndex,
616 Instruction &I) {
617 assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
618
619 // bo (extelt V0, ExtIndex), (extelt V1, ExtIndex)
620 // --> extelt (bo V0, V1), ExtIndex
621 ++NumVecBO;
622 Value *VecBO = Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0,
623 V1, "foldExtExtBinop");
624
625 // All IR flags are safe to back-propagate because any potential poison
626 // created in unused vector elements is discarded by the extract.
627 if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
628 VecBOInst->copyIRFlags(&I);
629
630 return Builder.CreateExtractElement(VecBO, ExtIndex, "foldExtExtBinop");
631}
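// Illustrative IR for the two folds above (hypothetical vector type):
//   %a = extractelement <4 x i32> %x, i64 1
//   %b = extractelement <4 x i32> %y, i64 1
//   %r = add i32 %a, %b
// -->
//   %v = add <4 x i32> %x, %y
//   %r = extractelement <4 x i32> %v, i64 1
// The compare variant is the same shape with icmp/fcmp in place of the binop.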
632
633/// Match an instruction with extracted vector operands.
634bool VectorCombine::foldExtractExtract(Instruction &I) {
635 // It is not safe to transform things like div, urem, etc. because we may
636 // create undefined behavior when executing those on unknown vector elements.
637 if (!isSafeToSpeculativelyExecute(&I))
638 return false;
639
640 Instruction *I0, *I1;
641 CmpPredicate Pred = CmpInst::BAD_ICMP_PREDICATE;
642 if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
643 !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
644 return false;
645
646 Value *V0, *V1;
647 uint64_t C0, C1;
648 if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
649 !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
650 V0->getType() != V1->getType())
651 return false;
652
653 // If the scalar value 'I' is going to be re-inserted into a vector, then try
654 // to create an extract to that same element. The extract/insert can be
655 // reduced to a "select shuffle".
656 // TODO: If we add a larger pattern match that starts from an insert, this
657 // probably becomes unnecessary.
658 auto *Ext0 = cast<ExtractElementInst>(I0);
659 auto *Ext1 = cast<ExtractElementInst>(I1);
660 uint64_t InsertIndex = InvalidIndex;
661 if (I.hasOneUse())
662 match(I.user_back(),
663 m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
664
665 ExtractElementInst *ExtractToChange;
666 if (isExtractExtractCheap(Ext0, Ext1, I, ExtractToChange, InsertIndex))
667 return false;
668
669 Value *ExtOp0 = Ext0->getVectorOperand();
670 Value *ExtOp1 = Ext1->getVectorOperand();
671
672 if (ExtractToChange) {
673 unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
674 Value *NewExtOp =
675 translateExtract(ExtractToChange, CheapExtractIdx, Builder);
676 if (!NewExtOp)
677 return false;
678 if (ExtractToChange == Ext0)
679 ExtOp0 = NewExtOp;
680 else
681 ExtOp1 = NewExtOp;
682 }
683
684 Value *ExtIndex = ExtractToChange == Ext0 ? Ext1->getIndexOperand()
685 : Ext0->getIndexOperand();
686 Value *NewExt = Pred != CmpInst::BAD_ICMP_PREDICATE
687 ? foldExtExtCmp(ExtOp0, ExtOp1, ExtIndex, I)
688 : foldExtExtBinop(ExtOp0, ExtOp1, ExtIndex, I);
689 Worklist.push(Ext0);
690 Worklist.push(Ext1);
691 replaceValue(I, *NewExt);
692 return true;
693}
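// Illustrative IR for the mismatched-index case (hypothetical types; assumes
// the cost model chooses to shuffle the lane-2 extract):
//   %a = extractelement <4 x i32> %x, i64 2
//   %b = extractelement <4 x i32> %y, i64 0
//   %r = add i32 %a, %b
// -->
//   %s = shufflevector <4 x i32> %x, <4 x i32> poison,
//                      <4 x i32> <i32 2, i32 poison, i32 poison, i32 poison>
//   %v = add <4 x i32> %s, %y
//   %r = extractelement <4 x i32> %v, i64 0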
694
695/// Try to replace an extract + scalar fneg + insert with a vector fneg +
696/// shuffle.
697bool VectorCombine::foldInsExtFNeg(Instruction &I) {
698 // Match an insert (op (extract)) pattern.
699 Value *DstVec;
700 uint64_t ExtIdx, InsIdx;
701 Instruction *FNeg;
702 if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
703 m_ConstantInt(InsIdx))))
704 return false;
705
706 // Note: This handles the canonical fneg instruction and "fsub -0.0, X".
707 Value *SrcVec;
708 Instruction *Extract;
709 if (!match(FNeg, m_FNeg(m_CombineAnd(
710 m_Instruction(Extract),
711 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
712 return false;
713
714 auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
715 auto *DstVecScalarTy = DstVecTy->getScalarType();
716 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
717 if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
718 return false;
719
720 // Ignore if insert/extract index is out of bounds or destination vector has
721 // one element
722 unsigned NumDstElts = DstVecTy->getNumElements();
723 unsigned NumSrcElts = SrcVecTy->getNumElements();
724 if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
725 return false;
726
727 // We are inserting the negated element into the same lane that we extracted
728 // from. This is equivalent to a select-shuffle that chooses all but the
729 // negated element from the destination vector.
730 SmallVector<int> Mask(NumDstElts);
731 std::iota(Mask.begin(), Mask.end(), 0);
732 Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
733 InstructionCost OldCost =
734 TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
735 TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);
736
737 // If the extract has one use, it will be eliminated, so count it in the
738 // original cost. If it has more than one use, ignore the cost because it will
739 // be the same before/after.
740 if (Extract->hasOneUse())
741 OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);
742
743 InstructionCost NewCost =
744 TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
745 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, DstVecTy,
746 DstVecTy, Mask, CostKind);
747
748 bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
749 // If the lengths of the two vectors are not equal,
750 // we need to add a length-change vector. Add this cost.
751 SmallVector<int> SrcMask;
752 if (NeedLenChg) {
753 SrcMask.assign(NumDstElts, PoisonMaskElem);
754 SrcMask[ExtIdx % NumDstElts] = ExtIdx;
755 NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc,
756 DstVecTy, SrcVecTy, SrcMask, CostKind);
757 }
758
759 LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
760 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
761 << "\n");
762 if (NewCost > OldCost)
763 return false;
764
765 Value *NewShuf, *LenChgShuf = nullptr;
766 // insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
767 Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
768 if (NeedLenChg) {
769 // shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
770 LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
771 NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
772 Worklist.pushValue(LenChgShuf);
773 } else {
774 // shuffle DstVec, (fneg SrcVec), Mask
775 NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
776 }
777
778 Worklist.pushValue(VecFNeg);
779 replaceValue(I, *NewShuf);
780 return true;
781}
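// Illustrative IR for the fneg fold (equal source/destination lengths, insert
// and extract at lane 2; assumes the cost comparison succeeds):
//   %e = extractelement <4 x float> %src, i64 2
//   %n = fneg float %e
//   %r = insertelement <4 x float> %dst, float %n, i64 2
// -->
//   %vn = fneg <4 x float> %src
//   %r  = shufflevector <4 x float> %dst, <4 x float> %vn,
//                       <4 x i32> <i32 0, i32 1, i32 6, i32 3>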
782
783/// Try to fold insert(binop(x,y),binop(a,b),idx)
784/// --> binop(insert(x,a,idx),insert(y,b,idx))
785bool VectorCombine::foldInsExtBinop(Instruction &I) {
786 BinaryOperator *VecBinOp, *SclBinOp;
787 uint64_t Index;
788 if (!match(&I,
789 m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
790 m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
791 return false;
792
793 // TODO: Add support for addlike etc.
794 Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
795 if (BinOpcode != SclBinOp->getOpcode())
796 return false;
797
798 auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
799 if (!ResultTy)
800 return false;
801
802 // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
803 // shuffle?
804
805 InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
806 TTI.getInstructionCost(VecBinOp, CostKind) +
807 TTI.getInstructionCost(SclBinOp, CostKind);
808 InstructionCost NewCost =
809 TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
810 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
811 Index, VecBinOp->getOperand(0),
812 SclBinOp->getOperand(0)) +
813 TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
814 Index, VecBinOp->getOperand(1),
815 SclBinOp->getOperand(1));
816
817 LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
818 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
819 << "\n");
820 if (NewCost > OldCost)
821 return false;
822
823 Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
824 SclBinOp->getOperand(0), Index);
825 Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
826 SclBinOp->getOperand(1), Index);
827 Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
828
829 // Intersect flags from the old binops.
830 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
831 NewInst->copyIRFlags(VecBinOp);
832 NewInst->andIRFlags(SclBinOp);
833 }
834
835 Worklist.pushValue(NewIns0);
836 Worklist.pushValue(NewIns1);
837 replaceValue(I, *NewBO);
838 return true;
839}
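// Illustrative IR for the insert-of-two-binops fold (hypothetical types):
//   %v = add <4 x i32> %x, %y
//   %s = add i32 %a, %b
//   %r = insertelement <4 x i32> %v, i32 %s, i64 1
// -->
//   %x1 = insertelement <4 x i32> %x, i32 %a, i64 1
//   %y1 = insertelement <4 x i32> %y, i32 %b, i64 1
//   %r  = add <4 x i32> %x1, %y1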
840
841/// Match: bitop(castop(x), castop(y)) -> castop(bitop(x, y))
842/// Supports: bitcast, trunc, sext, zext
843bool VectorCombine::foldBitOpOfCastops(Instruction &I) {
844 // Check if this is a bitwise logic operation
845 auto *BinOp = dyn_cast<BinaryOperator>(&I);
846 if (!BinOp || !BinOp->isBitwiseLogicOp())
847 return false;
848
849 // Get the cast instructions
850 auto *LHSCast = dyn_cast<CastInst>(BinOp->getOperand(0));
851 auto *RHSCast = dyn_cast<CastInst>(BinOp->getOperand(1));
852 if (!LHSCast || !RHSCast) {
853 LLVM_DEBUG(dbgs() << " One or both operands are not cast instructions\n");
854 return false;
855 }
856
857 // Both casts must be the same type
858 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
859 if (CastOpcode != RHSCast->getOpcode())
860 return false;
861
862 // Only handle supported cast operations
863 switch (CastOpcode) {
864 case Instruction::BitCast:
865 case Instruction::Trunc:
866 case Instruction::SExt:
867 case Instruction::ZExt:
868 break;
869 default:
870 return false;
871 }
872
873 Value *LHSSrc = LHSCast->getOperand(0);
874 Value *RHSSrc = RHSCast->getOperand(0);
875
876 // Source types must match
877 if (LHSSrc->getType() != RHSSrc->getType())
878 return false;
879
880 auto *SrcTy = LHSSrc->getType();
881 auto *DstTy = I.getType();
882 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
883 // Other casts only handle vector types with integer elements.
884 if (CastOpcode != Instruction::BitCast &&
885 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
886 return false;
887
888 // Only integer scalar/vector values are legal for bitwise logic operations.
889 if (!SrcTy->getScalarType()->isIntegerTy() ||
890 !DstTy->getScalarType()->isIntegerTy())
891 return false;
892
893 // Cost Check :
894 // OldCost = bitlogic + 2*casts
895 // NewCost = bitlogic + cast
896
897 // Calculate specific costs for each cast with instruction context
898 InstructionCost LHSCastCost = TTI.getCastInstrCost(
899 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
900 InstructionCost RHSCastCost = TTI.getCastInstrCost(
901 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, RHSCast);
902
903 InstructionCost OldCost =
904 TTI.getArithmeticInstrCost(BinOp->getOpcode(), DstTy, CostKind) +
905 LHSCastCost + RHSCastCost;
906
907 // For new cost, we can't provide an instruction (it doesn't exist yet)
908 InstructionCost GenericCastCost = TTI.getCastInstrCost(
909 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
910
911 InstructionCost NewCost =
912 TTI.getArithmeticInstrCost(BinOp->getOpcode(), SrcTy, CostKind) +
913 GenericCastCost;
914
915 // Account for multi-use casts using specific costs
916 if (!LHSCast->hasOneUse())
917 NewCost += LHSCastCost;
918 if (!RHSCast->hasOneUse())
919 NewCost += RHSCastCost;
920
921 LLVM_DEBUG(dbgs() << "foldBitOpOfCastops: OldCost=" << OldCost
922 << " NewCost=" << NewCost << "\n");
923
924 if (NewCost > OldCost)
925 return false;
926
927 // Create the operation on the source type
928 Value *NewOp = Builder.CreateBinOp(BinOp->getOpcode(), LHSSrc, RHSSrc,
929 BinOp->getName() + ".inner");
930 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
931 NewBinOp->copyIRFlags(BinOp);
932
933 Worklist.pushValue(NewOp);
934
935 // Create the cast operation directly to ensure we get a new instruction
936 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
937
938 // Preserve cast instruction flags
939 NewCast->copyIRFlags(LHSCast);
940 NewCast->andIRFlags(RHSCast);
941
942 // Insert the new instruction
943 Value *Result = Builder.Insert(NewCast);
944
945 replaceValue(I, *Result);
946 return true;
947}
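// Illustrative IR for the bitop-of-casts fold (zext shown; bitcast, trunc and
// sext are handled the same way):
//   %zx = zext <4 x i16> %x to <4 x i32>
//   %zy = zext <4 x i16> %y to <4 x i32>
//   %r  = and <4 x i32> %zx, %zy
// -->
//   %a = and <4 x i16> %x, %y
//   %r = zext <4 x i16> %a to <4 x i32>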
948
949/// Match:
950// bitop(castop(x), C) ->
951// bitop(castop(x), castop(InvC)) ->
952// castop(bitop(x, InvC))
953// Supports: bitcast
954bool VectorCombine::foldBitOpOfCastConstant(Instruction &I) {
955 Value *LHS;
956 Constant *C;
957
958 // Check if this is a bitwise logic operation
959 if (!match(&I, m_c_BitwiseLogic(m_Value(LHS), m_Constant(C))))
960 return false;
961
962 // Get the cast instructions
963 auto *LHSCast = dyn_cast<CastInst>(LHS);
964 if (!LHSCast)
965 return false;
966
967 Instruction::CastOps CastOpcode = LHSCast->getOpcode();
968
969 // Only handle supported cast operations
970 switch (CastOpcode) {
971 case Instruction::BitCast:
972 case Instruction::ZExt:
973 case Instruction::SExt:
974 case Instruction::Trunc:
975 break;
976 default:
977 return false;
978 }
979
980 Value *LHSSrc = LHSCast->getOperand(0);
981
982 auto *SrcTy = LHSSrc->getType();
983 auto *DstTy = I.getType();
984 // Bitcasts can handle scalar/vector mixes, such as i16 -> <16 x i1>.
985 // Other casts only handle vector types with integer elements.
986 if (CastOpcode != Instruction::BitCast &&
987 (!isa<FixedVectorType>(SrcTy) || !isa<FixedVectorType>(DstTy)))
988 return false;
989
990 // Only integer scalar/vector values are legal for bitwise logic operations.
991 if (!SrcTy->getScalarType()->isIntegerTy() ||
992 !DstTy->getScalarType()->isIntegerTy())
993 return false;
994
995 // Find the constant InvC, such that castop(InvC) equals to C.
996 PreservedCastFlags RHSFlags;
997 Constant *InvC = getLosslessInvCast(C, SrcTy, CastOpcode, *DL, &RHSFlags);
998 if (!InvC)
999 return false;
1000
1001 // Cost Check :
1002 // OldCost = bitlogic + cast
1003 // NewCost = bitlogic + cast
1004
1005 // Calculate specific costs for each cast with instruction context
1006 InstructionCost LHSCastCost = TTI.getCastInstrCost(
1007 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind, LHSCast);
1008
1009 InstructionCost OldCost =
1010 TTI.getArithmeticInstrCost(I.getOpcode(), DstTy, CostKind) + LHSCastCost;
1011
1012 // For new cost, we can't provide an instruction (it doesn't exist yet)
1013 InstructionCost GenericCastCost = TTI.getCastInstrCost(
1014 CastOpcode, DstTy, SrcTy, TTI::CastContextHint::None, CostKind);
1015
1016 InstructionCost NewCost =
1017 TTI.getArithmeticInstrCost(I.getOpcode(), SrcTy, CostKind) +
1018 GenericCastCost;
1019
1020 // Account for multi-use casts using specific costs
1021 if (!LHSCast->hasOneUse())
1022 NewCost += LHSCastCost;
1023
1024 LLVM_DEBUG(dbgs() << "foldBitOpOfCastConstant: OldCost=" << OldCost
1025 << " NewCost=" << NewCost << "\n");
1026
1027 if (NewCost > OldCost)
1028 return false;
1029
1030 // Create the operation on the source type
1031 Value *NewOp = Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(),
1032 LHSSrc, InvC, I.getName() + ".inner");
1033 if (auto *NewBinOp = dyn_cast<BinaryOperator>(NewOp))
1034 NewBinOp->copyIRFlags(&I);
1035
1036 Worklist.pushValue(NewOp);
1037
1038 // Create the cast operation directly to ensure we get a new instruction
1039 Instruction *NewCast = CastInst::Create(CastOpcode, NewOp, I.getType());
1040
1041 // Preserve cast instruction flags
1042 if (RHSFlags.NNeg)
1043 NewCast->setNonNeg();
1044 if (RHSFlags.NUW)
1045 NewCast->setHasNoUnsignedWrap();
1046 if (RHSFlags.NSW)
1047 NewCast->setHasNoSignedWrap();
1048
1049 NewCast->andIRFlags(LHSCast);
1050
1051 // Insert the new instruction
1052 Value *Result = Builder.Insert(NewCast);
1053
1054 replaceValue(I, *Result);
1055 return true;
1056}
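// Illustrative IR for the cast-with-constant variant (assumes the constant can
// be losslessly "un-cast" to the source type, here zext i8 15 == i32 15):
//   %zx = zext <4 x i8> %x to <4 x i32>
//   %r  = and <4 x i32> %zx, splat (i32 15)
// -->
//   %a = and <4 x i8> %x, splat (i8 15)
//   %r = zext <4 x i8> %a to <4 x i32>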
1057
1058/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
1059/// destination type followed by shuffle. This can enable further transforms by
1060/// moving bitcasts or shuffles together.
1061bool VectorCombine::foldBitcastShuffle(Instruction &I) {
1062 Value *V0, *V1;
1063 ArrayRef<int> Mask;
1064 if (!match(&I, m_BitCast(m_OneUse(
1065 m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(Mask))))))
1066 return false;
1067
1068 // 1) Do not fold bitcast shuffle for scalable type. First, shuffle cost for
1069 // scalable type is unknown; Second, we cannot reason if the narrowed shuffle
1070 // mask for scalable type is a splat or not.
1071 // 2) Disallow non-vector casts.
1072 // TODO: We could allow any shuffle.
1073 auto *DestTy = dyn_cast<FixedVectorType>(I.getType());
1074 auto *SrcTy = dyn_cast<FixedVectorType>(V0->getType());
1075 if (!DestTy || !SrcTy)
1076 return false;
1077
1078 unsigned DestEltSize = DestTy->getScalarSizeInBits();
1079 unsigned SrcEltSize = SrcTy->getScalarSizeInBits();
1080 if (SrcTy->getPrimitiveSizeInBits() % DestEltSize != 0)
1081 return false;
1082
1083 bool IsUnary = isa<UndefValue>(V1);
1084
1085 // For binary shuffles, only fold bitcast(shuffle(X,Y))
1086 // if it won't increase the number of bitcasts.
1087 if (!IsUnary) {
1088 auto *BCTy0 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V0)->getType());
1089 auto *BCTy1 = dyn_cast<FixedVectorType>(peekThroughBitcasts(V1)->getType());
1090 if (!(BCTy0 && BCTy0->getElementType() == DestTy->getElementType()) &&
1091 !(BCTy1 && BCTy1->getElementType() == DestTy->getElementType()))
1092 return false;
1093 }
1094
1095 SmallVector<int, 16> NewMask;
1096 if (DestEltSize <= SrcEltSize) {
1097 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
1098 // always be expanded to the equivalent form choosing narrower elements.
1099 assert(SrcEltSize % DestEltSize == 0 && "Unexpected shuffle mask");
1100 unsigned ScaleFactor = SrcEltSize / DestEltSize;
1101 narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
1102 } else {
1103 // The bitcast is from narrow elements to wide elements. The shuffle mask
1104 // must choose consecutive elements to allow casting first.
1105 assert(DestEltSize % SrcEltSize == 0 && "Unexpected shuffle mask");
1106 unsigned ScaleFactor = DestEltSize / SrcEltSize;
1107 if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
1108 return false;
1109 }
1110
1111 // Bitcast the shuffle src - keep its original width but using the destination
1112 // scalar type.
1113 unsigned NumSrcElts = SrcTy->getPrimitiveSizeInBits() / DestEltSize;
1114 auto *NewShuffleTy =
1115 FixedVectorType::get(DestTy->getScalarType(), NumSrcElts);
1116 auto *OldShuffleTy =
1117 FixedVectorType::get(SrcTy->getScalarType(), Mask.size());
1118 unsigned NumOps = IsUnary ? 1 : 2;
1119
1120 // The new shuffle must not cost more than the old shuffle.
1121 TargetTransformInfo::ShuffleKind SK =
1122 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
1123 : TargetTransformInfo::SK_PermuteTwoSrc;
1124
1125 InstructionCost NewCost =
1126 TTI.getShuffleCost(SK, DestTy, NewShuffleTy, NewMask, CostKind) +
1127 (NumOps * TTI.getCastInstrCost(Instruction::BitCast, NewShuffleTy, SrcTy,
1128 TargetTransformInfo::CastContextHint::None,
1129 CostKind));
1130 InstructionCost OldCost =
1131 TTI.getShuffleCost(SK, OldShuffleTy, SrcTy, Mask, CostKind) +
1132 TTI.getCastInstrCost(Instruction::BitCast, DestTy, OldShuffleTy,
1133 TargetTransformInfo::CastContextHint::None,
1134 CostKind);
1135
1136 LLVM_DEBUG(dbgs() << "Found a bitcasted shuffle: " << I << "\n OldCost: "
1137 << OldCost << " vs NewCost: " << NewCost << "\n");
1138
1139 if (NewCost > OldCost || !NewCost.isValid())
1140 return false;
1141
1142 // bitcast (shuf V0, V1, MaskC) --> shuf (bitcast V0), (bitcast V1), MaskC'
1143 ++NumShufOfBitcast;
1144 Value *CastV0 = Builder.CreateBitCast(peekThroughBitcasts(V0), NewShuffleTy);
1145 Value *CastV1 = Builder.CreateBitCast(peekThroughBitcasts(V1), NewShuffleTy);
1146 Value *Shuf = Builder.CreateShuffleVector(CastV0, CastV1, NewMask);
1147 replaceValue(I, *Shuf);
1148 return true;
1149}
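// Illustrative IR for the bitcast-of-shuffle fold (unary shuffle, wide-to-narrow
// bitcast, so the mask is simply expanded):
//   %s = shufflevector <2 x i64> %v, <2 x i64> poison, <2 x i32> <i32 1, i32 0>
//   %b = bitcast <2 x i64> %s to <4 x i32>
// -->
//   %c = bitcast <2 x i64> %v to <4 x i32>
//   %b = shufflevector <4 x i32> %c, <4 x i32> poison,
//                      <4 x i32> <i32 2, i32 3, i32 0, i32 1>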
1150
1151/// VP Intrinsics whose vector operands are both splat values may be simplified
1152/// into the scalar version of the operation and the result splatted. This
1153/// can lead to scalarization down the line.
1154bool VectorCombine::scalarizeVPIntrinsic(Instruction &I) {
1155 if (!isa<VPIntrinsic>(I))
1156 return false;
1157 VPIntrinsic &VPI = cast<VPIntrinsic>(I);
1158 Value *Op0 = VPI.getArgOperand(0);
1159 Value *Op1 = VPI.getArgOperand(1);
1160
1161 if (!isSplatValue(Op0) || !isSplatValue(Op1))
1162 return false;
1163
1164 // Check getSplatValue early in this function, to avoid doing unnecessary
1165 // work.
1166 Value *ScalarOp0 = getSplatValue(Op0);
1167 Value *ScalarOp1 = getSplatValue(Op1);
1168 if (!ScalarOp0 || !ScalarOp1)
1169 return false;
1170
1171 // For the binary VP intrinsics supported here, the result on disabled lanes
1172 // is a poison value. For now, only do this simplification if all lanes
1173 // are active.
1174 // TODO: Relax the condition that all lanes are active by using insertelement
1175 // on inactive lanes.
1176 auto IsAllTrueMask = [](Value *MaskVal) {
1177 if (Value *SplattedVal = getSplatValue(MaskVal))
1178 if (auto *ConstValue = dyn_cast<Constant>(SplattedVal))
1179 return ConstValue->isAllOnesValue();
1180 return false;
1181 };
1182 if (!IsAllTrueMask(VPI.getArgOperand(2)))
1183 return false;
1184
1185 // Check to make sure we support scalarization of the intrinsic
1186 Intrinsic::ID IntrID = VPI.getIntrinsicID();
1187 if (!VPBinOpIntrinsic::isVPBinOp(IntrID))
1188 return false;
1189
1190 // Calculate cost of splatting both operands into vectors and the vector
1191 // intrinsic
1192 VectorType *VecTy = cast<VectorType>(VPI.getType());
1193 SmallVector<int> Mask;
1194 if (auto *FVTy = dyn_cast<FixedVectorType>(VecTy))
1195 Mask.resize(FVTy->getNumElements(), 0);
1196 InstructionCost SplatCost =
1197 TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, CostKind, 0) +
1198 TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, VecTy, Mask,
1199 CostKind);
1200
1201 // Calculate the cost of the VP Intrinsic
1202 SmallVector<Type *> Args;
1203 for (Value *V : VPI.args())
1204 Args.push_back(V->getType());
1205 IntrinsicCostAttributes Attrs(IntrID, VecTy, Args);
1206 InstructionCost VectorOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1207 InstructionCost OldCost = 2 * SplatCost + VectorOpCost;
1208
1209 // Determine scalar opcode
1210 std::optional<unsigned> FunctionalOpcode =
1211 VPI.getFunctionalOpcode();
1212 std::optional<Intrinsic::ID> ScalarIntrID = std::nullopt;
1213 if (!FunctionalOpcode) {
1214 ScalarIntrID = VPI.getFunctionalIntrinsicID();
1215 if (!ScalarIntrID)
1216 return false;
1217 }
1218
1219 // Calculate cost of scalarizing
1220 InstructionCost ScalarOpCost = 0;
1221 if (ScalarIntrID) {
1222 IntrinsicCostAttributes Attrs(*ScalarIntrID, VecTy->getScalarType(), Args);
1223 ScalarOpCost = TTI.getIntrinsicInstrCost(Attrs, CostKind);
1224 } else {
1225 ScalarOpCost = TTI.getArithmeticInstrCost(*FunctionalOpcode,
1226 VecTy->getScalarType(), CostKind);
1227 }
1228
1229 // The existing splats may be kept around if other instructions use them.
1230 InstructionCost CostToKeepSplats =
1231 (SplatCost * !Op0->hasOneUse()) + (SplatCost * !Op1->hasOneUse());
1232 InstructionCost NewCost = ScalarOpCost + SplatCost + CostToKeepSplats;
1233
1234 LLVM_DEBUG(dbgs() << "Found a VP Intrinsic to scalarize: " << VPI
1235 << "\n");
1236 LLVM_DEBUG(dbgs() << "Cost of Intrinsic: " << OldCost
1237 << ", Cost of scalarizing:" << NewCost << "\n");
1238
1239 // We want to scalarize unless the vector variant actually has lower cost.
1240 if (OldCost < NewCost || !NewCost.isValid())
1241 return false;
1242
1243 // Scalarize the intrinsic
1244 ElementCount EC = cast<VectorType>(Op0->getType())->getElementCount();
1245 Value *EVL = VPI.getArgOperand(3);
1246
1247 // If the VP op might introduce UB or poison, we can scalarize it provided
1248 // that we know the EVL > 0: If the EVL is zero, then the original VP op
1249 // becomes a no-op and thus won't be UB, so make sure we don't introduce UB by
1250 // scalarizing it.
1251 bool SafeToSpeculate;
1252 if (ScalarIntrID)
1253 SafeToSpeculate = Intrinsic::getFnAttributes(I.getContext(), *ScalarIntrID)
1254 .hasAttribute(Attribute::AttrKind::Speculatable);
1255 else
1256 SafeToSpeculate = isSafeToSpeculativelyExecuteWithOpcode(
1257 *FunctionalOpcode, &VPI, nullptr, &AC, &DT);
1258 if (!SafeToSpeculate &&
1259 !isKnownNonZero(EVL, SimplifyQuery(*DL, &DT, &AC, &VPI)))
1260 return false;
1261
1262 Value *ScalarVal =
1263 ScalarIntrID
1264 ? Builder.CreateIntrinsic(VecTy->getScalarType(), *ScalarIntrID,
1265 {ScalarOp0, ScalarOp1})
1266 : Builder.CreateBinOp((Instruction::BinaryOps)(*FunctionalOpcode),
1267 ScalarOp0, ScalarOp1);
1268
1269 replaceValue(VPI, *Builder.CreateVectorSplat(EC, ScalarVal));
1270 return true;
1271}
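// Illustrative scalarization of a VP intrinsic (all-true mask; %x and %y are
// the splatted scalars and the EVL is known non-zero or the op is speculatable):
//   %r = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %xsplat, <4 x i32> %ysplat,
//                                          <4 x i1> splat (i1 true), i32 %evl)
// -->
//   %s = add i32 %x, %y
//   %r = splat of %s, rebuilt with insertelement + shufflevector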
1272
1273/// Match a vector op/compare/intrinsic with at least one
1274/// inserted scalar operand and convert to scalar op/cmp/intrinsic followed
1275/// by insertelement.
1276bool VectorCombine::scalarizeOpOrCmp(Instruction &I) {
1277 auto *UO = dyn_cast<UnaryOperator>(&I);
1278 auto *BO = dyn_cast<BinaryOperator>(&I);
1279 auto *CI = dyn_cast<CmpInst>(&I);
1280 auto *II = dyn_cast<IntrinsicInst>(&I);
1281 if (!UO && !BO && !CI && !II)
1282 return false;
1283
1284 // TODO: Allow intrinsics with different argument types
1285 if (II) {
1286 if (!isTriviallyVectorizable(II->getIntrinsicID()))
1287 return false;
1288 for (auto [Idx, Arg] : enumerate(II->args()))
1289 if (Arg->getType() != II->getType() &&
1290 !isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, &TTI))
1291 return false;
1292 }
1293
1294 // Do not convert the vector condition of a vector select into a scalar
1295 // condition. That may cause problems for codegen because of differences in
1296 // boolean formats and register-file transfers.
1297 // TODO: Can we account for that in the cost model?
1298 if (CI)
1299 for (User *U : I.users())
1300 if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
1301 return false;
1302
1303 // Match constant vectors or scalars being inserted into constant vectors:
1304 // vec_op [VecC0 | (inselt VecC0, V0, Index)], ...
1305 SmallVector<Value *> VecCs, ScalarOps;
1306 std::optional<uint64_t> Index;
1307
1308 auto Ops = II ? II->args() : I.operands();
1309 for (auto [OpNum, Op] : enumerate(Ops)) {
1310 Constant *VecC;
1311 Value *V;
1312 uint64_t InsIdx = 0;
1313 if (match(Op.get(), m_InsertElt(m_Constant(VecC), m_Value(V),
1314 m_ConstantInt(InsIdx)))) {
1315 // Bail if any inserts are out of bounds.
1316 VectorType *OpTy = cast<VectorType>(Op->getType());
1317 if (OpTy->getElementCount().getKnownMinValue() <= InsIdx)
1318 return false;
1319 // All inserts must have the same index.
1320 // TODO: Deal with mismatched index constants and variable indexes?
1321 if (!Index)
1322 Index = InsIdx;
1323 else if (InsIdx != *Index)
1324 return false;
1325 VecCs.push_back(VecC);
1326 ScalarOps.push_back(V);
1327 } else if (II && isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
1328 OpNum, &TTI)) {
1329 VecCs.push_back(Op.get());
1330 ScalarOps.push_back(Op.get());
1331 } else if (match(Op.get(), m_Constant(VecC))) {
1332 VecCs.push_back(VecC);
1333 ScalarOps.push_back(nullptr);
1334 } else {
1335 return false;
1336 }
1337 }
1338
1339 // Bail if all operands are constant.
1340 if (!Index.has_value())
1341 return false;
1342
1343 VectorType *VecTy = cast<VectorType>(I.getType());
1344 Type *ScalarTy = VecTy->getScalarType();
1345 assert(VecTy->isVectorTy() &&
1346 (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
1347 ScalarTy->isPointerTy()) &&
1348 "Unexpected types for insert element into binop or cmp");
1349
1350 unsigned Opcode = I.getOpcode();
1351 InstructionCost ScalarOpCost, VectorOpCost;
1352 if (CI) {
1353 CmpInst::Predicate Pred = CI->getPredicate();
1354 ScalarOpCost = TTI.getCmpSelInstrCost(
1355 Opcode, ScalarTy, CmpInst::makeCmpResultType(ScalarTy), Pred, CostKind);
1356 VectorOpCost = TTI.getCmpSelInstrCost(
1357 Opcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1358 } else if (UO || BO) {
1359 ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy, CostKind);
1360 VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy, CostKind);
1361 } else {
1362 IntrinsicCostAttributes ScalarICA(
1363 II->getIntrinsicID(), ScalarTy,
1364 SmallVector<Type *>(II->arg_size(), ScalarTy));
1365 ScalarOpCost = TTI.getIntrinsicInstrCost(ScalarICA, CostKind);
1366 IntrinsicCostAttributes VectorICA(
1367 II->getIntrinsicID(), VecTy,
1368 SmallVector<Type *>(II->arg_size(), VecTy));
1369 VectorOpCost = TTI.getIntrinsicInstrCost(VectorICA, CostKind);
1370 }
1371
1372 // Fold the vector constants in the original vectors into a new base vector to
1373 // get more accurate cost modelling.
1374 Value *NewVecC = nullptr;
1375 if (CI)
1376 NewVecC = simplifyCmpInst(CI->getPredicate(), VecCs[0], VecCs[1], SQ);
1377 else if (UO)
1378 NewVecC =
1379 simplifyUnOp(UO->getOpcode(), VecCs[0], UO->getFastMathFlags(), SQ);
1380 else if (BO)
1381 NewVecC = simplifyBinOp(BO->getOpcode(), VecCs[0], VecCs[1], SQ);
1382 else if (II)
1383 NewVecC = simplifyCall(II, II->getCalledOperand(), VecCs, SQ);
1384
1385 if (!NewVecC)
1386 return false;
1387
1388 // Get cost estimate for the insert element. This cost will factor into
1389 // both sequences.
1390 InstructionCost OldCost = VectorOpCost;
1391 InstructionCost NewCost =
1392 ScalarOpCost + TTI.getVectorInstrCost(Instruction::InsertElement, VecTy,
1393 CostKind, *Index, NewVecC);
1394
1395 for (auto [Idx, Op, VecC, Scalar] : enumerate(Ops, VecCs, ScalarOps)) {
1396 if (!Scalar || (II && isVectorIntrinsicWithScalarOpAtArg(
1397 II->getIntrinsicID(), Idx, &TTI)))
1398 continue;
1399 InstructionCost InsertCost = TTI.getVectorInstrCost(
1400 Instruction::InsertElement, VecTy, CostKind, *Index, VecC, Scalar);
1401 OldCost += InsertCost;
1402 NewCost += !Op->hasOneUse() * InsertCost;
1403 }
1404
1405 // We want to scalarize unless the vector variant actually has lower cost.
1406 if (OldCost < NewCost || !NewCost.isValid())
1407 return false;
1408
1409 // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
1410 // inselt NewVecC, (scalar_op V0, V1), Index
1411 if (CI)
1412 ++NumScalarCmp;
1413 else if (UO || BO)
1414 ++NumScalarOps;
1415 else
1416 ++NumScalarIntrinsic;
1417
1418 // For constant cases, extract the scalar element, this should constant fold.
1419 for (auto [OpIdx, Scalar, VecC] : enumerate(ScalarOps, VecCs))
1420 if (!Scalar)
1421 ScalarOps[OpIdx] = ConstantExpr::getExtractElement(
1422 cast<Constant>(VecC), Builder.getInt64(*Index));
1423
1424 Value *Scalar;
1425 if (CI)
1426 Scalar = Builder.CreateCmp(CI->getPredicate(), ScalarOps[0], ScalarOps[1]);
1427 else if (UO || BO)
1428 Scalar = Builder.CreateNAryOp(Opcode, ScalarOps);
1429 else
1430 Scalar = Builder.CreateIntrinsic(ScalarTy, II->getIntrinsicID(), ScalarOps);
1431
1432 Scalar->setName(I.getName() + ".scalar");
1433
1434 // All IR flags are safe to back-propagate. There is no potential for extra
1435 // poison to be created by the scalar instruction.
1436 if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
1437 ScalarInst->copyIRFlags(&I);
1438
1439 Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, *Index);
1440 replaceValue(I, *Insert);
1441 return true;
1442}
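// Illustrative IR for the scalarization above (binop case with hypothetical
// constant vectors; the new base vector is the constant-folded vector op):
//   %ix = insertelement <4 x i32> <i32 1, i32 2, i32 3, i32 4>, i32 %x, i64 0
//   %iy = insertelement <4 x i32> <i32 5, i32 6, i32 7, i32 8>, i32 %y, i64 0
//   %r  = add <4 x i32> %ix, %iy
// -->
//   %s = add i32 %x, %y
//   %r = insertelement <4 x i32> <i32 6, i32 8, i32 10, i32 12>, i32 %s, i64 0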
1443
1444/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
1445/// a vector into vector operations followed by extract. Note: The SLP pass
1446/// may miss this pattern because of implementation problems.
1447bool VectorCombine::foldExtractedCmps(Instruction &I) {
1448 auto *BI = dyn_cast<BinaryOperator>(&I);
1449
1450 // We are looking for a scalar binop of booleans.
1451 // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
1452 if (!BI || !I.getType()->isIntegerTy(1))
1453 return false;
1454
1455 // The compare predicates should match, and each compare should have a
1456 // constant operand.
1457 Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
1458 Instruction *I0, *I1;
1459 Constant *C0, *C1;
1460 CmpPredicate P0, P1;
1461 if (!match(B0, m_Cmp(P0, m_Instruction(I0), m_Constant(C0))) ||
1462 !match(B1, m_Cmp(P1, m_Instruction(I1), m_Constant(C1))))
1463 return false;
1464
1465 auto MatchingPred = CmpPredicate::getMatching(P0, P1);
1466 if (!MatchingPred)
1467 return false;
1468
1469 // The compare operands must be extracts of the same vector with constant
1470 // extract indexes.
1471 Value *X;
1472 uint64_t Index0, Index1;
1473 if (!match(I0, m_ExtractElt(m_Value(X), m_ConstantInt(Index0))) ||
1474 !match(I1, m_ExtractElt(m_Specific(X), m_ConstantInt(Index1))))
1475 return false;
1476
1477 auto *Ext0 = cast<ExtractElementInst>(I0);
1478 auto *Ext1 = cast<ExtractElementInst>(I1);
1479 ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1, CostKind);
1480 if (!ConvertToShuf)
1481 return false;
1482 assert((ConvertToShuf == Ext0 || ConvertToShuf == Ext1) &&
1483 "Unknown ExtractElementInst");
1484
1485 // The original scalar pattern is:
1486 // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
1487 CmpInst::Predicate Pred = *MatchingPred;
1488 unsigned CmpOpcode =
1489 CmpInst::isFPPredicate(Pred) ? Instruction::FCmp : Instruction::ICmp;
1490 auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
1491 if (!VecTy)
1492 return false;
1493
1494 InstructionCost Ext0Cost =
1495 TTI.getVectorInstrCost(*Ext0, VecTy, CostKind, Index0);
1496 InstructionCost Ext1Cost =
1497 TTI.getVectorInstrCost(*Ext1, VecTy, CostKind, Index1);
1498 InstructionCost CmpCost = TTI.getCmpSelInstrCost(
1499 CmpOpcode, I0->getType(), CmpInst::makeCmpResultType(I0->getType()), Pred,
1500 CostKind);
1501
1502 InstructionCost OldCost =
1503 Ext0Cost + Ext1Cost + CmpCost * 2 +
1504 TTI.getArithmeticInstrCost(I.getOpcode(), I.getType(), CostKind);
1505
1506 // The proposed vector pattern is:
1507 // vcmp = cmp Pred X, VecC
1508 // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
1509 int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
1510 int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
1511 auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
1512 InstructionCost NewCost = TTI.getCmpSelInstrCost(
1513 CmpOpcode, VecTy, CmpInst::makeCmpResultType(VecTy), Pred, CostKind);
1514 SmallVector<int, 32> ShufMask(VecTy->getNumElements(), PoisonMaskElem);
1515 ShufMask[CheapIndex] = ExpensiveIndex;
1516 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy,
1517 CmpTy, ShufMask, CostKind);
1518 NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy, CostKind);
1519 NewCost += TTI.getVectorInstrCost(*Ext0, CmpTy, CostKind, CheapIndex);
1520 NewCost += Ext0->hasOneUse() ? 0 : Ext0Cost;
1521 NewCost += Ext1->hasOneUse() ? 0 : Ext1Cost;
1522
1523 // Aggressively form vector ops if the cost is equal because the transform
1524 // may enable further optimization.
1525 // Codegen can reverse this transform (scalarize) if it was not profitable.
1526 if (OldCost < NewCost || !NewCost.isValid())
1527 return false;
1528
1529 // Create a vector constant from the 2 scalar constants.
1530 SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
1531 PoisonValue::get(VecTy->getElementType()));
1532 CmpC[Index0] = C0;
1533 CmpC[Index1] = C1;
1534 Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
1535 Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
1536 Value *LHS = ConvertToShuf == Ext0 ? Shuf : VCmp;
1537 Value *RHS = ConvertToShuf == Ext0 ? VCmp : Shuf;
1538 Value *VecLogic = Builder.CreateBinOp(BI->getOpcode(), LHS, RHS);
1539 Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
1540 replaceValue(I, *NewExt);
1541 ++NumVecCmpBO;
1542 return true;
1543}
1544
1545 static void analyzeCostOfVecReduction(const IntrinsicInst &II,
1546 TTI::TargetCostKind CostKind,
1547 const TargetTransformInfo &TTI,
1548 InstructionCost &CostBeforeReduction,
1549 InstructionCost &CostAfterReduction) {
1550 Instruction *Op0, *Op1;
1551 auto *RedOp = dyn_cast<Instruction>(II.getOperand(0));
1552 auto *VecRedTy = cast<VectorType>(II.getOperand(0)->getType());
1553 unsigned ReductionOpc =
1554 getArithmeticReductionInstruction(II.getIntrinsicID());
1555 if (RedOp && match(RedOp, m_ZExtOrSExt(m_Value()))) {
1556 bool IsUnsigned = isa<ZExtInst>(RedOp);
1557 auto *ExtType = cast<VectorType>(RedOp->getOperand(0)->getType());
1558
1559 CostBeforeReduction =
1560 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, ExtType,
1561 TTI::CastContextHint::None, CostKind, RedOp);
1562 CostAfterReduction =
1563 TTI.getExtendedReductionCost(ReductionOpc, IsUnsigned, II.getType(),
1564 ExtType, FastMathFlags(), CostKind);
1565 return;
1566 }
1567 if (RedOp && II.getIntrinsicID() == Intrinsic::vector_reduce_add &&
1568 match(RedOp,
1569 m_ZExtOrSExt(m_Mul(m_Instruction(Op0), m_Instruction(Op1)))) &&
1570 match(Op0, m_ZExtOrSExt(m_Value())) &&
1571 Op0->getOpcode() == Op1->getOpcode() &&
1572 Op0->getOperand(0)->getType() == Op1->getOperand(0)->getType() &&
1573 (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {
1574 // Matched reduce.add(ext(mul(ext(A), ext(B)))
1575 bool IsUnsigned = isa<ZExtInst>(Op0);
1576 auto *ExtType = cast<VectorType>(Op0->getOperand(0)->getType());
1577 VectorType *MulType = VectorType::get(Op0->getType(), VecRedTy);
1578
1579 InstructionCost ExtCost =
1580 TTI.getCastInstrCost(Op0->getOpcode(), MulType, ExtType,
1581 TTI::CastContextHint::None, CostKind, Op0);
1582 InstructionCost MulCost =
1583 TTI.getArithmeticInstrCost(Instruction::Mul, MulType, CostKind);
1584 InstructionCost Ext2Cost =
1585 TTI.getCastInstrCost(RedOp->getOpcode(), VecRedTy, MulType,
1586 TTI::CastContextHint::None, CostKind, RedOp);
1587
1588 CostBeforeReduction = ExtCost * 2 + MulCost + Ext2Cost;
1589 CostAfterReduction = TTI.getMulAccReductionCost(
1590 IsUnsigned, ReductionOpc, II.getType(), ExtType, CostKind);
1591 return;
1592 }
1593 CostAfterReduction = TTI.getArithmeticReductionCost(ReductionOpc, VecRedTy,
1594 std::nullopt, CostKind);
1595}
1596
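// foldBinopOfReductions merges a binary op of two single-use reductions over
// same-typed vectors into one reduction of a vector binary op. A minimal
// sketch (illustrative IR, not from the source):
//   %r0 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
//   %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %b)
//   %s  = add i32 %r0, %r1
// becomes, if the cost model agrees:
//   %v  = add <4 x i32> %a, %b
//   %s  = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
// A sub of two reduce.add calls is handled the same way, with the sub applied
// to the vector operands before a single reduce.add.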
1597bool VectorCombine::foldBinopOfReductions(Instruction &I) {
1598 Instruction::BinaryOps BinOpOpc = cast<BinaryOperator>(&I)->getOpcode();
1599 Intrinsic::ID ReductionIID = getReductionForBinop(BinOpOpc);
1600 if (BinOpOpc == Instruction::Sub)
1601 ReductionIID = Intrinsic::vector_reduce_add;
1602 if (ReductionIID == Intrinsic::not_intrinsic)
1603 return false;
1604
1605 auto checkIntrinsicAndGetItsArgument = [](Value *V,
1606 Intrinsic::ID IID) -> Value * {
1607 auto *II = dyn_cast<IntrinsicInst>(V);
1608 if (!II)
1609 return nullptr;
1610 if (II->getIntrinsicID() == IID && II->hasOneUse())
1611 return II->getArgOperand(0);
1612 return nullptr;
1613 };
1614
1615 Value *V0 = checkIntrinsicAndGetItsArgument(I.getOperand(0), ReductionIID);
1616 if (!V0)
1617 return false;
1618 Value *V1 = checkIntrinsicAndGetItsArgument(I.getOperand(1), ReductionIID);
1619 if (!V1)
1620 return false;
1621
1622 auto *VTy = cast<VectorType>(V0->getType());
1623 if (V1->getType() != VTy)
1624 return false;
1625 const auto &II0 = *cast<IntrinsicInst>(I.getOperand(0));
1626 const auto &II1 = *cast<IntrinsicInst>(I.getOperand(1));
1627 unsigned ReductionOpc =
1628 getArithmeticReductionInstruction(II0.getIntrinsicID());
1629
1630 InstructionCost OldCost = 0;
1631 InstructionCost NewCost = 0;
1632 InstructionCost CostOfRedOperand0 = 0;
1633 InstructionCost CostOfRed0 = 0;
1634 InstructionCost CostOfRedOperand1 = 0;
1635 InstructionCost CostOfRed1 = 0;
1636 analyzeCostOfVecReduction(II0, CostKind, TTI, CostOfRedOperand0, CostOfRed0);
1637 analyzeCostOfVecReduction(II1, CostKind, TTI, CostOfRedOperand1, CostOfRed1);
1638 OldCost = CostOfRed0 + CostOfRed1 + TTI.getInstructionCost(&I, CostKind);
1639 NewCost =
1640 CostOfRedOperand0 + CostOfRedOperand1 +
1641 TTI.getArithmeticInstrCost(BinOpOpc, VTy, CostKind) +
1642 TTI.getArithmeticReductionCost(ReductionOpc, VTy, std::nullopt, CostKind);
1643 if (NewCost >= OldCost || !NewCost.isValid())
1644 return false;
1645
1646 LLVM_DEBUG(dbgs() << "Found two mergeable reductions: " << I
1647 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
1648 << "\n");
1649 Value *VectorBO;
1650 if (BinOpOpc == Instruction::Or)
1651 VectorBO = Builder.CreateOr(V0, V1, "",
1652 cast<PossiblyDisjointInst>(I).isDisjoint());
1653 else
1654 VectorBO = Builder.CreateBinOp(BinOpOpc, V0, V1);
1655
1656 Instruction *Rdx = Builder.CreateIntrinsic(ReductionIID, {VTy}, {VectorBO});
1657 replaceValue(I, *Rdx);
1658 return true;
1659}
1660
1661 // Check if the memory location is modified between two instructions in the same BB.
1662 static bool isMemModifiedBetween(BasicBlock::iterator Begin,
1663 BasicBlock::iterator End,
1664 const MemoryLocation &Loc, AAResults &AA) {
1665 unsigned NumScanned = 0;
1666 return std::any_of(Begin, End, [&](const Instruction &Instr) {
1667 return isModSet(AA.getModRefInfo(&Instr, Loc)) ||
1668 ++NumScanned > MaxInstrsToScan;
1669 });
1670}
1671
1672namespace {
1673/// Helper class to indicate whether a vector index can be safely scalarized and
1674 /// whether a freeze needs to be inserted.
1675class ScalarizationResult {
1676 enum class StatusTy { Unsafe, Safe, SafeWithFreeze };
1677
1678 StatusTy Status;
1679 Value *ToFreeze;
1680
1681 ScalarizationResult(StatusTy Status, Value *ToFreeze = nullptr)
1682 : Status(Status), ToFreeze(ToFreeze) {}
1683
1684public:
1685 ScalarizationResult(const ScalarizationResult &Other) = default;
1686 ~ScalarizationResult() {
1687 assert(!ToFreeze && "freeze() not called with ToFreeze being set");
1688 }
1689
1690 static ScalarizationResult unsafe() { return {StatusTy::Unsafe}; }
1691 static ScalarizationResult safe() { return {StatusTy::Safe}; }
1692 static ScalarizationResult safeWithFreeze(Value *ToFreeze) {
1693 return {StatusTy::SafeWithFreeze, ToFreeze};
1694 }
1695
1696 /// Returns true if the index can be scalarized without requiring a freeze.
1697 bool isSafe() const { return Status == StatusTy::Safe; }
1698 /// Returns true if the index cannot be scalarized.
1699 bool isUnsafe() const { return Status == StatusTy::Unsafe; }
1700 /// Returns true if the index can be scalarized, but requires inserting a
1701 /// freeze.
1702 bool isSafeWithFreeze() const { return Status == StatusTy::SafeWithFreeze; }
1703
1704 /// Reset the state to Unsafe and clear ToFreeze if set.
1705 void discard() {
1706 ToFreeze = nullptr;
1707 Status = StatusTy::Unsafe;
1708 }
1709
1710 /// Freeze ToFreeze and update the use in \p UserI to use the frozen value.
1711 void freeze(IRBuilderBase &Builder, Instruction &UserI) {
1712 assert(isSafeWithFreeze() &&
1713 "should only be used when freezing is required");
1714 assert(is_contained(ToFreeze->users(), &UserI) &&
1715 "UserI must be a user of ToFreeze");
1716 IRBuilder<>::InsertPointGuard Guard(Builder);
1717 Builder.SetInsertPoint(cast<Instruction>(&UserI));
1718 Value *Frozen =
1719 Builder.CreateFreeze(ToFreeze, ToFreeze->getName() + ".frozen");
1720 for (Use &U : make_early_inc_range((UserI.operands())))
1721 if (U.get() == ToFreeze)
1722 U.set(Frozen);
1723
1724 ToFreeze = nullptr;
1725 }
1726};
1727} // namespace
1728
1729/// Check if it is legal to scalarize a memory access to \p VecTy at index \p
1730/// Idx. \p Idx must access a valid vector element.
1731static ScalarizationResult canScalarizeAccess(VectorType *VecTy, Value *Idx,
1732 Instruction *CtxI,
1733 AssumptionCache &AC,
1734 const DominatorTree &DT) {
1735 // We do checks for both fixed vector types and scalable vector types.
1736 // This is the number of elements of fixed vector types,
1737 // or the minimum number of elements of scalable vector types.
1738 uint64_t NumElements = VecTy->getElementCount().getKnownMinValue();
1739 unsigned IntWidth = Idx->getType()->getScalarSizeInBits();
1740
1741 if (auto *C = dyn_cast<ConstantInt>(Idx)) {
1742 if (C->getValue().ult(NumElements))
1743 return ScalarizationResult::safe();
1744 return ScalarizationResult::unsafe();
1745 }
1746
1747 // Always unsafe if the index type can't handle all inbound values.
1748 if (!llvm::isUIntN(IntWidth, NumElements))
1749 return ScalarizationResult::unsafe();
1750
1751 APInt Zero(IntWidth, 0);
1752 APInt MaxElts(IntWidth, NumElements);
1753 ConstantRange ValidIndices(Zero, MaxElts);
1754 ConstantRange IdxRange(IntWidth, true);
1755
1756 if (isGuaranteedNotToBePoison(Idx, &AC)) {
1757 if (ValidIndices.contains(computeConstantRange(Idx, /* ForSigned */ false,
1758 true, &AC, CtxI, &DT)))
1759 return ScalarizationResult::safe();
1760 return ScalarizationResult::unsafe();
1761 }
1762
1763 // If the index may be poison, check if we can insert a freeze before the
1764 // range of the index is restricted.
1765 Value *IdxBase;
1766 ConstantInt *CI;
1767 if (match(Idx, m_And(m_Value(IdxBase), m_ConstantInt(CI)))) {
1768 IdxRange = IdxRange.binaryAnd(CI->getValue());
1769 } else if (match(Idx, m_URem(m_Value(IdxBase), m_ConstantInt(CI)))) {
1770 IdxRange = IdxRange.urem(CI->getValue());
1771 }
1772
1773 if (ValidIndices.contains(IdxRange))
1774 return ScalarizationResult::safeWithFreeze(IdxBase);
1775 return ScalarizationResult::unsafe();
1776}
1777
1778/// The memory operation on a vector of \p ScalarType had alignment of
1779/// \p VectorAlignment. Compute the maximal, but conservatively correct,
1780/// alignment that will be valid for the memory operation on a single scalar
1781/// element of the same type with index \p Idx.
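// Worked example (illustrative, not from the source): a <4 x i32> access with
// alignment 16 and constant Idx 1 touches byte offset 4, so the scalar access
// is known to be 4-byte aligned (commonAlignment(16, 1 * 4) == 4). With a
// variable Idx only commonAlignment(VectorAlignment, element store size) can
// be assumed, i.e. 4 here.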
1782 static Align computeAlignmentAfterScalarization(Align VectorAlignment,
1783 Type *ScalarType, Value *Idx,
1784 const DataLayout &DL) {
1785 if (auto *C = dyn_cast<ConstantInt>(Idx))
1786 return commonAlignment(VectorAlignment,
1787 C->getZExtValue() * DL.getTypeStoreSize(ScalarType));
1788 return commonAlignment(VectorAlignment, DL.getTypeStoreSize(ScalarType));
1789}
1790
1791// Combine patterns like:
1792// %0 = load <4 x i32>, <4 x i32>* %a
1793// %1 = insertelement <4 x i32> %0, i32 %b, i32 1
1794// store <4 x i32> %1, <4 x i32>* %a
1795// to:
1796// %0 = bitcast <4 x i32>* %a to i32*
1797// %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
1798// store i32 %b, i32* %1
1799bool VectorCombine::foldSingleElementStore(Instruction &I) {
1801 return false;
1802 auto *SI = cast<StoreInst>(&I);
1803 if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
1804 return false;
1805
1806 // TODO: Combine more complicated patterns (multiple insert) by referencing
1807 // TargetTransformInfo.
1808 Instruction *Source;
1809 Value *NewElement;
1810 Value *Idx;
1811 if (!match(SI->getValueOperand(),
1812 m_InsertElt(m_Instruction(Source), m_Value(NewElement),
1813 m_Value(Idx))))
1814 return false;
1815
1816 if (auto *Load = dyn_cast<LoadInst>(Source)) {
1817 auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
1818 Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
1819 // Don't optimize for atomic/volatile load or store. Ensure memory is not
1820 // modified between them, the vector type matches the store size, and the index is in bounds.
1821 if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
1822 !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
1823 SrcAddr != SI->getPointerOperand()->stripPointerCasts())
1824 return false;
1825
1826 auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
1827 if (ScalarizableIdx.isUnsafe() ||
1828 isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
1829 MemoryLocation::get(SI), AA))
1830 return false;
1831
1832 // Ensure we add the load back to the worklist BEFORE its users so they can
1833 // be erased in the correct order.
1834 Worklist.push(Load);
1835
1836 if (ScalarizableIdx.isSafeWithFreeze())
1837 ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
1838 Value *GEP = Builder.CreateInBoundsGEP(
1839 SI->getValueOperand()->getType(), SI->getPointerOperand(),
1840 {ConstantInt::get(Idx->getType(), 0), Idx});
1841 StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
1842 NSI->copyMetadata(*SI);
1843 Align ScalarOpAlignment = computeAlignmentAfterScalarization(
1844 std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
1845 *DL);
1846 NSI->setAlignment(ScalarOpAlignment);
1847 replaceValue(I, *NSI);
1848 eraseInstruction(I);
1849 return true;
1850 }
1851
1852 return false;
1853}
1854
1855/// Try to scalarize vector loads feeding extractelement instructions.
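// Illustrative sketch (not from the source): when every user of the load is an
// extractelement in the same block and the cost model agrees,
//   %v = load <4 x i32>, ptr %p
//   %e = extractelement <4 x i32> %v, i64 2
// becomes
//   %gep = getelementptr inbounds <4 x i32>, ptr %p, i32 0, i64 2
//   %e   = load i32, ptr %gep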
1856bool VectorCombine::scalarizeLoadExtract(Instruction &I) {
1858 return false;
1859
1860 Value *Ptr;
1861 if (!match(&I, m_Load(m_Value(Ptr))))
1862 return false;
1863
1864 auto *LI = cast<LoadInst>(&I);
1865 auto *VecTy = cast<VectorType>(LI->getType());
1866 if (LI->isVolatile() || !DL->typeSizeEqualsStoreSize(VecTy->getScalarType()))
1867 return false;
1868
1869 InstructionCost OriginalCost =
1870 TTI.getMemoryOpCost(Instruction::Load, VecTy, LI->getAlign(),
1871 LI->getPointerAddressSpace(), CostKind);
1872 InstructionCost ScalarizedCost = 0;
1873
1874 Instruction *LastCheckedInst = LI;
1875 unsigned NumInstChecked = 0;
1876 DenseMap<ExtractElementInst *, ScalarizationResult> NeedFreeze;
1877 auto FailureGuard = make_scope_exit([&]() {
1878 // If the transform is aborted, discard the ScalarizationResults.
1879 for (auto &Pair : NeedFreeze)
1880 Pair.second.discard();
1881 });
1882
1883 // Check if all users of the load are extracts with no memory modifications
1884 // between the load and the extract. Compute the cost of both the original
1885 // code and the scalarized version.
1886 for (User *U : LI->users()) {
1887 auto *UI = dyn_cast<ExtractElementInst>(U);
1888 if (!UI || UI->getParent() != LI->getParent())
1889 return false;
1890
1891 // If any extract is waiting to be erased, then bail out as this will
1892 // distort the cost calculation and possibly lead to infinite loops.
1893 if (UI->use_empty())
1894 return false;
1895
1896 // Check if any instruction between the load and the extract may modify
1897 // memory.
1898 if (LastCheckedInst->comesBefore(UI)) {
1899 for (Instruction &I :
1900 make_range(std::next(LI->getIterator()), UI->getIterator())) {
1901 // Bail out if we reached the check limit or the instruction may write
1902 // to memory.
1903 if (NumInstChecked == MaxInstrsToScan || I.mayWriteToMemory())
1904 return false;
1905 NumInstChecked++;
1906 }
1907 LastCheckedInst = UI;
1908 }
1909
1910 auto ScalarIdx =
1911 canScalarizeAccess(VecTy, UI->getIndexOperand(), LI, AC, DT);
1912 if (ScalarIdx.isUnsafe())
1913 return false;
1914 if (ScalarIdx.isSafeWithFreeze()) {
1915 NeedFreeze.try_emplace(UI, ScalarIdx);
1916 ScalarIdx.discard();
1917 }
1918
1919 auto *Index = dyn_cast<ConstantInt>(UI->getIndexOperand());
1920 OriginalCost +=
1921 TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, CostKind,
1922 Index ? Index->getZExtValue() : -1);
1923 ScalarizedCost +=
1924 TTI.getMemoryOpCost(Instruction::Load, VecTy->getElementType(),
1925 Align(1), LI->getPointerAddressSpace(), CostKind);
1926 ScalarizedCost += TTI.getAddressComputationCost(LI->getPointerOperandType(),
1927 nullptr, nullptr, CostKind);
1928 }
1929
1930 LLVM_DEBUG(dbgs() << "Found all extractions of a vector load: " << I
1931 << "\n LoadExtractCost: " << OriginalCost
1932 << " vs ScalarizedCost: " << ScalarizedCost << "\n");
1933
1934 if (ScalarizedCost >= OriginalCost)
1935 return false;
1936
1937 // Ensure we add the load back to the worklist BEFORE its users so they can
1938 // be erased in the correct order.
1939 Worklist.push(LI);
1940
1941 Type *ElemType = VecTy->getElementType();
1942
1943 // Replace extracts with narrow scalar loads.
1944 for (User *U : LI->users()) {
1945 auto *EI = cast<ExtractElementInst>(U);
1946 Value *Idx = EI->getIndexOperand();
1947
1948 // Insert 'freeze' for poison indexes.
1949 auto It = NeedFreeze.find(EI);
1950 if (It != NeedFreeze.end())
1951 It->second.freeze(Builder, *cast<Instruction>(Idx));
1952
1953 Builder.SetInsertPoint(EI);
1954 Value *GEP =
1955 Builder.CreateInBoundsGEP(VecTy, Ptr, {Builder.getInt32(0), Idx});
1956 auto *NewLoad = cast<LoadInst>(
1957 Builder.CreateLoad(ElemType, GEP, EI->getName() + ".scalar"));
1958
1959 Align ScalarOpAlignment =
1960 computeAlignmentAfterScalarization(LI->getAlign(), ElemType, Idx, *DL);
1961 NewLoad->setAlignment(ScalarOpAlignment);
1962
1963 if (auto *ConstIdx = dyn_cast<ConstantInt>(Idx)) {
1964 size_t Offset = ConstIdx->getZExtValue() * DL->getTypeStoreSize(ElemType);
1965 AAMDNodes OldAAMD = LI->getAAMetadata();
1966 NewLoad->setAAMetadata(OldAAMD.adjustForAccess(Offset, ElemType, *DL));
1967 }
1968
1969 replaceValue(*EI, *NewLoad, false);
1970 }
1971
1972 FailureGuard.release();
1973 return true;
1974}
1975
1976bool VectorCombine::scalarizeExtExtract(Instruction &I) {
1978 return false;
1979 auto *Ext = dyn_cast<ZExtInst>(&I);
1980 if (!Ext)
1981 return false;
1982
1983 // Try to convert a vector zext feeding only extracts to a set of scalar
1984 // (Src >> (ExtIdx * EltBits)) & ((1 << EltBits) - 1)
1985 // operations, if profitable.
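// For illustration (hypothetical IR, not from the source): on a little-endian
// target, and when deemed profitable,
//   %z  = zext <4 x i8> %src to <4 x i32>
//   %e1 = extractelement <4 x i32> %z, i64 1
// can become
//   %s  = bitcast <4 x i8> %src to i32
//   %sh = lshr i32 %s, 8
//   %e1 = and i32 %sh, 255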
1986 auto *SrcTy = dyn_cast<FixedVectorType>(Ext->getOperand(0)->getType());
1987 if (!SrcTy)
1988 return false;
1989 auto *DstTy = cast<FixedVectorType>(Ext->getType());
1990
1991 Type *ScalarDstTy = DstTy->getElementType();
1992 if (DL->getTypeSizeInBits(SrcTy) != DL->getTypeSizeInBits(ScalarDstTy))
1993 return false;
1994
1995 InstructionCost VectorCost =
1996 TTI.getCastInstrCost(Instruction::ZExt, DstTy, SrcTy,
1997 TTI::CastContextHint::None, CostKind);
1998 unsigned ExtCnt = 0;
1999 bool ExtLane0 = false;
2000 for (User *U : Ext->users()) {
2001 uint64_t Idx;
2002 if (!match(U, m_ExtractElt(m_Value(), m_ConstantInt(Idx))))
2003 return false;
2004 if (cast<Instruction>(U)->use_empty())
2005 continue;
2006 ExtCnt += 1;
2007 ExtLane0 |= !Idx;
2008 VectorCost += TTI.getVectorInstrCost(Instruction::ExtractElement, DstTy,
2009 CostKind, Idx, U);
2010 }
2011
2012 InstructionCost ScalarCost =
2013 ExtCnt * TTI.getArithmeticInstrCost(
2014 Instruction::And, ScalarDstTy, CostKind,
2015 {TTI::OK_AnyValue, TTI::OP_None},
2016 {TTI::OK_NonUniformConstantValue, TTI::OP_None}) +
2017 (ExtCnt - ExtLane0) *
2018 TTI.getArithmeticInstrCost(
2019 Instruction::LShr, ScalarDstTy, CostKind,
2020 {TTI::OK_AnyValue, TTI::OP_None},
2021 {TTI::OK_NonUniformConstantValue, TTI::OP_None});
2022 if (ScalarCost > VectorCost)
2023 return false;
2024
2025 Value *ScalarV = Ext->getOperand(0);
2026 if (!isGuaranteedNotToBePoison(ScalarV, &AC, dyn_cast<Instruction>(ScalarV),
2027 &DT)) {
2028 // Check whether all lanes are extracted, all extracts trigger UB
2029 // on poison, and the last extract (and hence all previous ones)
2030 // are guaranteed to execute if Ext executes. If so, we do not
2031 // need to insert a freeze.
2032 SmallDenseSet<ConstantInt *, 8> ExtractedLanes;
2033 bool AllExtractsTriggerUB = true;
2034 ExtractElementInst *LastExtract = nullptr;
2035 BasicBlock *ExtBB = Ext->getParent();
2036 for (User *U : Ext->users()) {
2037 auto *Extract = cast<ExtractElementInst>(U);
2038 if (Extract->getParent() != ExtBB || !programUndefinedIfPoison(Extract)) {
2039 AllExtractsTriggerUB = false;
2040 break;
2041 }
2042 ExtractedLanes.insert(cast<ConstantInt>(Extract->getIndexOperand()));
2043 if (!LastExtract || LastExtract->comesBefore(Extract))
2044 LastExtract = Extract;
2045 }
2046 if (ExtractedLanes.size() != DstTy->getNumElements() ||
2047 !AllExtractsTriggerUB ||
2048 !isGuaranteedToTransferExecutionToSuccessor(Ext->getIterator(),
2049 LastExtract->getIterator()))
2050 ScalarV = Builder.CreateFreeze(ScalarV);
2051 }
2052 ScalarV = Builder.CreateBitCast(
2053 ScalarV,
2054 IntegerType::get(SrcTy->getContext(), DL->getTypeSizeInBits(SrcTy)));
2055 uint64_t SrcEltSizeInBits = DL->getTypeSizeInBits(SrcTy->getElementType());
2056 uint64_t EltBitMask = (1ull << SrcEltSizeInBits) - 1;
2057 uint64_t TotalBits = DL->getTypeSizeInBits(SrcTy);
2058 Type *PackedTy = IntegerType::get(SrcTy->getContext(), TotalBits);
2059 Value *Mask = ConstantInt::get(PackedTy, EltBitMask);
2060 for (User *U : Ext->users()) {
2061 auto *Extract = cast<ExtractElementInst>(U);
2062 uint64_t Idx =
2063 cast<ConstantInt>(Extract->getIndexOperand())->getZExtValue();
2064 uint64_t ShiftAmt =
2065 DL->isBigEndian()
2066 ? (TotalBits - SrcEltSizeInBits - Idx * SrcEltSizeInBits)
2067 : (Idx * SrcEltSizeInBits);
2068 Value *LShr = Builder.CreateLShr(ScalarV, ShiftAmt);
2069 Value *And = Builder.CreateAnd(LShr, Mask);
2070 U->replaceAllUsesWith(And);
2071 }
2072 return true;
2073}
2074
2075/// Try to fold "(or (zext (bitcast X)), (shl (zext (bitcast Y)), C))"
2076/// to "(bitcast (concat X, Y))"
2077/// where X/Y are bitcasted from i1 mask vectors.
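// A concrete sketch (illustrative only): with <8 x i1> masks %a and %b,
//   %x  = bitcast <8 x i1> %a to i8
//   %y  = bitcast <8 x i1> %b to i8
//   %zx = zext i8 %x to i16
//   %zy = zext i8 %y to i16
//   %sy = shl i16 %zy, 8
//   %r  = or disjoint i16 %zx, %sy
// becomes a single <16 x i1> concat shuffle of %a and %b followed by a bitcast
// of the result to i16.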
2078bool VectorCombine::foldConcatOfBoolMasks(Instruction &I) {
2079 Type *Ty = I.getType();
2080 if (!Ty->isIntegerTy())
2081 return false;
2082
2083 // TODO: Add big endian test coverage
2084 if (DL->isBigEndian())
2085 return false;
2086
2087 // Restrict to disjoint cases so the mask vectors aren't overlapping.
2088 Instruction *X, *Y;
2089 if (!match(&I, m_DisjointOr(m_Instruction(X), m_Instruction(Y))))
2090 return false;
2091
2092 // Allow both sources to contain shl, to handle more generic pattern:
2093 // "(or (shl (zext (bitcast X)), C1), (shl (zext (bitcast Y)), C2))"
2094 Value *SrcX;
2095 uint64_t ShAmtX = 0;
2096 if (!match(X, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX)))))) &&
2097 !match(X, m_OneUse(
2098 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcX))))),
2099 m_ConstantInt(ShAmtX)))))
2100 return false;
2101
2102 Value *SrcY;
2103 uint64_t ShAmtY = 0;
2104 if (!match(Y, m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY)))))) &&
2105 !match(Y, m_OneUse(
2106 m_Shl(m_OneUse(m_ZExt(m_OneUse(m_BitCast(m_Value(SrcY))))),
2107 m_ConstantInt(ShAmtY)))))
2108 return false;
2109
2110 // Canonicalize larger shift to the RHS.
2111 if (ShAmtX > ShAmtY) {
2112 std::swap(X, Y);
2113 std::swap(SrcX, SrcY);
2114 std::swap(ShAmtX, ShAmtY);
2115 }
2116
2117 // Ensure both sources are matching vXi1 bool mask types, and that the shift
2118 // difference is the mask width so they can be easily concatenated together.
2119 uint64_t ShAmtDiff = ShAmtY - ShAmtX;
2120 unsigned NumSHL = (ShAmtX > 0) + (ShAmtY > 0);
2121 unsigned BitWidth = Ty->getPrimitiveSizeInBits();
2122 auto *MaskTy = dyn_cast<FixedVectorType>(SrcX->getType());
2123 if (!MaskTy || SrcX->getType() != SrcY->getType() ||
2124 !MaskTy->getElementType()->isIntegerTy(1) ||
2125 MaskTy->getNumElements() != ShAmtDiff ||
2126 MaskTy->getNumElements() > (BitWidth / 2))
2127 return false;
2128
2129 auto *ConcatTy = FixedVectorType::getDoubleElementsVectorType(MaskTy);
2130 auto *ConcatIntTy =
2131 Type::getIntNTy(Ty->getContext(), ConcatTy->getNumElements());
2132 auto *MaskIntTy = Type::getIntNTy(Ty->getContext(), ShAmtDiff);
2133
2134 SmallVector<int, 32> ConcatMask(ConcatTy->getNumElements());
2135 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2136
2137 // TODO: Is it worth supporting multi use cases?
2138 InstructionCost OldCost = 0;
2139 OldCost += TTI.getArithmeticInstrCost(Instruction::Or, Ty, CostKind);
2140 OldCost +=
2141 NumSHL * TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2142 OldCost += 2 * TTI.getCastInstrCost(Instruction::ZExt, Ty, MaskIntTy,
2143 TTI::CastContextHint::None, CostKind);
2144 OldCost += 2 * TTI.getCastInstrCost(Instruction::BitCast, MaskIntTy, MaskTy,
2145 TTI::CastContextHint::None, CostKind);
2146
2147 InstructionCost NewCost = 0;
2148 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ConcatTy,
2149 MaskTy, ConcatMask, CostKind);
2150 NewCost += TTI.getCastInstrCost(Instruction::BitCast, ConcatIntTy, ConcatTy,
2151 TTI::CastContextHint::None, CostKind);
2152 if (Ty != ConcatIntTy)
2153 NewCost += TTI.getCastInstrCost(Instruction::ZExt, Ty, ConcatIntTy,
2154 TTI::CastContextHint::None, CostKind);
2155 if (ShAmtX > 0)
2156 NewCost += TTI.getArithmeticInstrCost(Instruction::Shl, Ty, CostKind);
2157
2158 LLVM_DEBUG(dbgs() << "Found a concatenation of bitcasted bool masks: " << I
2159 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2160 << "\n");
2161
2162 if (NewCost > OldCost)
2163 return false;
2164
2165 // Build bool mask concatenation, bitcast back to scalar integer, and perform
2166 // any residual zero-extension or shifting.
2167 Value *Concat = Builder.CreateShuffleVector(SrcX, SrcY, ConcatMask);
2168 Worklist.pushValue(Concat);
2169
2170 Value *Result = Builder.CreateBitCast(Concat, ConcatIntTy);
2171
2172 if (Ty != ConcatIntTy) {
2173 Worklist.pushValue(Result);
2174 Result = Builder.CreateZExt(Result, Ty);
2175 }
2176
2177 if (ShAmtX > 0) {
2178 Worklist.pushValue(Result);
2179 Result = Builder.CreateShl(Result, ShAmtX);
2180 }
2181
2182 replaceValue(I, *Result);
2183 return true;
2184}
2185
2186/// Try to convert "shuffle (binop (shuffle, shuffle)), undef"
2187/// --> "binop (shuffle), (shuffle)".
2188bool VectorCombine::foldPermuteOfBinops(Instruction &I) {
2189 BinaryOperator *BinOp;
2190 ArrayRef<int> OuterMask;
2191 if (!match(&I,
2192 m_Shuffle(m_OneUse(m_BinOp(BinOp)), m_Undef(), m_Mask(OuterMask))))
2193 return false;
2194
2195 // Don't introduce poison into div/rem.
2196 if (BinOp->isIntDivRem() && llvm::is_contained(OuterMask, PoisonMaskElem))
2197 return false;
2198
2199 Value *Op00, *Op01, *Op10, *Op11;
2200 ArrayRef<int> Mask0, Mask1;
2201 bool Match0 =
2202 match(BinOp->getOperand(0),
2203 m_OneUse(m_Shuffle(m_Value(Op00), m_Value(Op01), m_Mask(Mask0))));
2204 bool Match1 =
2205 match(BinOp->getOperand(1),
2206 m_OneUse(m_Shuffle(m_Value(Op10), m_Value(Op11), m_Mask(Mask1))));
2207 if (!Match0 && !Match1)
2208 return false;
2209
2210 Op00 = Match0 ? Op00 : BinOp->getOperand(0);
2211 Op01 = Match0 ? Op01 : BinOp->getOperand(0);
2212 Op10 = Match1 ? Op10 : BinOp->getOperand(1);
2213 Op11 = Match1 ? Op11 : BinOp->getOperand(1);
2214
2215 Instruction::BinaryOps Opcode = BinOp->getOpcode();
2216 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2217 auto *BinOpTy = dyn_cast<FixedVectorType>(BinOp->getType());
2218 auto *Op0Ty = dyn_cast<FixedVectorType>(Op00->getType());
2219 auto *Op1Ty = dyn_cast<FixedVectorType>(Op10->getType());
2220 if (!ShuffleDstTy || !BinOpTy || !Op0Ty || !Op1Ty)
2221 return false;
2222
2223 unsigned NumSrcElts = BinOpTy->getNumElements();
2224
2225 // Don't accept shuffles that reference the second operand in
2226 // div/rem or if it's an undef arg.
2227 if ((BinOp->isIntDivRem() || !isa<PoisonValue>(I.getOperand(1))) &&
2228 any_of(OuterMask, [NumSrcElts](int M) { return M >= (int)NumSrcElts; }))
2229 return false;
2230
2231 // Merge outer / inner (or identity if no match) shuffles.
2232 SmallVector<int> NewMask0, NewMask1;
2233 for (int M : OuterMask) {
2234 if (M < 0 || M >= (int)NumSrcElts) {
2235 NewMask0.push_back(PoisonMaskElem);
2236 NewMask1.push_back(PoisonMaskElem);
2237 } else {
2238 NewMask0.push_back(Match0 ? Mask0[M] : M);
2239 NewMask1.push_back(Match1 ? Mask1[M] : M);
2240 }
2241 }
2242
2243 unsigned NumOpElts = Op0Ty->getNumElements();
2244 bool IsIdentity0 = ShuffleDstTy == Op0Ty &&
2245 all_of(NewMask0, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2246 ShuffleVectorInst::isIdentityMask(NewMask0, NumOpElts);
2247 bool IsIdentity1 = ShuffleDstTy == Op1Ty &&
2248 all_of(NewMask1, [NumOpElts](int M) { return M < (int)NumOpElts; }) &&
2249 ShuffleVectorInst::isIdentityMask(NewMask1, NumOpElts);
2250
2251 // Try to merge shuffles across the binop if the new shuffles are not costly.
2252 InstructionCost OldCost =
2253 TTI.getArithmeticInstrCost(Opcode, BinOpTy, CostKind) +
2254 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, ShuffleDstTy,
2255 BinOpTy, OuterMask, CostKind, 0, nullptr, {BinOp}, &I);
2256 if (Match0)
2257 OldCost += TTI.getShuffleCost(
2258 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op0Ty, Mask0, CostKind,
2259 0, nullptr, {Op00, Op01}, cast<Instruction>(BinOp->getOperand(0)));
2260 if (Match1)
2261 OldCost += TTI.getShuffleCost(
2262 TargetTransformInfo::SK_PermuteTwoSrc, BinOpTy, Op1Ty, Mask1, CostKind,
2263 0, nullptr, {Op10, Op11}, cast<Instruction>(BinOp->getOperand(1)));
2264
2265 InstructionCost NewCost =
2266 TTI.getArithmeticInstrCost(Opcode, ShuffleDstTy, CostKind);
2267
2268 if (!IsIdentity0)
2269 NewCost +=
2270 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2271 Op0Ty, NewMask0, CostKind, 0, nullptr, {Op00, Op01});
2272 if (!IsIdentity1)
2273 NewCost +=
2274 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2275 Op1Ty, NewMask1, CostKind, 0, nullptr, {Op10, Op11});
2276
2277 LLVM_DEBUG(dbgs() << "Found a shuffle feeding a shuffled binop: " << I
2278 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2279 << "\n");
2280
2281 // If costs are equal, still fold as we reduce instruction count.
2282 if (NewCost > OldCost)
2283 return false;
2284
2285 Value *LHS =
2286 IsIdentity0 ? Op00 : Builder.CreateShuffleVector(Op00, Op01, NewMask0);
2287 Value *RHS =
2288 IsIdentity1 ? Op10 : Builder.CreateShuffleVector(Op10, Op11, NewMask1);
2289 Value *NewBO = Builder.CreateBinOp(Opcode, LHS, RHS);
2290
2291 // Intersect flags from the old binops.
2292 if (auto *NewInst = dyn_cast<Instruction>(NewBO))
2293 NewInst->copyIRFlags(BinOp);
2294
2295 Worklist.pushValue(LHS);
2296 Worklist.pushValue(RHS);
2297 replaceValue(I, *NewBO);
2298 return true;
2299}
2300
2301/// Try to convert "shuffle (binop), (binop)" into "binop (shuffle), (shuffle)".
2302/// Try to convert "shuffle (cmpop), (cmpop)" into "cmpop (shuffle), (shuffle)".
2303bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
2304 ArrayRef<int> OldMask;
2305 Instruction *LHS, *RHS;
2306 if (!match(&I, m_Shuffle(m_OneUse(m_Instruction(LHS)),
2307 m_OneUse(m_Instruction(RHS)), m_Mask(OldMask))))
2308 return false;
2309
2310 // TODO: Add support for addlike etc.
2311 if (LHS->getOpcode() != RHS->getOpcode())
2312 return false;
2313
2314 Value *X, *Y, *Z, *W;
2315 bool IsCommutative = false;
2316 CmpPredicate PredLHS = CmpInst::BAD_ICMP_PREDICATE;
2317 CmpPredicate PredRHS = CmpInst::BAD_ICMP_PREDICATE;
2318 if (match(LHS, m_BinOp(m_Value(X), m_Value(Y))) &&
2319 match(RHS, m_BinOp(m_Value(Z), m_Value(W)))) {
2320 auto *BO = cast<BinaryOperator>(LHS);
2321 // Don't introduce poison into div/rem.
2322 if (llvm::is_contained(OldMask, PoisonMaskElem) && BO->isIntDivRem())
2323 return false;
2324 IsCommutative = BinaryOperator::isCommutative(BO->getOpcode());
2325 } else if (match(LHS, m_Cmp(PredLHS, m_Value(X), m_Value(Y))) &&
2326 match(RHS, m_Cmp(PredRHS, m_Value(Z), m_Value(W))) &&
2327 (CmpInst::Predicate)PredLHS == (CmpInst::Predicate)PredRHS) {
2328 IsCommutative = cast<CmpInst>(LHS)->isCommutative();
2329 } else
2330 return false;
2331
2332 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2333 auto *BinResTy = dyn_cast<FixedVectorType>(LHS->getType());
2334 auto *BinOpTy = dyn_cast<FixedVectorType>(X->getType());
2335 if (!ShuffleDstTy || !BinResTy || !BinOpTy || X->getType() != Z->getType())
2336 return false;
2337
2338 unsigned NumSrcElts = BinOpTy->getNumElements();
2339
2340 // If we have something like "add X, Y" and "add Z, X", swap ops to match.
2341 if (IsCommutative && X != Z && Y != W && (X == W || Y == Z))
2342 std::swap(X, Y);
2343
2344 auto ConvertToUnary = [NumSrcElts](int &M) {
2345 if (M >= (int)NumSrcElts)
2346 M -= NumSrcElts;
2347 };
2348
2349 SmallVector<int> NewMask0(OldMask);
2350 TargetTransformInfo::ShuffleKind SK0 = TargetTransformInfo::SK_PermuteTwoSrc;
2351 if (X == Z) {
2352 llvm::for_each(NewMask0, ConvertToUnary);
2353 SK0 = TargetTransformInfo::SK_PermuteSingleSrc;
2354 Z = PoisonValue::get(BinOpTy);
2355 }
2356
2357 SmallVector<int> NewMask1(OldMask);
2358 TargetTransformInfo::ShuffleKind SK1 = TargetTransformInfo::SK_PermuteTwoSrc;
2359 if (Y == W) {
2360 llvm::for_each(NewMask1, ConvertToUnary);
2361 SK1 = TargetTransformInfo::SK_PermuteSingleSrc;
2362 W = PoisonValue::get(BinOpTy);
2363 }
2364
2365 // Try to replace a binop with a shuffle if the shuffle is not costly.
2366 InstructionCost OldCost =
2367 TTI.getInstructionCost(LHS, CostKind) +
2368 TTI.getInstructionCost(RHS, CostKind) +
2369 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2370 BinResTy, OldMask, CostKind, 0, nullptr, {LHS, RHS},
2371 &I);
2372
2373 // Handle shuffle(binop(shuffle(x),y),binop(z,shuffle(w))) style patterns
2374 // where one use shuffles have gotten split across the binop/cmp. These
2375 // often allow a major reduction in total cost that wouldn't happen as
2376 // individual folds.
2377 auto MergeInner = [&](Value *&Op, int Offset, MutableArrayRef<int> Mask,
2378 TTI::TargetCostKind CostKind) -> bool {
2379 Value *InnerOp;
2380 ArrayRef<int> InnerMask;
2381 if (match(Op, m_OneUse(m_Shuffle(m_Value(InnerOp), m_Undef(),
2382 m_Mask(InnerMask)))) &&
2383 InnerOp->getType() == Op->getType() &&
2384 all_of(InnerMask,
2385 [NumSrcElts](int M) { return M < (int)NumSrcElts; })) {
2386 for (int &M : Mask)
2387 if (Offset <= M && M < (int)(Offset + NumSrcElts)) {
2388 M = InnerMask[M - Offset];
2389 M = 0 <= M ? M + Offset : M;
2390 }
2391 OldCost += TTI.getInstructionCost(cast<Instruction>(Op), CostKind);
2392 Op = InnerOp;
2393 return true;
2394 }
2395 return false;
2396 };
2397 bool ReducedInstCount = false;
2398 ReducedInstCount |= MergeInner(X, 0, NewMask0, CostKind);
2399 ReducedInstCount |= MergeInner(Y, 0, NewMask1, CostKind);
2400 ReducedInstCount |= MergeInner(Z, NumSrcElts, NewMask0, CostKind);
2401 ReducedInstCount |= MergeInner(W, NumSrcElts, NewMask1, CostKind);
2402
2403 auto *ShuffleCmpTy =
2404 FixedVectorType::get(BinOpTy->getElementType(), ShuffleDstTy);
2405 InstructionCost NewCost =
2406 TTI.getShuffleCost(SK0, ShuffleCmpTy, BinOpTy, NewMask0, CostKind, 0,
2407 nullptr, {X, Z}) +
2408 TTI.getShuffleCost(SK1, ShuffleCmpTy, BinOpTy, NewMask1, CostKind, 0,
2409 nullptr, {Y, W});
2410
2411 if (PredLHS == CmpInst::BAD_ICMP_PREDICATE) {
2412 NewCost +=
2413 TTI.getArithmeticInstrCost(LHS->getOpcode(), ShuffleDstTy, CostKind);
2414 } else {
2415 NewCost += TTI.getCmpSelInstrCost(LHS->getOpcode(), ShuffleCmpTy,
2416 ShuffleDstTy, PredLHS, CostKind);
2417 }
2418
2419 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two binops: " << I
2420 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2421 << "\n");
2422
2423 // If either shuffle will constant fold away, then fold for the same cost as
2424 // we will reduce the instruction count.
2425 ReducedInstCount |= (isa<Constant>(X) && isa<Constant>(Z)) ||
2426 (isa<Constant>(Y) && isa<Constant>(W));
2427 if (ReducedInstCount ? (NewCost > OldCost) : (NewCost >= OldCost))
2428 return false;
2429
2430 Value *Shuf0 = Builder.CreateShuffleVector(X, Z, NewMask0);
2431 Value *Shuf1 = Builder.CreateShuffleVector(Y, W, NewMask1);
2432 Value *NewBO = PredLHS == CmpInst::BAD_ICMP_PREDICATE
2433 ? Builder.CreateBinOp(
2434 cast<BinaryOperator>(LHS)->getOpcode(), Shuf0, Shuf1)
2435 : Builder.CreateCmp(PredLHS, Shuf0, Shuf1);
2436
2437 // Intersect flags from the old binops.
2438 if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
2439 NewInst->copyIRFlags(LHS);
2440 NewInst->andIRFlags(RHS);
2441 }
2442
2443 Worklist.pushValue(Shuf0);
2444 Worklist.pushValue(Shuf1);
2445 replaceValue(I, *NewBO);
2446 return true;
2447}
2448
2449/// Try to convert,
2450/// (shuffle(select(c1,t1,f1)), (select(c2,t2,f2)), m) into
2451/// (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
2452bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
2453 ArrayRef<int> Mask;
2454 Value *C1, *T1, *F1, *C2, *T2, *F2;
2455 if (!match(&I, m_Shuffle(
2457 m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
2458 m_Mask(Mask))))
2459 return false;
2460
2461 auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
2462 auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
2463 if (!C1VecTy || !C2VecTy || C1VecTy != C2VecTy)
2464 return false;
2465
2466 auto *SI0FOp = dyn_cast<FPMathOperator>(I.getOperand(0));
2467 auto *SI1FOp = dyn_cast<FPMathOperator>(I.getOperand(1));
2468 // SelectInsts must have the same FMF.
2469 if (((SI0FOp == nullptr) != (SI1FOp == nullptr)) ||
2470 ((SI0FOp != nullptr) &&
2471 (SI0FOp->getFastMathFlags() != SI1FOp->getFastMathFlags())))
2472 return false;
2473
2474 auto *SrcVecTy = cast<FixedVectorType>(T1->getType());
2475 auto *DstVecTy = cast<FixedVectorType>(I.getType());
2476 auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
2477 auto SelOp = Instruction::Select;
2478 InstructionCost OldCost = TTI.getCmpSelInstrCost(
2479 SelOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
2480 OldCost += TTI.getCmpSelInstrCost(SelOp, SrcVecTy, C2VecTy,
2481 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2482 OldCost +=
2483 TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0, nullptr,
2484 {I.getOperand(0), I.getOperand(1)}, &I);
2485
2486 InstructionCost NewCost = TTI.getShuffleCost(
2487 SK, FixedVectorType::get(C1VecTy->getScalarType(), Mask.size()), C1VecTy,
2488 Mask, CostKind, 0, nullptr, {C1, C2});
2489 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2490 nullptr, {T1, T2});
2491 NewCost += TTI.getShuffleCost(SK, DstVecTy, SrcVecTy, Mask, CostKind, 0,
2492 nullptr, {F1, F2});
2493 auto *C1C2ShuffledVecTy = cast<FixedVectorType>(
2494 toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
2495 NewCost += TTI.getCmpSelInstrCost(SelOp, DstVecTy, C1C2ShuffledVecTy,
2496 CmpInst::BAD_ICMP_PREDICATE, CostKind);
2497
2498 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two selects: " << I
2499 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2500 << "\n");
2501 if (NewCost > OldCost)
2502 return false;
2503
2504 Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
2505 Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
2506 Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
2507 Value *NewSel;
2508 // We presuppose that the SelectInsts have the same FMF.
2509 if (SI0FOp)
2510 NewSel = Builder.CreateSelectFMF(ShuffleCmp, ShuffleTrue, ShuffleFalse,
2511 SI0FOp->getFastMathFlags());
2512 else
2513 NewSel = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
2514
2515 Worklist.pushValue(ShuffleCmp);
2516 Worklist.pushValue(ShuffleTrue);
2517 Worklist.pushValue(ShuffleFalse);
2518 replaceValue(I, *NewSel);
2519 return true;
2520}
2521
2522/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
2523/// into "castop (shuffle)".
2524bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
2525 Value *V0, *V1;
2526 ArrayRef<int> OldMask;
2527 if (!match(&I, m_Shuffle(m_Value(V0), m_Value(V1), m_Mask(OldMask))))
2528 return false;
2529
2530 // Check whether this is a binary shuffle.
2531 bool IsBinaryShuffle = !isa<UndefValue>(V1);
2532
2533 auto *C0 = dyn_cast<CastInst>(V0);
2534 auto *C1 = dyn_cast<CastInst>(V1);
2535 if (!C0 || (IsBinaryShuffle && !C1))
2536 return false;
2537
2538 Instruction::CastOps Opcode = C0->getOpcode();
2539
2540 // If this is allowed, foldShuffleOfCastops can get stuck in a loop
2541 // with foldBitcastOfShuffle. Reject in favor of foldBitcastOfShuffle.
2542 if (!IsBinaryShuffle && Opcode == Instruction::BitCast)
2543 return false;
2544
2545 if (IsBinaryShuffle) {
2546 if (C0->getSrcTy() != C1->getSrcTy())
2547 return false;
2548 // Handle shuffle(zext_nneg(x), sext(y)) -> sext(shuffle(x,y)) folds.
2549 if (Opcode != C1->getOpcode()) {
2550 if (match(C0, m_SExtLike(m_Value())) && match(C1, m_SExtLike(m_Value())))
2551 Opcode = Instruction::SExt;
2552 else
2553 return false;
2554 }
2555 }
2556
2557 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2558 auto *CastDstTy = dyn_cast<FixedVectorType>(C0->getDestTy());
2559 auto *CastSrcTy = dyn_cast<FixedVectorType>(C0->getSrcTy());
2560 if (!ShuffleDstTy || !CastDstTy || !CastSrcTy)
2561 return false;
2562
2563 unsigned NumSrcElts = CastSrcTy->getNumElements();
2564 unsigned NumDstElts = CastDstTy->getNumElements();
2565 assert((NumDstElts == NumSrcElts || Opcode == Instruction::BitCast) &&
2566 "Only bitcasts expected to alter src/dst element counts");
2567
2568 // Check for bitcasts between vector types whose element counts don't scale evenly.
2569 // e.g. <32 x i40> -> <40 x i32>
2570 if (NumDstElts != NumSrcElts && (NumSrcElts % NumDstElts) != 0 &&
2571 (NumDstElts % NumSrcElts) != 0)
2572 return false;
2573
2574 SmallVector<int, 16> NewMask;
2575 if (NumSrcElts >= NumDstElts) {
2576 // The bitcast is from wide to narrow/equal elements. The shuffle mask can
2577 // always be expanded to the equivalent form choosing narrower elements.
2578 assert(NumSrcElts % NumDstElts == 0 && "Unexpected shuffle mask");
2579 unsigned ScaleFactor = NumSrcElts / NumDstElts;
2580 narrowShuffleMaskElts(ScaleFactor, OldMask, NewMask);
2581 } else {
2582 // The bitcast is from narrow elements to wide elements. The shuffle mask
2583 // must choose consecutive elements to allow casting first.
2584 assert(NumDstElts % NumSrcElts == 0 && "Unexpected shuffle mask");
2585 unsigned ScaleFactor = NumDstElts / NumSrcElts;
2586 if (!widenShuffleMaskElts(ScaleFactor, OldMask, NewMask))
2587 return false;
2588 }
2589
2590 auto *NewShuffleDstTy =
2591 FixedVectorType::get(CastSrcTy->getScalarType(), NewMask.size());
2592
2593 // Try to replace a castop with a shuffle if the shuffle is not costly.
2594 InstructionCost CostC0 =
2595 TTI.getCastInstrCost(C0->getOpcode(), CastDstTy, CastSrcTy,
2596 TTI::CastContextHint::None, CostKind);
2597
2598 TargetTransformInfo::ShuffleKind ShuffleKind;
2599 if (IsBinaryShuffle)
2600 ShuffleKind = TargetTransformInfo::SK_PermuteTwoSrc;
2601 else
2602 ShuffleKind = TargetTransformInfo::SK_PermuteSingleSrc;
2603
2604 InstructionCost OldCost = CostC0;
2605 OldCost += TTI.getShuffleCost(ShuffleKind, ShuffleDstTy, CastDstTy, OldMask,
2606 CostKind, 0, nullptr, {}, &I);
2607
2608 InstructionCost NewCost = TTI.getShuffleCost(ShuffleKind, NewShuffleDstTy,
2609 CastSrcTy, NewMask, CostKind);
2610 NewCost += TTI.getCastInstrCost(Opcode, ShuffleDstTy, NewShuffleDstTy,
2611 TTI::CastContextHint::None, CostKind);
2612 if (!C0->hasOneUse())
2613 NewCost += CostC0;
2614 if (IsBinaryShuffle) {
2615 InstructionCost CostC1 =
2616 TTI.getCastInstrCost(C1->getOpcode(), CastDstTy, CastSrcTy,
2617 TTI::CastContextHint::None, CostKind);
2618 OldCost += CostC1;
2619 if (!C1->hasOneUse())
2620 NewCost += CostC1;
2621 }
2622
2623 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two casts: " << I
2624 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2625 << "\n");
2626 if (NewCost > OldCost)
2627 return false;
2628
2629 Value *Shuf;
2630 if (IsBinaryShuffle)
2631 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), C1->getOperand(0),
2632 NewMask);
2633 else
2634 Shuf = Builder.CreateShuffleVector(C0->getOperand(0), NewMask);
2635
2636 Value *Cast = Builder.CreateCast(Opcode, Shuf, ShuffleDstTy);
2637
2638 // Intersect flags from the old casts.
2639 if (auto *NewInst = dyn_cast<Instruction>(Cast)) {
2640 NewInst->copyIRFlags(C0);
2641 if (IsBinaryShuffle)
2642 NewInst->andIRFlags(C1);
2643 }
2644
2645 Worklist.pushValue(Shuf);
2646 replaceValue(I, *Cast);
2647 return true;
2648}
2649
2650/// Try to convert any of:
2651/// "shuffle (shuffle x, y), (shuffle y, x)"
2652/// "shuffle (shuffle x, undef), (shuffle y, undef)"
2653/// "shuffle (shuffle x, undef), y"
2654/// "shuffle x, (shuffle y, undef)"
2655/// into "shuffle x, y".
2656bool VectorCombine::foldShuffleOfShuffles(Instruction &I) {
2657 ArrayRef<int> OuterMask;
2658 Value *OuterV0, *OuterV1;
2659 if (!match(&I,
2660 m_Shuffle(m_Value(OuterV0), m_Value(OuterV1), m_Mask(OuterMask))))
2661 return false;
2662
2663 ArrayRef<int> InnerMask0, InnerMask1;
2664 Value *X0, *X1, *Y0, *Y1;
2665 bool Match0 =
2666 match(OuterV0, m_Shuffle(m_Value(X0), m_Value(Y0), m_Mask(InnerMask0)));
2667 bool Match1 =
2668 match(OuterV1, m_Shuffle(m_Value(X1), m_Value(Y1), m_Mask(InnerMask1)));
2669 if (!Match0 && !Match1)
2670 return false;
2671
2672 // If the outer shuffle is a permute, then create a fake inner all-poison
2673 // shuffle. This is easier than accounting for length-changing shuffles below.
2674 SmallVector<int, 16> PoisonMask1;
2675 if (!Match1 && isa<PoisonValue>(OuterV1)) {
2676 X1 = X0;
2677 Y1 = Y0;
2678 PoisonMask1.append(InnerMask0.size(), PoisonMaskElem);
2679 InnerMask1 = PoisonMask1;
2680 Match1 = true; // fake match
2681 }
2682
2683 X0 = Match0 ? X0 : OuterV0;
2684 Y0 = Match0 ? Y0 : OuterV0;
2685 X1 = Match1 ? X1 : OuterV1;
2686 Y1 = Match1 ? Y1 : OuterV1;
2687 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2688 auto *ShuffleSrcTy = dyn_cast<FixedVectorType>(X0->getType());
2689 auto *ShuffleImmTy = dyn_cast<FixedVectorType>(OuterV0->getType());
2690 if (!ShuffleDstTy || !ShuffleSrcTy || !ShuffleImmTy ||
2691 X0->getType() != X1->getType())
2692 return false;
2693
2694 unsigned NumSrcElts = ShuffleSrcTy->getNumElements();
2695 unsigned NumImmElts = ShuffleImmTy->getNumElements();
2696
2697 // Attempt to merge shuffles, matching up to 2 source operands.
2698 // Replace index to a poison arg with PoisonMaskElem.
2699 // Bail if either inner masks reference an undef arg.
2700 SmallVector<int, 16> NewMask(OuterMask);
2701 Value *NewX = nullptr, *NewY = nullptr;
2702 for (int &M : NewMask) {
2703 Value *Src = nullptr;
2704 if (0 <= M && M < (int)NumImmElts) {
2705 Src = OuterV0;
2706 if (Match0) {
2707 M = InnerMask0[M];
2708 Src = M >= (int)NumSrcElts ? Y0 : X0;
2709 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2710 }
2711 } else if (M >= (int)NumImmElts) {
2712 Src = OuterV1;
2713 M -= NumImmElts;
2714 if (Match1) {
2715 M = InnerMask1[M];
2716 Src = M >= (int)NumSrcElts ? Y1 : X1;
2717 M = M >= (int)NumSrcElts ? (M - NumSrcElts) : M;
2718 }
2719 }
2720 if (Src && M != PoisonMaskElem) {
2721 assert(0 <= M && M < (int)NumSrcElts && "Unexpected shuffle mask index");
2722 if (isa<UndefValue>(Src)) {
2723 // We've referenced an undef element - if it's poison, update the shuffle
2724 // mask, else bail.
2725 if (!isa<PoisonValue>(Src))
2726 return false;
2727 M = PoisonMaskElem;
2728 continue;
2729 }
2730 if (!NewX || NewX == Src) {
2731 NewX = Src;
2732 continue;
2733 }
2734 if (!NewY || NewY == Src) {
2735 M += NumSrcElts;
2736 NewY = Src;
2737 continue;
2738 }
2739 return false;
2740 }
2741 }
2742
2743 if (!NewX)
2744 return PoisonValue::get(ShuffleDstTy);
2745 if (!NewY)
2746 NewY = PoisonValue::get(ShuffleSrcTy);
2747
2748 // Have we folded to an Identity shuffle?
2749 if (ShuffleVectorInst::isIdentityMask(NewMask, NumSrcElts)) {
2750 replaceValue(I, *NewX);
2751 return true;
2752 }
2753
2754 // Try to merge the shuffles if the new shuffle is not costly.
2755 InstructionCost InnerCost0 = 0;
2756 if (Match0)
2757 InnerCost0 = TTI.getInstructionCost(cast<User>(OuterV0), CostKind);
2758
2759 InstructionCost InnerCost1 = 0;
2760 if (Match1)
2761 InnerCost1 = TTI.getInstructionCost(cast<User>(OuterV1), CostKind);
2762
2763 InstructionCost OuterCost = TTI.getInstructionCost(&I, CostKind);
2764
2765 InstructionCost OldCost = InnerCost0 + InnerCost1 + OuterCost;
2766
2767 bool IsUnary = all_of(NewMask, [&](int M) { return M < (int)NumSrcElts; });
2768 TargetTransformInfo::ShuffleKind SK =
2769 IsUnary ? TargetTransformInfo::SK_PermuteSingleSrc
2770 : TargetTransformInfo::SK_PermuteTwoSrc;
2771 InstructionCost NewCost =
2772 TTI.getShuffleCost(SK, ShuffleDstTy, ShuffleSrcTy, NewMask, CostKind, 0,
2773 nullptr, {NewX, NewY});
2774 if (!OuterV0->hasOneUse())
2775 NewCost += InnerCost0;
2776 if (!OuterV1->hasOneUse())
2777 NewCost += InnerCost1;
2778
2779 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two shuffles: " << I
2780 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2781 << "\n");
2782 if (NewCost > OldCost)
2783 return false;
2784
2785 Value *Shuf = Builder.CreateShuffleVector(NewX, NewY, NewMask);
2786 replaceValue(I, *Shuf);
2787 return true;
2788}
2789
2790/// Try to convert
2791/// "shuffle (intrinsic), (intrinsic)" into "intrinsic (shuffle), (shuffle)".
2792bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
2793 Value *V0, *V1;
2794 ArrayRef<int> OldMask;
2795 if (!match(&I, m_Shuffle(m_OneUse(m_Value(V0)), m_OneUse(m_Value(V1)),
2796 m_Mask(OldMask))))
2797 return false;
2798
2799 auto *II0 = dyn_cast<IntrinsicInst>(V0);
2800 auto *II1 = dyn_cast<IntrinsicInst>(V1);
2801 if (!II0 || !II1)
2802 return false;
2803
2804 Intrinsic::ID IID = II0->getIntrinsicID();
2805 if (IID != II1->getIntrinsicID())
2806 return false;
2807
2808 auto *ShuffleDstTy = dyn_cast<FixedVectorType>(I.getType());
2809 auto *II0Ty = dyn_cast<FixedVectorType>(II0->getType());
2810 if (!ShuffleDstTy || !II0Ty)
2811 return false;
2812
2813 if (!isTriviallyVectorizable(IID))
2814 return false;
2815
2816 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2817 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI) &&
2818 II0->getArgOperand(I) != II1->getArgOperand(I))
2819 return false;
2820
2821 InstructionCost OldCost =
2822 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II0), CostKind) +
2823 TTI.getIntrinsicInstrCost(IntrinsicCostAttributes(IID, *II1), CostKind) +
2824 TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, ShuffleDstTy,
2825 II0Ty, OldMask, CostKind, 0, nullptr, {II0, II1}, &I);
2826
2827 SmallVector<Type *> NewArgsTy;
2828 InstructionCost NewCost = 0;
2829 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I) {
2830 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2831 NewArgsTy.push_back(II0->getArgOperand(I)->getType());
2832 } else {
2833 auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
2834 auto *ArgTy = FixedVectorType::get(VecTy->getElementType(),
2835 ShuffleDstTy->getNumElements());
2836 NewArgsTy.push_back(ArgTy);
2837 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
2838 ArgTy, VecTy, OldMask, CostKind);
2839 }
2840 }
2841 IntrinsicCostAttributes NewAttr(IID, ShuffleDstTy, NewArgsTy);
2842 NewCost += TTI.getIntrinsicInstrCost(NewAttr, CostKind);
2843
2844 LLVM_DEBUG(dbgs() << "Found a shuffle feeding two intrinsics: " << I
2845 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
2846 << "\n");
2847
2848 if (NewCost > OldCost)
2849 return false;
2850
2851 SmallVector<Value *> NewArgs;
2852 for (unsigned I = 0, E = II0->arg_size(); I != E; ++I)
2853 if (isVectorIntrinsicWithScalarOpAtArg(IID, I, &TTI)) {
2854 NewArgs.push_back(II0->getArgOperand(I));
2855 } else {
2856 Value *Shuf = Builder.CreateShuffleVector(II0->getArgOperand(I),
2857 II1->getArgOperand(I), OldMask);
2858 NewArgs.push_back(Shuf);
2859 Worklist.pushValue(Shuf);
2860 }
2861 Value *NewIntrinsic = Builder.CreateIntrinsic(ShuffleDstTy, IID, NewArgs);
2862
2863 // Intersect flags from the old intrinsics.
2864 if (auto *NewInst = dyn_cast<Instruction>(NewIntrinsic)) {
2865 NewInst->copyIRFlags(II0);
2866 NewInst->andIRFlags(II1);
2867 }
2868
2869 replaceValue(I, *NewIntrinsic);
2870 return true;
2871}
2872
2873using InstLane = std::pair<Use *, int>;
2874
2875static InstLane lookThroughShuffles(Use *U, int Lane) {
2876 while (auto *SV = dyn_cast<ShuffleVectorInst>(U->get())) {
2877 unsigned NumElts =
2878 cast<FixedVectorType>(SV->getOperand(0)->getType())->getNumElements();
2879 int M = SV->getMaskValue(Lane);
2880 if (M < 0)
2881 return {nullptr, PoisonMaskElem};
2882 if (static_cast<unsigned>(M) < NumElts) {
2883 U = &SV->getOperandUse(0);
2884 Lane = M;
2885 } else {
2886 U = &SV->getOperandUse(1);
2887 Lane = M - NumElts;
2888 }
2889 }
2890 return InstLane{U, Lane};
2891}
2892
2893 static SmallVector<InstLane>
2894 generateInstLaneVectorFromOperand(ArrayRef<InstLane> Item, int Op) {
2895 SmallVector<InstLane> NItem;
2896 for (InstLane IL : Item) {
2897 auto [U, Lane] = IL;
2898 InstLane OpLane =
2899 U ? lookThroughShuffles(&cast<Instruction>(U->get())->getOperandUse(Op),
2900 Lane)
2901 : InstLane{nullptr, PoisonMaskElem};
2902 NItem.emplace_back(OpLane);
2903 }
2904 return NItem;
2905}
2906
2907/// Detect concat of multiple values into a vector
2908 static bool isFreeConcat(ArrayRef<InstLane> Item, TTI::TargetCostKind CostKind,
2909 const TargetTransformInfo &TTI) {
2910 auto *Ty = cast<FixedVectorType>(Item.front().first->get()->getType());
2911 unsigned NumElts = Ty->getNumElements();
2912 if (Item.size() == NumElts || NumElts == 1 || Item.size() % NumElts != 0)
2913 return false;
2914
2915 // Check that the concat is free, usually meaning that the type will be split
2916 // during legalization.
2917 SmallVector<int, 16> ConcatMask(NumElts * 2);
2918 std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
2919 if (TTI.getShuffleCost(TTI::SK_PermuteTwoSrc,
2920 FixedVectorType::get(Ty->getScalarType(), NumElts * 2),
2921 Ty, ConcatMask, CostKind) != 0)
2922 return false;
2923
2924 unsigned NumSlices = Item.size() / NumElts;
2925 // Currently we generate a tree of shuffles for the concats, which limits us
2926 // to a power of 2.
2927 if (!isPowerOf2_32(NumSlices))
2928 return false;
2929 for (unsigned Slice = 0; Slice < NumSlices; ++Slice) {
2930 Use *SliceV = Item[Slice * NumElts].first;
2931 if (!SliceV || SliceV->get()->getType() != Ty)
2932 return false;
2933 for (unsigned Elt = 0; Elt < NumElts; ++Elt) {
2934 auto [V, Lane] = Item[Slice * NumElts + Elt];
2935 if (Lane != static_cast<int>(Elt) || SliceV->get() != V->get())
2936 return false;
2937 }
2938 }
2939 return true;
2940}
2941
2942 static Value *generateNewInstTree(ArrayRef<InstLane> Item, FixedVectorType *Ty,
2943 const SmallPtrSet<Use *, 4> &IdentityLeafs,
2944 const SmallPtrSet<Use *, 4> &SplatLeafs,
2945 const SmallPtrSet<Use *, 4> &ConcatLeafs,
2946 IRBuilderBase &Builder,
2947 const TargetTransformInfo *TTI) {
2948 auto [FrontU, FrontLane] = Item.front();
2949
2950 if (IdentityLeafs.contains(FrontU)) {
2951 return FrontU->get();
2952 }
2953 if (SplatLeafs.contains(FrontU)) {
2954 SmallVector<int, 16> Mask(Ty->getNumElements(), FrontLane);
2955 return Builder.CreateShuffleVector(FrontU->get(), Mask);
2956 }
2957 if (ConcatLeafs.contains(FrontU)) {
2958 unsigned NumElts =
2959 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements();
2960 SmallVector<Value *> Values(Item.size() / NumElts, nullptr);
2961 for (unsigned S = 0; S < Values.size(); ++S)
2962 Values[S] = Item[S * NumElts].first->get();
2963
2964 while (Values.size() > 1) {
2965 NumElts *= 2;
2966 SmallVector<int, 16> Mask(NumElts, 0);
2967 std::iota(Mask.begin(), Mask.end(), 0);
2968 SmallVector<Value *> NewValues(Values.size() / 2, nullptr);
2969 for (unsigned S = 0; S < NewValues.size(); ++S)
2970 NewValues[S] =
2971 Builder.CreateShuffleVector(Values[S * 2], Values[S * 2 + 1], Mask);
2972 Values = NewValues;
2973 }
2974 return Values[0];
2975 }
2976
2977 auto *I = cast<Instruction>(FrontU->get());
2978 auto *II = dyn_cast<IntrinsicInst>(I);
2979 unsigned NumOps = I->getNumOperands() - (II ? 1 : 0);
2980 SmallVector<Value *> Ops(NumOps);
2981 for (unsigned Idx = 0; Idx < NumOps; Idx++) {
2982 if (II &&
2983 isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Idx, TTI)) {
2984 Ops[Idx] = II->getOperand(Idx);
2985 continue;
2986 }
2987 Ops[Idx] = generateNewInstTree(generateInstLaneVectorFromOperand(Item, Idx),
2988 Ty, IdentityLeafs, SplatLeafs, ConcatLeafs,
2989 Builder, TTI);
2990 }
2991
2992 SmallVector<Value *, 8> ValueList;
2993 for (const auto &Lane : Item)
2994 if (Lane.first)
2995 ValueList.push_back(Lane.first->get());
2996
2997 Type *DstTy =
2998 FixedVectorType::get(I->getType()->getScalarType(), Ty->getNumElements());
2999 if (auto *BI = dyn_cast<BinaryOperator>(I)) {
3000 auto *Value = Builder.CreateBinOp((Instruction::BinaryOps)BI->getOpcode(),
3001 Ops[0], Ops[1]);
3002 propagateIRFlags(Value, ValueList);
3003 return Value;
3004 }
3005 if (auto *CI = dyn_cast<CmpInst>(I)) {
3006 auto *Value = Builder.CreateCmp(CI->getPredicate(), Ops[0], Ops[1]);
3007 propagateIRFlags(Value, ValueList);
3008 return Value;
3009 }
3010 if (auto *SI = dyn_cast<SelectInst>(I)) {
3011 auto *Value = Builder.CreateSelect(Ops[0], Ops[1], Ops[2], "", SI);
3012 propagateIRFlags(Value, ValueList);
3013 return Value;
3014 }
3015 if (auto *CI = dyn_cast<CastInst>(I)) {
3016 auto *Value = Builder.CreateCast(CI->getOpcode(), Ops[0], DstTy);
3017 propagateIRFlags(Value, ValueList);
3018 return Value;
3019 }
3020 if (II) {
3021 auto *Value = Builder.CreateIntrinsic(DstTy, II->getIntrinsicID(), Ops);
3022 propagateIRFlags(Value, ValueList);
3023 return Value;
3024 }
3025 assert(isa<UnaryInstruction>(I) && "Unexpected instruction type in Generate");
3026 auto *Value =
3027 Builder.CreateUnOp((Instruction::UnaryOps)I->getOpcode(), Ops[0]);
3028 propagateIRFlags(Value, ValueList);
3029 return Value;
3030}
3031
3032// Starting from a shuffle, look up through operands tracking the shuffled index
3033// of each lane. If we can simplify away the shuffles to identities then
3034// do so.
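// Illustrative example (not from the source): two reversing shuffles around a
// lane-wise op cancel out once each lane is traced back to its source:
//   %r0 = shufflevector <4 x float> %x, <4 x float> poison,
//                       <4 x i32> <i32 3, i32 2, i32 1, i32 0>
//   %f  = fneg <4 x float> %r0
//   %r1 = shufflevector <4 x float> %f, <4 x float> poison,
//                       <4 x i32> <i32 3, i32 2, i32 1, i32 0>
// simplifies to:
//   %r1 = fneg <4 x float> %x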
3035bool VectorCombine::foldShuffleToIdentity(Instruction &I) {
3036 auto *Ty = dyn_cast<FixedVectorType>(I.getType());
3037 if (!Ty || I.use_empty())
3038 return false;
3039
3040 SmallVector<InstLane> Start(Ty->getNumElements());
3041 for (unsigned M = 0, E = Ty->getNumElements(); M < E; ++M)
3042 Start[M] = lookThroughShuffles(&*I.use_begin(), M);
3043
3044 SmallVector<SmallVector<InstLane>> Worklist;
3045 Worklist.push_back(Start);
3046 SmallPtrSet<Use *, 4> IdentityLeafs, SplatLeafs, ConcatLeafs;
3047 unsigned NumVisited = 0;
3048
3049 while (!Worklist.empty()) {
3050 if (++NumVisited > MaxInstrsToScan)
3051 return false;
3052
3053 SmallVector<InstLane> Item = Worklist.pop_back_val();
3054 auto [FrontU, FrontLane] = Item.front();
3055
3056 // If we found an undef first lane then bail out to keep things simple.
3057 if (!FrontU)
3058 return false;
3059
3060 // Helper to peek through bitcasts to the same value.
3061 auto IsEquiv = [&](Value *X, Value *Y) {
3062 return X->getType() == Y->getType() &&
3064 };
3065
3066 // Look for an identity value.
3067 if (FrontLane == 0 &&
3068 cast<FixedVectorType>(FrontU->get()->getType())->getNumElements() ==
3069 Ty->getNumElements() &&
3070 all_of(drop_begin(enumerate(Item)), [IsEquiv, Item](const auto &E) {
3071 Value *FrontV = Item.front().first->get();
3072 return !E.value().first || (IsEquiv(E.value().first->get(), FrontV) &&
3073 E.value().second == (int)E.index());
3074 })) {
3075 IdentityLeafs.insert(FrontU);
3076 continue;
3077 }
3078 // Look for constants, for the moment only supporting constant splats.
3079 if (auto *C = dyn_cast<Constant>(FrontU);
3080 C && C->getSplatValue() &&
3081 all_of(drop_begin(Item), [Item](InstLane &IL) {
3082 Value *FrontV = Item.front().first->get();
3083 Use *U = IL.first;
3084 return !U || (isa<Constant>(U->get()) &&
3085 cast<Constant>(U->get())->getSplatValue() ==
3086 cast<Constant>(FrontV)->getSplatValue());
3087 })) {
3088 SplatLeafs.insert(FrontU);
3089 continue;
3090 }
3091 // Look for a splat value.
3092 if (all_of(drop_begin(Item), [Item](InstLane &IL) {
3093 auto [FrontU, FrontLane] = Item.front();
3094 auto [U, Lane] = IL;
3095 return !U || (U->get() == FrontU->get() && Lane == FrontLane);
3096 })) {
3097 SplatLeafs.insert(FrontU);
3098 continue;
3099 }
3100
3101 // We need each element to be the same type of value, and check that each
3102 // element has a single use.
3103 auto CheckLaneIsEquivalentToFirst = [Item](InstLane IL) {
3104 Value *FrontV = Item.front().first->get();
3105 if (!IL.first)
3106 return true;
3107 Value *V = IL.first->get();
3108 if (auto *I = dyn_cast<Instruction>(V); I && !I->hasOneUser())
3109 return false;
3110 if (V->getValueID() != FrontV->getValueID())
3111 return false;
3112 if (auto *CI = dyn_cast<CmpInst>(V))
3113 if (CI->getPredicate() != cast<CmpInst>(FrontV)->getPredicate())
3114 return false;
3115 if (auto *CI = dyn_cast<CastInst>(V))
3116 if (CI->getSrcTy()->getScalarType() !=
3117 cast<CastInst>(FrontV)->getSrcTy()->getScalarType())
3118 return false;
3119 if (auto *SI = dyn_cast<SelectInst>(V))
3120 if (!isa<VectorType>(SI->getOperand(0)->getType()) ||
3121 SI->getOperand(0)->getType() !=
3122 cast<SelectInst>(FrontV)->getOperand(0)->getType())
3123 return false;
3124 if (isa<CallInst>(V) && !isa<IntrinsicInst>(V))
3125 return false;
3126 auto *II = dyn_cast<IntrinsicInst>(V);
3127 return !II || (isa<IntrinsicInst>(FrontV) &&
3128 II->getIntrinsicID() ==
3129 cast<IntrinsicInst>(FrontV)->getIntrinsicID() &&
3130 !II->hasOperandBundles());
3131 };
3132 if (all_of(drop_begin(Item), CheckLaneIsEquivalentToFirst)) {
3133 // Check the operator is one that we support.
3134 if (isa<BinaryOperator, CmpInst>(FrontU)) {
3135 // We exclude div/rem in case they hit UB from poison lanes.
3136 if (auto *BO = dyn_cast<BinaryOperator>(FrontU);
3137 BO && BO->isIntDivRem())
3138 return false;
3139 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3140 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3141 continue;
3142 } else if (isa<UnaryOperator, TruncInst, ZExtInst, SExtInst, FPToSIInst,
3143 FPToUIInst, SIToFPInst, UIToFPInst>(FrontU)) {
3144 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3145 continue;
3146 } else if (auto *BitCast = dyn_cast<BitCastInst>(FrontU)) {
3147 // TODO: Handle vector widening/narrowing bitcasts.
3148 auto *DstTy = dyn_cast<FixedVectorType>(BitCast->getDestTy());
3149 auto *SrcTy = dyn_cast<FixedVectorType>(BitCast->getSrcTy());
3150 if (DstTy && SrcTy &&
3151 SrcTy->getNumElements() == DstTy->getNumElements()) {
3152 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3153 continue;
3154 }
3155 } else if (isa<SelectInst>(FrontU)) {
3156 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 0));
3157 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 1));
3158 Worklist.push_back(generateInstLaneVectorFromOperand(Item, 2));
3159 continue;
3160 } else if (auto *II = dyn_cast<IntrinsicInst>(FrontU);
3161 II && isTriviallyVectorizable(II->getIntrinsicID()) &&
3162 !II->hasOperandBundles()) {
3163 for (unsigned Op = 0, E = II->getNumOperands() - 1; Op < E; Op++) {
3164 if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(), Op,
3165 &TTI)) {
3166 if (!all_of(drop_begin(Item), [Item, Op](InstLane &IL) {
3167 Value *FrontV = Item.front().first->get();
3168 Use *U = IL.first;
3169 return !U || (cast<Instruction>(U->get())->getOperand(Op) ==
3170 cast<Instruction>(FrontV)->getOperand(Op));
3171 }))
3172 return false;
3173 continue;
3174 }
3175 Worklist.push_back(generateInstLaneVectorFromOperand(Item, Op));
3176 }
3177 continue;
3178 }
3179 }
3180
3181 if (isFreeConcat(Item, CostKind, TTI)) {
3182 ConcatLeafs.insert(FrontU);
3183 continue;
3184 }
3185
3186 return false;
3187 }
3188
3189 if (NumVisited <= 1)
3190 return false;
3191
3192 LLVM_DEBUG(dbgs() << "Found a superfluous identity shuffle: " << I << "\n");
3193
3194 // If we got this far, we know the shuffles are superfluous and can be
3195 // removed. Scan through again and generate the new tree of instructions.
3196 Builder.SetInsertPoint(&I);
3197 Value *V = generateNewInstTree(Start, Ty, IdentityLeafs, SplatLeafs,
3198 ConcatLeafs, Builder, &TTI);
3199 replaceValue(I, *V);
3200 return true;
3201}
3202
3203/// Given a commutative reduction, the order of the input lanes does not alter
3204/// the results. We can use this to remove certain shuffles feeding the
3205/// reduction, removing the need to shuffle at all.
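/// For illustration (hypothetical IR, not from this file):
///   %s = shufflevector <4 x i32> %v, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
///   %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
/// computes the same value as reducing %v directly, because the reduction sums
/// all lanes regardless of their order.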
3206bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
3207 auto *II = dyn_cast<IntrinsicInst>(&I);
3208 if (!II)
3209 return false;
3210 switch (II->getIntrinsicID()) {
3211 case Intrinsic::vector_reduce_add:
3212 case Intrinsic::vector_reduce_mul:
3213 case Intrinsic::vector_reduce_and:
3214 case Intrinsic::vector_reduce_or:
3215 case Intrinsic::vector_reduce_xor:
3216 case Intrinsic::vector_reduce_smin:
3217 case Intrinsic::vector_reduce_smax:
3218 case Intrinsic::vector_reduce_umin:
3219 case Intrinsic::vector_reduce_umax:
3220 break;
3221 default:
3222 return false;
3223 }
3224
3225 // Find all the inputs when looking through operations that do not alter the
3226 // lane order (binops, for example). Currently we look for a single shuffle,
3227 // and can ignore splat values.
3228 std::queue<Value *> Worklist;
3229 SmallPtrSet<Value *, 4> Visited;
3230 ShuffleVectorInst *Shuffle = nullptr;
3231 if (auto *Op = dyn_cast<Instruction>(I.getOperand(0)))
3232 Worklist.push(Op);
3233
3234 while (!Worklist.empty()) {
3235 Value *CV = Worklist.front();
3236 Worklist.pop();
3237 if (Visited.contains(CV))
3238 continue;
3239
3240 // Splats don't change the order, so can be safely ignored.
3241 if (isSplatValue(CV))
3242 continue;
3243
3244 Visited.insert(CV);
3245
3246 if (auto *CI = dyn_cast<Instruction>(CV)) {
3247 if (CI->isBinaryOp()) {
3248 for (auto *Op : CI->operand_values())
3249 Worklist.push(Op);
3250 continue;
3251 } else if (auto *SV = dyn_cast<ShuffleVectorInst>(CI)) {
3252 if (Shuffle && Shuffle != SV)
3253 return false;
3254 Shuffle = SV;
3255 continue;
3256 }
3257 }
3258
3259 // Anything else is currently an unknown node.
3260 return false;
3261 }
3262
3263 if (!Shuffle)
3264 return false;
3265
3266 // Check all uses of the binary ops and shuffles are also included in the
3267 // lane-invariant operations (Visited should be the list of lanewise
3268 // instructions, including the shuffle that we found).
3269 for (auto *V : Visited)
3270 for (auto *U : V->users())
3271 if (!Visited.contains(U) && U != &I)
3272 return false;
3273
3274 FixedVectorType *VecType =
3275 dyn_cast<FixedVectorType>(II->getOperand(0)->getType());
3276 if (!VecType)
3277 return false;
3278 FixedVectorType *ShuffleInputType =
3279 dyn_cast<FixedVectorType>(Shuffle->getOperand(0)->getType());
3280 if (!ShuffleInputType)
3281 return false;
3282 unsigned NumInputElts = ShuffleInputType->getNumElements();
3283
3284 // Find the mask from sorting the lanes into order. This is most likely to
3285 // become an identity or concat mask. Undef elements are pushed to the end.
3286 SmallVector<int> ConcatMask;
3287 Shuffle->getShuffleMask(ConcatMask);
3288 sort(ConcatMask, [](int X, int Y) { return (unsigned)X < (unsigned)Y; });
3289 bool UsesSecondVec =
3290 any_of(ConcatMask, [&](int M) { return M >= (int)NumInputElts; });
3291
3292 InstructionCost OldCost = TTI.getShuffleCost(
3293 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3294 ShuffleInputType, Shuffle->getShuffleMask(), CostKind);
3295 InstructionCost NewCost = TTI.getShuffleCost(
3296 UsesSecondVec ? TTI::SK_PermuteTwoSrc : TTI::SK_PermuteSingleSrc, VecType,
3297 ShuffleInputType, ConcatMask, CostKind);
3298
3299 LLVM_DEBUG(dbgs() << "Found a reduction feeding from a shuffle: " << *Shuffle
3300 << "\n");
3301 LLVM_DEBUG(dbgs() << " OldCost: " << OldCost << " vs NewCost: " << NewCost
3302 << "\n");
3303 bool MadeChanges = false;
3304 if (NewCost < OldCost) {
3305 Builder.SetInsertPoint(Shuffle);
3306 Value *NewShuffle = Builder.CreateShuffleVector(
3307 Shuffle->getOperand(0), Shuffle->getOperand(1), ConcatMask);
3308 LLVM_DEBUG(dbgs() << "Created new shuffle: " << *NewShuffle << "\n");
3309 replaceValue(*Shuffle, *NewShuffle);
3310 return true;
3311 }
3312
3313 // See if we can re-use foldSelectShuffle, getting it to reduce the size of
3314 // the shuffle into a nicer order, as it can ignore the order of the shuffles.
3315 MadeChanges |= foldSelectShuffle(*Shuffle, true);
3316 return MadeChanges;
3317}
3318
3319/// For a given chain of patterns of the following form:
3320///
3321/// ```
3322/// %1 = shufflevector <n x ty1> %0, <n x ty1> poison <n x ty2> mask
3323///
3324/// %2 = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %0, <n x
3325/// ty1> %1)
3326/// OR
3327/// %2 = add/mul/or/and/xor <n x ty1> %0, %1
3328///
3329/// %3 = shufflevector <n x ty1> %2, <n x ty1> poison <n x ty2> mask
3330/// ...
3331/// ...
3332/// %(i - 1) = tail call <n x ty1> llvm.<umin/umax/smin/smax>(<n x ty1> %(i -
3333 /// 3), <n x ty1> %(i - 2))
3334/// OR
3335/// %(i - 1) = add/mul/or/and/xor <n x ty1> %(i - 3), %(i - 2)
3336///
3337/// %(i) = extractelement <n x ty1> %(i - 1), 0
3338/// ```
3339///
3340/// Where:
3341/// `mask` follows a partition pattern:
3342///
3343/// Ex:
3344/// [n = 8, p = poison]
3345///
3346/// 4 5 6 7 | p p p p
3347/// 2 3 | p p p p p p
3348/// 1 | p p p p p p p
3349///
3350/// For powers of 2, there's a consistent pattern, but for other cases
3351/// the parity of the current half value at each step decides the
3352/// next partition half (see `ExpectedParityMask` for more logical details
3353/// in generalising this).
3354///
3355/// Ex:
3356/// [n = 6]
3357///
3358/// 3 4 5 | p p p
3359/// 1 2 | p p p p
3360/// 1 | p p p p p
3361bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
3362 // Going bottom-up for the pattern.
3363 std::queue<Value *> InstWorklist;
3364 InstructionCost OrigCost = 0;
3365
3366 // Common instruction operation after each shuffle op.
3367 std::optional<unsigned int> CommonCallOp = std::nullopt;
3368 std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
3369
3370 bool IsFirstCallOrBinInst = true;
3371 bool ShouldBeCallOrBinInst = true;
3372
3373 // This stores the last used instructions for shuffle/common op.
3374 //
3375 // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
3376 // instructions from either shuffle/common op.
3377 SmallVector<Value *, 2> PrevVecV(2, nullptr);
3378
3379 Value *VecOpEE;
3380 if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
3381 return false;
3382
3383 auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
3384 if (!FVT)
3385 return false;
3386
3387 int64_t VecSize = FVT->getNumElements();
3388 if (VecSize < 2)
3389 return false;
3390
3391 // Number of levels would be ~log2(n), considering we always partition
3392 // by half for this fold pattern.
3393 unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
3394 int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
3395
3396 // This is how we generalise for all element sizes.
3397 // At each step, if vector size is odd, we need non-poison
3398 // values to cover the dominant half so we don't miss out on any element.
3399 //
3400 // This mask will help us retrieve this as we go from bottom to top:
3401 //
3402 // Mask Set -> N = N * 2 - 1
3403 // Mask Unset -> N = N * 2
3404 for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
3405 Cur = (Cur + 1) / 2, --Mask) {
3406 if (Cur & 1)
3407 ExpectedParityMask |= (1ll << Mask);
3408 }
3409
3410 InstWorklist.push(VecOpEE);
3411
3412 while (!InstWorklist.empty()) {
3413 Value *CI = InstWorklist.front();
3414 InstWorklist.pop();
3415
3416 if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
3417 if (!ShouldBeCallOrBinInst)
3418 return false;
3419
3420 if (!IsFirstCallOrBinInst &&
3421 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3422 return false;
3423
3424 // For the first found call/bin op, the vector has to come from the
3425 // extract element op.
3426 if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3427 return false;
3428 IsFirstCallOrBinInst = false;
3429
3430 if (!CommonCallOp)
3431 CommonCallOp = II->getIntrinsicID();
3432 if (II->getIntrinsicID() != *CommonCallOp)
3433 return false;
3434
3435 switch (II->getIntrinsicID()) {
3436 case Intrinsic::umin:
3437 case Intrinsic::umax:
3438 case Intrinsic::smin:
3439 case Intrinsic::smax: {
3440 auto *Op0 = II->getOperand(0);
3441 auto *Op1 = II->getOperand(1);
3442 PrevVecV[0] = Op0;
3443 PrevVecV[1] = Op1;
3444 break;
3445 }
3446 default:
3447 return false;
3448 }
3449 ShouldBeCallOrBinInst ^= 1;
3450
3451 IntrinsicCostAttributes ICA(
3452 *CommonCallOp, II->getType(),
3453 {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
3454 OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
3455
3456 // We may need a swap here since it can be (a, b) or (b, a)
3457 // and accordingly change as we go up.
3458 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3459 std::swap(PrevVecV[0], PrevVecV[1]);
3460 InstWorklist.push(PrevVecV[1]);
3461 InstWorklist.push(PrevVecV[0]);
3462 } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
3463 // Similar logic for bin ops.
3464
3465 if (!ShouldBeCallOrBinInst)
3466 return false;
3467
3468 if (!IsFirstCallOrBinInst &&
3469 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3470 return false;
3471
3472 if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
3473 return false;
3474 IsFirstCallOrBinInst = false;
3475
3476 if (!CommonBinOp)
3477 CommonBinOp = BinOp->getOpcode();
3478
3479 if (BinOp->getOpcode() != *CommonBinOp)
3480 return false;
3481
3482 switch (*CommonBinOp) {
3483 case BinaryOperator::Add:
3484 case BinaryOperator::Mul:
3485 case BinaryOperator::Or:
3486 case BinaryOperator::And:
3487 case BinaryOperator::Xor: {
3488 auto *Op0 = BinOp->getOperand(0);
3489 auto *Op1 = BinOp->getOperand(1);
3490 PrevVecV[0] = Op0;
3491 PrevVecV[1] = Op1;
3492 break;
3493 }
3494 default:
3495 return false;
3496 }
3497 ShouldBeCallOrBinInst ^= 1;
3498
3499 OrigCost +=
3500 TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
3501
3502 if (!isa<ShuffleVectorInst>(PrevVecV[1]))
3503 std::swap(PrevVecV[0], PrevVecV[1]);
3504 InstWorklist.push(PrevVecV[1]);
3505 InstWorklist.push(PrevVecV[0]);
3506 } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
3507 // We shouldn't have any null values in the previous vectors,
3508 // if so, there was a mismatch in the pattern.
3509 if (ShouldBeCallOrBinInst ||
3510 any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
3511 return false;
3512
3513 if (SVInst != PrevVecV[1])
3514 return false;
3515
3516 ArrayRef<int> CurMask;
3517 if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
3518 m_Mask(CurMask))))
3519 return false;
3520
3521 // Subtract the parity mask when checking the condition.
3522 for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
3523 if (Mask < ShuffleMaskHalf &&
3524 CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
3525 return false;
3526 if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
3527 return false;
3528 }
3529
3530 // Update mask values.
3531 ShuffleMaskHalf *= 2;
3532 ShuffleMaskHalf -= (ExpectedParityMask & 1);
3533 ExpectedParityMask >>= 1;
3534
3535 OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
3536 SVInst->getType(), SVInst->getType(),
3537 CurMask, CostKind);
3538
3539 VisitedCnt += 1;
3540 if (!ExpectedParityMask && VisitedCnt == NumLevels)
3541 break;
3542
3543 ShouldBeCallOrBinInst ^= 1;
3544 } else {
3545 return false;
3546 }
3547 }
3548
3549 // Pattern should end with a shuffle op.
3550 if (ShouldBeCallOrBinInst)
3551 return false;
3552
3553 assert(VecSize != -1 && "Expected Match for Vector Size");
3554
3555 Value *FinalVecV = PrevVecV[0];
3556 if (!FinalVecV)
3557 return false;
3558
3559 auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
3560
3561 Intrinsic::ID ReducedOp =
3562 (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
3563 : getReductionForBinop(*CommonBinOp));
3564 if (!ReducedOp)
3565 return false;
3566
3567 IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
3568 InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
3569
3570 if (NewCost >= OrigCost)
3571 return false;
3572
3573 auto *ReducedResult =
3574 Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
3575 replaceValue(I, *ReducedResult);
3576
3577 return true;
3578}
3579
3580 /// Determine if it's more efficient to fold:
3581/// reduce(trunc(x)) -> trunc(reduce(x)).
3582/// reduce(sext(x)) -> sext(reduce(x)).
3583/// reduce(zext(x)) -> zext(reduce(x)).
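/// For illustration (hypothetical IR, not from this file):
///   %t = trunc <8 x i32> %x to <8 x i16>
///   %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %t)
/// is equivalent to
///   %w = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %x)
///   %r = trunc i32 %w to i16
/// because addition commutes with truncation modulo 2^16; the fold fires when
/// the wide reduction plus a scalar cast is cheaper than cast-then-reduce.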
3584bool VectorCombine::foldCastFromReductions(Instruction &I) {
3585 auto *II = dyn_cast<IntrinsicInst>(&I);
3586 if (!II)
3587 return false;
3588
3589 bool TruncOnly = false;
3590 Intrinsic::ID IID = II->getIntrinsicID();
3591 switch (IID) {
3592 case Intrinsic::vector_reduce_add:
3593 case Intrinsic::vector_reduce_mul:
3594 TruncOnly = true;
3595 break;
3596 case Intrinsic::vector_reduce_and:
3597 case Intrinsic::vector_reduce_or:
3598 case Intrinsic::vector_reduce_xor:
3599 break;
3600 default:
3601 return false;
3602 }
3603
3604 unsigned ReductionOpc = getArithmeticReductionInstruction(IID);
3605 Value *ReductionSrc = I.getOperand(0);
3606
3607 Value *Src;
3608 if (!match(ReductionSrc, m_OneUse(m_Trunc(m_Value(Src)))) &&
3609 (TruncOnly || !match(ReductionSrc, m_OneUse(m_ZExtOrSExt(m_Value(Src))))))
3610 return false;
3611
3612 auto CastOpc =
3613 (Instruction::CastOps)cast<Instruction>(ReductionSrc)->getOpcode();
3614
3615 auto *SrcTy = cast<VectorType>(Src->getType());
3616 auto *ReductionSrcTy = cast<VectorType>(ReductionSrc->getType());
3617 Type *ResultTy = I.getType();
3618
3619 InstructionCost OldCost = TTI.getArithmeticReductionCost(
3620 ReductionOpc, ReductionSrcTy, std::nullopt, CostKind);
3621 OldCost += TTI.getCastInstrCost(CastOpc, ReductionSrcTy, SrcTy,
3622 TTI::CastContextHint::None, CostKind,
3623 cast<CastInst>(ReductionSrc));
3624 InstructionCost NewCost =
3625 TTI.getArithmeticReductionCost(ReductionOpc, SrcTy, std::nullopt,
3626 CostKind) +
3627 TTI.getCastInstrCost(CastOpc, ResultTy, ReductionSrcTy->getScalarType(),
3628 TTI::CastContextHint::None, CostKind);
3629
3630 if (OldCost <= NewCost || !NewCost.isValid())
3631 return false;
3632
3633 Value *NewReduction = Builder.CreateIntrinsic(SrcTy->getScalarType(),
3634 II->getIntrinsicID(), {Src});
3635 Value *NewCast = Builder.CreateCast(CastOpc, NewReduction, ResultTy);
3636 replaceValue(I, *NewCast);
3637 return true;
3638}
3639
3640/// Returns true if this ShuffleVectorInst eventually feeds into a
3641/// vector reduction intrinsic (e.g., vector_reduce_add) by only following
3642/// chains of shuffles and binary operators (in any combination/order).
3643/// The search does not go deeper than the given Depth.
3644 static bool feedsIntoVectorReduction(ShuffleVectorInst *SVI) {
3645 constexpr unsigned MaxVisited = 32;
3646 SmallPtrSet<Instruction *, 8> Visited;
3647 SmallVector<Instruction *, 4> WorkList;
3648 bool FoundReduction = false;
3649
3650 WorkList.push_back(SVI);
3651 while (!WorkList.empty()) {
3652 Instruction *I = WorkList.pop_back_val();
3653 for (User *U : I->users()) {
3654 auto *UI = cast<Instruction>(U);
3655 if (!UI || !Visited.insert(UI).second)
3656 continue;
3657 if (Visited.size() > MaxVisited)
3658 return false;
3659 if (auto *II = dyn_cast<IntrinsicInst>(UI)) {
3660 // More than one reduction reached
3661 if (FoundReduction)
3662 return false;
3663 switch (II->getIntrinsicID()) {
3664 case Intrinsic::vector_reduce_add:
3665 case Intrinsic::vector_reduce_mul:
3666 case Intrinsic::vector_reduce_and:
3667 case Intrinsic::vector_reduce_or:
3668 case Intrinsic::vector_reduce_xor:
3669 case Intrinsic::vector_reduce_smin:
3670 case Intrinsic::vector_reduce_smax:
3671 case Intrinsic::vector_reduce_umin:
3672 case Intrinsic::vector_reduce_umax:
3673 FoundReduction = true;
3674 continue;
3675 default:
3676 return false;
3677 }
3678 }
3679
3680 if (!isa<BinaryOperator, ShuffleVectorInst>(UI))
3681 return false;
3682
3683 WorkList.emplace_back(UI);
3684 }
3685 }
3686 return FoundReduction;
3687}
3688
3689/// This method looks for groups of shuffles acting on binops, of the form:
3690/// %x = shuffle ...
3691/// %y = shuffle ...
3692/// %a = binop %x, %y
3693/// %b = binop %x, %y
3694/// shuffle %a, %b, selectmask
3695/// We may, especially if the shuffle is wider than legal, be able to convert
3696/// the shuffle to a form where only parts of a and b need to be computed. On
3697/// architectures with no obvious "select" shuffle, this can reduce the total
3698/// number of operations if the target reports them as cheaper.
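/// For illustration (hypothetical IR, not from this file):
///   %x = shufflevector <4 x i32> %p, <4 x i32> %q, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
///   %y = shufflevector <4 x i32> %p, <4 x i32> %q, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
///   %a = add <4 x i32> %x, %y
///   %b = mul <4 x i32> %x, %y
///   %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 6, i32 7>
/// Only the low half of %a and the high half of %b reach %r, so on targets that
/// split wide vectors the add and mul can be formed on narrower vectors instead.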
3699bool VectorCombine::foldSelectShuffle(Instruction &I, bool FromReduction) {
3700 auto *SVI = cast<ShuffleVectorInst>(&I);
3701 auto *VT = cast<FixedVectorType>(I.getType());
3702 auto *Op0 = dyn_cast<Instruction>(SVI->getOperand(0));
3703 auto *Op1 = dyn_cast<Instruction>(SVI->getOperand(1));
3704 if (!Op0 || !Op1 || Op0 == Op1 || !Op0->isBinaryOp() || !Op1->isBinaryOp() ||
3705 VT != Op0->getType())
3706 return false;
3707
3708 auto *SVI0A = dyn_cast<Instruction>(Op0->getOperand(0));
3709 auto *SVI0B = dyn_cast<Instruction>(Op0->getOperand(1));
3710 auto *SVI1A = dyn_cast<Instruction>(Op1->getOperand(0));
3711 auto *SVI1B = dyn_cast<Instruction>(Op1->getOperand(1));
3712 SmallPtrSet<Instruction *, 4> InputShuffles({SVI0A, SVI0B, SVI1A, SVI1B});
3713 auto checkSVNonOpUses = [&](Instruction *I) {
3714 if (!I || I->getOperand(0)->getType() != VT)
3715 return true;
3716 return any_of(I->users(), [&](User *U) {
3717 return U != Op0 && U != Op1 &&
3718 !(isa<ShuffleVectorInst>(U) &&
3719 (InputShuffles.contains(cast<Instruction>(U)) ||
3720 isInstructionTriviallyDead(cast<Instruction>(U))));
3721 });
3722 };
3723 if (checkSVNonOpUses(SVI0A) || checkSVNonOpUses(SVI0B) ||
3724 checkSVNonOpUses(SVI1A) || checkSVNonOpUses(SVI1B))
3725 return false;
3726
3727 // Collect all the uses that are shuffles that we can transform together. We
3728 // may not have a single shuffle, but a group that can all be transformed
3729 // together profitably.
3730 SmallVector<ShuffleVectorInst *> Shuffles;
3731 auto collectShuffles = [&](Instruction *I) {
3732 for (auto *U : I->users()) {
3733 auto *SV = dyn_cast<ShuffleVectorInst>(U);
3734 if (!SV || SV->getType() != VT)
3735 return false;
3736 if ((SV->getOperand(0) != Op0 && SV->getOperand(0) != Op1) ||
3737 (SV->getOperand(1) != Op0 && SV->getOperand(1) != Op1))
3738 return false;
3739 if (!llvm::is_contained(Shuffles, SV))
3740 Shuffles.push_back(SV);
3741 }
3742 return true;
3743 };
3744 if (!collectShuffles(Op0) || !collectShuffles(Op1))
3745 return false;
3746 // From a reduction, we need to be processing a single shuffle, otherwise the
3747 // other uses will not be lane-invariant.
3748 if (FromReduction && Shuffles.size() > 1)
3749 return false;
3750
3751 // Add any shuffle uses for the shuffles we have found, to include them in our
3752 // cost calculations.
3753 if (!FromReduction) {
3754 for (ShuffleVectorInst *SV : Shuffles) {
3755 for (auto *U : SV->users()) {
3756 ShuffleVectorInst *SSV = dyn_cast<ShuffleVectorInst>(U);
3757 if (SSV && isa<UndefValue>(SSV->getOperand(1)) && SSV->getType() == VT)
3758 Shuffles.push_back(SSV);
3759 }
3760 }
3761 }
3762
3763 // For each of the output shuffles, we try to sort all the first vector
3764 // elements to the beginning, followed by the second array elements at the
3765 // end. If the binops are legalized to smaller vectors, this may reduce total
3766 // number of binops. We compute the ReconstructMask mask needed to convert
3767 // back to the original lane order.
3768 SmallVector<std::pair<int, int>> V1, V2;
3769 SmallVector<SmallVector<int>> OrigReconstructMasks;
3770 int MaxV1Elt = 0, MaxV2Elt = 0;
3771 unsigned NumElts = VT->getNumElements();
3772 for (ShuffleVectorInst *SVN : Shuffles) {
3773 SmallVector<int> Mask;
3774 SVN->getShuffleMask(Mask);
3775
3776 // Check the operands are the same as the original, or reversed (in which
3777 // case we need to commute the mask).
3778 Value *SVOp0 = SVN->getOperand(0);
3779 Value *SVOp1 = SVN->getOperand(1);
3780 if (isa<UndefValue>(SVOp1)) {
3781 auto *SSV = cast<ShuffleVectorInst>(SVOp0);
3782 SVOp0 = SSV->getOperand(0);
3783 SVOp1 = SSV->getOperand(1);
3784 for (int &Elem : Mask) {
3785 if (Elem >= static_cast<int>(SSV->getShuffleMask().size()))
3786 return false;
3787 Elem = Elem < 0 ? Elem : SSV->getMaskValue(Elem);
3788 }
3789 }
3790 if (SVOp0 == Op1 && SVOp1 == Op0) {
3791 std::swap(SVOp0, SVOp1);
3792 ShuffleVectorInst::commuteShuffleMask(Mask, NumElts);
3793 }
3794 if (SVOp0 != Op0 || SVOp1 != Op1)
3795 return false;
3796
3797 // Calculate the reconstruction mask for this shuffle, as the mask needed to
3798 // take the packed values from Op0/Op1 and reconstructing to the original
3799 // order.
3800 SmallVector<int> ReconstructMask;
3801 for (unsigned I = 0; I < Mask.size(); I++) {
3802 if (Mask[I] < 0) {
3803 ReconstructMask.push_back(-1);
3804 } else if (Mask[I] < static_cast<int>(NumElts)) {
3805 MaxV1Elt = std::max(MaxV1Elt, Mask[I]);
3806 auto It = find_if(V1, [&](const std::pair<int, int> &A) {
3807 return Mask[I] == A.first;
3808 });
3809 if (It != V1.end())
3810 ReconstructMask.push_back(It - V1.begin());
3811 else {
3812 ReconstructMask.push_back(V1.size());
3813 V1.emplace_back(Mask[I], V1.size());
3814 }
3815 } else {
3816 MaxV2Elt = std::max<int>(MaxV2Elt, Mask[I] - NumElts);
3817 auto It = find_if(V2, [&](const std::pair<int, int> &A) {
3818 return Mask[I] - static_cast<int>(NumElts) == A.first;
3819 });
3820 if (It != V2.end())
3821 ReconstructMask.push_back(NumElts + It - V2.begin());
3822 else {
3823 ReconstructMask.push_back(NumElts + V2.size());
3824 V2.emplace_back(Mask[I] - NumElts, NumElts + V2.size());
3825 }
3826 }
3827 }
3828
3829 // For reductions, we know that the lane ordering out doesn't alter the
3830 // result. In-order can help simplify the shuffle away.
3831 if (FromReduction)
3832 sort(ReconstructMask);
3833 OrigReconstructMasks.push_back(std::move(ReconstructMask));
3834 }
3835
3836 // If the maximum elements used from V1 and V2 are not larger than the new
3837 // vectors, the vectors are already packed and performing the optimization
3838 // again will likely not help any further. This also prevents us from getting
3839 // stuck in a cycle in case the costs do not also rule it out.
3840 if (V1.empty() || V2.empty() ||
3841 (MaxV1Elt == static_cast<int>(V1.size()) - 1 &&
3842 MaxV2Elt == static_cast<int>(V2.size()) - 1))
3843 return false;
3844
3845 // GetBaseMaskValue takes one of the inputs, which may either be a shuffle, a
3846 // shuffle of another shuffle, or not a shuffle (that is treated like an
3847 // identity shuffle).
3848 auto GetBaseMaskValue = [&](Instruction *I, int M) {
3849 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3850 if (!SV)
3851 return M;
3852 if (isa<UndefValue>(SV->getOperand(1)))
3853 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
3854 if (InputShuffles.contains(SSV))
3855 return SSV->getMaskValue(SV->getMaskValue(M));
3856 return SV->getMaskValue(M);
3857 };
3858
3859 // Attempt to sort the inputs by ascending mask values to make simpler input
3860 // shuffles and push complex shuffles down to the uses. We sort on the first
3861 // of the two input shuffle orders, to try and get at least one input into a
3862 // nice order.
3863 auto SortBase = [&](Instruction *A, std::pair<int, int> X,
3864 std::pair<int, int> Y) {
3865 int MXA = GetBaseMaskValue(A, X.first);
3866 int MYA = GetBaseMaskValue(A, Y.first);
3867 return MXA < MYA;
3868 };
3869 stable_sort(V1, [&](std::pair<int, int> A, std::pair<int, int> B) {
3870 return SortBase(SVI0A, A, B);
3871 });
3872 stable_sort(V2, [&](std::pair<int, int> A, std::pair<int, int> B) {
3873 return SortBase(SVI1A, A, B);
3874 });
3875 // Calculate our ReconstructMasks from the OrigReconstructMasks and the
3876 // modified order of the input shuffles.
3877 SmallVector<SmallVector<int>> ReconstructMasks;
3878 for (const auto &Mask : OrigReconstructMasks) {
3879 SmallVector<int> ReconstructMask;
3880 for (int M : Mask) {
3881 auto FindIndex = [](const SmallVector<std::pair<int, int>> &V, int M) {
3882 auto It = find_if(V, [M](auto A) { return A.second == M; });
3883 assert(It != V.end() && "Expected all entries in Mask");
3884 return std::distance(V.begin(), It);
3885 };
3886 if (M < 0)
3887 ReconstructMask.push_back(-1);
3888 else if (M < static_cast<int>(NumElts)) {
3889 ReconstructMask.push_back(FindIndex(V1, M));
3890 } else {
3891 ReconstructMask.push_back(NumElts + FindIndex(V2, M));
3892 }
3893 }
3894 ReconstructMasks.push_back(std::move(ReconstructMask));
3895 }
3896
3897 // Calculate the masks needed for the new input shuffles, which get padded
3898 // with undef
3899 SmallVector<int> V1A, V1B, V2A, V2B;
3900 for (unsigned I = 0; I < V1.size(); I++) {
3901 V1A.push_back(GetBaseMaskValue(SVI0A, V1[I].first));
3902 V1B.push_back(GetBaseMaskValue(SVI0B, V1[I].first));
3903 }
3904 for (unsigned I = 0; I < V2.size(); I++) {
3905 V2A.push_back(GetBaseMaskValue(SVI1A, V2[I].first));
3906 V2B.push_back(GetBaseMaskValue(SVI1B, V2[I].first));
3907 }
3908 while (V1A.size() < NumElts) {
3909 V1A.push_back(PoisonMaskElem);
3910 V1B.push_back(PoisonMaskElem);
3911 }
3912 while (V2A.size() < NumElts) {
3913 V2A.push_back(PoisonMaskElem);
3914 V2B.push_back(PoisonMaskElem);
3915 }
3916
3917 auto AddShuffleCost = [&](InstructionCost C, Instruction *I) {
3918 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3919 if (!SV)
3920 return C;
3921 return C + TTI.getShuffleCost(isa<UndefValue>(SV->getOperand(1))
3922 ? TTI::SK_PermuteSingleSrc
3923 : TTI::SK_PermuteTwoSrc,
3924 VT, VT, SV->getShuffleMask(), CostKind);
3925 };
3926 auto AddShuffleMaskCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3927 return C +
3928 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3929 };
3930
3931 unsigned ElementSize = VT->getElementType()->getPrimitiveSizeInBits();
3932 unsigned MaxVectorSize =
3933 TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector).getFixedValue();
3934 unsigned MaxElementsInVector = MaxVectorSize / ElementSize;
3935 if (MaxElementsInVector == 0)
3936 return false;
3937 // When there are multiple shufflevector operations on the same input,
3938 // especially when the vector length is larger than the register size,
3939 // identical shuffle patterns may occur across different groups of elements.
3940 // To avoid overestimating the cost by counting these repeated shuffles more
3941 // than once, we only account for unique shuffle patterns. This adjustment
3942 // prevents inflated costs in the cost model for wide vectors split into
3943 // several register-sized groups.
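  // For illustration (hypothetical): on a target with 4-element registers, a
  // 16-element mask such as <0,1,2,3, 0,1,2,3, 0,1,2,3, 0,1,2,3> repeats one
  // sub-pattern in every group, so it is costed as a single unique sub-shuffle
  // rather than four separate ones.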
3944 std::set<SmallVector<int, 4>> UniqueShuffles;
3945 auto AddShuffleMaskAdjustedCost = [&](InstructionCost C, ArrayRef<int> Mask) {
3946 // Compute the cost for performing the shuffle over the full vector.
3947 auto ShuffleCost =
3948 TTI.getShuffleCost(TTI::SK_PermuteTwoSrc, VT, VT, Mask, CostKind);
3949 unsigned NumFullVectors = Mask.size() / MaxElementsInVector;
3950 if (NumFullVectors < 2)
3951 return C + ShuffleCost;
3952 SmallVector<int, 4> SubShuffle(MaxElementsInVector);
3953 unsigned NumUniqueGroups = 0;
3954 unsigned NumGroups = Mask.size() / MaxElementsInVector;
3955 // For each group of MaxElementsInVector contiguous elements,
3956 // collect their shuffle pattern and insert into the set of unique patterns.
3957 for (unsigned I = 0; I < NumFullVectors; ++I) {
3958 for (unsigned J = 0; J < MaxElementsInVector; ++J)
3959 SubShuffle[J] = Mask[MaxElementsInVector * I + J];
3960 if (UniqueShuffles.insert(SubShuffle).second)
3961 NumUniqueGroups += 1;
3962 }
3963 return C + ShuffleCost * NumUniqueGroups / NumGroups;
3964 };
3965 auto AddShuffleAdjustedCost = [&](InstructionCost C, Instruction *I) {
3966 auto *SV = dyn_cast<ShuffleVectorInst>(I);
3967 if (!SV)
3968 return C;
3969 SmallVector<int, 16> Mask;
3970 SV->getShuffleMask(Mask);
3971 return AddShuffleMaskAdjustedCost(C, Mask);
3972 };
3973 // Check that input consists of ShuffleVectors applied to the same input
3974 auto AllShufflesHaveSameOperands =
3975 [](SmallPtrSetImpl<Instruction *> &InputShuffles) {
3976 if (InputShuffles.size() < 2)
3977 return false;
3978 ShuffleVectorInst *FirstSV =
3979 dyn_cast<ShuffleVectorInst>(*InputShuffles.begin());
3980 if (!FirstSV)
3981 return false;
3982
3983 Value *In0 = FirstSV->getOperand(0), *In1 = FirstSV->getOperand(1);
3984 return std::all_of(
3985 std::next(InputShuffles.begin()), InputShuffles.end(),
3986 [&](Instruction *I) {
3987 ShuffleVectorInst *SV = dyn_cast<ShuffleVectorInst>(I);
3988 return SV && SV->getOperand(0) == In0 && SV->getOperand(1) == In1;
3989 });
3990 };
3991
3992 // Get the costs of the shuffles + binops before and after with the new
3993 // shuffle masks.
3994 InstructionCost CostBefore =
3995 TTI.getArithmeticInstrCost(Op0->getOpcode(), VT, CostKind) +
3996 TTI.getArithmeticInstrCost(Op1->getOpcode(), VT, CostKind);
3997 CostBefore += std::accumulate(Shuffles.begin(), Shuffles.end(),
3998 InstructionCost(0), AddShuffleCost);
3999 if (AllShufflesHaveSameOperands(InputShuffles)) {
4000 UniqueShuffles.clear();
4001 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4002 InstructionCost(0), AddShuffleAdjustedCost);
4003 } else {
4004 CostBefore += std::accumulate(InputShuffles.begin(), InputShuffles.end(),
4005 InstructionCost(0), AddShuffleCost);
4006 }
4007
4008 // The new binops will be unused for lanes past the used shuffle lengths.
4009 // These types attempt to get the correct cost for that from the target.
4010 FixedVectorType *Op0SmallVT =
4011 FixedVectorType::get(VT->getScalarType(), V1.size());
4012 FixedVectorType *Op1SmallVT =
4013 FixedVectorType::get(VT->getScalarType(), V2.size());
4014 InstructionCost CostAfter =
4015 TTI.getArithmeticInstrCost(Op0->getOpcode(), Op0SmallVT, CostKind) +
4016 TTI.getArithmeticInstrCost(Op1->getOpcode(), Op1SmallVT, CostKind);
4017 UniqueShuffles.clear();
4018 CostAfter += std::accumulate(ReconstructMasks.begin(), ReconstructMasks.end(),
4019 InstructionCost(0), AddShuffleMaskAdjustedCost);
4020 std::set<SmallVector<int>> OutputShuffleMasks({V1A, V1B, V2A, V2B});
4021 CostAfter +=
4022 std::accumulate(OutputShuffleMasks.begin(), OutputShuffleMasks.end(),
4023 InstructionCost(0), AddShuffleMaskCost);
4024
4025 LLVM_DEBUG(dbgs() << "Found a binop select shuffle pattern: " << I << "\n");
4026 LLVM_DEBUG(dbgs() << " CostBefore: " << CostBefore
4027 << " vs CostAfter: " << CostAfter << "\n");
4028 if (CostBefore < CostAfter ||
4029 (CostBefore == CostAfter && !feedsIntoVectorReduction(SVI)))
4030 return false;
4031
4032 // The cost model has passed, create the new instructions.
4033 auto GetShuffleOperand = [&](Instruction *I, unsigned Op) -> Value * {
4034 auto *SV = dyn_cast<ShuffleVectorInst>(I);
4035 if (!SV)
4036 return I;
4037 if (isa<UndefValue>(SV->getOperand(1)))
4038 if (auto *SSV = dyn_cast<ShuffleVectorInst>(SV->getOperand(0)))
4039 if (InputShuffles.contains(SSV))
4040 return SSV->getOperand(Op);
4041 return SV->getOperand(Op);
4042 };
4043 Builder.SetInsertPoint(*SVI0A->getInsertionPointAfterDef());
4044 Value *NSV0A = Builder.CreateShuffleVector(GetShuffleOperand(SVI0A, 0),
4045 GetShuffleOperand(SVI0A, 1), V1A);
4046 Builder.SetInsertPoint(*SVI0B->getInsertionPointAfterDef());
4047 Value *NSV0B = Builder.CreateShuffleVector(GetShuffleOperand(SVI0B, 0),
4048 GetShuffleOperand(SVI0B, 1), V1B);
4049 Builder.SetInsertPoint(*SVI1A->getInsertionPointAfterDef());
4050 Value *NSV1A = Builder.CreateShuffleVector(GetShuffleOperand(SVI1A, 0),
4051 GetShuffleOperand(SVI1A, 1), V2A);
4052 Builder.SetInsertPoint(*SVI1B->getInsertionPointAfterDef());
4053 Value *NSV1B = Builder.CreateShuffleVector(GetShuffleOperand(SVI1B, 0),
4054 GetShuffleOperand(SVI1B, 1), V2B);
4055 Builder.SetInsertPoint(Op0);
4056 Value *NOp0 = Builder.CreateBinOp((Instruction::BinaryOps)Op0->getOpcode(),
4057 NSV0A, NSV0B);
4058 if (auto *I = dyn_cast<Instruction>(NOp0))
4059 I->copyIRFlags(Op0, true);
4060 Builder.SetInsertPoint(Op1);
4061 Value *NOp1 = Builder.CreateBinOp((Instruction::BinaryOps)Op1->getOpcode(),
4062 NSV1A, NSV1B);
4063 if (auto *I = dyn_cast<Instruction>(NOp1))
4064 I->copyIRFlags(Op1, true);
4065
4066 for (int S = 0, E = ReconstructMasks.size(); S != E; S++) {
4067 Builder.SetInsertPoint(Shuffles[S]);
4068 Value *NSV = Builder.CreateShuffleVector(NOp0, NOp1, ReconstructMasks[S]);
4069 replaceValue(*Shuffles[S], *NSV, false);
4070 }
4071
4072 Worklist.pushValue(NSV0A);
4073 Worklist.pushValue(NSV0B);
4074 Worklist.pushValue(NSV1A);
4075 Worklist.pushValue(NSV1B);
4076 return true;
4077}
4078
4079/// Check if instruction depends on ZExt and this ZExt can be moved after the
4080/// instruction. Move ZExt if it is profitable. For example:
4081/// logic(zext(x),y) -> zext(logic(x,trunc(y)))
4082 /// lshr(zext(x),y) -> zext(lshr(x,trunc(y)))
4083 /// Cost model calculations take into account whether zext(x) has other users and
4084/// whether it can be propagated through them too.
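/// For illustration (hypothetical IR, not from this file):
///   %wide = zext <8 x i8> %x to <8 x i32>
///   %r    = and <8 x i32> %wide, splat (i32 15)
/// can be rewritten as
///   %n = and <8 x i8> %x, splat (i8 15)
///   %r = zext <8 x i8> %n to <8 x i32>
/// since the upper bits contributed by the zext are zero either way; the cost
/// model decides whether the narrow op plus the moved zext is cheaper.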
4085bool VectorCombine::shrinkType(Instruction &I) {
4086 Value *ZExted, *OtherOperand;
4087 if (!match(&I, m_c_BitwiseLogic(m_ZExt(m_Value(ZExted)),
4088 m_Value(OtherOperand))) &&
4089 !match(&I, m_LShr(m_ZExt(m_Value(ZExted)), m_Value(OtherOperand))))
4090 return false;
4091
4092 Value *ZExtOperand = I.getOperand(I.getOperand(0) == OtherOperand ? 1 : 0);
4093
4094 auto *BigTy = cast<FixedVectorType>(I.getType());
4095 auto *SmallTy = cast<FixedVectorType>(ZExted->getType());
4096 unsigned BW = SmallTy->getElementType()->getPrimitiveSizeInBits();
4097
4098 if (I.getOpcode() == Instruction::LShr) {
4099 // Check that the shift amount is less than the number of bits in the
4100 // smaller type. Otherwise, the smaller lshr will return a poison value.
4101 KnownBits ShAmtKB = computeKnownBits(I.getOperand(1), *DL);
4102 if (ShAmtKB.getMaxValue().uge(BW))
4103 return false;
4104 } else {
4105 // Check that the expression overall uses at most the same number of bits as
4106 // ZExted
4107 KnownBits KB = computeKnownBits(&I, *DL);
4108 if (KB.countMaxActiveBits() > BW)
4109 return false;
4110 }
4111
4112 // Calculate costs of leaving current IR as it is and moving ZExt operation
4113 // later, along with adding truncates if needed
4114 InstructionCost ZExtCost = TTI.getCastInstrCost(
4115 Instruction::ZExt, BigTy, SmallTy,
4116 TargetTransformInfo::CastContextHint::None, CostKind);
4117 InstructionCost CurrentCost = ZExtCost;
4118 InstructionCost ShrinkCost = 0;
4119
4120 // Calculate total cost and check that we can propagate through all ZExt users
4121 for (User *U : ZExtOperand->users()) {
4122 auto *UI = cast<Instruction>(U);
4123 if (UI == &I) {
4124 CurrentCost +=
4125 TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4126 ShrinkCost +=
4127 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4128 ShrinkCost += ZExtCost;
4129 continue;
4130 }
4131
4132 if (!Instruction::isBinaryOp(UI->getOpcode()))
4133 return false;
4134
4135 // Check if we can propagate ZExt through its other users
4136 KnownBits KB = computeKnownBits(UI, *DL);
4137 if (KB.countMaxActiveBits() > BW)
4138 return false;
4139
4140 CurrentCost += TTI.getArithmeticInstrCost(UI->getOpcode(), BigTy, CostKind);
4141 ShrinkCost +=
4142 TTI.getArithmeticInstrCost(UI->getOpcode(), SmallTy, CostKind);
4143 ShrinkCost += ZExtCost;
4144 }
4145
4146 // If the other instruction operand is not a constant, we'll need to
4147 // generate a truncate instruction. So we have to adjust cost
4148 if (!isa<Constant>(OtherOperand))
4149 ShrinkCost += TTI.getCastInstrCost(
4150 Instruction::Trunc, SmallTy, BigTy,
4151 TargetTransformInfo::CastContextHint::None, CostKind);
4152
4153 // If the cost of shrinking types and leaving the IR is the same, we'll lean
4154 // towards modifying the IR because shrinking opens opportunities for other
4155 // shrinking optimisations.
4156 if (ShrinkCost > CurrentCost)
4157 return false;
4158
4159 Builder.SetInsertPoint(&I);
4160 Value *Op0 = ZExted;
4161 Value *Op1 = Builder.CreateTrunc(OtherOperand, SmallTy);
4162 // Keep the order of operands the same
4163 if (I.getOperand(0) == OtherOperand)
4164 std::swap(Op0, Op1);
4165 Value *NewBinOp =
4166 Builder.CreateBinOp((Instruction::BinaryOps)I.getOpcode(), Op0, Op1);
4167 cast<Instruction>(NewBinOp)->copyIRFlags(&I);
4168 cast<Instruction>(NewBinOp)->copyMetadata(I);
4169 Value *NewZExtr = Builder.CreateZExt(NewBinOp, BigTy);
4170 replaceValue(I, *NewZExtr);
4171 return true;
4172}
4173
4174/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
4175/// shuffle (DstVec, SrcVec, Mask)
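/// For illustration (hypothetical IR, not from this file):
///   %e = extractelement <4 x float> %src, i64 2
///   %r = insertelement <4 x float> %dst, float %e, i64 0
/// becomes
///   %r = shufflevector <4 x float> %dst, <4 x float> %src, <4 x i32> <i32 6, i32 1, i32 2, i32 3>
/// where mask element 6 selects lane 2 of the second operand (%src) and the
/// remaining lanes keep %dst unchanged.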
4176bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
4177 Value *DstVec, *SrcVec;
4178 uint64_t ExtIdx, InsIdx;
4179 if (!match(&I,
4180 m_InsertElt(m_Value(DstVec),
4181 m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
4182 m_ConstantInt(InsIdx))))
4183 return false;
4184
4185 auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
4186 auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
4187 // We can try combining vectors with different element counts, as long as the scalar element types match.
4188 if (!DstVecTy || !SrcVecTy ||
4189 SrcVecTy->getElementType() != DstVecTy->getElementType())
4190 return false;
4191
4192 unsigned NumDstElts = DstVecTy->getNumElements();
4193 unsigned NumSrcElts = SrcVecTy->getNumElements();
4194 if (InsIdx >= NumDstElts || ExtIdx >= NumSrcElts || NumDstElts == 1)
4195 return false;
4196
4197 // Insertion into poison is a cheaper single operand shuffle.
4198 TargetTransformInfo::ShuffleKind SK = TargetTransformInfo::SK_PermuteTwoSrc;
4199 SmallVector<int> Mask(NumDstElts, PoisonMaskElem);
4200
4201 bool NeedExpOrNarrow = NumSrcElts != NumDstElts;
4202 bool IsExtIdxInBounds = ExtIdx < NumDstElts;
4203 bool NeedDstSrcSwap = isa<PoisonValue>(DstVec) && !isa<UndefValue>(SrcVec);
4204 if (NeedDstSrcSwap) {
4205 SK = TargetTransformInfo::SK_PermuteSingleSrc;
4206 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4207 Mask[InsIdx] = 0;
4208 else
4209 Mask[InsIdx] = ExtIdx;
4210 std::swap(DstVec, SrcVec);
4211 } else {
4213 std::iota(Mask.begin(), Mask.end(), 0);
4214 if (!IsExtIdxInBounds && NeedExpOrNarrow)
4215 Mask[InsIdx] = NumDstElts;
4216 else
4217 Mask[InsIdx] = ExtIdx + NumDstElts;
4218 }
4219
4220 // Cost
4221 auto *Ins = cast<InsertElementInst>(&I);
4222 auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
4223 InstructionCost InsCost =
4224 TTI.getVectorInstrCost(*Ins, DstVecTy, CostKind, InsIdx);
4225 InstructionCost ExtCost =
4226 TTI.getVectorInstrCost(*Ext, DstVecTy, CostKind, ExtIdx);
4227 InstructionCost OldCost = ExtCost + InsCost;
4228
4229 InstructionCost NewCost = 0;
4230 SmallVector<int> ExtToVecMask;
4231 if (!NeedExpOrNarrow) {
4232 // Ignore 'free' identity insertion shuffle.
4233 // TODO: getShuffleCost should return TCC_Free for Identity shuffles.
4234 if (!ShuffleVectorInst::isIdentityMask(Mask, NumSrcElts))
4235 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind, 0,
4236 nullptr, {DstVec, SrcVec});
4237 } else {
4238 // When creating length-changing-vector, always create with a Mask whose
4239 // first element has an ExtIdx, so that the first element of the vector
4240 // being created is always the target to be extracted.
4241 ExtToVecMask.assign(NumDstElts, PoisonMaskElem);
4242 if (IsExtIdxInBounds)
4243 ExtToVecMask[ExtIdx] = ExtIdx;
4244 else
4245 ExtToVecMask[0] = ExtIdx;
4246 // Add cost for expanding or narrowing
4247 NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
4248 DstVecTy, SrcVecTy, ExtToVecMask, CostKind);
4249 NewCost += TTI.getShuffleCost(SK, DstVecTy, DstVecTy, Mask, CostKind);
4250 }
4251
4252 if (!Ext->hasOneUse())
4253 NewCost += ExtCost;
4254
4255 LLVM_DEBUG(dbgs() << "Found a insert/extract shuffle-like pair: " << I
4256 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4257 << "\n");
4258
4259 if (OldCost < NewCost)
4260 return false;
4261
4262 if (NeedExpOrNarrow) {
4263 if (!NeedDstSrcSwap)
4264 SrcVec = Builder.CreateShuffleVector(SrcVec, ExtToVecMask);
4265 else
4266 DstVec = Builder.CreateShuffleVector(DstVec, ExtToVecMask);
4267 }
4268
4269 // Canonicalize undef param to RHS to help further folds.
4270 if (isa<UndefValue>(DstVec) && !isa<UndefValue>(SrcVec)) {
4271 ShuffleVectorInst::commuteShuffleMask(Mask, NumDstElts);
4272 std::swap(DstVec, SrcVec);
4273 }
4274
4275 Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
4276 replaceValue(I, *Shuf);
4277
4278 return true;
4279}
4280
4281/// If we're interleaving 2 constant splats, for instance `<vscale x 8 x i32>
4282/// <splat of 666>` and `<vscale x 8 x i32> <splat of 777>`, we can create a
4283/// larger splat `<vscale x 8 x i64> <splat of ((777 << 32) | 666)>` first
4284/// before casting it back into `<vscale x 16 x i32>`.
4285bool VectorCombine::foldInterleaveIntrinsics(Instruction &I) {
4286 const APInt *SplatVal0, *SplatVal1;
4287 if (!match(&I, m_Intrinsic<Intrinsic::vector_interleave2>(
4288 m_APInt(SplatVal0), m_APInt(SplatVal1))))
4289 return false;
4290
4291 LLVM_DEBUG(dbgs() << "VC: Folding interleave2 with two splats: " << I
4292 << "\n");
4293
4294 auto *VTy =
4295 cast<VectorType>(cast<IntrinsicInst>(I).getArgOperand(0)->getType());
4296 auto *ExtVTy = VectorType::getExtendedElementVectorType(VTy);
4297 unsigned Width = VTy->getElementType()->getIntegerBitWidth();
4298
4299 // Just in case the costs of the interleave2 intrinsic and the bitcast are
4300 // both invalid, in which case we want to bail out, we use <= rather
4301 // than < here. Even if they both have valid and equal costs, it's probably
4302 // not a good idea to emit a high-cost constant splat.
4303 if (TTI.getInstructionCost(&I, CostKind) <=
4304 TTI.getCastInstrCost(Instruction::BitCast, I.getType(), ExtVTy,
4305 TTI::CastContextHint::None, CostKind)) {
4306 LLVM_DEBUG(dbgs() << "VC: The cost to cast from " << *ExtVTy << " to "
4307 << *I.getType() << " is too high.\n");
4308 return false;
4309 }
4310
4311 APInt NewSplatVal = SplatVal1->zext(Width * 2);
4312 NewSplatVal <<= Width;
4313 NewSplatVal |= SplatVal0->zext(Width * 2);
4314 auto *NewSplat = ConstantVector::getSplat(
4315 ExtVTy->getElementCount(), ConstantInt::get(F.getContext(), NewSplatVal));
4316
4317 IRBuilder<> Builder(&I);
4318 replaceValue(I, *Builder.CreateBitCast(NewSplat, I.getType()));
4319 return true;
4320}
4321
4322// Attempt to shrink loads that are only used by shufflevector instructions.
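// For illustration (hypothetical IR, not from this file): if
//   %ld = load <8 x i32>, ptr %p, align 32
// is only used by
//   %s = shufflevector <8 x i32> %ld, <8 x i32> poison, <2 x i32> <i32 0, i32 1>
// then only the first two lanes are ever read, so a <2 x i32> load from %p can
// feed an equivalent shuffle when the cost model agrees.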
4323bool VectorCombine::shrinkLoadForShuffles(Instruction &I) {
4324 auto *OldLoad = dyn_cast<LoadInst>(&I);
4325 if (!OldLoad || !OldLoad->isSimple())
4326 return false;
4327
4328 auto *OldLoadTy = dyn_cast<FixedVectorType>(OldLoad->getType());
4329 if (!OldLoadTy)
4330 return false;
4331
4332 unsigned const OldNumElements = OldLoadTy->getNumElements();
4333
4334 // Search all uses of load. If all uses are shufflevector instructions, and
4335 // the second operands are all poison values, find the minimum and maximum
4336 // indices of the vector elements referenced by all shuffle masks.
4337 // Otherwise return `std::nullopt`.
4338 using IndexRange = std::pair<int, int>;
4339 auto GetIndexRangeInShuffles = [&]() -> std::optional<IndexRange> {
4340 IndexRange OutputRange = IndexRange(OldNumElements, -1);
4341 for (llvm::Use &Use : I.uses()) {
4342 // Ensure all uses match the required pattern.
4343 User *Shuffle = Use.getUser();
4344 ArrayRef<int> Mask;
4345
4346 if (!match(Shuffle,
4347 m_Shuffle(m_Specific(OldLoad), m_Undef(), m_Mask(Mask))))
4348 return std::nullopt;
4349
4350 // Ignore shufflevector instructions that have no uses.
4351 if (Shuffle->use_empty())
4352 continue;
4353
4354 // Find the min and max indices used by the shufflevector instruction.
4355 for (int Index : Mask) {
4356 if (Index >= 0 && Index < static_cast<int>(OldNumElements)) {
4357 OutputRange.first = std::min(Index, OutputRange.first);
4358 OutputRange.second = std::max(Index, OutputRange.second);
4359 }
4360 }
4361 }
4362
4363 if (OutputRange.second < OutputRange.first)
4364 return std::nullopt;
4365
4366 return OutputRange;
4367 };
4368
4369 // Get the range of vector elements used by shufflevector instructions.
4370 if (std::optional<IndexRange> Indices = GetIndexRangeInShuffles()) {
4371 unsigned const NewNumElements = Indices->second + 1u;
4372
4373 // If the range of vector elements is smaller than the full load, attempt
4374 // to create a smaller load.
4375 if (NewNumElements < OldNumElements) {
4376 IRBuilder Builder(&I);
4377 Builder.SetCurrentDebugLocation(I.getDebugLoc());
4378
4379 // Calculate costs of old and new ops.
4380 Type *ElemTy = OldLoadTy->getElementType();
4381 FixedVectorType *NewLoadTy = FixedVectorType::get(ElemTy, NewNumElements);
4382 Value *PtrOp = OldLoad->getPointerOperand();
4383
4384 InstructionCost OldCost = TTI.getMemoryOpCost(
4385 Instruction::Load, OldLoad->getType(), OldLoad->getAlign(),
4386 OldLoad->getPointerAddressSpace(), CostKind);
4387 InstructionCost NewCost =
4388 TTI.getMemoryOpCost(Instruction::Load, NewLoadTy, OldLoad->getAlign(),
4389 OldLoad->getPointerAddressSpace(), CostKind);
4390
4391 using UseEntry = std::pair<ShuffleVectorInst *, std::vector<int>>;
4392 SmallVector<UseEntry, 4u> NewUses;
4393 unsigned const MaxIndex = NewNumElements * 2u;
4394
4395 for (llvm::Use &Use : I.uses()) {
4396 auto *Shuffle = cast<ShuffleVectorInst>(Use.getUser());
4397 ArrayRef<int> OldMask = Shuffle->getShuffleMask();
4398
4399 // Create entry for new use.
4400 NewUses.push_back({Shuffle, OldMask});
4401
4402 // Validate mask indices.
4403 for (int Index : OldMask) {
4404 if (Index >= static_cast<int>(MaxIndex))
4405 return false;
4406 }
4407
4408 // Update costs.
4409 OldCost +=
4410 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4411 OldLoadTy, OldMask, CostKind);
4412 NewCost +=
4413 TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, Shuffle->getType(),
4414 NewLoadTy, OldMask, CostKind);
4415 }
4416
4417 LLVM_DEBUG(
4418 dbgs() << "Found a load used only by shufflevector instructions: "
4419 << I << "\n OldCost: " << OldCost
4420 << " vs NewCost: " << NewCost << "\n");
4421
4422 if (OldCost < NewCost || !NewCost.isValid())
4423 return false;
4424
4425 // Create new load of smaller vector.
4426 auto *NewLoad = cast<LoadInst>(
4427 Builder.CreateAlignedLoad(NewLoadTy, PtrOp, OldLoad->getAlign()));
4428 NewLoad->copyMetadata(I);
4429
4430 // Replace all uses.
4431 for (UseEntry &Use : NewUses) {
4432 ShuffleVectorInst *Shuffle = Use.first;
4433 std::vector<int> &NewMask = Use.second;
4434
4435 Builder.SetInsertPoint(Shuffle);
4436 Builder.SetCurrentDebugLocation(Shuffle->getDebugLoc());
4437 Value *NewShuffle = Builder.CreateShuffleVector(
4438 NewLoad, PoisonValue::get(NewLoadTy), NewMask);
4439
4440 replaceValue(*Shuffle, *NewShuffle, false);
4441 }
4442
4443 return true;
4444 }
4445 }
4446 return false;
4447}
4448
4449// Attempt to narrow a phi of shufflevector instructions where the two incoming
4450// values have the same operands but different masks. If the two shuffle masks
4451// are offsets of one another we can use one branch to rotate the incoming
4452// vector and perform one larger shuffle after the phi.
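// For illustration (hypothetical): suppose a <4 x i32> %v reaches the phi
// through two widening shuffles, one whose mask starts <2, 3, ...> and one
// whose mask starts <0, 1, ...>, with poison elsewhere. The defined mask
// elements differ by a constant offset of 2, so one incoming edge can rotate
// %v by two lanes instead, the phi stays at the narrow <4 x i32> type, and a
// single shuffle with the second mask after the phi reproduces both values.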
4453bool VectorCombine::shrinkPhiOfShuffles(Instruction &I) {
4454 auto *Phi = dyn_cast<PHINode>(&I);
4455 if (!Phi || Phi->getNumIncomingValues() != 2u)
4456 return false;
4457
4458 Value *Op = nullptr;
4459 ArrayRef<int> Mask0;
4460 ArrayRef<int> Mask1;
4461
4462 if (!match(Phi->getOperand(0u),
4463 m_OneUse(m_Shuffle(m_Value(Op), m_Poison(), m_Mask(Mask0)))) ||
4464 !match(Phi->getOperand(1u),
4465 m_OneUse(m_Shuffle(m_Specific(Op), m_Poison(), m_Mask(Mask1)))))
4466 return false;
4467
4468 auto *Shuf = cast<ShuffleVectorInst>(Phi->getOperand(0u));
4469
4470 // Ensure result vectors are wider than the argument vector.
4471 auto *InputVT = cast<FixedVectorType>(Op->getType());
4472 auto *ResultVT = cast<FixedVectorType>(Shuf->getType());
4473 auto const InputNumElements = InputVT->getNumElements();
4474
4475 if (InputNumElements >= ResultVT->getNumElements())
4476 return false;
4477
4478 // Take the difference of the two shuffle masks at each index. Ignore poison
4479 // values at the same index in both masks.
4480 SmallVector<int, 16> NewMask;
4481 NewMask.reserve(Mask0.size());
4482
4483 for (auto [M0, M1] : zip(Mask0, Mask1)) {
4484 if (M0 >= 0 && M1 >= 0)
4485 NewMask.push_back(M0 - M1);
4486 else if (M0 == -1 && M1 == -1)
4487 continue;
4488 else
4489 return false;
4490 }
4491
4492 // Ensure all elements of the new mask are equal. If the difference between
4493 // the incoming mask elements is the same, the two must be constant offsets
4494 // of one another.
4495 if (NewMask.empty() || !all_equal(NewMask))
4496 return false;
4497
4498 // Create new mask using difference of the two incoming masks.
4499 int MaskOffset = NewMask[0u];
4500 unsigned Index = (InputNumElements + MaskOffset) % InputNumElements;
4501 NewMask.clear();
4502
4503 for (unsigned I = 0u; I < InputNumElements; ++I) {
4504 NewMask.push_back(Index);
4505 Index = (Index + 1u) % InputNumElements;
4506 }
4507
4508 // Calculate costs for worst cases and compare.
4509 auto const Kind = TTI::SK_PermuteSingleSrc;
4510 auto OldCost =
4511 std::max(TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask0, CostKind),
4512 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind));
4513 auto NewCost = TTI.getShuffleCost(Kind, InputVT, InputVT, NewMask, CostKind) +
4514 TTI.getShuffleCost(Kind, ResultVT, InputVT, Mask1, CostKind);
4515
4516 LLVM_DEBUG(dbgs() << "Found a phi of mergeable shuffles: " << I
4517 << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
4518 << "\n");
4519
4520 if (NewCost > OldCost)
4521 return false;
4522
4523 // Create new shuffles and narrowed phi.
4524 auto Builder = IRBuilder(Shuf);
4525 Builder.SetCurrentDebugLocation(Shuf->getDebugLoc());
4526 auto *PoisonVal = PoisonValue::get(InputVT);
4527 auto *NewShuf0 = Builder.CreateShuffleVector(Op, PoisonVal, NewMask);
4528 Worklist.push(cast<Instruction>(NewShuf0));
4529
4530 Builder.SetInsertPoint(Phi);
4531 Builder.SetCurrentDebugLocation(Phi->getDebugLoc());
4532 auto *NewPhi = Builder.CreatePHI(NewShuf0->getType(), 2u);
4533 NewPhi->addIncoming(NewShuf0, Phi->getIncomingBlock(0u));
4534 NewPhi->addIncoming(Op, Phi->getIncomingBlock(1u));
4535
4536 Builder.SetInsertPoint(*NewPhi->getInsertionPointAfterDef());
4537 PoisonVal = PoisonValue::get(NewPhi->getType());
4538 auto *NewShuf1 = Builder.CreateShuffleVector(NewPhi, PoisonVal, Mask1);
4539
4540 replaceValue(*Phi, *NewShuf1);
4541 return true;
4542}
4543
4544/// This is the entry point for all transforms. Pass manager differences are
4545/// handled in the callers of this function.
4546bool VectorCombine::run() {
4547 if (DisableVectorCombine)
4548 return false;
4549
4550 // Don't attempt vectorization if the target does not support vectors.
4551 if (!TTI.getNumberOfRegisters(TTI.getRegisterClassForType(/*Vector*/ true)))
4552 return false;
4553
4554 LLVM_DEBUG(dbgs() << "\n\nVECTORCOMBINE on " << F.getName() << "\n");
4555
4556 auto FoldInst = [this](Instruction &I) {
4557 Builder.SetInsertPoint(&I);
4558 bool IsVectorType = isa<VectorType>(I.getType());
4559 bool IsFixedVectorType = isa<FixedVectorType>(I.getType());
4560 auto Opcode = I.getOpcode();
4561
4562 LLVM_DEBUG(dbgs() << "VC: Visiting: " << I << '\n');
4563
4564 // These folds should be beneficial regardless of when this pass is run
4565 // in the optimization pipeline.
4566 // The type checking is for run-time efficiency. We can avoid wasting time
4567 // dispatching to folding functions if there's no chance of matching.
4568 if (IsFixedVectorType) {
4569 switch (Opcode) {
4570 case Instruction::InsertElement:
4571 if (vectorizeLoadInsert(I))
4572 return true;
4573 break;
4574 case Instruction::ShuffleVector:
4575 if (widenSubvectorLoad(I))
4576 return true;
4577 break;
4578 default:
4579 break;
4580 }
4581 }
4582
4583 // This transform works with scalable and fixed vectors
4584 // TODO: Identify and allow other scalable transforms
4585 if (IsVectorType) {
4586 if (scalarizeOpOrCmp(I))
4587 return true;
4588 if (scalarizeLoadExtract(I))
4589 return true;
4590 if (scalarizeExtExtract(I))
4591 return true;
4592 if (scalarizeVPIntrinsic(I))
4593 return true;
4594 if (foldInterleaveIntrinsics(I))
4595 return true;
4596 }
4597
4598 if (Opcode == Instruction::Store)
4599 if (foldSingleElementStore(I))
4600 return true;
4601
4602 // If this is an early pipeline invocation of this pass, we are done.
4603 if (TryEarlyFoldsOnly)
4604 return false;
4605
4606 // Otherwise, try folds that improve codegen but may interfere with
4607 // early IR canonicalizations.
4608 // The type checking is for run-time efficiency. We can avoid wasting time
4609 // dispatching to folding functions if there's no chance of matching.
4610 if (IsFixedVectorType) {
4611 switch (Opcode) {
4612 case Instruction::InsertElement:
4613 if (foldInsExtFNeg(I))
4614 return true;
4615 if (foldInsExtBinop(I))
4616 return true;
4617 if (foldInsExtVectorToShuffle(I))
4618 return true;
4619 break;
4620 case Instruction::ShuffleVector:
4621 if (foldPermuteOfBinops(I))
4622 return true;
4623 if (foldShuffleOfBinops(I))
4624 return true;
4625 if (foldShuffleOfSelects(I))
4626 return true;
4627 if (foldShuffleOfCastops(I))
4628 return true;
4629 if (foldShuffleOfShuffles(I))
4630 return true;
4631 if (foldShuffleOfIntrinsics(I))
4632 return true;
4633 if (foldSelectShuffle(I))
4634 return true;
4635 if (foldShuffleToIdentity(I))
4636 return true;
4637 break;
4638 case Instruction::Load:
4639 if (shrinkLoadForShuffles(I))
4640 return true;
4641 break;
4642 case Instruction::BitCast:
4643 if (foldBitcastShuffle(I))
4644 return true;
4645 break;
4646 case Instruction::And:
4647 case Instruction::Or:
4648 case Instruction::Xor:
4649 if (foldBitOpOfCastops(I))
4650 return true;
4651 if (foldBitOpOfCastConstant(I))
4652 return true;
4653 break;
4654 case Instruction::PHI:
4655 if (shrinkPhiOfShuffles(I))
4656 return true;
4657 break;
4658 default:
4659 if (shrinkType(I))
4660 return true;
4661 break;
4662 }
4663 } else {
4664 switch (Opcode) {
4665 case Instruction::Call:
4666 if (foldShuffleFromReductions(I))
4667 return true;
4668 if (foldCastFromReductions(I))
4669 return true;
4670 break;
4671 case Instruction::ExtractElement:
4672 if (foldShuffleChainsToReduce(I))
4673 return true;
4674 break;
4675 case Instruction::ICmp:
4676 case Instruction::FCmp:
4677 if (foldExtractExtract(I))
4678 return true;
4679 break;
4680 case Instruction::Or:
4681 if (foldConcatOfBoolMasks(I))
4682 return true;
4683 [[fallthrough]];
4684 default:
4685 if (Instruction::isBinaryOp(Opcode)) {
4686 if (foldExtractExtract(I))
4687 return true;
4688 if (foldExtractedCmps(I))
4689 return true;
4690 if (foldBinopOfReductions(I))
4691 return true;
4692 }
4693 break;
4694 }
4695 }
4696 return false;
4697 };
4698
4699 bool MadeChange = false;
4700 for (BasicBlock &BB : F) {
4701 // Ignore unreachable basic blocks.
4702 if (!DT.isReachableFromEntry(&BB))
4703 continue;
4704 // We need to erase instructions while iterating over the block, but
4705 // make_early_inc_range is not applicable here, as the next iterator may
4706 // be invalidated by RecursivelyDeleteTriviallyDeadInstructions.
4707 // Instead, we manually maintain the next instruction (NextInst) and
4708 // update it when it is about to be deleted.
4709 Instruction *I = &BB.front();
4710 while (I) {
4711 NextInst = I->getNextNode();
4712 if (!I->isDebugOrPseudoInst())
4713 MadeChange |= FoldInst(*I);
4714 I = NextInst;
4715 }
4716 }
4717
4718 NextInst = nullptr;
4719
4720 while (!Worklist.isEmpty()) {
4721 Instruction *I = Worklist.removeOne();
4722 if (!I)
4723 continue;
4724
4725 if (isInstructionTriviallyDead(I)) {
4726 eraseInstruction(*I);
4727 continue;
4728 }
4729
4730 MadeChange |= FoldInst(*I);
4731 }
4732
4733 return MadeChange;
4734}
4735
4736PreservedAnalyses VectorCombinePass::run(Function &F,
4737 FunctionAnalysisManager &FAM) {
4738 auto &AC = FAM.getResult<AssumptionAnalysis>(F);
4739 TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
4740 DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
4741 AAResults &AA = FAM.getResult<AAManager>(F);
4742 const DataLayout *DL = &F.getDataLayout();
4743 VectorCombine Combiner(F, TTI, DT, AA, AC, DL, TTI::TCK_RecipThroughput,
4744 TryEarlyFoldsOnly);
4745 if (!Combiner.run())
4746 return PreservedAnalyses::all();
4747 PreservedAnalyses PA;
4748 PA.preserveSet<CFGAnalyses>();
4749 return PA;
4750}
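
Usage note (not part of the file above): the pass is exposed as VectorCombinePass and can be scheduled like any other new-pass-manager function pass. The sketch below is a minimal, illustrative example of driving it programmatically; the helper name runVectorCombineOnFunction is hypothetical, and without a TargetMachine the PassBuilder registers a default TargetTransformInfo, so cost decisions will not reflect a real target.

// Minimal sketch (assumption: standard new-pass-manager setup); the helper
// name runVectorCombineOnFunction is illustrative, not an LLVM API.
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Transforms/Vectorize/VectorCombine.h"

using namespace llvm;

static void runVectorCombineOnFunction(Function &F) {
  PassBuilder PB; // Pass a TargetMachine* here to get a real cost model.
  LoopAnalysisManager LAM;
  FunctionAnalysisManager FAM;
  CGSCCAnalysisManager CGAM;
  ModuleAnalysisManager MAM;

  // Register the analyses VectorCombinePass::run requests from the
  // FunctionAnalysisManager (AssumptionAnalysis, TargetIRAnalysis,
  // DominatorTreeAnalysis, AAManager).
  PB.registerModuleAnalyses(MAM);
  PB.registerCGSCCAnalyses(CGAM);
  PB.registerFunctionAnalyses(FAM);
  PB.registerLoopAnalyses(LAM);
  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

  FunctionPassManager FPM;
  // TryEarlyFoldsOnly=false exercises the full fold set dispatched above;
  // true restricts it to the early-pipeline folds.
  FPM.addPass(VectorCombinePass(/*TryEarlyFoldsOnly=*/false));
  FPM.run(F, FAM);
}

From the command line, the same transforms can be exercised with opt -passes=vector-combine; the cl::opt flags declared in this file (for example -disable-vector-combine and -vector-combine-max-scan-instrs) adjust its behavior.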