LLVM 23.0.0git
AggressiveInstCombine.cpp
Go to the documentation of this file.
1//===- AggressiveInstCombine.cpp ------------------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file implements the aggressive expression pattern combiner classes.
10// Currently, it handles expression patterns for:
11// * Truncate instruction
12//
13//===----------------------------------------------------------------------===//
14
17#include "llvm/ADT/Statistic.h"
27#include "llvm/IR/DataLayout.h"
28#include "llvm/IR/Dominators.h"
29#include "llvm/IR/Function.h"
30#include "llvm/IR/IRBuilder.h"
31#include "llvm/IR/Instruction.h"
32#include "llvm/IR/MDBuilder.h"
40
41using namespace llvm;
42using namespace PatternMatch;
43
44#define DEBUG_TYPE "aggressive-instcombine"
45
46namespace llvm {
// NOTE(review): the declaration that lived inside this namespace (original
// line 47) was lost in extraction -- recover from upstream before editing.
48}
49
// Statistics reported under -stats; incremented by the folds below.
50STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
51STATISTIC(NumGuardedRotates,
52 "Number of guarded rotates transformed into funnel shifts");
53STATISTIC(NumGuardedFunnelShifts,
54 "Number of guarded funnel shifts transformed into funnel shifts");
55STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");
56
// NOTE(review): the cl::opt declaration heads (original lines 57, 61, 66 --
// presumably "static cl::opt<...> Name(") are missing from this view; only
// the option-string/init/desc argument lines survive. TODO confirm upstream.
58 "aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden,
59 cl::desc("Max number of instructions to scan for aggressive instcombine."));
60
62 "strncmp-inline-threshold", cl::init(3), cl::Hidden,
63 cl::desc("The maximum length of a constant string for a builtin string cmp "
64 "call eligible for inlining. The default value is 3."));
65
67 MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden,
68 cl::desc("The maximum length of a constant string to "
69 "inline a memchr call."));
70
71/// Match a pattern for a bitwise funnel/rotate operation that partially guards
71/// against undefined behavior by branching around the funnel-shift/rotation
73/// when the shift amount is 0.
// NOTE(review): the function signature (original line 74) is missing from
// this extraction -- the body reads an Instruction &I and a DominatorTree DT,
// so presumably "static bool ...(Instruction &I, const DominatorTree &DT) {".
// TODO confirm against upstream.
75 if (I.getOpcode() != Instruction::PHI || I.getNumOperands() != 2)
76 return false;
77
78 // As with the one-use checks below, this is not strictly necessary, but we
79 // are being cautious to avoid potential perf regressions on targets that
80 // do not actually have a funnel/rotate instruction (where the funnel shift
81 // would be expanded back into math/shift/logic ops).
82 if (!isPowerOf2_32(I.getType()->getScalarSizeInBits()))
83 return false;
84
85 // Match V to funnel shift left/right and capture the source operands and
86 // shift amount.
87 auto matchFunnelShift = [](Value *V, Value *&ShVal0, Value *&ShVal1,
88 Value *&ShAmt) {
89 unsigned Width = V->getType()->getScalarSizeInBits();
90
91 // fshl(ShVal0, ShVal1, ShAmt)
92 // == (ShVal0 << ShAmt) | (ShVal1 >> (Width -ShAmt))
93 if (match(V, m_OneUse(m_c_Or(
94 m_Shl(m_Value(ShVal0), m_Value(ShAmt)),
95 m_LShr(m_Value(ShVal1), m_Sub(m_SpecificInt(Width),
96 m_Deferred(ShAmt))))))) {
97 return Intrinsic::fshl;
98 }
99
100 // fshr(ShVal0, ShVal1, ShAmt)
101 // == (ShVal0 >> ShAmt) | (ShVal1 << (Width - ShAmt))
// NOTE(review): original line 103 (the start of the m_c_Or/m_Shl match for
// the fshr form) is missing here; the match expression below is truncated.
102 if (match(V,
104 m_Value(ShAmt))),
105 m_LShr(m_Value(ShVal1), m_Deferred(ShAmt)))))) {
106 return Intrinsic::fshr;
107 }
108
// NOTE(review): original line 109 (presumably "return
// Intrinsic::not_intrinsic;" -- the lambda's fall-through result, which the
// callers below test for) is missing from this view.
110 };
111
112 // One phi operand must be a funnel/rotate operation, and the other phi
113 // operand must be the source value of that funnel/rotate operation:
114 // phi [ rotate(RotSrc, ShAmt), FunnelBB ], [ RotSrc, GuardBB ]
115 // phi [ fshl(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal0, GuardBB ]
116 // phi [ fshr(ShVal0, ShVal1, ShAmt), FunnelBB ], [ ShVal1, GuardBB ]
117 PHINode &Phi = cast<PHINode>(I);
118 unsigned FunnelOp = 0, GuardOp = 1;
119 Value *P0 = Phi.getOperand(0), *P1 = Phi.getOperand(1);
120 Value *ShVal0, *ShVal1, *ShAmt;
// Try operand 0 as the shifted form first; if it does not match (or its
// source operand is not the other phi input), retry with the roles swapped.
121 Intrinsic::ID IID = matchFunnelShift(P0, ShVal0, ShVal1, ShAmt);
122 if (IID == Intrinsic::not_intrinsic ||
123 (IID == Intrinsic::fshl && ShVal0 != P1) ||
124 (IID == Intrinsic::fshr && ShVal1 != P1)) {
125 IID = matchFunnelShift(P1, ShVal0, ShVal1, ShAmt);
126 if (IID == Intrinsic::not_intrinsic ||
127 (IID == Intrinsic::fshl && ShVal0 != P0) ||
128 (IID == Intrinsic::fshr && ShVal1 != P0))
129 return false;
130 assert((IID == Intrinsic::fshl || IID == Intrinsic::fshr) &&
131 "Pattern must match funnel shift left or right");
132 std::swap(FunnelOp, GuardOp);
133 }
134
135 // The incoming block with our source operand must be the "guard" block.
136 // That must contain a cmp+branch to avoid the funnel/rotate when the shift
137 // amount is equal to 0. The other incoming block is the block with the
138 // funnel/rotate.
139 BasicBlock *GuardBB = Phi.getIncomingBlock(GuardOp);
140 BasicBlock *FunnelBB = Phi.getIncomingBlock(FunnelOp);
141 Instruction *TermI = GuardBB->getTerminator();
142
143 // Ensure that the shift values dominate each block.
144 if (!DT.dominates(ShVal0, TermI) || !DT.dominates(ShVal1, TermI))
145 return false;
146
147 BasicBlock *PhiBB = Phi.getParent();
// NOTE(review): original line 148 (the start of the match on TermI --
// presumably a conditional-branch-on-icmp-eq-zero pattern whose arguments
// continue on the two lines below) is missing from this view.
149 m_ZeroInt()),
150 m_SpecificBB(PhiBB), m_SpecificBB(FunnelBB))))
151 return false;
152
153 IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
154
155 if (ShVal0 == ShVal1)
156 ++NumGuardedRotates;
157 else
158 ++NumGuardedFunnelShifts;
159
160 // If this is not a rotate then the select was blocking poison from the
161 // 'shift-by-zero' non-TVal, but a funnel shift won't - so freeze it.
162 bool IsFshl = IID == Intrinsic::fshl;
163 if (ShVal0 != ShVal1) {
164 if (IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal1))
165 ShVal1 = Builder.CreateFreeze(ShVal1);
166 else if (!IsFshl && !llvm::isGuaranteedNotToBePoison(ShVal0))
167 ShVal0 = Builder.CreateFreeze(ShVal0);
168 }
169
170 // We matched a variation of this IR pattern:
171 // GuardBB:
172 // %cmp = icmp eq i32 %ShAmt, 0
173 // br i1 %cmp, label %PhiBB, label %FunnelBB
174 // FunnelBB:
175 // %sub = sub i32 32, %ShAmt
176 // %shr = lshr i32 %ShVal1, %sub
177 // %shl = shl i32 %ShVal0, %ShAmt
178 // %fsh = or i32 %shr, %shl
179 // br label %PhiBB
180 // PhiBB:
181 // %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
182 // -->
183 // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
184 Phi.replaceAllUsesWith(
185 Builder.CreateIntrinsic(IID, Phi.getType(), {ShVal0, ShVal1, ShAmt}));
186 return true;
187}
188
189/// This is used by foldAnyOrAllBitsSet() to capture a source value (Root) and
190/// the bit indexes (Mask) needed by a masked compare. If we're matching a chain
191/// of 'and' ops, then we also need to capture the fact that we saw an
192/// "and X, 1", so that's an extra return value for that case.
193namespace {
194struct MaskOps {
// Common source value shared by every leaf of the matched chain; set by the
// first leaf matchAndOrChain() visits.
195 Value *Root = nullptr;
// Accumulated mask: bit N is set when the chain tests bit N of Root.
196 APInt Mask;
// True => walk an 'and' chain (all-bits-set); false => an 'or' chain.
197 bool MatchAndChain;
// Set when an "and X, 1" was seen somewhere in an 'and' chain (required to
// know the high bits are cleared).
198 bool FoundAnd1 = false;
199
200 MaskOps(unsigned BitWidth, bool MatchAnds)
201 : Mask(APInt::getZero(BitWidth)), MatchAndChain(MatchAnds) {}
202};
203} // namespace
204
205/// This is a recursive helper for foldAnyOrAllBitsSet() that walks through a
206/// chain of 'and' or 'or' instructions looking for shift ops of a common source
207/// value. Examples:
208/// or (or (or X, (X >> 3)), (X >> 5)), (X >> 8)
209/// returns { X, 0x129 }
210/// and (and (X >> 1), 1), (X >> 4)
211/// returns { X, 0x12 }
212static bool matchAndOrChain(Value *V, MaskOps &MOps) {
213 Value *Op0, *Op1;
214 if (MOps.MatchAndChain) {
215 // Recurse through a chain of 'and' operands. This requires an extra check
216 // vs. the 'or' matcher: we must find an "and X, 1" instruction somewhere
217 // in the chain to know that all of the high bits are cleared.
218 if (match(V, m_And(m_Value(Op0), m_One()))) {
219 MOps.FoundAnd1 = true;
220 return matchAndOrChain(Op0, MOps);
221 }
222 if (match(V, m_And(m_Value(Op0), m_Value(Op1))))
223 return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
224 } else {
225 // Recurse through a chain of 'or' operands.
226 if (match(V, m_Or(m_Value(Op0), m_Value(Op1))))
227 return matchAndOrChain(Op0, MOps) && matchAndOrChain(Op1, MOps);
228 }
229
230 // We need a shift-right or a bare value representing a compare of bit 0 of
231 // the original source operand.
232 Value *Candidate;
233 const APInt *BitIndex = nullptr;
234 if (!match(V, m_LShr(m_Value(Candidate), m_APInt(BitIndex))))
235 Candidate = V;
236
237 // Initialize result source operand.
238 if (!MOps.Root)
239 MOps.Root = Candidate;
240
241 // The shift constant is out-of-range? This code hasn't been simplified.
242 if (BitIndex && BitIndex->uge(MOps.Mask.getBitWidth()))
243 return false;
244
245 // Fill in the mask bit derived from the shift constant.
246 MOps.Mask.setBit(BitIndex ? BitIndex->getZExtValue() : 0);
247 return MOps.Root == Candidate;
248}
249
250/// Match patterns that correspond to "any-bits-set" and "all-bits-set".
251/// These will include a chain of 'or' or 'and'-shifted bits from a
252/// common source value:
253/// and (or (lshr X, C), ...), 1 --> (X & CMask) != 0
254/// and (and (lshr X, C), ...), 1 --> (X & CMask) == CMask
255/// Note: "any-bits-clear" and "all-bits-clear" are variations of these patterns
256/// that differ only with a final 'not' of the result. We expect that final
257/// 'not' to be folded with the compare that we create here (invert predicate).
// NOTE(review): the function signature (original line 258 -- presumably
// "static bool foldAnyOrAllBitsSet(Instruction &I) {") is missing from this
// extraction. TODO confirm against upstream.
259 // The 'any-bits-set' ('or' chain) pattern is simpler to match because the
260 // final "and X, 1" instruction must be the final op in the sequence.
261 bool MatchAllBitsSet;
262 bool MatchTrunc;
263 Value *X;
// An i1 (or vector-of-i1) result arrives via a trunc of the chain; otherwise
// the chain is terminated by an explicit "and ..., 1".
264 if (I.getType()->isIntOrIntVectorTy(1)) {
265 if (match(&I, m_Trunc(m_OneUse(m_And(m_Value(), m_Value())))))
266 MatchAllBitsSet = true;
267 else if (match(&I, m_Trunc(m_OneUse(m_Or(m_Value(), m_Value())))))
268 MatchAllBitsSet = false;
269 else
270 return false;
271 MatchTrunc = true;
272 X = I.getOperand(0);
273 } else {
274 if (match(&I, m_c_And(m_OneUse(m_And(m_Value(), m_Value())), m_Value()))) {
275 X = &I;
276 MatchAllBitsSet = true;
277 } else if (match(&I,
278 m_And(m_OneUse(m_Or(m_Value(), m_Value())), m_One()))) {
279 X = I.getOperand(0);
280 MatchAllBitsSet = false;
281 } else
282 return false;
283 MatchTrunc = false;
284 }
285 Type *Ty = X->getType();
286
// In the trunc form the low-bit guarantee comes from the trunc itself, so
// FoundAnd1 is only required for the non-trunc 'and' chain.
287 MaskOps MOps(Ty->getScalarSizeInBits(), MatchAllBitsSet);
288 if (!matchAndOrChain(X, MOps) ||
289 (MatchAllBitsSet && !MatchTrunc && !MOps.FoundAnd1))
290 return false;
291
292 // The pattern was found. Create a masked compare that replaces all of the
293 // shift and logic ops.
294 IRBuilder<> Builder(&I);
295 Constant *Mask = ConstantInt::get(Ty, MOps.Mask);
296 Value *And = Builder.CreateAnd(MOps.Root, Mask);
297 Value *Cmp = MatchAllBitsSet ? Builder.CreateICmpEQ(And, Mask)
298 : Builder.CreateIsNotNull(And);
299 Value *Zext = MatchTrunc ? Cmp : Builder.CreateZExt(Cmp, Ty);
300 I.replaceAllUsesWith(Zext);
301 ++NumAnyOrAllBitsSet;
302 return true;
303}
304
305// Try to recognize below function as popcount intrinsic.
306// This is the "best" algorithm from
307// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
308// Also used in TargetLowering::expandCTPOP().
309//
310// int popcount(unsigned int i) {
311// i = i - ((i >> 1) & 0x55555555);
312// i = (i & 0x33333333) + ((i >> 2) & 0x33333333);
313// i = ((i + (i >> 4)) & 0x0F0F0F0F);
314// return (i * 0x01010101) >> 24;
315// }
// NOTE(review): the function signature (original line 316 -- presumably
// "static bool tryToRecognizePopCount(Instruction &I) {") is missing from
// this extraction. TODO confirm against upstream.
317 if (I.getOpcode() != Instruction::LShr)
318 return false;
319
320 Type *Ty = I.getType();
321 if (!Ty->isIntOrIntVectorTy())
322 return false;
323
324 unsigned Len = Ty->getScalarSizeInBits();
325 // FIXME: fix Len == 8 and other irregular type lengths.
326 if (!(Len <= 128 && Len > 8 && Len % 8 == 0))
327 return false;
328
// Byte-replicated constants of the classic bit-twiddling algorithm, widened
// to the scalar width. MaskShift is the final ">> (Len - 8)".
329 APInt Mask55 = APInt::getSplat(Len, APInt(8, 0x55));
330 APInt Mask33 = APInt::getSplat(Len, APInt(8, 0x33));
331 APInt Mask0F = APInt::getSplat(Len, APInt(8, 0x0F));
332 APInt Mask01 = APInt::getSplat(Len, APInt(8, 0x01));
333 APInt MaskShift = APInt(Len, Len - 8);
334
335 Value *Op0 = I.getOperand(0);
336 Value *Op1 = I.getOperand(1);
337 Value *MulOp0;
338 // Matching "(i * 0x01010101...) >> 24".
// NOTE(review): original line 340 (presumably the second conjunct matching
// Op1 against MaskShift) is missing from this extraction.
339 if ((match(Op0, m_Mul(m_Value(MulOp0), m_SpecificInt(Mask01)))) &&
341 Value *ShiftOp0;
342 // Matching "((i + (i >> 4)) & 0x0F0F0F0F...)".
343 if (match(MulOp0, m_And(m_c_Add(m_LShr(m_Value(ShiftOp0), m_SpecificInt(4)),
344 m_Deferred(ShiftOp0)),
345 m_SpecificInt(Mask0F)))) {
346 Value *AndOp0;
347 // Matching "(i & 0x33333333...) + ((i >> 2) & 0x33333333...)".
// NOTE(review): original line 350 (presumably the second m_And operand of
// this m_c_Add, matching "(i >> 2)") is missing from this extraction.
348 if (match(ShiftOp0,
349 m_c_Add(m_And(m_Value(AndOp0), m_SpecificInt(Mask33)),
351 m_SpecificInt(Mask33))))) {
352 Value *Root, *SubOp1;
353 // Matching "i - ((i >> 1) & 0x55555555...)".
354 const APInt *AndMask;
355 if (match(AndOp0, m_Sub(m_Value(Root), m_Value(SubOp1))) &&
356 match(SubOp1, m_And(m_LShr(m_Specific(Root), m_SpecificInt(1)),
357 m_APInt(AndMask)))) {
// The mask need not literally be 0x55...: bits missing from it are fine if
// they are known-zero in the shifted value anyway.
358 auto CheckAndMask = [&]() {
359 if (*AndMask == Mask55)
360 return true;
361
362 // Exact match failed, see if any bits are known to be 0 where we
363 // expect a 1 in the mask.
364 if (!AndMask->isSubsetOf(Mask55))
365 return false;
366
367 APInt NeededMask = Mask55 & ~*AndMask;
368 return MaskedValueIsZero(cast<Instruction>(SubOp1)->getOperand(0),
369 NeededMask,
370 SimplifyQuery(I.getDataLayout()));
371 };
372
373 if (CheckAndMask()) {
374 LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
375 IRBuilder<> Builder(&I);
376 I.replaceAllUsesWith(
377 Builder.CreateIntrinsic(Intrinsic::ctpop, I.getType(), {Root}));
378 ++NumPopCountRecognized;
379 return true;
380 }
381 }
382 }
383 }
384 }
385
386 return false;
387}
388
389/// Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), providing C1 and
390/// C2 saturate the value of the fp conversion. The transform is not reversable
391/// as the fptosi.sat is more defined than the input - all values produce a
392/// valid value for the fptosi.sat, where as some produce poison for original
393/// that were out of range of the integer conversion. The reversed pattern may
394/// use fmax and fmin instead. As we cannot directly reverse the transform, and
395/// it is not always profitable, we make it conditional on the cost being
396/// reported as lower by TTI.
// NOTE(review): the function signature (original line 397 -- presumably
// "static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) {") is
// missing from this extraction. TODO confirm against upstream.
398 // Look for min(max(fptosi, converting to fptosi_sat.
// NOTE(review): original lines 401 and 404 (the heads of the two
// smin/smax-of-fptosi match expressions, capturing In) are missing; only
// their continuation lines survive below.
399 Value *In;
400 const APInt *MinC, *MaxC;
402 m_APInt(MinC))),
403 m_APInt(MaxC))) &&
405 m_APInt(MaxC))),
406 m_APInt(MinC))))
407 return false;
408
409 // Check that the constants clamp a saturate.
// MinC must be -(2^(N-1)) and MaxC must be 2^(N-1)-1 for some width N.
410 if (!(*MinC + 1).isPowerOf2() || -*MaxC != *MinC + 1)
411 return false;
412
413 Type *IntTy = I.getType();
414 Type *FpTy = In->getType();
// SatTy is the narrow type implied by the clamp constants; the result is
// then sign-extended back to IntTy.
415 Type *SatTy =
416 IntegerType::get(IntTy->getContext(), (*MinC + 1).exactLogBase2() + 1);
417 if (auto *VecTy = dyn_cast<VectorType>(IntTy))
418 SatTy = VectorType::get(SatTy, VecTy->getElementCount());
419
420 // Get the cost of the intrinsic, and check that against the cost of
421 // fptosi+smin+smax
// NOTE(review): original lines 424, 426-427, 431, 434, 437 (the cost-kind /
// cast-context trailing arguments of these TTI cost queries) are missing
// from this extraction; each call below is syntactically truncated.
422 InstructionCost SatCost = TTI.getIntrinsicInstrCost(
423 IntrinsicCostAttributes(Intrinsic::fptosi_sat, SatTy, {In}, {FpTy}),
425 SatCost += TTI.getCastInstrCost(Instruction::SExt, IntTy, SatTy,
428
429 InstructionCost MinMaxCost = TTI.getCastInstrCost(
430 Instruction::FPToSI, IntTy, FpTy, TTI::CastContextHint::None,
432 MinMaxCost += TTI.getIntrinsicInstrCost(
433 IntrinsicCostAttributes(Intrinsic::smin, IntTy, {IntTy}),
435 MinMaxCost += TTI.getIntrinsicInstrCost(
436 IntrinsicCostAttributes(Intrinsic::smax, IntTy, {IntTy}),
438
// Only transform when TTI reports the saturating intrinsic as cheaper.
439 if (SatCost >= MinMaxCost)
440 return false;
441
442 IRBuilder<> Builder(&I);
443 Value *Sat =
444 Builder.CreateIntrinsic(Intrinsic::fptosi_sat, {SatTy, FpTy}, In);
445 I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy));
446 return true;
447}
448
449/// Try to replace a mathlib call to sqrt with the LLVM intrinsic. This avoids
450/// pessimistic codegen that has to account for setting errno and can enable
451/// vectorization.
// NOTE(review): original line 453 (the middle of this parameter list --
// presumably "TargetLibraryInfo &TLI, AssumptionCache &AC," given the &TLI
// and &AC uses below) is missing from this extraction. TODO confirm upstream.
452static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
454 DominatorTree &DT) {
455 // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created
456 // (because NNAN or the operand arg must not be less than -0.0) and (2) we
457 // would not end up lowering to a libcall anyway (which could change the value
458 // of errno), then:
459 // (1) errno won't be set.
460 // (2) it is safe to convert this to an intrinsic call.
461 Type *Ty = Call->getType();
462 Value *Arg = Call->getArgOperand(0);
// NOTE(review): original line 465 (the head of the value-tracking query that
// this argument list belongs to -- presumably a cannotBeOrderedLessThanZero /
// known-never-NaN style check on Arg) is missing from this extraction.
463 if (TTI.haveFastSqrt(Ty) &&
464 (Call->hasNoNaNs() ||
466 Arg, SimplifyQuery(Call->getDataLayout(), &TLI, &DT, &AC, Call)))) {
467 IRBuilder<> Builder(Call);
468 Value *NewSqrt =
469 Builder.CreateIntrinsic(Intrinsic::sqrt, Ty, Arg, Call, "sqrt");
470 Call->replaceAllUsesWith(NewSqrt);
471
472 // Explicitly erase the old call because a call with side effects is not
473 // trivially dead.
474 Call->eraseFromParent();
475 return true;
476 }
477
478 return false;
479}
480
481// Check if this array of constants represents a cttz table.
482// Iterate over the elements from \p Table by trying to find/match all
483// the numbers from 0 to \p InputBits that should represent cttz results.
484static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift,
485 const APInt &AndMask, Type *AccessTy,
486 unsigned InputBits, const APInt &GEPIdxFactor,
487 const DataLayout &DL) {
// For each possible single trailing-zero count, recompute the table index
// the idiom would produce and verify the table stores that count there.
488 for (unsigned Idx = 0; Idx < InputBits; Idx++) {
489 APInt Index =
490 (APInt::getOneBitSet(InputBits, Idx) * Mul).lshr(Shift) & AndMask;
// NOTE(review): original line 491 (the declaration this argument list
// belongs to -- presumably "ConstantInt *C = dyn_cast_or_null<ConstantInt>("
// given the "!C" test below) is missing from this extraction.
492 ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
493 if (!C || C->getValue() != Idx)
494 return false;
495 }
496
497 return true;
498}
499
500// Try to recognize table-based ctz implementation.
501// E.g., an example in C (for more cases please see the llvm/tests):
502// int f(unsigned x) {
503// static const char table[32] =
504// {0, 1, 28, 2, 29, 14, 24, 3, 30,
505// 22, 20, 15, 25, 17, 4, 8, 31, 27,
506// 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9};
507// return table[((unsigned)((x & -x) * 0x077CB531U)) >> 27];
508// }
509// this can be lowered to `cttz` instruction.
510// There is also a special case when the element is 0.
511//
512// The (x & -x) sets the lowest non-zero bit to 1. The multiply is a de-bruijn
513// sequence that contains each pattern of bits in it. The shift extracts
514// the top bits after the multiply, and that index into the table should
515// represent the number of trailing zeros in the original number.
516//
517// Here are some examples or LLVM IR for a 64-bit target:
518//
519// CASE 1:
520// %sub = sub i32 0, %x
521// %and = and i32 %sub, %x
522// %mul = mul i32 %and, 125613361
523// %shr = lshr i32 %mul, 27
524// %idxprom = zext i32 %shr to i64
525// %arrayidx = getelementptr inbounds [32 x i8], [32 x i8]* @ctz1.table, i64 0,
526// i64 %idxprom
527// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
528//
529// CASE 2:
530// %sub = sub i32 0, %x
531// %and = and i32 %sub, %x
532// %mul = mul i32 %and, 72416175
533// %shr = lshr i32 %mul, 26
534// %idxprom = zext i32 %shr to i64
535// %arrayidx = getelementptr inbounds [64 x i16], [64 x i16]* @ctz2.table,
536// i64 0, i64 %idxprom
537// %0 = load i16, i16* %arrayidx, align 2, !tbaa !8
538//
539// CASE 3:
540// %sub = sub i32 0, %x
541// %and = and i32 %sub, %x
542// %mul = mul i32 %and, 81224991
543// %shr = lshr i32 %mul, 27
544// %idxprom = zext i32 %shr to i64
545// %arrayidx = getelementptr inbounds [32 x i32], [32 x i32]* @ctz3.table,
546// i64 0, i64 %idxprom
547// %0 = load i32, i32* %arrayidx, align 4, !tbaa !8
548//
549// CASE 4:
550// %sub = sub i64 0, %x
551// %and = and i64 %sub, %x
552// %mul = mul i64 %and, 283881067100198605
553// %shr = lshr i64 %mul, 58
554// %arrayidx = getelementptr inbounds [64 x i8], [64 x i8]* @table, i64 0,
555// i64 %shr
556// %0 = load i8, i8* %arrayidx, align 1, !tbaa !8
557//
558// All these can be lowered to @llvm.cttz.i32/64 intrinsics.
// NOTE(review): the function signature (original lines 559-560 -- presumably
// "static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout
// &DL) {" plus the dyn_cast of &I to LoadInst *LI) is missing from this
// extraction. TODO confirm against upstream.
561 if (!LI)
562 return false;
563
564 Type *AccessType = LI->getType();
565 if (!AccessType->isIntegerTy())
566 return false;
567
// NOTE(review): original line 568 (presumably the dyn_cast of the load's
// pointer operand to GetElementPtrInst *GEP) is missing from this view.
569 if (!GEP || !GEP->hasNoUnsignedSignedWrap())
570 return false;
571
// The table must be a constant global with a visible initializer so we can
// fold loads from it at compile time.
572 GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
573 if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
574 return false;
575
576 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
577 APInt ModOffset(BW, 0);
// NOTE(review): original line 578 (the declaration of VarOffsets --
// presumably a SmallMapVector<Value *, APInt> for collectOffset) is missing
// from this view.
579 if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
580 VarOffsets.size() != 1 || ModOffset != 0)
581 return false;
582 auto [GepIdx, GEPScale] = VarOffsets.front();
583
584 Value *X1;
585 const APInt *MulConst, *ShiftConst, *AndCst = nullptr;
586 // Check that the gep variable index is ((x & -x) * MulConst) >> ShiftConst.
587 // This might be extended to the pointer index type, and if the gep index type
588 // has been replaced with an i8 then a new And (and different ShiftConst) will
589 // be present.
590 auto MatchInner = m_LShr(
591 m_Mul(m_c_And(m_Neg(m_Value(X1)), m_Deferred(X1)), m_APInt(MulConst)),
592 m_APInt(ShiftConst));
593 if (!match(GepIdx, m_CastOrSelf(MatchInner)) &&
594 !match(GepIdx, m_CastOrSelf(m_And(MatchInner, m_APInt(AndCst)))))
595 return false;
596
597 unsigned InputBits = X1->getType()->getScalarSizeInBits();
598 if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
599 return false;
600
// Verify every possible input maps to its trailing-zero count in the table.
601 if (!GEPScale.isIntN(InputBits) ||
602 !isCTTZTable(GVTable->getInitializer(), *MulConst, *ShiftConst,
603 AndCst ? *AndCst : APInt::getAllOnes(InputBits), AccessType,
604 InputBits, GEPScale.zextOrTrunc(InputBits), DL))
605 return false;
606
// table[0] decides the x == 0 behavior: if it already stores InputBits we
// can use a non-poison cttz directly, otherwise we need a select.
607 ConstantInt *ZeroTableElem = cast<ConstantInt>(
608 ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
609 bool DefinedForZero = ZeroTableElem->getZExtValue() == InputBits;
610
611 IRBuilder<> B(LI);
612 ConstantInt *BoolConst = B.getInt1(!DefinedForZero);
613 Type *XType = X1->getType();
614 auto Cttz = B.CreateIntrinsic(Intrinsic::cttz, {XType}, {X1, BoolConst});
615 Value *ZExtOrTrunc = nullptr;
616
617 if (DefinedForZero) {
618 ZExtOrTrunc = B.CreateZExtOrTrunc(Cttz, AccessType);
619 } else {
620 // If the value in elem 0 isn't the same as InputBits, we still want to
621 // produce the value from the table.
622 auto Cmp = B.CreateICmpEQ(X1, ConstantInt::get(XType, 0));
623 auto Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Cttz);
624
625 // The true branch of select handles the cttz(0) case, which is rare.
// NOTE(review): original lines 626-627 (presumably the dyn_cast of Select to
// Instruction *SelectI guarding this metadata block) are missing from this
// extraction.
628 SelectI->setMetadata(
629 LLVMContext::MD_prof,
630 MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
631 }
632
633 // NOTE: If the table[0] is 0, but the cttz(0) is defined by the Target
634 // it should be handled as: `cttz(x) & (typeSize - 1)`.
635
636 ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
637 }
638
639 LI->replaceAllUsesWith(ZExtOrTrunc);
640
641 return true;
642}
643
644// Check if this array of constants represents a log2 table.
645// Iterate over the elements from \p Table by trying to find/match all
646// the numbers from 0 to \p InputBits that should represent log2 results.
647static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift,
648 Type *AccessTy, unsigned InputBits,
649 const APInt &GEPIdxFactor, const DataLayout &DL) {
// For each possible floor-log2 result, compute the index the bit-smearing
// idiom would produce (all bits at and below Idx set) and verify the table
// stores Idx there.
650 for (unsigned Idx = 0; Idx < InputBits; Idx++) {
651 APInt Index = (APInt::getLowBitsSet(InputBits, Idx + 1) * Mul).lshr(Shift);
// NOTE(review): original line 652 (the declaration this argument list
// belongs to -- presumably "ConstantInt *C = dyn_cast_or_null<ConstantInt>("
// given the "!C" test below) is missing from this extraction.
653 ConstantFoldLoadFromConst(Table, AccessTy, Index * GEPIdxFactor, DL));
654 if (!C || C->getValue() != Idx)
655 return false;
656 }
657
658 // Verify that an input of zero will select table index 0.
659 APInt ZeroIndex = Mul.lshr(Shift);
660 if (!ZeroIndex.isZero())
661 return false;
662
663 return true;
664}
665
666// Try to recognize table-based log2 implementation.
667// E.g., an example in C (for more cases please the llvm/tests):
668// int f(unsigned v) {
669// static const char table[32] =
670// {0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
671// 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31};
672//
673// v |= v >> 1; // first round down to one less than a power of 2
674// v |= v >> 2;
675// v |= v >> 4;
676// v |= v >> 8;
677// v |= v >> 16;
678//
679// return table[(unsigned)(v * 0x07C4ACDDU) >> 27];
680// }
681// this can be lowered to `ctlz` instruction.
682// There is also a special case when the element is 0.
683//
684// The >> and |= sequence sets all bits below the most significant set bit. The
685// multiply is a de-bruijn sequence that contains each pattern of bits in it.
686// The shift extracts the top bits after the multiply, and that index into the
687// table should represent the floor log base 2 of the original number.
688//
689// Here are some examples of LLVM IR for a 64-bit target.
690//
691// CASE 1:
692// %shr = lshr i32 %v, 1
693// %or = or i32 %shr, %v
694// %shr1 = lshr i32 %or, 2
695// %or2 = or i32 %shr1, %or
696// %shr3 = lshr i32 %or2, 4
697// %or4 = or i32 %shr3, %or2
698// %shr5 = lshr i32 %or4, 8
699// %or6 = or i32 %shr5, %or4
700// %shr7 = lshr i32 %or6, 16
701// %or8 = or i32 %shr7, %or6
702// %mul = mul i32 %or8, 130329821
703// %shr9 = lshr i32 %mul, 27
704// %idxprom = zext nneg i32 %shr9 to i64
705// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %idxprom
706// %0 = load i8, ptr %arrayidx, align 1
707//
708// CASE 2:
709// %shr = lshr i64 %v, 1
710// %or = or i64 %shr, %v
711// %shr1 = lshr i64 %or, 2
712// %or2 = or i64 %shr1, %or
713// %shr3 = lshr i64 %or2, 4
714// %or4 = or i64 %shr3, %or2
715// %shr5 = lshr i64 %or4, 8
716// %or6 = or i64 %shr5, %or4
717// %shr7 = lshr i64 %or6, 16
718// %or8 = or i64 %shr7, %or6
719// %shr9 = lshr i64 %or8, 32
720// %or10 = or i64 %shr9, %or8
721// %mul = mul i64 %or10, 285870213051386505
722// %shr11 = lshr i64 %mul, 58
723// %arrayidx = getelementptr inbounds i8, ptr @table, i64 %shr11
724// %0 = load i8, ptr %arrayidx, align 1
725//
726// All these can be lowered to @llvm.ctlz.i32/64 intrinsics and a subtract.
// NOTE(review): the function signature (original lines 727-729 -- presumably
// "static bool ...(Instruction &I, const DataLayout &DL, TargetTransformInfo
// &TTI) {" plus the dyn_cast of &I to LoadInst *LI, given the LI/DL/TTI uses
// below) is missing from this extraction. TODO confirm against upstream.
730 if (!LI)
731 return false;
732
733 Type *AccessType = LI->getType();
734 if (!AccessType->isIntegerTy())
735 return false;
736
// NOTE(review): original line 737 (presumably the dyn_cast of the load's
// pointer operand to GetElementPtrInst *GEP) is missing from this view.
738 if (!GEP || !GEP->hasNoUnsignedSignedWrap())
739 return false;
740
// The table must be a constant global with a visible initializer.
741 GlobalVariable *GVTable = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
742 if (!GVTable || !GVTable->hasInitializer() || !GVTable->isConstant())
743 return false;
744
745 unsigned BW = DL.getIndexTypeSizeInBits(GEP->getType());
746 APInt ModOffset(BW, 0);
// NOTE(review): original line 747 (the declaration of VarOffsets for
// collectOffset) is missing from this view.
748 if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset) ||
749 VarOffsets.size() != 1 || ModOffset != 0)
750 return false;
751 auto [GepIdx, GEPScale] = VarOffsets.front();
752
753 Value *X;
754 const APInt *MulConst, *ShiftConst;
755 // Check that the gep variable index is (x * MulConst) >> ShiftConst.
756 auto MatchInner =
757 m_LShr(m_Mul(m_Value(X), m_APInt(MulConst)), m_APInt(ShiftConst));
758 if (!match(GepIdx, m_CastOrSelf(MatchInner)))
759 return false;
760
761 unsigned InputBits = X->getType()->getScalarSizeInBits();
762 if (InputBits != 16 && InputBits != 32 && InputBits != 64 && InputBits != 128)
763 return false;
764
765 // Verify shift amount.
766 // TODO: Allow other shift amounts when we have proper test coverage.
767 if (*ShiftConst != InputBits - Log2_32(InputBits))
768 return false;
769
770 // Match the sequence of OR operations with right shifts by powers of 2.
771 for (unsigned ShiftAmt = InputBits / 2; ShiftAmt != 0; ShiftAmt /= 2) {
772 Value *Y;
773 if (!match(X, m_c_Or(m_LShr(m_Value(Y), m_SpecificInt(ShiftAmt)),
774 m_Deferred(Y))))
775 return false;
776 X = Y;
777 }
778
// Verify the table maps every smeared-bit pattern to its floor log2.
779 if (!GEPScale.isIntN(InputBits) ||
780 !isLog2Table(GVTable->getInitializer(), *MulConst, *ShiftConst,
781 AccessType, InputBits, GEPScale.zextOrTrunc(InputBits), DL))
782 return false;
783
784 ConstantInt *ZeroTableElem = cast<ConstantInt>(
785 ConstantFoldLoadFromConst(GVTable->getInitializer(), AccessType, DL));
786
787 // Use InputBits - 1 - ctlz(X) to compute log2(X).
788 IRBuilder<> B(LI);
789 ConstantInt *BoolConst = B.getTrue();
790 Type *XType = X->getType();
791
792 // Check the the backend has an efficient ctlz instruction.
793 // FIXME: Teach the backend to emit the original code when ctlz isn't
794 // supported like we do for cttz.
// NOTE(review): original lines 795 and 800 (the IntrinsicCostAttributes
// declaration head and the cost-threshold comparison this "return false"
// belongs to) are missing from this extraction.
796 Intrinsic::ctlz, XType,
797 {PoisonValue::get(XType), /*is_zero_poison=*/BoolConst});
798 InstructionCost Cost =
799 TTI.getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
801 return false;
802
803 Value *Ctlz = B.CreateIntrinsic(Intrinsic::ctlz, {XType}, {X, BoolConst});
804
805 Constant *InputBitsM1 = ConstantInt::get(XType, InputBits - 1);
806 Value *Sub = B.CreateSub(InputBitsM1, Ctlz);
807
808 // The table won't produce a sensible result for 0.
809 Value *Cmp = B.CreateICmpEQ(X, ConstantInt::get(XType, 0));
810 Value *Select = B.CreateSelect(Cmp, B.CreateZExt(ZeroTableElem, XType), Sub);
811
812 // The true branch of select handles the log2(0) case, which is rare.
// NOTE(review): original lines 813-814 (presumably the dyn_cast of Select to
// Instruction *SelectI guarding this metadata block) are missing from this
// extraction.
815 SelectI->setMetadata(
816 LLVMContext::MD_prof,
817 MDBuilder(SelectI->getContext()).createUnlikelyBranchWeights());
818 }
819
820 Value *ZExtOrTrunc = B.CreateZExtOrTrunc(Select, AccessType);
821
822 LI->replaceAllUsesWith(ZExtOrTrunc);
823
824 return true;
825}
826
827/// This is used by foldLoadsRecursive() to capture a Root Load node which is
828/// of type or(load, load) and recursively build the wide load. Also capture the
829/// shift amount, zero extend type and loadSize.
839
840// Identify and Merge consecutive loads recursively which is of the form
841// (ZExt(L1) << shift1) | (ZExt(L2) << shift2) -> ZExt(L3) << shift1
842// (ZExt(L1) << shift1) | ZExt(L2) -> ZExt(L3)
843static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL,
844 AliasAnalysis &AA, bool IsRoot = false) {
845 uint64_t ShAmt2;
846 Value *X;
847 Instruction *L1, *L2;
848
849 // For the root instruction, allow multiple uses since the final result
850 // may legitimately be used in multiple places. For intermediate values,
851 // require single use to avoid creating duplicate loads.
852 if (!IsRoot && !V->hasOneUse())
853 return false;
854
855 if (!match(V, m_c_Or(m_Value(X),
857 ShAmt2)))))
858 return false;
859
860 if (!foldLoadsRecursive(X, LOps, DL, AA, /*IsRoot=*/false) && LOps.FoundRoot)
861 // Avoid Partial chain merge.
862 return false;
863
864 // Check if the pattern has loads
865 LoadInst *LI1 = LOps.Root;
866 uint64_t ShAmt1 = LOps.Shift;
867 if (LOps.FoundRoot == false &&
869 m_ShlOrSelf(m_OneUse(m_ZExt(m_Instruction(L1))), ShAmt1)))) {
870 LI1 = dyn_cast<LoadInst>(L1);
871 }
872 LoadInst *LI2 = dyn_cast<LoadInst>(L2);
873
874 // Check if loads are same, atomic, volatile and having same address space.
875 if (LI1 == LI2 || !LI1 || !LI2 || !LI1->isSimple() || !LI2->isSimple() ||
877 return false;
878
879 // Check if Loads come from same BB.
880 if (LI1->getParent() != LI2->getParent())
881 return false;
882
883 // Find the data layout
884 bool IsBigEndian = DL.isBigEndian();
885
886 // Check if loads are consecutive and same size.
887 Value *Load1Ptr = LI1->getPointerOperand();
888 APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
889 Load1Ptr =
890 Load1Ptr->stripAndAccumulateConstantOffsets(DL, Offset1,
891 /* AllowNonInbounds */ true);
892
893 Value *Load2Ptr = LI2->getPointerOperand();
894 APInt Offset2(DL.getIndexTypeSizeInBits(Load2Ptr->getType()), 0);
895 Load2Ptr =
896 Load2Ptr->stripAndAccumulateConstantOffsets(DL, Offset2,
897 /* AllowNonInbounds */ true);
898
899 // Verify if both loads have same base pointers
900 uint64_t LoadSize1 = LI1->getType()->getPrimitiveSizeInBits();
901 uint64_t LoadSize2 = LI2->getType()->getPrimitiveSizeInBits();
902 if (Load1Ptr != Load2Ptr)
903 return false;
904
905 // Make sure that there are no padding bits.
906 if (!DL.typeSizeEqualsStoreSize(LI1->getType()) ||
907 !DL.typeSizeEqualsStoreSize(LI2->getType()))
908 return false;
909
910 // Alias Analysis to check for stores b/w the loads.
911 LoadInst *Start = LOps.FoundRoot ? LOps.RootInsert : LI1, *End = LI2;
913 if (!Start->comesBefore(End)) {
914 std::swap(Start, End);
915 // If LOps.RootInsert comes after LI2, since we use LI2 as the new insert
916 // point, we should make sure whether the memory region accessed by LOps
917 // isn't modified.
918 if (LOps.FoundRoot)
920 LOps.Root->getPointerOperand(),
921 LocationSize::precise(DL.getTypeStoreSize(
922 IntegerType::get(LI1->getContext(), LOps.LoadSize))),
923 LOps.AATags);
924 else
926 } else
928 unsigned NumScanned = 0;
929 for (Instruction &Inst :
930 make_range(Start->getIterator(), End->getIterator())) {
931 if (Inst.mayWriteToMemory() && isModSet(AA.getModRefInfo(&Inst, Loc)))
932 return false;
933
934 if (++NumScanned > MaxInstrsToScan)
935 return false;
936 }
937
938 // Make sure Load with lower Offset is at LI1
939 bool Reverse = false;
940 if (Offset2.slt(Offset1)) {
941 std::swap(LI1, LI2);
942 std::swap(ShAmt1, ShAmt2);
943 std::swap(Offset1, Offset2);
944 std::swap(Load1Ptr, Load2Ptr);
945 std::swap(LoadSize1, LoadSize2);
946 Reverse = true;
947 }
948
949 // Big endian swap the shifts
950 if (IsBigEndian)
951 std::swap(ShAmt1, ShAmt2);
952
953 // First load is always LI1. This is where we put the new load.
954 // Use the merged load size available from LI1 for forward loads.
955 if (LOps.FoundRoot) {
956 if (!Reverse)
957 LoadSize1 = LOps.LoadSize;
958 else
959 LoadSize2 = LOps.LoadSize;
960 }
961
962 // Verify if shift amount and load index aligns and verifies that loads
963 // are consecutive.
964 uint64_t ShiftDiff = IsBigEndian ? LoadSize2 : LoadSize1;
965 uint64_t PrevSize =
966 DL.getTypeStoreSize(IntegerType::get(LI1->getContext(), LoadSize1));
967 if ((ShAmt2 - ShAmt1) != ShiftDiff || (Offset2 - Offset1) != PrevSize)
968 return false;
969
970 // Update LOps
971 AAMDNodes AATags1 = LOps.AATags;
972 AAMDNodes AATags2 = LI2->getAAMetadata();
973 if (LOps.FoundRoot == false) {
974 LOps.FoundRoot = true;
975 AATags1 = LI1->getAAMetadata();
976 }
977 LOps.LoadSize = LoadSize1 + LoadSize2;
978 LOps.RootInsert = Start;
979
980 // Concatenate the AATags of the Merged Loads.
981 LOps.AATags = AATags1.concat(AATags2);
982
983 LOps.Root = LI1;
984 LOps.Shift = ShAmt1;
985 LOps.ZextType = X->getType();
986 return true;
987}
988
989// For a given BB instruction, evaluate all loads in the chain that form a
990// pattern which suggests that the loads can be combined. The one and only use
991// of the loads is to form a wider load.
// NOTE(review): the opening of the signature (source lines 992-993, carrying
// the Instruction, DataLayout, TargetTransformInfo and AliasAnalysis
// parameters) is elided from this view; only the trailing parameter is shown.
994 const DominatorTree &DT) {
995 // Only consider load chains of scalar values.
996 if (isa<VectorType>(I.getType()))
997 return false;
998
// Recursively walk the or/shift chain rooted at I, collecting mergeable
// loads into LOps. Bail out unless a root load was actually found.
999 LoadOps LOps;
1000 if (!foldLoadsRecursive(&I, LOps, DL, AA, /*IsRoot=*/true) || !LOps.FoundRoot)
1001 return false;
1002
1003 IRBuilder<> Builder(&I);
1004 LoadInst *NewLoad = nullptr, *LI1 = LOps.Root;
1005
1006 IntegerType *WiderType = IntegerType::get(I.getContext(), LOps.LoadSize);
1007 // TTI based checks if we want to proceed with wider load
1008 bool Allowed = TTI.isTypeLegal(WiderType);
1009 if (!Allowed)
1010 return false;
1011
// The merged load reuses LI1's alignment, which may be insufficient for the
// wider type, so only proceed if a misaligned access is both legal and fast.
1012 unsigned AS = LI1->getPointerAddressSpace();
1013 unsigned Fast = 0;
1014 Allowed = TTI.allowsMisalignedMemoryAccesses(I.getContext(), LOps.LoadSize,
1015 AS, LI1->getAlign(), &Fast);
1016 if (!Allowed || !Fast)
1017 return false;
1018
1019 // Get the Index and Ptr for the new GEP.
// If the root load's pointer does not dominate the insertion point, rebuild
// an equivalent pointer (base + accumulated constant offset) at the insert
// point instead of sinking the original GEP chain.
1020 Value *Load1Ptr = LI1->getPointerOperand();
1021 Builder.SetInsertPoint(LOps.RootInsert);
1022 if (!DT.dominates(Load1Ptr, LOps.RootInsert)) {
1023 APInt Offset1(DL.getIndexTypeSizeInBits(Load1Ptr->getType()), 0);
1024 Load1Ptr = Load1Ptr->stripAndAccumulateConstantOffsets(
1025 DL, Offset1, /* AllowNonInbounds */ true);
1026 Load1Ptr = Builder.CreatePtrAdd(Load1Ptr, Builder.getInt(Offset1));
1027 }
1028 // Generate wider load.
1029 NewLoad = Builder.CreateAlignedLoad(WiderType, Load1Ptr, LI1->getAlign(),
1030 LI1->isVolatile(), "");
1031 NewLoad->takeName(LI1);
1032 // Set the New Load AATags Metadata.
1033 if (LOps.AATags)
1034 NewLoad->setAAMetadata(LOps.AATags);
1035
1036 Value *NewOp = NewLoad;
1037 // Check if zero extend needed.
1038 if (LOps.ZextType)
1039 NewOp = Builder.CreateZExt(NewOp, LOps.ZextType);
1040
1041 // Check if shift needed. We need to shift with the amount of load1
1042 // shift if not zero.
1043 if (LOps.Shift)
1044 NewOp = Builder.CreateShl(NewOp, LOps.Shift);
1045 I.replaceAllUsesWith(NewOp);
1046
1047 return true;
1048}
1049
1050/// ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset.
// NOTE(review): the struct header and its field declarations (source lines
// 1051-1057: presumably PtrBase, PtrOffset, Val, ValOffset, ValWidth and the
// originating StoreInst) are elided from this view.
1058
// Two part-stores can be merged only if they write pieces of the same source
// value through the same (stripped) base pointer.
1059 bool isCompatibleWith(const PartStore &Other) const {
1060 return PtrBase == Other.PtrBase && Val == Other.Val;
1061 }
1062
// Order parts by signed pointer offset so consecutive byte ranges sort
// adjacently.
1063 bool operator<(const PartStore &Other) const {
1064 return PtrOffset.slt(Other.PtrOffset);
1065 }
1066};
1067
// Match a simple integer store of the form
//   store (trunc (lshr Val, ValOffset)), Ptr
// (with the lshr optional) and describe it as a PartStore: ValWidth bits of
// Val, starting at bit ValOffset, stored at PtrBase + PtrOffset.
// Returns std::nullopt if I is not such a store.
1068static std::optional<PartStore> matchPartStore(Instruction &I,
1069 const DataLayout &DL) {
1070 auto *Store = dyn_cast<StoreInst>(&I);
1071 if (!Store || !Store->isSimple())
1072 return std::nullopt;
1073
// Only integer types whose bit width equals their store size qualify —
// padding bits would make the merged value ambiguous.
1074 Value *StoredVal = Store->getValueOperand();
1075 Type *StoredTy = StoredVal->getType();
1076 if (!StoredTy->isIntegerTy() || !DL.typeSizeEqualsStoreSize(StoredTy))
1077 return std::nullopt;
1078
1079 uint64_t ValWidth = StoredTy->getPrimitiveSizeInBits();
1080 uint64_t ValOffset;
1081 Value *Val;
1082 if (!match(StoredVal, m_Trunc(m_LShrOrSelf(m_Value(Val), ValOffset))))
1083 return std::nullopt;
1084
// Strip constant GEP offsets to find the common base pointer.
// NOTE(review): source line 1087 (the declaration of PtrBase via
// stripAndAccumulateConstantOffsets) is elided from this view.
1085 Value *Ptr = Store->getPointerOperand();
1086 APInt PtrOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
1088 DL, PtrOffset, /*AllowNonInbounds=*/true);
1089 return {{PtrBase, PtrOffset, Val, ValOffset, ValWidth, Store}};
1090}
1091
// Merge a run of compatible, consecutive part-stores into one store of Width
// bits. Returns true if the merged store was emitted and the originals erased.
// NOTE(review): the first signature line (source line 1092, presumably taking
// an ArrayRef of PartStore) and line 1094 (the TargetTransformInfo parameter)
// are elided from this view.
1093 unsigned Width, const DataLayout &DL,
1095 if (Parts.size() < 2)
1096 return false;
1097
1098 // Check whether combining the stores is profitable.
1099 // FIXME: We could generate smaller stores if we can't produce a large one.
// The merged store inherits the first store's alignment, so require that a
// misaligned access of the wide type is both legal and fast on the target.
1100 const PartStore &First = Parts.front();
1101 LLVMContext &Ctx = First.Store->getContext();
1102 Type *NewTy = Type::getIntNTy(Ctx, Width);
1103 unsigned Fast = 0;
1104 if (!TTI.isTypeLegal(NewTy) ||
1105 !TTI.allowsMisalignedMemoryAccesses(Ctx, Width,
1106 First.Store->getPointerAddressSpace(),
1107 First.Store->getAlign(), &Fast) ||
1108 !Fast)
1109 return false;
1110
1111 // Generate the combined store.
// Shift the source value down to the first part's bit offset, resize it to
// the combined width, and store through the first part's pointer.
1112 IRBuilder<> Builder(First.Store);
1113 Value *Val = First.Val;
1114 if (First.ValOffset != 0)
1115 Val = Builder.CreateLShr(Val, First.ValOffset);
1116 Val = Builder.CreateZExtOrTrunc(Val, NewTy);
1117 StoreInst *Store = Builder.CreateAlignedStore(
1118 Val, First.Store->getPointerOperand(), First.Store->getAlign());
1119
1120 // Merge various metadata onto the new store.
1121 AAMDNodes AATags = First.Store->getAAMetadata();
1122 SmallVector<Instruction *> Stores = {First.Store};
1123 Stores.reserve(Parts.size());
1124 SmallVector<DebugLoc> DbgLocs = {First.Store->getDebugLoc()};
1125 DbgLocs.reserve(Parts.size());
1126 for (const PartStore &Part : drop_begin(Parts)) {
1127 AATags = AATags.concat(Part.Store->getAAMetadata());
1128 Stores.push_back(Part.Store);
1129 DbgLocs.push_back(Part.Store->getDebugLoc());
1130 }
1131 Store->setAAMetadata(AATags);
1132 Store->mergeDIAssignID(Stores);
1133 Store->setDebugLoc(DebugLoc::getMergedLocations(DbgLocs));
1134
1135 // Remove the old stores.
1136 for (const PartStore &Part : Parts)
1137 Part.Store->eraseFromParent();
1138
1139 return true;
1140}
1141
// Driver overload: sort compatible part-stores by pointer offset, carve them
// into maximal runs whose pointer and value offsets advance in lockstep
// (8 bits of value per byte of pointer, no gaps or overlaps), and hand each
// run to the width-taking mergePartStores above.
// NOTE(review): the signature (source lines 1142-1143) is elided from this
// view, as are the two merge-call lines 1160 and 1170 — only their
// continuation arguments are visible below.
1144 if (Parts.size() < 2)
1145 return false;
1146
1147 // We now have multiple parts of the same value stored to the same pointer.
1148 // Sort the parts by pointer offset, and make sure they are consistent with
1149 // the value offsets. Also check that the value is fully covered without
1150 // overlaps.
1151 bool Changed = false;
1152 llvm::sort(Parts);
1153 int64_t LastEndOffsetFromFirst = 0;
1154 const PartStore *First = &Parts[0];
1155 for (const PartStore &Part : Parts) {
1156 APInt PtrOffsetFromFirst = Part.PtrOffset - First->PtrOffset;
1157 int64_t ValOffsetFromFirst = Part.ValOffset - First->ValOffset;
// A mismatch ends the current run: flush it, then start a new run at Part.
1158 if (PtrOffsetFromFirst * 8 != ValOffsetFromFirst ||
1159 LastEndOffsetFromFirst != ValOffsetFromFirst) {
1161 LastEndOffsetFromFirst, DL, TTI);
1162 First = &Part;
1163 LastEndOffsetFromFirst = Part.ValWidth;
1164 continue;
1165 }
1166
1167 LastEndOffsetFromFirst = ValOffsetFromFirst + Part.ValWidth;
1168 }
1169
// Flush the final run.
1171 LastEndOffsetFromFirst, DL, TTI);
1172 return Changed;
1173}
1174
// Scan a basic block for runs of part-stores of the same value and merge them.
// NOTE(review): the signature (source lines 1175-1176) and the declaration of
// the Parts vector (line 1182) are elided from this view; little-endian only.
1177 // FIXME: Add big endian support.
1178 if (DL.isBigEndian())
1179 return false;
1180
1181 BatchAAResults BatchAA(AA);
1183 bool MadeChange = false;
1184 for (Instruction &I : make_early_inc_range(BB)) {
// A new part-store either extends the current compatible run or flushes the
// run and starts a new one.
1185 if (std::optional<PartStore> Part = matchPartStore(I, DL)) {
1186 if (Parts.empty() || Part->isCompatibleWith(Parts[0])) {
1187 Parts.push_back(std::move(*Part));
1188 continue;
1189 }
1190
1191 MadeChange |= mergePartStores(Parts, DL, TTI);
1192 Parts.clear();
1193 Parts.push_back(std::move(*Part));
1194 continue;
1195 }
1196
1197 if (Parts.empty())
1198 continue;
1199
// Any instruction that may throw, or that may touch the memory the pending
// parts write, forces a flush — moving stores past it would be unsound.
// NOTE(review): source line 1202 (the BatchAA mod-ref query wrapping the
// getBeforeOrAfter call below) is elided from this view.
1200 if (I.mayThrow() ||
1201 (I.mayReadOrWriteMemory() &&
1203 &I, MemoryLocation::getBeforeOrAfter(Parts[0].PtrBase))))) {
1204 MadeChange |= mergePartStores(Parts, DL, TTI);
1205 Parts.clear();
1206 continue;
1207 }
1208 }
1209
// Flush whatever run reaches the end of the block.
1210 MadeChange |= mergePartStores(Parts, DL, TTI);
1211 return MadeChange;
1212}
1213
1214/// Combine away instructions providing they are still equivalent when compared
1215/// against 0. i.e. do they have any bits set.
// Recursively strips nsw/nuw shl's out of a one-use or-chain: or(shl(a), b)
// has the same "any bits set" answer as or(a, b). Returns the rebuilt chain,
// or nullptr if nothing changed.
// NOTE(review): the signature (source line 1216, taking V and an IRBuilder)
// is elided from this view.
1217 auto *I = dyn_cast<Instruction>(V);
1218 if (!I || I->getOpcode() != Instruction::Or || !I->hasOneUse())
1219 return nullptr;
1220
1221 Value *A;
1222
1223 // Look deeper into the chain of or's, combining away shl (so long as they are
1224 // nuw or nsw).
// nsw/nuw guarantee the shift drops no set bits, so shl(a) is zero iff a is.
1225 Value *Op0 = I->getOperand(0);
1226 if (match(Op0, m_CombineOr(m_NSWShl(m_Value(A), m_Value()),
1227 m_NUWShl(m_Value(A), m_Value()))))
1228 Op0 = A;
1229 else if (auto *NOp = optimizeShiftInOrChain(Op0, Builder))
1230 Op0 = NOp;
1231
1232 Value *Op1 = I->getOperand(1);
1233 if (match(Op1, m_CombineOr(m_NSWShl(m_Value(A), m_Value()),
1234 m_NUWShl(m_Value(A), m_Value()))))
1235 Op1 = A;
1236 else if (auto *NOp = optimizeShiftInOrChain(Op1, Builder))
1237 Op1 = NOp;
1238
// Only materialize a new or if at least one operand was simplified.
1239 if (Op0 != I->getOperand(0) || Op1 != I->getOperand(1))
1240 return Builder.CreateOr(Op0, Op1);
1241 return nullptr;
1242}
1243
// Fold icmp eq/ne (or-chain), 0 by first trying to merge any loads feeding the
// chain, then stripping redundant nsw/nuw shifts from the chain itself.
// NOTE(review): the opening of the signature (source lines 1244-1245) is
// elided from this view; only the trailing DominatorTree parameter is shown.
1246 const DominatorTree &DT) {
1247 CmpPredicate Pred;
1248 Value *Op0;
1249 if (!match(&I, m_ICmp(Pred, m_Value(Op0), m_Zero())) ||
1250 !ICmpInst::isEquality(Pred))
1251 return false;
1252
1253 // If the chain of or's matches a load, combine to that before attempting to
1254 // remove shifts.
1255 if (auto OpI = dyn_cast<Instruction>(Op0))
1256 if (OpI->getOpcode() == Instruction::Or)
1257 if (foldConsecutiveLoads(*OpI, DL, TTI, AA, DT))
1258 return true;
1259
1260 IRBuilder<> Builder(&I);
1261 // icmp eq/ne or(shl(a), b), 0 -> icmp eq/ne or(a, b), 0
1262 if (auto *Res = optimizeShiftInOrChain(Op0, Builder)) {
1263 I.replaceAllUsesWith(Builder.CreateICmp(Pred, Res, I.getOperand(1)));
1264 return true;
1265 }
1266
1267 return false;
1268}
1269
1270// Calculate GEP Stride and accumulated const ModOffset. Return Stride and
1271// ModOffset
// Walks the GEP chain feeding PtrOp, returning {Stride, ModOffset} where
// Stride is the GCD of all variable index scales and ModOffset the constant
// offset reduced modulo Stride. Falls back to {1, 0} if the chain does not
// bottom out at a GlobalVariable or has no variable index.
// NOTE(review): the second signature line (source line 1273, naming the
// PtrOp and DataLayout parameters) and line 1280 (the VarOffsets MapVector
// declaration) are elided from this view.
1272static std::pair<APInt, APInt>
1274 unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
1275 std::optional<APInt> Stride;
1276 APInt ModOffset(BW, 0);
1277 // Return a minimum gep stride, greatest common divisor of consecutive gep
1278 // index scales(c.f. Bézout's identity).
1279 while (auto *GEP = dyn_cast<GEPOperator>(PtrOp)) {
1281 if (!GEP->collectOffset(DL, BW, VarOffsets, ModOffset))
1282 break;
1283
1284 for (auto [V, Scale] : VarOffsets) {
1285 // Only keep a power of two factor for non-inbounds
1286 if (!GEP->hasNoUnsignedSignedWrap())
1287 Scale = APInt::getOneBitSet(Scale.getBitWidth(), Scale.countr_zero())
1288
1289 if (!Stride)
1290 Stride = Scale;
1291 else
1292 Stride = APIntOps::GreatestCommonDivisor(*Stride, Scale);
1293 }
1294
1295 PtrOp = GEP->getPointerOperand();
1296 }
1297
1298 // Check whether pointer arrives back at Global Variable via at least one GEP.
1299 // Even if it doesn't, we can check by alignment.
1300 if (!isa<GlobalVariable>(PtrOp) || !Stride)
1301 return {APInt(BW, 1), APInt(BW, 0)};
1302
1303 // In consideration of signed GEP indices, non-negligible offset becomes the
1304 // remainder of division by minimum GEP stride.
1305 ModOffset = ModOffset.srem(*Stride);
1306 if (ModOffset.isNegative())
1307 ModOffset += *Stride;
1308
1309 return {*Stride, ModOffset};
1310}
1311
1312/// If C is a constant patterned array and all valid loaded results for given
1313/// alignment are same to a constant, return that constant.
// NOTE(review): the signature (source line 1314) and line 1322 (the GV
// declaration, presumably via getUnderlyingObject on PtrOp) are elided from
// this view.
1315 auto *LI = dyn_cast<LoadInst>(&I);
1316 if (!LI || LI->isVolatile())
1317 return false;
1318
1319 // We can only fold the load if it is from a constant global with definitive
1320 // initializer. Skip expensive logic if this is not the case.
1321 auto *PtrOp = LI->getPointerOperand();
1323 if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
1324 return false;
1325
1326 // Bail for large initializers in excess of 4K to avoid too many scans.
1327 Constant *C = GV->getInitializer();
1328 uint64_t GVSize = DL.getTypeAllocSize(C->getType());
1329 if (!GVSize || 4096 < GVSize)
1330 return false;
1331
1332 Type *LoadTy = LI->getType();
1333 unsigned BW = DL.getIndexTypeSizeInBits(PtrOp->getType());
1334 auto [Stride, ConstOffset] = getStrideAndModOffsetOfGEP(PtrOp, DL);
1335
1336 // Any possible offset could be multiple of GEP stride. And any valid
1337 // offset is multiple of load alignment, so checking only multiples of bigger
1338 // one is sufficient to say results' equality.
1339 if (auto LA = LI->getAlign();
1340 LA <= GV->getAlign().valueOrOne() && Stride.getZExtValue() < LA.value()) {
1341 ConstOffset = APInt(BW, 0);
1342 Stride = APInt(BW, LA.value());
1343 }
1344
// Fold the load at the first candidate offset, then require every other
// reachable offset to fold to the same constant.
1345 Constant *Ca = ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL);
1346 if (!Ca)
1347 return false;
1348
1349 unsigned E = GVSize - DL.getTypeStoreSize(LoadTy);
1350 for (; ConstOffset.getZExtValue() <= E; ConstOffset += Stride)
1351 if (Ca != ConstantFoldLoadFromConst(C, LoadTy, ConstOffset, DL))
1352 return false;
1353
1354 I.replaceAllUsesWith(Ca);
1355
1356 return true;
1357}
1358
1359namespace {
// Helper that inlines a strcmp/strncmp call against a constant string into a
// chain of byte compares; see optimizeStrNCmp/inlineCompare below for the
// transformation itself. (Member names are referenced by the out-of-line
// method definitions later in this file.)
1360class StrNCmpInliner {
1361public:
1362 StrNCmpInliner(CallInst *CI, LibFunc Func, DomTreeUpdater *DTU,
1363 const DataLayout &DL)
1364 : CI(CI), Func(Func), DTU(DTU), DL(DL) {}
1365
1366 bool optimizeStrNCmp();
1367
1368private:
1369 void inlineCompare(Value *LHS, StringRef RHS, uint64_t N, bool Swapped);
1370
// The call being rewritten, which libc function it is (strcmp vs strncmp),
// the lazy dom-tree updater for CFG edits, and the module data layout.
1371 CallInst *CI;
1372 LibFunc Func;
1373 DomTreeUpdater *DTU;
1374 const DataLayout &DL;
1375};
1376
1377} // namespace
1378
1379/// First we normalize calls to strncmp/strcmp to the form of
1380/// compare(s1, s2, N), which means comparing first N bytes of s1 and s2
1381/// (without considering '\0').
1382///
1383/// Examples:
1384///
1385/// \code
1386/// strncmp(s, "a", 3) -> compare(s, "a", 2)
1387/// strncmp(s, "abc", 3) -> compare(s, "abc", 3)
1388/// strncmp(s, "a\0b", 3) -> compare(s, "a\0b", 2)
1389/// strcmp(s, "a") -> compare(s, "a", 2)
1390///
1391/// char s2[] = {'a'}
1392/// strncmp(s, s2, 3) -> compare(s, s2, 3)
1393///
1394/// char s2[] = {'a', 'b', 'c', 'd'}
1395/// strncmp(s, s2, 3) -> compare(s, s2, 3)
1396/// \endcode
1397///
1398/// We only handle cases where N and exactly one of s1 and s2 are constant.
1399/// Cases that s1 and s2 are both constant are already handled by the
1400/// instcombine pass.
1401///
1402/// We do not handle cases where N > StrNCmpInlineThreshold.
1403///
1404/// We also do not handle cases where N < 2, which are already
1405/// handled by the instcombine pass.
1406///
// Normalize the strcmp/strncmp call to compare(s1, s2, N) form (see the doc
// comment above) and, if exactly one side is a small constant string, inline
// it via inlineCompare. Returns true if the call was replaced.
1407bool StrNCmpInliner::optimizeStrNCmp() {
1408 if (StrNCmpInlineThreshold < 2)
1409 return false;
1410
// NOTE(review): the condition guarding this early exit (source line 1411) is
// elided from this view.
1412 return false;
1413
1414 Value *Str1P = CI->getArgOperand(0);
1415 Value *Str2P = CI->getArgOperand(1);
1416 // Should be handled elsewhere.
1417 if (Str1P == Str2P)
1418 return false;
1419
// Exactly one operand must be a constant string; both-constant is handled by
// instcombine, neither-constant cannot be inlined.
1420 StringRef Str1, Str2;
1421 bool HasStr1 = getConstantStringInfo(Str1P, Str1, /*TrimAtNul=*/false);
1422 bool HasStr2 = getConstantStringInfo(Str2P, Str2, /*TrimAtNul=*/false);
1423 if (HasStr1 == HasStr2)
1424 return false;
1425
1426 // Note that '\0' and characters after it are not trimmed.
1427 StringRef Str = HasStr1 ? Str1 : Str2;
1428 Value *StrP = HasStr1 ? Str2P : Str1P;
1429
// For strcmp, comparison effectively stops one past the first NUL; for
// strncmp, additionally clamp to the constant length argument.
1430 size_t Idx = Str.find('\0');
1431 uint64_t N = Idx == StringRef::npos ? UINT64_MAX : Idx + 1;
1432 if (Func == LibFunc_strncmp) {
1433 if (auto *ConstInt = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
1434 N = std::min(N, ConstInt->getZExtValue());
1435 else
1436 return false;
1437 }
1438 // Now N means how many bytes we need to compare at most.
1439 if (N > Str.size() || N < 2 || N > StrNCmpInlineThreshold)
1440 return false;
1441
1442 // Cases where StrP has two or more dereferenceable bytes might be better
1443 // optimized elsewhere.
1444 bool CanBeNull = false, CanBeFreed = false;
1445 if (StrP->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed) > 1)
1446 return false;
// Swapped records whether the constant string was the first argument, so the
// inlined subtraction keeps the call's sign convention.
1447 inlineCompare(StrP, Str, N, HasStr1);
1448 return true;
1449}
1450
1451/// Convert
1452///
1453/// \code
1454/// ret = compare(s1, s2, N)
1455/// \endcode
1456///
1457/// into
1458///
1459/// \code
1460/// ret = (int)s1[0] - (int)s2[0]
1461/// if (ret != 0)
1462/// goto NE
1463/// ...
1464/// ret = (int)s1[N-2] - (int)s2[N-2]
1465/// if (ret != 0)
1466/// goto NE
1467/// ret = (int)s1[N-1] - (int)s2[N-1]
1468/// NE:
1469/// \endcode
1470///
1471/// CFG before and after the transformation:
1472///
1473/// (before)
1474/// BBCI
1475///
1476/// (after)
1477/// BBCI -> BBSubs[0] (sub,icmp) --NE-> BBNE -> BBTail
1478/// | ^
1479/// E |
1480/// | |
1481/// BBSubs[1] (sub,icmp) --NE-----+
1482/// ... |
1483/// BBSubs[N-1] (sub) ---------+
1484///
// Emit the byte-by-byte compare CFG described in the comment above: one block
// per byte, branching to BBNE on the first nonzero difference; the PHI in BBNE
// collects the signed byte difference that becomes the call's result.
1485void StrNCmpInliner::inlineCompare(Value *LHS, StringRef RHS, uint64_t N,
1486 bool Swapped) {
1487 auto &Ctx = CI->getContext();
1488 IRBuilder<> B(Ctx);
1489 // We want these instructions to be recognized as inlined instructions for the
1490 // compare call, but we don't have a source location for the definition of
1491 // that function, since we're generating that code now. Because the generated
1492 // code is a viable point for a memory access error, we make the pragmatic
1493 // choice here to directly use CI's location so that we have useful
1494 // attribution for the generated code.
1495 B.SetCurrentDebugLocation(CI->getDebugLoc());
1496
1497 BasicBlock *BBCI = CI->getParent();
1498 BasicBlock *BBTail =
1499 SplitBlock(BBCI, CI, DTU, nullptr, nullptr, BBCI->getName() + ".tail");
1500
// NOTE(review): source line 1501 (the declaration of the BBSubs vector of
// per-byte blocks) is elided from this view.
1502 for (uint64_t I = 0; I < N; ++I)
1503 BBSubs.push_back(
1504 BasicBlock::Create(Ctx, "sub_" + Twine(I), BBCI->getParent(), BBTail));
1505 BasicBlock *BBNE = BasicBlock::Create(Ctx, "ne", BBCI->getParent(), BBTail);
1506
// Redirect BBCI's fallthrough into the first compare block.
1507 cast<UncondBrInst>(BBCI->getTerminator())->setSuccessor(BBSubs[0]);
1508
1509 B.SetInsertPoint(BBNE);
1510 PHINode *Phi = B.CreatePHI(CI->getType(), N);
1511 B.CreateBr(BBTail);
1512
1513 Value *Base = LHS;
1514 for (uint64_t i = 0; i < N; ++i) {
1515 B.SetInsertPoint(BBSubs[i]);
// Load byte i of the non-constant side, widen to the call's result type, and
// subtract against byte i of the constant string (order fixed by Swapped).
1516 Value *VL =
1517 B.CreateZExt(B.CreateLoad(B.getInt8Ty(),
1518 B.CreateInBoundsPtrAdd(Base, B.getInt64(i))),
1519 CI->getType());
1520 Value *VR =
1521 ConstantInt::get(CI->getType(), static_cast<unsigned char>(RHS[i]));
1522 Value *Sub = Swapped ? B.CreateSub(VR, VL) : B.CreateSub(VL, VR);
1523 if (i < N - 1) {
1524 CondBrInst *CondBrInst = B.CreateCondBr(
1525 B.CreateICmpNE(Sub, ConstantInt::get(CI->getType(), 0)), BBNE,
1526 BBSubs[i + 1]);
1527
// NOTE(review): source line 1532 (the statement applying the entry count,
// presumably branch-weight metadata on CondBrInst) is elided from this view.
1528 Function *F = CI->getFunction();
1529 assert(F && "Instruction does not belong to a function!");
1530 std::optional<Function::ProfileCount> EC = F->getEntryCount();
1531 if (EC && EC->getCount() > 0)
1533 } else {
// The last byte needs no compare: its difference flows straight to the PHI.
1534 B.CreateBr(BBNE);
1535 }
1536
1537 Phi->addIncoming(Sub, BBSubs[i]);
1538 }
1539
1540 CI->replaceAllUsesWith(Phi);
1541 CI->eraseFromParent();
1542
// Record every inserted/removed CFG edge for the lazy dom-tree updater.
// NOTE(review): source line 1544 (the Updates vector declaration) is elided
// from this view.
1543 if (DTU) {
1545 Updates.push_back({DominatorTree::Insert, BBCI, BBSubs[0]});
1546 for (uint64_t i = 0; i < N; ++i) {
1547 if (i < N - 1)
1548 Updates.push_back({DominatorTree::Insert, BBSubs[i], BBSubs[i + 1]});
1549 Updates.push_back({DominatorTree::Insert, BBSubs[i], BBNE});
1550 }
1551 Updates.push_back({DominatorTree::Insert, BBNE, BBTail});
1552 Updates.push_back({DominatorTree::Delete, BBCI, BBTail});
1553 DTU->applyUpdates(Updates);
1554 }
1555}
1556
1557/// Convert memchr with a small constant string into a switch
// NOTE(review): the signature (source line 1558, taking the CallInst and a
// DomTreeUpdater) is elided from this view, along with lines 1578 (the length
// threshold check), 1586 (terminator removal in BB), 1591 (the
// unknown-branch-weights annotation mentioned below), 1593 (the Updates
// vector) and 1604 (the Cases set declaration).
1559 const DataLayout &DL) {
// A constant needle is instcombine's job; this fold handles variable needles
// against a constant haystack.
1560 if (isa<Constant>(Call->getArgOperand(1)))
1561 return false;
1562
1563 StringRef Str;
1564 Value *Base = Call->getArgOperand(0);
1565 if (!getConstantStringInfo(Base, Str, /*TrimAtNul=*/false))
1566 return false;
1567
// The length argument must be a constant no larger than the string.
1568 uint64_t N = Str.size();
1569 if (auto *ConstInt = dyn_cast<ConstantInt>(Call->getArgOperand(2))) {
1570 uint64_t Val = ConstInt->getZExtValue();
1571 // Ignore the case that n is larger than the size of string.
1572 if (Val > N)
1573 return false;
1574 N = Val;
1575 } else
1576 return false;
1577
1579 return false;
1580
// Split off the tail, then switch on the (truncated) needle byte: each
// distinct haystack byte gets a case block recording its first index.
1581 BasicBlock *BB = Call->getParent();
1582 BasicBlock *BBNext = SplitBlock(BB, Call, DTU);
1583 IRBuilder<> IRB(BB);
1584 IRB.SetCurrentDebugLocation(Call->getDebugLoc());
1585 IntegerType *ByteTy = IRB.getInt8Ty();
1587 SwitchInst *SI = IRB.CreateSwitch(
1588 IRB.CreateTrunc(Call->getArgOperand(1), ByteTy), BBNext, N);
1589 // We can't know the precise weights here, as they would depend on the value
1590 // distribution of Call->getArgOperand(1). So we just mark it as "unknown".
1592 Type *IndexTy = DL.getIndexType(Call->getType());
1594
1595 BasicBlock *BBSuccess = BasicBlock::Create(
1596 Call->getContext(), "memchr.success", BB->getParent(), BBNext);
1597 IRB.SetInsertPoint(BBSuccess);
1598 PHINode *IndexPHI = IRB.CreatePHI(IndexTy, N, "memchr.idx");
1599 Value *FirstOccursLocation = IRB.CreateInBoundsPtrAdd(Base, IndexPHI);
1600 IRB.CreateBr(BBNext);
1601 if (DTU)
1602 Updates.push_back({DominatorTree::Insert, BBSuccess, BBNext});
1603
// Insert each byte value only once so the PHI records its FIRST occurrence,
// matching memchr semantics for repeated bytes.
1605 for (uint64_t I = 0; I < N; ++I) {
1606 ConstantInt *CaseVal =
1607 ConstantInt::get(ByteTy, static_cast<unsigned char>(Str[I]));
1608 if (!Cases.insert(CaseVal).second)
1609 continue;
1610
1611 BasicBlock *BBCase = BasicBlock::Create(Call->getContext(), "memchr.case",
1612 BB->getParent(), BBSuccess);
1613 SI->addCase(CaseVal, BBCase);
1614 IRB.SetInsertPoint(BBCase);
1615 IndexPHI->addIncoming(ConstantInt::get(IndexTy, I), BBCase);
1616 IRB.CreateBr(BBSuccess);
1617 if (DTU) {
1618 Updates.push_back({DominatorTree::Insert, BB, BBCase});
1619 Updates.push_back({DominatorTree::Insert, BBCase, BBSuccess});
1620 }
1621 }
1622
// The final result: null if the switch fell through (not found), otherwise
// the pointer to the first occurrence computed in BBSuccess.
1623 PHINode *PHI =
1624 PHINode::Create(Call->getType(), 2, Call->getName(), BBNext->begin());
1625 PHI->addIncoming(Constant::getNullValue(Call->getType()), BB);
1626 PHI->addIncoming(FirstOccursLocation, BBSuccess);
1627
1628 Call->replaceAllUsesWith(PHI);
1629 Call->eraseFromParent();
1630
1631 if (DTU)
1632 DTU->applyUpdates(Updates);
1633
1634 return true;
1635}
1636
// Dispatch recognized library calls (sqrt family, strcmp/strncmp, memchr) to
// their dedicated folds; sets MadeCFGChange when a fold rewrites the CFG.
// NOTE(review): the opening of the signature (source lines 1637-1638, which
// presumably carry the Instruction, TargetTransformInfo, TargetLibraryInfo
// and AssumptionCache parameters) is elided from this view.
1639 DominatorTree &DT, const DataLayout &DL,
1640 bool &MadeCFGChange) {
1641
1642 auto *CI = dyn_cast<CallInst>(&I);
1643 if (!CI || CI->isNoBuiltin())
1644 return false;
1645
1646 Function *CalledFunc = CI->getCalledFunction();
1647 if (!CalledFunc)
1648 return false;
1649
// Only fold calls the TLI recognizes AND that could be re-emitted for this
// target, so the transformation is reversible/consistent with codegen.
1650 LibFunc LF;
1651 if (!TLI.getLibFunc(*CalledFunc, LF) ||
1652 !isLibFuncEmittable(CI->getModule(), &TLI, LF))
1653 return false;
1654
// Lazy updates batch the CFG edits made by the string folds below.
1655 DomTreeUpdater DTU(&DT, DomTreeUpdater::UpdateStrategy::Lazy);
1656
1657 switch (LF) {
1658 case LibFunc_sqrt:
1659 case LibFunc_sqrtf:
1660 case LibFunc_sqrtl:
1661 return foldSqrt(CI, LF, TTI, TLI, AC, DT);
1662 case LibFunc_strcmp:
1663 case LibFunc_strncmp:
1664 if (StrNCmpInliner(CI, LF, &DTU, DL).optimizeStrNCmp()) {
1665 MadeCFGChange = true;
1666 return true;
1667 }
1668 break;
1669 case LibFunc_memchr:
1670 if (foldMemChr(CI, &DTU, DL)) {
1671 MadeCFGChange = true;
1672 return true;
1673 }
1674 break;
1675 default:;
1676 }
1677 return false;
1678}
1679
1680/// Match high part of long multiplication.
1681///
1682/// Considering a multiply made up of high and low parts, we can split the
1683/// multiply into:
1684/// x * y == (xh*T + xl) * (yh*T + yl)
1685/// where xh == x>>32 and xl == x & 0xffffffff. T = 2^32.
1686/// This expands to
1687/// xh*yh*T*T + xh*yl*T + xl*yh*T + xl*yl
1688/// which can be drawn as
1689/// [ xh*yh ]
1690/// [ xh*yl ]
1691/// [ xl*yh ]
1692/// [ xl*yl ]
1693/// We are looking for the "high" half, which is xh*yh + xh*yl>>32 + xl*yh>>32 +
1694/// some carrys. The carry makes this difficult and there are multiple ways of
1695/// representing it. The ones we attempt to support here are:
1696/// Carry: xh*yh + carry + lowsum
1697/// carry = lowsum < xh*yl ? 0x100000000 : 0
1698/// lowsum = xh*yl + xl*yh + (xl*yl>>32)
1699/// Ladder: xh*yh + c2>>32 + c3>>32
1700/// c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
1701/// or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xl*yh
1702/// Carry4: xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
1703/// crosssum = xh*yl + xl*yh
1704/// carry = crosssum < xh*yl ? 0x100000000 : 0
1705/// Ladder4: xh*yh + (xl*yh)>>32 + (xh*yl)>>32 + low>>32;
1706/// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
1707///
1708/// They all start by matching xh*yh + 2 or 3 other operands. The bottom of the
1709/// tree is xh*yh, xh*yl, xl*yh and xl*yl.
1711 Type *Ty = I.getType();
1712 if (!Ty->isIntOrIntVectorTy())
1713 return false;
1714
1715 unsigned BitWidth = Ty->getScalarSizeInBits();
1717 if (BitWidth % 2 != 0)
1718 return false;
1719
1720 auto CreateMulHigh = [&](Value *X, Value *Y) {
1721 IRBuilder<> Builder(&I);
1722 Type *NTy = Ty->getWithNewBitWidth(BitWidth * 2);
1723 Value *XExt = Builder.CreateZExt(X, NTy);
1724 Value *YExt = Builder.CreateZExt(Y, NTy);
1725 Value *Mul = Builder.CreateMul(XExt, YExt, "", /*HasNUW=*/true);
1726 Value *High = Builder.CreateLShr(Mul, BitWidth);
1727 Value *Res = Builder.CreateTrunc(High, Ty, "", /*HasNUW=*/true);
1728 Res->takeName(&I);
1729 I.replaceAllUsesWith(Res);
1730 LLVM_DEBUG(dbgs() << "Created long multiply from parts of " << *X << " and "
1731 << *Y << "\n");
1732 return true;
1733 };
1734
1735 // Common check routines for X_lo*Y_lo and X_hi*Y_lo
1736 auto CheckLoLo = [&](Value *XlYl, Value *X, Value *Y) {
1737 return match(XlYl, m_c_Mul(m_And(m_Specific(X), m_SpecificInt(LowMask)),
1738 m_And(m_Specific(Y), m_SpecificInt(LowMask))));
1739 };
1740 auto CheckHiLo = [&](Value *XhYl, Value *X, Value *Y) {
1741 return match(XhYl,
1743 m_And(m_Specific(Y), m_SpecificInt(LowMask))));
1744 };
1745
1746 auto FoldMulHighCarry = [&](Value *X, Value *Y, Instruction *Carry,
1747 Instruction *B) {
1748 // Looking for LowSum >> 32 and carry (select)
1749 if (Carry->getOpcode() != Instruction::Select)
1750 std::swap(Carry, B);
1751
1752 // Carry = LowSum < XhYl ? 0x100000000 : 0
1753 Value *LowSum, *XhYl;
1754 if (!match(Carry,
1757 m_Value(XhYl))),
1759 m_Zero()))))
1760 return false;
1761
1762 // XhYl can be Xh*Yl or Xl*Yh
1763 if (!CheckHiLo(XhYl, X, Y)) {
1764 if (CheckHiLo(XhYl, Y, X))
1765 std::swap(X, Y);
1766 else
1767 return false;
1768 }
1769 if (XhYl->hasNUsesOrMore(3))
1770 return false;
1771
1772 // B = LowSum >> 32
1773 if (!match(B, m_OneUse(m_LShr(m_Specific(LowSum),
1774 m_SpecificInt(BitWidth / 2)))) ||
1775 LowSum->hasNUsesOrMore(3))
1776 return false;
1777
1778 // LowSum = XhYl + XlYh + XlYl>>32
1779 Value *XlYh, *XlYl;
1780 auto XlYlHi = m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2));
1781 if (!match(LowSum,
1782 m_c_Add(m_Specific(XhYl),
1783 m_OneUse(m_c_Add(m_OneUse(m_Value(XlYh)), XlYlHi)))) &&
1784 !match(LowSum, m_c_Add(m_OneUse(m_Value(XlYh)),
1785 m_OneUse(m_c_Add(m_Specific(XhYl), XlYlHi)))) &&
1786 !match(LowSum,
1787 m_c_Add(XlYlHi, m_OneUse(m_c_Add(m_Specific(XhYl),
1788 m_OneUse(m_Value(XlYh)))))))
1789 return false;
1790
1791 // Check XlYl and XlYh
1792 if (!CheckLoLo(XlYl, X, Y))
1793 return false;
1794 if (!CheckHiLo(XlYh, Y, X))
1795 return false;
1796
1797 return CreateMulHigh(X, Y);
1798 };
1799
1800 auto FoldMulHighLadder = [&](Value *X, Value *Y, Instruction *A,
1801 Instruction *B) {
1802 // xh*yh + c2>>32 + c3>>32
1803 // c2 = xh*yl + (xl*yl>>32); c3 = c2&0xffffffff + xl*yh
1804 // or c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32); c3 = xh*yl
1805 Value *XlYh, *XhYl, *XlYl, *C2, *C3;
1806 // Strip off the two expected shifts.
1807 if (!match(A, m_LShr(m_Value(C2), m_SpecificInt(BitWidth / 2))) ||
1809 return false;
1810
1811 if (match(C3, m_c_Add(m_Add(m_Value(), m_Value()), m_Value())))
1812 std::swap(C2, C3);
1813 // Try to match c2 = (xl*yh&0xffffffff) + xh*yl + (xl*yl>>32)
1814 if (match(C2,
1816 m_Value(XlYh)),
1817 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2)))) ||
1819 m_LShr(m_Value(XlYl),
1820 m_SpecificInt(BitWidth / 2))),
1821 m_Value(XlYh))) ||
1823 m_SpecificInt(BitWidth / 2)),
1824 m_Value(XlYh)),
1825 m_And(m_Specific(C3), m_SpecificInt(LowMask))))) {
1826 XhYl = C3;
1827 } else {
1828 // Match c3 = c2&0xffffffff + xl*yh
1829 if (!match(C3, m_c_Add(m_And(m_Specific(C2), m_SpecificInt(LowMask)),
1830 m_Value(XlYh))))
1831 std::swap(C2, C3);
1832 if (!match(C3, m_c_Add(m_OneUse(
1833 m_And(m_Specific(C2), m_SpecificInt(LowMask))),
1834 m_Value(XlYh))) ||
1835 !C3->hasOneUse() || C2->hasNUsesOrMore(3))
1836 return false;
1837
1838 // Match c2 = xh*yl + (xl*yl >> 32)
1839 if (!match(C2, m_c_Add(m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2)),
1840 m_Value(XhYl))))
1841 return false;
1842 }
1843
1844 // Match XhYl and XlYh - they can appear either way around.
1845 if (!CheckHiLo(XlYh, Y, X))
1846 std::swap(XlYh, XhYl);
1847 if (!CheckHiLo(XlYh, Y, X))
1848 return false;
1849 if (!CheckHiLo(XhYl, X, Y))
1850 return false;
1851 if (!CheckLoLo(XlYl, X, Y))
1852 return false;
1853
1854 return CreateMulHigh(X, Y);
1855 };
1856
1857 auto FoldMulHighLadder4 = [&](Value *X, Value *Y, Instruction *A,
1859 /// Ladder4: xh*yh + (xl*yh)>>32 + (xh+yl)>>32 + low>>32;
1860 /// low = (xl*yl)>>32 + (xl*yh)&0xffffffff + (xh*yl)&0xffffffff
1861
1862 // Find A = Low >> 32 and B/C = XhYl>>32, XlYh>>32.
1863 auto ShiftAdd =
1865 if (!match(A, ShiftAdd))
1866 std::swap(A, B);
1867 if (!match(A, ShiftAdd))
1868 std::swap(A, C);
1869 Value *Low;
1871 return false;
1872
1873 // Match B == XhYl>>32 and C == XlYh>>32
1874 Value *XhYl, *XlYh;
1875 if (!match(B, m_LShr(m_Value(XhYl), m_SpecificInt(BitWidth / 2))) ||
1876 !match(C, m_LShr(m_Value(XlYh), m_SpecificInt(BitWidth / 2))))
1877 return false;
1878 if (!CheckHiLo(XhYl, X, Y))
1879 std::swap(XhYl, XlYh);
1880 if (!CheckHiLo(XhYl, X, Y) || XhYl->hasNUsesOrMore(3))
1881 return false;
1882 if (!CheckHiLo(XlYh, Y, X) || XlYh->hasNUsesOrMore(3))
1883 return false;
1884
1885 // Match Low as XlYl>>32 + XhYl&0xffffffff + XlYh&0xffffffff
1886 Value *XlYl;
1887 if (!match(
1888 Low,
1889 m_c_Add(
1891 m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
1892 m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))),
1893 m_OneUse(
1894 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))) &&
1895 !match(
1896 Low,
1897 m_c_Add(
1899 m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))),
1900 m_OneUse(
1901 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))),
1902 m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))))) &&
1903 !match(
1904 Low,
1905 m_c_Add(
1907 m_OneUse(m_And(m_Specific(XlYh), m_SpecificInt(LowMask))),
1908 m_OneUse(
1909 m_LShr(m_Value(XlYl), m_SpecificInt(BitWidth / 2))))),
1910 m_OneUse(m_And(m_Specific(XhYl), m_SpecificInt(LowMask))))))
1911 return false;
1912 if (!CheckLoLo(XlYl, X, Y))
1913 return false;
1914
1915 return CreateMulHigh(X, Y);
1916 };
1917
1918 auto FoldMulHighCarry4 = [&](Value *X, Value *Y, Instruction *Carry,
1920 // xh*yh + carry + crosssum>>32 + (xl*yl + crosssum&0xffffffff) >> 32
1921 // crosssum = xh*yl+xl*yh
1922 // carry = crosssum < xh*yl ? 0x1000000 : 0
1923 if (Carry->getOpcode() != Instruction::Select)
1924 std::swap(Carry, B);
1925 if (Carry->getOpcode() != Instruction::Select)
1926 std::swap(Carry, C);
1927
1928 // Carry = CrossSum < XhYl ? 0x100000000 : 0
1929 Value *CrossSum, *XhYl;
1930 if (!match(Carry,
1933 m_Value(CrossSum), m_Value(XhYl))),
1935 m_Zero()))))
1936 return false;
1937
1938 if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BitWidth / 2))))
1939 std::swap(B, C);
1940 if (!match(B, m_LShr(m_Specific(CrossSum), m_SpecificInt(BitWidth / 2))))
1941 return false;
1942
1943 Value *XlYl, *LowAccum;
1944 if (!match(C, m_LShr(m_Value(LowAccum), m_SpecificInt(BitWidth / 2))) ||
1945 !match(LowAccum, m_c_Add(m_OneUse(m_LShr(m_Value(XlYl),
1946 m_SpecificInt(BitWidth / 2))),
1947 m_OneUse(m_And(m_Specific(CrossSum),
1948 m_SpecificInt(LowMask))))) ||
1949 LowAccum->hasNUsesOrMore(3))
1950 return false;
1951 if (!CheckLoLo(XlYl, X, Y))
1952 return false;
1953
1954 if (!CheckHiLo(XhYl, X, Y))
1955 std::swap(X, Y);
1956 if (!CheckHiLo(XhYl, X, Y))
1957 return false;
1958 Value *XlYh;
1959 if (!match(CrossSum, m_c_Add(m_Specific(XhYl), m_OneUse(m_Value(XlYh)))) ||
1960 !CheckHiLo(XlYh, Y, X) || CrossSum->hasNUsesOrMore(4) ||
1961 XhYl->hasNUsesOrMore(3))
1962 return false;
1963
1964 return CreateMulHigh(X, Y);
1965 };
1966
1967 // X and Y are the two inputs, A, B and C are other parts of the pattern
1968 // (crosssum>>32, carry, etc).
1969 Value *X, *Y;
1970 Instruction *A, *B, *C;
1971 auto HiHi = m_OneUse(m_Mul(m_LShr(m_Value(X), m_SpecificInt(BitWidth / 2)),
1973 if ((match(&I, m_c_Add(HiHi, m_OneUse(m_Add(m_Instruction(A),
1974 m_Instruction(B))))) ||
1976 m_OneUse(m_c_Add(HiHi, m_Instruction(B)))))) &&
1977 A->hasOneUse() && B->hasOneUse())
1978 if (FoldMulHighCarry(X, Y, A, B) || FoldMulHighLadder(X, Y, A, B))
1979 return true;
1980
1981 if ((match(&I, m_c_Add(HiHi, m_OneUse(m_c_Add(
1984 m_Instruction(C))))))) ||
1988 m_Instruction(C))))))) ||
1992 m_OneUse(m_c_Add(HiHi, m_Instruction(C))))))) ||
1993 match(&I,
1996 A->hasOneUse() && B->hasOneUse() && C->hasOneUse())
1997 return FoldMulHighCarry4(X, Y, A, B, C) ||
1998 FoldMulHighLadder4(X, Y, A, B, C);
1999
2000 return false;
2001}
2002
2003/// This is the entry point for folds that could be implemented in regular
2004/// InstCombine, but they are separated because they are not expected to
2005/// occur frequently and/or have more than a constant-length pattern match.
2009 AssumptionCache &AC, bool &MadeCFGChange) {
2010 bool MadeChange = false;
2011 for (BasicBlock &BB : F) {
2012 // Ignore unreachable basic blocks.
2013 if (!DT.isReachableFromEntry(&BB))
2014 continue;
2015
2016 const DataLayout &DL = F.getDataLayout();
2017
2018 // Walk the block backwards for efficiency. We're matching a chain of
2019 // use->defs, so we're more likely to succeed by starting from the bottom.
2020 // Also, we want to avoid matching partial patterns.
2021 // TODO: It would be more efficient if we removed dead instructions
2022 // iteratively in this loop rather than waiting until the end.
2024 MadeChange |= foldAnyOrAllBitsSet(I);
2025 MadeChange |= foldGuardedFunnelShift(I, DT);
2026 MadeChange |= tryToRecognizePopCount(I);
2027 MadeChange |= tryToFPToSat(I, TTI);
2028 MadeChange |= tryToRecognizeTableBasedCttz(I, DL);
2029 MadeChange |= tryToRecognizeTableBasedLog2(I, DL, TTI);
2030 MadeChange |= foldConsecutiveLoads(I, DL, TTI, AA, DT);
2031 MadeChange |= foldPatternedLoads(I, DL);
2032 MadeChange |= foldICmpOrChain(I, DL, TTI, AA, DT);
2033 MadeChange |= foldMulHigh(I);
2034 // NOTE: This function may erase the instruction `I`, so it must be
2035 // called last in this sequence; otherwise a later fold could touch a
2036 // deleted instruction.
2037 MadeChange |= foldLibCalls(I, TTI, TLI, AC, DT, DL, MadeCFGChange);
2038 }
2039
2040 // Do this separately to avoid redundantly scanning stores multiple times.
2041 MadeChange |= foldConsecutiveStores(BB, DL, TTI, AA);
2042 }
2043
2044 // We're done with transforms, so remove dead instructions.
2045 if (MadeChange)
2046 for (BasicBlock &BB : F)
2048
2049 return MadeChange;
2050}
2051
2052/// This is the entry point for all transforms. Pass manager differences are
2053/// handled in the callers of this function.
2056 AliasAnalysis &AA, bool &MadeCFGChange) {
2057 bool MadeChange = false;
2058 const DataLayout &DL = F.getDataLayout();
2059 TruncInstCombine TIC(AC, TLI, DL, DT);
2060 MadeChange |= TIC.run(F);
2061 MadeChange |= foldUnusualPatterns(F, DT, TTI, TLI, AA, AC, MadeCFGChange);
2062 return MadeChange;
2063}
2064
2067 auto &AC = AM.getResult<AssumptionAnalysis>(F);
2068 auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
2069 auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
2070 auto &TTI = AM.getResult<TargetIRAnalysis>(F);
2071 auto &AA = AM.getResult<AAManager>(F);
2072 bool MadeCFGChange = false;
2073 if (!runImpl(F, AC, TTI, TLI, DT, AA, MadeCFGChange)) {
2074 // No changes, all analyses are preserved.
2075 return PreservedAnalyses::all();
2076 }
2077 // Mark all the analyses that instcombine updates as preserved.
2079 if (MadeCFGChange)
2081 else
2083 return PA;
2084}
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
AMDGPU Register Bank Select
Rewrite undef for PHI
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
static bool tryToRecognizePopCount(Instruction &I)
static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT)
Try to replace a mathlib call to sqrt with the LLVM intrinsic.
static bool isLog2Table(Constant *Table, const APInt &Mul, const APInt &Shift, Type *AccessTy, unsigned InputBits, const APInt &GEPIdxFactor, const DataLayout &DL)
static bool foldAnyOrAllBitsSet(Instruction &I)
Match patterns that correspond to "any-bits-set" and "all-bits-set".
static cl::opt< unsigned > MemChrInlineThreshold("memchr-inline-threshold", cl::init(3), cl::Hidden, cl::desc("The maximum length of a constant string to " "inline a memchr call."))
static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI)
Fold smin(smax(fptosi(x), C1), C2) to llvm.fptosi.sat(x), providing C1 and C2 saturate the value of t...
static cl::opt< unsigned > StrNCmpInlineThreshold("strncmp-inline-threshold", cl::init(3), cl::Hidden, cl::desc("The maximum length of a constant string for a builtin string cmp " "call eligible for inlining. The default value is 3."))
static bool matchAndOrChain(Value *V, MaskOps &MOps)
This is a recursive helper for foldAnyOrAllBitsSet() that walks through a chain of 'and' or 'or' inst...
static bool foldMemChr(CallInst *Call, DomTreeUpdater *DTU, const DataLayout &DL)
Convert memchr with a small constant string into a switch.
static Value * optimizeShiftInOrChain(Value *V, IRBuilder<> &Builder)
Combine away instructions providing they are still equivalent when compared against 0.
static bool foldConsecutiveLoads(Instruction &I, const DataLayout &DL, TargetTransformInfo &TTI, AliasAnalysis &AA, const DominatorTree &DT)
static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT)
Match a pattern for a bitwise funnel/rotate operation that partially guards against undefined behavio...
static bool tryToRecognizeTableBasedCttz(Instruction &I, const DataLayout &DL)
static bool mergePartStores(SmallVectorImpl< PartStore > &Parts, const DataLayout &DL, TargetTransformInfo &TTI)
static bool foldLoadsRecursive(Value *V, LoadOps &LOps, const DataLayout &DL, AliasAnalysis &AA, bool IsRoot=false)
static bool mergeConsecutivePartStores(ArrayRef< PartStore > Parts, unsigned Width, const DataLayout &DL, TargetTransformInfo &TTI)
static cl::opt< unsigned > MaxInstrsToScan("aggressive-instcombine-max-scan-instrs", cl::init(64), cl::Hidden, cl::desc("Max number of instructions to scan for aggressive instcombine."))
static bool foldICmpOrChain(Instruction &I, const DataLayout &DL, TargetTransformInfo &TTI, AliasAnalysis &AA, const DominatorTree &DT)
static bool isCTTZTable(Constant *Table, const APInt &Mul, const APInt &Shift, const APInt &AndMask, Type *AccessTy, unsigned InputBits, const APInt &GEPIdxFactor, const DataLayout &DL)
static std::optional< PartStore > matchPartStore(Instruction &I, const DataLayout &DL)
static bool foldConsecutiveStores(BasicBlock &BB, const DataLayout &DL, TargetTransformInfo &TTI, AliasAnalysis &AA)
static std::pair< APInt, APInt > getStrideAndModOffsetOfGEP(Value *PtrOp, const DataLayout &DL)
static bool foldPatternedLoads(Instruction &I, const DataLayout &DL)
If C is a constant patterned array and all valid loaded results for given alignment are same to a con...
static bool tryToRecognizeTableBasedLog2(Instruction &I, const DataLayout &DL, TargetTransformInfo &TTI)
static bool foldLibCalls(Instruction &I, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AssumptionCache &AC, DominatorTree &DT, const DataLayout &DL, bool &MadeCFGChange)
static bool foldMulHigh(Instruction &I)
Match high part of long multiplication.
static bool foldUnusualPatterns(Function &F, DominatorTree &DT, TargetTransformInfo &TTI, TargetLibraryInfo &TLI, AliasAnalysis &AA, AssumptionCache &AC, bool &MadeCFGChange)
This is the entry point for folds that could be implemented in regular InstCombine,...
AggressiveInstCombiner - Combine expression patterns to form expressions with fewer,...
This is the interface for LLVM's primary stateless and local alias analysis.
#define X(NUM, ENUM, NAME)
Definition ELF.h:849
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static bool runImpl(Function &F, const TargetLowering &TLI, const LibcallLoweringInfo &Libcalls, AssumptionCache *AC)
#define DEBUG_TYPE
This is the interface for a simple mod/ref and alias analysis over globals.
Hexagon Common GEP
static MaybeAlign getAlign(Value *Ptr)
static Instruction * matchFunnelShift(Instruction &Or, InstCombinerImpl &IC)
Match UB-safe variants of the funnel shift intrinsic.
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
uint64_t High
This file contains the declarations for profiling metadata utility functions.
static const MCExpr * MaskShift(const MCExpr *Val, uint32_t Mask, uint32_t Shift, MCContext &Ctx)
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
#define LLVM_DEBUG(...)
Definition Debug.h:114
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
This pass exposes codegen information to IR-level passes.
Value * RHS
Value * LHS
BinaryOperator * Mul
A manager for alias analyses.
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1555
void setBit(unsigned BitPosition)
Set the given bit to 1 whose position is given as "bitPosition".
Definition APInt.h:1345
bool isZero() const
Determine if this value is zero, i.e. all bits are clear.
Definition APInt.h:381
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1503
bool isNegative() const
Determine sign of this APInt.
Definition APInt.h:330
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:651
LLVM_ABI APInt srem(const APInt &RHS) const
Function for signed remainder operation.
Definition APInt.cpp:1776
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
bool slt(const APInt &RHS) const
Signed less than comparison.
Definition APInt.h:1137
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM)
PassT::Result & getResult(IRUnitT &IR, ExtraArgTs... ExtraArgs)
Get the result of an analysis pass for a given IR unit.
ArrayRef - Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
const T & front() const
front - Get the first element.
Definition ArrayRef.h:145
size_t size() const
size - Get the array size.
Definition ArrayRef.h:142
A function analysis which provides an AssumptionCache.
A cache of @llvm.assume calls within a function.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
iterator begin()
Instruction iterator methods.
Definition BasicBlock.h:461
LLVM_ABI const_iterator getFirstInsertionPt() const
Returns an iterator to the first instruction in this block that is suitable for inserting a non-PHI i...
const Function * getParent() const
Return the enclosing method, or null if none.
Definition BasicBlock.h:213
static BasicBlock * Create(LLVMContext &Context, const Twine &Name="", Function *Parent=nullptr, BasicBlock *InsertBefore=nullptr)
Creates a new BasicBlock.
Definition BasicBlock.h:206
const Instruction * getTerminator() const LLVM_READONLY
Returns the terminator instruction; assumes that the block is well-formed.
Definition BasicBlock.h:237
This class is a wrapper over an AAResults, and it is intended to be used only when there are no IR ch...
ModRefInfo getModRefInfo(const Instruction *I, const std::optional< MemoryLocation > &OptLoc)
Represents analyses that only rely on functions' control flow.
Definition Analysis.h:73
Value * getArgOperand(unsigned i) const
This class represents a function call, abstracting a target machine's calling convention.
@ ICMP_ULT
unsigned less than
Definition InstrTypes.h:701
An abstraction over a floating-point predicate, and a pack of an integer predicate with samesign info...
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
This is an important base class in LLVM.
Definition Constant.h:43
static LLVM_ABI Constant * getNullValue(Type *Ty)
Constructor to create a '0' constant of arbitrary type.
A parsed version of the target data layout string in and methods for querying it.
Definition DataLayout.h:64
static LLVM_ABI DebugLoc getMergedLocations(ArrayRef< DebugLoc > Locs)
Try to combine the vector of locations passed as input in a single one.
Definition DebugLoc.cpp:166
Analysis pass which computes a DominatorTree.
Definition Dominators.h:278
Concrete subclass of DominatorTreeBase that is used to compute a normal dominator tree.
Definition Dominators.h:159
LLVM_ABI bool isReachableFromEntry(const Use &U) const
Provide an overload for a Use.
LLVM_ABI bool dominates(const BasicBlock *BB, const Use &U) const
Return true if the (end of the) basic block BB dominates the use U.
void applyUpdates(ArrayRef< UpdateT > Updates)
Submit updates to all available trees.
an instruction for type-safe pointer arithmetic to access elements of arrays and structs
const Constant * getInitializer() const
getInitializer - Return the initializer for this global variable.
bool hasInitializer() const
Definitions have initializers, declarations don't.
bool isConstant() const
If the value is a global constant, its value is immutable throughout the runtime execution of the pro...
static bool isEquality(Predicate P)
Return true if this predicate is either EQ or NE.
void SetCurrentDebugLocation(DebugLoc L)
Set location information used by debugging information.
Definition IRBuilder.h:247
UncondBrInst * CreateBr(BasicBlock *Dest)
Create an unconditional 'br label X' instruction.
Definition IRBuilder.h:1217
PHINode * CreatePHI(Type *Ty, unsigned NumReservedValues, const Twine &Name="")
Definition IRBuilder.h:2496
SwitchInst * CreateSwitch(Value *V, BasicBlock *Dest, unsigned NumCases=10, MDNode *BranchWeights=nullptr, MDNode *Unpredictable=nullptr)
Create a switch instruction with the specified value, default dest, and with a hint for the number of...
Definition IRBuilder.h:1246
Value * CreateTrunc(Value *V, Type *DestTy, const Twine &Name="", bool IsNUW=false, bool IsNSW=false)
Definition IRBuilder.h:2063
void SetInsertPoint(BasicBlock *TheBB)
This specifies that created instructions should be appended to the end of the specified block.
Definition IRBuilder.h:207
Value * CreateInBoundsPtrAdd(Value *Ptr, Value *Offset, const Twine &Name="")
Definition IRBuilder.h:2053
IntegerType * getInt8Ty()
Fetch the type representing an 8-bit integer.
Definition IRBuilder.h:569
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2811
const DebugLoc & getDebugLoc() const
Return the debug location for this node as a DebugLoc.
LLVM_ABI void setAAMetadata(const AAMDNodes &N)
Sets the AA metadata on this instruction from the AAMDNodes structure.
LLVM_ABI InstListType::iterator eraseFromParent()
This method unlinks 'this' from the containing basic block and deletes it.
LLVM_ABI const Function * getFunction() const
Return the function this instruction belongs to.
LLVM_ABI AAMDNodes getAAMetadata() const
Returns the AA metadata for this instruction.
Class to represent integer types.
static LLVM_ABI IntegerType * get(LLVMContext &C, unsigned NumBits)
This static method is the primary way of constructing an IntegerType.
Definition Type.cpp:354
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
An instruction for reading from memory.
unsigned getPointerAddressSpace() const
Returns the address space of the pointer operand.
Value * getPointerOperand()
bool isSimple() const
static LocationSize precise(uint64_t Value)
LLVM_ABI MDNode * createUnlikelyBranchWeights()
Return metadata containing two branch weights, with significant bias towards false destination.
Definition MDBuilder.cpp:48
size_type size() const
Definition MapVector.h:56
std::pair< KeyT, ValueT > & front()
Definition MapVector.h:79
Representation for a specific memory location.
static LLVM_ABI MemoryLocation get(const LoadInst *LI)
Return a location with information about the memory reference by the given instruction.
static MemoryLocation getBeforeOrAfter(const Value *Ptr, const AAMDNodes &AATags=AAMDNodes())
Return a location that may access any location before or after Ptr, while remaining within the underl...
void addIncoming(Value *V, BasicBlock *BB)
Add an incoming value to the end of the PHI list.
static PHINode * Create(Type *Ty, unsigned NumReservedValues, const Twine &NameStr="", InsertPosition InsertBefore=nullptr)
Constructors - NumReservedValues is a hint for the number of incoming edges that this phi node will h...
static LLVM_ABI PoisonValue * get(Type *T)
Static factory methods - Return an 'poison' object of the specified type.
A set of analyses that are preserved following a run of a transformation pass.
Definition Analysis.h:112
static PreservedAnalyses all()
Construct a special preserved set that preserves all passes.
Definition Analysis.h:118
PreservedAnalyses & preserveSet()
Mark an analysis set as preserved.
Definition Analysis.h:151
PreservedAnalyses & preserve()
Mark an analysis as preserved.
Definition Analysis.h:132
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
void reserve(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
StringRef - Represent a constant reference to a string, i.e.
Definition StringRef.h:55
static constexpr size_t npos
Definition StringRef.h:57
Multiway switch.
Analysis pass providing the TargetTransformInfo.
Analysis pass providing the TargetLibraryInfo.
Provides information about what library functions are available for the current target.
bool getLibFunc(StringRef funcName, LibFunc &F) const
Searches for a particular function name.
This pass provides access to the codegen interfaces that are needed for IR-level transformations.
@ None
The insert/extract is not used with a load/store.
@ TCK_RecipThroughput
Reciprocal throughput.
@ TCK_SizeAndLatency
The weighted sum of size and latency.
@ TCC_Basic
The cost of a typical 'add' instruction.
bool run(Function &F)
Perform TruncInst pattern optimization on given function.
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:201
LLVM_ABI Type * getWithNewBitWidth(unsigned NewBitWidth) const
Given an integer or vector type, change the lane bitwidth to NewBitwidth, whilst keeping the old numb...
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:236
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
static LLVM_ABI IntegerType * getIntNTy(LLVMContext &C, unsigned N)
Definition Type.cpp:317
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:256
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:440
LLVM_ABI void replaceAllUsesWith(Value *V)
Change all uses of this to point to a new Value.
Definition Value.cpp:553
LLVMContext & getContext() const
All values hold a context through their type.
Definition Value.h:259
LLVM_ABI bool hasNUsesOrMore(unsigned N) const
Return true if this value has N uses or more.
Definition Value.cpp:158
LLVM_ABI const Value * stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset, bool AllowNonInbounds, bool AllowInvariantGroup=false, function_ref< bool(Value &Value, APInt &Offset)> ExternalAnalysis=nullptr, bool LookThroughIntToPtr=false) const
Accumulate the constant offset this value has compared to a base pointer.
LLVM_ABI uint64_t getPointerDereferenceableBytes(const DataLayout &DL, bool &CanBeNull, bool &CanBeFreed) const
Returns the number of bytes known to be dereferenceable for the pointer value.
Definition Value.cpp:893
LLVM_ABI StringRef getName() const
Return a constant reference to the value's name.
Definition Value.cpp:322
LLVM_ABI void takeName(Value *V)
Transfer the name from V to this value.
Definition Value.cpp:403
static LLVM_ABI VectorType * get(Type *ElementType, ElementCount EC)
This static method is the primary way to construct an VectorType.
const ParentTy * getParent() const
Definition ilist_node.h:34
CallInst * Call
Changed
#define UINT64_MAX
Definition DataTypes.h:77
Abstract Attribute helper functions.
Definition Attributor.h:165
LLVM_ABI APInt GreatestCommonDivisor(APInt A, APInt B)
Compute GCD of two unsigned APInt values.
Definition APInt.cpp:818
@ Fast
Attempts to make calls as fast as possible (e.g.
Definition CallingConv.h:41
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
SpecificConstantMatch m_ZeroInt()
Convenience matchers for specific integer values.
BinaryOp_match< SpecificConstantMatch, SrcTy, TargetOpcode::G_SUB > m_Neg(const SrcTy &&Src)
Matches a register negated by a G_SUB.
OneUse_match< SubPat > m_OneUse(const SubPat &SP)
BinaryOp_match< LHS, RHS, Instruction::And > m_And(const LHS &L, const RHS &R)
ShiftLike_match< LHS, Instruction::LShr > m_LShrOrSelf(const LHS &L, uint64_t &R)
Matches lshr L, ConstShAmt or L itself (R will be set to zero in this case).
BinaryOp_match< LHS, RHS, Instruction::Add > m_Add(const LHS &L, const RHS &R)
match_combine_or< CastInst_match< OpTy, CastInst >, OpTy > m_CastOrSelf(const OpTy &Op)
Matches any cast or self. Used to ignore casts.
ap_match< APInt > m_APInt(const APInt *&Res)
Match a ConstantInt or splatted ConstantVector, binding the specified pointer to the contained APInt.
BinaryOp_match< LHS, RHS, Instruction::And, true > m_c_And(const LHS &L, const RHS &R)
Matches an And with LHS and RHS in either order.
CastInst_match< OpTy, TruncInst > m_Trunc(const OpTy &Op)
Matches Trunc.
specific_intval< false > m_SpecificInt(const APInt &V)
Match a specific integer value or vector with all elements equal to the value.
bool match(Val *V, const Pattern &P)
bind_ty< Instruction > m_Instruction(Instruction *&I)
Match an instruction, capturing it if we match.
specificval_ty m_Specific(const Value *V)
Match if we have a specific specified value.
cst_pred_ty< is_one > m_One()
Match an integer 1 or a vector with all elements equal to 1.
ThreeOps_match< Cond, LHS, RHS, Instruction::Select > m_Select(const Cond &C, const LHS &L, const RHS &R)
Matches SelectInst.
MaxMin_match< ICmpInst, LHS, RHS, smin_pred_ty > m_SMin(const LHS &L, const RHS &R)
ShiftLike_match< LHS, Instruction::Shl > m_ShlOrSelf(const LHS &L, uint64_t &R)
Matches shl L, ConstShAmt or L itself (R will be set to zero in this case).
BinaryOp_match< LHS, RHS, Instruction::Mul > m_Mul(const LHS &L, const RHS &R)
deferredval_ty< Value > m_Deferred(Value *const &V)
Like m_Specific(), but works if the specific value to match is determined as part of the same match()...
specific_bbval m_SpecificBB(BasicBlock *BB)
Match a specific basic block value.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoSignedWrap > m_NSWShl(const LHS &L, const RHS &R)
SpecificCmpClass_match< LHS, RHS, ICmpInst > m_SpecificICmp(CmpPredicate MatchPred, const LHS &L, const RHS &R)
CastInst_match< OpTy, ZExtInst > m_ZExt(const OpTy &Op)
Matches ZExt.
OverflowingBinaryOp_match< LHS, RHS, Instruction::Shl, OverflowingBinaryOperator::NoUnsignedWrap > m_NUWShl(const LHS &L, const RHS &R)
brc_match< Cond_t, bind_ty< BasicBlock >, bind_ty< BasicBlock > > m_Br(const Cond_t &C, BasicBlock *&T, BasicBlock *&F)
BinaryOp_match< LHS, RHS, Instruction::Add, true > m_c_Add(const LHS &L, const RHS &R)
Matches a Add with LHS and RHS in either order.
CastInst_match< OpTy, FPToSIInst > m_FPToSI(const OpTy &Op)
MaxMin_match< ICmpInst, LHS, RHS, smax_pred_ty > m_SMax(const LHS &L, const RHS &R)
class_match< Value > m_Value()
Match an arbitrary value and ignore it.
BinaryOp_match< LHS, RHS, Instruction::LShr > m_LShr(const LHS &L, const RHS &R)
CmpClass_match< LHS, RHS, ICmpInst > m_ICmp(CmpPredicate &Pred, const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Shl > m_Shl(const LHS &L, const RHS &R)
BinaryOp_match< LHS, RHS, Instruction::Or > m_Or(const LHS &L, const RHS &R)
is_zero m_Zero()
Match any null constant or a vector with all elements equal to 0.
BinaryOp_match< LHS, RHS, Instruction::Or, true > m_c_Or(const LHS &L, const RHS &R)
Matches an Or with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Mul, true > m_c_Mul(const LHS &L, const RHS &R)
Matches a Mul with LHS and RHS in either order.
BinaryOp_match< LHS, RHS, Instruction::Sub > m_Sub(const LHS &L, const RHS &R)
match_combine_or< LTy, RTy > m_CombineOr(const LTy &L, const RTy &R)
Combine two pattern matchers matching L || R.
initializer< Ty > init(const Ty &Val)
NodeAddr< PhiNode * > Phi
Definition RDFGraph.h:390
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:316
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
FunctionAddr VTableAddr Value
Definition InstrProf.h:137
cl::opt< bool > ProfcheckDisableMetadataFixes
Definition LoopInfo.cpp:60
LLVM_ABI void setExplicitlyUnknownBranchWeightsIfProfiled(Instruction &I, StringRef PassName, const Function *F=nullptr)
Like setExplicitlyUnknownBranchWeights(...), but only sets unknown branch weights in the new instruct...
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
LLVM_ABI bool isOnlyUsedInZeroComparison(const Instruction *CxtI)
LLVM_ABI bool getConstantStringInfo(const Value *V, StringRef &Str, bool TrimAtNul=true)
This function computes the length of a null-terminated C string pointed to by V.
iterator_range< T > make_range(T x, T y)
Convenience function for iterating over sub-ranges.
iterator_range< early_inc_iterator_impl< detail::IterOfRange< RangeT > > > make_early_inc_range(RangeT &&Range)
Make a range that does early increment to allow mutation of the underlying range without disrupting i...
Definition STLExtras.h:634
LLVM_ABI bool SimplifyInstructionsInBlock(BasicBlock *BB, const TargetLibraryInfo *TLI=nullptr)
Scan the specified basic block and try to simplify any instructions in it and recursively delete dead...
Definition Local.cpp:723
LLVM_ABI void setExplicitlyUnknownBranchWeights(Instruction &I, StringRef PassName)
Specify that the branch weights for this terminator cannot be known at compile time.
LLVM_ABI bool MaskedValueIsZero(const Value *V, const APInt &Mask, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if 'V & Mask' is known to be zero.
LLVM_ABI bool isLibFuncEmittable(const Module *M, const TargetLibraryInfo *TLI, LibFunc TheLibFunc)
Check whether the library function is available on target and also that it in the current Module is a...
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
auto reverse(ContainerTy &&C)
Definition STLExtras.h:408
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool isModSet(const ModRefInfo MRI)
Definition ModRef.h:49
void sort(IteratorTy Start, IteratorTy End)
Definition STLExtras.h:1636
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:207
bool isModOrRefSet(const ModRefInfo MRI)
Definition ModRef.h:43
LLVM_ABI Constant * ConstantFoldLoadFromConst(Constant *C, Type *Ty, const APInt &Offset, const DataLayout &DL)
Extract value of C at the given Offset reinterpreted as Ty.
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
@ Other
Any other memory.
Definition ModRef.h:68
@ First
Helpers to iterate all locations in the MemoryEffectsBase class.
Definition ModRef.h:74
TargetTransformInfo TTI
IRBuilder(LLVMContext &, FolderTy, InserterTy, MDNode *, ArrayRef< OperandBundleDef >) -> IRBuilder< FolderTy, InserterTy >
@ Sub
Subtraction of integers.
LLVM_ABI BasicBlock * SplitBlock(BasicBlock *Old, BasicBlock::iterator SplitPt, DominatorTree *DT, LoopInfo *LI=nullptr, MemorySSAUpdater *MSSAU=nullptr, const Twine &BBName="")
Split the specified block at the specified instruction.
ArrayRef(const T &OneElt) -> ArrayRef< T >
constexpr unsigned BitWidth
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
AnalysisManager< Function > FunctionAnalysisManager
Convenience typedef for the Function analysis manager.
LLVM_ABI bool isGuaranteedNotToBePoison(const Value *V, AssumptionCache *AC=nullptr, const Instruction *CtxI=nullptr, const DominatorTree *DT=nullptr, unsigned Depth=0)
Returns true if V cannot be poison, but may be undef.
LLVM_ABI const Value * getUnderlyingObject(const Value *V, unsigned MaxLookup=MaxLookupSearchDepth)
This method strips off any GEP address adjustments, pointer casts or llvm.threadlocal....
AAResults AliasAnalysis
Temporary typedef for legacy code that uses a generic AliasAnalysis pointer or reference.
LLVM_ABI bool cannotBeOrderedLessThanZero(const Value *V, const SimplifyQuery &SQ, unsigned Depth=0)
Return true if we can prove that the specified FP value is either NaN or never less than -0....
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:872
#define N
This is used by foldLoadsRecursive() to capture a Root Load node which is of type or(load,...
LoadInst * RootInsert
ValWidth bits starting at ValOffset of Val stored at PtrBase+PtrOffset.
bool operator<(const PartStore &Other) const
bool isCompatibleWith(const PartStore &Other) const
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
LLVM_ABI AAMDNodes concat(const AAMDNodes &Other) const
Determine the best AAMDNodes after concatenating two different locations together.
Matching combinators.
A MapVector that performs no allocations if smaller than a certain size.
Definition MapVector.h:276