LLVM 23.0.0git
ARMISelLowering.cpp
Go to the documentation of this file.
1//===- ARMISelLowering.cpp - ARM DAG Lowering Implementation --------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This file defines the interfaces that ARM uses to lower LLVM code into a
10// selection DAG.
11//
12//===----------------------------------------------------------------------===//
13
14#include "ARMISelLowering.h"
15#include "ARMBaseInstrInfo.h"
16#include "ARMBaseRegisterInfo.h"
17#include "ARMCallingConv.h"
20#include "ARMPerfectShuffle.h"
21#include "ARMRegisterInfo.h"
22#include "ARMSelectionDAGInfo.h"
23#include "ARMSubtarget.h"
27#include "Utils/ARMBaseInfo.h"
28#include "llvm/ADT/APFloat.h"
29#include "llvm/ADT/APInt.h"
30#include "llvm/ADT/ArrayRef.h"
31#include "llvm/ADT/BitVector.h"
32#include "llvm/ADT/DenseMap.h"
33#include "llvm/ADT/STLExtras.h"
36#include "llvm/ADT/Statistic.h"
38#include "llvm/ADT/StringRef.h"
40#include "llvm/ADT/Twine.h"
66#include "llvm/IR/Attributes.h"
67#include "llvm/IR/CallingConv.h"
68#include "llvm/IR/Constant.h"
69#include "llvm/IR/Constants.h"
70#include "llvm/IR/DataLayout.h"
71#include "llvm/IR/DebugLoc.h"
73#include "llvm/IR/Function.h"
74#include "llvm/IR/GlobalAlias.h"
75#include "llvm/IR/GlobalValue.h"
77#include "llvm/IR/IRBuilder.h"
78#include "llvm/IR/InlineAsm.h"
79#include "llvm/IR/Instruction.h"
82#include "llvm/IR/Intrinsics.h"
83#include "llvm/IR/IntrinsicsARM.h"
84#include "llvm/IR/Module.h"
85#include "llvm/IR/Type.h"
86#include "llvm/IR/User.h"
87#include "llvm/IR/Value.h"
88#include "llvm/MC/MCInstrDesc.h"
90#include "llvm/MC/MCSchedule.h"
97#include "llvm/Support/Debug.h"
105#include <algorithm>
106#include <cassert>
107#include <cstdint>
108#include <cstdlib>
109#include <iterator>
110#include <limits>
111#include <optional>
112#include <tuple>
113#include <utility>
114#include <vector>
115
116using namespace llvm;
117
118#define DEBUG_TYPE "arm-isel"
119
120STATISTIC(NumTailCalls, "Number of tail calls");
121STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
122STATISTIC(NumMovwMovt, "Number of GAs materialized with movw + movt");
123STATISTIC(NumLoopByVals, "Number of loops generated for byval arguments");
124STATISTIC(NumConstpoolPromoted,
125 "Number of constants with their storage promoted into constant pools");
126
127static cl::opt<bool>
128ARMInterworking("arm-interworking", cl::Hidden,
129 cl::desc("Enable / disable ARM interworking (for debugging only)"),
130 cl::init(true));
131
133 "arm-promote-constant", cl::Hidden,
134 cl::desc("Enable / disable promotion of unnamed_addr constants into "
135 "constant pools"),
136 cl::init(false)); // FIXME: set to true by default once PR32780 is fixed
138 "arm-promote-constant-max-size", cl::Hidden,
139 cl::desc("Maximum size of constant to promote into a constant pool"),
140 cl::init(64));
142 "arm-promote-constant-max-total", cl::Hidden,
143 cl::desc("Maximum size of ALL constants to promote into a constant pool"),
144 cl::init(128));
145
147MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden,
148 cl::desc("Maximum interleave factor for MVE VLDn to generate."),
149 cl::init(2));
150
152 "arm-max-base-updates-to-check", cl::Hidden,
153 cl::desc("Maximum number of base-updates to check generating postindex."),
154 cl::init(64));
155
156/// Value type used for "flags" operands / results (either CPSR or FPSCR_NZCV).
157constexpr MVT FlagsVT = MVT::i32;
158
159// The APCS parameter registers.
160static const MCPhysReg GPRArgRegs[] = {
161 ARM::R0, ARM::R1, ARM::R2, ARM::R3
162};
163
165 SelectionDAG &DAG, const SDLoc &DL) {
167 assert(Arg.ArgVT.bitsLT(MVT::i32));
168 SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, Arg.ArgVT, Value);
169 SDValue Ext =
171 MVT::i32, Trunc);
172 return Ext;
173}
174
175void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT) {
176 if (VT != PromotedLdStVT) {
178 AddPromotedToType (ISD::LOAD, VT, PromotedLdStVT);
179
181 AddPromotedToType (ISD::STORE, VT, PromotedLdStVT);
182 }
183
184 MVT ElemTy = VT.getVectorElementType();
185 if (ElemTy != MVT::f64)
189 if (ElemTy == MVT::i32) {
194 } else {
199 }
208 if (VT.isInteger()) {
212 }
213
214 // Neon does not support vector divide/remainder operations.
223
224 if (!VT.isFloatingPoint() && VT != MVT::v2i64 && VT != MVT::v1i64)
225 for (auto Opcode : {ISD::ABS, ISD::ABDS, ISD::ABDU, ISD::SMIN, ISD::SMAX,
227 setOperationAction(Opcode, VT, Legal);
228 if (!VT.isFloatingPoint())
229 for (auto Opcode : {ISD::SADDSAT, ISD::UADDSAT, ISD::SSUBSAT, ISD::USUBSAT})
230 setOperationAction(Opcode, VT, Legal);
231}
232
233void ARMTargetLowering::addDRTypeForNEON(MVT VT) {
234 addRegisterClass(VT, &ARM::DPRRegClass);
235 addTypeForNEON(VT, MVT::f64);
236}
237
238void ARMTargetLowering::addQRTypeForNEON(MVT VT) {
239 addRegisterClass(VT, &ARM::DPairRegClass);
240 addTypeForNEON(VT, MVT::v2f64);
241}
242
243void ARMTargetLowering::setAllExpand(MVT VT) {
244 for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
246
247 // We support these really simple operations even on types where all
248 // the actual arithmetic has to be broken down into simpler
249 // operations or turned into library calls.
254}
255
256void ARMTargetLowering::addAllExtLoads(const MVT From, const MVT To,
257 LegalizeAction Action) {
258 setLoadExtAction(ISD::EXTLOAD, From, To, Action);
259 setLoadExtAction(ISD::ZEXTLOAD, From, To, Action);
260 setLoadExtAction(ISD::SEXTLOAD, From, To, Action);
261}
262
263void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
264 const MVT IntTypes[] = { MVT::v16i8, MVT::v8i16, MVT::v4i32 };
265
266 for (auto VT : IntTypes) {
267 addRegisterClass(VT, &ARM::MQPRRegClass);
298
299 // No native support for these.
309
310 // Vector reductions
320
321 if (!HasMVEFP) {
326 } else {
329 }
330
331 // Pre and Post inc are supported on loads and stores
332 for (unsigned im = (unsigned)ISD::PRE_INC;
333 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
338 }
339 }
340
341 const MVT FloatTypes[] = { MVT::v8f16, MVT::v4f32 };
342 for (auto VT : FloatTypes) {
343 addRegisterClass(VT, &ARM::MQPRRegClass);
344 if (!HasMVEFP)
345 setAllExpand(VT);
346
347 // These are legal or custom whether we have MVE.fp or not
360
361 // Pre and Post inc are supported on loads and stores
362 for (unsigned im = (unsigned)ISD::PRE_INC;
363 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
368 }
369
370 if (HasMVEFP) {
378 }
383
384 // No native support for these.
399 }
400 }
401
402 // Custom Expand smaller than legal vector reductions to prevent false zero
403 // items being added.
412
413 // We 'support' these types up to bitcast/load/store level, regardless of
414 // MVE integer-only / float support. Only doing FP data processing on the FP
415 // vector types is inhibited at integer-only level.
416 const MVT LongTypes[] = { MVT::v2i64, MVT::v2f64 };
417 for (auto VT : LongTypes) {
418 addRegisterClass(VT, &ARM::MQPRRegClass);
419 setAllExpand(VT);
425 }
427
428 // We can do bitwise operations on v2i64 vectors
429 setOperationAction(ISD::AND, MVT::v2i64, Legal);
430 setOperationAction(ISD::OR, MVT::v2i64, Legal);
431 setOperationAction(ISD::XOR, MVT::v2i64, Legal);
432
433 // It is legal to extload from v8i8 to v8i16, and from v4i8 or v4i16 to v4i32.
434 addAllExtLoads(MVT::v8i16, MVT::v8i8, Legal);
435 addAllExtLoads(MVT::v4i32, MVT::v4i16, Legal);
436 addAllExtLoads(MVT::v4i32, MVT::v4i8, Legal);
437
438 // It is legal to sign extend from v4i8/v4i16 to v4i32 or v8i8 to v8i16.
444
445 // Some truncating stores are legal too.
446 setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
447 setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
448 setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
449
450 // Pre and Post inc on these are legal, given the correct extends
451 for (unsigned im = (unsigned)ISD::PRE_INC;
452 im != (unsigned)ISD::LAST_INDEXED_MODE; ++im) {
453 for (auto VT : {MVT::v8i8, MVT::v4i8, MVT::v4i16}) {
458 }
459 }
460
461 // Predicate types
462 const MVT pTypes[] = {MVT::v16i1, MVT::v8i1, MVT::v4i1, MVT::v2i1};
463 for (auto VT : pTypes) {
464 addRegisterClass(VT, &ARM::VCCRRegClass);
479
480 if (!HasMVEFP) {
485 }
486 }
490 setOperationAction(ISD::OR, MVT::v2i1, Expand);
496
505}
506
508 return static_cast<const ARMBaseTargetMachine &>(getTargetMachine());
509}
510
512 const ARMSubtarget &STI)
513 : TargetLowering(TM_, STI), Subtarget(&STI),
514 RegInfo(Subtarget->getRegisterInfo()),
515 Itins(Subtarget->getInstrItineraryData()) {
516 const auto &TM = static_cast<const ARMBaseTargetMachine &>(TM_);
517
520
521 const Triple &TT = TM.getTargetTriple();
522
523 if (Subtarget->isThumb1Only())
524 addRegisterClass(MVT::i32, &ARM::tGPRRegClass);
525 else
526 addRegisterClass(MVT::i32, &ARM::GPRRegClass);
527
528 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only() &&
529 Subtarget->hasFPRegs()) {
530 addRegisterClass(MVT::f32, &ARM::SPRRegClass);
531 addRegisterClass(MVT::f64, &ARM::DPRRegClass);
532
533 if (!Subtarget->hasVFP2Base()) {
534 setAllExpand(MVT::f32);
535 } else {
538
541 setOperationAction(Op, MVT::f32, Legal);
542 }
543 if (!Subtarget->hasFP64()) {
544 setAllExpand(MVT::f64);
545 } else {
548 setOperationAction(Op, MVT::f64, Legal);
549
551 }
552 }
553
554 if (Subtarget->hasFullFP16()) {
557 setOperationAction(Op, MVT::f16, Legal);
558
559 addRegisterClass(MVT::f16, &ARM::HPRRegClass);
562
567 }
568
569 if (Subtarget->hasBF16()) {
570 addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
571 setAllExpand(MVT::bf16);
572 if (!Subtarget->hasFullFP16())
576 } else {
581 }
582
584 for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
585 setTruncStoreAction(VT, InnerVT, Expand);
586 addAllExtLoads(VT, InnerVT, Expand);
587 }
588
591
593 }
594
595 if (!Subtarget->isThumb1Only() && !Subtarget->hasV8_1MMainlineOps())
597
598 if (!Subtarget->hasV8_1MMainlineOps())
600
601 if (!Subtarget->isThumb1Only())
603
606
609
610 if (Subtarget->hasMVEIntegerOps())
611 addMVEVectorTypes(Subtarget->hasMVEFloatOps());
612
613 // Combine low-overhead loop intrinsics so that we can lower i1 types.
614 if (Subtarget->hasLOB()) {
616 }
617
618 if (Subtarget->hasNEON()) {
619 addDRTypeForNEON(MVT::v2f32);
620 addDRTypeForNEON(MVT::v8i8);
621 addDRTypeForNEON(MVT::v4i16);
622 addDRTypeForNEON(MVT::v2i32);
623 addDRTypeForNEON(MVT::v1i64);
624
625 addQRTypeForNEON(MVT::v4f32);
626 addQRTypeForNEON(MVT::v2f64);
627 addQRTypeForNEON(MVT::v16i8);
628 addQRTypeForNEON(MVT::v8i16);
629 addQRTypeForNEON(MVT::v4i32);
630 addQRTypeForNEON(MVT::v2i64);
631
632 if (Subtarget->hasFullFP16()) {
633 addQRTypeForNEON(MVT::v8f16);
634 addDRTypeForNEON(MVT::v4f16);
635 }
636
637 if (Subtarget->hasBF16()) {
638 addQRTypeForNEON(MVT::v8bf16);
639 addDRTypeForNEON(MVT::v4bf16);
640 }
641 }
642
643 if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
644 // v2f64 is legal so that QR subregs can be extracted as f64 elements, but
645 // none of Neon, MVE or VFP supports any arithmetic operations on it.
646 setOperationAction(ISD::FADD, MVT::v2f64, Expand);
647 setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
648 setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
649 // FIXME: Code duplication: FDIV and FREM are expanded always, see
650 // ARMTargetLowering::addTypeForNEON method for details.
651 setOperationAction(ISD::FDIV, MVT::v2f64, Expand);
652 setOperationAction(ISD::FREM, MVT::v2f64, Expand);
653 // FIXME: Create unittest.
654 // In other words, find a case where "copysign" appears in the DAG with
655 // vector operands.
657 // FIXME: Code duplication: SETCC has custom operation action, see
658 // ARMTargetLowering::addTypeForNEON method for details.
660 // FIXME: Create unittest for FNEG and for FABS.
661 setOperationAction(ISD::FNEG, MVT::v2f64, Expand);
662 setOperationAction(ISD::FABS, MVT::v2f64, Expand);
664 setOperationAction(ISD::FSIN, MVT::v2f64, Expand);
665 setOperationAction(ISD::FCOS, MVT::v2f64, Expand);
666 setOperationAction(ISD::FTAN, MVT::v2f64, Expand);
667 setOperationAction(ISD::FPOW, MVT::v2f64, Expand);
668 setOperationAction(ISD::FLOG, MVT::v2f64, Expand);
671 setOperationAction(ISD::FEXP, MVT::v2f64, Expand);
680 setOperationAction(ISD::FMA, MVT::v2f64, Expand);
681 }
682
683 if (Subtarget->hasNEON()) {
684 // The same with v4f32. But keep in mind that vadd, vsub, vmul are natively
685 // supported for v4f32.
687 setOperationAction(ISD::FSIN, MVT::v4f32, Expand);
688 setOperationAction(ISD::FCOS, MVT::v4f32, Expand);
689 setOperationAction(ISD::FTAN, MVT::v4f32, Expand);
690 setOperationAction(ISD::FPOW, MVT::v4f32, Expand);
691 setOperationAction(ISD::FLOG, MVT::v4f32, Expand);
694 setOperationAction(ISD::FEXP, MVT::v4f32, Expand);
703
704 // Mark v2f32 intrinsics.
706 setOperationAction(ISD::FSIN, MVT::v2f32, Expand);
707 setOperationAction(ISD::FCOS, MVT::v2f32, Expand);
708 setOperationAction(ISD::FTAN, MVT::v2f32, Expand);
709 setOperationAction(ISD::FPOW, MVT::v2f32, Expand);
710 setOperationAction(ISD::FLOG, MVT::v2f32, Expand);
713 setOperationAction(ISD::FEXP, MVT::v2f32, Expand);
722
725 setOperationAction(Op, MVT::v4f16, Expand);
726 setOperationAction(Op, MVT::v8f16, Expand);
727 }
728
729 // Neon does not support some operations on v1i64 and v2i64 types.
730 setOperationAction(ISD::MUL, MVT::v1i64, Expand);
731 // Custom handling for some quad-vector types to detect VMULL.
732 setOperationAction(ISD::MUL, MVT::v8i16, Custom);
733 setOperationAction(ISD::MUL, MVT::v4i32, Custom);
734 setOperationAction(ISD::MUL, MVT::v2i64, Custom);
735 // Custom handling for some vector types to avoid expensive expansions
736 setOperationAction(ISD::SDIV, MVT::v4i16, Custom);
738 setOperationAction(ISD::UDIV, MVT::v4i16, Custom);
740 // Neon does not have single instruction SINT_TO_FP and UINT_TO_FP with
741 // a destination type that is wider than the source, and nor does
742 // it have a FP_TO_[SU]INT instruction with a narrower destination than
743 // source.
752
755
756 // NEON does not have single instruction CTPOP for vectors with element
757 // types wider than 8-bits. However, custom lowering can leverage the
758 // v8i8/v16i8 vcnt instruction.
765
766 setOperationAction(ISD::CTLZ, MVT::v1i64, Expand);
767 setOperationAction(ISD::CTLZ, MVT::v2i64, Expand);
768
769 // NEON does not have single instruction CTTZ for vectors.
771 setOperationAction(ISD::CTTZ, MVT::v4i16, Custom);
772 setOperationAction(ISD::CTTZ, MVT::v2i32, Custom);
773 setOperationAction(ISD::CTTZ, MVT::v1i64, Custom);
774
775 setOperationAction(ISD::CTTZ, MVT::v16i8, Custom);
776 setOperationAction(ISD::CTTZ, MVT::v8i16, Custom);
777 setOperationAction(ISD::CTTZ, MVT::v4i32, Custom);
778 setOperationAction(ISD::CTTZ, MVT::v2i64, Custom);
779
784
789
793 }
794
795 // NEON only has FMA instructions as of VFP4.
796 if (!Subtarget->hasVFP4Base()) {
797 setOperationAction(ISD::FMA, MVT::v2f32, Expand);
798 setOperationAction(ISD::FMA, MVT::v4f32, Expand);
799 }
800
803
804 // It is legal to extload from v4i8 to v4i16 or v4i32.
805 for (MVT Ty : {MVT::v8i8, MVT::v4i8, MVT::v2i8, MVT::v4i16, MVT::v2i16,
806 MVT::v2i32}) {
811 }
812 }
813
814 for (auto VT : {MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16,
815 MVT::v4i32}) {
820 }
821 }
822
823 if (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) {
830 }
831 if (Subtarget->hasMVEIntegerOps()) {
834 ISD::SETCC});
835 }
836 if (Subtarget->hasMVEFloatOps()) {
838 }
839
840 if (!Subtarget->hasFP64()) {
841 // When targeting a floating-point unit with only single-precision
842 // operations, f64 is legal for the few double-precision instructions which
843 // are present. However, no double-precision operations other than moves,
844 // loads and stores are provided by the hardware.
881 }
882
885
886 if (!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) {
889 if (Subtarget->hasFullFP16()) {
892 }
893 } else {
895 }
896
897 if (!Subtarget->hasFP16()) {
900 } else {
903 }
904
905 computeRegisterProperties(Subtarget->getRegisterInfo());
906
907 // ARM does not have floating-point extending loads.
908 for (MVT VT : MVT::fp_valuetypes()) {
909 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
910 setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
911 setLoadExtAction(ISD::EXTLOAD, VT, MVT::bf16, Expand);
912 }
913
914 // ... or truncating stores
915 setTruncStoreAction(MVT::f64, MVT::f32, Expand);
916 setTruncStoreAction(MVT::f32, MVT::f16, Expand);
917 setTruncStoreAction(MVT::f64, MVT::f16, Expand);
918 setTruncStoreAction(MVT::f32, MVT::bf16, Expand);
919 setTruncStoreAction(MVT::f64, MVT::bf16, Expand);
920
921 // ARM does not have i1 sign extending load.
922 for (MVT VT : MVT::integer_valuetypes())
924
925 // ARM supports all 4 flavors of integer indexed load / store.
926 if (!Subtarget->isThumb1Only()) {
927 for (unsigned im = (unsigned)ISD::PRE_INC;
929 setIndexedLoadAction(im, MVT::i1, Legal);
930 setIndexedLoadAction(im, MVT::i8, Legal);
931 setIndexedLoadAction(im, MVT::i16, Legal);
932 setIndexedLoadAction(im, MVT::i32, Legal);
933 setIndexedStoreAction(im, MVT::i1, Legal);
934 setIndexedStoreAction(im, MVT::i8, Legal);
935 setIndexedStoreAction(im, MVT::i16, Legal);
936 setIndexedStoreAction(im, MVT::i32, Legal);
937 }
938 } else {
939 // Thumb-1 has limited post-inc load/store support - LDM r0!, {r1}.
942 }
943
944 // Custom loads/stores to possibly use __aeabi_uread/write*
945 if (TT.isTargetAEABI() && !Subtarget->allowsUnalignedMem()) {
950 }
951
956
957 if (!Subtarget->isThumb1Only()) {
960 }
961
966 if (Subtarget->hasDSP()) {
975 }
976 if (Subtarget->hasBaseDSP()) {
979 }
980
981 // i64 operation support.
984 if (Subtarget->isThumb1Only()) {
987 }
988 if (Subtarget->isThumb1Only() || !Subtarget->hasV6Ops()
989 || (Subtarget->isThumb2() && !Subtarget->hasDSP()))
991
1001
1002 // MVE lowers 64 bit shifts to lsll and lsrl
1003 // assuming that ISD::SRL and SRA of i64 are already marked custom
1004 if (Subtarget->hasMVEIntegerOps())
1006
1007 // Expand to __aeabi_l{lsl,lsr,asr} calls for Thumb1.
1008 if (Subtarget->isThumb1Only()) {
1012 }
1013
1014 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
1016
1017 // ARM does not have ROTL.
1022 }
1024 // TODO: These two should be set to LibCall, but this currently breaks
1025 // the Linux kernel build. See #101786.
1028 if (!Subtarget->hasV5TOps() || Subtarget->isThumb1Only()) {
1031 }
1032
1033 // @llvm.readcyclecounter requires the Performance Monitors extension.
1034 // Default to the 0 expansion on unsupported platforms.
1035 // FIXME: Technically there are older ARM CPUs that have
1036 // implementation-specific ways of obtaining this information.
1037 if (Subtarget->hasPerfMon())
1039
1040 // Only ARMv6 has BSWAP.
1041 if (!Subtarget->hasV6Ops())
1043
1044 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
1045 : Subtarget->hasDivideInARMMode();
1046 if (!hasDivide) {
1047 // These are expanded into libcalls if the cpu doesn't have HW divider.
1050 }
1051
1052 if (TT.isOSWindows() && !Subtarget->hasDivideInThumbMode()) {
1055
1058 }
1059
1062
1063 // Register based DivRem for AEABI (RTABI 4.2)
1064 if (TT.isTargetAEABI() || TT.isAndroid() || TT.isTargetGNUAEABI() ||
1065 TT.isTargetMuslAEABI() || TT.isOSFuchsia() || TT.isOSWindows()) {
1068 HasStandaloneRem = false;
1069
1074 } else {
1077 }
1078
1083
1084 setOperationAction(ISD::TRAP, MVT::Other, Legal);
1086
1087 // Use the default implementation.
1089 setOperationAction(ISD::VAARG, MVT::Other, Expand);
1091 setOperationAction(ISD::VAEND, MVT::Other, Expand);
1094
1095 if (TT.isOSWindows())
1097 else
1099
1100 // ARMv6 Thumb1 (except for CPUs that support dmb / dsb) and earlier use
1101 // the default expansion.
1102 InsertFencesForAtomic = false;
1103 if (Subtarget->hasAnyDataBarrier() &&
1104 (!Subtarget->isThumb() || Subtarget->hasV8MBaselineOps())) {
1105 // ATOMIC_FENCE needs custom lowering; the others should have been expanded
1106 // to ldrex/strex loops already.
1108 if (!Subtarget->isThumb() || !Subtarget->isMClass())
1110
1111 // On v8, we have particularly efficient implementations of atomic fences
1112 // if they can be combined with nearby atomic loads and stores.
1113 if (!Subtarget->hasAcquireRelease() ||
1114 getTargetMachine().getOptLevel() == CodeGenOptLevel::None) {
1115 // Automatically insert fences (dmb ish) around ATOMIC_SWAP etc.
1116 InsertFencesForAtomic = true;
1117 }
1118 } else {
1119 // If there's anything we can use as a barrier, go through custom lowering
1120 // for ATOMIC_FENCE.
1121 // If target has DMB in thumb, Fences can be inserted.
1122 if (Subtarget->hasDataBarrier())
1123 InsertFencesForAtomic = true;
1124
1126 Subtarget->hasAnyDataBarrier() ? Custom : Expand);
1127
1128 // Set them all for libcall, which will force libcalls.
1141 // Mark ATOMIC_LOAD and ATOMIC_STORE custom so we can handle the
1142 // Unordered/Monotonic case.
1143 if (!InsertFencesForAtomic) {
1146 }
1147 }
1148
1149 // Compute supported atomic widths.
1150 if (TT.isOSLinux() || (!Subtarget->isMClass() && Subtarget->hasV6Ops())) {
1151 // For targets where __sync_* routines are reliably available, we use them
1152 // if necessary.
1153 //
1154 // ARM Linux always supports 64-bit atomics through kernel-assisted atomic
1155 // routines (kernel 3.1 or later). FIXME: Not with compiler-rt?
1156 //
1157 // ARMv6 targets have native instructions in ARM mode. For Thumb mode,
1158 // such targets should provide __sync_* routines, which use the ARM mode
1159 // instructions. (ARMv6 doesn't have dmb, but it has an equivalent
1160 // encoding; see ARMISD::MEMBARRIER_MCR.)
1162 } else if ((Subtarget->isMClass() && Subtarget->hasV8MBaselineOps()) ||
1163 Subtarget->hasForced32BitAtomics()) {
1164 // Cortex-M (besides Cortex-M0) have 32-bit atomics.
1166 } else {
1167 // We can't assume anything about other targets; just use libatomic
1168 // routines.
1170 }
1171
1173
1175
1176 // Requires SXTB/SXTH, available on v6 and up in both ARM and Thumb modes.
1177 if (!Subtarget->hasV6Ops()) {
1180 }
1182
1183 if (!Subtarget->useSoftFloat() && Subtarget->hasFPRegs() &&
1184 !Subtarget->isThumb1Only()) {
1185 // Turn f64->i64 into VMOVRRD, i64 -> f64 to VMOVDRR
1186 // iff target supports vfp2.
1196 }
1197
1198 // We want to custom lower some of our intrinsics.
1203
1213 if (Subtarget->hasFullFP16()) {
1217 }
1218
1220
1223 if (Subtarget->hasFullFP16())
1227 setOperationAction(ISD::BR_JT, MVT::Other, Custom);
1228
1229 // We don't support sin/cos/fmod/copysign/pow
1238 if (!Subtarget->useSoftFloat() && Subtarget->hasVFP2Base() &&
1239 !Subtarget->isThumb1Only()) {
1242 }
1245
1246 if (!Subtarget->hasVFP4Base()) {
1249 }
1250
1251 // Various VFP goodness
1252 if (!Subtarget->useSoftFloat() && !Subtarget->isThumb1Only()) {
1253 // FP-ARMv8 adds f64 <-> f16 conversion. Before that it should be expanded.
1254 if (!Subtarget->hasFPARMv8Base() || !Subtarget->hasFP64()) {
1259 }
1260
1261 // fp16 is a special v7 extension that adds f16 <-> f32 conversions.
1262 if (!Subtarget->hasFP16()) {
1267 }
1268
1269 // Strict floating-point comparisons need custom lowering.
1276 }
1277
1278 // FP-ARMv8 implements a lot of rounding-like FP operations.
1279 if (Subtarget->hasFPARMv8Base()) {
1280 for (auto Op :
1287 setOperationAction(Op, MVT::f32, Legal);
1288
1289 if (Subtarget->hasFP64())
1290 setOperationAction(Op, MVT::f64, Legal);
1291 }
1292
1293 if (Subtarget->hasNEON()) {
1298 }
1299 }
1300
1301 // FP16 often need to be promoted to call lib functions
1302 // clang-format off
1303 if (Subtarget->hasFullFP16()) {
1307
1308 for (auto Op : {ISD::FREM, ISD::FPOW, ISD::FPOWI,
1322 setOperationAction(Op, MVT::f16, Promote);
1323 }
1324
1325 // Round-to-integer need custom lowering for fp16, as Promote doesn't work
1326 // because the result type is integer.
1328 setOperationAction(Op, MVT::f16, Custom);
1329
1335 setOperationAction(Op, MVT::f16, Legal);
1336 }
1337 // clang-format on
1338 }
1339
1340 if (Subtarget->hasNEON()) {
1341 // vmin and vmax aren't available in a scalar form, so we can use
1342 // a NEON instruction with an undef lane instead.
1351
1352 if (Subtarget->hasV8Ops()) {
1357 setOperationAction(Op, MVT::v2f32, Legal);
1358 setOperationAction(Op, MVT::v4f32, Legal);
1359 }
1360 }
1361
1362 if (Subtarget->hasFullFP16()) {
1367
1372
1377 setOperationAction(Op, MVT::v4f16, Legal);
1378 setOperationAction(Op, MVT::v8f16, Legal);
1379 }
1380 }
1381 }
1382
1383 // On MSVC, both 32-bit and 64-bit, ldexpf(f32) is not defined. MinGW has
1384 // it, but it's just a wrapper around ldexp.
1385 if (TT.isOSWindows()) {
1387 if (isOperationExpand(Op, MVT::f32))
1388 setOperationAction(Op, MVT::f32, Promote);
1389 }
1390
1391 // LegalizeDAG currently can't expand fp16 LDEXP/FREXP on targets where i16
1392 // isn't legal.
1394 if (isOperationExpand(Op, MVT::f16))
1395 setOperationAction(Op, MVT::f16, Promote);
1396
1397 // We have target-specific dag combine patterns for the following nodes:
1398 // ARMISD::VMOVRRD - No need to call setTargetDAGCombine
1401
1402 if (Subtarget->hasMVEIntegerOps())
1404
1405 if (Subtarget->hasV6Ops())
1407 if (Subtarget->isThumb1Only())
1409 // Attempt to lower smin/smax to ssat/usat
1410 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) ||
1411 Subtarget->isThumb2()) {
1413 }
1414
1416
1417 if (Subtarget->useSoftFloat() || Subtarget->isThumb1Only() ||
1418 !Subtarget->hasVFP2Base() || Subtarget->hasMinSize())
1420 else
1422
1423 //// temporary - rewrite interface to use type
1426 MaxStoresPerMemcpy = 4; // For @llvm.memcpy -> sequence of stores
1428 MaxStoresPerMemmove = 4; // For @llvm.memmove -> sequence of stores
1430
1431 // On ARM arguments smaller than 4 bytes are extended, so all arguments
1432 // are at least 4 bytes aligned.
1434
1435 // Prefer likely predicted branches to selects on out-of-order cores.
1436 PredictableSelectIsExpensive = Subtarget->getSchedModel().isOutOfOrder();
1437
1438 setPrefLoopAlignment(Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1440 Align(1ULL << Subtarget->getPreferBranchLogAlignment()));
1441
1442 setMinFunctionAlignment(Subtarget->isThumb() ? Align(2) : Align(4));
1443
1444 IsStrictFPEnabled = true;
1445}
1446
1448 return Subtarget->useSoftFloat();
1449}
1450
1452 return !Subtarget->isThumb1Only() && VT.getSizeInBits() <= 32;
1453}
1454
1455// FIXME: It might make sense to define the representative register class as the
1456// nearest super-register that has a non-null superset. For example, DPR_VFP2 is
1457// a super-register of SPR, and DPR is a superset of DPR_VFP2. Consequently,
1458// SPR's representative would be DPR_VFP2. This should work well if register
1459// pressure tracking were modified such that a register use would increment the
1460// pressure of the register class's representative and all of its super
1461// classes' representatives transitively. We have not implemented this because
1462// of the difficulty prior to coalescing of modeling operand register classes
1463// due to the common occurrence of cross class copies and subregister insertions
1464// and extractions.
1465std::pair<const TargetRegisterClass *, uint8_t>
1467 MVT VT) const {
1468 const TargetRegisterClass *RRC = nullptr;
1469 uint8_t Cost = 1;
1470 switch (VT.SimpleTy) {
1471 default:
1473 // Use DPR as representative register class for all floating point
1474 // and vector types. There are 32 SPR registers and 32 DPR registers, so
1475 // the cost is 1 for both f32 and f64.
1476 case MVT::f32: case MVT::f64: case MVT::v8i8: case MVT::v4i16:
1477 case MVT::v2i32: case MVT::v1i64: case MVT::v2f32:
1478 RRC = &ARM::DPRRegClass;
1479 // When NEON is used for SP, only half of the register file is available
1480 // because operations that define both SP and DP results will be constrained
1481 // to the VFP2 class (D0-D15). We currently model this constraint prior to
1482 // coalescing by double-counting the SP regs. See the FIXME above.
1483 if (Subtarget->useNEONForSinglePrecisionFP())
1484 Cost = 2;
1485 break;
1486 case MVT::v16i8: case MVT::v8i16: case MVT::v4i32: case MVT::v2i64:
1487 case MVT::v4f32: case MVT::v2f64:
1488 RRC = &ARM::DPRRegClass;
1489 Cost = 2;
1490 break;
1491 case MVT::v4i64:
1492 RRC = &ARM::DPRRegClass;
1493 Cost = 4;
1494 break;
1495 case MVT::v8i64:
1496 RRC = &ARM::DPRRegClass;
1497 Cost = 8;
1498 break;
1499 }
1500 return std::make_pair(RRC, Cost);
1501}
1502
1504 EVT VT) const {
1505 if (!VT.isVector())
1506 return getPointerTy(DL);
1507
1508 // MVE has a predicate register.
1509 if (Subtarget->hasMVEIntegerOps())
1510 return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
1511
1513}
1514
1515/// getRegClassFor - Return the register class that should be used for the
1516/// specified value type.
1517const TargetRegisterClass *
1518ARMTargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
1519 (void)isDivergent;
1520 // Map v4i64 to QQ registers but do not make the type legal. Similarly map
1521 // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to
1522 // load / store 4 to 8 consecutive NEON D registers, or 2 to 4 consecutive
1523 // MVE Q registers.
1524 if (Subtarget->hasNEON()) {
1525 if (VT == MVT::v4i64)
1526 return &ARM::QQPRRegClass;
1527 if (VT == MVT::v8i64)
1528 return &ARM::QQQQPRRegClass;
1529 }
1530 if (Subtarget->hasMVEIntegerOps()) {
1531 if (VT == MVT::v4i64)
1532 return &ARM::MQQPRRegClass;
1533 if (VT == MVT::v8i64)
1534 return &ARM::MQQQQPRRegClass;
1535 }
1537}
1538
1539// memcpy, and other memory intrinsics, typically tries to use LDM/STM if the
1540// source/dest is aligned and the copy size is large enough. We therefore want
1541// to align such objects passed to memory intrinsics.
1543 Align &PrefAlign) const {
1544 if (!isa<MemIntrinsic>(CI))
1545 return false;
1546 MinSize = 8;
1547 // On ARM11 onwards (excluding M class) 8-byte aligned LDM is typically 1
1548 // cycle faster than 4-byte aligned LDM.
1549 PrefAlign =
1550 (Subtarget->hasV6Ops() && !Subtarget->isMClass() ? Align(8) : Align(4));
1551 return true;
1552}
1553
1554// Create a fast isel object.
1556 FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo,
1557 const LibcallLoweringInfo *libcallLowering) const {
1558 return ARM::createFastISel(funcInfo, libInfo, libcallLowering);
1559}
1560
1562 unsigned NumVals = N->getNumValues();
1563 if (!NumVals)
1564 return Sched::RegPressure;
1565
1566 for (unsigned i = 0; i != NumVals; ++i) {
1567 EVT VT = N->getValueType(i);
1568 if (VT == MVT::Glue || VT == MVT::Other)
1569 continue;
1570 if (VT.isFloatingPoint() || VT.isVector())
1571 return Sched::ILP;
1572 }
1573
1574 if (!N->isMachineOpcode())
1575 return Sched::RegPressure;
1576
1577 // Loads are scheduled for latency even if their instruction itinerary
1578 // is not available.
1579 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
1580 const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
1581
1582 if (MCID.getNumDefs() == 0)
1583 return Sched::RegPressure;
1584 if (!Itins->isEmpty() &&
1585 Itins->getOperandCycle(MCID.getSchedClass(), 0) > 2U)
1586 return Sched::ILP;
1587
1588 return Sched::RegPressure;
1589}
1590
1591//===----------------------------------------------------------------------===//
1592// Lowering Code
1593//===----------------------------------------------------------------------===//
1594
1595static bool isSRL16(const SDValue &Op) {
1596 if (Op.getOpcode() != ISD::SRL)
1597 return false;
1598 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1599 return Const->getZExtValue() == 16;
1600 return false;
1601}
1602
1603static bool isSRA16(const SDValue &Op) {
1604 if (Op.getOpcode() != ISD::SRA)
1605 return false;
1606 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1607 return Const->getZExtValue() == 16;
1608 return false;
1609}
1610
1611static bool isSHL16(const SDValue &Op) {
1612 if (Op.getOpcode() != ISD::SHL)
1613 return false;
1614 if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
1615 return Const->getZExtValue() == 16;
1616 return false;
1617}
1618
1619// Check for a signed 16-bit value. We special case SRA because it makes it
1620// more simple when also looking for SRAs that aren't sign extending a
1621// smaller value. Without the check, we'd need to take extra care with
1622// checking order for some operations.
1623static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
1624 if (isSRA16(Op))
1625 return isSHL16(Op.getOperand(0));
1626 return DAG.ComputeNumSignBits(Op) == 17;
1627}
1628
1629/// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
///
/// The signed orderings map to GT/GE/LT/LE and the unsigned ones to
/// HI/HS/LO/LS; any code not listed below reaches llvm_unreachable.
/// NOTE(review): the signature line of this definition is absent from this
/// listing.
1631 switch (CC) {
1632 default: llvm_unreachable("Unknown condition code!");
1633 case ISD::SETNE: return ARMCC::NE;
1634 case ISD::SETEQ: return ARMCC::EQ;
1635 case ISD::SETGT: return ARMCC::GT;
1636 case ISD::SETGE: return ARMCC::GE;
1637 case ISD::SETLT: return ARMCC::LT;
1638 case ISD::SETLE: return ARMCC::LE;
1639 case ISD::SETUGT: return ARMCC::HI;
1640 case ISD::SETUGE: return ARMCC::HS;
1641 case ISD::SETULT: return ARMCC::LO;
1642 case ISD::SETULE: return ARMCC::LS;
1643 }
1644}
1645
1646/// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
///
/// The primary result is written to CondCode. CondCode2 is first set to
/// ARMCC::AL and is overwritten only for SETONE (GT) and SETUEQ (VS).
/// NOTE(review): presumably callers treat a CondCode2 other than AL as a
/// second condition to test alongside CondCode - confirm against callers.
/// NOTE(review): the first line of the signature is absent from this listing.
1648 ARMCC::CondCodes &CondCode2) {
1649 CondCode2 = ARMCC::AL;
1650 switch (CC) {
1651 default: llvm_unreachable("Unknown FP condition!");
1652 case ISD::SETEQ:
1653 case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
1654 case ISD::SETGT:
1655 case ISD::SETOGT: CondCode = ARMCC::GT; break;
1656 case ISD::SETGE:
1657 case ISD::SETOGE: CondCode = ARMCC::GE; break;
1658 case ISD::SETOLT: CondCode = ARMCC::MI; break;
1659 case ISD::SETOLE: CondCode = ARMCC::LS; break;
1660 case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
1661 case ISD::SETO: CondCode = ARMCC::VC; break;
1662 case ISD::SETUO: CondCode = ARMCC::VS; break;
1663 case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
1664 case ISD::SETUGT: CondCode = ARMCC::HI; break;
1665 case ISD::SETUGE: CondCode = ARMCC::PL; break;
1666 case ISD::SETLT:
1667 case ISD::SETULT: CondCode = ARMCC::LT; break;
1668 case ISD::SETLE:
1669 case ISD::SETULE: CondCode = ARMCC::LE; break;
1670 case ISD::SETNE:
1671 case ISD::SETUNE: CondCode = ARMCC::NE; break;
1672 }
1673}
1674
1675//===----------------------------------------------------------------------===//
1676// Calling Convention Implementation
1677//===----------------------------------------------------------------------===//
1678
1679/// getEffectiveCallingConv - Get the effective calling convention, taking into
1680/// account presence of floating point hardware and calling convention
1681/// limitations, such as support for variadic functions.
///
/// Conventions not handled by the switch end in report_fatal_error.
/// NOTE(review): the embedded line numbering skips in several places, so
/// parts of this definition are absent from this listing - for example the
/// return-type line and the statements that should follow the `!isVarArg)`
/// and `else` lines in the C/Tail and Fast branches.
1683ARMTargetLowering::getEffectiveCallingConv(CallingConv::ID CC,
1684 bool isVarArg) const {
1685 switch (CC) {
1686 default:
1687 report_fatal_error("Unsupported calling convention");
1690 case CallingConv::GHC:
1692 return CC;
1698 case CallingConv::Swift:
1701 case CallingConv::C:
1702 case CallingConv::Tail:
// Without the AAPCS ABI the result is ARM_APCS. Otherwise the choice depends
// on FP registers being available, the subtarget not being Thumb1-only, the
// hard float ABI being selected and the function not being variadic.
1703 if (!getTM().isAAPCS_ABI())
1704 return CallingConv::ARM_APCS;
1705 else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1706 getTargetMachine().Options.FloatABIType == FloatABI::Hard &&
1707 !isVarArg)
1709 else
1711 case CallingConv::Fast:
// Without the AAPCS ABI, Fast is kept only when FP registers are available,
// the subtarget is not Thumb1-only and the function is not variadic;
// otherwise ARM_APCS is used.
1713 if (!getTM().isAAPCS_ABI()) {
1714 if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() && !isVarArg)
1715 return CallingConv::Fast;
1716 return CallingConv::ARM_APCS;
1717 } else if (Subtarget->hasFPRegs() && !Subtarget->isThumb1Only() &&
1718 !isVarArg)
1720 else
1722 }
1723}
1724
// Forwards to CCAssignFnForNode with its `Return` argument set to false.
// NOTE(review): the first line of this definition is absent from this
// listing; presumably this is CCAssignFnForCall - confirm.
1726 bool isVarArg) const {
1727 return CCAssignFnForNode(CC, false, isVarArg);
1728}
1729
// Forwards to CCAssignFnForNode with its `Return` argument set to true.
// NOTE(review): the first line of this definition is absent from this
// listing; presumably this is CCAssignFnForReturn - confirm.
1731 bool isVarArg) const {
1732 return CCAssignFnForNode(CC, true, isVarArg);
1733}
1734
1735/// CCAssignFnForNode - Selects the correct CCAssignFn for the given
1736/// CallingConvention.
///
/// The switch is on the result of getEffectiveCallingConv(CC, isVarArg), not
/// on \p CC itself. In each pair below, \p Return selects the first function
/// when true and the second when false. Unhandled conventions end in
/// report_fatal_error.
/// NOTE(review): several `case` labels are absent from this listing; the
/// return statements numbered 1744, 1746, 1748, 1754, 1756 and 1758 have no
/// visible label.
1737CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
1738 bool Return,
1739 bool isVarArg) const {
1740 switch (getEffectiveCallingConv(CC, isVarArg)) {
1741 default:
1742 report_fatal_error("Unsupported calling convention");
1744 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS);
1746 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1748 return (Return ? RetCC_ARM_AAPCS_VFP : CC_ARM_AAPCS_VFP);
1749 case CallingConv::Fast:
1750 return (Return ? RetFastCC_ARM_APCS : FastCC_ARM_APCS);
1751 case CallingConv::GHC:
1752 return (Return ? RetCC_ARM_APCS : CC_ARM_APCS_GHC);
1754 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1756 return (Return ? RetCC_ARM_AAPCS : CC_ARM_AAPCS);
1758 return (Return ? RetCC_ARM_AAPCS : CC_ARM_Win32_CFGuard_Check);
1759 }
1760}
1761
1762SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
1763 MVT LocVT, MVT ValVT, SDValue Val) const {
1764 Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
1765 Val);
1766 if (Subtarget->hasFullFP16()) {
1767 Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
1768 } else {
1769 Val = DAG.getNode(ISD::TRUNCATE, dl,
1770 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1771 Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
1772 }
1773 return Val;
1774}
1775
1776SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
1777 MVT LocVT, MVT ValVT,
1778 SDValue Val) const {
1779 if (Subtarget->hasFullFP16()) {
1780 Val = DAG.getNode(ARMISD::VMOVrh, dl,
1781 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1782 } else {
1783 Val = DAG.getNode(ISD::BITCAST, dl,
1784 MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
1785 Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
1786 MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
1787 }
1788 return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
1789}
1790
1791/// LowerCallResult - Lower the result values of a call into the
1792/// appropriate copies out of appropriate physical registers.
///
/// One value per result is appended to \p InVals and the updated chain is
/// returned; \p InGlue is threaded through the successive register copies.
/// If \p isThisReturn is set, \p ThisVal is pushed for location 0 instead of
/// a register copy. If \p isCmseNSCall is set, scalar integer results narrower
/// than i32 are additionally passed through handleCMSEValue.
1793SDValue ARMTargetLowering::LowerCallResult(
1794 SDValue Chain, SDValue InGlue, CallingConv::ID CallConv, bool isVarArg,
1795 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
1796 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
1797 SDValue ThisVal, bool isCmseNSCall) const {
1798 // Assign locations to each value returned by this call.
// NOTE(review): RVLocs is used below but its declaration is absent from this
// listing.
1800 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
1801 *DAG.getContext());
1802 CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
1803
1804 // Copy all of the result registers out of their specified physreg.
// The loop index is also advanced inside the body (++i) when one value
// occupies several locations.
1805 for (unsigned i = 0; i != RVLocs.size(); ++i) {
1806 CCValAssign VA = RVLocs[i];
1807
1808 // Pass 'this' value directly from the argument to return value, to avoid
1809 // reg unit interference
1810 if (i == 0 && isThisReturn) {
1811 assert(!VA.needsCustom() && VA.getLocVT() == MVT::i32 &&
1812 "unexpected return calling convention register assignment");
1813 InVals.push_back(ThisVal);
1814 continue;
1815 }
1816
1817 SDValue Val;
1818 if (VA.needsCustom() &&
1819 (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
1820 // Handle f64 or half of a v2f64.
// Two i32 register copies are combined into one f64 with ARMISD::VMOVDRR.
1821 SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1822 InGlue);
1823 Chain = Lo.getValue(1);
1824 InGlue = Lo.getValue(2);
1825 VA = RVLocs[++i]; // skip ahead to next loc
1826 SDValue Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
1827 InGlue);
1828 Chain = Hi.getValue(1);
1829 InGlue = Hi.getValue(2);
// When the subtarget is not little-endian, exchange the two halves before
// combining them.
1830 if (!Subtarget->isLittle())
1831 std::swap (Lo, Hi);
1832 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1833
1834 if (VA.getLocVT() == MVT::v2f64) {
// The f64 built above becomes element 0; a second f64 is built the same way
// from the next two locations and inserted as element 1.
1835 SDValue Vec = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
1836 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1837 DAG.getConstant(0, dl, MVT::i32));
1838
1839 VA = RVLocs[++i]; // skip ahead to next loc
1840 Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1841 Chain = Lo.getValue(1);
1842 InGlue = Lo.getValue(2);
1843 VA = RVLocs[++i]; // skip ahead to next loc
1844 Hi = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32, InGlue);
1845 Chain = Hi.getValue(1);
1846 InGlue = Hi.getValue(2);
1847 if (!Subtarget->isLittle())
1848 std::swap (Lo, Hi);
1849 Val = DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
1850 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Vec, Val,
1851 DAG.getConstant(1, dl, MVT::i32));
1852 }
1853 } else {
// Ordinary case: a single register copy in the location type.
1854 Val = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), VA.getLocVT(),
1855 InGlue);
1856 Chain = Val.getValue(1);
1857 InGlue = Val.getValue(2);
1858 }
1859
// Only Full and BCvt location kinds are handled here; anything else reaches
// llvm_unreachable.
1860 switch (VA.getLocInfo()) {
1861 default: llvm_unreachable("Unknown loc info!");
1862 case CCValAssign::Full: break;
1863 case CCValAssign::BCvt:
1864 Val = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), Val);
1865 break;
1866 }
1867
1868 // f16 arguments have their size extended to 4 bytes and passed as if they
1869 // had been copied to the LSBs of a 32-bit register.
1870 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
1871 if (VA.needsCustom() &&
1872 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
1873 Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
1874
1875 // On CMSE Non-secure Calls, call results (returned values) whose bitwidth
1876 // is less than 32 bits must be sign- or zero-extended after the call for
1877 // security reasons. Although the ABI mandates an extension done by the
1878 // callee, the latter cannot be trusted to follow the rules of the ABI.
1879 const ISD::InputArg &Arg = Ins[VA.getValNo()];
1880 if (isCmseNSCall && Arg.ArgVT.isScalarInteger() &&
1881 VA.getLocVT().isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
1882 Val = handleCMSEValue(Val, Arg, DAG, dl);
1883
1884 InVals.push_back(Val);
1885 }
1886
1887 return Chain;
1888}
1889
// Computes the address, paired with a MachinePointerInfo, at which the
// outgoing argument described by VA is to be stored.
// For a tail call the location's memory offset is adjusted by SPDiff and a
// fixed frame object of the location type's size is created at that offset;
// otherwise the address is StackPtr plus the location's memory offset.
// NOTE(review): the right-hand sides of both `DstInfo =` assignments are
// absent from this listing.
1890std::pair<SDValue, MachinePointerInfo> ARMTargetLowering::computeAddrForCallArg(
1891 const SDLoc &dl, SelectionDAG &DAG, const CCValAssign &VA, SDValue StackPtr,
1892 bool IsTailCall, int SPDiff) const {
1893 SDValue DstAddr;
1894 MachinePointerInfo DstInfo;
1895 int32_t Offset = VA.getLocMemOffset();
1896 MachineFunction &MF = DAG.getMachineFunction();
1897
1898 if (IsTailCall) {
1899 Offset += SPDiff;
1900 auto PtrVT = getPointerTy(DAG.getDataLayout());
// Size of the location type in bytes.
1901 int Size = VA.getLocVT().getFixedSizeInBits() / 8;
1902 int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
1903 DstAddr = DAG.getFrameIndex(FI, PtrVT);
1904 DstInfo =
1906 } else {
1907 SDValue PtrOff = DAG.getIntPtrConstant(Offset, dl);
1908 DstAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
1909 StackPtr, PtrOff);
1910 DstInfo =
1912 }
1913
1914 return std::make_pair(DstAddr, DstInfo);
1915}
1916
1917// Returns the type of copying which is required to set up a byval argument to
1918// a tail-called function. This isn't needed for non-tail calls, because they
1919// always need the equivalent of CopyOnce, but tail-calls sometimes need two to
1920// avoid clobbering another argument (CopyViaTemp), and sometimes can be
1921// optimised to zero copies when forwarding an argument from the caller's
1922// caller (NoCopy).
//
// Src is the address the byval data is read from, Dst the address it must end
// up at, and Flags supplies the byval size.
1923ARMTargetLowering::ByValCopyKind ARMTargetLowering::ByValNeedsCopyForTailCall(
1924 SelectionDAG &DAG, SDValue Src, SDValue Dst, ISD::ArgFlagsTy Flags) const {
1925 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
1926 ARMFunctionInfo *AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
1927
1928 // Globals are always safe to copy from.
// NOTE(review): the condition guarding the following return is absent from
// this listing; as shown, the return would be unconditional.
1930 return CopyOnce;
1931
1932 // Can only analyse frame index nodes, conservatively assume we need a
1933 // temporary.
1934 auto *SrcFrameIdxNode = dyn_cast<FrameIndexSDNode>(Src);
1935 auto *DstFrameIdxNode = dyn_cast<FrameIndexSDNode>(Dst);
1936 if (!SrcFrameIdxNode || !DstFrameIdxNode)
1937 return CopyViaTemp;
1938
1939 int SrcFI = SrcFrameIdxNode->getIndex();
1940 int DstFI = DstFrameIdxNode->getIndex();
1941 assert(MFI.isFixedObjectIndex(DstFI) &&
1942 "byval passed in non-fixed stack slot");
1943
1944 int64_t SrcOffset = MFI.getObjectOffset(SrcFI);
1945 int64_t DstOffset = MFI.getObjectOffset(DstFI);
1946
1947 // If the source is in the local frame, then the copy to the argument memory
1948 // is always valid.
// A fixed source whose offset lies below the negated argument-register save
// size is handled the same way.
1949 bool FixedSrc = MFI.isFixedObjectIndex(SrcFI);
1950 if (!FixedSrc ||
1951 (FixedSrc && SrcOffset < -(int64_t)AFI->getArgRegsSaveSize()))
1952 return CopyOnce;
1953
1954 // In the case of byval arguments split between registers and the stack,
1955 // computeAddrForCallArg returns a FrameIndex which corresponds only to the
1956 // stack portion, but the Src SDValue will refer to the full value, including
1957 // the local stack memory that the register portion gets stored into. We only
1958 // need to compare them for equality, so normalise on the full value version.
1959 uint64_t RegSize = Flags.getByValSize() - MFI.getObjectSize(DstFI);
1960 DstOffset -= RegSize;
1961
1962 // If the value is already in the correct location, then no copying is
1963 // needed. If not, then we need to copy via a temporary.
1964 if (SrcOffset == DstOffset)
1965 return NoCopy;
1966 else
1967 return CopyViaTemp;
1968}
1969
// Splits the f64 value Arg into two i32 halves with ARMISD::VMOVRRD and
// queues them for passing. The half chosen by endianness (result 0 on
// little-endian, result 1 otherwise) goes to VA's register. The other half
// goes to NextVA's register if NextVA is a register location; otherwise it is
// stored to NextVA's stack slot and the store is appended to MemOpChains.
// StackPtr is filled in from ARM::SP on first use if it has no node yet.
// NOTE(review): the continuation line of the getCopyFromReg call below is
// absent from this listing.
1970void ARMTargetLowering::PassF64ArgInRegs(const SDLoc &dl, SelectionDAG &DAG,
1971 SDValue Chain, SDValue &Arg,
1972 RegsToPassVector &RegsToPass,
1973 CCValAssign &VA, CCValAssign &NextVA,
1974 SDValue &StackPtr,
1975 SmallVectorImpl<SDValue> &MemOpChains,
1976 bool IsTailCall,
1977 int SPDiff) const {
1978 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
1979 DAG.getVTList(MVT::i32, MVT::i32), Arg);
// Index of the VMOVRRD result that goes into VA's register.
1980 unsigned id = Subtarget->isLittle() ? 0 : 1;
1981 RegsToPass.push_back(std::make_pair(VA.getLocReg(), fmrrd.getValue(id)));
1982
1983 if (NextVA.isRegLoc())
1984 RegsToPass.push_back(std::make_pair(NextVA.getLocReg(), fmrrd.getValue(1-id)));
1985 else {
1986 assert(NextVA.isMemLoc());
1987 if (!StackPtr.getNode())
1988 StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP,
1990
1991 SDValue DstAddr;
1992 MachinePointerInfo DstInfo;
1993 std::tie(DstAddr, DstInfo) =
1994 computeAddrForCallArg(dl, DAG, NextVA, StackPtr, IsTailCall, SPDiff);
1995 MemOpChains.push_back(
1996 DAG.getStore(Chain, dl, fmrrd.getValue(1 - id), DstAddr, DstInfo));
1997 }
1998}
1999
// Returns true when CC is CallingConv::Fast and GuaranteeTailCalls is set, or
// when a second condition holds.
// NOTE(review): the right-hand operand of `||` is absent from this listing.
2000static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls) {
2001 return (CC == CallingConv::Fast && GuaranteeTailCalls) ||
2003}
2004
2005/// LowerCall - Lowering a call into a callseq_start <-
2006/// ARMISD:CALL <- callseq_end chain. Also add input and output parameter
2007/// nodes.
2008SDValue
2009ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
2010 SmallVectorImpl<SDValue> &InVals) const {
2011 SelectionDAG &DAG = CLI.DAG;
2012 SDLoc &dl = CLI.DL;
2013 SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2014 SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2015 SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2016 SDValue Chain = CLI.Chain;
2017 SDValue Callee = CLI.Callee;
2018 bool &isTailCall = CLI.IsTailCall;
2019 CallingConv::ID CallConv = CLI.CallConv;
2020 bool doesNotRet = CLI.DoesNotReturn;
2021 bool isVarArg = CLI.IsVarArg;
2022 const CallBase *CB = CLI.CB;
2023
2024 MachineFunction &MF = DAG.getMachineFunction();
2025 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2026 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
2027 MachineFunction::CallSiteInfo CSInfo;
2028 bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
2029 bool isThisReturn = false;
2030 bool isCmseNSCall = false;
2031 bool isSibCall = false;
2032 bool PreferIndirect = false;
2033 bool GuardWithBTI = false;
2034
2035 // Analyze operands of the call, assigning locations to each operand.
2037 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
2038 *DAG.getContext());
2039 CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
2040
2041 // Lower 'returns_twice' calls to a pseudo-instruction.
2042 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr(Attribute::ReturnsTwice) &&
2043 !Subtarget->noBTIAtReturnTwice())
2044 GuardWithBTI = AFI->branchTargetEnforcement();
2045
2046 // Set type id for call site info.
2047 setTypeIdForCallsiteInfo(CB, MF, CSInfo);
2048
2049 // Determine whether this is a non-secure function call.
2050 if (CLI.CB && CLI.CB->getAttributes().hasFnAttr("cmse_nonsecure_call"))
2051 isCmseNSCall = true;
2052
2053 // Disable tail calls if they're not supported.
2054 if (!Subtarget->supportsTailCall())
2055 isTailCall = false;
2056
2057 // For both the non-secure calls and the returns from a CMSE entry function,
2058 // the function needs to do some extra work after the call, or before the
2059 // return, respectively, thus it cannot end with a tail call
2060 if (isCmseNSCall || AFI->isCmseNSEntryFunction())
2061 isTailCall = false;
2062
2063 if (isa<GlobalAddressSDNode>(Callee)) {
2064 // If we're optimizing for minimum size and the function is called three or
2065 // more times in this block, we can improve codesize by calling indirectly
2066 // as BLXr has a 16-bit encoding.
2067 auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
2068 if (CLI.CB) {
2069 auto *BB = CLI.CB->getParent();
2070 PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
2071 count_if(GV->users(), [&BB](const User *U) {
2072 return isa<Instruction>(U) &&
2073 cast<Instruction>(U)->getParent() == BB;
2074 }) > 2;
2075 }
2076 }
2077 if (isTailCall) {
2078 // Check if it's really possible to do a tail call.
2079 isTailCall =
2080 IsEligibleForTailCallOptimization(CLI, CCInfo, ArgLocs, PreferIndirect);
2081
2082 if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt &&
2083 CallConv != CallingConv::Tail && CallConv != CallingConv::SwiftTail)
2084 isSibCall = true;
2085
2086 // We don't support GuaranteedTailCallOpt for ARM, only automatically
2087 // detected sibcalls.
2088 if (isTailCall)
2089 ++NumTailCalls;
2090 }
2091
2092 if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
2093 report_fatal_error("failed to perform tail call elimination on a call "
2094 "site marked musttail");
2095
2096 // Get a count of how many bytes are to be pushed on the stack.
2097 unsigned NumBytes = CCInfo.getStackSize();
2098
2099 // SPDiff is the byte offset of the call's argument area from the callee's.
2100 // Stores to callee stack arguments will be placed in FixedStackSlots offset
2101 // by this amount for a tail call. In a sibling call it must be 0 because the
2102 // caller will deallocate the entire stack and the callee still expects its
2103 // arguments to begin at SP+0. Completely unused for non-tail calls.
2104 int SPDiff = 0;
2105
2106 if (isTailCall && !isSibCall) {
2107 auto FuncInfo = MF.getInfo<ARMFunctionInfo>();
2108 unsigned NumReusableBytes = FuncInfo->getArgumentStackSize();
2109
2110 // Since callee will pop argument stack as a tail call, we must keep the
2111 // popped size 16-byte aligned.
2112 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
2113 assert(StackAlign && "data layout string is missing stack alignment");
2114 NumBytes = alignTo(NumBytes, *StackAlign);
2115
2116 // SPDiff will be negative if this tail call requires more space than we
2117 // would automatically have in our incoming argument space. Positive if we
2118 // can actually shrink the stack.
2119 SPDiff = NumReusableBytes - NumBytes;
2120
2121 // If this call requires more stack than we have available from
2122 // LowerFormalArguments, tell FrameLowering to reserve space for it.
2123 if (SPDiff < 0 && AFI->getArgRegsSaveSize() < (unsigned)-SPDiff)
2124 AFI->setArgRegsSaveSize(-SPDiff);
2125 }
2126
2127 if (isSibCall) {
2128 // For sibling tail calls, memory operands are available in our caller's stack.
2129 NumBytes = 0;
2130 } else {
2131 // Adjust the stack pointer for the new arguments...
2132 // These operations are automatically eliminated by the prolog/epilog pass
2133 Chain = DAG.getCALLSEQ_START(Chain, isTailCall ? 0 : NumBytes, 0, dl);
2134 }
2135
2137 DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout()));
2138
2139 RegsToPassVector RegsToPass;
2140 SmallVector<SDValue, 8> MemOpChains;
2141
2142 // If we are doing a tail-call, any byval arguments will be written to stack
2143 // space which was used for incoming arguments. If any the values being used
2144 // are incoming byval arguments to this function, then they might be
2145 // overwritten by the stores of the outgoing arguments. To avoid this, we
2146 // need to make a temporary copy of them in local stack space, then copy back
2147 // to the argument area.
2148 DenseMap<unsigned, SDValue> ByValTemporaries;
2149 SDValue ByValTempChain;
2150 if (isTailCall) {
2151 SmallVector<SDValue, 8> ByValCopyChains;
2152 for (const CCValAssign &VA : ArgLocs) {
2153 unsigned ArgIdx = VA.getValNo();
2154 SDValue Src = OutVals[ArgIdx];
2155 ISD::ArgFlagsTy Flags = Outs[ArgIdx].Flags;
2156
2157 if (!Flags.isByVal())
2158 continue;
2159
2160 SDValue Dst;
2161 MachinePointerInfo DstInfo;
2162 std::tie(Dst, DstInfo) =
2163 computeAddrForCallArg(dl, DAG, VA, SDValue(), true, SPDiff);
2164 ByValCopyKind Copy = ByValNeedsCopyForTailCall(DAG, Src, Dst, Flags);
2165
2166 if (Copy == NoCopy) {
2167 // If the argument is already at the correct offset on the stack
2168 // (because we are forwarding a byval argument from our caller), we
2169 // don't need any copying.
2170 continue;
2171 } else if (Copy == CopyOnce) {
2172 // If the argument is in our local stack frame, no other argument
2173 // preparation can clobber it, so we can copy it to the final location
2174 // later.
2175 ByValTemporaries[ArgIdx] = Src;
2176 } else {
2177 assert(Copy == CopyViaTemp && "unexpected enum value");
2178 // If we might be copying this argument from the outgoing argument
2179 // stack area, we need to copy via a temporary in the local stack
2180 // frame.
2181 int TempFrameIdx = MFI.CreateStackObject(
2182 Flags.getByValSize(), Flags.getNonZeroByValAlign(), false);
2183 SDValue Temp =
2184 DAG.getFrameIndex(TempFrameIdx, getPointerTy(DAG.getDataLayout()));
2185
2186 SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
2187 SDValue AlignNode =
2188 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2189
2190 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2191 SDValue Ops[] = {Chain, Temp, Src, SizeNode, AlignNode};
2192 ByValCopyChains.push_back(
2193 DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs, Ops));
2194 ByValTemporaries[ArgIdx] = Temp;
2195 }
2196 }
2197 if (!ByValCopyChains.empty())
2198 ByValTempChain =
2199 DAG.getNode(ISD::TokenFactor, dl, MVT::Other, ByValCopyChains);
2200 }
2201
2202 // During a tail call, stores to the argument area must happen after all of
2203 // the function's incoming arguments have been loaded because they may alias.
2204 // This is done by folding in a TokenFactor from LowerFormalArguments, but
2205 // there's no point in doing so repeatedly so this tracks whether that's
2206 // happened yet.
2207 bool AfterFormalArgLoads = false;
2208
2209 // Walk the register/memloc assignments, inserting copies/loads. In the case
2210 // of tail call optimization, arguments are handled later.
2211 for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
2212 i != e;
2213 ++i, ++realArgIdx) {
2214 CCValAssign &VA = ArgLocs[i];
2215 SDValue Arg = OutVals[realArgIdx];
2216 ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
2217 bool isByVal = Flags.isByVal();
2218
2219 // Promote the value if needed.
2220 switch (VA.getLocInfo()) {
2221 default: llvm_unreachable("Unknown loc info!");
2222 case CCValAssign::Full: break;
2223 case CCValAssign::SExt:
2224 Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
2225 break;
2226 case CCValAssign::ZExt:
2227 Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
2228 break;
2229 case CCValAssign::AExt:
2230 Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
2231 break;
2232 case CCValAssign::BCvt:
2233 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2234 break;
2235 }
2236
2237 if (isTailCall && VA.isMemLoc() && !AfterFormalArgLoads) {
2238 Chain = DAG.getStackArgumentTokenFactor(Chain);
2239 if (ByValTempChain) {
2240 // In case of large byval copies, re-using the stackframe for tail-calls
2241 // can lead to overwriting incoming arguments on the stack. Force
2242 // loading these stack arguments before the copy to avoid that.
2243 SmallVector<SDValue, 8> IncomingLoad;
2244 for (unsigned I = 0; I < OutVals.size(); ++I) {
2245 if (Outs[I].Flags.isByVal())
2246 continue;
2247
2248 SDValue OutVal = OutVals[I];
2249 LoadSDNode *OutLN = dyn_cast_or_null<LoadSDNode>(OutVal);
2250 if (!OutLN)
2251 continue;
2252
2253 FrameIndexSDNode *FIN =
2255 if (!FIN)
2256 continue;
2257
2258 if (!MFI.isFixedObjectIndex(FIN->getIndex()))
2259 continue;
2260
2261 for (const CCValAssign &VA : ArgLocs) {
2262 if (VA.isMemLoc())
2263 IncomingLoad.push_back(OutVal.getValue(1));
2264 }
2265 }
2266
2267 // Update the chain to force loads for potentially clobbered argument
2268 // loads to happen before the byval copy.
2269 if (!IncomingLoad.empty()) {
2270 IncomingLoad.push_back(Chain);
2271 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, IncomingLoad);
2272 }
2273
2274 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chain,
2275 ByValTempChain);
2276 }
2277 AfterFormalArgLoads = true;
2278 }
2279
2280 // f16 arguments have their size extended to 4 bytes and passed as if they
2281 // had been copied to the LSBs of a 32-bit register.
2282 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
2283 if (VA.needsCustom() &&
2284 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
2285 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2286 } else {
2287 // f16 arguments could have been extended prior to argument lowering.
2288 // Mask them arguments if this is a CMSE nonsecure call.
2289 auto ArgVT = Outs[realArgIdx].ArgVT;
2290 if (isCmseNSCall && (ArgVT == MVT::f16)) {
2291 auto LocBits = VA.getLocVT().getSizeInBits();
2292 auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
2293 SDValue Mask =
2294 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2295 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2296 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2297 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2298 }
2299 }
2300
2301 // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
2302 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
2303 SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2304 DAG.getConstant(0, dl, MVT::i32));
2305 SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
2306 DAG.getConstant(1, dl, MVT::i32));
2307
2308 PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
2309 StackPtr, MemOpChains, isTailCall, SPDiff);
2310
2311 VA = ArgLocs[++i]; // skip ahead to next loc
2312 if (VA.isRegLoc()) {
2313 PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
2314 StackPtr, MemOpChains, isTailCall, SPDiff);
2315 } else {
2316 assert(VA.isMemLoc());
2317 SDValue DstAddr;
2318 MachinePointerInfo DstInfo;
2319 std::tie(DstAddr, DstInfo) =
2320 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2321 MemOpChains.push_back(DAG.getStore(Chain, dl, Op1, DstAddr, DstInfo));
2322 }
2323 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
2324 PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
2325 StackPtr, MemOpChains, isTailCall, SPDiff);
2326 } else if (VA.isRegLoc()) {
2327 if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
2328 Outs[0].VT == MVT::i32) {
2329 assert(VA.getLocVT() == MVT::i32 &&
2330 "unexpected calling convention register assignment");
2331 assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
2332 "unexpected use of 'returned'");
2333 isThisReturn = true;
2334 }
2335 const TargetOptions &Options = DAG.getTarget().Options;
2336 if (Options.EmitCallSiteInfo)
2337 CSInfo.ArgRegPairs.emplace_back(VA.getLocReg(), i);
2338 RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
2339 } else if (isByVal) {
2340 assert(VA.isMemLoc());
2341 unsigned offset = 0;
2342
2343 // True if this byval aggregate will be split between registers
2344 // and memory.
2345 unsigned ByValArgsCount = CCInfo.getInRegsParamsCount();
2346 unsigned CurByValIdx = CCInfo.getInRegsParamsProcessed();
2347
2348 SDValue ByValSrc;
2349 bool NeedsStackCopy;
2350 if (auto It = ByValTemporaries.find(realArgIdx);
2351 It != ByValTemporaries.end()) {
2352 ByValSrc = It->second;
2353 NeedsStackCopy = true;
2354 } else {
2355 ByValSrc = Arg;
2356 NeedsStackCopy = !isTailCall;
2357 }
2358
2359 // If part of the argument is in registers, load them.
2360 if (CurByValIdx < ByValArgsCount) {
2361 unsigned RegBegin, RegEnd;
2362 CCInfo.getInRegsParamInfo(CurByValIdx, RegBegin, RegEnd);
2363
2364 EVT PtrVT = getPointerTy(DAG.getDataLayout());
2365 unsigned int i, j;
2366 for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
2367 SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
2368 SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, Const);
2369 SDValue Load =
2370 DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
2371 DAG.InferPtrAlign(AddArg));
2372 MemOpChains.push_back(Load.getValue(1));
2373 RegsToPass.push_back(std::make_pair(j, Load));
2374 }
2375
2376 // If parameter size outsides register area, "offset" value
2377 // helps us to calculate stack slot for remained part properly.
2378 offset = RegEnd - RegBegin;
2379
2380 CCInfo.nextInRegsParam();
2381 }
2382
2383 // If the memory part of the argument isn't already in the correct place
2384 // (which can happen with tail calls), copy it into the argument area.
2385 if (NeedsStackCopy && Flags.getByValSize() > 4 * offset) {
2386 auto PtrVT = getPointerTy(DAG.getDataLayout());
2387 SDValue Dst;
2388 MachinePointerInfo DstInfo;
2389 std::tie(Dst, DstInfo) =
2390 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2391 SDValue SrcOffset = DAG.getIntPtrConstant(4*offset, dl);
2392 SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, ByValSrc, SrcOffset);
2393 SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
2394 MVT::i32);
2395 SDValue AlignNode =
2396 DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
2397
2398 SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
2399 SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
2400 MemOpChains.push_back(DAG.getNode(ARMISD::COPY_STRUCT_BYVAL, dl, VTs,
2401 Ops));
2402 }
2403 } else {
2404 assert(VA.isMemLoc());
2405 SDValue DstAddr;
2406 MachinePointerInfo DstInfo;
2407 std::tie(DstAddr, DstInfo) =
2408 computeAddrForCallArg(dl, DAG, VA, StackPtr, isTailCall, SPDiff);
2409
2410 SDValue Store = DAG.getStore(Chain, dl, Arg, DstAddr, DstInfo);
2411 MemOpChains.push_back(Store);
2412 }
2413 }
2414
2415 if (!MemOpChains.empty())
2416 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
2417
2418 // Build a sequence of copy-to-reg nodes chained together with token chain
2419 // and flag operands which copy the outgoing args into the appropriate regs.
2420 SDValue InGlue;
2421 for (const auto &[Reg, N] : RegsToPass) {
2422 Chain = DAG.getCopyToReg(Chain, dl, Reg, N, InGlue);
2423 InGlue = Chain.getValue(1);
2424 }
2425
2426 // If the callee is a GlobalAddress/ExternalSymbol node (quite common, every
2427 // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol
2428 // node so that legalize doesn't hack it.
2429 bool isDirect = false;
2430
2431 const TargetMachine &TM = getTargetMachine();
2432 const Triple &TT = TM.getTargetTriple();
2433 const GlobalValue *GVal = nullptr;
2434 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
2435 GVal = G->getGlobal();
2436 bool isStub = !TM.shouldAssumeDSOLocal(GVal) && TT.isOSBinFormatMachO();
2437
2438 bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
2439 bool isLocalARMFunc = false;
2440 auto PtrVt = getPointerTy(DAG.getDataLayout());
2441
2442 if (Subtarget->genLongCalls()) {
2443 assert((!isPositionIndependent() || TT.isOSWindows()) &&
2444 "long-calls codegen is not position independent!");
2445 // Handle a global address or an external symbol. If it's not one of
2446 // those, the target's already in a register, so we don't need to do
2447 // anything extra.
2448 if (isa<GlobalAddressSDNode>(Callee)) {
2449 if (Subtarget->genExecuteOnly()) {
2450 if (Subtarget->useMovt())
2451 ++NumMovwMovt;
2452 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2453 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2454 } else {
2455 // Create a constant pool entry for the callee address
2456 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2457 ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(
2458 GVal, ARMPCLabelIndex, ARMCP::CPValue, 0);
2459
2460 // Get the address of the callee into a register
2461 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2462 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2463 Callee = DAG.getLoad(
2464 PtrVt, dl, DAG.getEntryNode(), Addr,
2466 }
2467 } else if (ExternalSymbolSDNode *S=dyn_cast<ExternalSymbolSDNode>(Callee)) {
2468 const char *Sym = S->getSymbol();
2469
2470 if (Subtarget->genExecuteOnly()) {
2471 if (Subtarget->useMovt())
2472 ++NumMovwMovt;
2473 Callee = DAG.getNode(ARMISD::Wrapper, dl, PtrVt,
2474 DAG.getTargetGlobalAddress(GVal, dl, PtrVt));
2475 } else {
2476 // Create a constant pool entry for the callee address
2477 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2478 ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(
2479 *DAG.getContext(), Sym, ARMPCLabelIndex, 0);
2480
2481 // Get the address of the callee into a register
2482 SDValue Addr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2483 Addr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Addr);
2484 Callee = DAG.getLoad(
2485 PtrVt, dl, DAG.getEntryNode(), Addr,
2487 }
2488 }
2489 } else if (isa<GlobalAddressSDNode>(Callee)) {
2490 if (!PreferIndirect) {
2491 isDirect = true;
2492 bool isDef = GVal->isStrongDefinitionForLinker();
2493
2494 // ARM call to a local ARM function is predicable.
2495 isLocalARMFunc = !Subtarget->isThumb() && (isDef || !ARMInterworking);
2496 // tBX takes a register source operand.
2497 if (isStub && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2498 assert(TT.isOSBinFormatMachO() && "WrapperPIC use on non-MachO?");
2499 Callee = DAG.getNode(
2500 ARMISD::WrapperPIC, dl, PtrVt,
2501 DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, ARMII::MO_NONLAZY));
2502 Callee = DAG.getLoad(
2503 PtrVt, dl, DAG.getEntryNode(), Callee,
2507 } else if (Subtarget->isTargetCOFF()) {
2508 assert(Subtarget->isTargetWindows() &&
2509 "Windows is the only supported COFF target");
2510 unsigned TargetFlags = ARMII::MO_NO_FLAG;
2511 if (GVal->hasDLLImportStorageClass())
2512 TargetFlags = ARMII::MO_DLLIMPORT;
2513 else if (!TM.shouldAssumeDSOLocal(GVal))
2514 TargetFlags = ARMII::MO_COFFSTUB;
2515 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, /*offset=*/0,
2516 TargetFlags);
2517 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
2518 Callee =
2519 DAG.getLoad(PtrVt, dl, DAG.getEntryNode(),
2520 DAG.getNode(ARMISD::Wrapper, dl, PtrVt, Callee),
2522 } else {
2523 Callee = DAG.getTargetGlobalAddress(GVal, dl, PtrVt, 0, 0);
2524 }
2525 }
2526 } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
2527 isDirect = true;
2528 // tBX takes a register source operand.
2529 const char *Sym = S->getSymbol();
2530 if (isARMFunc && Subtarget->isThumb1Only() && !Subtarget->hasV5TOps()) {
2531 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
2532 ARMConstantPoolValue *CPV =
2534 ARMPCLabelIndex, 4);
2535 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
2536 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
2537 Callee = DAG.getLoad(
2538 PtrVt, dl, DAG.getEntryNode(), CPAddr,
2540 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
2541 Callee = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVt, Callee, PICLabel);
2542 } else {
2543 Callee = DAG.getTargetExternalSymbol(Sym, PtrVt, 0);
2544 }
2545 }
2546
2547 if (isCmseNSCall) {
2548 assert(!isARMFunc && !isDirect &&
2549 "Cannot handle call to ARM function or direct call");
2550 if (NumBytes > 0) {
2551 DAG.getContext()->diagnose(
2552 DiagnosticInfoUnsupported(DAG.getMachineFunction().getFunction(),
2553 "call to non-secure function would require "
2554 "passing arguments on stack",
2555 dl.getDebugLoc()));
2556 }
2557 if (isStructRet) {
2558 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2560 "call to non-secure function would return value through pointer",
2561 dl.getDebugLoc()));
2562 }
2563 }
2564
2565 // FIXME: handle tail calls differently.
2566 unsigned CallOpc;
2567 if (Subtarget->isThumb()) {
2568 if (GuardWithBTI)
2569 CallOpc = ARMISD::t2CALL_BTI;
2570 else if (isCmseNSCall)
2571 CallOpc = ARMISD::tSECALL;
2572 else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
2573 CallOpc = ARMISD::CALL_NOLINK;
2574 else
2575 CallOpc = ARMISD::CALL;
2576 } else {
2577 if (!isDirect && !Subtarget->hasV5TOps())
2578 CallOpc = ARMISD::CALL_NOLINK;
2579 else if (doesNotRet && isDirect && Subtarget->hasRetAddrStack() &&
2580 // Emit regular call when code size is the priority
2581 !Subtarget->hasMinSize())
2582 // "mov lr, pc; b _foo" to avoid confusing the RSP
2583 CallOpc = ARMISD::CALL_NOLINK;
2584 else
2585 CallOpc = isLocalARMFunc ? ARMISD::CALL_PRED : ARMISD::CALL;
2586 }
2587
2588 // We don't usually want to end the call-sequence here because we would tidy
2589 // the frame up *after* the call, however in the ABI-changing tail-call case
2590 // we've carefully laid out the parameters so that when sp is reset they'll be
2591 // in the correct location.
2592 if (isTailCall && !isSibCall) {
2593 Chain = DAG.getCALLSEQ_END(Chain, 0, 0, InGlue, dl);
2594 InGlue = Chain.getValue(1);
2595 }
2596
2597 std::vector<SDValue> Ops;
2598 Ops.push_back(Chain);
2599 Ops.push_back(Callee);
2600
2601 if (isTailCall) {
2602 Ops.push_back(DAG.getSignedTargetConstant(SPDiff, dl, MVT::i32));
2603 }
2604
2605 // Add argument registers to the end of the list so that they are known live
2606 // into the call.
2607 for (const auto &[Reg, N] : RegsToPass)
2608 Ops.push_back(DAG.getRegister(Reg, N.getValueType()));
2609
2610 // Add a register mask operand representing the call-preserved registers.
2611 const uint32_t *Mask;
2612 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
2613 if (isThisReturn) {
2614 // For 'this' returns, use the R0-preserving mask if applicable
2615 Mask = ARI->getThisReturnPreservedMask(MF, CallConv);
2616 if (!Mask) {
2617 // Set isThisReturn to false if the calling convention is not one that
2618 // allows 'returned' to be modeled in this way, so LowerCallResult does
2619 // not try to pass 'this' straight through
2620 isThisReturn = false;
2621 Mask = ARI->getCallPreservedMask(MF, CallConv);
2622 }
2623 } else
2624 Mask = ARI->getCallPreservedMask(MF, CallConv);
2625
2626 assert(Mask && "Missing call preserved mask for calling convention");
2627 Ops.push_back(DAG.getRegisterMask(Mask));
2628
2629 if (InGlue.getNode())
2630 Ops.push_back(InGlue);
2631
2632 if (isTailCall) {
2634 SDValue Ret = DAG.getNode(ARMISD::TC_RETURN, dl, MVT::Other, Ops);
2635 if (CLI.CFIType)
2636 Ret.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2637 DAG.addNoMergeSiteInfo(Ret.getNode(), CLI.NoMerge);
2638 DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
2639 return Ret;
2640 }
2641
2642 // Returns a chain and a flag for retval copy to use.
2643 Chain = DAG.getNode(CallOpc, dl, {MVT::Other, MVT::Glue}, Ops);
2644 if (CLI.CFIType)
2645 Chain.getNode()->setCFIType(CLI.CFIType->getZExtValue());
2646 DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
2647 InGlue = Chain.getValue(1);
2648 DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
2649
2650 // If we're guaranteeing tail-calls will be honoured, the callee must
2651 // pop its own argument stack on return. But this call is *not* a tail call so
2652 // we need to undo that after it returns to restore the status-quo.
2653 bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
2654 uint64_t CalleePopBytes =
2655 canGuaranteeTCO(CallConv, TailCallOpt) ? alignTo(NumBytes, 16) : -1U;
2656
2657 Chain = DAG.getCALLSEQ_END(Chain, NumBytes, CalleePopBytes, InGlue, dl);
2658 if (!Ins.empty())
2659 InGlue = Chain.getValue(1);
2660
2661 // Handle result values, copying them out of physregs into vregs that we
2662 // return.
2663 return LowerCallResult(Chain, InGlue, CallConv, isVarArg, Ins, dl, DAG,
2664 InVals, isThisReturn,
2665 isThisReturn ? OutVals[0] : SDValue(), isCmseNSCall);
2666}
2667
2668/// HandleByVal - Every parameter *after* a byval parameter is passed
2669/// on the stack. Remember the next parameter register to allocate,
2670/// and then confiscate the rest of the parameter registers to insure
2671/// this.
2672void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
2673 Align Alignment) const {
2674 // Byval (as with any stack) slots are always at least 4 byte aligned.
2675 Alignment = std::max(Alignment, Align(4));
2676
2677 MCRegister Reg = State->AllocateReg(GPRArgRegs);
2678 if (!Reg)
2679 return;
2680
2681 unsigned AlignInRegs = Alignment.value() / 4;
2682 unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
2683 for (unsigned i = 0; i < Waste; ++i)
2684 Reg = State->AllocateReg(GPRArgRegs);
2685
2686 if (!Reg)
2687 return;
2688
2689 unsigned Excess = 4 * (ARM::R4 - Reg);
2690
2691 // Special case when NSAA != SP and parameter size greater than size of
2692 // all remained GPR regs. In that case we can't split parameter, we must
2693 // send it to stack. We also must set NCRN to R4, so waste all
2694 // remained registers.
2695 const unsigned NSAAOffset = State->getStackSize();
2696 if (NSAAOffset != 0 && Size > Excess) {
2697 while (State->AllocateReg(GPRArgRegs))
2698 ;
2699 return;
2700 }
2701
2702 // First register for byval parameter is the first register that wasn't
2703 // allocated before this method call, so it would be "reg".
2704 // If parameter is small enough to be saved in range [reg, r4), then
2705 // the end (first after last) register would be reg + param-size-in-regs,
2706 // else parameter would be splitted between registers and stack,
2707 // end register would be r4 in this case.
2708 unsigned ByValRegBegin = Reg;
2709 unsigned ByValRegEnd = std::min<unsigned>(Reg + Size / 4, ARM::R4);
2710 State->addInRegsParamInfo(ByValRegBegin, ByValRegEnd);
2711 // Note, first register is allocated in the beginning of function already,
2712 // allocate remained amount of registers we need.
2713 for (unsigned i = Reg + 1; i != ByValRegEnd; ++i)
2714 State->AllocateReg(GPRArgRegs);
2715 // A byval parameter that is split between registers and memory needs its
2716 // size truncated here.
2717 // In the case where the entire structure fits in registers, we set the
2718 // size in memory to zero.
2719 Size = std::max<int>(Size - Excess, 0);
2720}
2721
2722/// IsEligibleForTailCallOptimization - Check whether the call is eligible
2723/// for tail call optimization. Targets which want to do tail call
2724/// optimization should implement this function. Note that this function also
2725/// processes musttail calls, so when this function returns false on a valid
2726/// musttail call, a fatal backend error occurs.
///
/// The visible checks reject the tail call when:
///  * the callee is not a GlobalAddress node (or \p isIndirect is set) and
///    every candidate address register is taken by an argument;
///  * the caller carries the "interrupt" function attribute;
///  * caller and callee disagree on struct-return semantics;
///  * the callee has external weak linkage and the triple condition below
///    holds;
///  * the results are passed incompatibly;
///  * the calling conventions differ and the callee does not preserve every
///    register the caller must preserve;
///  * the call is vararg and the caller has an argument-register save area;
///  * parameters held in callee-saved registers do not match;
///  * the stack arguments need more space than the caller's own argument
///    stack area.
/// For a calling convention accepted by canGuaranteeTCO the answer is simply
/// whether caller and callee use the same calling convention.
///
/// NOTE(review): this listing skips original lines 2728 and 2815. `CLI` and
/// `CCInfo` are used below without a visible declaration (presumably they are
/// parameters declared on the skipped signature line), and the statement that
/// opens the result-compatibility check is not visible either.
2727bool ARMTargetLowering::IsEligibleForTailCallOptimization(
2729 SmallVectorImpl<CCValAssign> &ArgLocs, const bool isIndirect) const {
2730 CallingConv::ID CalleeCC = CLI.CallConv;
2731 SDValue Callee = CLI.Callee;
2732 bool isVarArg = CLI.IsVarArg;
2733 const SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
2734 const SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
2735 const SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
2736 const SelectionDAG &DAG = CLI.DAG;
2737 MachineFunction &MF = DAG.getMachineFunction();
2738 const Function &CallerF = MF.getFunction();
2739 CallingConv::ID CallerCC = CallerF.getCallingConv();
2740
2741 assert(Subtarget->supportsTailCall());
2742
2743 // Indirect tail-calls require a register to hold the target address. That
2744 // register must be:
2745 // * Allocatable (i.e. r0-r7 if the target is Thumb1).
2746 // * Not callee-saved, so must be one of r0-r3 or r12.
2747 // * Not used to hold an argument to the tail-called function, which might be
2748 // in r0-r3.
2749 // * Not used to hold the return address authentication code, which is in r12
2750 // if enabled.
2751 // Sometimes, no register matches all of these conditions, so we can't do a
2752 // tail-call.
2753 if (!isa<GlobalAddressSDNode>(Callee.getNode()) || isIndirect) {
 // Start from r0-r3, add r12 when it is usable, then remove every register
 // that carries an argument; an empty set means no register is left.
2754 SmallSet<MCPhysReg, 5> AddressRegisters = {ARM::R0, ARM::R1, ARM::R2,
2755 ARM::R3};
2756 if (!(Subtarget->isThumb1Only() ||
2757 MF.getInfo<ARMFunctionInfo>()->shouldSignReturnAddress(true)))
2758 AddressRegisters.insert(ARM::R12);
2759 for (const CCValAssign &AL : ArgLocs)
2760 if (AL.isRegLoc())
2761 AddressRegisters.erase(AL.getLocReg());
2762 if (AddressRegisters.empty()) {
2763 LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
2764 return false;
2765 }
2766 }
2767
2768 // Look for obvious safe cases to perform tail call optimization that do not
2769 // require ABI changes. This is what gcc calls sibcall.
2770
2771 // Exception-handling functions need a special set of instructions to indicate
2772 // a return to the hardware. Tail-calling another function would probably
2773 // break this.
2774 if (CallerF.hasFnAttribute("interrupt")) {
2775 LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
2776 return false;
2777 }
2778
 // With a guaranteed-TCO calling convention the decision depends only on
 // caller and callee sharing that convention; no later check is consulted.
2779 if (canGuaranteeTCO(CalleeCC,
2780 getTargetMachine().Options.GuaranteedTailCallOpt)) {
2781 LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
2782 << " (guaranteed tail-call CC)\n");
2783 return CalleeCC == CallerCC;
2784 }
2785
2786 // Also avoid sibcall optimization if either caller or callee uses struct
2787 // return semantics.
2788 bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
2789 bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
2790 if (isCalleeStructRet != isCallerStructRet) {
2791 LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
2792 return false;
2793 }
2794
2795 // Externally-defined functions with weak linkage should not be
2796 // tail-called on ARM when the OS does not support dynamic
2797 // pre-emption of symbols, as the AAELF spec requires normal calls
2798 // to undefined weak functions to be replaced with a NOP or jump to the
2799 // next instruction. The behaviour of branch instructions in this
2800 // situation (as used for tail calls) is implementation-defined, so we
2801 // cannot rely on the linker replacing the tail call with a return.
2802 if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
2803 const GlobalValue *GV = G->getGlobal();
2804 const Triple &TT = getTargetMachine().getTargetTriple();
2805 if (GV->hasExternalWeakLinkage() &&
2806 (!TT.isOSWindows() || TT.isOSBinFormatELF() ||
2807 TT.isOSBinFormatMachO())) {
2808 LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
2809 return false;
2810 }
2811 }
2812
2813 // Check that the call results are passed in the same way.
2814 LLVMContext &C = *DAG.getContext();
 // NOTE(review): original line 2815, which opens the condition whose
 // arguments follow, is missing from this listing.
2816 getEffectiveCallingConv(CalleeCC, isVarArg),
2817 getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
2818 CCAssignFnForReturn(CalleeCC, isVarArg),
2819 CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
2820 LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
2821 return false;
2822 }
2823 // The callee has to preserve all registers the caller needs to preserve.
2824 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
2825 const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
2826 if (CalleeCC != CallerCC) {
2827 const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
2828 if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
2829 LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
2830 return false;
2831 }
2832 }
2833
2834 // If Caller's vararg argument has been split between registers and stack, do
2835 // not perform tail call, since part of the argument is in caller's local
2836 // frame.
2837 const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
2838 if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
2839 LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
2840 return false;
2841 }
2842
2843 // If the callee takes no arguments then go on to check the results of the
2844 // call.
2845 const MachineRegisterInfo &MRI = MF.getRegInfo();
2846 if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
2847 LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
2848 return false;
2849 }
2850
2851 // If the stack arguments for this call do not fit into our own save area then
2852 // the call cannot be made tail.
 // NOTE(review): unlike the other rejections, this one emits no LLVM_DEBUG
 // message.
2853 if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
2854 return false;
2855
2856 LLVM_DEBUG(dbgs() << "true\n");
2857 return true;
2858}
2859
/// Report whether the return values described by `Outs` can be lowered for
/// the given calling convention: a CCState is built over `RVLocs` and the
/// result of CheckReturn with the return-value assignment function for
/// \p CallConv / \p isVarArg is returned.
///
/// NOTE(review): this listing skips original lines 2863 and 2865, so the
/// declarations of `Outs` and `RVLocs` used below are not visible here.
2860bool
2861ARMTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
2862 MachineFunction &MF, bool isVarArg,
2864 LLVMContext &Context, const Type *RetTy) const {
2866 CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
2867 return CCInfo.CheckReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2868}
2869
// Builds the ARMISD::INTRET_GLUE node used to return from an interrupt
// handler. The value of the function's "interrupt" attribute selects the LR
// offset (4 or 0), which is inserted into RetOps as operand #1; any other
// attribute value is a fatal error.
//
// NOTE(review): the first line of this function's signature (original line
// 2870) is missing from this listing, so the return type, the function name
// and the declaration of `RetOps` are not visible here.
2871 const SDLoc &DL, SelectionDAG &DAG) {
2872 const MachineFunction &MF = DAG.getMachineFunction();
2873 const Function &F = MF.getFunction();
2874
2875 StringRef IntKind = F.getFnAttribute("interrupt").getValueAsString();
2876
2877 // See ARM ARM v7 B1.8.3. On exception entry LR is set to a possibly offset
2878 // version of the "preferred return address". These offsets affect the return
2879 // instruction if this is a return from PL1 without hypervisor extensions.
2880 // IRQ/FIQ: +4 "subs pc, lr, #4"
2881 // SWI: 0 "subs pc, lr, #0"
2882 // ABORT: +4 "subs pc, lr, #4"
2883 // UNDEF: +4/+2 "subs pc, lr, #0"
2884 // UNDEF varies depending on where the exception came from ARM or Thumb
2885 // mode. Alongside GCC, we throw our hands up in disgust and pretend it's 0.
2886
 // An empty attribute value is treated like IRQ/FIQ/ABORT (offset 4).
2887 int64_t LROffset;
2888 if (IntKind == "" || IntKind == "IRQ" || IntKind == "FIQ" ||
2889 IntKind == "ABORT")
2890 LROffset = 4;
2891 else if (IntKind == "SWI" || IntKind == "UNDEF")
2892 LROffset = 0;
2893 else
2894 report_fatal_error("Unsupported interrupt attribute. If present, value "
2895 "must be one of: IRQ, FIQ, SWI, ABORT or UNDEF");
2896
 // The offset goes right after operand #0 of the return operand list.
2897 RetOps.insert(RetOps.begin() + 1,
2898 DAG.getConstant(LROffset, DL, MVT::i32, false));
2899
2900 return DAG.getNode(ARMISD::INTRET_GLUE, DL, MVT::Other, RetOps);
2901}
2902
/// Lower a function return. Each return value is assigned to a register with
/// the return-value assignment function for \p CallConv, copied there with a
/// glued CopyToReg, and listed as an operand of the final return node. That
/// node is ARMISD::RET_GLUE, ARMISD::SERET_GLUE for a CMSE non-secure entry
/// function, or the interrupt-return node for a function with the
/// "interrupt" attribute on a non-M-class CPU.
///
/// NOTE(review): this listing skips original lines 2906, 2910, 2920, 2933 and
/// 3050. `Outs`, `RVLocs` and `RetOps` are used below without a visible
/// declaration, and one diagnostic argument and one branch body are missing.
2903SDValue
2904ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
2905 bool isVarArg,
2907 const SmallVectorImpl<SDValue> &OutVals,
2908 const SDLoc &dl, SelectionDAG &DAG) const {
2909 // CCValAssign - represent the assignment of the return value to a location.
2911
2912 // CCState - Info about the registers and stack slots.
2913 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
2914 *DAG.getContext());
2915
2916 // Analyze outgoing return values.
2917 CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
2918
2919 SDValue Glue;
2921 RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
2922 bool isLittleEndian = Subtarget->isLittle();
2923
2924 MachineFunction &MF = DAG.getMachineFunction();
2925 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
2926 AFI->setReturnRegsCount(RVLocs.size());
2927
2928 // Report error if cmse entry function returns structure through first ptr arg.
2929 if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
2930 // Note: using an empty SDLoc(), as the first line of the function is a
2931 // better place to report than the last line.
2932 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
2934 "secure entry function would return value through pointer",
2935 SDLoc().getDebugLoc()));
2936 }
2937
2938 // Copy the result values into the output registers.
 // i walks RVLocs while realRVLocIdx walks OutVals/Outs; i is advanced
 // further inside the body when one value occupies several locations (the
 // f64 and v2f64 cases below).
2939 for (unsigned i = 0, realRVLocIdx = 0;
2940 i != RVLocs.size();
2941 ++i, ++realRVLocIdx) {
2942 CCValAssign &VA = RVLocs[i];
2943 assert(VA.isRegLoc() && "Can only return in registers!");
2944
2945 SDValue Arg = OutVals[realRVLocIdx];
2946 bool ReturnF16 = false;
2947
2948 if (Subtarget->hasFullFP16() && getTM().isTargetHardFloat()) {
2949 // Half-precision return values can be returned like this:
2950 //
2951 // t11 f16 = fadd ...
2952 // t12: i16 = bitcast t11
2953 // t13: i32 = zero_extend t12
2954 // t14: f32 = bitcast t13 <~~~~~~~ Arg
2955 //
2956 // to avoid code generation for bitcasts, we simply set Arg to the node
2957 // that produces the f16 value, t11 in this case.
2958 //
2959 if (Arg.getValueType() == MVT::f32 && Arg.getOpcode() == ISD::BITCAST) {
2960 SDValue ZE = Arg.getOperand(0);
2961 if (ZE.getOpcode() == ISD::ZERO_EXTEND && ZE.getValueType() == MVT::i32) {
2962 SDValue BC = ZE.getOperand(0);
2963 if (BC.getOpcode() == ISD::BITCAST && BC.getValueType() == MVT::i16) {
2964 Arg = BC.getOperand(0);
2965 ReturnF16 = true;
2966 }
2967 }
2968 }
2969 }
2970
 // When the f16 shortcut above was taken, Arg is used as is and the BCvt
 // bitcast to the location type is skipped.
2971 switch (VA.getLocInfo()) {
2972 default: llvm_unreachable("Unknown loc info!");
2973 case CCValAssign::Full: break;
2974 case CCValAssign::BCvt:
2975 if (!ReturnF16)
2976 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2977 break;
2978 }
2979
2980 // Mask f16 arguments if this is a CMSE nonsecure entry.
2981 auto RetVT = Outs[realRVLocIdx].ArgVT;
2982 if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
2983 if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
2984 Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
2985 } else {
 // Keep only the low RetVT-sized bits of the location: bitcast to an
 // integer of the location width, AND with the mask, bitcast back.
2986 auto LocBits = VA.getLocVT().getSizeInBits();
2987 auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
2988 SDValue Mask =
2989 DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
2990 Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
2991 Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
2992 Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
2993 }
2994 }
2995
2996 if (VA.needsCustom() &&
2997 (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
2998 if (VA.getLocVT() == MVT::v2f64) {
2999 // Extract the first half and return it in two registers.
3000 SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3001 DAG.getConstant(0, dl, MVT::i32));
3002 SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
3003 DAG.getVTList(MVT::i32, MVT::i32), Half);
3004
 // Endianness decides which VMOVRRD result goes to the first register.
3005 Chain =
3006 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3007 HalfGPRs.getValue(isLittleEndian ? 0 : 1), Glue);
3008 Glue = Chain.getValue(1);
3009 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
 // NOTE(review): VA is a reference bound to RVLocs[i] at the top of the
 // loop body, so this assignment (and the ones like it below) copies the
 // next entry over that element instead of rebinding VA. Later code in
 // this iteration only reads VA; confirm nothing reads RVLocs afterwards.
3010 VA = RVLocs[++i]; // skip ahead to next loc
3011 Chain =
3012 DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3013 HalfGPRs.getValue(isLittleEndian ? 1 : 0), Glue);
3014 Glue = Chain.getValue(1);
3015 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3016 VA = RVLocs[++i]; // skip ahead to next loc
3017
3018 // Extract the 2nd half and fall through to handle it as an f64 value.
3019 Arg = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
3020 DAG.getConstant(1, dl, MVT::i32));
3021 }
3022 // Legalize ret f64 -> ret 2 x i32. We always have fmrrd if f64 is
3023 // available.
3024 SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
3025 DAG.getVTList(MVT::i32, MVT::i32), Arg);
3026 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3027 fmrrd.getValue(isLittleEndian ? 0 : 1), Glue);
3028 Glue = Chain.getValue(1);
3029 RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
3030 VA = RVLocs[++i]; // skip ahead to next loc
3031 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
3032 fmrrd.getValue(isLittleEndian ? 1 : 0), Glue);
3033 } else
3034 Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Glue);
3035
3036 // Guarantee that all emitted copies are
3037 // stuck together, avoiding something bad.
3038 Glue = Chain.getValue(1);
3039 RetOps.push_back(DAG.getRegister(
3040 VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
3041 }
 // Registers reported by getCalleeSavedRegsViaCopy are also added to the
 // return operands; the loop stops at the list's zero terminator.
3042 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
3043 const MCPhysReg *I =
3044 TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
3045 if (I) {
3046 for (; *I; ++I) {
3047 if (ARM::GPRRegClass.contains(*I))
3048 RetOps.push_back(DAG.getRegister(*I, MVT::i32));
3049 else if (ARM::DPRRegClass.contains(*I))
 // NOTE(review): the statement for the DPR case (original line 3050) is
 // missing from this listing.
3051 else
3052 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
3053 }
3054 }
3055
3056 // Update chain and glue.
3057 RetOps[0] = Chain;
3058 if (Glue.getNode())
3059 RetOps.push_back(Glue);
3060
3061 // CPUs which aren't M-class use a special sequence to return from
3062 // exceptions (roughly, any instruction setting pc and cpsr simultaneously,
3063 // though we use "subs pc, lr, #N").
3064 //
3065 // M-class CPUs actually use a normal return sequence with a special
3066 // (hardware-provided) value in LR, so the normal code path works.
3067 if (DAG.getMachineFunction().getFunction().hasFnAttribute("interrupt") &&
3068 !Subtarget->isMClass()) {
3069 if (Subtarget->isThumb1Only())
3070 report_fatal_error("interrupt attribute is not supported in Thumb1");
3071 return LowerInterruptReturn(RetOps, dl, DAG);
3072 }
3073
3074 unsigned RetNode =
3075 AFI->isCmseNSEntryFunction() ? ARMISD::SERET_GLUE : ARMISD::RET_GLUE;
3076 return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
3077}
3078
3079bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
3080 if (N->getNumValues() != 1)
3081 return false;
3082 if (!N->hasNUsesOfValue(1, 0))
3083 return false;
3084
3085 SDValue TCChain = Chain;
3086 SDNode *Copy = *N->user_begin();
3087 if (Copy->getOpcode() == ISD::CopyToReg) {
3088 // If the copy has a glue operand, we conservatively assume it isn't safe to
3089 // perform a tail call.
3090 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3091 return false;
3092 TCChain = Copy->getOperand(0);
3093 } else if (Copy->getOpcode() == ARMISD::VMOVRRD) {
3094 SDNode *VMov = Copy;
3095 // f64 returned in a pair of GPRs.
3096 SmallPtrSet<SDNode*, 2> Copies;
3097 for (SDNode *U : VMov->users()) {
3098 if (U->getOpcode() != ISD::CopyToReg)
3099 return false;
3100 Copies.insert(U);
3101 }
3102 if (Copies.size() > 2)
3103 return false;
3104
3105 for (SDNode *U : VMov->users()) {
3106 SDValue UseChain = U->getOperand(0);
3107 if (Copies.count(UseChain.getNode()))
3108 // Second CopyToReg
3109 Copy = U;
3110 else {
3111 // We are at the top of this chain.
3112 // If the copy has a glue operand, we conservatively assume it
3113 // isn't safe to perform a tail call.
3114 if (U->getOperand(U->getNumOperands() - 1).getValueType() == MVT::Glue)
3115 return false;
3116 // First CopyToReg
3117 TCChain = UseChain;
3118 }
3119 }
3120 } else if (Copy->getOpcode() == ISD::BITCAST) {
3121 // f32 returned in a single GPR.
3122 if (!Copy->hasOneUse())
3123 return false;
3124 Copy = *Copy->user_begin();
3125 if (Copy->getOpcode() != ISD::CopyToReg || !Copy->hasNUsesOfValue(1, 0))
3126 return false;
3127 // If the copy has a glue operand, we conservatively assume it isn't safe to
3128 // perform a tail call.
3129 if (Copy->getOperand(Copy->getNumOperands()-1).getValueType() == MVT::Glue)
3130 return false;
3131 TCChain = Copy->getOperand(0);
3132 } else {
3133 return false;
3134 }
3135
3136 bool HasRet = false;
3137 for (const SDNode *U : Copy->users()) {
3138 if (U->getOpcode() != ARMISD::RET_GLUE &&
3139 U->getOpcode() != ARMISD::INTRET_GLUE)
3140 return false;
3141 HasRet = true;
3142 }
3143
3144 if (!HasRet)
3145 return false;
3146
3147 Chain = TCChain;
3148 return true;
3149}
3150
3151bool ARMTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
3152 if (!Subtarget->supportsTailCall())
3153 return false;
3154
3155 if (!CI->isTailCall())
3156 return false;
3157
3158 return true;
3159}
3160
3161// Trying to write a 64 bit value so need to split into two 32 bit values first,
3162// and pass the lower and high parts through.
//
// Operand 2 of Op is the i64 value to write. It is split into two i32 halves
// and a new ISD::WRITE_REGISTER node is built from operands 0 and 1 of Op
// followed by the low and the high half.
//
// NOTE(review): the signature line of this function (original line 3163) is
// missing from this listing, so the declarations of `Op` and `DAG` are not
// visible here.
3164 SDLoc DL(Op);
3165 SDValue WriteValue = Op->getOperand(2);
3166
3167 // This function is only supposed to be called for i64 type argument.
3168 assert(WriteValue.getValueType() == MVT::i64
3169 && "LowerWRITE_REGISTER called for non-i64 type argument.");
3170
3171 SDValue Lo, Hi;
3172 std::tie(Lo, Hi) = DAG.SplitScalar(WriteValue, DL, MVT::i32, MVT::i32);
3173 SDValue Ops[] = { Op->getOperand(0), Op->getOperand(1), Lo, Hi };
3174 return DAG.getNode(ISD::WRITE_REGISTER, DL, MVT::Other, Ops);
3175}
3176
3177// ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
3178// their target counterpart wrapped in the ARMISD::Wrapper node. Suppose N is
3179// one of the above mentioned nodes. It has to be wrapped because otherwise
3180// Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
3181// be used to form addressing mode. These wrapped nodes will be selected
3182// into MOVi.
//
// LowerConstantPool returns the target constant-pool node wrapped in
// ARMISD::Wrapper, except for execute-only code, where the constant becomes
// an internal global variable and the result of LowerGlobalAddress is
// returned instead.
//
// NOTE(review): original line 3214 is missing from this listing; it holds the
// `if` whose two branches (machine constant-pool value vs. plain constant)
// are visible near the end of the function.
3183SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
3184 SelectionDAG &DAG) const {
3185 EVT PtrVT = Op.getValueType();
3186 // FIXME there is no actual debug info here
3187 SDLoc dl(Op);
3188 ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(Op);
3189 SDValue Res;
3190
3191 // When generating execute-only code Constant Pools must be promoted to the
3192 // global data section. It's a bit ugly that we can't share them across basic
3193 // blocks, but this way we guarantee that execute-only behaves correct with
3194 // position-independent addressing modes.
3195 if (Subtarget->genExecuteOnly()) {
3196 auto AFI = DAG.getMachineFunction().getInfo<ARMFunctionInfo>();
3197 auto *T = CP->getType();
3198 auto C = const_cast<Constant*>(CP->getConstVal());
3199 auto M = DAG.getMachineFunction().getFunction().getParent();
 // The variable's name is built from the internal symbol prefix, "CP", the
 // function number, "_" and a fresh PIC label id.
3200 auto GV = new GlobalVariable(
3201 *M, T, /*isConstant=*/true, GlobalVariable::InternalLinkage, C,
3202 Twine(DAG.getDataLayout().getInternalSymbolPrefix()) + "CP" +
3203 Twine(DAG.getMachineFunction().getFunctionNumber()) + "_" +
3204 Twine(AFI->createPICLabelUId()));
3205 SDValue GA = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3206 return LowerGlobalAddress(GA, DAG);
3207 }
3208
3209 // The 16-bit ADR instruction can only encode offsets that are multiples of 4,
3210 // so we need to align to at least 4 bytes when we don't have 32-bit ADR.
3211 Align CPAlign = CP->getAlign();
3212 if (Subtarget->isThumb1Only())
3213 CPAlign = std::max(CPAlign, Align(4));
3215 Res =
3216 DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CPAlign);
3217 else
3218 Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CPAlign);
3219 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
3220}
3221
 // NOTE(review): this listing skips original lines 3222 and 3227-3228, so the
 // signature of this function and the statements that follow the condition
 // below are not visible here.
3223 // If we don't have a 32-bit pc-relative branch instruction then the jump
3224 // table consists of block addresses. Usually this is inline, but for
3225 // execute-only it must be placed out-of-line.
3226 if (Subtarget->genExecuteOnly() && !Subtarget->hasV8MBaselineOps())
3229}
3230
/// Lower a block address by loading it from a constant-pool entry. Without
/// position independence the loaded value is returned directly. With
/// position independence (or ROPI) the entry is created with a fresh PIC
/// label and a PC adjustment, and the loaded value is combined with that
/// label through ARMISD::PIC_ADD.
///
/// NOTE(review): this listing skips original lines 3233-3234, 3246 and 3254,
/// so the declarations of `AFI` and `CPV` and the last argument of the
/// getLoad call are not visible here.
3231SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
3232 SelectionDAG &DAG) const {
3235 unsigned ARMPCLabelIndex = 0;
3236 SDLoc DL(Op);
3237 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3238 const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
3239 SDValue CPAddr;
3240 bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
3241 if (!IsPositionIndependent) {
3242 CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
3243 } else {
 // PC adjustment passed to the constant-pool entry: 4 for Thumb, else 8.
3244 unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
3245 ARMPCLabelIndex = AFI->createPICLabelUId();
3247 ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
3248 ARMCP::CPBlockAddress, PCAdj);
3249 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3250 }
3251 CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
3252 SDValue Result = DAG.getLoad(
3253 PtrVT, DL, DAG.getEntryNode(), CPAddr,
3255 if (!IsPositionIndependent)
3256 return Result;
3257 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, DL, MVT::i32);
3258 return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel);
3259}
3260
3261/// Convert a TLS address reference into the correct sequence of loads
3262/// and calls to compute the variable's address for Darwin, and return an
3263/// SDValue containing the final node.
3264
3265/// Darwin only has one TLS scheme which must be capable of dealing with the
3266/// fully general situation, in the worst case. This means:
3267/// + "extern __thread" declaration.
3268/// + Defined in a possibly unknown dynamic library.
3269///
3270/// The general system is that each __thread variable has a [3 x i32] descriptor
3271/// which contains information used by the runtime to calculate the address. The
3272/// only part of this the compiler needs to know about is the first word, which
3273/// contains a function pointer that must be called with the address of the
3274/// entire descriptor in "r0".
3275///
3276/// Since this descriptor may be in a different unit, in general access must
3277/// proceed along the usual ARM rules. A common sequence to produce is:
3278///
3279/// movw rT1, :lower16:_var$non_lazy_ptr
3280/// movt rT1, :upper16:_var$non_lazy_ptr
3281/// ldr r0, [rT1]
3282/// ldr rT2, [r0]
3283/// blx rT2
3284/// [...address now in r0...]
3285SDValue
3286ARMTargetLowering::LowerGlobalTLSAddressDarwin(SDValue Op,
3287 SelectionDAG &DAG) const {
3288 assert(getTargetMachine().getTargetTriple().isOSDarwin() &&
3289 "This function expects a Darwin target");
3290 SDLoc DL(Op);
3291
3292 // First step is to get the address of the actua global symbol. This is where
3293 // the TLS descriptor lives.
3294 SDValue DescAddr = LowerGlobalAddressDarwin(Op, DAG);
3295
3296 // The first entry in the descriptor is a function pointer that we must call
3297 // to obtain the address of the variable.
3298 SDValue Chain = DAG.getEntryNode();
3299 SDValue FuncTLVGet = DAG.getLoad(
3300 MVT::i32, DL, Chain, DescAddr,
3304 Chain = FuncTLVGet.getValue(1);
3305
3306 MachineFunction &F = DAG.getMachineFunction();
3307 MachineFrameInfo &MFI = F.getFrameInfo();
3308 MFI.setAdjustsStack(true);
3309
3310 // TLS calls preserve all registers except those that absolutely must be
3311 // trashed: R0 (it takes an argument), LR (it's a call) and CPSR (let's not be
3312 // silly).
3313 auto TRI =
3315 auto ARI = static_cast<const ARMRegisterInfo *>(TRI);
3316 const uint32_t *Mask = ARI->getTLSCallPreservedMask(DAG.getMachineFunction());
3317
3318 // Finally, we can make the call. This is just a degenerate version of a
3319 // normal AArch64 call node: r0 takes the address of the descriptor, and
3320 // returns the address of the variable in this thread.
3321 Chain = DAG.getCopyToReg(Chain, DL, ARM::R0, DescAddr, SDValue());
3322 Chain =
3323 DAG.getNode(ARMISD::CALL, DL, DAG.getVTList(MVT::Other, MVT::Glue),
3324 Chain, FuncTLVGet, DAG.getRegister(ARM::R0, MVT::i32),
3325 DAG.getRegisterMask(Mask), Chain.getValue(1));
3326 return DAG.getCopyFromReg(Chain, DL, ARM::R0, MVT::i32, Chain.getValue(1));
3327}
3328
3329SDValue
3330ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
3331 SelectionDAG &DAG) const {
3332 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
3333 "Windows specific TLS lowering");
3334
3335 SDValue Chain = DAG.getEntryNode();
3336 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3337 SDLoc DL(Op);
3338
3339 // Load the current TEB (thread environment block)
3340 SDValue Ops[] = {Chain,
3341 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
3342 DAG.getTargetConstant(15, DL, MVT::i32),
3343 DAG.getTargetConstant(0, DL, MVT::i32),
3344 DAG.getTargetConstant(13, DL, MVT::i32),
3345 DAG.getTargetConstant(0, DL, MVT::i32),
3346 DAG.getTargetConstant(2, DL, MVT::i32)};
3347 SDValue CurrentTEB = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
3348 DAG.getVTList(MVT::i32, MVT::Other), Ops);
3349
3350 SDValue TEB = CurrentTEB.getValue(0);
3351 Chain = CurrentTEB.getValue(1);
3352
3353 // Load the ThreadLocalStoragePointer from the TEB
3354 // A pointer to the TLS array is located at offset 0x2c from the TEB.
3355 SDValue TLSArray =
3356 DAG.getNode(ISD::ADD, DL, PtrVT, TEB, DAG.getIntPtrConstant(0x2c, DL));
3357 TLSArray = DAG.getLoad(PtrVT, DL, Chain, TLSArray, MachinePointerInfo());
3358
3359 // The pointer to the thread's TLS data area is at the TLS Index scaled by 4
3360 // offset into the TLSArray.
3361
3362 // Load the TLS index from the C runtime
3363 SDValue TLSIndex =
3364 DAG.getTargetExternalSymbol("_tls_index", PtrVT, ARMII::MO_NO_FLAG);
3365 TLSIndex = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, TLSIndex);
3366 TLSIndex = DAG.getLoad(PtrVT, DL, Chain, TLSIndex, MachinePointerInfo());
3367
3368 SDValue Slot = DAG.getNode(ISD::SHL, DL, PtrVT, TLSIndex,
3369 DAG.getConstant(2, DL, MVT::i32));
3370 SDValue TLS = DAG.getLoad(PtrVT, DL, Chain,
3371 DAG.getNode(ISD::ADD, DL, PtrVT, TLSArray, Slot),
3372 MachinePointerInfo());
3373
3374 // Get the offset of the start of the .tls section (section base)
3375 const auto *GA = cast<GlobalAddressSDNode>(Op);
3376 auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
3377 SDValue Offset = DAG.getLoad(
3378 PtrVT, DL, Chain,
3379 DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
3380 DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
3382
3383 return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
3384}
3385
3386// Lower ISD::GlobalTLSAddress using the "general dynamic" model
3387SDValue
3388ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
3389 SelectionDAG &DAG) const {
3390 SDLoc dl(GA);
3391 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3392 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3393 MachineFunction &MF = DAG.getMachineFunction();
3394 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3395 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3396 ARMConstantPoolValue *CPV =
3397 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3398 ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
3399 SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3400 Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
3401 Argument = DAG.getLoad(
3402 PtrVT, dl, DAG.getEntryNode(), Argument,
3404 SDValue Chain = Argument.getValue(1);
3405
3406 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3407 Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel);
3408
3409 // call __tls_get_addr.
3411 Args.emplace_back(Argument, Type::getInt32Ty(*DAG.getContext()));
3412
3413 // FIXME: is there useful debug info available here?
3414 TargetLowering::CallLoweringInfo CLI(DAG);
3415 CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
3417 DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
3418
3419 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
3420 return CallResult.first;
3421}
3422
3423// Lower ISD::GlobalTLSAddress using the "initial exec" or
3424// "local exec" model.
3425SDValue
3426ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
3427 SelectionDAG &DAG,
3428 TLSModel::Model model) const {
3429 const GlobalValue *GV = GA->getGlobal();
3430 SDLoc dl(GA);
3432 SDValue Chain = DAG.getEntryNode();
3433 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3434 // Get the Thread Pointer
3435 SDValue ThreadPointer = DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3436
3437 if (model == TLSModel::InitialExec) {
3438 MachineFunction &MF = DAG.getMachineFunction();
3439 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3440 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3441 // Initial exec model.
3442 unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8;
3443 ARMConstantPoolValue *CPV =
3444 ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
3446 true);
3447 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3448 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3449 Offset = DAG.getLoad(
3450 PtrVT, dl, Chain, Offset,
3452 Chain = Offset.getValue(1);
3453
3454 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3455 Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel);
3456
3457 Offset = DAG.getLoad(
3458 PtrVT, dl, Chain, Offset,
3460 } else {
3461 // local exec model
3462 assert(model == TLSModel::LocalExec);
3463 ARMConstantPoolValue *CPV =
3465 Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3466 Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
3467 Offset = DAG.getLoad(
3468 PtrVT, dl, Chain, Offset,
3470 }
3471
3472 // The address of the thread local variable is the add of the thread
3473 // pointer with the offset of the variable.
3474 return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, Offset);
3475}
3476
3477SDValue
3478ARMTargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
3479 GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
3480 if (DAG.getTarget().useEmulatedTLS())
3481 return LowerToTLSEmulatedModel(GA, DAG);
3482
3483 const Triple &TT = getTargetMachine().getTargetTriple();
3484 if (TT.isOSDarwin())
3485 return LowerGlobalTLSAddressDarwin(Op, DAG);
3486
3487 if (TT.isOSWindows())
3488 return LowerGlobalTLSAddressWindows(Op, DAG);
3489
3490 // TODO: implement the "local dynamic" model
3491 assert(TT.isOSBinFormatELF() && "Only ELF implemented here");
3493
3494 switch (model) {
3497 return LowerToTLSGeneralDynamicModel(GA, DAG);
3500 return LowerToTLSExecModels(GA, DAG, model);
3501 }
3502 llvm_unreachable("bogus TLS model");
3503}
3504
3505/// Return true if all users of V are within function F, looking through
3506/// ConstantExprs.
3507static bool allUsersAreInFunction(const Value *V, const Function *F) {
3508 SmallVector<const User*,4> Worklist(V->users());
3509 while (!Worklist.empty()) {
3510 auto *U = Worklist.pop_back_val();
3511 if (isa<ConstantExpr>(U)) {
3512 append_range(Worklist, U->users());
3513 continue;
3514 }
3515
3516 auto *I = dyn_cast<Instruction>(U);
3517 if (!I || I->getParent()->getParent() != F)
3518 return false;
3519 }
3520 return true;
3521}
3522
3524 const GlobalValue *GV, SelectionDAG &DAG,
3525 EVT PtrVT, const SDLoc &dl) {
3526 // If we're creating a pool entry for a constant global with unnamed address,
3527 // and the global is small enough, we can emit it inline into the constant pool
3528 // to save ourselves an indirection.
3529 //
3530 // This is a win if the constant is only used in one function (so it doesn't
3531 // need to be duplicated) or duplicating the constant wouldn't increase code
3532 // size (implying the constant is no larger than 4 bytes).
3533 const Function &F = DAG.getMachineFunction().getFunction();
3534
3535 // We rely on this decision to inline being idempotent and unrelated to the
3536 // use-site. We know that if we inline a variable at one use site, we'll
3537 // inline it elsewhere too (and reuse the constant pool entry). Fast-isel
3538 // doesn't know about this optimization, so bail out if it's enabled else
3539 // we could decide to inline here (and thus never emit the GV) but require
3540 // the GV from fast-isel generated code.
3543 return SDValue();
3544
3545 auto *GVar = dyn_cast<GlobalVariable>(GV);
3546 if (!GVar || !GVar->hasInitializer() ||
3547 !GVar->isConstant() || !GVar->hasGlobalUnnamedAddr() ||
3548 !GVar->hasLocalLinkage())
3549 return SDValue();
3550
3551 // If we inline a value that contains relocations, we move the relocations
3552 // from .data to .text. This is not allowed in position-independent code.
3553 auto *Init = GVar->getInitializer();
3554 if ((TLI->isPositionIndependent() || TLI->getSubtarget()->isROPI()) &&
3555 Init->needsDynamicRelocation())
3556 return SDValue();
3557
3558 // The constant islands pass can only really deal with alignment requests
3559 // <= 4 bytes and cannot pad constants itself. Therefore we cannot promote
3560 // any type wanting greater alignment requirements than 4 bytes. We also
3561 // can only promote constants that are multiples of 4 bytes in size or
3562 // are paddable to a multiple of 4. Currently we only try and pad constants
3563 // that are strings for simplicity.
3564 auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
3565 unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
3566 Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
3567 unsigned RequiredPadding = 4 - (Size % 4);
3568 bool PaddingPossible =
3569 RequiredPadding == 4 || (CDAInit && CDAInit->isString());
3570 if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
3571 Size == 0)
3572 return SDValue();
3573
3574 unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
3576 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3577
3578 // We can't bloat the constant pool too much, else the ConstantIslands pass
3579 // may fail to converge. If we haven't promoted this global yet (it may have
3580 // multiple uses), and promoting it would increase the constant pool size (Sz
3581 // > 4), ensure we have space to do so up to MaxTotal.
3582 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar) && Size > 4)
3583 if (AFI->getPromotedConstpoolIncrease() + PaddedSize - 4 >=
3585 return SDValue();
3586
3587 // This is only valid if all users are in a single function; we can't clone
3588 // the constant in general. The LLVM IR unnamed_addr allows merging
3589 // constants, but not cloning them.
3590 //
3591 // We could potentially allow cloning if we could prove all uses of the
3592 // constant in the current function don't care about the address, like
3593 // printf format strings. But that isn't implemented for now.
3594 if (!allUsersAreInFunction(GVar, &F))
3595 return SDValue();
3596
3597 // We're going to inline this global. Pad it out if needed.
3598 if (RequiredPadding != 4) {
3599 StringRef S = CDAInit->getAsString();
3600
3602 std::copy(S.bytes_begin(), S.bytes_end(), V.begin());
3603 while (RequiredPadding--)
3604 V.push_back(0);
3606 }
3607
3608 auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
3609 SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
3610 if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
3613 PaddedSize - 4);
3614 }
3615 ++NumConstpoolPromoted;
3616 return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3617}
3618
3620 if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
3621 if (!(GV = GA->getAliaseeObject()))
3622 return false;
3623 if (const auto *V = dyn_cast<GlobalVariable>(GV))
3624 return V->isConstant();
3625 return isa<Function>(GV);
3626}
3627
3628SDValue ARMTargetLowering::LowerGlobalAddress(SDValue Op,
3629 SelectionDAG &DAG) const {
3630 switch (Subtarget->getTargetTriple().getObjectFormat()) {
3631 default: llvm_unreachable("unknown object format");
3632 case Triple::COFF:
3633 return LowerGlobalAddressWindows(Op, DAG);
3634 case Triple::ELF:
3635 return LowerGlobalAddressELF(Op, DAG);
3636 case Triple::MachO:
3637 return LowerGlobalAddressDarwin(Op, DAG);
3638 }
3639}
3640
3641SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
3642 SelectionDAG &DAG) const {
3643 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3644 SDLoc dl(Op);
3645 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3646 bool IsRO = isReadOnly(GV);
3647
3648 // promoteToConstantPool only if not generating XO text section
3649 if (GV->isDSOLocal() && !Subtarget->genExecuteOnly())
3650 if (SDValue V = promoteToConstantPool(this, GV, DAG, PtrVT, dl))
3651 return V;
3652
3653 if (isPositionIndependent()) {
3654 // Weak symbols need GOT indirection even when hidden/DSO-local.
3655 // The assembler eagerly resolves PC-relative expressions when the
3656 // symbol and reference are in the same section, which prevents the
3657 // linker from overriding a weak definition with a non-weak one.
3658 bool UseGOT = !GV->isDSOLocal() || GV->isWeakForLinker();
3659 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0,
3660 UseGOT ? ARMII::MO_GOT : 0);
3661 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3662 if (UseGOT)
3663 Result =
3664 DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3666 return Result;
3667 } else if (Subtarget->isROPI() && IsRO) {
3668 // PC-relative.
3669 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT);
3670 SDValue Result = DAG.getNode(ARMISD::WrapperPIC, dl, PtrVT, G);
3671 return Result;
3672 } else if (Subtarget->isRWPI() && !IsRO) {
3673 // SB-relative.
3674 SDValue RelAddr;
3675 if (Subtarget->useMovt()) {
3676 ++NumMovwMovt;
3677 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
3678 RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
3679 } else { // use literal pool for address constant
3680 ARMConstantPoolValue *CPV =
3682 SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3683 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3684 RelAddr = DAG.getLoad(
3685 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3687 }
3688 SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
3689 SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
3690 return Result;
3691 }
3692
3693 // If we have T2 ops, we can materialize the address directly via movt/movw
3694 // pair. This is always cheaper. If need to generate Execute Only code, and we
3695 // only have Thumb1 available, we can't use a constant pool and are forced to
3696 // use immediate relocations.
3697 if (Subtarget->useMovt() || Subtarget->genExecuteOnly()) {
3698 if (Subtarget->useMovt())
3699 ++NumMovwMovt;
3700 // FIXME: Once remat is capable of dealing with instructions with register
3701 // operands, expand this into two nodes.
3702 return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
3703 DAG.getTargetGlobalAddress(GV, dl, PtrVT));
3704 } else {
3705 SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
3706 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3707 return DAG.getLoad(
3708 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3710 }
3711}
3712
3713SDValue ARMTargetLowering::LowerGlobalAddressDarwin(SDValue Op,
3714 SelectionDAG &DAG) const {
3715 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3716 "ROPI/RWPI not currently supported for Darwin");
3717 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3718 SDLoc dl(Op);
3719 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3720
3721 if (Subtarget->useMovt())
3722 ++NumMovwMovt;
3723
3724 // FIXME: Once remat is capable of dealing with instructions with register
3725 // operands, expand this into multiple nodes
3726 unsigned Wrapper =
3727 isPositionIndependent() ? ARMISD::WrapperPIC : ARMISD::Wrapper;
3728
3729 SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_NONLAZY);
3730 SDValue Result = DAG.getNode(Wrapper, dl, PtrVT, G);
3731
3732 if (Subtarget->isGVIndirectSymbol(GV))
3733 Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Result,
3735 return Result;
3736}
3737
3738SDValue ARMTargetLowering::LowerGlobalAddressWindows(SDValue Op,
3739 SelectionDAG &DAG) const {
3740 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
3741 "non-Windows COFF is not supported");
3742 assert(Subtarget->useMovt() &&
3743 "Windows on ARM expects to use movw/movt");
3744 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
3745 "ROPI/RWPI not currently supported for Windows");
3746
3747 const TargetMachine &TM = getTargetMachine();
3748 const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
3749 ARMII::TOF TargetFlags = ARMII::MO_NO_FLAG;
3750 if (GV->hasDLLImportStorageClass())
3751 TargetFlags = ARMII::MO_DLLIMPORT;
3752 else if (!TM.shouldAssumeDSOLocal(GV))
3753 TargetFlags = ARMII::MO_COFFSTUB;
3754 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3756 SDLoc DL(Op);
3757
3758 ++NumMovwMovt;
3759
3760 // FIXME: Once remat is capable of dealing with instructions with register
3761 // operands, expand this into two nodes.
3762 Result = DAG.getNode(ARMISD::Wrapper, DL, PtrVT,
3763 DAG.getTargetGlobalAddress(GV, DL, PtrVT, /*offset=*/0,
3764 TargetFlags));
3765 if (TargetFlags & (ARMII::MO_DLLIMPORT | ARMII::MO_COFFSTUB))
3766 Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
3768 return Result;
3769}
3770
3771SDValue
3772ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const {
3773 SDLoc dl(Op);
3774 SDValue Val = DAG.getConstant(0, dl, MVT::i32);
3775 return DAG.getNode(ARMISD::EH_SJLJ_SETJMP, dl,
3776 DAG.getVTList(MVT::i32, MVT::Other), Op.getOperand(0),
3777 Op.getOperand(1), Val);
3778}
3779
3780SDValue
3781ARMTargetLowering::LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const {
3782 SDLoc dl(Op);
3783 return DAG.getNode(ARMISD::EH_SJLJ_LONGJMP, dl, MVT::Other, Op.getOperand(0),
3784 Op.getOperand(1), DAG.getConstant(0, dl, MVT::i32));
3785}
3786
3787SDValue ARMTargetLowering::LowerEH_SJLJ_SETUP_DISPATCH(SDValue Op,
3788 SelectionDAG &DAG) const {
3789 SDLoc dl(Op);
3790 return DAG.getNode(ARMISD::EH_SJLJ_SETUP_DISPATCH, dl, MVT::Other,
3791 Op.getOperand(0));
3792}
3793
3794SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
3795 SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const {
3796 unsigned IntNo =
3797 Op.getConstantOperandVal(Op.getOperand(0).getValueType() == MVT::Other);
3798 switch (IntNo) {
3799 default:
3800 return SDValue(); // Don't custom lower most intrinsics.
3801 case Intrinsic::arm_gnu_eabi_mcount: {
3802 MachineFunction &MF = DAG.getMachineFunction();
3803 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3804 SDLoc dl(Op);
3805 SDValue Chain = Op.getOperand(0);
3806 // call "\01__gnu_mcount_nc"
3807 const ARMBaseRegisterInfo *ARI = Subtarget->getRegisterInfo();
3808 const uint32_t *Mask =
3810 assert(Mask && "Missing call preserved mask for calling convention");
3811 // Mark LR an implicit live-in.
3812 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
3813 SDValue ReturnAddress =
3814 DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
3815 constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
3816 SDValue Callee =
3817 DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
3819 if (Subtarget->isThumb())
3820 return SDValue(
3821 DAG.getMachineNode(
3822 ARM::tBL_PUSHLR, dl, ResultTys,
3823 {ReturnAddress, DAG.getTargetConstant(ARMCC::AL, dl, PtrVT),
3824 DAG.getRegister(0, PtrVT), Callee, RegisterMask, Chain}),
3825 0);
3826 return SDValue(
3827 DAG.getMachineNode(ARM::BL_PUSHLR, dl, ResultTys,
3828 {ReturnAddress, Callee, RegisterMask, Chain}),
3829 0);
3830 }
3831 }
3832}
3833
3834SDValue
3835ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
3836 const ARMSubtarget *Subtarget) const {
3837 unsigned IntNo = Op.getConstantOperandVal(0);
3838 SDLoc dl(Op);
3839 switch (IntNo) {
3840 default: return SDValue(); // Don't custom lower most intrinsics.
3841 case Intrinsic::localaddress: {
3842 const MachineFunction &MF = DAG.getMachineFunction();
3843 const auto *RegInfo = Subtarget->getRegisterInfo();
3844 unsigned Reg = RegInfo->getLocalAddressRegister(MF);
3845 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg,
3846 Op.getSimpleValueType());
3847 }
3848 case Intrinsic::eh_recoverfp: {
3849 SDValue FnOp = Op.getOperand(1);
3850 GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(FnOp);
3851 auto *Fn = dyn_cast_or_null<Function>(GSD ? GSD->getGlobal() : nullptr);
3852 if (!Fn)
3854 "llvm.eh.recoverfp must take a function as the first argument");
3855 const auto *RegInfo = Subtarget->getRegisterInfo();
3856 Register BaseReg = RegInfo->getBaseRegister();
3857 MachineFunction &MF = DAG.getMachineFunction();
3858 MachineBasicBlock &MBB = *MF.begin();
3859 if (!MBB.isLiveIn(BaseReg))
3860 MBB.addLiveIn(BaseReg);
3861 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3862 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, BaseReg, PtrVT);
3863 }
3864 case Intrinsic::thread_pointer: {
3865 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3866 return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT);
3867 }
3868 case Intrinsic::arm_cls: {
3869 // Note: arm_cls and arm_cls64 intrinsics are expanded directly here
3870 // in LowerINTRINSIC_WO_CHAIN since there's no native scalar CLS
3871 // instruction.
3872 const SDValue &Operand = Op.getOperand(1);
3873 const EVT VTy = Op.getValueType();
3874 return DAG.getNode(ISD::CTLS, dl, VTy, Operand);
3875 }
3876 case Intrinsic::arm_cls64: {
3877 // arm_cls64 returns i32 but takes i64 input.
3878 // Use ISD::CTLS for i64 and truncate the result.
3879 SDValue CTLS64 = DAG.getNode(ISD::CTLS, dl, MVT::i64, Op.getOperand(1));
3880 return DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, CTLS64);
3881 }
3882 case Intrinsic::arm_neon_vcls:
3883 case Intrinsic::arm_mve_vcls: {
3884 // Lower vector CLS intrinsics to ISD::CTLS.
3885 // Vector CTLS is Legal when NEON/MVE is available (set elsewhere).
3886 const EVT VTy = Op.getValueType();
3887 return DAG.getNode(ISD::CTLS, dl, VTy, Op.getOperand(1));
3888 }
3889 case Intrinsic::eh_sjlj_lsda: {
3890 MachineFunction &MF = DAG.getMachineFunction();
3891 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
3892 unsigned ARMPCLabelIndex = AFI->createPICLabelUId();
3893 EVT PtrVT = getPointerTy(DAG.getDataLayout());
3894 SDValue CPAddr;
3895 bool IsPositionIndependent = isPositionIndependent();
3896 unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
3897 ARMConstantPoolValue *CPV =
3898 ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
3899 ARMCP::CPLSDA, PCAdj);
3900 CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
3901 CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
3902 SDValue Result = DAG.getLoad(
3903 PtrVT, dl, DAG.getEntryNode(), CPAddr,
3905
3906 if (IsPositionIndependent) {
3907 SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, dl, MVT::i32);
3908 Result = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel);
3909 }
3910 return Result;
3911 }
3912 case Intrinsic::arm_neon_vabs:
3913 return DAG.getNode(ISD::ABS, SDLoc(Op), Op.getValueType(),
3914 Op.getOperand(1));
3915 case Intrinsic::arm_neon_vabds:
3916 if (Op.getValueType().isInteger())
3917 return DAG.getNode(ISD::ABDS, SDLoc(Op), Op.getValueType(),
3918 Op.getOperand(1), Op.getOperand(2));
3919 return SDValue();
3920 case Intrinsic::arm_neon_vabdu:
3921 return DAG.getNode(ISD::ABDU, SDLoc(Op), Op.getValueType(),
3922 Op.getOperand(1), Op.getOperand(2));
3923 case Intrinsic::arm_neon_vmulls:
3924 case Intrinsic::arm_neon_vmullu: {
3925 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmulls)
3926 ? ARMISD::VMULLs : ARMISD::VMULLu;
3927 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3928 Op.getOperand(1), Op.getOperand(2));
3929 }
3930 case Intrinsic::arm_neon_vminnm:
3931 case Intrinsic::arm_neon_vmaxnm: {
3932 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminnm)
3933 ? ISD::FMINNUM : ISD::FMAXNUM;
3934 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3935 Op.getOperand(1), Op.getOperand(2));
3936 }
3937 case Intrinsic::arm_neon_vminu:
3938 case Intrinsic::arm_neon_vmaxu: {
3939 if (Op.getValueType().isFloatingPoint())
3940 return SDValue();
3941 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vminu)
3942 ? ISD::UMIN : ISD::UMAX;
3943 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3944 Op.getOperand(1), Op.getOperand(2));
3945 }
3946 case Intrinsic::arm_neon_vmins:
3947 case Intrinsic::arm_neon_vmaxs: {
3948 // v{min,max}s is overloaded between signed integers and floats.
3949 if (!Op.getValueType().isFloatingPoint()) {
3950 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3951 ? ISD::SMIN : ISD::SMAX;
3952 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3953 Op.getOperand(1), Op.getOperand(2));
3954 }
3955 unsigned NewOpc = (IntNo == Intrinsic::arm_neon_vmins)
3956 ? ISD::FMINIMUM : ISD::FMAXIMUM;
3957 return DAG.getNode(NewOpc, SDLoc(Op), Op.getValueType(),
3958 Op.getOperand(1), Op.getOperand(2));
3959 }
3960 case Intrinsic::arm_neon_vtbl1:
3961 return DAG.getNode(ARMISD::VTBL1, SDLoc(Op), Op.getValueType(),
3962 Op.getOperand(1), Op.getOperand(2));
3963 case Intrinsic::arm_neon_vtbl2:
3964 return DAG.getNode(ARMISD::VTBL2, SDLoc(Op), Op.getValueType(),
3965 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3966 case Intrinsic::arm_mve_pred_i2v:
3967 case Intrinsic::arm_mve_pred_v2i:
3968 return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
3969 Op.getOperand(1));
3970 case Intrinsic::arm_mve_vreinterpretq:
3971 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
3972 Op.getOperand(1));
3973 case Intrinsic::arm_mve_lsll:
3974 return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
3975 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3976 case Intrinsic::arm_mve_asrl:
3977 return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
3978 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3979 case Intrinsic::arm_mve_vsli:
3980 return DAG.getNode(ARMISD::VSLIIMM, SDLoc(Op), Op->getVTList(),
3981 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3982 case Intrinsic::arm_mve_vsri:
3983 return DAG.getNode(ARMISD::VSRIIMM, SDLoc(Op), Op->getVTList(),
3984 Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
3985 }
3986}
3987
3989 const ARMSubtarget *Subtarget) {
3990 SDLoc dl(Op);
3991 auto SSID = static_cast<SyncScope::ID>(Op.getConstantOperandVal(2));
3992 if (SSID == SyncScope::SingleThread)
3993 return Op;
3994
3995 if (!Subtarget->hasDataBarrier()) {
3996 // Some ARMv6 cpus can support data barriers with an mcr instruction.
3997 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
3998 // here.
3999 assert(Subtarget->hasV6Ops() && !Subtarget->isThumb() &&
4000 "Unexpected ISD::ATOMIC_FENCE encountered. Should be libcall!");
4001 return DAG.getNode(ARMISD::MEMBARRIER_MCR, dl, MVT::Other, Op.getOperand(0),
4002 DAG.getConstant(0, dl, MVT::i32));
4003 }
4004
4005 AtomicOrdering Ord =
4006 static_cast<AtomicOrdering>(Op.getConstantOperandVal(1));
4008 if (Subtarget->isMClass()) {
4009 // Only a full system barrier exists in the M-class architectures.
4011 } else if (Subtarget->preferISHSTBarriers() &&
4012 Ord == AtomicOrdering::Release) {
4013 // Swift happens to implement ISHST barriers in a way that's compatible with
4014 // Release semantics but weaker than ISH so we'd be fools not to use
4015 // it. Beware: other processors probably don't!
4017 }
4018
4019 return DAG.getNode(ISD::INTRINSIC_VOID, dl, MVT::Other, Op.getOperand(0),
4020 DAG.getConstant(Intrinsic::arm_dmb, dl, MVT::i32),
4021 DAG.getConstant(Domain, dl, MVT::i32));
4022}
4023
4025 const ARMSubtarget *Subtarget) {
4026 // ARM pre v5TE and Thumb1 does not have preload instructions.
4027 if (!(Subtarget->isThumb2() ||
4028 (!Subtarget->isThumb1Only() && Subtarget->hasV5TEOps())))
4029 // Just preserve the chain.
4030 return Op.getOperand(0);
4031
4032 SDLoc dl(Op);
4033 unsigned isRead = ~Op.getConstantOperandVal(2) & 1;
4034 if (!isRead &&
4035 (!Subtarget->hasV7Ops() || !Subtarget->hasMPExtension()))
4036 // ARMv7 with MP extension has PLDW.
4037 return Op.getOperand(0);
4038
4039 unsigned isData = Op.getConstantOperandVal(4);
4040 if (Subtarget->isThumb()) {
4041 // Invert the bits.
4042 isRead = ~isRead & 1;
4043 isData = ~isData & 1;
4044 }
4045
4046 return DAG.getNode(ARMISD::PRELOAD, dl, MVT::Other, Op.getOperand(0),
4047 Op.getOperand(1), DAG.getConstant(isRead, dl, MVT::i32),
4048 DAG.getConstant(isData, dl, MVT::i32));
4049}
4050
4053 ARMFunctionInfo *FuncInfo = MF.getInfo<ARMFunctionInfo>();
4054
4055 // vastart just stores the address of the VarArgsFrameIndex slot into the
4056 // memory location argument.
4057 SDLoc dl(Op);
4059 SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
4060 const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
4061 return DAG.getStore(Op.getOperand(0), dl, FR, Op.getOperand(1),
4062 MachinePointerInfo(SV));
4063}
4064
4065SDValue ARMTargetLowering::GetF64FormalArgument(CCValAssign &VA,
4066 CCValAssign &NextVA,
4067 SDValue &Root,
4068 SelectionDAG &DAG,
4069 const SDLoc &dl) const {
4070 MachineFunction &MF = DAG.getMachineFunction();
4071 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4072
4073 const TargetRegisterClass *RC;
4074 if (AFI->isThumb1OnlyFunction())
4075 RC = &ARM::tGPRRegClass;
4076 else
4077 RC = &ARM::GPRRegClass;
4078
4079 // Transform the arguments stored in physical registers into virtual ones.
4080 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4081 SDValue ArgValue = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4082
4083 SDValue ArgValue2;
4084 if (NextVA.isMemLoc()) {
4085 MachineFrameInfo &MFI = MF.getFrameInfo();
4086 int FI = MFI.CreateFixedObject(4, NextVA.getLocMemOffset(), true);
4087
4088 // Create load node to retrieve arguments from the stack.
4089 SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
4090 ArgValue2 = DAG.getLoad(
4091 MVT::i32, dl, Root, FIN,
4093 } else {
4094 Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
4095 ArgValue2 = DAG.getCopyFromReg(Root, dl, Reg, MVT::i32);
4096 }
4097 if (!Subtarget->isLittle())
4098 std::swap (ArgValue, ArgValue2);
4099 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, ArgValue, ArgValue2);
4100}
4101
4102// The remaining GPRs hold either the beginning of variable-argument
4103// data, or the beginning of an aggregate passed by value (usually
4104// byval). Either way, we allocate stack slots adjacent to the data
4105// provided by our caller, and store the unallocated registers there.
4106// If this is a variadic function, the va_list pointer will begin with
4107// these values; otherwise, this reassembles a (byval) structure that
4108// was split between registers and memory.
4109// Return: The frame index registers were stored into.
4110int ARMTargetLowering::StoreByValRegs(CCState &CCInfo, SelectionDAG &DAG,
4111 const SDLoc &dl, SDValue &Chain,
4112 const Value *OrigArg,
4113 unsigned InRegsParamRecordIdx,
4114 int ArgOffset, unsigned ArgSize) const {
4115 // Currently, two use-cases possible:
4116 // Case #1. Non-var-args function, and we meet first byval parameter.
4117 // Setup first unallocated register as first byval register;
4118 // eat all remained registers
4119 // (these two actions are performed by HandleByVal method).
4120 // Then, here, we initialize stack frame with
4121 // "store-reg" instructions.
4122 // Case #2. Var-args function, that doesn't contain byval parameters.
4123 // The same: eat all remained unallocated registers,
4124 // initialize stack frame.
4125
4126 MachineFunction &MF = DAG.getMachineFunction();
4127 MachineFrameInfo &MFI = MF.getFrameInfo();
4128 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4129 unsigned RBegin, REnd;
4130 if (InRegsParamRecordIdx < CCInfo.getInRegsParamsCount()) {
4131 CCInfo.getInRegsParamInfo(InRegsParamRecordIdx, RBegin, REnd);
4132 } else {
4133 unsigned RBeginIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4134 RBegin = RBeginIdx == 4 ? (unsigned)ARM::R4 : GPRArgRegs[RBeginIdx];
4135 REnd = ARM::R4;
4136 }
4137
4138 if (REnd != RBegin)
4139 ArgOffset = -4 * (ARM::R4 - RBegin);
4140
4141 auto PtrVT = getPointerTy(DAG.getDataLayout());
4142 int FrameIndex = MFI.CreateFixedObject(ArgSize, ArgOffset, false);
4143 SDValue FIN = DAG.getFrameIndex(FrameIndex, PtrVT);
4144
4146 const TargetRegisterClass *RC =
4147 AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
4148
4149 for (unsigned Reg = RBegin, i = 0; Reg < REnd; ++Reg, ++i) {
4150 Register VReg = MF.addLiveIn(Reg, RC);
4151 SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, MVT::i32);
4152 SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
4153 MachinePointerInfo(OrigArg, 4 * i));
4154 MemOps.push_back(Store);
4155 FIN = DAG.getNode(ISD::ADD, dl, PtrVT, FIN, DAG.getConstant(4, dl, PtrVT));
4156 }
4157
4158 if (!MemOps.empty())
4159 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
4160 return FrameIndex;
4161}
4162
4163// Setup stack frame, the va_list pointer will start from.
4164void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
4165 const SDLoc &dl, SDValue &Chain,
4166 unsigned ArgOffset,
4167 unsigned TotalArgRegsSaveSize,
4168 bool ForceMutable) const {
4169 MachineFunction &MF = DAG.getMachineFunction();
4170 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4171
4172 // Try to store any remaining integer argument regs
4173 // to their spots on the stack so that they may be loaded by dereferencing
4174 // the result of va_next.
4175 // If there is no regs to be stored, just point address after last
4176 // argument passed via stack.
4177 int FrameIndex = StoreByValRegs(
4178 CCInfo, DAG, dl, Chain, nullptr, CCInfo.getInRegsParamsCount(),
4179 CCInfo.getStackSize(), std::max(4U, TotalArgRegsSaveSize));
4180 AFI->setVarArgsFrameIndex(FrameIndex);
4181}
4182
4183bool ARMTargetLowering::splitValueIntoRegisterParts(
4184 SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
4185 unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC) const {
4186 EVT ValueVT = Val.getValueType();
4187 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4188 unsigned ValueBits = ValueVT.getSizeInBits();
4189 unsigned PartBits = PartVT.getSizeInBits();
4190 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
4191 Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
4192 Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
4193 Parts[0] = Val;
4194 return true;
4195 }
4196 return false;
4197}
4198
4199SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
4200 SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
4201 MVT PartVT, EVT ValueVT, std::optional<CallingConv::ID> CC) const {
4202 if ((ValueVT == MVT::f16 || ValueVT == MVT::bf16) && PartVT == MVT::f32) {
4203 unsigned ValueBits = ValueVT.getSizeInBits();
4204 unsigned PartBits = PartVT.getSizeInBits();
4205 SDValue Val = Parts[0];
4206
4207 Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
4208 Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
4209 Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
4210 return Val;
4211 }
4212 return SDValue();
4213}
4214
4215SDValue ARMTargetLowering::LowerFormalArguments(
4216 SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
4217 const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
4218 SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
4219 MachineFunction &MF = DAG.getMachineFunction();
4220 MachineFrameInfo &MFI = MF.getFrameInfo();
4221
4222 ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
4223
4224 // Assign locations to all of the incoming arguments.
4226 CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
4227 *DAG.getContext());
4228 CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
4229
4231 unsigned CurArgIdx = 0;
4232
4233 // Initially ArgRegsSaveSize is zero.
4234 // Then we increase this value each time we meet byval parameter.
4235 // We also increase this value in case of varargs function.
4236 AFI->setArgRegsSaveSize(0);
4237
4238 // Calculate the amount of stack space that we need to allocate to store
4239 // byval and variadic arguments that are passed in registers.
4240 // We need to know this before we allocate the first byval or variadic
4241 // argument, as they will be allocated a stack slot below the CFA (Canonical
4242 // Frame Address, the stack pointer at entry to the function).
4243 unsigned ArgRegBegin = ARM::R4;
4244 for (const CCValAssign &VA : ArgLocs) {
4245 if (CCInfo.getInRegsParamsProcessed() >= CCInfo.getInRegsParamsCount())
4246 break;
4247
4248 unsigned Index = VA.getValNo();
4249 ISD::ArgFlagsTy Flags = Ins[Index].Flags;
4250 if (!Flags.isByVal())
4251 continue;
4252
4253 assert(VA.isMemLoc() && "unexpected byval pointer in reg");
4254 unsigned RBegin, REnd;
4255 CCInfo.getInRegsParamInfo(CCInfo.getInRegsParamsProcessed(), RBegin, REnd);
4256 ArgRegBegin = std::min(ArgRegBegin, RBegin);
4257
4258 CCInfo.nextInRegsParam();
4259 }
4260 CCInfo.rewindByValRegsInfo();
4261
4262 int lastInsIndex = -1;
4263 if (isVarArg && MFI.hasVAStart()) {
4264 unsigned RegIdx = CCInfo.getFirstUnallocated(GPRArgRegs);
4265 if (RegIdx != std::size(GPRArgRegs))
4266 ArgRegBegin = std::min(ArgRegBegin, (unsigned)GPRArgRegs[RegIdx]);
4267 }
4268
4269 unsigned TotalArgRegsSaveSize = 4 * (ARM::R4 - ArgRegBegin);
4270 AFI->setArgRegsSaveSize(TotalArgRegsSaveSize);
4271 auto PtrVT = getPointerTy(DAG.getDataLayout());
4272
4273 for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
4274 CCValAssign &VA = ArgLocs[i];
4275 if (Ins[VA.getValNo()].isOrigArg()) {
4276 std::advance(CurOrigArg,
4277 Ins[VA.getValNo()].getOrigArgIndex() - CurArgIdx);
4278 CurArgIdx = Ins[VA.getValNo()].getOrigArgIndex();
4279 }
4280 // Arguments stored in registers.
4281 if (VA.isRegLoc()) {
4282 EVT RegVT = VA.getLocVT();
4283 SDValue ArgValue;
4284
4285 if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
4286 // f64 and vector types are split up into multiple registers or
4287 // combinations of registers and stack slots.
4288 SDValue ArgValue1 =
4289 GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4290 VA = ArgLocs[++i]; // skip ahead to next loc
4291 SDValue ArgValue2;
4292 if (VA.isMemLoc()) {
4293 int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
4294 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4295 ArgValue2 = DAG.getLoad(
4296 MVT::f64, dl, Chain, FIN,
4298 } else {
4299 ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4300 }
4301 ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
4302 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4303 ArgValue1, DAG.getIntPtrConstant(0, dl));
4304 ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
4305 ArgValue2, DAG.getIntPtrConstant(1, dl));
4306 } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
4307 ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
4308 } else {
4309 const TargetRegisterClass *RC;
4310
4311 if (RegVT == MVT::f16 || RegVT == MVT::bf16)
4312 RC = &ARM::HPRRegClass;
4313 else if (RegVT == MVT::f32)
4314 RC = &ARM::SPRRegClass;
4315 else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
4316 RegVT == MVT::v4bf16)
4317 RC = &ARM::DPRRegClass;
4318 else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
4319 RegVT == MVT::v8bf16)
4320 RC = &ARM::QPRRegClass;
4321 else if (RegVT == MVT::i32)
4322 RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
4323 : &ARM::GPRRegClass;
4324 else
4325 llvm_unreachable("RegVT not supported by FORMAL_ARGUMENTS Lowering");
4326
4327 // Transform the arguments in physical registers into virtual ones.
4328 Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
4329 ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
4330
4331 // If this value is passed in r0 and has the returned attribute (e.g.
4332 // C++ 'structors), record this fact for later use.
4333 if (VA.getLocReg() == ARM::R0 && Ins[VA.getValNo()].Flags.isReturned()) {
4334 AFI->setPreservesR0();
4335 }
4336 }
4337
4338 // If this is an 8 or 16-bit value, it is really passed promoted
4339 // to 32 bits. Insert an assert[sz]ext to capture this, then
4340 // truncate to the right size.
4341 switch (VA.getLocInfo()) {
4342 default: llvm_unreachable("Unknown loc info!");
4343 case CCValAssign::Full: break;
4344 case CCValAssign::BCvt:
4345 ArgValue = DAG.getNode(ISD::BITCAST, dl, VA.getValVT(), ArgValue);
4346 break;
4347 }
4348
4349 // f16 arguments have their size extended to 4 bytes and passed as if they
4350 // had been copied to the LSBs of a 32-bit register.
4351 // For that, it's passed extended to i32 (soft ABI) or to f32 (hard ABI)
4352 if (VA.needsCustom() &&
4353 (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
4354 ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
4355
4356 // On CMSE Entry Functions, formal integer arguments whose bitwidth is
4357 // less than 32 bits must be sign- or zero-extended in the callee for
4358 // security reasons. Although the ABI mandates an extension done by the
4359 // caller, the latter cannot be trusted to follow the rules of the ABI.
4360 const ISD::InputArg &Arg = Ins[VA.getValNo()];
4361 if (AFI->isCmseNSEntryFunction() && Arg.ArgVT.isScalarInteger() &&
4362 RegVT.isScalarInteger() && Arg.ArgVT.bitsLT(MVT::i32))
4363 ArgValue = handleCMSEValue(ArgValue, Arg, DAG, dl);
4364
4365 InVals.push_back(ArgValue);
4366 } else { // VA.isRegLoc()
4367 // Only arguments passed on the stack should make it here.
4368 assert(VA.isMemLoc());
4369 assert(VA.getValVT() != MVT::i64 && "i64 should already be lowered");
4370
4371 int index = VA.getValNo();
4372
4373 // Some Ins[] entries become multiple ArgLoc[] entries.
4374 // Process them only once.
4375 if (index != lastInsIndex)
4376 {
4377 ISD::ArgFlagsTy Flags = Ins[index].Flags;
4378 // FIXME: For now, all byval parameter objects are marked mutable.
4379 // This can be changed with more analysis.
4380 // In case of tail call optimization mark all arguments mutable.
4381 // Since they could be overwritten by lowering of arguments in case of
4382 // a tail call.
4383 if (Flags.isByVal()) {
4384 assert(Ins[index].isOrigArg() &&
4385 "Byval arguments cannot be implicit");
4386 unsigned CurByValIndex = CCInfo.getInRegsParamsProcessed();
4387
4388 int FrameIndex = StoreByValRegs(
4389 CCInfo, DAG, dl, Chain, &*CurOrigArg, CurByValIndex,
4390 VA.getLocMemOffset(), Flags.getByValSize());
4391 InVals.push_back(DAG.getFrameIndex(FrameIndex, PtrVT));
4392 CCInfo.nextInRegsParam();
4393 } else if (VA.needsCustom() && (VA.getValVT() == MVT::f16 ||
4394 VA.getValVT() == MVT::bf16)) {
4395 // f16 and bf16 values are passed in the least-significant half of
4396 // a 4 byte stack slot. This is done as-if the extension was done
4397 // in a 32-bit register, so the actual bytes used for the value
4398 // differ between little and big endian.
4399 assert(VA.getLocVT().getSizeInBits() == 32);
4400 unsigned FIOffset = VA.getLocMemOffset();
4401 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits() / 8,
4402 FIOffset, true);
4403
4404 SDValue Addr = DAG.getFrameIndex(FI, PtrVT);
4405 if (DAG.getDataLayout().isBigEndian())
4406 Addr = DAG.getObjectPtrOffset(dl, Addr, TypeSize::getFixed(2));
4407
4408 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, Addr,
4410 DAG.getMachineFunction(), FI)));
4411
4412 } else {
4413 unsigned FIOffset = VA.getLocMemOffset();
4414 int FI = MFI.CreateFixedObject(VA.getLocVT().getSizeInBits()/8,
4415 FIOffset, true);
4416
4417 // Create load nodes to retrieve arguments from the stack.
4418 SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
4419 InVals.push_back(DAG.getLoad(VA.getValVT(), dl, Chain, FIN,
4421 DAG.getMachineFunction(), FI)));
4422 }
4423 lastInsIndex = index;
4424 }
4425 }
4426 }
4427
4428 // varargs
4429 if (isVarArg && MFI.hasVAStart()) {
4430 VarArgStyleRegisters(CCInfo, DAG, dl, Chain, CCInfo.getStackSize(),
4431 TotalArgRegsSaveSize);
4432 if (AFI->isCmseNSEntryFunction()) {
4433 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4435 "secure entry function must not be variadic", dl.getDebugLoc()));
4436 }
4437 }
4438
4439 unsigned StackArgSize = CCInfo.getStackSize();
4440 bool TailCallOpt = MF.getTarget().Options.GuaranteedTailCallOpt;
4441 if (canGuaranteeTCO(CallConv, TailCallOpt)) {
4442 // The only way to guarantee a tail call is if the callee restores its
4443 // argument area, but it must also keep the stack aligned when doing so.
4444 MaybeAlign StackAlign = DAG.getDataLayout().getStackAlignment();
4445 assert(StackAlign && "data layout string is missing stack alignment");
4446 StackArgSize = alignTo(StackArgSize, *StackAlign);
4447
4448 AFI->setArgumentStackToRestore(StackArgSize);
4449 }
4450 AFI->setArgumentStackSize(StackArgSize);
4451
4452 if (CCInfo.getStackSize() > 0 && AFI->isCmseNSEntryFunction()) {
4453 DAG.getContext()->diagnose(DiagnosticInfoUnsupported(
4455 "secure entry function requires arguments on stack", dl.getDebugLoc()));
4456 }
4457
4458 return Chain;
4459}
4460
4461/// isFloatingPointZero - Return true if this is +0.0.
4464 return CFP->getValueAPF().isPosZero();
4465 else if (ISD::isEXTLoad(Op.getNode()) || ISD::isNON_EXTLoad(Op.getNode())) {
4466 // Maybe this has already been legalized into the constant pool?
4467 if (Op.getOperand(1).getOpcode() == ARMISD::Wrapper) {
4468 SDValue WrapperOp = Op.getOperand(1).getOperand(0);
4470 if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CP->getConstVal()))
4471 return CFP->getValueAPF().isPosZero();
4472 }
4473 } else if (Op->getOpcode() == ISD::BITCAST &&
4474 Op->getValueType(0) == MVT::f64) {
4475 // Handle (ISD::BITCAST (ARMISD::VMOVIMM (ISD::TargetConstant 0)) MVT::f64)
4476 // created by LowerConstantFP().
4477 SDValue BitcastOp = Op->getOperand(0);
4478 if (BitcastOp->getOpcode() == ARMISD::VMOVIMM &&
4479 isNullConstant(BitcastOp->getOperand(0)))
4480 return true;
4481 }
4482 return false;
4483}
4484
4486 // 0 - INT_MIN sign wraps, so no signed wrap means cmn is safe.
4487 if (Op->getFlags().hasNoSignedWrap())
4488 return true;
4489
4490 // We can still figure out if the second operand is safe to use
4491 // in a CMN instruction by checking if it is known to be not the minimum
4492 // signed value. If it is not, then we can safely use CMN.
4493 // Note: We can eventually remove this check and simply rely on
4494 // Op->getFlags().hasNoSignedWrap() once SelectionDAG/ISelLowering
4495 // consistently sets them appropriately when making said nodes.
4496
4497 KnownBits KnownSrc = DAG.computeKnownBits(Op.getOperand(1));
4498 return !KnownSrc.getSignedMinValue().isMinSignedValue();
4499}
4500
4502 return Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)) &&
4503 (isIntEqualitySetCC(CC) ||
4504 (isUnsignedIntSetCC(CC) && DAG.isKnownNeverZero(Op.getOperand(1))) ||
4505 (isSignedIntSetCC(CC) && isSafeSignedCMN(Op, DAG)));
4506}
4507
4508/// Returns appropriate ARM CMP (cmp) and corresponding condition code for
4509/// the given operands.
///
/// \param LHS,RHS  Operands to compare. They are taken by value; the local
///                 copies may be swapped, replaced by an adjusted constant, or
///                 rewritten as a shift before the compare node is built.
/// \param CC       Integer condition requested by the caller; the local copy
///                 is rewritten when a constant is adjusted by one.
/// \param ARMcc    [out] Receives an i32 constant holding the ARM condition
///                 code to be tested against the returned flags.
/// \param DAG,dl   DAG to build nodes in and the debug location to attach.
/// \returns The flag value of the emitted node: a CMP, CMPZ or CMN node of
///          type FlagsVT, or result 1 of an LSLS node on the Thumb1 shortcut.
///
/// NOTE(review): the embedded listing numbers skip 4550, 4553, 4613, 4622,
/// 4625 and 4653 inside this function, so six source lines appear to be
/// missing from this copy. Each gap is flagged below; consult the upstream
/// file before relying on or editing the affected statements.
4510SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
4511 SDValue &ARMcc, SelectionDAG &DAG,
4512 const SDLoc &dl) const {
4513 if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
4514 unsigned C = RHSC->getZExtValue();
4515 if (!isLegalICmpImmediate((int32_t)C)) {
4516 // Constant does not fit, try adjusting it by one.
// Each case pairs two conditions that can be expressed with C-1 or C+1
// instead (e.g. x < C becomes x <= C-1). The guard on C excludes the one
// value whose adjustment would wrap around.
4517 switch (CC) {
4518 default: break;
4519 case ISD::SETLT:
4520 case ISD::SETGE:
4521 if (C != 0x80000000 && isLegalICmpImmediate(C-1)) {
4522 CC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGT;
4523 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4524 }
4525 break;
4526 case ISD::SETULT:
4527 case ISD::SETUGE:
4528 if (C != 0 && isLegalICmpImmediate(C-1)) {
4529 CC = (CC == ISD::SETULT) ? ISD::SETULE : ISD::SETUGT;
4530 RHS = DAG.getConstant(C - 1, dl, MVT::i32);
4531 }
4532 break;
4533 case ISD::SETLE:
4534 case ISD::SETGT:
4535 if (C != 0x7fffffff && isLegalICmpImmediate(C+1)) {
4536 CC = (CC == ISD::SETLE) ? ISD::SETLT : ISD::SETGE;
4537 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4538 }
4539 break;
4540 case ISD::SETULE:
4541 case ISD::SETUGT:
4542 if (C != 0xffffffff && isLegalICmpImmediate(C+1)) {
4543 CC = (CC == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
4544 RHS = DAG.getConstant(C + 1, dl, MVT::i32);
4545 }
4546 break;
4547 }
4548 }
4549 } else if ((ARM_AM::getShiftOpcForNode(LHS.getOpcode()) != ARM_AM::no_shift) &&
// NOTE(review): listing line 4550 (the rest of this condition and the opening
// brace) is missing from this copy.
4551 // In ARM and Thumb-2, the compare instructions can shift their second
4552 // operand.
// NOTE(review): listing line 4553 is missing here; presumably it adjusts CC to
// match the operand swap below -- confirm against the upstream file.
4554 std::swap(LHS, RHS);
4555 }
4556
4557 // Thumb1 has very limited immediate modes, so turning an "and" into a
4558 // shift can save multiple instructions.
4559 //
4560 // If we have (x & C1), and C1 is an appropriate mask, we can transform it
4561 // into "((x << n) >> n)". But that isn't necessarily profitable on its
4562 // own. If it's the operand to an unsigned comparison with an immediate,
4563 // we can eliminate one of the shifts: we transform
4564 // "((x << n) >> n) == C2" to "(x << n) == (C2 << n)".
4565 //
4566 // We avoid transforming cases which aren't profitable due to encoding
4567 // details:
4568 //
4569 // 1. C2 fits into the immediate field of a cmp, and the transformed version
4570 // would not; in that case, we're essentially trading one immediate load for
4571 // another.
4572 // 2. C1 is 255 or 65535, so we can use uxtb or uxth.
4573 // 3. C2 is zero; we have other code for this special case.
4574 //
4575 // FIXME: Figure out profitability for Thumb2; we usually can't save an
4576 // instruction, since the AND is always one instruction anyway, but we could
4577 // use narrow instructions in some cases.
4578 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::AND &&
4579 LHS->hasOneUse() && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4580 LHS.getValueType() == MVT::i32 && isa<ConstantSDNode>(RHS) &&
4581 !isSignedIntSetCC(CC)) {
4582 unsigned Mask = LHS.getConstantOperandVal(1);
4583 auto *RHSC = cast<ConstantSDNode>(RHS.getNode());
4584 uint64_t RHSV = RHSC->getZExtValue();
4585 if (isMask_32(Mask) && (RHSV & ~Mask) == 0 && Mask != 255 && Mask != 65535) {
// The shift amount is the number of leading zero bits in the mask, so the
// shift discards exactly the bits the AND would have cleared.
4586 unsigned ShiftBits = llvm::countl_zero(Mask);
4587 if (RHSV && (RHSV > 255 || (RHSV << ShiftBits) <= 255)) {
4588 SDValue ShiftAmt = DAG.getConstant(ShiftBits, dl, MVT::i32);
4589 LHS = DAG.getNode(ISD::SHL, dl, MVT::i32, LHS.getOperand(0), ShiftAmt);
4590 RHS = DAG.getConstant(RHSV << ShiftBits, dl, MVT::i32);
4591 }
4592 }
4593 }
4594
4595 // The specific comparison "(x<<c) > 0x80000000U" can be optimized to a
4596 // single "lsls x, c+1". The shift sets the "C" and "Z" flags the same
4597 // way a cmp would.
4598 // FIXME: Add support for ARM/Thumb2; this would need isel patterns, and
4599 // some tweaks to the heuristics for the previous and->shift transform.
4600 // FIXME: Optimize cases where the LHS isn't a shift.
4601 if (Subtarget->isThumb1Only() && LHS->getOpcode() == ISD::SHL &&
4602 isa<ConstantSDNode>(RHS) && RHS->getAsZExtVal() == 0x80000000U &&
4603 CC == ISD::SETUGT && isa<ConstantSDNode>(LHS.getOperand(1)) &&
4604 LHS.getConstantOperandVal(1) < 31) {
4605 unsigned ShiftAmt = LHS.getConstantOperandVal(1) + 1;
4606 SDValue Shift =
4607 DAG.getNode(ARMISD::LSLS, dl, DAG.getVTList(MVT::i32, FlagsVT),
4608 LHS.getOperand(0), DAG.getConstant(ShiftAmt, dl, MVT::i32));
// This path returns early: the condition is fixed to HI and the flags are
// result 1 of the LSLS node, so no separate compare is emitted.
4609 ARMcc = DAG.getConstant(ARMCC::HI, dl, MVT::i32);
4610 return Shift.getValue(1);
4611 }
4612
// NOTE(review): listing line 4613 is missing here. CondCode is used from this
// point on but is never declared in this copy, so the missing line presumably
// declares it from CC -- confirm against the upstream file.
4614
4615 // If the RHS is a constant zero then the V (overflow) flag will never be
4616 // set. This can allow us to simplify GE to PL or LT to MI, which can be
4617 // simpler for other passes (like the peephole optimiser) to deal with.
4618 if (isNullConstant(RHS)) {
4619 switch (CondCode) {
4620 default: break;
4621 case ARMCC::GE:
// NOTE(review): listing line 4622 is missing; per the comment above it
// presumably rewrites GE to PL -- confirm.
4623 break;
4624 case ARMCC::LT:
// NOTE(review): listing line 4625 is missing; per the comment above it
// presumably rewrites LT to MI -- confirm.
4626 break;
4627 }
4628 }
4629
4630 unsigned CompareType;
4631 switch (CondCode) {
4632 default:
4633 CompareType = ARMISD::CMP;
4634 break;
4635 case ARMCC::EQ:
4636 case ARMCC::NE:
4637 // Uses only Z Flag
4638 CompareType = ARMISD::CMPZ;
4639 break;
4640 }
4641
4642 // TODO: Remove CMPZ check once we generalize and remove the CMPZ enum from
4643 // the codebase.
4644
4645 // TODO: When we have a solution to the vselect predicate not allowing pl/mi
4646 // all the time, allow those cases to be cmn too no matter what.
// When isCMN accepts an operand (a subtraction from constant zero), compare
// against that subtraction's second operand with CMN instead.
4647 if (CompareType != ARMISD::CMPZ && isCMN(RHS, CC, DAG)) {
4648 CompareType = ARMISD::CMN;
4649 RHS = RHS.getOperand(1);
4650 } else if (CompareType != ARMISD::CMPZ && isCMN(LHS, CC, DAG)) {
4651 CompareType = ARMISD::CMN;
4652 LHS = LHS.getOperand(1);
// NOTE(review): listing line 4653 is missing here; its content cannot be
// determined from this copy -- check the upstream file.
4654 }
4655
4656 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
4657 return DAG.getNode(CompareType, dl, FlagsVT, LHS, RHS);
4658}
4659
4660/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
4661SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
4662 SelectionDAG &DAG, const SDLoc &dl,
4663 bool Signaling) const {
4664 assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
4665 SDValue Flags;
4667 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP, dl, FlagsVT,
4668 LHS, RHS);
4669 else
4670 Flags = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0, dl,
4671 FlagsVT, LHS);
4672 return DAG.getNode(ARMISD::FMSTAT, dl, FlagsVT, Flags);
4673}
4674
4675// This function returns three things: the arithmetic computation itself
4676// (Value), a comparison (OverflowCmp), and a condition code (ARMcc). The
4677// comparison and the condition code define the case in which the arithmetic
4678// computation *does not* overflow.
4679std::pair<SDValue, SDValue>
4680ARMTargetLowering::getARMXALUOOp(SDValue Op, SelectionDAG &DAG,
4681 SDValue &ARMcc) const {
4682 assert(Op.getValueType() == MVT::i32 && "Unsupported value type");
4683
4684 SDValue Value, OverflowCmp;
4685 SDValue LHS = Op.getOperand(0);
4686 SDValue RHS = Op.getOperand(1);
4687 SDLoc dl(Op);
4688
4689 // FIXME: We are currently always generating CMPs because we don't support
4690 // generating CMN through the backend. This is not as good as the natural
4691 // CMP case because it causes a register dependency and cannot be folded
4692 // later.
4693
4694 switch (Op.getOpcode()) {
4695 default:
4696 llvm_unreachable("Unknown overflow instruction!");
4697 case ISD::SADDO:
4698 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4699 Value = DAG.getNode(ISD::ADD, dl, Op.getValueType(), LHS, RHS);
4700 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4701 break;
4702 case ISD::UADDO:
4703 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4704 // We use ADDC here to correspond to its use in LowerALUO.
4705 // We do not use it in the USUBO case as Value may not be used.
4706 Value = DAG.getNode(ARMISD::ADDC, dl,
4707 DAG.getVTList(Op.getValueType(), MVT::i32), LHS, RHS)
4708 .getValue(0);
4709 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, Value, LHS);
4710 break;
4711 case ISD::SSUBO:
4712 ARMcc = DAG.getConstant(ARMCC::VC, dl, MVT::i32);
4713 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4714 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4715 break;
4716 case ISD::USUBO:
4717 ARMcc = DAG.getConstant(ARMCC::HS, dl, MVT::i32);
4718 Value = DAG.getNode(ISD::SUB, dl, Op.getValueType(), LHS, RHS);
4719 OverflowCmp = DAG.getNode(ARMISD::CMP, dl, FlagsVT, LHS, RHS);
4720 break;
4721 case ISD::UMULO:
4722 // We generate a UMUL_LOHI and then check if the high word is 0.
4723 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4724 Value = DAG.getNode(ISD::UMUL_LOHI, dl,
4725 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4726 LHS, RHS);
4727 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4728 DAG.getConstant(0, dl, MVT::i32));
4729 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4730 break;
4731 case ISD::SMULO:
4732 // We generate a SMUL_LOHI and then check if all the bits of the high word
4733 // are the same as the sign bit of the low word.
4734 ARMcc = DAG.getConstant(ARMCC::EQ, dl, MVT::i32);
4735 Value = DAG.getNode(ISD::SMUL_LOHI, dl,
4736 DAG.getVTList(Op.getValueType(), Op.getValueType()),
4737 LHS, RHS);
4738 OverflowCmp = DAG.getNode(ARMISD::CMPZ, dl, FlagsVT, Value.getValue(1),
4739 DAG.getNode(ISD::SRA, dl, Op.getValueType(),
4740 Value.getValue(0),
4741 DAG.getConstant(31, dl, MVT::i32)));
4742 Value = Value.getValue(0); // We only want the low 32 bits for the result.
4743 break;
4744 } // switch (...)
4745
4746 return std::make_pair(Value, OverflowCmp);
4747}
4748
4750 SDLoc DL(Value);
4751 EVT VT = Value.getValueType();
4752
4753 if (Invert)
4754 Value = DAG.getNode(ISD::SUB, DL, MVT::i32,
4755 DAG.getConstant(1, DL, MVT::i32), Value);
4756
4757 SDValue Cmp = DAG.getNode(ARMISD::SUBC, DL, DAG.getVTList(VT, MVT::i32),
4758 Value, DAG.getConstant(1, DL, VT));
4759 return Cmp.getValue(1);
4760}
4761
4763 bool Invert) {
4764 SDLoc DL(Flags);
4765
4766 if (Invert) {
4767 // Convert flags to boolean with ADDE 0,0,Carry then compute 1 - bool.
4768 SDValue BoolCarry = DAG.getNode(
4769 ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4770 DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, VT), Flags);
4771 return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(1, DL, VT), BoolCarry);
4772 }
4773
4774 // Now convert the carry flag into a boolean carry. We do this
4775 // using ARMISD::ADDE 0, 0, Carry
4776 return DAG.getNode(ARMISD::ADDE, DL, DAG.getVTList(VT, MVT::i32),
4777 DAG.getConstant(0, DL, VT), DAG.getConstant(0, DL, VT),
4778 Flags);
4779}
4780
4781// Value is 1 if 'V' bit is 1, else 0
4783 SDLoc DL(Flags);
4784 SDValue Zero = DAG.getConstant(0, DL, VT);
4785 SDValue One = DAG.getConstant(1, DL, VT);
4786 SDValue ARMcc = DAG.getConstant(ARMCC::VS, DL, MVT::i32);
4787 return DAG.getNode(ARMISD::CMOV, DL, VT, Zero, One, ARMcc, Flags);
4788}
4789
4790SDValue ARMTargetLowering::LowerALUO(SDValue Op, SelectionDAG &DAG) const {
4791 // Let legalize expand this if it isn't a legal type yet.
4792 if (!isTypeLegal(Op.getValueType()))
4793 return SDValue();
4794
4795 SDValue LHS = Op.getOperand(0);
4796 SDValue RHS = Op.getOperand(1);
4797 SDLoc dl(Op);
4798
4799 EVT VT = Op.getValueType();
4800 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
4801 SDValue Value;
4802 SDValue Overflow;
4803 switch (Op.getOpcode()) {
4804 case ISD::UADDO:
4805 Value = DAG.getNode(ARMISD::ADDC, dl, VTs, LHS, RHS);
4806 // Convert the carry flag into a boolean value.
4807 Overflow = carryFlagToValue(Value.getValue(1), VT, DAG, false);
4808 break;
4809 case ISD::USUBO:
4810 Value = DAG.getNode(ARMISD::SUBC, dl, VTs, LHS, RHS);
4811 // Convert the carry flag into a boolean value.
4812 Overflow = carryFlagToValue(Value.getValue(1), VT, DAG, true);
4813 break;
4814 default: {
4815 // Handle other operations with getARMXALUOOp
4816 SDValue OverflowCmp, ARMcc;
4817 std::tie(Value, OverflowCmp) = getARMXALUOOp(Op, DAG, ARMcc);
4818 // We use 0 and 1 as false and true values.
4819 // ARMcc represents the "no overflow" condition (e.g., VC for signed ops).
4820 // CMOV operand order is (FalseVal, TrueVal), so we put 1 in FalseVal
4821 // position to get Overflow=1 when the "no overflow" condition is false.
4822 Overflow =
4823 DAG.getNode(ARMISD::CMOV, dl, MVT::i32,
4824 DAG.getConstant(1, dl, MVT::i32), // FalseVal: overflow
4825 DAG.getConstant(0, dl, MVT::i32), // TrueVal: no overflow
4826 ARMcc, OverflowCmp);
4827 break;
4828 }
4829 }
4830
4831 return DAG.getNode(ISD::MERGE_VALUES, dl, VTs, Value, Overflow);
4832}
4833
// Lowers an i8 or i16 saturating add/sub (UADDSAT, SADDSAT, USUBSAT, SSUBSAT)
// to the corresponding ARMISD saturating node, computed on i32 operands and
// truncated back to the original type. Returns an empty SDValue when the
// subtarget lacks v6 ops or DSP, is Thumb1-only, or the type is not a simple
// i8/i16.
// NOTE(review): the first line of this function's signature (listing line
// 4834) is absent from this listing.
4835 const ARMSubtarget *Subtarget) {
4836 EVT VT = Op.getValueType();
4837 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() || Subtarget->isThumb1Only())
4838 return SDValue();
4839 if (!VT.isSimple())
4840 return SDValue();
4841
// NOTE(review): the inner switches below have no default case, so NewOpcode is
// only assigned for the four saturating opcodes — presumably the only ones
// routed here; confirm against the caller.
4842 unsigned NewOpcode;
4843 switch (VT.getSimpleVT().SimpleTy) {
4844 default:
4845 return SDValue();
4846 case MVT::i8:
4847 switch (Op->getOpcode()) {
4848 case ISD::UADDSAT:
4849 NewOpcode = ARMISD::UQADD8b;
4850 break;
4851 case ISD::SADDSAT:
4852 NewOpcode = ARMISD::QADD8b;
4853 break;
4854 case ISD::USUBSAT:
4855 NewOpcode = ARMISD::UQSUB8b;
4856 break;
4857 case ISD::SSUBSAT:
4858 NewOpcode = ARMISD::QSUB8b;
4859 break;
4860 }
4861 break;
4862 case MVT::i16:
4863 switch (Op->getOpcode()) {
4864 case ISD::UADDSAT:
4865 NewOpcode = ARMISD::UQADD16b;
4866 break;
4867 case ISD::SADDSAT:
4868 NewOpcode = ARMISD::QADD16b;
4869 break;
4870 case ISD::USUBSAT:
4871 NewOpcode = ARMISD::UQSUB16b;
4872 break;
4873 case ISD::SSUBSAT:
4874 NewOpcode = ARMISD::QSUB16b;
4875 break;
4876 }
4877 break;
4878 }
4879
// Bring both operands to i32 (sign-extending or truncating as needed), apply
// the selected saturating node at i32, then truncate the result back to VT.
4880 SDLoc dl(Op);
4881 SDValue Add =
4882 DAG.getNode(NewOpcode, dl, MVT::i32,
4883 DAG.getSExtOrTrunc(Op->getOperand(0), dl, MVT::i32),
4884 DAG.getSExtOrTrunc(Op->getOperand(1), dl, MVT::i32));
4885 return DAG.getNode(ISD::TRUNCATE, dl, VT, Add);
4886}
4887
4888SDValue ARMTargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
4889 SDValue Cond = Op.getOperand(0);
4890 SDValue SelectTrue = Op.getOperand(1);
4891 SDValue SelectFalse = Op.getOperand(2);
4892 SDLoc dl(Op);
4893 unsigned Opc = Cond.getOpcode();
4894
4895 if (Cond.getResNo() == 1 &&
4896 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
4897 Opc == ISD::USUBO)) {
4898 if (!isTypeLegal(Cond->getValueType(0)))
4899 return SDValue();
4900
4901 SDValue Value, OverflowCmp;
4902 SDValue ARMcc;
4903 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
4904 EVT VT = Op.getValueType();
4905
4906 return getCMOV(dl, VT, SelectTrue, SelectFalse, ARMcc, OverflowCmp, DAG);
4907 }
4908
4909 // Convert:
4910 //
4911 // (select (cmov 1, 0, cond), t, f) -> (cmov t, f, cond)
4912 // (select (cmov 0, 1, cond), t, f) -> (cmov f, t, cond)
4913 //
4914 if (Cond.getOpcode() == ARMISD::CMOV && Cond.hasOneUse()) {
4915 const ConstantSDNode *CMOVTrue =
4916 dyn_cast<ConstantSDNode>(Cond.getOperand(0));
4917 const ConstantSDNode *CMOVFalse =
4918 dyn_cast<ConstantSDNode>(Cond.getOperand(1));
4919
4920 if (CMOVTrue && CMOVFalse) {
4921 unsigned CMOVTrueVal = CMOVTrue->getZExtValue();
4922 unsigned CMOVFalseVal = CMOVFalse->getZExtValue();
4923
4924 SDValue True;
4925 SDValue False;
4926 if (CMOVTrueVal == 1 && CMOVFalseVal == 0) {
4927 True = SelectTrue;
4928 False = SelectFalse;
4929 } else if (CMOVTrueVal == 0 && CMOVFalseVal == 1) {
4930 True = SelectFalse;
4931 False = SelectTrue;
4932 }
4933
4934 if (True.getNode() && False.getNode())
4935 return getCMOV(dl, Op.getValueType(), True, False, Cond.getOperand(2),
4936 Cond.getOperand(3), DAG);
4937 }
4938 }
4939
4940 return DAG.getSelectCC(dl, Cond,
4941 DAG.getConstant(0, dl, Cond.getValueType()),
4942 SelectTrue, SelectFalse, ISD::SETNE);
4943}
4944
// Maps an ISD condition code onto one of the condition codes assigned below
// (GE, GT, VS, EQ) and reports, through swpCmpOps / swpVselOps, whether the
// compare operands and/or the VSEL operands have to be swapped to compensate.
// CondCode, swpCmpOps and swpVselOps are only written for the condition codes
// tested below; any other CC leaves all three untouched.
// NOTE(review): the first line of this function's signature (listing line
// 4945) is absent from this listing.
4946 bool &swpCmpOps, bool &swpVselOps) {
4947 // Start by selecting the GE condition code for opcodes that return true for
4948 // 'equality'
4949 if (CC == ISD::SETUGE || CC == ISD::SETOGE || CC == ISD::SETOLE ||
4950 CC == ISD::SETULE || CC == ISD::SETGE || CC == ISD::SETLE)
4951 CondCode = ARMCC::GE;
4952
4953 // and GT for opcodes that return false for 'equality'.
4954 else if (CC == ISD::SETUGT || CC == ISD::SETOGT || CC == ISD::SETOLT ||
4955 CC == ISD::SETULT || CC == ISD::SETGT || CC == ISD::SETLT)
4956 CondCode = ARMCC::GT;
4957
4958 // Since we are constrained to GE/GT, if the opcode contains 'less', we need
4959 // to swap the compare operands.
4960 if (CC == ISD::SETOLE || CC == ISD::SETULE || CC == ISD::SETOLT ||
4961 CC == ISD::SETULT || CC == ISD::SETLE || CC == ISD::SETLT)
4962 swpCmpOps = true;
4963
4964 // Both GT and GE are ordered comparisons, and return false for 'unordered'.
4965 // If we have an unordered opcode, we need to swap the operands to the VSEL
4966 // instruction (effectively negating the condition).
4967 //
4968 // This also has the effect of swapping which one of 'less' or 'greater'
4969 // returns true, so we also swap the compare operands. It also switches
4970 // whether we return true for 'equality', so we compensate by picking the
4971 // opposite condition code to our original choice.
4972 if (CC == ISD::SETULE || CC == ISD::SETULT || CC == ISD::SETUGE ||
4973 CC == ISD::SETUGT) {
4974 swpCmpOps = !swpCmpOps;
4975 swpVselOps = !swpVselOps;
4976 CondCode = CondCode == ARMCC::GT ? ARMCC::GE : ARMCC::GT;
4977 }
4978
4979 // 'ordered' is 'anything but unordered', so use the VS condition code and
4980 // swap the VSEL operands.
4981 if (CC == ISD::SETO) {
4982 CondCode = ARMCC::VS;
4983 swpVselOps = true;
4984 }
4985
4986 // 'unordered or not equal' is 'anything but equal', so use the EQ condition
4987 // code and swap the VSEL operands. Also do this if we don't care about the
4988 // unordered case.
4989 if (CC == ISD::SETUNE || CC == ISD::SETNE) {
4990 CondCode = ARMCC::EQ;
4991 swpVselOps = true;
4992 }
4993}
4994
4995SDValue ARMTargetLowering::getCMOV(const SDLoc &dl, EVT VT, SDValue FalseVal,
4996 SDValue TrueVal, SDValue ARMcc,
4997 SDValue Flags, SelectionDAG &DAG) const {
4998 if (!Subtarget->hasFP64() && VT == MVT::f64) {
4999 FalseVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5000 DAG.getVTList(MVT::i32, MVT::i32), FalseVal);
5001 TrueVal = DAG.getNode(ARMISD::VMOVRRD, dl,
5002 DAG.getVTList(MVT::i32, MVT::i32), TrueVal);
5003
5004 SDValue TrueLow = TrueVal.getValue(0);
5005 SDValue TrueHigh = TrueVal.getValue(1);
5006 SDValue FalseLow = FalseVal.getValue(0);
5007 SDValue FalseHigh = FalseVal.getValue(1);
5008
5009 SDValue Low = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseLow, TrueLow,
5010 ARMcc, Flags);
5011 SDValue High = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, FalseHigh, TrueHigh,
5012 ARMcc, Flags);
5013
5014 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Low, High);
5015 }
5016 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal, ARMcc, Flags);
5017}
5018
5019static bool isGTorGE(ISD::CondCode CC) {
5020 return CC == ISD::SETGT || CC == ISD::SETGE;
5021}
5022
5023static bool isLTorLE(ISD::CondCode CC) {
5024 return CC == ISD::SETLT || CC == ISD::SETLE;
5025}
5026
5027// See if a conditional (LHS CC RHS ? TrueVal : FalseVal) is lower-saturating.
5028// All of these conditions (and their <= and >= counterparts) will do:
5029// x < k ? k : x
5030// x > k ? x : k
5031// k < x ? x : k
5032// k > x ? k : x
5033static bool isLowerSaturate(const SDValue LHS, const SDValue RHS,
5034 const SDValue TrueVal, const SDValue FalseVal,
5035 const ISD::CondCode CC, const SDValue K) {
5036 return (isGTorGE(CC) &&
5037 ((K == LHS && K == TrueVal) || (K == RHS && K == FalseVal))) ||
5038 (isLTorLE(CC) &&
5039 ((K == RHS && K == TrueVal) || (K == LHS && K == FalseVal)));
5040}
5041
5042// Check if two chained conditionals could be converted into SSAT or USAT.
5043//
5044// SSAT can replace a set of two conditional selectors that bound a number to an
5045// interval of type [k, ~k] when k + 1 is a power of 2. Here are some examples:
5046//
5047// x < -k ? -k : (x > k ? k : x)
5048// x < -k ? -k : (x < k ? x : k)
5049// x > -k ? (x > k ? k : x) : -k
5050// x < k ? (x < -k ? -k : x) : k
5051// etc.
5052//
5053// LLVM canonicalizes these to either a min(max()) or a max(min())
5054// pattern. This function tries to match one of these and will return a SSAT
5055// node if successful.
5056//
5057// USAT works similarly to SSAT but bounds on the interval [0, k] where k + 1
5058// is a power of 2.
// Returns an empty SDValue when no pattern matches.
// NOTE(review): the signature line of this function (listing line 5059) is
// absent from this listing.
// Operands of the outer SELECT_CC: compared values, the two select arms and
// the condition code.
5060 EVT VT = Op.getValueType();
5061 SDValue V1 = Op.getOperand(0);
5062 SDValue K1 = Op.getOperand(1);
5063 SDValue TrueVal1 = Op.getOperand(2);
5064 SDValue FalseVal1 = Op.getOperand(3);
5065 ISD::CondCode CC1 = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5066
// The non-constant arm of the outer select must itself be a SELECT_CC.
5067 const SDValue Op2 = isa<ConstantSDNode>(TrueVal1) ? FalseVal1 : TrueVal1;
5068 if (Op2.getOpcode() != ISD::SELECT_CC)
5069 return SDValue();
5070
5071 SDValue V2 = Op2.getOperand(0);
5072 SDValue K2 = Op2.getOperand(1);
5073 SDValue TrueVal2 = Op2.getOperand(2);
5074 SDValue FalseVal2 = Op2.getOperand(3);
5075 ISD::CondCode CC2 = cast<CondCodeSDNode>(Op2.getOperand(4))->get();
5076
5077 SDValue V1Tmp = V1;
5078 SDValue V2Tmp = V2;
5079
5080 // Check that the registers and the constants match a max(min()) or min(max())
5081 // pattern
5082 if (V1Tmp != TrueVal1 || V2Tmp != TrueVal2 || K1 != FalseVal1 ||
5083 K2 != FalseVal2 ||
5084 !((isGTorGE(CC1) && isLTorLE(CC2)) || (isLTorLE(CC1) && isGTorGE(CC2))))
5085 return SDValue();
5086
5087 // Check that the constant in the lower-bound check is
5088 // the opposite of the constant in the upper-bound check
5089 // in 1's complement.
// NOTE(review): the condition guarding the early return below (listing line
// 5090) is absent from this listing; the casts that follow require K1 and K2
// to be ConstantSDNodes.
5091 return SDValue();
5092
5093 int64_t Val1 = cast<ConstantSDNode>(K1)->getSExtValue();
5094 int64_t Val2 = cast<ConstantSDNode>(K2)->getSExtValue();
5095 int64_t PosVal = std::max(Val1, Val2);
5096 int64_t NegVal = std::min(Val1, Val2);
5097
// The larger constant must belong to the "less" comparison (the upper bound),
// and it must be one less than a power of two.
5098 if (!((Val1 > Val2 && isLTorLE(CC1)) || (Val1 < Val2 && isLTorLE(CC2))) ||
5099 !isPowerOf2_64(PosVal + 1))
5100 return SDValue();
5101
5102 // Handle the difference between USAT (unsigned) and SSAT (signed)
5103 // saturation
5104 // At this point, PosVal is guaranteed to be positive
5105 uint64_t K = PosVal;
5106 SDLoc dl(Op);
// Bounds of the form [~k, k] become SSAT; bounds of the form [0, k] become
// USAT. The constant operand is the number of trailing one bits of k.
5107 if (Val1 == ~Val2)
5108 return DAG.getNode(ARMISD::SSAT, dl, VT, V2Tmp,
5109 DAG.getConstant(llvm::countr_one(K), dl, VT));
5110 if (NegVal == 0)
5111 return DAG.getNode(ARMISD::USAT, dl, VT, V2Tmp,
5112 DAG.getConstant(llvm::countr_one(K), dl, VT));
5113
5114 return SDValue();
5115}
5116
5117// Check if a condition of the type x < k ? k : x can be converted into a
5118// bit operation instead of conditional moves.
5119// Currently this is allowed given:
5120// - The conditions and values match up
5121// - k is 0 or -1 (all ones)
5122// This function will not check the last condition; that's up to the caller.
5123// It returns true if the transformation can be made, and in such case
5124// returns x in V, and k in SatK.
// Note that V is assigned before the final checks, so it may be overwritten
// even when the function returns false.
// NOTE(review): the first line of the signature (listing line 5125) is absent
// from this listing.
5126 SDValue &SatK)
5127{
5128 SDValue LHS = Op.getOperand(0);
5129 SDValue RHS = Op.getOperand(1);
5130 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5131 SDValue TrueVal = Op.getOperand(2);
5132 SDValue FalseVal = Op.getOperand(3);
5133
// NOTE(review): the start of the statement that defines K (listing line 5134)
// is absent from this listing; from its uses below, K points at whichever
// compare operand is the constant, or is null when neither is.
5135 ? &RHS
5136 : nullptr;
5137
5138 // No constant operation in comparison, early out
5139 if (!K)
5140 return false;
5141
// KTmp is the constant select arm (FalseVal if TrueVal is not a constant), V
// the other arm, and VTmp the compare operand that is not K.
5142 SDValue KTmp = isa<ConstantSDNode>(TrueVal) ? TrueVal : FalseVal;
5143 V = (KTmp == TrueVal) ? FalseVal : TrueVal;
5144 SDValue VTmp = (K && *K == LHS) ? RHS : LHS;
5145
5146 // If the constant on left and right side, or variable on left and right,
5147 // does not match, early out
5148 if (*K != KTmp || V != VTmp)
5149 return false;
5150
5151 if (isLowerSaturate(LHS, RHS, TrueVal, FalseVal, CC, *K)) {
5152 SatK = *K;
5153 return true;
5154 }
5155
5156 return false;
5157}
5158
5159bool ARMTargetLowering::isUnsupportedFloatingType(EVT VT) const {
5160 if (VT == MVT::f32)
5161 return !Subtarget->hasVFP2Base();
5162 if (VT == MVT::f64)
5163 return !Subtarget->hasFP64();
5164 if (VT == MVT::f16)
5165 return !Subtarget->hasFullFP16();
5166 return false;
5167}
5168
5169static SDValue matchCSET(unsigned &Opcode, bool &InvertCond, SDValue TrueVal,
5170 SDValue FalseVal, const ARMSubtarget *Subtarget) {
5171 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5172 ConstantSDNode *CTVal = dyn_cast<ConstantSDNode>(TrueVal);
5173 if (!CFVal || !CTVal || !Subtarget->hasV8_1MMainlineOps())
5174 return SDValue();
5175
5176 unsigned TVal = CTVal->getZExtValue();
5177 unsigned FVal = CFVal->getZExtValue();
5178
5179 Opcode = 0;
5180 InvertCond = false;
5181 if (TVal == ~FVal) {
5182 Opcode = ARMISD::CSINV;
5183 } else if (TVal == ~FVal + 1) {
5184 Opcode = ARMISD::CSNEG;
5185 } else if (TVal + 1 == FVal) {
5186 Opcode = ARMISD::CSINC;
5187 } else if (TVal == FVal + 1) {
5188 Opcode = ARMISD::CSINC;
5189 std::swap(TrueVal, FalseVal);
5190 std::swap(TVal, FVal);
5191 InvertCond = !InvertCond;
5192 } else {
5193 return SDValue();
5194 }
5195
5196 // If one of the constants is cheaper than another, materialise the
5197 // cheaper one and let the csel generate the other.
5198 if (Opcode != ARMISD::CSINC &&
5199 HasLowerConstantMaterializationCost(FVal, TVal, Subtarget)) {
5200 std::swap(TrueVal, FalseVal);
5201 std::swap(TVal, FVal);
5202 InvertCond = !InvertCond;
5203 }
5204
5205 // Attempt to use ZR checking TVal is 0, possibly inverting the condition
5206 // to get there. CSINC not is invertable like the other two (~(~a) == a,
5207 // -(-a) == a, but (a+1)+1 != a).
5208 if (FVal == 0 && Opcode != ARMISD::CSINC) {
5209 std::swap(TrueVal, FalseVal);
5210 std::swap(TVal, FVal);
5211 InvertCond = !InvertCond;
5212 }
5213
5214 return TrueVal;
5215}
5216
// Custom lowering for ISD::SELECT_CC (operands: LHS, RHS, TrueVal, FalseVal,
// condition code). Tries, in order:
//  - a pair of saturating selects folded into SSAT/USAT,
//  - shift-and-mask forms for lower saturation against 0 / -1 and for
//    max/min/sign-bit patterns against 0,
//  - a CSINV/CSNEG/CSINC match through matchCSET,
//  - and finally a compare (integer, or VFP for floating point) feeding one
//    or two CMOV nodes built by getCMOV.
5217SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
5218 EVT VT = Op.getValueType();
5219 SDLoc dl(Op);
5220
5221 // Try to convert two saturating conditional selects into a single SSAT
5222 if ((!Subtarget->isThumb() && Subtarget->hasV6Ops()) || Subtarget->isThumb2())
5223 if (SDValue SatValue = LowerSaturatingConditional(Op, DAG))
5224 return SatValue;
5225
5226 // Try to convert expressions of the form x < k ? k : x (and similar forms)
5227 // into more efficient bit operations, which is possible when k is 0 or -1
5228 // On ARM and Thumb-2 which have flexible operand 2 this will result in
5229 // single instructions. On Thumb the shift and the bit operation will be two
5230 // instructions.
5231 // Only allow this transformation on full-width (32-bit) operations
5232 SDValue LowerSatConstant;
5233 SDValue SatValue;
5234 if (VT == MVT::i32 &&
5235 isLowerSaturatingConditional(Op, SatValue, LowerSatConstant)) {
// ShiftV is all ones when SatValue is negative and zero otherwise.
5236 SDValue ShiftV = DAG.getNode(ISD::SRA, dl, VT, SatValue,
5237 DAG.getConstant(31, dl, VT));
5238 if (isNullConstant(LowerSatConstant)) {
5239 SDValue NotShiftV = DAG.getNode(ISD::XOR, dl, VT, ShiftV,
5240 DAG.getAllOnesConstant(dl, VT));
5241 return DAG.getNode(ISD::AND, dl, VT, SatValue, NotShiftV);
5242 } else if (isAllOnesConstant(LowerSatConstant))
5243 return DAG.getNode(ISD::OR, dl, VT, SatValue, ShiftV);
5244 }
5245
5246 SDValue LHS = Op.getOperand(0);
5247 SDValue RHS = Op.getOperand(1);
5248 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
5249 SDValue TrueVal = Op.getOperand(2);
5250 SDValue FalseVal = Op.getOperand(3);
5251 ConstantSDNode *CFVal = dyn_cast<ConstantSDNode>(FalseVal);
5252 ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS);
5253 if (Op.getValueType().isInteger()) {
5254
5255 // Check for SMAX(lhs, 0) and SMIN(lhs, 0) patterns.
5256 // (SELECT_CC setgt, lhs, 0, lhs, 0) -> (BIC lhs, (SRA lhs, typesize-1))
5257 // (SELECT_CC setlt, lhs, 0, lhs, 0) -> (AND lhs, (SRA lhs, typesize-1))
5258 // Both require less instructions than compare and conditional select.
5259 if ((CC == ISD::SETGT || CC == ISD::SETLT) && LHS == TrueVal && RHSC &&
5260 RHSC->isZero() && CFVal && CFVal->isZero() &&
5261 LHS.getValueType() == RHS.getValueType()) {
5262 EVT VT = LHS.getValueType();
5263 SDValue Shift =
5264 DAG.getNode(ISD::SRA, dl, VT, LHS,
5265 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5266
5267 if (CC == ISD::SETGT)
5268 Shift = DAG.getNOT(dl, Shift, VT);
5269
5270 return DAG.getNode(ISD::AND, dl, VT, LHS, Shift);
5271 }
5272
5273 // (SELECT_CC setlt, x, 0, 1, 0) -> SRL(x, bw-1)
5274 if (CC == ISD::SETLT && isNullConstant(RHS) && isOneConstant(TrueVal) &&
5275 isNullConstant(FalseVal) && LHS.getValueType() == VT)
5276 return DAG.getNode(ISD::SRL, dl, VT, LHS,
5277 DAG.getConstant(VT.getSizeInBits() - 1, dl, VT));
5278 }
5279
// Selects between two related constants on an i32 compare: use a
// CSINV/CSNEG/CSINC node when matchCSET finds one.
5280 if (LHS.getValueType() == MVT::i32) {
5281 unsigned Opcode;
5282 bool InvertCond;
5283 if (SDValue Op =
5284 matchCSET(Opcode, InvertCond, TrueVal, FalseVal, Subtarget)) {
5285 if (InvertCond)
5286 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5287
5288 SDValue ARMcc;
5289 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5290 EVT VT = Op.getValueType();
5291 return DAG.getNode(Opcode, dl, VT, Op, Op, ARMcc, Cmp);
5292 }
5293 }
5294
5295 if (isUnsupportedFloatingType(LHS.getValueType())) {
5296 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5297
5298 // If softenSetCCOperands only returned one value, we should compare it to
5299 // zero.
5300 if (!RHS.getNode()) {
5301 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5302 CC = ISD::SETNE;
5303 }
5304 }
5305
5306 if (LHS.getValueType() == MVT::i32) {
5307 // Try to generate VSEL on ARMv8.
5308 // The VSEL instruction can't use all the usual ARM condition
5309 // codes: it only has two bits to select the condition code, so it's
5310 // constrained to use only GE, GT, VS and EQ.
5311 //
5312 // To implement all the various ISD::SETXXX opcodes, we sometimes need to
5313 // swap the operands of the previous compare instruction (effectively
5314 // inverting the compare condition, swapping 'less' and 'greater') and
5315 // sometimes need to swap the operands to the VSEL (which inverts the
5316 // condition in the sense of firing whenever the previous condition didn't)
5317 if (Subtarget->hasFPARMv8Base() && (TrueVal.getValueType() == MVT::f16 ||
5318 TrueVal.getValueType() == MVT::f32 ||
5319 TrueVal.getValueType() == MVT::f64)) {
// NOTE(review): the statement that defines CondCode for this scope (listing
// line 5320) is absent from this listing.
5321 if (CondCode == ARMCC::LT || CondCode == ARMCC::LE ||
5322 CondCode == ARMCC::VC || CondCode == ARMCC::NE) {
5323 CC = ISD::getSetCCInverse(CC, LHS.getValueType());
5324 std::swap(TrueVal, FalseVal);
5325 }
5326 }
5327
5328 SDValue ARMcc;
5329 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5330 // Choose GE over PL, which vsel does not support
5331 if (ARMcc->getAsZExtVal() == ARMCC::PL)
5332 ARMcc = DAG.getConstant(ARMCC::GE, dl, MVT::i32);
5333 return getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
5334 }
5335
// Floating-point compare: FPCCToARMCC yields a primary condition code and a
// secondary one (ARMCC::AL when no second CMOV is needed, see below).
5336 ARMCC::CondCodes CondCode, CondCode2;
5337 FPCCToARMCC(CC, CondCode, CondCode2);
5338
5339 // Normalize the fp compare. If RHS is zero we prefer to keep it there so we
5340 // match CMPFPw0 instead of CMPFP, though we don't do this for f16 because we
5341 // must use VSEL (limited condition codes), due to not having conditional f16
5342 // moves.
5343 if (Subtarget->hasFPARMv8Base() &&
5344 !(isFloatingPointZero(RHS) && TrueVal.getValueType() != MVT::f16) &&
5345 (TrueVal.getValueType() == MVT::f16 ||
5346 TrueVal.getValueType() == MVT::f32 ||
5347 TrueVal.getValueType() == MVT::f64)) {
5348 bool swpCmpOps = false;
5349 bool swpVselOps = false;
5350 checkVSELConstraints(CC, CondCode, swpCmpOps, swpVselOps);
5351
5352 if (CondCode == ARMCC::GT || CondCode == ARMCC::GE ||
5353 CondCode == ARMCC::VS || CondCode == ARMCC::EQ) {
5354 if (swpCmpOps)
5355 std::swap(LHS, RHS);
5356 if (swpVselOps)
5357 std::swap(TrueVal, FalseVal);
5358 }
5359 }
5360
5361 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5362 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5363 SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, Cmp, DAG);
// A second condition code means a second CMOV is chained on the same compare,
// with the first result as its false value.
5364 if (CondCode2 != ARMCC::AL) {
5365 SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
5366 Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, Cmp, DAG);
5367 }
5368 return Result;
5369}
5370
5371/// canChangeToInt - Given the fp compare operand, return true if it is suitable
5372/// to morph to an integer compare sequence.
5373static bool canChangeToInt(SDValue Op, bool &SeenZero,
5374 const ARMSubtarget *Subtarget) {
5375 SDNode *N = Op.getNode();
5376 if (!N->hasOneUse())
5377 // Otherwise it requires moving the value from fp to integer registers.
5378 return false;
5379 if (!N->getNumValues())
5380 return false;
5381 EVT VT = Op.getValueType();
5382 if (VT != MVT::f32 && !Subtarget->isFPBrccSlow())
5383 // f32 case is generally profitable. f64 case only makes sense when vcmpe +
5384 // vmrs are very slow, e.g. cortex-a8.
5385 return false;
5386
5387 if (isFloatingPointZero(Op)) {
5388 SeenZero = true;
5389 return true;
5390 }
5391 return ISD::isNormalLoad(N);
5392}
5393
// Body of a helper that produces an i32 form of a compare operand: either the
// i32 constant 0, or an i32 load that reuses Ld's chain, base pointer, pointer
// info, alignment and memory-operand flags. Any other operand is unreachable.
// NOTE(review): the signature and the conditions guarding both returns
// (listing lines 5394, 5395 and 5398) are absent from this listing;
// presumably this is bitcastf32Toi32, which is called further down — confirm.
5396 return DAG.getConstant(0, SDLoc(Op), MVT::i32);
5397
5399 return DAG.getLoad(MVT::i32, SDLoc(Op), Ld->getChain(), Ld->getBasePtr(),
5400 Ld->getPointerInfo(), Ld->getAlign(),
5401 Ld->getMemOperand()->getFlags());
5402
5403 llvm_unreachable("Unknown VFP cmp argument!");
5404}
5405
// Splits a compare operand into two i32 values returned through RetVal1 and
// RetVal2. A floating-point zero yields two zero constants; a load is replaced
// by two i32 loads, at the original address and at that address plus 4. Any
// other operand is unreachable.
// NOTE(review): the first line of the signature (listing line 5406) is absent
// from this listing; presumably this is expandf64Toi32, which is called
// further down — confirm.
5407 SDValue &RetVal1, SDValue &RetVal2) {
5408 SDLoc dl(Op);
5409
5410 if (isFloatingPointZero(Op)) {
5411 RetVal1 = DAG.getConstant(0, dl, MVT::i32);
5412 RetVal2 = DAG.getConstant(0, dl, MVT::i32);
5413 return;
5414 }
5415
5416 if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Op)) {
5417 SDValue Ptr = Ld->getBasePtr();
// First word: same address, pointer info, alignment and flags as Ld.
5418 RetVal1 =
5419 DAG.getLoad(MVT::i32, dl, Ld->getChain(), Ptr, Ld->getPointerInfo(),
5420 Ld->getAlign(), Ld->getMemOperand()->getFlags());
5421
// Second word: address + 4, with the pointer info offset by 4 and the
// alignment reduced to what is common with a 4-byte offset.
5422 EVT PtrType = Ptr.getValueType();
5423 SDValue NewPtr = DAG.getNode(ISD::ADD, dl,
5424 PtrType, Ptr, DAG.getConstant(4, dl, PtrType));
5425 RetVal2 = DAG.getLoad(MVT::i32, dl, Ld->getChain(), NewPtr,
5426 Ld->getPointerInfo().getWithOffset(4),
5427 commonAlignment(Ld->getAlign(), 4),
5428 Ld->getMemOperand()->getFlags());
5429 return;
5430 }
5431
5432 llvm_unreachable("Unknown VFP cmp argument!");
5433}
5434
5435/// OptimizeVFPBrcond - With nnan and without daz, it's legal to optimize some
5436/// f32 and even f64 comparisons to integer ones.
/// Op is a BR_CC node (chain, condition code, LHS, RHS, destination). Returns
/// the replacement branch node, or an empty SDValue when the operands are not
/// suitable (see canChangeToInt) or neither of them is a floating-point zero.
5437SDValue
5438ARMTargetLowering::OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const {
5439 SDValue Chain = Op.getOperand(0);
5440 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5441 SDValue LHS = Op.getOperand(2);
5442 SDValue RHS = Op.getOperand(3);
5443 SDValue Dest = Op.getOperand(4);
5444 SDLoc dl(Op);
5445
5446 bool LHSSeenZero = false;
5447 bool LHSOk = canChangeToInt(LHS, LHSSeenZero, Subtarget);
5448 bool RHSSeenZero = false;
5449 bool RHSOk = canChangeToInt(RHS, RHSSeenZero, Subtarget);
5450 if (LHSOk && RHSOk && (LHSSeenZero || RHSSeenZero)) {
5451 // If unsafe fp math optimization is enabled and there are no other uses of
5452 // the CMP operands, and the condition code is EQ or NE, we can optimize it
5453 // to an integer comparison.
5454 if (CC == ISD::SETOEQ)
5455 CC = ISD::SETEQ;
5456 else if (CC == ISD::SETUNE)
5457 CC = ISD::SETNE;
5458
// Mask clears bit 31 (the sign bit of the word it is applied to) before the
// integer compare.
5459 SDValue Mask = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5460 SDValue ARMcc;
5461 if (LHS.getValueType() == MVT::f32) {
5462 LHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5463 bitcastf32Toi32(LHS, DAG), Mask);
5464 RHS = DAG.getNode(ISD::AND, dl, MVT::i32,
5465 bitcastf32Toi32(RHS, DAG), Mask);
5466 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5467 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5468 Cmp);
5469 }
5470
// Non-f32 operands: split each into two i32 words, mask the second word of
// each, and emit a 64-bit compare-and-branch.
5471 SDValue LHS1, LHS2;
5472 SDValue RHS1, RHS2;
5473 expandf64Toi32(LHS, DAG, LHS1, LHS2);
5474 expandf64Toi32(RHS, DAG, RHS1, RHS2);
5475 LHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, LHS2, Mask);
5476 RHS2 = DAG.getNode(ISD::AND, dl, MVT::i32, RHS2, Mask);
// NOTE(review): the statement that defines CondCode (listing line 5477) is
// absent from this listing.
5478 ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5479 SDValue Ops[] = { Chain, ARMcc, LHS1, LHS2, RHS1, RHS2, Dest };
5480 return DAG.getNode(ARMISD::BCC_i64, dl, MVT::Other, Ops);
5481 }
5482
5483 return SDValue();
5484}
5485
5486// Generate CMP + CMOV for integer abs.
5487SDValue ARMTargetLowering::LowerABS(SDValue Op, SelectionDAG &DAG) const {
5488 SDLoc DL(Op);
5489
5490 SDValue Neg = DAG.getNegative(Op.getOperand(0), DL, MVT::i32);
5491
5492 // Generate CMP & CMOV.
5493 SDValue Cmp = DAG.getNode(ARMISD::CMP, DL, FlagsVT, Op.getOperand(0),
5494 DAG.getConstant(0, DL, MVT::i32));
5495 return DAG.getNode(ARMISD::CMOV, DL, MVT::i32, Op.getOperand(0), Neg,
5496 DAG.getConstant(ARMCC::MI, DL, MVT::i32), Cmp);
5497}
5498
// Reads the ARM condition code held in the constant node ARMcc and returns a
// new i32 constant holding the opposite condition.
// NOTE(review): the signature line (listing line 5499) is absent from this
// listing; presumably this is getInvertedARMCondCode, which is called further
// down — confirm.
5500 ARMCC::CondCodes CondCode =
5501 (ARMCC::CondCodes)cast<ConstantSDNode>(ARMcc)->getZExtValue();
5502 CondCode = ARMCC::getOppositeCondition(CondCode);
5503 return DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
5504}
5505
5506SDValue ARMTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
5507 SDValue Chain = Op.getOperand(0);
5508 SDValue Cond = Op.getOperand(1);
5509 SDValue Dest = Op.getOperand(2);
5510 SDLoc dl(Op);
5511
5512 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5513 // instruction.
5514 unsigned Opc = Cond.getOpcode();
5515 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5516 !Subtarget->isThumb1Only();
5517 if (Cond.getResNo() == 1 &&
5518 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5519 Opc == ISD::USUBO || OptimizeMul)) {
5520 // Only lower legal XALUO ops.
5521 if (!isTypeLegal(Cond->getValueType(0)))
5522 return SDValue();
5523
5524 // The actual operation with overflow check.
5525 SDValue Value, OverflowCmp;
5526 SDValue ARMcc;
5527 std::tie(Value, OverflowCmp) = getARMXALUOOp(Cond, DAG, ARMcc);
5528
5529 // Reverse the condition code.
5530 ARMcc = getInvertedARMCondCode(ARMcc, DAG);
5531
5532 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5533 OverflowCmp);
5534 }
5535
5536 return SDValue();
5537}
5538
5539SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
5540 SDValue Chain = Op.getOperand(0);
5541 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
5542 SDValue LHS = Op.getOperand(2);
5543 SDValue RHS = Op.getOperand(3);
5544 SDValue Dest = Op.getOperand(4);
5545 SDLoc dl(Op);
5546
5547 if (isUnsupportedFloatingType(LHS.getValueType())) {
5548 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS);
5549
5550 // If softenSetCCOperands only returned one value, we should compare it to
5551 // zero.
5552 if (!RHS.getNode()) {
5553 RHS = DAG.getConstant(0, dl, LHS.getValueType());
5554 CC = ISD::SETNE;
5555 }
5556 }
5557
5558 // Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
5559 // instruction.
5560 unsigned Opc = LHS.getOpcode();
5561 bool OptimizeMul = (Opc == ISD::SMULO || Opc == ISD::UMULO) &&
5562 !Subtarget->isThumb1Only();
5563 if (LHS.getResNo() == 1 && (isOneConstant(RHS) || isNullConstant(RHS)) &&
5564 (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
5565 Opc == ISD::USUBO || OptimizeMul) &&
5566 (CC == ISD::SETEQ || CC == ISD::SETNE)) {
5567 // Only lower legal XALUO ops.
5568 if (!isTypeLegal(LHS->getValueType(0)))
5569 return SDValue();
5570
5571 // The actual operation with overflow check.
5572 SDValue Value, OverflowCmp;
5573 SDValue ARMcc;
5574 std::tie(Value, OverflowCmp) = getARMXALUOOp(LHS.getValue(0), DAG, ARMcc);
5575
5576 if ((CC == ISD::SETNE) != isOneConstant(RHS)) {
5577 // Reverse the condition code.
5578 ARMcc = getInvertedARMCondCode(ARMcc, DAG);
5579 }
5580
5581 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc,
5582 OverflowCmp);
5583 }
5584
5585 if (LHS.getValueType() == MVT::i32) {
5586 SDValue ARMcc;
5587 SDValue Cmp = getARMCmp(LHS, RHS, CC, ARMcc, DAG, dl);
5588 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, Dest, ARMcc, Cmp);
5589 }
5590
5591 SDNodeFlags Flags = Op->getFlags();
5592 if (Flags.hasNoNaNs() &&
5593 DAG.getDenormalMode(MVT::f32) == DenormalMode::getIEEE() &&
5594 DAG.getDenormalMode(MVT::f64) == DenormalMode::getIEEE() &&
5595 (CC == ISD::SETEQ || CC == ISD::SETOEQ || CC == ISD::SETNE ||
5596 CC == ISD::SETUNE)) {
5597 if (SDValue Result = OptimizeVFPBrcond(Op, DAG))
5598 return Result;
5599 }
5600
5601 ARMCC::CondCodes CondCode, CondCode2;
5602 FPCCToARMCC(CC, CondCode, CondCode2);
5603
5604 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
5605 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
5606 SDValue Ops[] = {Chain, Dest, ARMcc, Cmp};
5607 SDValue Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5608 if (CondCode2 != ARMCC::AL) {
5609 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
5610 SDValue Ops[] = {Res, Dest, ARMcc, Cmp};
5611 Res = DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Ops);
5612 }
5613 return Res;
5614}
5615
// Custom lowering for ISD::BR_JT (chain, jump table, index). Computes the
// address of the selected 4-byte table entry and emits either a BR2_JT
// (Thumb2, or Thumb with v8-M Baseline ops) or a BR_JT whose target is loaded
// from the table; in position-independent / ROPI code the loaded value is
// added to the table address first.
5616SDValue ARMTargetLowering::LowerBR_JT(SDValue Op, SelectionDAG &DAG) const {
5617 SDValue Chain = Op.getOperand(0);
5618 SDValue Table = Op.getOperand(1);
5619 SDValue Index = Op.getOperand(2);
5620 SDLoc dl(Op);
5621
5622 EVT PTy = getPointerTy(DAG.getDataLayout());
5623 JumpTableSDNode *JT = cast<JumpTableSDNode>(Table);
5624 SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy);
5625 Table = DAG.getNode(ARMISD::WrapperJT, dl, MVT::i32, JTI);
// Entry address = table + index * 4.
5626 Index = DAG.getNode(ISD::MUL, dl, PTy, Index, DAG.getConstant(4, dl, PTy));
5627 SDValue Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Index);
5628 if (Subtarget->isThumb2() || (Subtarget->hasV8MBaselineOps() && Subtarget->isThumb())) {
5629 // Thumb2 and ARMv8-M use a two-level jump. That is, it jumps into the jump table
5630 // which does another jump to the destination. This also makes it easier
5631 // to translate it to TBB / TBH later (Thumb2 only).
5632 // FIXME: This might not work if the function is extremely large.
5633 return DAG.getNode(ARMISD::BR2_JT, dl, MVT::Other, Chain,
5634 Addr, Op.getOperand(2), JTI);
5635 }
5636 if (isPositionIndependent() || Subtarget->isROPI()) {
// NOTE(review): the final argument line of this getLoad call (listing line
// 5639) is absent from this listing.
5637 Addr =
5638 DAG.getLoad((EVT)MVT::i32, dl, Chain, Addr,
5640 Chain = Addr.getValue(1);
5641 Addr = DAG.getNode(ISD::ADD, dl, PTy, Table, Addr);
5642 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5643 } else {
// NOTE(review): the final argument line of this getLoad call (listing line
// 5646) is absent from this listing.
5644 Addr =
5645 DAG.getLoad(PTy, dl, Chain, Addr,
5647 Chain = Addr.getValue(1);
5648 return DAG.getNode(ARMISD::BR_JT, dl, MVT::Other, Chain, Addr, JTI);
5649 }
5650}
5651
// Vector fp-to-int lowering. Results with i32 elements are kept as is when the
// source has f32 elements and unrolled otherwise. For the remaining cases the
// conversion is done at a wider/equal integer vector type chosen from the
// source type and truncated to VT, provided VT is v4i16 or v8i16; any other
// VT is unrolled.
// NOTE(review): the signature line (listing line 5652) is absent from this
// listing; presumably this is LowerVectorFP_TO_INT, which is called further
// down — confirm.
5653 EVT VT = Op.getValueType();
5654 SDLoc dl(Op);
5655
5656 if (Op.getValueType().getVectorElementType() == MVT::i32) {
5657 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::f32)
5658 return Op;
5659 return DAG.UnrollVectorOp(Op.getNode());
5660 }
5661
5662 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5663
// Pick the integer vector type with the same lane count and width as the
// source; the f16 sources additionally require full FP16 support.
5664 EVT NewTy;
5665 const EVT OpTy = Op.getOperand(0).getValueType();
5666 if (OpTy == MVT::v4f32)
5667 NewTy = MVT::v4i32;
5668 else if (OpTy == MVT::v4f16 && HasFullFP16)
5669 NewTy = MVT::v4i16;
5670 else if (OpTy == MVT::v8f16 && HasFullFP16)
5671 NewTy = MVT::v8i16;
5672 else
5673 llvm_unreachable("Invalid type for custom lowering!");
5674
5675 if (VT != MVT::v4i16 && VT != MVT::v8i16)
5676 return DAG.UnrollVectorOp(Op.getNode());
5677
5678 Op = DAG.getNode(Op.getOpcode(), dl, NewTy, Op.getOperand(0));
5679 return DAG.getNode(ISD::TRUNCATE, dl, VT, Op);
5680}
5681
// Custom lowering for fp-to-int conversions, including the strict variants.
// Vector results are delegated to LowerVectorFP_TO_INT. A source of an
// unsupported floating-point type becomes a runtime library call. Remaining
// strict nodes are rebuilt as a non-strict node merged with the incoming
// chain; everything else is returned unchanged.
5682SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
5683 EVT VT = Op.getValueType();
5684 if (VT.isVector())
5685 return LowerVectorFP_TO_INT(Op, DAG);
5686
// Strict nodes carry the chain as operand 0, so the source is operand 1.
5687 bool IsStrict = Op->isStrictFPOpcode();
5688 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
5689
5690 if (isUnsupportedFloatingType(SrcVal.getValueType())) {
5691 RTLIB::Libcall LC;
5692 if (Op.getOpcode() == ISD::FP_TO_SINT ||
5693 Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
5694 LC = RTLIB::getFPTOSINT(SrcVal.getValueType(),
5695 Op.getValueType());
5696 else
5697 LC = RTLIB::getFPTOUINT(SrcVal.getValueType(),
5698 Op.getValueType());
5699 SDLoc Loc(Op);
5700 MakeLibCallOptions CallOptions;
5701 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
// NOTE(review): the declaration of Result (listing line 5702) is absent from
// this listing.
5703 std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
5704 CallOptions, Loc, Chain);
5705 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
5706 }
5707
5708 // FIXME: Remove this when we have strict fp instruction selection patterns
5709 if (IsStrict) {
5710 SDLoc Loc(Op);
// NOTE(review): the start of the getNode call that selects the non-strict
// opcode (listing lines 5712-5713) is absent from this listing.
5711 SDValue Result =
5714 Loc, Op.getValueType(), SrcVal);
5715 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
5716 }
5717
5718 return Op;
5719}
5720
// Lowering for saturating fp-to-int conversions. Combinations the subtarget
// handles directly are returned unchanged. For remaining v4f32 / v8f16
// sources the conversion is redone saturating at the full element width of VT
// and the result is then clamped to the ToVT range with min/max nodes. Any
// other source type yields an empty SDValue.
// NOTE(review): the first line of the signature (listing line 5721) is absent
// from this listing.
5722 const ARMSubtarget *Subtarget) {
5723 EVT VT = Op.getValueType();
5724 EVT ToVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
5725 EVT FromVT = Op.getOperand(0).getValueType();
5726
5727 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f32)
5728 return Op;
5729 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f64 &&
5730 Subtarget->hasFP64())
5731 return Op;
5732 if (VT == MVT::i32 && ToVT == MVT::i32 && FromVT == MVT::f16 &&
5733 Subtarget->hasFullFP16())
5734 return Op;
5735 if (VT == MVT::v4i32 && ToVT == MVT::i32 && FromVT == MVT::v4f32 &&
5736 Subtarget->hasMVEFloatOps())
5737 return Op;
5738 if (VT == MVT::v8i16 && ToVT == MVT::i16 && FromVT == MVT::v8f16 &&
5739 Subtarget->hasMVEFloatOps())
5740 return Op;
5741
5742 if (FromVT != MVT::v4f32 && FromVT != MVT::v8f16)
5743 return SDValue();
5744
5745 SDLoc DL(Op);
5746 bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT_SAT;
// BW is the number of value bits of the target width: one less than the
// scalar width for signed saturation.
// NOTE(review): `1 << BW` below is an int shift; if BW can reach 32 here
// (e.g. an unsigned saturation to i32 that is not returned early above) the
// shift is out of range — confirm whether that case is reachable.
5747 unsigned BW = ToVT.getScalarSizeInBits() - IsSigned;
5748 SDValue CVT = DAG.getNode(Op.getOpcode(), DL, VT, Op.getOperand(0),
5749 DAG.getValueType(VT.getScalarType()));
// Clamp from above to 2^BW - 1, and for signed conversions from below to
// -2^BW.
5750 SDValue Max = DAG.getNode(IsSigned ? ISD::SMIN : ISD::UMIN, DL, VT, CVT,
5751 DAG.getConstant((1 << BW) - 1, DL, VT));
5752 if (IsSigned)
5753 Max = DAG.getNode(ISD::SMAX, DL, VT, Max,
5754 DAG.getSignedConstant(-(1 << BW), DL, VT));
5755 return Max;
5756}
5757
5759 EVT VT = Op.getValueType();
5760 SDLoc dl(Op);
5761
5762 if (Op.getOperand(0).getValueType().getVectorElementType() == MVT::i32) {
5763 if (VT.getVectorElementType() == MVT::f32)
5764 return Op;
5765 return DAG.UnrollVectorOp(Op.getNode());
5766 }
5767
5768 assert((Op.getOperand(0).getValueType() == MVT::v4i16 ||
5769 Op.getOperand(0).getValueType() == MVT::v8i16) &&
5770 "Invalid type for custom lowering!");
5771
5772 const bool HasFullFP16 = DAG.getSubtarget<ARMSubtarget>().hasFullFP16();
5773
5774 EVT DestVecType;
5775 if (VT == MVT::v4f32)
5776 DestVecType = MVT::v4i32;
5777 else if (VT == MVT::v4f16 && HasFullFP16)
5778 DestVecType = MVT::v4i16;
5779 else if (VT == MVT::v8f16 && HasFullFP16)
5780 DestVecType = MVT::v8i16;
5781 else
5782 return DAG.UnrollVectorOp(Op.getNode());
5783
5784 unsigned CastOpc;
5785 unsigned Opc;
5786 switch (Op.getOpcode()) {
5787 default: llvm_unreachable("Invalid opcode!");
5788 case ISD::SINT_TO_FP:
5789 CastOpc = ISD::SIGN_EXTEND;
5791 break;
5792 case ISD::UINT_TO_FP:
5793 CastOpc = ISD::ZERO_EXTEND;
5795 break;
5796 }
5797
5798 Op = DAG.getNode(CastOpc, dl, DestVecType, Op.getOperand(0));
5799 return DAG.getNode(Opc, dl, VT, Op);
5800}
5801
5802SDValue ARMTargetLowering::LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const {
5803 EVT VT = Op.getValueType();
5804 if (VT.isVector())
5805 return LowerVectorINT_TO_FP(Op, DAG);
5806 if (isUnsupportedFloatingType(VT)) {
5807 RTLIB::Libcall LC;
5808 if (Op.getOpcode() == ISD::SINT_TO_FP)
5809 LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(),
5810 Op.getValueType());
5811 else
5812 LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(),
5813 Op.getValueType());
5814 MakeLibCallOptions CallOptions;
5815 return makeLibCall(DAG, LC, Op.getValueType(), Op.getOperand(0),
5816 CallOptions, SDLoc(Op)).first;
5817 }
5818
5819 return Op;
5820}
5821
5822SDValue ARMTargetLowering::LowerFCOPYSIGN(SDValue Op, SelectionDAG &DAG) const {
5823 // Implement fcopysign with a fabs and a conditional fneg.
5824 SDValue Tmp0 = Op.getOperand(0);
5825 SDValue Tmp1 = Op.getOperand(1);
5826 SDLoc dl(Op);
5827 EVT VT = Op.getValueType();
5828 EVT SrcVT = Tmp1.getValueType();
5829 bool InGPR = Tmp0.getOpcode() == ISD::BITCAST ||
5830 Tmp0.getOpcode() == ARMISD::VMOVDRR;
5831 bool UseNEON = !InGPR && Subtarget->hasNEON();
5832
5833 if (UseNEON) {
5834 // Use VBSL to copy the sign bit.
5835 unsigned EncodedVal = ARM_AM::createVMOVModImm(0x6, 0x80);
5836 SDValue Mask = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v2i32,
5837 DAG.getTargetConstant(EncodedVal, dl, MVT::i32));
5838 EVT OpVT = (VT == MVT::f32) ? MVT::v2i32 : MVT::v1i64;
5839 if (VT == MVT::f64)
5840 Mask = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5841 DAG.getNode(ISD::BITCAST, dl, OpVT, Mask),
5842 DAG.getConstant(32, dl, MVT::i32));
5843 else /*if (VT == MVT::f32)*/
5844 Tmp0 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp0);
5845 if (SrcVT == MVT::f32) {
5846 Tmp1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f32, Tmp1);
5847 if (VT == MVT::f64)
5848 Tmp1 = DAG.getNode(ARMISD::VSHLIMM, dl, OpVT,
5849 DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1),
5850 DAG.getConstant(32, dl, MVT::i32));
5851 } else if (VT == MVT::f32)
5852 Tmp1 = DAG.getNode(ARMISD::VSHRuIMM, dl, MVT::v1i64,
5853 DAG.getNode(ISD::BITCAST, dl, MVT::v1i64, Tmp1),
5854 DAG.getConstant(32, dl, MVT::i32));
5855 Tmp0 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp0);
5856 Tmp1 = DAG.getNode(ISD::BITCAST, dl, OpVT, Tmp1);
5857
5859 dl, MVT::i32);
5860 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v8i8, AllOnes);
5861 SDValue MaskNot = DAG.getNode(ISD::XOR, dl, OpVT, Mask,
5862 DAG.getNode(ISD::BITCAST, dl, OpVT, AllOnes));
5863
5864 SDValue Res = DAG.getNode(ISD::OR, dl, OpVT,
5865 DAG.getNode(ISD::AND, dl, OpVT, Tmp1, Mask),
5866 DAG.getNode(ISD::AND, dl, OpVT, Tmp0, MaskNot));
5867 if (VT == MVT::f32) {
5868 Res = DAG.getNode(ISD::BITCAST, dl, MVT::v2f32, Res);
5869 Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
5870 DAG.getConstant(0, dl, MVT::i32));
5871 } else {
5872 Res = DAG.getNode(ISD::BITCAST, dl, MVT::f64, Res);
5873 }
5874
5875 return Res;
5876 }
5877
5878 // Bitcast operand 1 to i32.
5879 if (SrcVT == MVT::f64)
5880 Tmp1 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5881 Tmp1).getValue(1);
5882 Tmp1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp1);
5883
5884 // Or in the signbit with integer operations.
5885 SDValue Mask1 = DAG.getConstant(0x80000000, dl, MVT::i32);
5886 SDValue Mask2 = DAG.getConstant(0x7fffffff, dl, MVT::i32);
5887 Tmp1 = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp1, Mask1);
5888 if (VT == MVT::f32) {
5889 Tmp0 = DAG.getNode(ISD::AND, dl, MVT::i32,
5890 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Tmp0), Mask2);
5891 return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
5892 DAG.getNode(ISD::OR, dl, MVT::i32, Tmp0, Tmp1));
5893 }
5894
5895 // f64: Or the high part with signbit and then combine two parts.
5896 Tmp0 = DAG.getNode(ARMISD::VMOVRRD, dl, DAG.getVTList(MVT::i32, MVT::i32),
5897 Tmp0);
5898 SDValue Lo = Tmp0.getValue(0);
5899 SDValue Hi = DAG.getNode(ISD::AND, dl, MVT::i32, Tmp0.getValue(1), Mask2);
5900 Hi = DAG.getNode(ISD::OR, dl, MVT::i32, Hi, Tmp1);
5901 return DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi);
5902}
5903
5904SDValue ARMTargetLowering::LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const{
5905 MachineFunction &MF = DAG.getMachineFunction();
5906 MachineFrameInfo &MFI = MF.getFrameInfo();
5907 MFI.setReturnAddressIsTaken(true);
5908
5909 EVT VT = Op.getValueType();
5910 SDLoc dl(Op);
5911 unsigned Depth = Op.getConstantOperandVal(0);
5912 if (Depth) {
5913 SDValue FrameAddr = LowerFRAMEADDR(Op, DAG);
5914 SDValue Offset = DAG.getConstant(4, dl, MVT::i32);
5915 return DAG.getLoad(VT, dl, DAG.getEntryNode(),
5916 DAG.getNode(ISD::ADD, dl, VT, FrameAddr, Offset),
5917 MachinePointerInfo());
5918 }
5919
5920 // Return LR, which contains the return address. Mark it an implicit live-in.
5921 Register Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
5922 return DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, VT);
5923}
5924
5925SDValue ARMTargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
5926 const ARMBaseRegisterInfo &ARI =
5927 *static_cast<const ARMBaseRegisterInfo*>(RegInfo);
5928 MachineFunction &MF = DAG.getMachineFunction();
5929 MachineFrameInfo &MFI = MF.getFrameInfo();
5930 MFI.setFrameAddressIsTaken(true);
5931
5932 EVT VT = Op.getValueType();
5933 SDLoc dl(Op); // FIXME probably not meaningful
5934 unsigned Depth = Op.getConstantOperandVal(0);
5935 Register FrameReg = ARI.getFrameRegister(MF);
5936 SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT);
5937 while (Depth--)
5938 FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
5939 MachinePointerInfo());
5940 return FrameAddr;
5941}
5942
5943// FIXME? Maybe this could be a TableGen attribute on some registers and
5944// this table could be generated automatically from RegInfo.
5945Register ARMTargetLowering::getRegisterByName(const char* RegName, LLT VT,
5946 const MachineFunction &MF) const {
5947 return StringSwitch<Register>(RegName)
5948 .Case("sp", ARM::SP)
5949 .Default(Register());
5950}
5951
5952// Result is 64 bit value so split into two 32 bit values and return as a
5953// pair of values.
5955 SelectionDAG &DAG) {
5956 SDLoc DL(N);
5957
5958 // This function is only supposed to be called for i64 type destination.
5959 assert(N->getValueType(0) == MVT::i64
5960 && "ExpandREAD_REGISTER called for non-i64 type result.");
5961
5963 DAG.getVTList(MVT::i32, MVT::i32, MVT::Other),
5964 N->getOperand(0),
5965 N->getOperand(1));
5966
5967 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Read.getValue(0),
5968 Read.getValue(1)));
5969 Results.push_back(Read.getValue(2)); // Chain
5970}
5971
5972/// \p BC is a bitcast that is about to be turned into a VMOVDRR.
5973/// When \p DstVT, the destination type of \p BC, is on the vector
5974/// register bank and the source of bitcast, \p Op, operates on the same bank,
5975/// it might be possible to combine them, such that everything stays on the
5976/// vector register bank.
5977/// \p return The node that would replace \p BT, if the combine
5978/// is possible.
5980 SelectionDAG &DAG) {
5981 SDValue Op = BC->getOperand(0);
5982 EVT DstVT = BC->getValueType(0);
5983
5984 // The only vector instruction that can produce a scalar (remember,
5985 // since the bitcast was about to be turned into VMOVDRR, the source
5986 // type is i64) from a vector is EXTRACT_VECTOR_ELT.
5987 // Moreover, we can do this combine only if there is one use.
5988 // Finally, if the destination type is not a vector, there is not
5989 // much point on forcing everything on the vector bank.
5990 if (!DstVT.isVector() || Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
5991 !Op.hasOneUse())
5992 return SDValue();
5993
5994 // If the index is not constant, we will introduce an additional
5995 // multiply that will stick.
5996 // Give up in that case.
5997 ConstantSDNode *Index = dyn_cast<ConstantSDNode>(Op.getOperand(1));
5998 if (!Index)
5999 return SDValue();
6000 unsigned DstNumElt = DstVT.getVectorNumElements();
6001
6002 // Compute the new index.
6003 const APInt &APIntIndex = Index->getAPIntValue();
6004 APInt NewIndex(APIntIndex.getBitWidth(), DstNumElt);
6005 NewIndex *= APIntIndex;
6006 // Check if the new constant index fits into i32.
6007 if (NewIndex.getBitWidth() > 32)
6008 return SDValue();
6009
6010 // vMTy bitcast(i64 extractelt vNi64 src, i32 index) ->
6011 // vMTy extractsubvector vNxMTy (bitcast vNi64 src), i32 index*M)
6012 SDLoc dl(Op);
6013 SDValue ExtractSrc = Op.getOperand(0);
6014 EVT VecVT = EVT::getVectorVT(
6015 *DAG.getContext(), DstVT.getScalarType(),
6016 ExtractSrc.getValueType().getVectorNumElements() * DstNumElt);
6017 SDValue BitCast = DAG.getNode(ISD::BITCAST, dl, VecVT, ExtractSrc);
6018 return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DstVT, BitCast,
6019 DAG.getConstant(NewIndex.getZExtValue(), dl, MVT::i32));
6020}
6021
6022/// ExpandBITCAST - If the target supports VFP, this function is called to
6023/// expand a bit convert where either the source or destination type is i64 to
6024/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
6025/// operand type is illegal (e.g., v2f32 for a target that doesn't support
6026/// vectors), since the legalizer won't know what to do with that.
6027SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
6028 const ARMSubtarget *Subtarget) const {
6029 SDLoc dl(N);
6030 SDValue Op = N->getOperand(0);
6031
6032 // This function is only supposed to be called for i16 and i64 types, either
6033 // as the source or destination of the bit convert.
6034 EVT SrcVT = Op.getValueType();
6035 EVT DstVT = N->getValueType(0);
6036
6037 if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
6038 (DstVT == MVT::f16 || DstVT == MVT::bf16))
6039 return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
6040 DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
6041
6042 if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
6043 (SrcVT == MVT::f16 || SrcVT == MVT::bf16)) {
6044 if (Subtarget->hasFullFP16() && !Subtarget->hasBF16())
6045 Op = DAG.getBitcast(MVT::f16, Op);
6046 return DAG.getNode(
6047 ISD::TRUNCATE, SDLoc(N), DstVT,
6048 MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
6049 }
6050
6051 if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
6052 return SDValue();
6053
6054 // Turn i64->f64 into VMOVDRR.
6055 if (SrcVT == MVT::i64 && isTypeLegal(DstVT)) {
6056 // Do not force values to GPRs (this is what VMOVDRR does for the inputs)
6057 // if we can combine the bitcast with its source.
6059 return Val;
6060 SDValue Lo, Hi;
6061 std::tie(Lo, Hi) = DAG.SplitScalar(Op, dl, MVT::i32, MVT::i32);
6062 return DAG.getNode(ISD::BITCAST, dl, DstVT,
6063 DAG.getNode(ARMISD::VMOVDRR, dl, MVT::f64, Lo, Hi));
6064 }
6065
6066 // Turn f64->i64 into VMOVRRD.
6067 if (DstVT == MVT::i64 && isTypeLegal(SrcVT)) {
6068 SDValue Cvt;
6069 if (DAG.getDataLayout().isBigEndian() && SrcVT.isVector() &&
6070 SrcVT.getVectorNumElements() > 1)
6071 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6072 DAG.getVTList(MVT::i32, MVT::i32),
6073 DAG.getNode(ARMISD::VREV64, dl, SrcVT, Op));
6074 else
6075 Cvt = DAG.getNode(ARMISD::VMOVRRD, dl,
6076 DAG.getVTList(MVT::i32, MVT::i32), Op);
6077 // Merge the pieces into a single i64 value.
6078 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Cvt, Cvt.getValue(1));
6079 }
6080
6081 return SDValue();
6082}
6083
6084/// getZeroVector - Returns a vector of specified type with all zero elements.
6085/// Zero vectors are used to represent vector negation and in those cases
6086/// will be implemented with the NEON VNEG instruction. However, VNEG does
6087/// not support i64 elements, so sometimes the zero vectors will need to be
6088/// explicitly constructed. Regardless, use a canonical VMOV to create the
6089/// zero vector.
6090static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
6091 assert(VT.isVector() && "Expected a vector type");
6092 // The canonical modified immediate encoding of a zero vector is....0!
6093 SDValue EncodedVal = DAG.getTargetConstant(0, dl, MVT::i32);
6094 EVT VmovVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
6095 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, EncodedVal);
6096 return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
6097}
6098
6099/// LowerShiftRightParts - Lower SRA_PARTS, which returns two
6100/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6101SDValue ARMTargetLowering::LowerShiftRightParts(SDValue Op,
6102 SelectionDAG &DAG) const {
6103 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6104 EVT VT = Op.getValueType();
6105 unsigned VTBits = VT.getSizeInBits();
6106 SDLoc dl(Op);
6107 SDValue ShOpLo = Op.getOperand(0);
6108 SDValue ShOpHi = Op.getOperand(1);
6109 SDValue ShAmt = Op.getOperand(2);
6110 SDValue ARMcc;
6111 unsigned Opc = (Op.getOpcode() == ISD::SRA_PARTS) ? ISD::SRA : ISD::SRL;
6112
6113 assert(Op.getOpcode() == ISD::SRA_PARTS || Op.getOpcode() == ISD::SRL_PARTS);
6114
6115 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6116 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6117 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, ShAmt);
6118 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6119 DAG.getConstant(VTBits, dl, MVT::i32));
6120 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, RevShAmt);
6121 SDValue LoSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6122 SDValue LoBigShift = DAG.getNode(Opc, dl, VT, ShOpHi, ExtraShAmt);
6123 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6124 ISD::SETGE, ARMcc, DAG, dl);
6125 SDValue Lo =
6126 DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift, LoBigShift, ARMcc, CmpLo);
6127
6128 SDValue HiSmallShift = DAG.getNode(Opc, dl, VT, ShOpHi, ShAmt);
6129 SDValue HiBigShift = Opc == ISD::SRA
6130 ? DAG.getNode(Opc, dl, VT, ShOpHi,
6131 DAG.getConstant(VTBits - 1, dl, VT))
6132 : DAG.getConstant(0, dl, VT);
6133 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6134 ISD::SETGE, ARMcc, DAG, dl);
6135 SDValue Hi =
6136 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6137
6138 SDValue Ops[2] = { Lo, Hi };
6139 return DAG.getMergeValues(Ops, dl);
6140}
6141
6142/// LowerShiftLeftParts - Lower SHL_PARTS, which returns two
6143/// i32 values and take a 2 x i32 value to shift plus a shift amount.
6144SDValue ARMTargetLowering::LowerShiftLeftParts(SDValue Op,
6145 SelectionDAG &DAG) const {
6146 assert(Op.getNumOperands() == 3 && "Not a double-shift!");
6147 EVT VT = Op.getValueType();
6148 unsigned VTBits = VT.getSizeInBits();
6149 SDLoc dl(Op);
6150 SDValue ShOpLo = Op.getOperand(0);
6151 SDValue ShOpHi = Op.getOperand(1);
6152 SDValue ShAmt = Op.getOperand(2);
6153 SDValue ARMcc;
6154
6155 assert(Op.getOpcode() == ISD::SHL_PARTS);
6156 SDValue RevShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6157 DAG.getConstant(VTBits, dl, MVT::i32), ShAmt);
6158 SDValue Tmp1 = DAG.getNode(ISD::SRL, dl, VT, ShOpLo, RevShAmt);
6159 SDValue Tmp2 = DAG.getNode(ISD::SHL, dl, VT, ShOpHi, ShAmt);
6160 SDValue HiSmallShift = DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
6161
6162 SDValue ExtraShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32, ShAmt,
6163 DAG.getConstant(VTBits, dl, MVT::i32));
6164 SDValue HiBigShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ExtraShAmt);
6165 SDValue CmpHi = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6166 ISD::SETGE, ARMcc, DAG, dl);
6167 SDValue Hi =
6168 DAG.getNode(ARMISD::CMOV, dl, VT, HiSmallShift, HiBigShift, ARMcc, CmpHi);
6169
6170 SDValue CmpLo = getARMCmp(ExtraShAmt, DAG.getConstant(0, dl, MVT::i32),
6171 ISD::SETGE, ARMcc, DAG, dl);
6172 SDValue LoSmallShift = DAG.getNode(ISD::SHL, dl, VT, ShOpLo, ShAmt);
6173 SDValue Lo = DAG.getNode(ARMISD::CMOV, dl, VT, LoSmallShift,
6174 DAG.getConstant(0, dl, VT), ARMcc, CmpLo);
6175
6176 SDValue Ops[2] = { Lo, Hi };
6177 return DAG.getMergeValues(Ops, dl);
6178}
6179
6180SDValue ARMTargetLowering::LowerGET_ROUNDING(SDValue Op,
6181 SelectionDAG &DAG) const {
6182 // The rounding mode is in bits 23:22 of the FPSCR.
6183 // The ARM rounding mode value to FLT_ROUNDS mapping is 0->1, 1->2, 2->3, 3->0
6184 // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
6185 // so that the shift + and get folded into a bitfield extract.
6186 SDLoc dl(Op);
6187 SDValue Chain = Op.getOperand(0);
6188 SDValue Ops[] = {Chain,
6189 DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
6190
6191 SDValue FPSCR =
6192 DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
6193 Chain = FPSCR.getValue(1);
6194 SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
6195 DAG.getConstant(1U << 22, dl, MVT::i32));
6196 SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
6197 DAG.getConstant(22, dl, MVT::i32));
6198 SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
6199 DAG.getConstant(3, dl, MVT::i32));
6200 return DAG.getMergeValues({And, Chain}, dl);
6201}
6202
6203SDValue ARMTargetLowering::LowerSET_ROUNDING(SDValue Op,
6204 SelectionDAG &DAG) const {
6205 SDLoc DL(Op);
6206 SDValue Chain = Op->getOperand(0);
6207 SDValue RMValue = Op->getOperand(1);
6208
6209 // The rounding mode is in bits 23:22 of the FPSCR.
6210 // The llvm.set.rounding argument value to ARM rounding mode value mapping
6211 // is 0->3, 1->0, 2->1, 3->2. The formula we use to implement this is
6212 // ((arg - 1) & 3) << 22).
6213 //
6214 // It is expected that the argument of llvm.set.rounding is within the
6215 // segment [0, 3], so NearestTiesToAway (4) is not handled here. It is
6216 // responsibility of the code generated llvm.set.rounding to ensure this
6217 // condition.
6218
6219 // Calculate new value of FPSCR[23:22].
6220 RMValue = DAG.getNode(ISD::SUB, DL, MVT::i32, RMValue,
6221 DAG.getConstant(1, DL, MVT::i32));
6222 RMValue = DAG.getNode(ISD::AND, DL, MVT::i32, RMValue,
6223 DAG.getConstant(0x3, DL, MVT::i32));
6224 RMValue = DAG.getNode(ISD::SHL, DL, MVT::i32, RMValue,
6225 DAG.getConstant(ARM::RoundingBitsPos, DL, MVT::i32));
6226
6227 // Get current value of FPSCR.
6228 SDValue Ops[] = {Chain,
6229 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6230 SDValue FPSCR =
6231 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6232 Chain = FPSCR.getValue(1);
6233 FPSCR = FPSCR.getValue(0);
6234
6235 // Put new rounding mode into FPSCR[23:22].
6236 const unsigned RMMask = ~(ARM::Rounding::rmMask << ARM::RoundingBitsPos);
6237 FPSCR = DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6238 DAG.getConstant(RMMask, DL, MVT::i32));
6239 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCR, RMValue);
6240 SDValue Ops2[] = {
6241 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6242 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6243}
6244
6245SDValue ARMTargetLowering::LowerSET_FPMODE(SDValue Op,
6246 SelectionDAG &DAG) const {
6247 SDLoc DL(Op);
6248 SDValue Chain = Op->getOperand(0);
6249 SDValue Mode = Op->getOperand(1);
6250
6251 // Generate nodes to build:
6252 // FPSCR = (FPSCR & FPStatusBits) | (Mode & ~FPStatusBits)
6253 SDValue Ops[] = {Chain,
6254 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6255 SDValue FPSCR =
6256 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6257 Chain = FPSCR.getValue(1);
6258 FPSCR = FPSCR.getValue(0);
6259
6260 SDValue FPSCRMasked =
6261 DAG.getNode(ISD::AND, DL, MVT::i32, FPSCR,
6262 DAG.getConstant(ARM::FPStatusBits, DL, MVT::i32));
6263 SDValue InputMasked =
6264 DAG.getNode(ISD::AND, DL, MVT::i32, Mode,
6265 DAG.getConstant(~ARM::FPStatusBits, DL, MVT::i32));
6266 FPSCR = DAG.getNode(ISD::OR, DL, MVT::i32, FPSCRMasked, InputMasked);
6267
6268 SDValue Ops2[] = {
6269 Chain, DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32), FPSCR};
6270 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6271}
6272
6273SDValue ARMTargetLowering::LowerRESET_FPMODE(SDValue Op,
6274 SelectionDAG &DAG) const {
6275 SDLoc DL(Op);
6276 SDValue Chain = Op->getOperand(0);
6277
6278 // To get the default FP mode all control bits are cleared:
6279 // FPSCR = FPSCR & (FPStatusBits | FPReservedBits)
6280 SDValue Ops[] = {Chain,
6281 DAG.getConstant(Intrinsic::arm_get_fpscr, DL, MVT::i32)};
6282 SDValue FPSCR =
6283 DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, {MVT::i32, MVT::Other}, Ops);
6284 Chain = FPSCR.getValue(1);
6285 FPSCR = FPSCR.getValue(0);
6286
6287 SDValue FPSCRMasked = DAG.getNode(
6288 ISD::AND, DL, MVT::i32, FPSCR,
6290 SDValue Ops2[] = {Chain,
6291 DAG.getConstant(Intrinsic::arm_set_fpscr, DL, MVT::i32),
6292 FPSCRMasked};
6293 return DAG.getNode(ISD::INTRINSIC_VOID, DL, MVT::Other, Ops2);
6294}
6295
6297 const ARMSubtarget *ST) {
6298 SDLoc dl(N);
6299 EVT VT = N->getValueType(0);
6300 if (VT.isVector() && ST->hasNEON()) {
6301
6302 // Compute the least significant set bit: LSB = X & -X
6303 SDValue X = N->getOperand(0);
6304 SDValue NX = DAG.getNode(ISD::SUB, dl, VT, getZeroVector(VT, DAG, dl), X);
6305 SDValue LSB = DAG.getNode(ISD::AND, dl, VT, X, NX);
6306
6307 EVT ElemTy = VT.getVectorElementType();
6308
6309 if (ElemTy == MVT::i8) {
6310 // Compute with: cttz(x) = ctpop(lsb - 1)
6311 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6312 DAG.getTargetConstant(1, dl, ElemTy));
6313 SDValue Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6314 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6315 }
6316
6317 if ((ElemTy == MVT::i16 || ElemTy == MVT::i32) &&
6318 (N->getOpcode() == ISD::CTTZ_ZERO_POISON)) {
6319 // Compute with: cttz(x) = (width - 1) - ctlz(lsb), if x != 0
6320 unsigned NumBits = ElemTy.getSizeInBits();
6321 SDValue WidthMinus1 =
6322 DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6323 DAG.getTargetConstant(NumBits - 1, dl, ElemTy));
6324 SDValue CTLZ = DAG.getNode(ISD::CTLZ, dl, VT, LSB);
6325 return DAG.getNode(ISD::SUB, dl, VT, WidthMinus1, CTLZ);
6326 }
6327
6328 // Compute with: cttz(x) = ctpop(lsb - 1)
6329
6330 // Compute LSB - 1.
6331 SDValue Bits;
6332 if (ElemTy == MVT::i64) {
6333 // Load constant 0xffff'ffff'ffff'ffff to register.
6334 SDValue FF = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6335 DAG.getTargetConstant(0x1eff, dl, MVT::i32));
6336 Bits = DAG.getNode(ISD::ADD, dl, VT, LSB, FF);
6337 } else {
6338 SDValue One = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
6339 DAG.getTargetConstant(1, dl, ElemTy));
6340 Bits = DAG.getNode(ISD::SUB, dl, VT, LSB, One);
6341 }
6342 return DAG.getNode(ISD::CTPOP, dl, VT, Bits);
6343 }
6344
6345 if (!ST->hasV6T2Ops())
6346 return SDValue();
6347
6348 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, VT, N->getOperand(0));
6349 return DAG.getNode(ISD::CTLZ, dl, VT, rbit);
6350}
6351
6353 const ARMSubtarget *ST) {
6354 EVT VT = N->getValueType(0);
6355 SDLoc DL(N);
6356
6357 assert(ST->hasNEON() && "Custom ctpop lowering requires NEON.");
6358 assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
6359 VT == MVT::v4i32 || VT == MVT::v4i16 || VT == MVT::v8i16) &&
6360 "Unexpected type for custom ctpop lowering");
6361
6362 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
6363 EVT VT8Bit = VT.is64BitVector() ? MVT::v8i8 : MVT::v16i8;
6364 SDValue Res = DAG.getBitcast(VT8Bit, N->getOperand(0));
6365 Res = DAG.getNode(ISD::CTPOP, DL, VT8Bit, Res);
6366
6367 // Widen v8i8/v16i8 CTPOP result to VT by repeatedly widening pairwise adds.
6368 unsigned EltSize = 8;
6369 unsigned NumElts = VT.is64BitVector() ? 8 : 16;
6370 while (EltSize != VT.getScalarSizeInBits()) {
6372 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddlu, DL,
6373 TLI.getPointerTy(DAG.getDataLayout())));
6374 Ops.push_back(Res);
6375
6376 EltSize *= 2;
6377 NumElts /= 2;
6378 MVT WidenVT = MVT::getVectorVT(MVT::getIntegerVT(EltSize), NumElts);
6379 Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, WidenVT, Ops);
6380 }
6381
6382 return Res;
6383}
6384
6385/// Getvshiftimm - Check if this is a valid build_vector for the immediate
6386/// operand of a vector shift operation, where all the elements of the
6387/// build_vector must have the same constant integer value.
6388static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt) {
6389 // Ignore bit_converts.
6390 while (Op.getOpcode() == ISD::BITCAST)
6391 Op = Op.getOperand(0);
6393 APInt SplatBits, SplatUndef;
6394 unsigned SplatBitSize;
6395 bool HasAnyUndefs;
6396 if (!BVN ||
6397 !BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs,
6398 ElementBits) ||
6399 SplatBitSize > ElementBits)
6400 return false;
6401 Cnt = SplatBits.getSExtValue();
6402 return true;
6403}
6404
6405/// isVShiftLImm - Check if this is a valid build_vector for the immediate
6406/// operand of a vector shift left operation. That value must be in the range:
6407/// 0 <= Value < ElementBits for a left shift; or
6408/// 0 <= Value <= ElementBits for a long left shift.
6409static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt) {
6410 assert(VT.isVector() && "vector shift count is not a vector type");
6411 int64_t ElementBits = VT.getScalarSizeInBits();
6412 if (!getVShiftImm(Op, ElementBits, Cnt))
6413 return false;
6414 return (Cnt >= 0 && (isLong ? Cnt - 1 : Cnt) < ElementBits);
6415}
6416
6417/// isVShiftRImm - Check if this is a valid build_vector for the immediate
6418/// operand of a vector shift right operation. For a shift opcode, the value
6419/// is positive, but for an intrinsic the value count must be negative. The
6420/// absolute value must be in the range:
6421/// 1 <= |Value| <= ElementBits for a right shift; or
6422/// 1 <= |Value| <= ElementBits/2 for a narrow right shift.
6423static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, bool isIntrinsic,
6424 int64_t &Cnt) {
6425 assert(VT.isVector() && "vector shift count is not a vector type");
6426 int64_t ElementBits = VT.getScalarSizeInBits();
6427 if (!getVShiftImm(Op, ElementBits, Cnt))
6428 return false;
6429 if (!isIntrinsic)
6430 return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
6431 if (Cnt >= -(isNarrow ? ElementBits / 2 : ElementBits) && Cnt <= -1) {
6432 Cnt = -Cnt;
6433 return true;
6434 }
6435 return false;
6436}
6437
6439 const ARMSubtarget *ST) {
6440 EVT VT = N->getValueType(0);
6441 SDLoc dl(N);
6442 int64_t Cnt;
6443
6444 if (!VT.isVector())
6445 return SDValue();
6446
6447 // We essentially have two forms here. Shift by an immediate and shift by a
6448 // vector register (there are also shift by a gpr, but that is just handled
6449 // with a tablegen pattern). We cannot easily match shift by an immediate in
6450 // tablegen so we do that here and generate a VSHLIMM/VSHRsIMM/VSHRuIMM.
6451 // For shifting by a vector, we don't have VSHR, only VSHL (which can be
6452 // signed or unsigned, and a negative shift indicates a shift right).
6453 if (N->getOpcode() == ISD::SHL) {
6454 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt))
6455 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
6456 DAG.getConstant(Cnt, dl, MVT::i32));
6457 return DAG.getNode(ARMISD::VSHLu, dl, VT, N->getOperand(0),
6458 N->getOperand(1));
6459 }
6460
6461 assert((N->getOpcode() == ISD::SRA || N->getOpcode() == ISD::SRL) &&
6462 "unexpected vector shift opcode");
6463
6464 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
6465 unsigned VShiftOpc =
6466 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
6467 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
6468 DAG.getConstant(Cnt, dl, MVT::i32));
6469 }
6470
6471 // Other right shifts we don't have operations for (we use a shift left by a
6472 // negative number).
6473 EVT ShiftVT = N->getOperand(1).getValueType();
6474 SDValue NegatedCount = DAG.getNode(
6475 ISD::SUB, dl, ShiftVT, getZeroVector(ShiftVT, DAG, dl), N->getOperand(1));
6476 unsigned VShiftOpc =
6477 (N->getOpcode() == ISD::SRA ? ARMISD::VSHLs : ARMISD::VSHLu);
6478 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0), NegatedCount);
6479}
6480
6482 const ARMSubtarget *ST) {
6483 EVT VT = N->getValueType(0);
6484 SDLoc dl(N);
6485
6486 // We can get here for a node like i32 = ISD::SHL i32, i64
6487 if (VT != MVT::i64)
6488 return SDValue();
6489
6490 assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA ||
6491 N->getOpcode() == ISD::SHL) &&
6492 "Unknown shift to lower!");
6493
6494 unsigned ShOpc = N->getOpcode();
6495 if (ST->hasMVEIntegerOps()) {
6496 SDValue ShAmt = N->getOperand(1);
6497 unsigned ShPartsOpc = ARMISD::LSLL;
6499
6500 // If the shift amount is greater than 32 or has a greater bitwidth than 64
6501 // then do the default optimisation
6502 if ((!Con && ShAmt->getValueType(0).getSizeInBits() > 64) ||
6503 (Con && (Con->getAPIntValue() == 0 || Con->getAPIntValue().uge(32))))
6504 return SDValue();
6505
6506 // Extract the lower 32 bits of the shift amount if it's not an i32
6507 if (ShAmt->getValueType(0) != MVT::i32)
6508 ShAmt = DAG.getZExtOrTrunc(ShAmt, dl, MVT::i32);
6509
6510 if (ShOpc == ISD::SRL) {
6511 if (!Con)
6512 // There is no t2LSRLr instruction so negate and perform an lsll if the
6513 // shift amount is in a register, emulating a right shift.
6514 ShAmt = DAG.getNode(ISD::SUB, dl, MVT::i32,
6515 DAG.getConstant(0, dl, MVT::i32), ShAmt);
6516 else
6517 // Else generate an lsrl on the immediate shift amount
6518 ShPartsOpc = ARMISD::LSRL;
6519 } else if (ShOpc == ISD::SRA)
6520 ShPartsOpc = ARMISD::ASRL;
6521
6522 // Split Lower/Upper 32 bits of the destination/source
6523 SDValue Lo, Hi;
6524 std::tie(Lo, Hi) =
6525 DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6526 // Generate the shift operation as computed above
6527 Lo = DAG.getNode(ShPartsOpc, dl, DAG.getVTList(MVT::i32, MVT::i32), Lo, Hi,
6528 ShAmt);
6529 // The upper 32 bits come from the second return value of lsll
6530 Hi = SDValue(Lo.getNode(), 1);
6531 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6532 }
6533
6534 // We only lower SRA, SRL of 1 here, all others use generic lowering.
6535 if (!isOneConstant(N->getOperand(1)) || N->getOpcode() == ISD::SHL)
6536 return SDValue();
6537
6538 // If we are in thumb mode, we don't have RRX.
6539 if (ST->isThumb1Only())
6540 return SDValue();
6541
6542 // Okay, we have a 64-bit SRA or SRL of 1. Lower this to an RRX expr.
6543 SDValue Lo, Hi;
6544 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(0), dl, MVT::i32, MVT::i32);
6545
6546 // First, build a LSRS1/ASRS1 op, which shifts the top part by one and
6547 // captures the shifted out bit into a carry flag.
6548 unsigned Opc = N->getOpcode() == ISD::SRL ? ARMISD::LSRS1 : ARMISD::ASRS1;
6549 Hi = DAG.getNode(Opc, dl, DAG.getVTList(MVT::i32, FlagsVT), Hi);
6550
6551 // The low part is an ARMISD::RRX operand, which shifts the carry in.
6552 Lo = DAG.getNode(ARMISD::RRX, dl, MVT::i32, Lo, Hi.getValue(1));
6553
6554 // Merge the pieces into a single i64 value.
6555 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
6556}
6557
// Custom lowering of a vector SETCC node. Operands 0 and 1 of Op are the
// values being compared and operand 2 is the condition code. The comparison
// is turned into ARMISD::VCMP / ARMISD::VCMPZ / ARMISD::VTST nodes; "Swap"
// exchanges the two operands and "Invert" wraps the result in a NOT for
// conditions that are expressed through their opposite. An empty SDValue is
// returned on the paths that decline to lower the node here.
// NOTE(review): the embedded listing numbers jump from 6557 to 6559, so the
// first line of the signature (function name and leading parameters) appears
// to be missing from this extract -- restore it from the upstream file.
6559 const ARMSubtarget *ST) {
6560 bool Invert = false;
6561 bool Swap = false;
6562 unsigned Opc = ARMCC::AL;
6563
6564 SDValue Op0 = Op.getOperand(0);
6565 SDValue Op1 = Op.getOperand(1);
6566 SDValue CC = Op.getOperand(2);
6567 EVT VT = Op.getValueType();
6568 ISD::CondCode SetCCOpcode = cast<CondCodeSDNode>(CC)->get();
6569 SDLoc dl(Op);
6570
6571 EVT CmpVT;
// NOTE(review): listing line 6573 (the statement executed when NEON is
// available) is missing here. CmpVT is read further down, so that statement
// presumably assigns it -- confirm against the upstream file.
6572 if (ST->hasNEON())
6574 else {
6575 assert(ST->hasMVEIntegerOps() &&
6576 "No hardware support for integer vector comparison!");
6577
6578 if (Op.getValueType().getVectorElementType() != MVT::i1)
6579 return SDValue();
6580
6581 // Make sure we expand floating point setcc to scalar if we do not have
6582 // mve.fp, so that we can handle them from there.
6583 if (Op0.getValueType().isFloatingPoint() && !ST->hasMVEFloatOps())
6584 return SDValue();
6585
6586 CmpVT = VT;
6587 }
6588
6589 if (Op0.getValueType().getVectorElementType() == MVT::i64 &&
6590 (SetCCOpcode == ISD::SETEQ || SetCCOpcode == ISD::SETNE)) {
6591 // Special-case integer 64-bit equality comparisons. They aren't legal,
6592 // but they can be lowered with a few vector instructions.
// The operands are reinterpreted as vectors of i32 with twice as many
// elements, compared for equality, and each result is ANDed with its
// VREV64 counterpart before being cast back to CmpVT.
6593 unsigned CmpElements = CmpVT.getVectorNumElements() * 2;
6594 EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, CmpElements);
6595 SDValue CastOp0 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op0);
6596 SDValue CastOp1 = DAG.getNode(ISD::BITCAST, dl, SplitVT, Op1);
6597 SDValue Cmp = DAG.getNode(ISD::SETCC, dl, SplitVT, CastOp0, CastOp1,
6598 DAG.getCondCode(ISD::SETEQ));
6599 SDValue Reversed = DAG.getNode(ARMISD::VREV64, dl, SplitVT, Cmp);
6600 SDValue Merged = DAG.getNode(ISD::AND, dl, SplitVT, Cmp, Reversed);
6601 Merged = DAG.getNode(ISD::BITCAST, dl, CmpVT, Merged);
6602 if (SetCCOpcode == ISD::SETNE)
6603 Merged = DAG.getNOT(dl, Merged, CmpVT);
6604 Merged = DAG.getSExtOrTrunc(Merged, dl, VT);
6605 return Merged;
6606 }
6607
6608 if (CmpVT.getVectorElementType() == MVT::i64)
6609 // 64-bit comparisons are not legal in general.
6610 return SDValue();
6611
// Map the ISD condition code onto an ARM condition (Opc) plus the Swap and
// Invert adjustments. Many cases deliberately fall through into the next one.
6612 if (Op1.getValueType().isFloatingPoint()) {
6613 switch (SetCCOpcode) {
6614 default: llvm_unreachable("Illegal FP comparison");
6615 case ISD::SETUNE:
6616 case ISD::SETNE:
6617 if (ST->hasMVEFloatOps()) {
6618 Opc = ARMCC::NE; break;
6619 } else {
6620 Invert = true; [[fallthrough]];
6621 }
6622 case ISD::SETOEQ:
6623 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6624 case ISD::SETOLT:
6625 case ISD::SETLT: Swap = true; [[fallthrough]];
6626 case ISD::SETOGT:
6627 case ISD::SETGT: Opc = ARMCC::GT; break;
6628 case ISD::SETOLE:
6629 case ISD::SETLE: Swap = true; [[fallthrough]];
6630 case ISD::SETOGE:
6631 case ISD::SETGE: Opc = ARMCC::GE; break;
6632 case ISD::SETUGE: Swap = true; [[fallthrough]];
6633 case ISD::SETULE: Invert = true; Opc = ARMCC::GT; break;
6634 case ISD::SETUGT: Swap = true; [[fallthrough]];
6635 case ISD::SETULT: Invert = true; Opc = ARMCC::GE; break;
6636 case ISD::SETUEQ: Invert = true; [[fallthrough]];
6637 case ISD::SETONE: {
6638 // Expand this to (OLT | OGT).
6639 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6640 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6641 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6642 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6643 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6644 if (Invert)
6645 Result = DAG.getNOT(dl, Result, VT);
6646 return Result;
6647 }
6648 case ISD::SETUO: Invert = true; [[fallthrough]];
6649 case ISD::SETO: {
6650 // Expand this to (OLT | OGE).
6651 SDValue TmpOp0 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op1, Op0,
6652 DAG.getConstant(ARMCC::GT, dl, MVT::i32));
6653 SDValue TmpOp1 = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6654 DAG.getConstant(ARMCC::GE, dl, MVT::i32));
6655 SDValue Result = DAG.getNode(ISD::OR, dl, CmpVT, TmpOp0, TmpOp1);
6656 if (Invert)
6657 Result = DAG.getNOT(dl, Result, VT);
6658 return Result;
6659 }
6660 }
6661 } else {
6662 // Integer comparisons.
6663 switch (SetCCOpcode) {
6664 default: llvm_unreachable("Illegal integer comparison");
6665 case ISD::SETNE:
6666 if (ST->hasMVEIntegerOps()) {
6667 Opc = ARMCC::NE; break;
6668 } else {
6669 Invert = true; [[fallthrough]];
6670 }
6671 case ISD::SETEQ: Opc = ARMCC::EQ; break;
6672 case ISD::SETLT: Swap = true; [[fallthrough]];
6673 case ISD::SETGT: Opc = ARMCC::GT; break;
6674 case ISD::SETLE: Swap = true; [[fallthrough]];
6675 case ISD::SETGE: Opc = ARMCC::GE; break;
6676 case ISD::SETULT: Swap = true; [[fallthrough]];
6677 case ISD::SETUGT: Opc = ARMCC::HI; break;
6678 case ISD::SETULE: Swap = true; [[fallthrough]];
6679 case ISD::SETUGE: Opc = ARMCC::HS; break;
6680 }
6681
6682 // Detect VTST (Vector Test Bits) = icmp ne (and (op0, op1), zero).
6683 if (ST->hasNEON() && Opc == ARMCC::EQ) {
6684 SDValue AndOp;
// NOTE(review): listing line 6685 (the `if` guarding the next assignment) is
// missing here; presumably it tests Op1 for an all-zeros build vector,
// mirroring the Op0 test below -- confirm against the upstream file.
6686 AndOp = Op0;
6687 else if (ISD::isBuildVectorAllZeros(Op0.getNode()))
6688 AndOp = Op1;
6689
6690 // Ignore bitconvert.
6691 if (AndOp.getNode() && AndOp.getOpcode() == ISD::BITCAST)
6692 AndOp = AndOp.getOperand(0);
6693
6694 if (AndOp.getNode() && AndOp.getOpcode() == ISD::AND) {
6695 Op0 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(0));
6696 Op1 = DAG.getNode(ISD::BITCAST, dl, CmpVT, AndOp.getOperand(1));
6697 SDValue Result = DAG.getNode(ARMISD::VTST, dl, CmpVT, Op0, Op1);
// The VTST result is negated unless Invert was already requested for the
// comparison.
6698 if (!Invert)
6699 Result = DAG.getNOT(dl, Result, VT);
6700 return Result;
6701 }
6702 }
6703 }
6704
6705 if (Swap)
6706 std::swap(Op0, Op1);
6707
6708 // If one of the operands is a constant vector zero, attempt to fold the
6709 // comparison to a specialized compare-against-zero form.
// NOTE(review): listing line 6710 (the opening of this `if (` condition) is
// missing here. The body swaps the operands and turns GE into LE and GT into
// LT, so the missing test presumably detects a zero vector in Op0 -- confirm.
6711 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::EQ ||
6712 Opc == ARMCC::NE)) {
6713 if (Opc == ARMCC::GE)
6714 Opc = ARMCC::LE;
6715 else if (Opc == ARMCC::GT)
6716 Opc = ARMCC::LT;
6717 std::swap(Op0, Op1);
6718 }
6719
6720 SDValue Result;
// NOTE(review): listing line 6721 (the opening of this `if (` condition) is
// missing here; the VCMPZ node below takes only Op0, so the missing test
// presumably detects a zero vector in Op1 -- confirm.
6722 (Opc == ARMCC::GE || Opc == ARMCC::GT || Opc == ARMCC::LE ||
6723 Opc == ARMCC::LT || Opc == ARMCC::NE || Opc == ARMCC::EQ))
6724 Result = DAG.getNode(ARMISD::VCMPZ, dl, CmpVT, Op0,
6725 DAG.getConstant(Opc, dl, MVT::i32));
6726 else
6727 Result = DAG.getNode(ARMISD::VCMP, dl, CmpVT, Op0, Op1,
6728 DAG.getConstant(Opc, dl, MVT::i32));
6729
// The comparison is produced in CmpVT and then sign-extended or truncated to
// the requested result type before the optional inversion.
6730 Result = DAG.getSExtOrTrunc(Result, dl, VT);
6731
6732 if (Invert)
6733 Result = DAG.getNOT(dl, Result, VT);
6734
6735 return Result;
6736}
6737
// NOTE(review): the embedded listing numbers skip 6738, so the signature line
// of this function is missing from this extract. Judging by the assert
// message it is presumably the SETCCCARRY lowering -- restore the signature
// from the upstream file.
// The body reads the two compared integers (operands 0 and 1), the incoming
// carry (operand 2) and the condition code (operand 3), emits an
// ARMISD::SUBE and returns an ARMISD::CMOV over the constants 0 (FVal) and
// 1 (TVal) that is driven by the second result of that SUBE node.
6739 SDValue LHS = Op.getOperand(0);
6740 SDValue RHS = Op.getOperand(1);
6741
6742 assert(LHS.getSimpleValueType().isInteger() && "SETCCCARRY is integer only.");
6743
6744 SDValue Carry = Op.getOperand(2);
6745 SDValue Cond = Op.getOperand(3);
6746 SDLoc DL(Op);
6747
6748 // ARMISD::SUBE expects a carry not a borrow like ISD::USUBO_CARRY so we
6749 // have to invert the carry first.
6750 SDValue InvCarry = valueToCarryFlag(Carry, DAG, true);
6751
// SUBE yields two results: the difference (typed like LHS) and an i32 value
// that is passed on to the CMOV below.
6752 SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
6753 SDValue Cmp = DAG.getNode(ARMISD::SUBE, DL, VTs, LHS, RHS, InvCarry);
6754
6755 SDValue FVal = DAG.getConstant(0, DL, MVT::i32);
6756 SDValue TVal = DAG.getConstant(1, DL, MVT::i32);
// The ISD condition code is translated to its ARM encoding via IntCCToARMCC.
6757 SDValue ARMcc = DAG.getConstant(
6758 IntCCToARMCC(cast<CondCodeSDNode>(Cond)->get()), DL, MVT::i32);
6759 return DAG.getNode(ARMISD::CMOV, DL, Op.getValueType(), FVal, TVal, ARMcc,
6760 Cmp.getValue(1));
6761}
6762
6763/// isVMOVModifiedImm - Check if the specified splat value corresponds to a
6764/// valid vector constant for a NEON or MVE instruction with a "modified
6765/// immediate" operand (e.g., VMOV). If so, return the encoded value.
6766static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
6767 unsigned SplatBitSize, SelectionDAG &DAG,
6768 const SDLoc &dl, EVT &VT, EVT VectorVT,
6769 VMOVModImmType type) {
6770 unsigned OpCmode, Imm;
6771 bool is128Bits = VectorVT.is128BitVector();
6772
6773 // SplatBitSize is set to the smallest size that splats the vector, so a
6774 // zero vector will always have SplatBitSize == 8. However, NEON modified
6775 // immediate instructions others than VMOV do not support the 8-bit encoding
6776 // of a zero vector, and the default encoding of zero is supposed to be the
6777 // 32-bit version.
6778 if (SplatBits == 0)
6779 SplatBitSize = 32;
6780
6781 switch (SplatBitSize) {
6782 case 8:
6783 if (type != VMOVModImm)
6784 return SDValue();
6785 // Any 1-byte value is OK. Op=0, Cmode=1110.
6786 assert((SplatBits & ~0xff) == 0 && "one byte splat value is too big");
6787 OpCmode = 0xe;
6788 Imm = SplatBits;
6789 VT = is128Bits ? MVT::v16i8 : MVT::v8i8;
6790 break;
6791
6792 case 16:
6793 // NEON's 16-bit VMOV supports splat values where only one byte is nonzero.
6794 VT = is128Bits ? MVT::v8i16 : MVT::v4i16;
6795 if ((SplatBits & ~0xff) == 0) {
6796 // Value = 0x00nn: Op=x, Cmode=100x.
6797 OpCmode = 0x8;
6798 Imm = SplatBits;
6799 break;
6800 }
6801 if ((SplatBits & ~0xff00) == 0) {
6802 // Value = 0xnn00: Op=x, Cmode=101x.
6803 OpCmode = 0xa;
6804 Imm = SplatBits >> 8;
6805 break;
6806 }
6807 return SDValue();
6808
6809 case 32:
6810 // NEON's 32-bit VMOV supports splat values where:
6811 // * only one byte is nonzero, or
6812 // * the least significant byte is 0xff and the second byte is nonzero, or
6813 // * the least significant 2 bytes are 0xff and the third is nonzero.
6814 VT = is128Bits ? MVT::v4i32 : MVT::v2i32;
6815 if ((SplatBits & ~0xff) == 0) {
6816 // Value = 0x000000nn: Op=x, Cmode=000x.
6817 OpCmode = 0;
6818 Imm = SplatBits;
6819 break;
6820 }
6821 if ((SplatBits & ~0xff00) == 0) {
6822 // Value = 0x0000nn00: Op=x, Cmode=001x.
6823 OpCmode = 0x2;
6824 Imm = SplatBits >> 8;
6825 break;
6826 }
6827 if ((SplatBits & ~0xff0000) == 0) {
6828 // Value = 0x00nn0000: Op=x, Cmode=010x.
6829 OpCmode = 0x4;
6830 Imm = SplatBits >> 16;
6831 break;
6832 }
6833 if ((SplatBits & ~0xff000000) == 0) {
6834 // Value = 0xnn000000: Op=x, Cmode=011x.
6835 OpCmode = 0x6;
6836 Imm = SplatBits >> 24;
6837 break;
6838 }
6839
6840 // cmode == 0b1100 and cmode == 0b1101 are not supported for VORR or VBIC
6841 if (type == OtherModImm) return SDValue();
6842
6843 if ((SplatBits & ~0xffff) == 0 &&
6844 ((SplatBits | SplatUndef) & 0xff) == 0xff) {
6845 // Value = 0x0000nnff: Op=x, Cmode=1100.
6846 OpCmode = 0xc;
6847 Imm = SplatBits >> 8;
6848 break;
6849 }
6850
6851 // cmode == 0b1101 is not supported for MVE VMVN
6852 if (type == MVEVMVNModImm)
6853 return SDValue();
6854
6855 if ((SplatBits & ~0xffffff) == 0 &&
6856 ((SplatBits | SplatUndef) & 0xffff) == 0xffff) {
6857 // Value = 0x00nnffff: Op=x, Cmode=1101.
6858 OpCmode = 0xd;
6859 Imm = SplatBits >> 16;
6860 break;
6861 }
6862
6863 // Note: there are a few 32-bit splat values (specifically: 00ffff00,
6864 // ff000000, ff0000ff, and ffff00ff) that are valid for VMOV.I64 but not
6865 // VMOV.I32. A (very) minor optimization would be to replicate the value
6866 // and fall through here to test for a valid 64-bit splat. But, then the
6867 // caller would also need to check and handle the change in size.
6868 return SDValue();
6869
6870 case 64: {
6871 if (type != VMOVModImm)
6872 return SDValue();
6873 // NEON has a 64-bit VMOV splat where each byte is either 0 or 0xff.
6874 uint64_t BitMask = 0xff;
6875 unsigned ImmMask = 1;
6876 Imm = 0;
6877 for (int ByteNum = 0; ByteNum < 8; ++ByteNum) {
6878 if (((SplatBits | SplatUndef) & BitMask) == BitMask) {
6879 Imm |= ImmMask;
6880 } else if ((SplatBits & BitMask) != 0) {
6881 return SDValue();
6882 }
6883 BitMask <<= 8;
6884 ImmMask <<= 1;
6885 }
6886
6887 // Op=1, Cmode=1110.
6888 OpCmode = 0x1e;
6889 VT = is128Bits ? MVT::v2i64 : MVT::v1i64;
6890 break;
6891 }
6892
6893 default:
6894 llvm_unreachable("unexpected size for isVMOVModifiedImm");
6895 }
6896
6897 unsigned EncodedVal = ARM_AM::createVMOVModImm(OpCmode, Imm);
6898 return DAG.getTargetConstant(EncodedVal, dl, MVT::i32);
6899}
6900
6901SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
6902 const ARMSubtarget *ST) const {
6903 EVT VT = Op.getValueType();
6904 bool IsDouble = (VT == MVT::f64);
6905 ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Op);
6906 const APFloat &FPVal = CFP->getValueAPF();
6907
6908 // Prevent floating-point constants from using literal loads
6909 // when execute-only is enabled.
6910 if (ST->genExecuteOnly()) {
6911 // We shouldn't trigger this for v6m execute-only
6912 assert((!ST->isThumb1Only() || ST->hasV8MBaselineOps()) &&
6913 "Unexpected architecture");
6914
6915 // If we can represent the constant as an immediate, don't lower it
6916 if (isFPImmLegal(FPVal, VT))
6917 return Op;
6918 // Otherwise, construct as integer, and move to float register
6919 APInt INTVal = FPVal.bitcastToAPInt();
6920 SDLoc DL(CFP);
6921 switch (VT.getSimpleVT().SimpleTy) {
6922 default:
6923 llvm_unreachable("Unknown floating point type!");
6924 break;
6925 case MVT::f64: {
6926 SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
6927 SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
6928 return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
6929 }
6930 case MVT::f32:
6931 return DAG.getNode(ARMISD::VMOVSR, DL, VT,
6932 DAG.getConstant(INTVal, DL, MVT::i32));
6933 }
6934 }
6935
6936 if (!ST->hasVFP3Base())
6937 return SDValue();
6938
6939 // Use the default (constant pool) lowering for double constants when we have
6940 // an SP-only FPU
6941 if (IsDouble && !Subtarget->hasFP64())
6942 return SDValue();
6943
6944 // Try splatting with a VMOV.f32...
6945 int ImmVal = IsDouble ? ARM_AM::getFP64Imm(FPVal) : ARM_AM::getFP32Imm(FPVal);
6946
6947 if (ImmVal != -1) {
6948 if (IsDouble || !ST->useNEONForSinglePrecisionFP()) {
6949 // We have code in place to select a valid ConstantFP already, no need to
6950 // do any mangling.
6951 return Op;
6952 }
6953
6954 // It's a float and we are trying to use NEON operations where
6955 // possible. Lower it to a splat followed by an extract.
6956 SDLoc DL(Op);
6957 SDValue NewVal = DAG.getTargetConstant(ImmVal, DL, MVT::i32);
6958 SDValue VecConstant = DAG.getNode(ARMISD::VMOVFPIMM, DL, MVT::v2f32,
6959 NewVal);
6960 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecConstant,
6961 DAG.getConstant(0, DL, MVT::i32));
6962 }
6963
6964 // The rest of our options are NEON only, make sure that's allowed before
6965 // proceeding..
6966 if (!ST->hasNEON() || (!IsDouble && !ST->useNEONForSinglePrecisionFP()))
6967 return SDValue();
6968
6969 EVT VMovVT;
6970 uint64_t iVal = FPVal.bitcastToAPInt().getZExtValue();
6971
6972 // It wouldn't really be worth bothering for doubles except for one very
6973 // important value, which does happen to match: 0.0. So make sure we don't do
6974 // anything stupid.
6975 if (IsDouble && (iVal & 0xffffffff) != (iVal >> 32))
6976 return SDValue();
6977
6978 // Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
6979 SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
6980 VMovVT, VT, VMOVModImm);
6981 if (NewVal != SDValue()) {
6982 SDLoc DL(Op);
6983 SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
6984 NewVal);
6985 if (IsDouble)
6986 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
6987
6988 // It's a float: cast and extract a vector element.
6989 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
6990 VecConstant);
6991 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
6992 DAG.getConstant(0, DL, MVT::i32));
6993 }
6994
6995 // Finally, try a VMVN.i32
6996 NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
6997 VT, VMVNModImm);
6998 if (NewVal != SDValue()) {
6999 SDLoc DL(Op);
7000 SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
7001
7002 if (IsDouble)
7003 return DAG.getNode(ISD::BITCAST, DL, MVT::f64, VecConstant);
7004
7005 // It's a float: cast and extract a vector element.
7006 SDValue VecFConstant = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32,
7007 VecConstant);
7008 return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VecFConstant,
7009 DAG.getConstant(0, DL, MVT::i32));
7010 }
7011
7012 return SDValue();
7013}
7014
7015// check if an VEXT instruction can handle the shuffle mask when the
7016// vector sources of the shuffle are the same.
7017static bool isSingletonVEXTMask(ArrayRef<int> M, EVT VT, unsigned &Imm) {
7018 unsigned NumElts = VT.getVectorNumElements();
7019
7020 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7021 if (M[0] < 0)
7022 return false;
7023
7024 Imm = M[0];
7025
7026 // If this is a VEXT shuffle, the immediate value is the index of the first
7027 // element. The other shuffle indices must be the successive elements after
7028 // the first one.
7029 unsigned ExpectedElt = Imm;
7030 for (unsigned i = 1; i < NumElts; ++i) {
7031 // Increment the expected index. If it wraps around, just follow it
7032 // back to index zero and keep going.
7033 ++ExpectedElt;
7034 if (ExpectedElt == NumElts)
7035 ExpectedElt = 0;
7036
7037 if (M[i] < 0) continue; // ignore UNDEF indices
7038 if (ExpectedElt != static_cast<unsigned>(M[i]))
7039 return false;
7040 }
7041
7042 return true;
7043}
7044
7045static bool isVEXTMask(ArrayRef<int> M, EVT VT,
7046 bool &ReverseVEXT, unsigned &Imm) {
7047 unsigned NumElts = VT.getVectorNumElements();
7048 ReverseVEXT = false;
7049
7050 // Assume that the first shuffle index is not UNDEF. Fail if it is.
7051 if (M[0] < 0)
7052 return false;
7053
7054 Imm = M[0];
7055
7056 // If this is a VEXT shuffle, the immediate value is the index of the first
7057 // element. The other shuffle indices must be the successive elements after
7058 // the first one.
7059 unsigned ExpectedElt = Imm;
7060 for (unsigned i = 1; i < NumElts; ++i) {
7061 // Increment the expected index. If it wraps around, it may still be
7062 // a VEXT but the source vectors must be swapped.
7063 ExpectedElt += 1;
7064 if (ExpectedElt == NumElts * 2) {
7065 ExpectedElt = 0;
7066 ReverseVEXT = true;
7067 }
7068
7069 if (M[i] < 0) continue; // ignore UNDEF indices
7070 if (ExpectedElt != static_cast<unsigned>(M[i]))
7071 return false;
7072 }
7073
7074 // Adjust the index value if the source operands will be swapped.
7075 if (ReverseVEXT)
7076 Imm -= NumElts;
7077
7078 return true;
7079}
7080
7081static bool isVTBLMask(ArrayRef<int> M, EVT VT) {
7082 // We can handle <8 x i8> vector shuffles. If the index in the mask is out of
7083 // range, then 0 is placed into the resulting vector. So pretty much any mask
7084 // of 8 elements can work here.
7085 return VT == MVT::v8i8 && M.size() == 8;
7086}
7087
7088static unsigned SelectPairHalf(unsigned Elements, ArrayRef<int> Mask,
7089 unsigned Index) {
7090 if (Mask.size() == Elements * 2)
7091 return Index / Elements;
7092 return Mask[Index] == 0 ? 0 : 1;
7093}
7094
7095// Checks whether the shuffle mask represents a vector transpose (VTRN) by
7096// checking that pairs of elements in the shuffle mask represent the same index
7097// in each vector, incrementing the expected index by 2 at each step.
7098// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 2, 6]
7099// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,c,g}
7100// v2={e,f,g,h}
7101// WhichResult gives the offset for each element in the mask based on which
7102// of the two results it belongs to.
7103//
7104// The transpose can be represented either as:
7105// result1 = shufflevector v1, v2, result1_shuffle_mask
7106// result2 = shufflevector v1, v2, result2_shuffle_mask
7107// where v1/v2 and the shuffle masks have the same number of elements
7108// (here WhichResult (see below) indicates which result is being checked)
7109//
7110// or as:
7111// results = shufflevector v1, v2, shuffle_mask
7112// where both results are returned in one vector and the shuffle mask has twice
7113// as many elements as v1/v2 (here WhichResult will always be 0 if true) here we
7114// want to check the low half and high half of the shuffle mask as if it were
7115// the other case
7116static bool isVTRNMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7117 unsigned EltSz = VT.getScalarSizeInBits();
7118 if (EltSz == 64)
7119 return false;
7120
7121 unsigned NumElts = VT.getVectorNumElements();
7122 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7123 return false;
7124
7125 // If the mask is twice as long as the input vector then we need to check the
7126 // upper and lower parts of the mask with a matching value for WhichResult
7127 // FIXME: A mask with only even values will be rejected in case the first
7128 // element is undefined, e.g. [-1, 4, 2, 6] will be rejected, because only
7129 // M[0] is used to determine WhichResult
7130 for (unsigned i = 0; i < M.size(); i += NumElts) {
7131 WhichResult = SelectPairHalf(NumElts, M, i);
7132 for (unsigned j = 0; j < NumElts; j += 2) {
7133 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7134 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + NumElts + WhichResult))
7135 return false;
7136 }
7137 }
7138
7139 if (M.size() == NumElts*2)
7140 WhichResult = 0;
7141
7142 return true;
7143}
7144
7145/// isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of
7146/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7147/// Mask is e.g., <0, 0, 2, 2> instead of <0, 4, 2, 6>.
7148static bool isVTRN_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7149 unsigned EltSz = VT.getScalarSizeInBits();
7150 if (EltSz == 64)
7151 return false;
7152
7153 unsigned NumElts = VT.getVectorNumElements();
7154 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7155 return false;
7156
7157 for (unsigned i = 0; i < M.size(); i += NumElts) {
7158 WhichResult = SelectPairHalf(NumElts, M, i);
7159 for (unsigned j = 0; j < NumElts; j += 2) {
7160 if ((M[i+j] >= 0 && (unsigned) M[i+j] != j + WhichResult) ||
7161 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != j + WhichResult))
7162 return false;
7163 }
7164 }
7165
7166 if (M.size() == NumElts*2)
7167 WhichResult = 0;
7168
7169 return true;
7170}
7171
7172// Checks whether the shuffle mask represents a vector unzip (VUZP) by checking
7173// that the mask elements are either all even and in steps of size 2 or all odd
7174// and in steps of size 2.
7175// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 2, 4, 6]
7176// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,c,e,g}
7177// v2={e,f,g,h}
7178// Requires similar checks to that of isVTRNMask with
7179// respect the how results are returned.
7180static bool isVUZPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7181 unsigned EltSz = VT.getScalarSizeInBits();
7182 if (EltSz == 64)
7183 return false;
7184
7185 unsigned NumElts = VT.getVectorNumElements();
7186 if (M.size() != NumElts && M.size() != NumElts*2)
7187 return false;
7188
7189 for (unsigned i = 0; i < M.size(); i += NumElts) {
7190 WhichResult = SelectPairHalf(NumElts, M, i);
7191 for (unsigned j = 0; j < NumElts; ++j) {
7192 if (M[i+j] >= 0 && (unsigned) M[i+j] != 2 * j + WhichResult)
7193 return false;
7194 }
7195 }
7196
7197 if (M.size() == NumElts*2)
7198 WhichResult = 0;
7199
7200 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7201 if (VT.is64BitVector() && EltSz == 32)
7202 return false;
7203
7204 return true;
7205}
7206
7207/// isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of
7208/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7209/// Mask is e.g., <0, 2, 0, 2> instead of <0, 2, 4, 6>,
7210static bool isVUZP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7211 unsigned EltSz = VT.getScalarSizeInBits();
7212 if (EltSz == 64)
7213 return false;
7214
7215 unsigned NumElts = VT.getVectorNumElements();
7216 if (M.size() != NumElts && M.size() != NumElts*2)
7217 return false;
7218
7219 unsigned Half = NumElts / 2;
7220 for (unsigned i = 0; i < M.size(); i += NumElts) {
7221 WhichResult = SelectPairHalf(NumElts, M, i);
7222 for (unsigned j = 0; j < NumElts; j += Half) {
7223 unsigned Idx = WhichResult;
7224 for (unsigned k = 0; k < Half; ++k) {
7225 int MIdx = M[i + j + k];
7226 if (MIdx >= 0 && (unsigned) MIdx != Idx)
7227 return false;
7228 Idx += 2;
7229 }
7230 }
7231 }
7232
7233 if (M.size() == NumElts*2)
7234 WhichResult = 0;
7235
7236 // VUZP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7237 if (VT.is64BitVector() && EltSz == 32)
7238 return false;
7239
7240 return true;
7241}
7242
7243// Checks whether the shuffle mask represents a vector zip (VZIP) by checking
7244// that pairs of elements of the shufflemask represent the same index in each
7245// vector incrementing sequentially through the vectors.
7246// e.g. For v1,v2 of type v4i32 a valid shuffle mask is: [0, 4, 1, 5]
7247// v1={a,b,c,d} => x=shufflevector v1, v2 shufflemask => x={a,e,b,f}
7248// v2={e,f,g,h}
7249// Requires similar checks to that of isVTRNMask with respect the how results
7250// are returned.
7251static bool isVZIPMask(ArrayRef<int> M, EVT VT, unsigned &WhichResult) {
7252 unsigned EltSz = VT.getScalarSizeInBits();
7253 if (EltSz == 64)
7254 return false;
7255
7256 unsigned NumElts = VT.getVectorNumElements();
7257 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7258 return false;
7259
7260 for (unsigned i = 0; i < M.size(); i += NumElts) {
7261 WhichResult = SelectPairHalf(NumElts, M, i);
7262 unsigned Idx = WhichResult * NumElts / 2;
7263 for (unsigned j = 0; j < NumElts; j += 2) {
7264 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7265 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx + NumElts))
7266 return false;
7267 Idx += 1;
7268 }
7269 }
7270
7271 if (M.size() == NumElts*2)
7272 WhichResult = 0;
7273
7274 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7275 if (VT.is64BitVector() && EltSz == 32)
7276 return false;
7277
7278 return true;
7279}
7280
7281/// isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of
7282/// "vector_shuffle v, v", i.e., "vector_shuffle v, undef".
7283/// Mask is e.g., <0, 0, 1, 1> instead of <0, 4, 1, 5>.
7284static bool isVZIP_v_undef_Mask(ArrayRef<int> M, EVT VT, unsigned &WhichResult){
7285 unsigned EltSz = VT.getScalarSizeInBits();
7286 if (EltSz == 64)
7287 return false;
7288
7289 unsigned NumElts = VT.getVectorNumElements();
7290 if ((M.size() != NumElts && M.size() != NumElts * 2) || NumElts % 2 != 0)
7291 return false;
7292
7293 for (unsigned i = 0; i < M.size(); i += NumElts) {
7294 WhichResult = SelectPairHalf(NumElts, M, i);
7295 unsigned Idx = WhichResult * NumElts / 2;
7296 for (unsigned j = 0; j < NumElts; j += 2) {
7297 if ((M[i+j] >= 0 && (unsigned) M[i+j] != Idx) ||
7298 (M[i+j+1] >= 0 && (unsigned) M[i+j+1] != Idx))
7299 return false;
7300 Idx += 1;
7301 }
7302 }
7303
7304 if (M.size() == NumElts*2)
7305 WhichResult = 0;
7306
7307 // VZIP.32 for 64-bit vectors is a pseudo-instruction alias for VTRN.32.
7308 if (VT.is64BitVector() && EltSz == 32)
7309 return false;
7310
7311 return true;
7312}
7313
7314/// Check if \p ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN),
7315/// and return the corresponding ARMISD opcode if it is, or 0 if it isn't.
7316static unsigned isNEONTwoResultShuffleMask(ArrayRef<int> ShuffleMask, EVT VT,
7317 unsigned &WhichResult,
7318 bool &isV_UNDEF) {
7319 isV_UNDEF = false;
7320 if (isVTRNMask(ShuffleMask, VT, WhichResult))
7321 return ARMISD::VTRN;
7322 if (isVUZPMask(ShuffleMask, VT, WhichResult))
7323 return ARMISD::VUZP;
7324 if (isVZIPMask(ShuffleMask, VT, WhichResult))
7325 return ARMISD::VZIP;
7326
7327 isV_UNDEF = true;
7328 if (isVTRN_v_undef_Mask(ShuffleMask, VT, WhichResult))
7329 return ARMISD::VTRN;
7330 if (isVUZP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7331 return ARMISD::VUZP;
7332 if (isVZIP_v_undef_Mask(ShuffleMask, VT, WhichResult))
7333 return ARMISD::VZIP;
7334
7335 return 0;
7336}
7337
7338/// \return true if this is a reverse operation on an vector.
7339static bool isReverseMask(ArrayRef<int> M, EVT VT) {
7340 unsigned NumElts = VT.getVectorNumElements();
7341 // Make sure the mask has the right size.
7342 if (NumElts != M.size())
7343 return false;
7344
7345 // Look for <15, ..., 3, -1, 1, 0>.
7346 for (unsigned i = 0; i != NumElts; ++i)
7347 if (M[i] >= 0 && M[i] != (int) (NumElts - 1 - i))
7348 return false;
7349
7350 return true;
7351}
7352
7353static bool isTruncMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7354 unsigned NumElts = VT.getVectorNumElements();
7355 // Make sure the mask has the right size.
7356 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7357 return false;
7358
7359 // Half-width truncation patterns (e.g. v4i32 -> v8i16):
7360 // !Top && SingleSource: <0, 2, 4, 6, 0, 2, 4, 6>
7361 // !Top && !SingleSource: <0, 2, 4, 6, 8, 10, 12, 14>
7362 // Top && SingleSource: <1, 3, 5, 7, 1, 3, 5, 7>
7363 // Top && !SingleSource: <1, 3, 5, 7, 9, 11, 13, 15>
7364 int Ofs = Top ? 1 : 0;
7365 int Upper = SingleSource ? 0 : NumElts;
7366 for (int i = 0, e = NumElts / 2; i != e; ++i) {
7367 if (M[i] >= 0 && M[i] != (i * 2) + Ofs)
7368 return false;
7369 if (M[i + e] >= 0 && M[i + e] != (i * 2) + Ofs + Upper)
7370 return false;
7371 }
7372 return true;
7373}
7374
7375static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top, bool SingleSource) {
7376 unsigned NumElts = VT.getVectorNumElements();
7377 // Make sure the mask has the right size.
7378 if (NumElts != M.size() || (VT != MVT::v8i16 && VT != MVT::v16i8))
7379 return false;
7380
7381 // If Top
7382 // Look for <0, N, 2, N+2, 4, N+4, ..>.
7383 // This inserts Input2 into Input1
7384 // else if not Top
7385 // Look for <0, N+1, 2, N+3, 4, N+5, ..>
7386 // This inserts Input1 into Input2
7387 unsigned Offset = Top ? 0 : 1;
7388 unsigned N = SingleSource ? 0 : NumElts;
7389 for (unsigned i = 0; i < NumElts; i += 2) {
7390 if (M[i] >= 0 && M[i] != (int)i)
7391 return false;
7392 if (M[i + 1] >= 0 && M[i + 1] != (int)(N + i + Offset))
7393 return false;
7394 }
7395
7396 return true;
7397}
7398
7399static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
7400 unsigned NumElts = ToVT.getVectorNumElements();
7401 if (NumElts != M.size())
7402 return false;
7403
7404 // Test if the Trunc can be convertible to a VMOVN with this shuffle. We are
7405 // looking for patterns of:
7406 // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
7407 // rev: N/2 0 N/2+1 1 N/2+2 2 ...
7408
7409 unsigned Off0 = rev ? NumElts / 2 : 0;
7410 unsigned Off1 = rev ? 0 : NumElts / 2;
7411 for (unsigned i = 0; i < NumElts; i += 2) {
7412 if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
7413 return false;
7414 if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
7415 return false;
7416 }
7417
7418 return true;
7419}
7420
7421// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
7422// from a pair of inputs. For example:
7423// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
7424// FP_ROUND(EXTRACT_ELT(Y, 0),
7425// FP_ROUND(EXTRACT_ELT(X, 1),
7426// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
// Only v8f16 results built from two v4f32 sources are handled, and only when
// MVE floating-point operations are available; in every other case an empty
// SDValue is returned. On success the result is a chain of two ARMISD::VCVTN
// nodes, the first taking Op0 with constant 0 and the second taking Op1 with
// constant 1.
// NOTE(review): the embedded listing numbers skip 7427, so the first line of
// the signature (function name and leading parameters, including BV and DAG
// used below) is missing from this extract -- restore it from upstream.
7428 const ARMSubtarget *ST) {
7429 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7430 if (!ST->hasMVEFloatOps())
7431 return SDValue();
7432
7433 SDLoc dl(BV);
7434 EVT VT = BV.getValueType();
7435 if (VT != MVT::v8f16)
7436 return SDValue();
7437
7438 // We are looking for a buildvector of fptrunc elements, where all the
7439 // elements are interleavingly extracted from two sources. Check the first two
7440 // items are valid enough and extract some info from them (they are checked
7441 // properly in the loop below).
// NOTE(review): listing lines 7443-7444 and 7447-7448 (the remaining terms
// of the two conditions below) are missing from this extract. Since the code
// afterwards reads getOperand(0).getOperand(0) of these operands, the lost
// terms presumably validate the extract-element node feeding each FP_ROUND
// -- confirm against the upstream file.
7442 if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
7445 return SDValue();
7446 if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
7449 return SDValue();
7450 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7451 SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
7452 if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
7453 return SDValue();
7454
7455 // Check all the values in the BuildVector line up with our expectations.
// Operand 2*i must be an FP_ROUND of lane i of Op0 and operand 2*i+1 an
// FP_ROUND of lane i of Op1, for i = 1..3.
7456 for (unsigned i = 1; i < 4; i++) {
7457 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
// NOTE(review): listing line 7459 (one term of this conjunction) is missing
// from this extract.
7458 return Trunc.getOpcode() == ISD::FP_ROUND &&
7460 Trunc.getOperand(0).getOperand(0) == Op &&
7461 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7462 };
7463 if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
7464 return SDValue();
7465 if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
7466 return SDValue();
7467 }
7468
7469 SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
7470 DAG.getConstant(0, dl, MVT::i32));
7471 return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
7472 DAG.getConstant(1, dl, MVT::i32));
7473}
7474
7475// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
7476// from a single input on alternating lanes. For example:
7477// BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
7478// FP_EXTEND(EXTRACT_ELT(X, 2),
7479// FP_EXTEND(EXTRACT_ELT(X, 4), ...)
//
// Returns a null SDValue unless the subtarget has MVE float ops, the result
// type is v4f32, the source is v8f16 and operand i extends lane 2*i + Offset of
// that source, with Offset being 0 or 1. On success the result is a single
// ARMISD::VCVTL node taking the source vector and Offset as a constant.
// NOTE(review): this listing omits several source lines (gaps in the embedded
// line numbers): the first line of the signature, part of the first FP_EXTEND
// check and the definition of Offset. Offset is presumably the constant lane
// index of operand 0's extract; confirm in the full file.
7481 const ARMSubtarget *ST) {
7482 assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7483 if (!ST->hasMVEFloatOps())
7484 return SDValue();
7485
7486 SDLoc dl(BV);
7487 EVT VT = BV.getValueType();
7488 if (VT != MVT::v4f32)
7489 return SDValue();
7490
7491 // We are looking for a buildvector of fpext elements, where all the
7492 // elements are alternating lanes from a single source. For example <0,2,4,6>
7493 // or <1,3,5,7>. Check the first two items are valid enough and extract some
7494 // info from them (they are checked properly in the loop below).
7495 if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
7497 return SDValue();
// Op0 is the vector that operand 0 was extracted from.
7498 SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
7500 if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
7501 return SDValue();
7502
7503 // Check all the values in the BuildVector line up with our expectations.
// Operands 1..3 must extend lanes 2*i + Offset of the same source vector.
7504 for (unsigned i = 1; i < 4; i++) {
7505 auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
7506 return Trunc.getOpcode() == ISD::FP_EXTEND &&
7508 Trunc.getOperand(0).getOperand(0) == Op &&
7509 Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
7510 };
7511 if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
7512 return SDValue();
7513 }
7514
7515 return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
7516 DAG.getConstant(Offset, dl, MVT::i32));
7517}
7518
7519// If N is an integer constant that can be moved into a register in one
7520// instruction, return an SDValue of such a constant (will become a MOV
7521// instruction). Otherwise return null.
//
// On Thumb1-only subtargets the constant qualifies when it or its complement
// is at most 255; otherwise it qualifies when ARM_AM::getSOImmVal accepts the
// value or its complement. The returned constant is always of type i32.
// NOTE(review): the first line of the signature is missing from this listing
// (gap in the embedded line numbers).
7523 const ARMSubtarget *ST, const SDLoc &dl) {
7524 uint64_t Val;
7525 if (!isa<ConstantSDNode>(N))
7526 return SDValue();
7527 Val = N->getAsZExtVal();
7528
7529 if (ST->isThumb1Only()) {
// NOTE(review): Val is 64 bits wide, so for a value whose upper 32 bits are
// zero ~Val is at least 2^32 and the second comparison cannot hold. Confirm
// whether a 32-bit complement was intended.
7530 if (Val <= 255 || ~Val <= 255)
7531 return DAG.getConstant(Val, dl, MVT::i32);
7532 } else {
7533 if (ARM_AM::getSOImmVal(Val) != -1 || ARM_AM::getSOImmVal(~Val) != -1)
7534 return DAG.getConstant(Val, dl, MVT::i32);
7535 }
7536 return SDValue();
7537}
7538
7540 const ARMSubtarget *ST) {
// Lowers a BUILD_VECTOR whose result is a 2-, 4-, 8- or 16-element predicate
// vector. A non-constant splat becomes a sign-extended i1 fed through
// PREDICATE_CAST. Otherwise constant lanes are packed into a 32-bit immediate
// that is PREDICATE_CAST to VT, and each remaining non-constant lane is added
// with INSERT_VECTOR_ELT. Returns a null SDValue for any other element count.
// Requires MVE integer ops (asserted).
// NOTE(review): the first line of the signature is missing from this listing
// (gap in the embedded line numbers).
7541 SDLoc dl(Op);
7542 EVT VT = Op.getValueType();
7543
7544 assert(ST->hasMVEIntegerOps() && "LowerBUILD_VECTOR_i1 called without MVE!");
7545
// Each lane occupies BitsPerBool bits of the immediate; in every supported
// case NumElts * BitsPerBool == 16. BoolMask is BitsPerBool ones.
7546 unsigned NumElts = VT.getVectorNumElements();
7547 unsigned BoolMask;
7548 unsigned BitsPerBool;
7549 if (NumElts == 2) {
7550 BitsPerBool = 8;
7551 BoolMask = 0xff;
7552 } else if (NumElts == 4) {
7553 BitsPerBool = 4;
7554 BoolMask = 0xf;
7555 } else if (NumElts == 8) {
7556 BitsPerBool = 2;
7557 BoolMask = 0x3;
7558 } else if (NumElts == 16) {
7559 BitsPerBool = 1;
7560 BoolMask = 0x1;
7561 } else
7562 return SDValue();
7563
7564 // If this is a single value copied into all lanes (a splat), we can just sign
7565 // extend that single value
// Undef lanes are allowed to take the splat value. Constant splats are not
// handled here; they fall through to the immediate path below.
7566 SDValue FirstOp = Op.getOperand(0);
7567 if (!isa<ConstantSDNode>(FirstOp) &&
7568 llvm::all_of(llvm::drop_begin(Op->ops()), [&FirstOp](const SDUse &U) {
7569 return U.get().isUndef() || U.get() == FirstOp;
7570 })) {
7571 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32, FirstOp,
7572 DAG.getValueType(MVT::i1));
7573 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), Ext);
7574 }
7575
7576 // First create base with bits set where known
// Undef lanes contribute zero bits; non-constant lanes are skipped here and
// inserted afterwards.
7577 unsigned Bits32 = 0;
7578 for (unsigned i = 0; i < NumElts; ++i) {
7579 SDValue V = Op.getOperand(i);
7580 if (!isa<ConstantSDNode>(V) && !V.isUndef())
7581 continue;
7582 bool BitSet = V.isUndef() ? false : V->getAsZExtVal();
7583 if (BitSet)
7584 Bits32 |= BoolMask << (i * BitsPerBool);
7585 }
7586
7587 // Add in unknown nodes
7588 SDValue Base = DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
7589 DAG.getConstant(Bits32, dl, MVT::i32));
7590 for (unsigned i = 0; i < NumElts; ++i) {
7591 SDValue V = Op.getOperand(i);
7592 if (isa<ConstantSDNode>(V) || V.isUndef())
7593 continue;
7594 Base = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Base, V,
7595 DAG.getConstant(i, dl, MVT::i32));
7596 }
7597
7598 return Base;
7599}
7600
7602 const ARMSubtarget *ST) {
// Tries to lower a BUILD_VECTOR of the form <X, X+N, X+2N, ...> to an
// ARMISD::VIDUP node with base X and step N, where N is 1, 2, 4 or 8. Returns a
// null SDValue when the subtarget lacks MVE integer ops or the operands do not
// match. The VIDUP node is created with two results (VT and i32); the returned
// SDValue refers to the first.
// NOTE(review): this listing omits several source lines (gaps in the embedded
// line numbers): the first line of the signature and one clause of each ADD
// check below. presumably they require operand 1 of the ADD to be a constant;
// confirm in the full file.
7603 if (!ST->hasMVEIntegerOps())
7604 return SDValue();
7605
7606 // We are looking for a buildvector where each element is Op[0] + i*N
7607 EVT VT = Op.getValueType();
7608 SDValue Op0 = Op.getOperand(0);
7609 unsigned NumElts = VT.getVectorNumElements();
7610
7611 // Get the increment value from operand 1
7612 SDValue Op1 = Op.getOperand(1);
7613 if (Op1.getOpcode() != ISD::ADD || Op1.getOperand(0) != Op0 ||
7615 return SDValue();
7616 unsigned N = Op1.getConstantOperandVal(1);
7617 if (N != 1 && N != 2 && N != 4 && N != 8)
7618 return SDValue();
7619
7620 // Check that each other operand matches
// Operand I must be ADD(Op0, I * N) with Op0 as its first operand.
7621 for (unsigned I = 2; I < NumElts; I++) {
7622 SDValue OpI = Op.getOperand(I);
7623 if (OpI.getOpcode() != ISD::ADD || OpI.getOperand(0) != Op0 ||
7625 OpI.getConstantOperandVal(1) != I * N)
7626 return SDValue();
7627 }
7628
7629 SDLoc DL(Op);
7630 return DAG.getNode(ARMISD::VIDUP, DL, DAG.getVTList(VT, MVT::i32), Op0,
7631 DAG.getConstant(N, DL, MVT::i32));
7632}
7633
7634// Returns true if the operation N can be treated as qr instruction variant at
7635// operand Op.
//
// Additive, multiplicative and averaging operations return true regardless of
// which operand Op is. Subtractions return true only when Op is the second
// value operand. The same split applies to the MVE intrinsics listed in the
// nested switch, where the intrinsic ID is operand 0, so the second value
// operand is operand 2. Everything else returns false.
// NOTE(review): the case label that introduces the nested intrinsic switch is
// missing from this listing (gap in the embedded line numbers).
7636static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op) {
7637 switch (N->getOpcode()) {
7638 case ISD::ADD:
7639 case ISD::MUL:
7640 case ISD::SADDSAT:
7641 case ISD::UADDSAT:
7642 case ISD::AVGFLOORS:
7643 case ISD::AVGFLOORU:
7644 return true;
7645 case ISD::SUB:
7646 case ISD::SSUBSAT:
7647 case ISD::USUBSAT:
// Only accepted when Op is operand 1 of the subtraction.
7648 return N->getOperand(1).getNode() == Op;
7650 switch (N->getConstantOperandVal(0)) {
7651 case Intrinsic::arm_mve_add_predicated:
7652 case Intrinsic::arm_mve_mul_predicated:
7653 case Intrinsic::arm_mve_qadd_predicated:
7654 case Intrinsic::arm_mve_vhadd:
7655 case Intrinsic::arm_mve_hadd_predicated:
7656 case Intrinsic::arm_mve_vqdmulh:
7657 case Intrinsic::arm_mve_qdmulh_predicated:
7658 case Intrinsic::arm_mve_vqrdmulh:
7659 case Intrinsic::arm_mve_qrdmulh_predicated:
7660 case Intrinsic::arm_mve_vqdmull:
7661 case Intrinsic::arm_mve_vqdmull_predicated:
7662 return true;
7663 case Intrinsic::arm_mve_sub_predicated:
7664 case Intrinsic::arm_mve_qsub_predicated:
7665 case Intrinsic::arm_mve_vhsub:
7666 case Intrinsic::arm_mve_hsub_predicated:
// Operand 0 is the intrinsic ID, so operand 2 is the second value operand.
7667 return N->getOperand(2).getNode() == Op;
7668 default:
7669 return false;
7670 }
7671 default:
7672 return false;
7673 }
7674}
7675
7676// If this is a case we can't handle, return null and let the default
7677// expansion code take care of it.
//
// Custom lowering for BUILD_VECTOR. Strategies are tried in this order:
//   1. MVE vectors with 1-bit elements go to LowerBUILD_VECTOR_i1.
//   2. Arithmetic sequences are tried as VIDUP (LowerBUILD_VECTORToVIDUP).
//   3. Constant splats: all-undef -> UNDEF; VDUP when every user is a qr
//      instruction variant; immediate VMOV / VMVN; vmov.f32; MVE VDUP.
//   4. A dominant value (more than half the lanes) is splatted with VDUP or
//      VDUPLANE and the differing lanes inserted; FP constant vectors are
//      retried as integer vectors; single-instruction constants use VDUP.
//   5. Remaining all-constant vectors return null (default expansion).
//   6. ReconstructShuffle for vectors of at least 4 elements.
//   7. Rebuilding VCVT from scalar fptrunc / fpext operands.
//   8. With NEON, 128-bit vectors (except v2f64 / v4f32) are split into two
//      halves that are lowered recursively and concatenated.
//   9. Elements of 32 bits or more become an ARMISD::BUILD_VECTOR.
//  10. Otherwise a chain of INSERT_VECTOR_ELT, or null.
// NOTE(review): this listing omits several source lines (gaps in the embedded
// line numbers), including the condition guarding 'isConstant = false', the
// divisor of the VDUPLANE index computation, and the declarations of the local
// 'Ops' vectors. Consult the full file before editing those regions.
7678SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
7679 const ARMSubtarget *ST) const {
7680 BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Op.getNode());
7681 SDLoc dl(Op);
7682 EVT VT = Op.getValueType();
7683
7684 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
7685 return LowerBUILD_VECTOR_i1(Op, DAG, ST);
7686
7687 if (SDValue R = LowerBUILD_VECTORToVIDUP(Op, DAG, ST))
7688 return R;
7689
7690 APInt SplatBits, SplatUndef;
7691 unsigned SplatBitSize;
7692 bool HasAnyUndefs;
7693 if (BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
7694 if (SplatUndef.isAllOnes())
7695 return DAG.getUNDEF(VT);
7696
7697 // If all the users of this constant splat are qr instruction variants,
7698 // generate a vdup of the constant.
7699 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == SplatBitSize &&
7700 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32) &&
7701 all_of(BVN->users(),
7702 [BVN](const SDNode *U) { return IsQRMVEInstruction(U, BVN); })) {
7703 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7704 : SplatBitSize == 16 ? MVT::v8i16
7705 : MVT::v16i8;
7706 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7707 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7708 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7709 }
7710
// NOTE(review): both alternatives test SplatBitSize <= 64, so this is
// (hasNEON() || hasMVEIntegerOps()) && SplatBitSize <= 64.
7711 if ((ST->hasNEON() && SplatBitSize <= 64) ||
7712 (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
7713 // Check if an immediate VMOV works.
7714 EVT VmovVT;
7715 SDValue Val =
7716 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
7717 SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
7718
7719 if (Val.getNode()) {
7720 SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
7721 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7722 }
7723
7724 // Try an immediate VMVN.
7725 uint64_t NegatedImm = (~SplatBits).getZExtValue();
7726 Val = isVMOVModifiedImm(
7727 NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
7728 VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
7729 if (Val.getNode()) {
7730 SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
7731 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vmov);
7732 }
7733
7734 // Use vmov.f32 to materialize other v2f32 and v4f32 splats.
7735 if ((VT == MVT::v2f32 || VT == MVT::v4f32) && SplatBitSize == 32) {
7736 int ImmVal = ARM_AM::getFP32Imm(SplatBits);
7737 if (ImmVal != -1) {
7738 SDValue Val = DAG.getTargetConstant(ImmVal, dl, MVT::i32);
7739 return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val);
7740 }
7741 }
7742
7743 // If we are under MVE, generate a VDUP(constant), bitcast to the original
7744 // type.
7745 if (ST->hasMVEIntegerOps() &&
7746 (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32)) {
7747 EVT DupVT = SplatBitSize == 32 ? MVT::v4i32
7748 : SplatBitSize == 16 ? MVT::v8i16
7749 : MVT::v16i8;
7750 SDValue Const = DAG.getConstant(SplatBits.getZExtValue(), dl, MVT::i32);
7751 SDValue VDup = DAG.getNode(ARMISD::VDUP, dl, DupVT, Const);
7752 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, VDup);
7753 }
7754 }
7755 }
7756
7757 // Scan through the operands to see if only one value is used.
7758 //
7759 // As an optimisation, even if more than one value is used it may be more
7760 // profitable to splat with one value then change some lanes.
7761 //
7762 // Heuristically we decide to do this if the vector has a "dominant" value,
7763 // defined as splatted to more than half of the lanes.
7764 unsigned NumElts = VT.getVectorNumElements();
7765 bool isOnlyLowElement = true;
7766 bool usesOnlyOneValue = true;
7767 bool hasDominantValue = false;
7768 bool isConstant = true;
7769
7770 // Map of the number of times a particular SDValue appears in the
7771 // element list.
7772 DenseMap<SDValue, unsigned> ValueCounts;
7773 SDValue Value;
7774 for (unsigned i = 0; i < NumElts; ++i) {
7775 SDValue V = Op.getOperand(i);
7776 if (V.isUndef())
7777 continue;
7778 if (i > 0)
7779 isOnlyLowElement = false;
// NOTE(review): the condition guarding the next statement is missing from this
// listing; presumably it tests that V is not a constant node.
7781 isConstant = false;
7782
7783 unsigned &Count = ValueCounts[V];
7784
7785 // Is this value dominant? (takes up more than half of the lanes)
7786 if (++Count > (NumElts / 2)) {
7787 hasDominantValue = true;
7788 Value = V;
7789 }
7790 }
7791 if (ValueCounts.size() != 1)
7792 usesOnlyOneValue = false;
// With no dominant value, fall back to whichever entry the map yields first.
7793 if (!Value.getNode() && !ValueCounts.empty())
7794 Value = ValueCounts.begin()->first;
7795
// Every operand was undef.
7796 if (ValueCounts.empty())
7797 return DAG.getUNDEF(VT);
7798
7799 // Loads are better lowered with insert_vector_elt/ARMISD::BUILD_VECTOR.
7800 // Keep going if we are hitting this case.
7801 if (isOnlyLowElement && !ISD::isNormalLoad(Value.getNode()) &&
7802 (VT != MVT::v8f16 || ST->hasFullFP16()))
7803 return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value);
7804
7805 unsigned EltSize = VT.getScalarSizeInBits();
7806
7807 // Use VDUP for non-constant splats. For f32 constant splats, reduce to
7808 // i32 and try again.
7809 if (hasDominantValue && EltSize <= 32) {
7810 if (!isConstant) {
7811 SDValue N;
7812
7813 // If we are VDUPing a value that comes directly from a vector, that will
7814 // cause an unnecessary move to and from a GPR, where instead we could
7815 // just use VDUPLANE. We can only do this if the lane being extracted
7816 // is at a constant index, as the VDUP from lane instructions only have
7817 // constant-index forms.
7818 ConstantSDNode *constIndex;
7819 if (Value->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
7820 (constIndex = dyn_cast<ConstantSDNode>(Value->getOperand(1)))) {
7821 // We need to create a new undef vector to use for the VDUPLANE if the
7822 // size of the vector from which we get the value is different than the
7823 // size of the vector that we need to create. We will insert the element
7824 // such that the register coalescer will remove unnecessary copies.
7825 if (VT != Value->getOperand(0).getValueType()) {
// NOTE(review): the right-hand operand of '%' is missing from this listing.
7826 unsigned index = constIndex->getAPIntValue().getLimitedValue() %
7828 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7829 DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, DAG.getUNDEF(VT),
7830 Value, DAG.getConstant(index, dl, MVT::i32)),
7831 DAG.getConstant(index, dl, MVT::i32));
7832 } else
7833 N = DAG.getNode(ARMISD::VDUPLANE, dl, VT,
7834 Value->getOperand(0), Value->getOperand(1));
7835 } else
7836 N = DAG.getNode(ARMISD::VDUP, dl, VT, Value);
7837
7838 if (!usesOnlyOneValue) {
7839 // The dominant value was splatted as 'N', but we now have to insert
7840 // all differing elements.
7841 for (unsigned I = 0; I < NumElts; ++I) {
7842 if (Op.getOperand(I) == Value)
7843 continue;
// NOTE(review): the declaration of 'Ops' is missing from this listing.
7845 Ops.push_back(N);
7846 Ops.push_back(Op.getOperand(I));
7847 Ops.push_back(DAG.getConstant(I, dl, MVT::i32));
7848 N = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ops);
7849 }
7850 }
7851 return N;
7852 }
// NOTE(review): the lines opening this scope (its guard and the declaration of
// 'Ops') are missing from this listing. The code below bitcasts every operand
// from f32/f16 to i32/i16, rebuilds the vector with integer elements and
// lowers that recursively.
7855 MVT FVT = VT.getVectorElementType().getSimpleVT();
7856 assert(FVT == MVT::f32 || FVT == MVT::f16);
7857 MVT IVT = (FVT == MVT::f32) ? MVT::i32 : MVT::i16;
7858 for (unsigned i = 0; i < NumElts; ++i)
7859 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IVT,
7860 Op.getOperand(i)));
7861 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IVT, NumElts);
7862 SDValue Val = DAG.getBuildVector(VecVT, dl, Ops);
7863 Val = LowerBUILD_VECTOR(Val, DAG, ST);
7864 if (Val.getNode())
7865 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7866 }
7867 if (usesOnlyOneValue) {
7868 SDValue Val = IsSingleInstrConstant(Value, DAG, ST, dl);
7869 if (isConstant && Val.getNode())
7870 return DAG.getNode(ARMISD::VDUP, dl, VT, Val);
7871 }
7872 }
7873
7874 // If all elements are constants and the case above didn't get hit, fall back
7875 // to the default expansion, which will generate a load from the constant
7876 // pool.
7877 if (isConstant)
7878 return SDValue();
7879
7880 // Reconstruct the BUILDVECTOR to one of the legal shuffles (such as vext and
7881 // vmovn). Empirical tests suggest this is rarely worth it for vectors of
7882 // length <= 2.
7883 if (NumElts >= 4)
7884 if (SDValue shuffle = ReconstructShuffle(Op, DAG))
7885 return shuffle;
7886
7887 // Attempt to turn a buildvector of scalar fptrunc's or fpext's back into
7888 // VCVT's
// NOTE(review): these two calls pass the 'Subtarget' member rather than the
// 'ST' parameter used everywhere else in this function.
7889 if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
7890 return VCVT;
7891 if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
7892 return VCVT;
7893
7894 if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
7895 // If we haven't found an efficient lowering, try splitting a 128-bit vector
7896 // into two 64-bit vectors; we might discover a better way to lower it.
7897 SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElts);
7898 EVT ExtVT = VT.getVectorElementType();
7899 EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElts / 2);
7900 SDValue Lower = DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[0], NumElts / 2));
7901 if (Lower.getOpcode() == ISD::BUILD_VECTOR)
7902 Lower = LowerBUILD_VECTOR(Lower, DAG, ST);
7903 SDValue Upper =
7904 DAG.getBuildVector(HVT, dl, ArrayRef(&Ops[NumElts / 2], NumElts / 2));
7905 if (Upper.getOpcode() == ISD::BUILD_VECTOR)
7906 Upper = LowerBUILD_VECTOR(Upper, DAG, ST);
// Either half may have come back null; only concatenate when both lowered.
7907 if (Lower && Upper)
7908 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lower, Upper);
7909 }
7910
7911 // Vectors with 32- or 64-bit elements can be built by directly assigning
7912 // the subregisters. Lower it to an ARMISD::BUILD_VECTOR so the operands
7913 // will be legalized.
7914 if (EltSize >= 32) {
7915 // Do the expansion with floating-point types, since that is what the VFP
7916 // registers are defined to use, and since i64 is not legal.
7917 EVT EltVT = EVT::getFloatingPointVT(EltSize);
7918 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
// NOTE(review): the declaration of 'Ops' is missing from this listing.
7920 for (unsigned i = 0; i < NumElts; ++i)
7921 Ops.push_back(DAG.getNode(ISD::BITCAST, dl, EltVT, Op.getOperand(i)));
7922 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
7923 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
7924 }
7925
7926 // If all else fails, just use a sequence of INSERT_VECTOR_ELT when we
7927 // know the default expansion would otherwise fall back on something even
7928 // worse. For a vector with one or two non-undef values, that's
7929 // scalar_to_vector for the elements followed by a shuffle (provided the
7930 // shuffle is valid for the target) and materialization element by element
7931 // on the stack followed by a load for everything else.
7932 if ((!isConstant && !usesOnlyOneValue) ||
7933 (VT == MVT::v8f16 && !ST->hasFullFP16())) {
7934 SDValue Vec = DAG.getUNDEF(VT);
7935 for (unsigned i = 0 ; i < NumElts; ++i) {
7936 SDValue V = Op.getOperand(i);
7937 if (V.isUndef())
7938 continue;
7939 SDValue LaneIdx = DAG.getConstant(i, dl, MVT::i32);
7940 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Vec, V, LaneIdx);
7941 }
7942 return Vec;
7943 }
7944
7945 return SDValue();
7946}
7947
7948// Gather data to see if the operation can be modelled as a
7949// shuffle in combination with VEXTs.
//
// Given a BUILD_VECTOR whose defined operands are all constant-index
// EXTRACT_VECTOR_ELTs from at most two source vectors, rewrite it as a vector
// shuffle of those sources. Each source is first brought to the width of the
// result (padded with undef, narrowed with EXTRACT_SUBVECTOR, or windowed with
// a VEXT) and then cast to the smallest element type involved. Returns a null
// SDValue when the operands do not fit this shape, when a source is neither
// the same size, half nor double the result size, or when
// buildLegalVectorShuffle returns null.
// NOTE(review): the declaration of the local 'Sources' container is missing
// from this listing (gap in the embedded line numbers).
7950SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
7951 SelectionDAG &DAG) const {
7952 assert(Op.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
7953 SDLoc dl(Op);
7954 EVT VT = Op.getValueType();
7955 unsigned NumElts = VT.getVectorNumElements();
7956
// Book-keeping for one source vector feeding the BUILD_VECTOR.
7957 struct ShuffleSourceInfo {
7958 SDValue Vec;
// Smallest and largest lane index extracted from Vec.
7959 unsigned MinElt = std::numeric_limits<unsigned>::max();
7960 unsigned MaxElt = 0;
7961
7962 // We may insert some combination of BITCASTs and VEXT nodes to force Vec to
7963 // be compatible with the shuffle we intend to construct. As a result
7964 // ShuffleVec will be some sliding window into the original Vec.
7965 SDValue ShuffleVec;
7966
7967 // Code should guarantee that element i in Vec starts at element "WindowBase
7968 // + i * WindowScale in ShuffleVec".
7969 int WindowBase = 0;
7970 int WindowScale = 1;
7971
7972 ShuffleSourceInfo(SDValue Vec) : Vec(Vec), ShuffleVec(Vec) {}
7973
// Lets llvm::find locate an entry by its original source vector.
7974 bool operator ==(SDValue OtherVec) { return Vec == OtherVec; }
7975 };
7976
7977 // First gather all vectors used as an immediate source for this BUILD_VECTOR
7978 // node.
7980 for (unsigned i = 0; i < NumElts; ++i) {
7981 SDValue V = Op.getOperand(i);
7982 if (V.isUndef())
7983 continue;
7984 else if (V.getOpcode() != ISD::EXTRACT_VECTOR_ELT) {
7985 // A shuffle can only come from building a vector from various
7986 // elements of other vectors.
7987 return SDValue();
7988 } else if (!isa<ConstantSDNode>(V.getOperand(1))) {
7989 // Furthermore, shuffles require a constant mask, whereas extractelts
7990 // accept variable indices.
7991 return SDValue();
7992 }
7993
7994 // Add this element source to the list if it's not already there.
7995 SDValue SourceVec = V.getOperand(0);
7996 auto Source = llvm::find(Sources, SourceVec);
7997 if (Source == Sources.end())
7998 Source = Sources.insert(Sources.end(), ShuffleSourceInfo(SourceVec));
7999
8000 // Update the minimum and maximum lane number seen.
8001 unsigned EltNo = V.getConstantOperandVal(1);
8002 Source->MinElt = std::min(Source->MinElt, EltNo);
8003 Source->MaxElt = std::max(Source->MaxElt, EltNo);
8004 }
8005
8006 // Currently only do something sane when at most two source vectors
8007 // are involved.
8008 if (Sources.size() > 2)
8009 return SDValue();
8010
8011 // Find out the smallest element size among result and two sources, and use
8012 // it as element size to build the shuffle_vector.
8013 EVT SmallestEltTy = VT.getVectorElementType();
8014 for (auto &Source : Sources) {
8015 EVT SrcEltTy = Source.Vec.getValueType().getVectorElementType();
8016 if (SrcEltTy.bitsLT(SmallestEltTy))
8017 SmallestEltTy = SrcEltTy;
8018 }
// ResMultiplier is the number of shuffle lanes per result element. NumElts is
// redefined here as the lane count of the shuffle type.
8019 unsigned ResMultiplier =
8020 VT.getScalarSizeInBits() / SmallestEltTy.getSizeInBits();
8021 NumElts = VT.getSizeInBits() / SmallestEltTy.getSizeInBits();
8022 EVT ShuffleVT = EVT::getVectorVT(*DAG.getContext(), SmallestEltTy, NumElts);
8023
8024 // If the source vector is too wide or too narrow, we may nevertheless be able
8025 // to construct a compatible shuffle either by concatenating it with UNDEF or
8026 // extracting a suitable range of elements.
8027 for (auto &Src : Sources) {
8028 EVT SrcVT = Src.ShuffleVec.getValueType();
8029
8030 uint64_t SrcVTSize = SrcVT.getFixedSizeInBits();
8031 uint64_t VTSize = VT.getFixedSizeInBits();
8032 if (SrcVTSize == VTSize)
8033 continue;
8034
8035 // This stage of the search produces a source with the same element type as
8036 // the original, but with a total width matching the BUILD_VECTOR output.
8037 EVT EltVT = SrcVT.getVectorElementType();
8038 unsigned NumSrcElts = VTSize / EltVT.getFixedSizeInBits();
8039 EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumSrcElts);
8040
8041 if (SrcVTSize < VTSize) {
// Only a source exactly half the result width can be padded.
8042 if (2 * SrcVTSize != VTSize)
8043 return SDValue();
8044 // We can pad out the smaller vector for free, so if it's part of a
8045 // shuffle...
8046 Src.ShuffleVec =
8047 DAG.getNode(ISD::CONCAT_VECTORS, dl, DestVT, Src.ShuffleVec,
8048 DAG.getUNDEF(Src.ShuffleVec.getValueType()));
8049 continue;
8050 }
8051
// Only a source exactly twice the result width can be narrowed.
8052 if (SrcVTSize != 2 * VTSize)
8053 return SDValue();
8054
8055 if (Src.MaxElt - Src.MinElt >= NumSrcElts) {
8056 // Span too large for a VEXT to cope
8057 return SDValue();
8058 }
8059
8060 if (Src.MinElt >= NumSrcElts) {
8061 // The extraction can just take the second half
8062 Src.ShuffleVec =
8063 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8064 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8065 Src.WindowBase = -NumSrcElts;
8066 } else if (Src.MaxElt < NumSrcElts) {
8067 // The extraction can just take the first half
8068 Src.ShuffleVec =
8069 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8070 DAG.getConstant(0, dl, MVT::i32));
8071 } else {
8072 // An actual VEXT is needed
8073 SDValue VEXTSrc1 =
8074 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8075 DAG.getConstant(0, dl, MVT::i32));
8076 SDValue VEXTSrc2 =
8077 DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, DestVT, Src.ShuffleVec,
8078 DAG.getConstant(NumSrcElts, dl, MVT::i32));
8079
8080 Src.ShuffleVec = DAG.getNode(ARMISD::VEXT, dl, DestVT, VEXTSrc1,
8081 VEXTSrc2,
8082 DAG.getConstant(Src.MinElt, dl, MVT::i32));
8083 Src.WindowBase = -Src.MinElt;
8084 }
8085 }
8086
8087 // Another possible incompatibility occurs from the vector element types. We
8088 // can fix this by bitcasting the source vectors to the same type we intend
8089 // for the shuffle.
8090 for (auto &Src : Sources) {
8091 EVT SrcEltTy = Src.ShuffleVec.getValueType().getVectorElementType();
8092 if (SrcEltTy == SmallestEltTy)
8093 continue;
8094 assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
8095 Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
// Rescale the window so it is expressed in shuffle lanes, not source lanes.
8096 Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
8097 Src.WindowBase *= Src.WindowScale;
8098 }
8099
8100 // Final check before we try to actually produce a shuffle.
// NOTE(review): the loop variable copies each ShuffleSourceInfo by value.
8101 LLVM_DEBUG({
8102 for (auto Src : Sources)
8103 assert(Src.ShuffleVec.getValueType() == ShuffleVT);
8104 });
8105
8106 // The stars all align, our next step is to produce the mask for the shuffle.
8107 SmallVector<int, 8> Mask(ShuffleVT.getVectorNumElements(), -1);
8108 int BitsPerShuffleLane = ShuffleVT.getScalarSizeInBits();
8109 for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
8110 SDValue Entry = Op.getOperand(i);
8111 if (Entry.isUndef())
8112 continue;
8113
8114 auto Src = llvm::find(Sources, Entry.getOperand(0));
8115 int EltNo = cast<ConstantSDNode>(Entry.getOperand(1))->getSExtValue();
8116
8117 // EXTRACT_VECTOR_ELT performs an implicit any_ext; BUILD_VECTOR an implicit
8118 // trunc. So only std::min(SrcBits, DestBits) actually get defined in this
8119 // segment.
8120 EVT OrigEltTy = Entry.getOperand(0).getValueType().getVectorElementType();
8121 int BitsDefined = std::min(OrigEltTy.getScalarSizeInBits(),
8122 VT.getScalarSizeInBits());
8123 int LanesDefined = BitsDefined / BitsPerShuffleLane;
8124
8125 // This source is expected to fill ResMultiplier lanes of the final shuffle,
8126 // starting at the appropriate offset.
8127 int *LaneMask = &Mask[i * ResMultiplier];
8128
8129 int ExtractBase = EltNo * Src->WindowScale + Src->WindowBase;
// Lanes of the second source are numbered after the NumElts lanes of the first.
8130 ExtractBase += NumElts * (Src - Sources.begin());
8131 for (int j = 0; j < LanesDefined; ++j)
8132 LaneMask[j] = ExtractBase + j;
8133 }
8134
8135
8136 // We can't handle more than two sources. This should have already
8137 // been checked before this point.
8138 assert(Sources.size() <= 2 && "Too many sources!");
8139
// A missing second (or first) source stays undef.
8140 SDValue ShuffleOps[] = { DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT) };
8141 for (unsigned i = 0; i < Sources.size(); ++i)
8142 ShuffleOps[i] = Sources[i].ShuffleVec;
8143
8144 SDValue Shuffle = buildLegalVectorShuffle(ShuffleVT, dl, ShuffleOps[0],
8145 ShuffleOps[1], Mask, DAG);
8146 if (!Shuffle)
8147 return SDValue();
8148 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
8149}
8150
// Operation codes stored in perfect-shuffle table entries; the shuffle helpers
// in this file read them from bits [29:26] of an entry.
// NOTE(review): the enum header and the enumerators between OP_COPY and
// OP_VUZPL are missing from this listing (gap in the embedded line numbers).
// Other code in this file refers to OP_VREV, OP_VDUP0..OP_VDUP3 and
// OP_VEXT1..OP_VEXT3, which presumably sit in that gap.
8152 OP_COPY = 0, // Copy, used for things like <u,u,u,3> to say it is <0,1,2,3>
8161 OP_VUZPL, // VUZP, left result
8162 OP_VUZPR, // VUZP, right result
8163 OP_VZIPL, // VZIP, left result
8164 OP_VZIPR, // VZIP, right result
8165 OP_VTRNL, // VTRN, left result
8166 OP_VTRNR // VTRN, right result
8167};
8168
8169static bool isLegalMVEShuffleOp(unsigned PFEntry) {
8170 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8171 switch (OpNum) {
8172 case OP_COPY:
8173 case OP_VREV:
8174 case OP_VDUP0:
8175 case OP_VDUP1:
8176 case OP_VDUP2:
8177 case OP_VDUP3:
8178 return true;
8179 }
8180 return false;
8181}
8182
8183/// isShuffleMaskLegal - Targets can use this to indicate that they only
8184/// support *some* VECTOR_SHUFFLE operations, those with specific masks.
8185/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
8186/// are assumed to be legal.
///
/// A mask is reported legal when any of these holds: a 4-element 64- or
/// 128-bit vector has a perfect-shuffle table entry usable on this subtarget;
/// the element size is at least 32 bits; the mask is an identity or VREV mask;
/// with NEON, a VEXT, VTBL or two-result shuffle mask; a full reverse of
/// v8i16, v8f16 or v16i8; or, with MVE integer ops, a VMOVN or truncate mask.
/// NOTE(review): the signature line and one clause of the long condition below
/// are missing from this listing (gaps in the embedded line numbers).
8188 if (VT.getVectorNumElements() == 4 &&
8189 (VT.is128BitVector() || VT.is64BitVector())) {
// Each mask entry becomes a base-9 digit; 8 stands for an undef lane.
8190 unsigned PFIndexes[4];
8191 for (unsigned i = 0; i != 4; ++i) {
8192 if (M[i] < 0)
8193 PFIndexes[i] = 8;
8194 else
8195 PFIndexes[i] = M[i];
8196 }
8197
8198 // Compute the index in the perfect shuffle table.
8199 unsigned PFTableIndex =
8200 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8201 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
8202 unsigned Cost = (PFEntry >> 30);
8203
// NOTE(review): if table entries are 32 bits wide, Cost is a 2-bit value and
// 'Cost <= 4' always holds; confirm whether a tighter bound was intended.
8204 if (Cost <= 4 && (Subtarget->hasNEON() || isLegalMVEShuffleOp(PFEntry)))
8205 return true;
8206 }
8207
8208 bool ReverseVEXT, isV_UNDEF;
8209 unsigned Imm, WhichResult;
8210
8211 unsigned EltSize = VT.getScalarSizeInBits();
8212 if (EltSize >= 32 ||
8214 ShuffleVectorInst::isIdentityMask(M, M.size()) ||
8215 isVREVMask(M, VT, 64) ||
8216 isVREVMask(M, VT, 32) ||
8217 isVREVMask(M, VT, 16))
8218 return true;
8219 else if (Subtarget->hasNEON() &&
8220 (isVEXTMask(M, VT, ReverseVEXT, Imm) ||
8221 isVTBLMask(M, VT) ||
8222 isNEONTwoResultShuffleMask(M, VT, WhichResult, isV_UNDEF)))
8223 return true;
8224 else if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8225 isReverseMask(M, VT))
8226 return true;
8227 else if (Subtarget->hasMVEIntegerOps() &&
8228 (isVMOVNMask(M, VT, true, false) ||
8229 isVMOVNMask(M, VT, false, false) || isVMOVNMask(M, VT, true, true)))
8230 return true;
8231 else if (Subtarget->hasMVEIntegerOps() &&
8232 (isTruncMask(M, VT, false, false) ||
8233 isTruncMask(M, VT, false, true) ||
8234 isTruncMask(M, VT, true, false) || isTruncMask(M, VT, true, true)))
8235 return true;
8236 else
8237 return false;
8238}
8239
8240/// GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit
8241/// the specified operations to build the shuffle.
///
/// The entry is decoded as: operation code in bits [29:26], table index of the
/// left sub-shuffle in bits [25:13] and of the right sub-shuffle in bits
/// [12:0]. OP_COPY ends the recursion by returning LHS or RHS directly; every
/// other operation first builds both sub-shuffles recursively and then
/// combines them with the matching ARMISD node.
/// NOTE(review): the first line of the signature is missing from this listing
/// (gap in the embedded line numbers).
8243 SDValue RHS, SelectionDAG &DAG,
8244 const SDLoc &dl) {
8245 unsigned OpNum = (PFEntry >> 26) & 0x0F;
8246 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8247 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8248
8249 if (OpNum == OP_COPY) {
// (1*9+2)*9+3 is the base-9 encoding of <0,1,2,3>, i.e. LHS unchanged;
// ((4*9+5)*9+6)*9+7 encodes <4,5,6,7>, i.e. RHS unchanged.
8250 if (LHSID == (1*9+2)*9+3) return LHS;
8251 assert(LHSID == ((4*9+5)*9+6)*9+7 && "Illegal OP_COPY!");
8252 return RHS;
8253 }
8254
8255 SDValue OpLHS, OpRHS;
8256 OpLHS = GeneratePerfectShuffle(PerfectShuffleTable[LHSID], LHS, RHS, DAG, dl);
8257 OpRHS = GeneratePerfectShuffle(PerfectShuffleTable[RHSID], LHS, RHS, DAG, dl);
8258 EVT VT = OpLHS.getValueType();
8259
8260 switch (OpNum) {
8261 default: llvm_unreachable("Unknown shuffle opcode!");
8262 case OP_VREV:
8263 // VREV divides the vector in half and swaps within the half.
8264 if (VT.getScalarSizeInBits() == 32)
8265 return DAG.getNode(ARMISD::VREV64, dl, VT, OpLHS);
8266 // vrev <4 x i16> -> VREV32
8267 if (VT.getScalarSizeInBits() == 16)
8268 return DAG.getNode(ARMISD::VREV32, dl, VT, OpLHS);
8269 // vrev <4 x i8> -> VREV16
8270 assert(VT.getScalarSizeInBits() == 8);
8271 return DAG.getNode(ARMISD::VREV16, dl, VT, OpLHS);
8272 case OP_VDUP0:
8273 case OP_VDUP1:
8274 case OP_VDUP2:
8275 case OP_VDUP3:
// The duplicated lane index is the opcode's offset from OP_VDUP0.
8276 return DAG.getNode(ARMISD::VDUPLANE, dl, VT,
8277 OpLHS, DAG.getConstant(OpNum-OP_VDUP0, dl, MVT::i32));
8278 case OP_VEXT1:
8279 case OP_VEXT2:
8280 case OP_VEXT3:
// The immediate is 1, 2 or 3 for OP_VEXT1, OP_VEXT2, OP_VEXT3.
8281 return DAG.getNode(ARMISD::VEXT, dl, VT,
8282 OpLHS, OpRHS,
8283 DAG.getConstant(OpNum - OP_VEXT1 + 1, dl, MVT::i32));
// The following nodes produce two results; the L/R opcode picks result 0 or 1.
8284 case OP_VUZPL:
8285 case OP_VUZPR:
8286 return DAG.getNode(ARMISD::VUZP, dl, DAG.getVTList(VT, VT),
8287 OpLHS, OpRHS).getValue(OpNum-OP_VUZPL);
8288 case OP_VZIPL:
8289 case OP_VZIPR:
8290 return DAG.getNode(ARMISD::VZIP, dl, DAG.getVTList(VT, VT),
8291 OpLHS, OpRHS).getValue(OpNum-OP_VZIPL);
8292 case OP_VTRNL:
8293 case OP_VTRNR:
8294 return DAG.getNode(ARMISD::VTRN, dl, DAG.getVTList(VT, VT),
8295 OpLHS, OpRHS).getValue(OpNum-OP_VTRNL);
8296 }
8297}
8298
8300 ArrayRef<int> ShuffleMask,
8301 SelectionDAG &DAG) {
// Lowers a two-operand shuffle node to a table lookup producing v8i8: the
// shuffle mask is materialised as a v8i8 BUILD_VECTOR of i32 constants and
// used as the index operand. VTBL1 is emitted when the second operand is
// undef, VTBL2 otherwise.
// NOTE(review): the first line of the signature is missing from this listing
// (gap in the embedded line numbers).
8302 // Check to see if we can use the VTBL instruction.
8303 SDValue V1 = Op.getOperand(0);
8304 SDValue V2 = Op.getOperand(1);
8305 SDLoc DL(Op);
8306
// Mask entries are emitted as signed constants, so negative entries stay
// negative.
8307 SmallVector<SDValue, 8> VTBLMask;
8308 for (int I : ShuffleMask)
8309 VTBLMask.push_back(DAG.getSignedConstant(I, DL, MVT::i32));
8310
8311 if (V2.getNode()->isUndef())
8312 return DAG.getNode(ARMISD::VTBL1, DL, MVT::v8i8, V1,
8313 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8314
8315 return DAG.getNode(ARMISD::VTBL2, DL, MVT::v8i8, V1, V2,
8316 DAG.getBuildVector(MVT::v8i8, DL, VTBLMask));
8317}
8318
// Builds a full element reversal of operand 0 for v8i16, v8f16 or v16i8 in two
// steps: a VREV64, then a shuffle that swaps the two halves of the result.
// NOTE(review): the signature line is missing from this listing (gap in the
// embedded line numbers).
8320 SDLoc DL(Op);
8321 EVT VT = Op.getValueType();
8322
8323 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8324 "Expect an v8i16/v16i8 type");
8325 SDValue OpLHS = DAG.getNode(ARMISD::VREV64, DL, VT, Op.getOperand(0));
8326 // For a v16i8 type: After the VREV, we have got <7, ..., 0, 15, ..., 8>. Now,
8327 // extract the first 8 bytes into the top double word and the last 8 bytes
8328 // into the bottom double word, through a new vector shuffle that will be
8329 // turned into a VEXT on Neon, or a couple of VMOVDs on MVE.
// NewMask is <N/2, ..., N-1, 0, ..., N/2-1> for an N-element vector.
8330 std::vector<int> NewMask;
8331 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8332 NewMask.push_back(VT.getVectorNumElements() / 2 + i);
8333 for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++)
8334 NewMask.push_back(i);
8335 return DAG.getVectorShuffle(VT, DL, OpLHS, OpLHS, NewMask);
8336}
8337
// Map a vector-of-i1 predicate type to a vector type with the same number of
// lanes: v2i1 -> v2f64, v4i1 -> v4i32, v8i1 -> v8i16, v16i1 -> v16i8.
// Any other type hits llvm_unreachable.
// NOTE(review): the signature line (listing line 8338) is missing from this
// extract; the body reads a value named VT and returns an MVT constant.
8339 switch (VT.getSimpleVT().SimpleTy) {
8340 case MVT::v2i1:
8341 return MVT::v2f64;
8342 case MVT::v4i1:
8343 return MVT::v4i32;
8344 case MVT::v8i1:
8345 return MVT::v8i16;
8346 case MVT::v16i1:
8347 return MVT::v16i8;
8348 default:
8349 llvm_unreachable("Unexpected vector predicate type");
8350 }
8351}
8352
// Promote a predicate value (Pred, of i1-vector type VT) to an ordinary
// vector: every v16i8 lane becomes all-ones or all-zeroes according to the
// predicate, and the result is bitcast to NewVT.
// NOTE(review): three lines are missing from this extract: the head of the
// signature (listing line 8353; visible call sites pass (dl, V, VT, DAG)),
// the declaration that starts the AllOnes initialiser (8358), and the
// definition of NewVT (8367).
8354 SelectionDAG &DAG) {
8355 // Converting from boolean predicates to integers involves creating a vector
8356 // of all ones or all zeroes and selecting the lanes based upon the real
8357 // predicate.
8359 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0xff), dl, MVT::i32);
8360 AllOnes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllOnes);
8361
8362 SDValue AllZeroes =
8363 DAG.getTargetConstant(ARM_AM::createVMOVModImm(0xe, 0x0), dl, MVT::i32);
8364 AllZeroes = DAG.getNode(ARMISD::VMOVIMM, dl, MVT::v16i8, AllZeroes);
8365
8366 // Get full vector type from predicate type
8368
8369 SDValue RecastV1;
8370 // If the real predicate is a v8i1 or v4i1 (not v16i1) then we need to recast
8371 // this to a v16i1. This cannot be done with an ordinary bitcast because the
8372 // sizes are not the same. We have to use a MVE specific PREDICATE_CAST node,
8373 // since we know in hardware the sizes are really the same.
8374 if (VT != MVT::v16i1)
8375 RecastV1 = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Pred);
8376 else
8377 RecastV1 = Pred;
8378
8379 // Select either all ones or zeroes depending upon the real predicate bits.
8380 SDValue PredAsVector =
8381 DAG.getNode(ISD::VSELECT, dl, MVT::v16i8, RecastV1, AllOnes, AllZeroes);
8382
8383 // Recast our new predicate-as-integer v16i8 vector into something
8384 // appropriate for the shuffle, i.e. v4i32 for a real v4i1 predicate.
8385 return DAG.getNode(ISD::BITCAST, dl, NewVT, PredAsVector);
8386}
8387
// Lower a shuffle whose result is a vector of i1 (an MVE predicate).
// A reverse mask is handled with scalar bit operations on the predicate;
// everything else is promoted to an integer vector, shuffled there, and
// converted back with a compare against zero.
// NOTE(review): the head of the signature (listing line 8388) and the
// definition of SVN (8391) are missing from this extract; the visible call
// site passes (Op, DAG, ST).
8389 const ARMSubtarget *ST) {
8390 EVT VT = Op.getValueType();
8392 ArrayRef<int> ShuffleMask = SVN->getMask();
8393
8394 assert(ST->hasMVEIntegerOps() &&
8395 "No support for vector shuffle of boolean predicates");
8396
8397 SDValue V1 = Op.getOperand(0);
8398 SDValue V2 = Op.getOperand(1);
8399 SDLoc dl(Op);
// Reverse: view the predicate as an i32, bit-reverse all 32 bits, then shift
// right by 16 so the upper half of the reversed word lands in the low 16
// bits before casting back to the predicate type.
8400 if (isReverseMask(ShuffleMask, VT)) {
8401 SDValue cast = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, V1);
8402 SDValue rbit = DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, cast);
8403 SDValue srl = DAG.getNode(ISD::SRL, dl, MVT::i32, rbit,
8404 DAG.getConstant(16, dl, MVT::i32));
8405 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, srl);
8406 }
8407
8408 // Until we can come up with optimised cases for every single vector
8409 // shuffle in existence we have chosen the least painful strategy. This is
8410 // to essentially promote the boolean predicate to a 8-bit integer, where
8411 // each predicate represents a byte. Then we fall back on a normal integer
8412 // vector shuffle and convert the result back into a predicate vector. In
8413 // many cases the generated code might be even better than scalar code
8414 // operating on bits. Just imagine trying to shuffle 8 arbitrary 2-bit
8415 // fields in a register into 8 other arbitrary 2-bit fields!
8416 SDValue PredAsVector1 = PromoteMVEPredVector(dl, V1, VT, DAG);
8417 EVT NewVT = PredAsVector1.getValueType();
// An undef second operand is not promoted; an undef of the promoted type is
// used directly.
8418 SDValue PredAsVector2 = V2.isUndef() ? DAG.getUNDEF(NewVT)
8419 : PromoteMVEPredVector(dl, V2, VT, DAG);
8420 assert(PredAsVector2.getValueType() == NewVT &&
8421 "Expected identical vector type in expanded i1 shuffle!");
8422
8423 // Do the shuffle!
8424 SDValue Shuffled = DAG.getVectorShuffle(NewVT, dl, PredAsVector1,
8425 PredAsVector2, ShuffleMask);
8426
8427 // Now return the result of comparing the shuffled vector with zero,
8428 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1. For a v2i1
8429 // we convert to a v4i1 compare to fill in the two halves of the i64 as i32s.
8430 if (VT == MVT::v2i1) {
8431 SDValue BC = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Shuffled);
8432 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, BC,
8433 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8434 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
8435 }
8436 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Shuffled,
8437 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8438}
8439
// Try to lower a v8i16 / v8f16 / v16i8 shuffle by treating the result as four
// 32-bit quarters. Each quarter that is a whole, aligned quarter of one of
// the inputs becomes an f32 lane extract; the remaining quarters come from a
// reduced shuffle. Returns an empty SDValue when the element size is >= 32
// bits or when no quarter qualifies.
// NOTE(review): the head of the signature (listing line 8440) is missing from
// this extract; the visible call site passes (Op, ShuffleMask, DAG).
8441 ArrayRef<int> ShuffleMask,
8442 SelectionDAG &DAG) {
8443 // Attempt to lower the vector shuffle using as many whole register movs as
8444 // possible. This is useful for types smaller than 32bits, which would
8445 // often otherwise become a series of gpr movs.
8446 SDLoc dl(Op);
8447 EVT VT = Op.getValueType();
8448 if (VT.getScalarSizeInBits() >= 32)
8449 return SDValue();
8450
8451 assert((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8452 "Unexpected vector type");
8453 int NumElts = VT.getVectorNumElements();
8454 int QuarterSize = NumElts / 4;
8455 // The four final parts of the vector, as i32's
// Default-constructed (null) entries mean "not yet filled in".
8456 SDValue Parts[4];
8457
8458 // Look for full lane vmovs like <0,1,2,3> or <u,5,6,7> etc, (but not
8459 // <u,u,u,u>), returning the vmov lane index
// The returned index is ShuffleMask[...] / Length, so values 0-3 refer to
// quarters of operand 0 and 4-7 to quarters of operand 1; -1 means no match.
8460 auto getMovIdx = [](ArrayRef<int> ShuffleMask, int Start, int Length) {
8461 // Detect which mov lane this would be from the first non-undef element.
8462 int MovIdx = -1;
8463 for (int i = 0; i < Length; i++) {
8464 if (ShuffleMask[Start + i] >= 0) {
8465 if (ShuffleMask[Start + i] % Length != i)
8466 return -1;
8467 MovIdx = ShuffleMask[Start + i] / Length;
8468 break;
8469 }
8470 }
8471 // If all items are undef, leave this for other combines
8472 if (MovIdx == -1)
8473 return -1;
8474 // Check the remaining values are the correct part of the same mov
8475 for (int i = 1; i < Length; i++) {
8476 if (ShuffleMask[Start + i] >= 0 &&
8477 (ShuffleMask[Start + i] / Length != MovIdx ||
8478 ShuffleMask[Start + i] % Length != i))
8479 return -1;
8480 }
8481 return MovIdx;
8482 };
8483
8484 for (int Part = 0; Part < 4; ++Part) {
8485 // Does this part look like a mov
8486 int Elt = getMovIdx(ShuffleMask, Part * QuarterSize, QuarterSize);
8487 if (Elt != -1) {
8488 SDValue Input = Op->getOperand(0);
// Indices 4..7 select a quarter of the second operand.
8489 if (Elt >= 4) {
8490 Input = Op->getOperand(1);
8491 Elt -= 4;
8492 }
8493 SDValue BitCast = DAG.getBitcast(MVT::v4f32, Input);
8494 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, BitCast,
8495 DAG.getConstant(Elt, dl, MVT::i32));
8496 }
8497 }
8498
8499 // Nothing interesting found, just return
8500 if (!Parts[0] && !Parts[1] && !Parts[2] && !Parts[3])
8501 return SDValue();
8502
8503 // The other parts need to be built with the old shuffle vector, cast to a
8504 // v4i32 and extract_vector_elts
// In the reduced mask, quarters already covered by a mov are marked undef
// (-1); the others keep their original mask entries. (The code below casts
// the reduced shuffle to v4f32, not v4i32.)
8505 if (!Parts[0] || !Parts[1] || !Parts[2] || !Parts[3]) {
8506 SmallVector<int, 16> NewShuffleMask;
8507 for (int Part = 0; Part < 4; ++Part)
8508 for (int i = 0; i < QuarterSize; i++)
8509 NewShuffleMask.push_back(
8510 Parts[Part] ? -1 : ShuffleMask[Part * QuarterSize + i]);
8511 SDValue NewShuffle = DAG.getVectorShuffle(
8512 VT, dl, Op->getOperand(0), Op->getOperand(1), NewShuffleMask);
8513 SDValue BitCast = DAG.getBitcast(MVT::v4f32, NewShuffle);
8514
8515 for (int Part = 0; Part < 4; ++Part)
8516 if (!Parts[Part])
8517 Parts[Part] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32,
8518 BitCast, DAG.getConstant(Part, dl, MVT::i32));
8519 }
8520 // Build a vector out of the various parts and bitcast it back to the original
8521 // type.
8522 SDValue NewVec = DAG.getNode(ARMISD::BUILD_VECTOR, dl, MVT::v4f32, Parts);
8523 return DAG.getBitcast(VT, NewVec);
8524}
8525
// Try to lower a shuffle that is an identity of one input except for a single
// lane as an EXTRACT_VECTOR_ELT / INSERT_VECTOR_ELT pair. Returns an empty
// SDValue when the mask does not have that form.
// NOTE(review): the head of the signature (listing line 8526) is missing from
// this extract; the visible call site passes (Op, ShuffleMask, DAG).
8527 ArrayRef<int> ShuffleMask,
8528 SelectionDAG &DAG) {
8529 SDValue V1 = Op.getOperand(0);
8530 SDValue V2 = Op.getOperand(1);
8531 EVT VT = Op.getValueType();
8532 unsigned NumElts = VT.getVectorNumElements();
8533
8534 // A One-Off Identity mask is one that is mostly an identity mask from a
8535 // single source but contains a single element out-of-place, either from a
8536 // different vector or from another position in the same vector. As opposed to
8537 // lowering this via a ARMISD::BUILD_VECTOR we can generate an extract/insert
8538 // pair directly.
// BaseOffset is 0 to test against the first input and NumElts to test against
// the second. On success OffElement holds the position of the one
// out-of-place lane. The match also requires more than two non-undef
// entries. The VT parameter of the lambda is not used in its body.
8539 auto isOneOffIdentityMask = [](ArrayRef<int> Mask, EVT VT, int BaseOffset,
8540 int &OffElement) {
8541 OffElement = -1;
8542 int NonUndef = 0;
8543 for (int i = 0, NumMaskElts = Mask.size(); i < NumMaskElts; ++i) {
8544 if (Mask[i] == -1)
8545 continue;
8546 NonUndef++;
8547 if (Mask[i] != i + BaseOffset) {
8548 if (OffElement == -1)
8549 OffElement = i;
8550 else
8551 return false;
8552 }
8553 }
8554 return NonUndef > 2 && OffElement != -1;
8555 };
8556 int OffElement;
8557 SDValue VInput;
8558 if (isOneOffIdentityMask(ShuffleMask, VT, 0, OffElement))
8559 VInput = V1;
8560 else if (isOneOffIdentityMask(ShuffleMask, VT, NumElts, OffElement))
8561 VInput = V2;
8562 else
8563 return SDValue();
8564
8565 SDLoc dl(Op);
// i8 and i16 elements are extracted as i32; other element types are
// extracted in their own scalar type.
8566 EVT SVT = VT.getScalarType() == MVT::i8 || VT.getScalarType() == MVT::i16
8567 ? MVT::i32
8568 : VT.getScalarType();
// The out-of-place lane is read from V1 or V2 depending on which half of the
// combined index space its mask entry falls in.
8569 SDValue Elt = DAG.getNode(
8570 ISD::EXTRACT_VECTOR_ELT, dl, SVT,
8571 ShuffleMask[OffElement] < (int)NumElts ? V1 : V2,
8572 DAG.getVectorIdxConstant(ShuffleMask[OffElement] % NumElts, dl));
8573 return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, VInput, Elt,
8574 DAG.getVectorIdxConstant(OffElement % NumElts, dl));
8575}
8576
// Top-level shuffle lowering. Tries, in order: the i1 predicate path, splats
// (VDUP / VDUPLANE), VEXT, VREV, NEON two-result shuffles, MVE VMOVN, the
// concat look-through, MVE one-off and truncating masks, the perfect-shuffle
// table for 4-element vectors, a BUILD_VECTOR expansion for elements of 32
// bits or more, full reverses, VTBL for v8i8, whole-register movs on MVE, and
// a v8i16 detour for v8f16 without full FP16. Returns an empty SDValue when
// nothing applies.
// NOTE(review): the head of the signature (listing line 8577) and the
// definition of SVN (8583) are missing from this extract; presumably this is
// LowerVECTOR_SHUFFLE(Op, DAG, ST). Confirm against the full file.
8578 const ARMSubtarget *ST) {
8579 SDValue V1 = Op.getOperand(0);
8580 SDValue V2 = Op.getOperand(1);
8581 SDLoc dl(Op);
8582 EVT VT = Op.getValueType();
8584 unsigned EltSize = VT.getScalarSizeInBits();
8585
// Shuffles of MVE predicate vectors have their own lowering.
8586 if (ST->hasMVEIntegerOps() && EltSize == 1)
8587 return LowerVECTOR_SHUFFLE_i1(Op, DAG, ST);
8588
8589 // Convert shuffles that are directly supported on NEON to target-specific
8590 // DAG nodes, instead of keeping them as shuffles and matching them again
8591 // during code selection. This is more efficient and avoids the possibility
8592 // of inconsistencies between legalization and selection.
8593 // FIXME: floating-point vectors should be canonicalized to integer vectors
8594 // of the same time so that they get CSEd properly.
8595 ArrayRef<int> ShuffleMask = SVN->getMask();
8596
8597 if (EltSize <= 32) {
8598 if (SVN->isSplat()) {
8599 int Lane = SVN->getSplatIndex();
8600 // If this is undef splat, generate it via "just" vdup, if possible.
8601 if (Lane == -1) Lane = 0;
8602
8603 // Test if V1 is a SCALAR_TO_VECTOR.
8604 if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) {
8605 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8606 }
8607 // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR
8608 // (and probably will turn into a SCALAR_TO_VECTOR once legalization
8609 // reaches it).
8610 if (Lane == 0 && V1.getOpcode() == ISD::BUILD_VECTOR &&
8611 !isa<ConstantSDNode>(V1.getOperand(0))) {
8612 bool IsScalarToVector = true;
8613 for (unsigned i = 1, e = V1.getNumOperands(); i != e; ++i)
8614 if (!V1.getOperand(i).isUndef()) {
8615 IsScalarToVector = false;
8616 break;
8617 }
8618 if (IsScalarToVector)
8619 return DAG.getNode(ARMISD::VDUP, dl, VT, V1.getOperand(0));
8620 }
// General splat: duplicate the chosen lane of V1.
8621 return DAG.getNode(ARMISD::VDUPLANE, dl, VT, V1,
8622 DAG.getConstant(Lane, dl, MVT::i32));
8623 }
8624
8625 bool ReverseVEXT = false;
8626 unsigned Imm = 0;
// VEXT is only tried on NEON; ReverseVEXT asks for the operands to be
// swapped before building the node.
8627 if (ST->hasNEON() && isVEXTMask(ShuffleMask, VT, ReverseVEXT, Imm)) {
8628 if (ReverseVEXT)
8629 std::swap(V1, V2);
8630 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V2,
8631 DAG.getConstant(Imm, dl, MVT::i32));
8632 }
8633
// VREV masks are tried for block sizes 64, 32 and 16, in that order.
8634 if (isVREVMask(ShuffleMask, VT, 64))
8635 return DAG.getNode(ARMISD::VREV64, dl, VT, V1);
8636 if (isVREVMask(ShuffleMask, VT, 32))
8637 return DAG.getNode(ARMISD::VREV32, dl, VT, V1);
8638 if (isVREVMask(ShuffleMask, VT, 16))
8639 return DAG.getNode(ARMISD::VREV16, dl, VT, V1);
8640
// Single-input VEXT: V1 is used for both operands.
8641 if (ST->hasNEON() && V2->isUndef() && isSingletonVEXTMask(ShuffleMask, VT, Imm)) {
8642 return DAG.getNode(ARMISD::VEXT, dl, VT, V1, V1,
8643 DAG.getConstant(Imm, dl, MVT::i32));
8644 }
8645
8646 // Check for Neon shuffles that modify both input vectors in place.
8647 // If both results are used, i.e., if there are two shuffles with the same
8648 // source operands and with masks corresponding to both results of one of
8649 // these operations, DAG memoization will ensure that a single node is
8650 // used for both shuffles.
8651 unsigned WhichResult = 0;
8652 bool isV_UNDEF = false;
8653 if (ST->hasNEON()) {
8654 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8655 ShuffleMask, VT, WhichResult, isV_UNDEF)) {
8656 if (isV_UNDEF)
8657 V2 = V1;
8658 return DAG.getNode(ShuffleOpc, dl, DAG.getVTList(VT, VT), V1, V2)
8659 .getValue(WhichResult);
8660 }
8661 }
// MVE VMOVN forms. Note that the operand order and the trailing constant
// differ between the three cases.
8662 if (ST->hasMVEIntegerOps()) {
8663 if (isVMOVNMask(ShuffleMask, VT, false, false))
8664 return DAG.getNode(ARMISD::VMOVN, dl, VT, V2, V1,
8665 DAG.getConstant(0, dl, MVT::i32));
8666 if (isVMOVNMask(ShuffleMask, VT, true, false))
8667 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V2,
8668 DAG.getConstant(1, dl, MVT::i32));
8669 if (isVMOVNMask(ShuffleMask, VT, true, true))
8670 return DAG.getNode(ARMISD::VMOVN, dl, VT, V1, V1,
8671 DAG.getConstant(1, dl, MVT::i32));
8672 }
8673
8674 // Also check for these shuffles through CONCAT_VECTORS: we canonicalize
8675 // shuffles that produce a result larger than their operands with:
8676 // shuffle(concat(v1, undef), concat(v2, undef))
8677 // ->
8678 // shuffle(concat(v1, v2), undef)
8679 // because we can access quad vectors (see PerformVECTOR_SHUFFLECombine).
8680 //
8681 // This is useful in the general case, but there are special cases where
8682 // native shuffles produce larger results: the two-result ops.
8683 //
8684 // Look through the concat when lowering them:
8685 // shuffle(concat(v1, v2), undef)
8686 // ->
8687 // concat(VZIP(v1, v2):0, :1)
8688 //
8689 if (ST->hasNEON() && V1->getOpcode() == ISD::CONCAT_VECTORS && V2->isUndef()) {
8690 SDValue SubV1 = V1->getOperand(0);
8691 SDValue SubV2 = V1->getOperand(1);
8692 EVT SubVT = SubV1.getValueType();
8693
8694 // We expect these to have been canonicalized to -1.
8695 assert(llvm::all_of(ShuffleMask, [&](int i) {
8696 return i < (int)VT.getVectorNumElements();
8697 }) && "Unexpected shuffle index into UNDEF operand!");
8698
// The mask is matched against the sub-vector type, and both results of the
// two-result node are concatenated to form the full-width value.
8699 if (unsigned ShuffleOpc = isNEONTwoResultShuffleMask(
8700 ShuffleMask, SubVT, WhichResult, isV_UNDEF)) {
8701 if (isV_UNDEF)
8702 SubV2 = SubV1;
8703 assert((WhichResult == 0) &&
8704 "In-place shuffle of concat can only have one result!");
8705 SDValue Res = DAG.getNode(ShuffleOpc, dl, DAG.getVTList(SubVT, SubVT),
8706 SubV1, SubV2);
8707 return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Res.getValue(0),
8708 Res.getValue(1));
8709 }
8710 }
8711 }
8712
// MVE-only patterns; skipped for v8f16 when full FP16 is unavailable.
8713 if (ST->hasMVEIntegerOps() && EltSize <= 32 &&
8714 (ST->hasFullFP16() || VT != MVT::v8f16)) {
8715 if (SDValue V = LowerVECTOR_SHUFFLEUsingOneOff(Op, ShuffleMask, DAG))
8716 return V;
8717
// Truncating masks: view the inputs as vectors of double-width integers
// (half as many lanes), optionally shift each lane right by EltSize when
// Top is set, and combine them with MVETRUNC.
8718 for (bool Top : {false, true}) {
8719 for (bool SingleSource : {false, true}) {
8720 if (isTruncMask(ShuffleMask, VT, Top, SingleSource)) {
8721 MVT FromSVT = MVT::getIntegerVT(EltSize * 2);
8722 MVT FromVT = MVT::getVectorVT(FromSVT, ShuffleMask.size() / 2);
8723 SDValue Lo = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT, V1);
8724 SDValue Hi = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, FromVT,
8725 SingleSource ? V1 : V2);
8726 if (Top) {
8727 SDValue Amt = DAG.getConstant(EltSize, dl, FromVT);
8728 Lo = DAG.getNode(ISD::SRL, dl, FromVT, Lo, Amt);
8729 Hi = DAG.getNode(ISD::SRL, dl, FromVT, Hi, Amt);
8730 }
8731 return DAG.getNode(ARMISD::MVETRUNC, dl, VT, Lo, Hi);
8732 }
8733 }
8734 }
8735 }
8736
8737 // If the shuffle is not directly supported and it has 4 elements, use
8738 // the PerfectShuffle-generated table to synthesize it from other shuffles.
8739 unsigned NumElts = VT.getVectorNumElements();
8740 if (NumElts == 4) {
// Each mask entry becomes a base-9 digit: 0-7 for a lane, 8 for undef.
8741 unsigned PFIndexes[4];
8742 for (unsigned i = 0; i != 4; ++i) {
8743 if (ShuffleMask[i] < 0)
8744 PFIndexes[i] = 8;
8745 else
8746 PFIndexes[i] = ShuffleMask[i];
8747 }
8748
8749 // Compute the index in the perfect shuffle table.
8750 unsigned PFTableIndex =
8751 PFIndexes[0]*9*9*9+PFIndexes[1]*9*9+PFIndexes[2]*9+PFIndexes[3];
8752 unsigned PFEntry = PerfectShuffleTable[PFTableIndex];
// The cost is read from the top two bits of the table entry.
8753 unsigned Cost = (PFEntry >> 30);
8754
8755 if (Cost <= 4) {
8756 if (ST->hasNEON())
8757 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
// Without NEON the entry and both of its sub-entries (13-bit LHS/RHS ids)
// must pass isLegalMVEShuffleOp before the table expansion is used.
8758 else if (isLegalMVEShuffleOp(PFEntry)) {
8759 unsigned LHSID = (PFEntry >> 13) & ((1 << 13)-1);
8760 unsigned RHSID = (PFEntry >> 0) & ((1 << 13)-1);
8761 unsigned PFEntryLHS = PerfectShuffleTable[LHSID];
8762 unsigned PFEntryRHS = PerfectShuffleTable[RHSID];
8763 if (isLegalMVEShuffleOp(PFEntryLHS) && isLegalMVEShuffleOp(PFEntryRHS))
8764 return GeneratePerfectShuffle(PFEntry, V1, V2, DAG, dl);
8765 }
8766 }
8767 }
8768
8769 // Implement shuffles with 32- or 64-bit elements as ARMISD::BUILD_VECTORs.
8770 if (EltSize >= 32) {
8771 // Do the expansion with floating-point types, since that is what the VFP
8772 // registers are defined to use, and since i64 is not legal.
8773 EVT EltVT = EVT::getFloatingPointVT(EltSize);
8774 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts);
8775 V1 = DAG.getNode(ISD::BITCAST, dl, VecVT, V1);
8776 V2 = DAG.getNode(ISD::BITCAST, dl, VecVT, V2);
// NOTE(review): the declaration of Ops (listing line 8777) is missing from
// this extract.
8778 for (unsigned i = 0; i < NumElts; ++i) {
8779 if (ShuffleMask[i] < 0)
8780 Ops.push_back(DAG.getUNDEF(EltVT));
8781 else
// The lane within the selected operand is computed with & (NumElts-1),
// which equals a modulo only when NumElts is a power of two.
8782 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
8783 ShuffleMask[i] < (int)NumElts ? V1 : V2,
8784 DAG.getConstant(ShuffleMask[i] & (NumElts-1),
8785 dl, MVT::i32)));
8786 }
8787 SDValue Val = DAG.getNode(ARMISD::BUILD_VECTOR, dl, VecVT, Ops);
8788 return DAG.getNode(ISD::BITCAST, dl, VT, Val);
8789 }
8790
8791 if ((VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v16i8) &&
8792 isReverseMask(ShuffleMask, VT))
8793 return LowerReverse_VECTOR_SHUFFLE(Op, DAG);
8794
8795 if (ST->hasNEON() && VT == MVT::v8i8)
8796 if (SDValue NewOp = LowerVECTOR_SHUFFLEv8i8(Op, ShuffleMask, DAG))
8797 return NewOp;
8798
8799 if (ST->hasMVEIntegerOps())
8800 if (SDValue NewOp = LowerVECTOR_SHUFFLEUsingMovs(Op, ShuffleMask, DAG))
8801 return NewOp;
8802
8803 // Lower v8f16 via v8i16 to avoid invalid f16 nodes.
8804 if (VT == MVT::v8f16 && !ST->hasFullFP16()) {
8805 SDValue BC0 =
8806 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v8i16, Op.getOperand(0));
8807 SDValue BC1 =
8808 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v8i16, Op.getOperand(1));
8809 SDValue Shuf = DAG.getVectorShuffle(MVT::v8i16, dl, BC0, BC1, ShuffleMask);
8810 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuf);
8811 }
8812
// No custom lowering applies.
8813 return SDValue();
8814}
8815
// Insert a single i1 element into an MVE predicate vector using integer bit
// operations: the predicate is viewed as an i32, the bits of the target lane
// are replaced with a BFI of the sign-extended element, and the result is
// cast back to the predicate type.
// NOTE(review): the head of the signature (listing line 8816) and the
// initialiser of LaneWidth (8828) are missing from this extract; the visible
// call site passes (Op, DAG, Subtarget). VecVT is not referenced on any
// visible line, so it is presumably used by the missing initialiser.
8817 const ARMSubtarget *ST) {
8818 EVT VecVT = Op.getOperand(0).getValueType();
8819 SDLoc dl(Op);
8820
8821 assert(ST->hasMVEIntegerOps() &&
8822 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8823
8824 SDValue Conv =
8825 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
// The caller only dispatches here for a constant lane index.
8826 unsigned Lane = Op.getConstantOperandVal(2);
8827 unsigned LaneWidth =
// Mask has LaneWidth consecutive bits set, starting at bit Lane * LaneWidth.
8829 unsigned Mask = ((1 << LaneWidth) - 1) << Lane * LaneWidth;
// Sign-extending from i1 yields all-ones or all-zeroes.
8830 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i32,
8831 Op.getOperand(1), DAG.getValueType(MVT::i1));
// The BFI node receives the inverted mask as its last operand.
8832 SDValue BFI = DAG.getNode(ARMISD::BFI, dl, MVT::i32, Conv, Ext,
8833 DAG.getConstant(~Mask, dl, MVT::i32));
8834 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, Op.getValueType(), BFI);
8835}
8836
// Custom lowering for INSERT_VECTOR_ELT.
//  - Non-constant lane index: returns an empty SDValue.
//  - i1 element vectors on MVE: delegated to LowerINSERT_VECTOR_ELT_i1.
//  - Element types matching the type-action check below: the insert is redone
//    on same-width integer types and bitcast back.
//  - Otherwise Op is returned unchanged.
// NOTE(review): the right-hand sides of the two getTypeAction comparisons
// (listing lines 8852 and 8862) are missing from this extract.
8837SDValue ARMTargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
8838 SelectionDAG &DAG) const {
8839 // INSERT_VECTOR_ELT is legal only for immediate indexes.
8840 SDValue Lane = Op.getOperand(2);
8841 if (!isa<ConstantSDNode>(Lane))
8842 return SDValue();
8843
8844 SDValue Elt = Op.getOperand(1);
8845 EVT EltVT = Elt.getValueType();
8846
8847 if (Subtarget->hasMVEIntegerOps() &&
8848 Op.getValueType().getScalarSizeInBits() == 1)
8849 return LowerINSERT_VECTOR_ELT_i1(Op, DAG, Subtarget);
8850
8851 if (getTypeAction(*DAG.getContext(), EltVT) ==
8853 // INSERT_VECTOR_ELT doesn't want f16 operands promoting to f32,
8854 // but the type system will try to do that if we don't intervene.
8855 // Reinterpret any such vector-element insertion as one with the
8856 // corresponding integer types.
8857
8858 SDLoc dl(Op);
8859
// Integer type with the same bit width as the element.
8860 EVT IEltVT = MVT::getIntegerVT(EltVT.getScalarSizeInBits());
8861 assert(getTypeAction(*DAG.getContext(), IEltVT) !=
8863
8864 SDValue VecIn = Op.getOperand(0);
8865 EVT VecVT = VecIn.getValueType();
8866 EVT IVecVT = EVT::getVectorVT(*DAG.getContext(), IEltVT,
8867 VecVT.getVectorNumElements());
8868
// Bitcast element and vector to integer types, insert, and bitcast back.
8869 SDValue IElt = DAG.getNode(ISD::BITCAST, dl, IEltVT, Elt);
8870 SDValue IVecIn = DAG.getNode(ISD::BITCAST, dl, IVecVT, VecIn);
8871 SDValue IVecOut = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, IVecVT,
8872 IVecIn, IElt, Lane);
8873 return DAG.getNode(ISD::BITCAST, dl, VecVT, IVecOut);
8874 }
8875
8876 return Op;
8877}
8878
// Extract one element from an MVE predicate vector: the predicate is viewed
// as an i32 and shifted right by Lane * LaneWidth. The shifted i32 is
// returned as-is; no masking of higher bits is done here.
// NOTE(review): the head of the signature (listing line 8879) and the
// initialiser of LaneWidth (8891) are missing from this extract; the visible
// call site passes (Op, DAG, ST). VecVT is not referenced on any visible
// line, so it is presumably used by the missing initialiser.
8880 const ARMSubtarget *ST) {
8881 EVT VecVT = Op.getOperand(0).getValueType();
8882 SDLoc dl(Op);
8883
// NOTE(review): the message below names LowerINSERT_VECTOR_ELT_i1 although
// this is the extract path. It looks copy-pasted from the insert helper.
8884 assert(ST->hasMVEIntegerOps() &&
8885 "LowerINSERT_VECTOR_ELT_i1 called without MVE!");
8886
8887 SDValue Conv =
8888 DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Op->getOperand(0));
// The caller only dispatches here for a constant lane index.
8889 unsigned Lane = Op.getConstantOperandVal(1);
8890 unsigned LaneWidth =
8892 SDValue Shift = DAG.getNode(ISD::SRL, dl, MVT::i32, Conv,
8893 DAG.getConstant(Lane * LaneWidth, dl, MVT::i32));
8894 return Shift;
8895}
8896
// Custom lowering for EXTRACT_VECTOR_ELT.
//  - Non-constant lane index: returns an empty SDValue.
//  - i1 element vectors on MVE: delegated to LowerEXTRACT_VECTOR_ELT_i1.
//  - i32 result from a vector with elements narrower than 32 bits: emitted as
//    ARMISD::VGETLANEu.
//  - Otherwise Op is returned unchanged.
// NOTE(review): the head of the signature (listing line 8897) is missing from
// this extract; presumably this is LowerEXTRACT_VECTOR_ELT(Op, DAG, ST).
8898 const ARMSubtarget *ST) {
8899 // EXTRACT_VECTOR_ELT is legal only for immediate indexes.
8900 SDValue Lane = Op.getOperand(1);
8901 if (!isa<ConstantSDNode>(Lane))
8902 return SDValue();
8903
8904 SDValue Vec = Op.getOperand(0);
8905 EVT VT = Vec.getValueType();
8906
8907 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
8908 return LowerEXTRACT_VECTOR_ELT_i1(Op, DAG, ST);
8909
8910 if (Op.getValueType() == MVT::i32 && Vec.getScalarValueSizeInBits() < 32) {
8911 SDLoc dl(Op);
8912 return DAG.getNode(ARMISD::VGETLANEu, dl, MVT::i32, Vec, Lane);
8913 }
8914
8915 return Op;
8916}
8917
// Lower CONCAT_VECTORS of MVE predicate (i1) vectors. Operands are combined
// pairwise until one value remains. Each pair is promoted to integer vectors,
// merged into one integer vector of twice the lane count, and compared
// against zero to produce the wider predicate.
// NOTE(review): the head of the signature (listing line 8918) and the
// initialiser of ElType (8942) are missing from this extract; the visible
// call site passes (Op, DAG, ST).
8919 const ARMSubtarget *ST) {
8920 SDLoc dl(Op);
8921 assert(Op.getValueType().getScalarSizeInBits() == 1 &&
8922 "Unexpected custom CONCAT_VECTORS lowering");
8923 assert(isPowerOf2_32(Op.getNumOperands()) &&
8924 "Unexpected custom CONCAT_VECTORS lowering");
8925 assert(ST->hasMVEIntegerOps() &&
8926 "CONCAT_VECTORS lowering only supported for MVE");
8927
// Concatenate two predicate vectors of the same type (v2i1, v4i1 or v8i1)
// into one predicate with twice as many lanes.
8928 auto ConcatPair = [&](SDValue V1, SDValue V2) {
8929 EVT Op1VT = V1.getValueType();
8930 EVT Op2VT = V2.getValueType();
8931 assert(Op1VT == Op2VT && "Operand types don't match!");
8932 assert((Op1VT == MVT::v2i1 || Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) &&
8933 "Unexpected i1 concat operations!");
8934 EVT VT = Op1VT.getDoubleNumVectorElementsVT(*DAG.getContext());
8935
8936 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
8937 SDValue NewV2 = PromoteMVEPredVector(dl, V2, Op2VT, DAG);
8938
8939 // We now have Op1 + Op2 promoted to vectors of integers, where v8i1 gets
8940 // promoted to v8i16, etc.
8941 MVT ElType =
8943 unsigned NumElts = 2 * Op1VT.getVectorNumElements();
8944
8945 EVT ConcatVT = MVT::getVectorVT(ElType, NumElts);
8946 if (Op1VT == MVT::v4i1 || Op1VT == MVT::v8i1) {
8947 // Use MVETRUNC to truncate the combined NewV1::NewV2 into the smaller
8948 // ConcatVT.
8949 SDValue ConVec =
8950 DAG.getNode(ARMISD::MVETRUNC, dl, ConcatVT, NewV1, NewV2);
8951 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8952 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8953 }
8954
// Only v2i1 operands reach this point (the other accepted types returned
// above).
8955 // Extract the vector elements from Op1 and Op2 one by one and truncate them
8956 // to be the right size for the destination. For example, if Op1 is v4i1
8957 // then the promoted vector is v4i32. The result of concatenation gives a
8958 // v8i1, which when promoted is v8i16. That means each i32 element from Op1
8959 // needs truncating to i16 and inserting in the result.
// j is the running insert position in ConVec and is shared by both calls.
8960 auto ExtractInto = [&DAG, &dl](SDValue NewV, SDValue ConVec, unsigned &j) {
8961 EVT NewVT = NewV.getValueType();
8962 EVT ConcatVT = ConVec.getValueType();
8963 unsigned ExtScale = 1;
// A v2f64 input is re-viewed as v4i32 and read at every second i32 lane.
8964 if (NewVT == MVT::v2f64) {
8965 NewV = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, NewV);
8966 ExtScale = 2;
8967 }
8968 for (unsigned i = 0, e = NewVT.getVectorNumElements(); i < e; i++, j++) {
8969 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV,
8970 DAG.getIntPtrConstant(i * ExtScale, dl));
8971 ConVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ConcatVT, ConVec, Elt,
8972 DAG.getConstant(j, dl, MVT::i32));
8973 }
8974 return ConVec;
8975 };
8976 unsigned j = 0;
8977 SDValue ConVec = DAG.getNode(ISD::UNDEF, dl, ConcatVT);
8978 ConVec = ExtractInto(NewV1, ConVec, j);
8979 ConVec = ExtractInto(NewV2, ConVec, j);
8980
8981 // Now return the result of comparing the subvector with zero, which will
8982 // generate a real predicate, i.e. v4i1, v8i1 or v16i1.
8983 return DAG.getNode(ARMISD::VCMPZ, dl, VT, ConVec,
8984 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
8985 };
8986
8987 // Concat each pair of subvectors and pack into the lower half of the array.
// Each pass halves the number of entries; the power-of-two operand count
// asserted above guarantees I + 1 is always in range.
8988 SmallVector<SDValue> ConcatOps(Op->ops());
8989 while (ConcatOps.size() > 1) {
8990 for (unsigned I = 0, E = ConcatOps.size(); I != E; I += 2) {
8991 SDValue V1 = ConcatOps[I];
8992 SDValue V2 = ConcatOps[I + 1];
8993 ConcatOps[I / 2] = ConcatPair(V1, V2);
8994 }
8995 ConcatOps.resize(ConcatOps.size() / 2);
8996 }
8997 return ConcatOps[0];
8998}
8999
// Custom lowering for CONCAT_VECTORS. Predicate (i1) vectors on MVE go to
// LowerCONCAT_VECTORS_i1. Otherwise the two operands are bitcast to f64 and
// inserted into lanes 0 and 1 of a v2f64, which is bitcast to the result
// type. Undef operands are skipped, leaving that lane undef.
// NOTE(review): the head of the signature (listing line 9000) is missing from
// this extract; presumably this is LowerCONCAT_VECTORS(Op, DAG, ST).
9001 const ARMSubtarget *ST) {
9002 EVT VT = Op->getValueType(0);
9003 if (ST->hasMVEIntegerOps() && VT.getScalarSizeInBits() == 1)
9004 return LowerCONCAT_VECTORS_i1(Op, DAG, ST);
9005
9006 // The only time a CONCAT_VECTORS operation can have legal types is when
9007 // two 64-bit vectors are concatenated to a 128-bit vector.
9008 assert(Op.getValueType().is128BitVector() && Op.getNumOperands() == 2 &&
9009 "unexpected CONCAT_VECTORS");
9010 SDLoc dl(Op);
9011 SDValue Val = DAG.getUNDEF(MVT::v2f64);
9012 SDValue Op0 = Op.getOperand(0);
9013 SDValue Op1 = Op.getOperand(1);
9014 if (!Op0.isUndef())
9015 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9016 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op0),
9017 DAG.getIntPtrConstant(0, dl));
9018 if (!Op1.isUndef())
9019 Val = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, Val,
9020 DAG.getNode(ISD::BITCAST, dl, MVT::f64, Op1),
9021 DAG.getIntPtrConstant(1, dl));
9022 return DAG.getNode(ISD::BITCAST, dl, Op.getValueType(), Val);
9023}
9024
// Lower EXTRACT_SUBVECTOR for MVE predicate (i1) vectors. The source
// predicate is promoted to an integer vector, the requested lanes are copied
// one at a time into a new integer vector, and that vector is compared
// against zero to form the result predicate.
// NOTE(review): the head of the signature (listing line 9025) and the
// definition of ElType (9045) are missing from this extract; presumably this
// is LowerEXTRACT_SUBVECTOR(Op, DAG, ST).
9026 const ARMSubtarget *ST) {
9027 SDValue V1 = Op.getOperand(0);
9028 SDValue V2 = Op.getOperand(1);
9029 SDLoc dl(Op);
9030 EVT VT = Op.getValueType();
9031 EVT Op1VT = V1.getValueType();
9032 unsigned NumElts = VT.getVectorNumElements();
// Operand 1 is the starting lane, read as a constant.
9033 unsigned Index = V2->getAsZExtVal();
9034
9035 assert(VT.getScalarSizeInBits() == 1 &&
9036 "Unexpected custom EXTRACT_SUBVECTOR lowering");
9037 assert(ST->hasMVEIntegerOps() &&
9038 "EXTRACT_SUBVECTOR lowering only supported for MVE");
9039
9040 SDValue NewV1 = PromoteMVEPredVector(dl, V1, Op1VT, DAG);
9041
9042 // We now have Op1 promoted to a vector of integers, where v8i1 gets
9043 // promoted to v8i16, etc.
9044
9046
// Two-lane result: each extracted element is written to two adjacent i32
// lanes of a v4i32, compared as v4i1, then cast to v2i1.
9047 if (NumElts == 2) {
9048 EVT SubVT = MVT::v4i32;
9049 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9050 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j += 2) {
9051 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9052 DAG.getIntPtrConstant(i, dl));
9053 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9054 DAG.getConstant(j, dl, MVT::i32));
9055 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9056 DAG.getConstant(j + 1, dl, MVT::i32));
9057 }
9058 SDValue Cmp = DAG.getNode(ARMISD::VCMPZ, dl, MVT::v4i1, SubVec,
9059 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9060 return DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v2i1, Cmp);
9061 }
9062
// General case: copy lanes Index .. Index+NumElts-1 into lanes 0 .. NumElts-1.
9063 EVT SubVT = MVT::getVectorVT(ElType, NumElts);
9064 SDValue SubVec = DAG.getNode(ISD::UNDEF, dl, SubVT);
9065 for (unsigned i = Index, j = 0; i < (Index + NumElts); i++, j++) {
9066 SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, NewV1,
9067 DAG.getIntPtrConstant(i, dl));
9068 SubVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, SubVT, SubVec, Elt,
9069 DAG.getConstant(j, dl, MVT::i32));
9070 }
9071
9072 // Now return the result of comparing the subvector with zero,
9073 // which will generate a real predicate, i.e. v4i1, v8i1 or v16i1.
9074 return DAG.getNode(ARMISD::VCMPZ, dl, VT, SubVec,
9075 DAG.getConstant(ARMCC::NE, dl, MVT::i32));
9076}
9077
9078// Turn a truncate into a predicate (an i1 vector) into icmp(and(x, 1), 0).
// Only bit 0 of each source lane decides the result: the source is ANDed with
// 1 and compared not-equal to zero. Requires MVE and a v16i1 / v8i1 / v4i1
// result type (both asserted).
// NOTE(review): the head of the signature (listing line 9079) is missing from
// this extract; the visible call site passes (N, DAG, Subtarget).
9080 const ARMSubtarget *ST) {
9081 assert(ST->hasMVEIntegerOps() && "Expected MVE!");
9082 EVT VT = N->getValueType(0);
9083 assert((VT == MVT::v16i1 || VT == MVT::v8i1 || VT == MVT::v4i1) &&
9084 "Expected a vector i1 type!");
9085 SDValue Op = N->getOperand(0);
9086 EVT FromVT = Op.getValueType();
9087 SDLoc DL(N);
9088
9089 SDValue And =
9090 DAG.getNode(ISD::AND, DL, FromVT, Op, DAG.getConstant(1, DL, FromVT));
9091 return DAG.getNode(ISD::SETCC, DL, VT, And, DAG.getConstant(0, DL, FromVT),
9092 DAG.getCondCode(ISD::SETNE));
9093}
9094
9096 const ARMSubtarget *Subtarget) {
9097 if (!Subtarget->hasMVEIntegerOps())
9098 return SDValue();
9099
9100 EVT ToVT = N->getValueType(0);
9101 if (ToVT.getScalarType() == MVT::i1)
9102 return LowerTruncatei1(N, DAG, Subtarget);
9103
9104 // MVE does not have a single instruction to perform the truncation of a v4i32
9105 // into the lower half of a v8i16, in the same way that a NEON vmovn would.
9106 // Most of the instructions in MVE follow the 'Beats' system, where moving
9107 // values from different lanes is usually something that the instructions
9108 // avoid.
9109 //
9110 // Instead it has top/bottom instructions such as VMOVLT/B and VMOVNT/B,
9111 // which take a the top/bottom half of a larger lane and extend it (or do the
9112 // opposite, truncating into the top/bottom lane from a larger lane). Note
9113 // that because of the way we widen lanes, a v4i16 is really a v4i32 using the
9114 // bottom 16bits from each vector lane. This works really well with T/B
9115 // instructions, but that doesn't extend to v8i32->v8i16 where the lanes need
9116 // to move order.
9117 //
9118 // But truncates and sext/zext are always going to be fairly common from llvm.
9119 // We have several options for how to deal with them:
9120 // - Wherever possible combine them into an instruction that makes them
9121 // "free". This includes loads/stores, which can perform the trunc as part
9122 // of the memory operation. Or certain shuffles that can be turned into
9123 // VMOVN/VMOVL.
9124 // - Lane Interleaving to transform blocks surrounded by ext/trunc. So
9125 // trunc(mul(sext(a), sext(b))) may become
9126 // VMOVNT(VMUL(VMOVLB(a), VMOVLB(b)), VMUL(VMOVLT(a), VMOVLT(b))). (Which in
9127 // this case can use VMULL). This is performed in the
9128 // MVELaneInterleavingPass.
9129 // - Otherwise we have an option. By default we would expand the
9130 // zext/sext/trunc into a series of lane extract/inserts going via GPR
9131 // registers. One for each vector lane in the vector. This can obviously be
9132 // very expensive.
9133 // - The other option is to use the fact that loads/store can extend/truncate
9134 // to turn a trunc into two truncating stack stores and a stack reload. This
9135 // becomes 3 back-to-back memory operations, but at least that is less than
9136 // all the insert/extracts.
9137 //
9138 // In order to do the last, we convert certain trunc's into MVETRUNC, which
9139 // are either optimized where they can be, or eventually lowered into stack
9140 // stores/loads. This prevents us from splitting a v8i16 trunc into two stores
9141 // too early, where other instructions would be better, and stops us from
9142 // having to reconstruct multiple buildvector shuffles into loads/stores.
9143 if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
9144 return SDValue();
9145 EVT FromVT = N->getOperand(0).getValueType();
9146 if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
9147 return SDValue();
9148
9149 SDValue Lo, Hi;
9150 std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
9151 SDLoc DL(N);
9152 return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
9153}
9154
// Lowers a vector extend node N (its own opcode is reused below) into a
// two-result node whose halves are concatenated back into the wide result
// type. Returns an empty SDValue when the subtarget lacks MVE integer ops or
// when the source/destination types are not among those handled here.
// NOTE(review): the first line of the signature (embedded line 9155) is
// missing from this listing, so the function name, return type and leading
// parameters are not visible here.
9156 const ARMSubtarget *Subtarget) {
9157 if (!Subtarget->hasMVEIntegerOps())
9158 return SDValue();
9159
9160 // See LowerTruncate above for an explanation of MVEEXT/MVETRUNC.
9161
9162 EVT ToVT = N->getValueType(0);
// Only these three wide destination types are handled.
9163 if (ToVT != MVT::v16i32 && ToVT != MVT::v8i32 && ToVT != MVT::v16i16)
9164 return SDValue();
9165 SDValue Op = N->getOperand(0);
9166 EVT FromVT = Op.getValueType();
// ...and only from these two source types.
9167 if (FromVT != MVT::v8i16 && FromVT != MVT::v16i8)
9168 return SDValue();
9169
9170 SDLoc DL(N);
// Each result of the split node normally has half the elements of ToVT.
9171 EVT ExtVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
// An i8 -> i32 extend goes via v8i16 halves first; they are widened again
// further down.
9172 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8)
9173 ExtVT = MVT::v8i16;
9174
9175 unsigned Opcode =
// NOTE(review): the initializer of Opcode (embedded line 9176) is missing
// from this listing.
9177 SDValue Ext = DAG.getNode(Opcode, DL, DAG.getVTList(ExtVT, ExtVT), Op);
// Second result of the same two-result node.
9178 SDValue Ext1 = Ext.getValue(1);
9179
// For i8 -> i32, extend both v8i16 halves to v8i32 using N's own opcode.
9180 if (ToVT.getScalarType() == MVT::i32 && FromVT.getScalarType() == MVT::i8) {
9181 Ext = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext);
9182 Ext1 = DAG.getNode(N->getOpcode(), DL, MVT::v8i32, Ext1);
9183 }
9184
// Glue the two halves back together into the requested result type.
9185 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Ext, Ext1);
9186}
9187
9188/// isExtendedBUILD_VECTOR - Check if N is a constant BUILD_VECTOR where each
9189/// element has been zero/sign-extended, depending on the isSigned parameter,
9190/// from an integer type half its size.
// NOTE(review): the first line of the signature (embedded line 9191) is
// missing from this listing; callers visible in this file pass (N, DAG, bool).
9192 bool isSigned) {
9193 // A v2i64 BUILD_VECTOR will have been legalized to a BITCAST from v4i32.
9194 EVT VT = N->getValueType(0);
9195 if (VT == MVT::v2i64 && N->getOpcode() == ISD::BITCAST) {
9196 SDNode *BVN = N->getOperand(0).getNode();
9197 if (BVN->getValueType(0) != MVT::v4i32 ||
9198 BVN->getOpcode() != ISD::BUILD_VECTOR)
9199 return false;
// Which i32 of each pair is the low word depends on endianness.
9200 unsigned LoElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9201 unsigned HiElt = 1 - LoElt;
// NOTE(review): the definitions of Lo0/Hi0/Lo1/Hi1 (embedded lines 9202-9205)
// are missing from this listing; the null checks below indicate they are
// pointers that may be null.
9206 if (!Lo0 || !Hi0 || !Lo1 || !Hi1)
9207 return false;
9208 if (isSigned) {
// Signed: each high word must equal its low word shifted right by 32.
9209 if (Hi0->getSExtValue() == Lo0->getSExtValue() >> 32 &&
9210 Hi1->getSExtValue() == Lo1->getSExtValue() >> 32)
9211 return true;
9212 } else {
// Unsigned: both high words must be zero.
9213 if (Hi0->isZero() && Hi1->isZero())
9214 return true;
9215 }
9216 return false;
9217 }
9218
9219 if (N->getOpcode() != ISD::BUILD_VECTOR)
9220 return false;
9221
// Every operand must fit in half the element width, interpreted as signed
// or unsigned according to isSigned.
9222 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
9223 SDNode *Elt = N->getOperand(i).getNode();
// NOTE(review): embedded line 9224 is missing from this listing; it must
// introduce C and open the scope that is closed at 9235.
9225 unsigned EltSize = VT.getScalarSizeInBits();
9226 unsigned HalfSize = EltSize / 2;
9227 if (isSigned) {
9228 if (!isIntN(HalfSize, C->getSExtValue()))
9229 return false;
9230 } else {
9231 if (!isUIntN(HalfSize, C->getZExtValue()))
9232 return false;
9233 }
9234 continue;
9235 }
// Reached when the scope above was not entered for this operand.
9236 return false;
9237 }
9238
9239 return true;
9240}
9241
9242/// isSignExtended - Check if a node is a vector value that is sign-extended
9243/// or a constant BUILD_VECTOR with sign-extended elements.
// NOTE(review): the signature line (embedded line 9244) is missing from this
// listing; callers visible in this file pass (SDNode *, SelectionDAG &).
// True for a SIGN_EXTEND node or a sign-extending load.
9245 if (N->getOpcode() == ISD::SIGN_EXTEND || ISD::isSEXTLoad(N))
9246 return true;
// Otherwise accept a BUILD_VECTOR that passes the signed half-width check.
9247 if (isExtendedBUILD_VECTOR(N, DAG, true))
9248 return true;
9249 return false;
9250}
9251
9252/// isZeroExtended - Check if a node is a vector value that is zero-extended (or
9253/// any-extended) or a constant BUILD_VECTOR with zero-extended elements.
// NOTE(review): the signature line (embedded line 9254) is missing from this
// listing; callers visible in this file pass (SDNode *, SelectionDAG &).
// ZERO_EXTEND and ANY_EXTEND nodes both qualify.
9255 if (N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND ||
// NOTE(review): the final operand of this condition (embedded line 9256) is
// missing from this listing.
9257 return true;
// Otherwise accept a BUILD_VECTOR that passes the unsigned half-width check.
9258 if (isExtendedBUILD_VECTOR(N, DAG, false))
9259 return true;
9260 return false;
9261}
9262
9263static EVT getExtensionTo64Bits(const EVT &OrigVT) {
9264 if (OrigVT.getSizeInBits() >= 64)
9265 return OrigVT;
9266
9267 assert(OrigVT.isSimple() && "Expecting a simple value type");
9268
9269 MVT::SimpleValueType OrigSimpleTy = OrigVT.getSimpleVT().SimpleTy;
9270 switch (OrigSimpleTy) {
9271 default: llvm_unreachable("Unexpected Vector Type");
9272 case MVT::v2i8:
9273 case MVT::v2i16:
9274 return MVT::v2i32;
9275 case MVT::v4i8:
9276 return MVT::v4i16;
9277 }
9278}
9279
9280/// AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total
9281/// value size to 64 bits. We need a 64-bit D register as an operand to VMULL.
9282/// We insert the required extension here to get the vector to fill a D register.
// NOTE(review): the first line of the signature (embedded line 9283) is
// missing from this listing; the visible call site passes an SDValue and the
// SelectionDAG ahead of the three parameters shown below.
9284 const EVT &OrigTy,
9285 const EVT &ExtTy,
9286 unsigned ExtOpcode) {
9287 // The vector originally had a size of OrigTy. It was then extended to ExtTy.
9288 // We expect the ExtTy to be 128-bits total. If the OrigTy is less than
9289 // 64-bits we need to insert a new extension so that it will be 64-bits.
9290 assert(ExtTy.is128BitVector() && "Unexpected extension size");
// Already 64 bits or more: N is returned as-is.
9291 if (OrigTy.getSizeInBits() >= 64)
9292 return N;
9293
9294 // Must extend size to at least 64 bits to be used as an operand for VMULL.
9295 EVT NewVT = getExtensionTo64Bits(OrigTy);
9296
// Re-extend with the caller-supplied opcode, but only up to NewVT.
9297 return DAG.getNode(ExtOpcode, SDLoc(N), NewVT, N);
9298}
9299
9300/// SkipLoadExtensionForVMULL - return a load of the original vector size that
9301/// does not do any sign/zero extension. If the original vector is less
9302/// than 64 bits, an appropriate extension will be added after the load to
9303/// reach a total size of 64 bits. We have to add the extension separately
9304/// because ARM does not have a sign/zero extending load for vectors.
// NOTE(review): the signature line (embedded line 9305) is missing from this
// listing; the visible call site passes (LoadSDNode *, SelectionDAG &).
9306 EVT ExtendedTy = getExtensionTo64Bits(LD->getMemoryVT());
9307
9308 // The load already has the right type.
9309 if (ExtendedTy == LD->getMemoryVT())
9310 return DAG.getLoad(LD->getMemoryVT(), SDLoc(LD), LD->getChain(),
9311 LD->getBasePtr(), LD->getPointerInfo(), LD->getAlign(),
9312 LD->getMemOperand()->getFlags());
9313
9314 // We need to create a zextload/sextload. We cannot just create a load
9315 // followed by a zext/sext node because LowerMUL is also run during normal
9316 // operation legalization where we can't create illegal types.
// The new extending load keeps LD's extension kind, chain, address, memory
// type, alignment and flags; only the result type changes to ExtendedTy.
9317 return DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD), ExtendedTy,
9318 LD->getChain(), LD->getBasePtr(), LD->getPointerInfo(),
9319 LD->getMemoryVT(), LD->getAlign(),
9320 LD->getMemOperand()->getFlags());
9321}
9322
9323/// SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND,
9324/// ANY_EXTEND, extending load, or BUILD_VECTOR with extended elements, return
9325/// the unextended value. The unextended vector should be 64 bits so that it can
9326/// be used as an operand to a VMULL instruction. If the original vector size
9327/// before extension is less than 64 bits we add an extension to resize
9328/// the vector to 64 bits.
// NOTE(review): the signature line (embedded line 9329) is missing from this
// listing; visible call sites pass (SDNode *, SelectionDAG &).
// Case 1: explicit extend node. Peel it and re-extend only as far as needed.
9330 if (N->getOpcode() == ISD::SIGN_EXTEND ||
9331 N->getOpcode() == ISD::ZERO_EXTEND || N->getOpcode() == ISD::ANY_EXTEND)
9332 return AddRequiredExtensionForVMULL(N->getOperand(0), DAG,
9333 N->getOperand(0)->getValueType(0),
9334 N->getValueType(0),
9335 N->getOpcode());
9336
// Case 2: extending load. Replace it with a narrower load plus an explicit
// extend for its other users.
9337 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
9338 assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
9339 "Expected extending load");
9340
9341 SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
// Redirect users of the old load's chain result to the new load's chain.
9342 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
9343 unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
9344 SDValue extLoad =
9345 DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
// Users of the old load's value now read the explicitly extended new load.
9346 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
9347
9348 return newLoad;
9349 }
9350
9351 // Otherwise, the value must be a BUILD_VECTOR. For v2i64, it will
9352 // have been legalized as a BITCAST from v4i32.
9353 if (N->getOpcode() == ISD::BITCAST) {
9354 SDNode *BVN = N->getOperand(0).getNode();
// NOTE(review): the opening of this assertion (embedded line 9355) is
// missing from this listing.
9356 BVN->getValueType(0) == MVT::v4i32 && "expected v4i32 BUILD_VECTOR");
// Keep only the low i32 of each 64-bit element (index depends on endianness).
9357 unsigned LowElt = DAG.getDataLayout().isBigEndian() ? 1 : 0;
9358 return DAG.getBuildVector(
9359 MVT::v2i32, SDLoc(N),
9360 {BVN->getOperand(LowElt), BVN->getOperand(LowElt + 2)});
9361 }
9362 // Construct a new BUILD_VECTOR with elements truncated to half the size.
9363 assert(N->getOpcode() == ISD::BUILD_VECTOR && "expected BUILD_VECTOR");
9364 EVT VT = N->getValueType(0);
9365 unsigned EltSize = VT.getScalarSizeInBits() / 2;
9366 unsigned NumElts = VT.getVectorNumElements();
9367 MVT TruncVT = MVT::getIntegerVT(EltSize);
// NOTE(review): the declaration of Ops (embedded line 9368) is missing from
// this listing.
9369 SDLoc dl(N);
9370 for (unsigned i = 0; i != NumElts; ++i) {
9371 const APInt &CInt = N->getConstantOperandAPInt(i);
9372 // Element types smaller than 32 bits are not legal, so use i32 elements.
9373 // The values are implicitly truncated so sext vs. zext doesn't matter.
9374 Ops.push_back(DAG.getConstant(CInt.zextOrTrunc(32), dl, MVT::i32));
9375 }
9376 return DAG.getBuildVector(MVT::getVectorVT(TruncVT, NumElts), dl, Ops);
9377}
9378
9379static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
9380 unsigned Opcode = N->getOpcode();
9381 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9382 SDNode *N0 = N->getOperand(0).getNode();
9383 SDNode *N1 = N->getOperand(1).getNode();
9384 return N0->hasOneUse() && N1->hasOneUse() &&
9385 isSignExtended(N0, DAG) && isSignExtended(N1, DAG);
9386 }
9387 return false;
9388}
9389
9390static bool isAddSubZExt(SDNode *N, SelectionDAG &DAG) {
9391 unsigned Opcode = N->getOpcode();
9392 if (Opcode == ISD::ADD || Opcode == ISD::SUB) {
9393 SDNode *N0 = N->getOperand(0).getNode();
9394 SDNode *N1 = N->getOperand(1).getNode();
9395 return N0->hasOneUse() && N1->hasOneUse() &&
9396 isZeroExtended(N0, DAG) && isZeroExtended(N1, DAG);
9397 }
9398 return false;
9399}
9400
// Custom lowering for a 128-bit integer vector multiply. When both operands
// are extended values it emits a VMULLs/VMULLu node on the unextended
// operands; it also rewrites (ext A +/- ext B) * ext C into two such nodes
// combined by the original add/sub. Otherwise it returns Op unchanged, or an
// empty SDValue for v2i64.
// NOTE(review): the signature line (embedded line 9401) is missing from this
// listing, so the function name and parameter list are not visible here.
9402 // Multiplications are only custom-lowered for 128-bit vectors so that
9403 // VMULL can be detected. Otherwise v2i64 multiplications are not legal.
9404 EVT VT = Op.getValueType();
9405 assert(VT.is128BitVector() && VT.isInteger() &&
9406 "unexpected type for custom-lowering ISD::MUL");
9407 SDNode *N0 = Op.getOperand(0).getNode();
9408 SDNode *N1 = Op.getOperand(1).getNode();
// NewOpc stays 0 until a VMULL form has been selected.
9409 unsigned NewOpc = 0;
9410 bool isMLA = false;
9411 bool isN0SExt = isSignExtended(N0, DAG);
9412 bool isN1SExt = isSignExtended(N1, DAG);
9413 if (isN0SExt && isN1SExt)
9414 NewOpc = ARMISD::VMULLs;
9415 else {
9416 bool isN0ZExt = isZeroExtended(N0, DAG);
9417 bool isN1ZExt = isZeroExtended(N1, DAG);
9418 if (isN0ZExt && isN1ZExt)
9419 NewOpc = ARMISD::VMULLu;
9420 else if (isN1SExt || isN1ZExt) {
9421 // Look for (s/zext A + s/zext B) * (s/zext C). We want to turn these
9422 // into (s/zext A * s/zext C) + (s/zext B * s/zext C)
9423 if (isN1SExt && isAddSubSExt(N0, DAG)) {
9424 NewOpc = ARMISD::VMULLs;
9425 isMLA = true;
9426 } else if (isN1ZExt && isAddSubZExt(N0, DAG)) {
9427 NewOpc = ARMISD::VMULLu;
9428 isMLA = true;
9429 } else if (isN0ZExt && isAddSubZExt(N1, DAG)) {
// Swap so that the add/sub is always N0 for the code below.
9430 std::swap(N0, N1);
9431 NewOpc = ARMISD::VMULLu;
9432 isMLA = true;
9433 }
9434 }
9435
9436 if (!NewOpc) {
9437 if (VT == MVT::v2i64)
9438 // Fall through to expand this. It is not legal.
9439 return SDValue();
9440 else
9441 // Other vector multiplications are legal.
9442 return Op;
9443 }
9444 }
9445
9446 // Legalize to a VMULL instruction.
9447 SDLoc DL(Op);
9448 SDValue Op0;
9449 SDValue Op1 = SkipExtensionForVMULL(N1, DAG);
9450 if (!isMLA) {
9451 Op0 = SkipExtensionForVMULL(N0, DAG);
// NOTE(review): the opening of this assertion (embedded line 9452) is
// missing from this listing.
9453 Op1.getValueType().is64BitVector() &&
9454 "unexpected types for extended operands to VMULL");
9455 return DAG.getNode(NewOpc, DL, VT, Op0, Op1);
9456 }
9457
9458 // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during
9459 // isel lowering to take advantage of no-stall back to back vmul + vmla.
9460 // vmull q0, d4, d6
9461 // vmlal q0, d5, d6
9462 // is faster than
9463 // vaddl q0, d4, d5
9464 // vmovl q1, d6
9465 // vmul q0, q0, q1
9466 SDValue N00 = SkipExtensionForVMULL(N0->getOperand(0).getNode(), DAG);
9467 SDValue N01 = SkipExtensionForVMULL(N0->getOperand(1).getNode(), DAG);
9468 EVT Op1VT = Op1.getValueType();
// Rebuild the original add/sub over two VMULL nodes; each unextended addend
// is bitcast to Op1's type before being multiplied by Op1.
9469 return DAG.getNode(N0->getOpcode(), DL, VT,
9470 DAG.getNode(NewOpc, DL, VT,
9471 DAG.getNode(ISD::BITCAST, DL, Op1VT, N00), Op1),
9472 DAG.getNode(NewOpc, DL, VT,
9473 DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
9474}
9475
// Builds a signed division of X by Y through v4f32: multiply by a reciprocal
// estimate, add a fixed bias to the bit pattern, and convert back. The result
// is returned as v4i16.
// NOTE(review): the first line of the signature (embedded line 9476) is
// missing from this listing; the visible call site passes (X, Y, dl, DAG).
9477 SelectionDAG &DAG) {
9478 // TODO: Should this propagate fast-math-flags?
9479
9480 // Convert to float
9481 // float4 xf = vcvt_f32_s32(vmovl_s16(a.lo));
9482 // float4 yf = vcvt_f32_s32(vmovl_s16(b.lo));
9483 X = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, X);
9484 Y = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Y);
9485 X = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, X);
9486 Y = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, Y);
9487 // Get reciprocal estimate.
9488 // float4 recip = vrecpeq_f32(yf);
9489 Y = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9490 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9491 Y);
9492 // Because char has a smaller range than uchar, we can actually get away
9493 // without any newton steps. This requires that we use a weird bias
9494 // of 0xb000, however (again, this has been exhaustively tested).
9495 // float4 result = as_float4(as_int4(xf*recip) + 0xb000);
9496 X = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, X, Y);
// The bias is added to the raw bit pattern, hence the bitcasts around the ADD.
9497 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, X);
9498 Y = DAG.getConstant(0xb000, dl, MVT::v4i32);
9499 X = DAG.getNode(ISD::ADD, dl, MVT::v4i32, X, Y);
9500 X = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, X);
9501 // Convert back to short.
9502 X = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, X);
9503 X = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, X);
9504 return X;
9505}
9506
// Builds a signed division of N0 by N1 through v4f32: a reciprocal estimate
// with one refinement step, a fixed bias added to the bit pattern, and a
// conversion back. The result is returned as v4i16.
// NOTE(review): the first line of the signature (embedded line 9507) is
// missing from this listing; visible call sites pass (N0, N1, dl, DAG).
9508 SelectionDAG &DAG) {
9509 // TODO: Should this propagate fast-math-flags?
9510
9511 SDValue N2;
9512 // Convert to float.
9513 // float4 yf = vcvt_f32_s32(vmovl_s16(y));
9514 // float4 xf = vcvt_f32_s32(vmovl_s16(x));
9515 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N0);
9516 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, N1);
9517 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
9518 N1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9519
9520 // Use reciprocal estimate and one refinement step.
9521 // float4 recip = vrecpeq_f32(yf);
9522 // recip *= vrecpsq_f32(yf, recip);
9523 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9524 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9525 N1);
9526 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9527 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9528 N1, N2);
9529 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9530 // Because short has a smaller range than ushort, we can actually get away
9531 // with only a single newton step. This requires that we use a weird bias
9532 // of 0x89, however (again, this has been exhaustively tested).
9533 // float4 result = as_float4(as_int4(xf*recip) + 0x89);
9534 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
// The bias is added to the raw bit pattern, hence the bitcasts around the ADD.
9535 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9536 N1 = DAG.getConstant(0x89, dl, MVT::v4i32);
9537 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9538 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9539 // Convert back to integer and return.
9540 // return vmovn_s32(vcvt_s32_f32(result));
9541 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9542 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9543 return N0;
9544}
9545
// Custom lowering for a signed divide on v4i16 or v8i8. v8i8 is widened to
// v8i16, split into two v4i16 halves that are each divided with
// LowerSDIV_v4i8, then rejoined and truncated. v4i16 goes straight to
// LowerSDIV_v4i16.
// NOTE(review): the first line of the signature (embedded line 9546) is
// missing from this listing, so the function name and leading parameters are
// not visible here.
9547 const ARMSubtarget *ST) {
9548 EVT VT = Op.getValueType();
9549 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9550 "unexpected type for custom-lowering ISD::SDIV");
9551
9552 SDLoc dl(Op);
9553 SDValue N0 = Op.getOperand(0);
9554 SDValue N1 = Op.getOperand(1);
9555 SDValue N2, N3;
9556
9557 if (VT == MVT::v8i8) {
9558 N0 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N0);
9559 N1 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i16, N1);
9560
// N2/N3 take the upper four lanes; N0/N1 are then narrowed to the lower four.
9561 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9562 DAG.getIntPtrConstant(4, dl));
9563 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9564 DAG.getIntPtrConstant(4, dl));
9565 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9566 DAG.getIntPtrConstant(0, dl));
9567 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9568 DAG.getIntPtrConstant(0, dl));
9569
9570 N0 = LowerSDIV_v4i8(N0, N1, dl, DAG); // v4i16
9571 N2 = LowerSDIV_v4i8(N2, N3, dl, DAG); // v4i16
9572
// Rejoin the two quotients and lower the concat immediately.
9573 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9574 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9575
9576 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v8i8, N0);
9577 return N0;
9578 }
9579 return LowerSDIV_v4i16(N0, N1, dl, DAG);
9580}
9581
// Custom lowering for an unsigned divide on v4i16 or v8i8. v8i8 is
// zero-extended to v8i16, split into two v4i16 halves that are each divided
// with LowerSDIV_v4i16, then rejoined and narrowed with the vqmovnsu
// intrinsic. v4i16 is computed inline through v4f32 with a reciprocal estimate
// and two refinement steps.
// NOTE(review): the first line of the signature (embedded line 9582) is
// missing from this listing, so the function name and leading parameters are
// not visible here.
9583 const ARMSubtarget *ST) {
9584 // TODO: Should this propagate fast-math-flags?
9585 EVT VT = Op.getValueType();
9586 assert((VT == MVT::v4i16 || VT == MVT::v8i8) &&
9587 "unexpected type for custom-lowering ISD::UDIV");
9588
9589 SDLoc dl(Op);
9590 SDValue N0 = Op.getOperand(0);
9591 SDValue N1 = Op.getOperand(1);
9592 SDValue N2, N3;
9593
9594 if (VT == MVT::v8i8) {
9595 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N0);
9596 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v8i16, N1);
9597
// N2/N3 take the upper four lanes; N0/N1 are then narrowed to the lower four.
9598 N2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9599 DAG.getIntPtrConstant(4, dl));
9600 N3 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9601 DAG.getIntPtrConstant(4, dl));
9602 N0 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N0,
9603 DAG.getIntPtrConstant(0, dl));
9604 N1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v4i16, N1,
9605 DAG.getIntPtrConstant(0, dl));
9606
// The signed v4i16 helper is used here on the zero-extended 8-bit inputs.
9607 N0 = LowerSDIV_v4i16(N0, N1, dl, DAG); // v4i16
9608 N2 = LowerSDIV_v4i16(N2, N3, dl, DAG); // v4i16
9609
9610 N0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, N0, N2);
9611 N0 = LowerCONCAT_VECTORS(N0, DAG, ST);
9612
// Narrow v8i16 to v8i8 via the arm_neon_vqmovnsu intrinsic.
9613 N0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v8i8,
9614 DAG.getConstant(Intrinsic::arm_neon_vqmovnsu, dl,
9615 MVT::i32),
9616 N0);
9617 return N0;
9618 }
9619
9620 // v4i16 udiv ... Convert to float.
9621 // float4 yf = vcvt_f32_s32(vmovl_u16(y));
9622 // float4 xf = vcvt_f32_s32(vmovl_u16(x));
9623 N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N0);
9624 N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v4i32, N1);
9625 N0 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N0);
// BN1 keeps the float divisor alive while N1 is reused as a temporary below.
9626 SDValue BN1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::v4f32, N1);
9627
9628 // Use reciprocal estimate and two refinement steps.
9629 // float4 recip = vrecpeq_f32(yf);
9630 // recip *= vrecpsq_f32(yf, recip);
9631 // recip *= vrecpsq_f32(yf, recip);
9632 N2 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9633 DAG.getConstant(Intrinsic::arm_neon_vrecpe, dl, MVT::i32),
9634 BN1);
9635 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9636 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9637 BN1, N2);
9638 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9639 N1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::v4f32,
9640 DAG.getConstant(Intrinsic::arm_neon_vrecps, dl, MVT::i32),
9641 BN1, N2);
9642 N2 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N1, N2);
9643 // Simply multiplying by the reciprocal estimate can leave us a few ulps
9644 // too low, so we add 2 ulps (exhaustive testing shows that this is enough,
9645 // and that it will never cause us to return an answer too large).
9646 // float4 result = as_float4(as_int4(xf*recip) + 2);
9647 N0 = DAG.getNode(ISD::FMUL, dl, MVT::v4f32, N0, N2);
9648 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4i32, N0);
9649 N1 = DAG.getConstant(2, dl, MVT::v4i32);
9650 N0 = DAG.getNode(ISD::ADD, dl, MVT::v4i32, N0, N1);
9651 N0 = DAG.getNode(ISD::BITCAST, dl, MVT::v4f32, N0);
9652 // Convert back to integer and return.
9653 // return vmovn_u32(vcvt_s32_f32(result));
9654 N0 = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::v4i32, N0);
9655 N0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::v4i16, N0);
9656 return N0;
9657}
9658
// Rebuilds a three-operand add/sub-with-carry node as the target node Opcode.
// The incoming carry operand is converted to a flag, and the outgoing flag is
// converted back to a value of the original second result type.
// NOTE(review): the first line of the signature (embedded line 9659) is
// missing from this listing, so the function name and leading parameters are
// not visible here.
9660 unsigned Opcode, bool IsSigned) {
9661 EVT VT0 = Op.getValue(0).getValueType();
9662 EVT VT1 = Op.getValue(1).getValueType();
9663
// The carry is inverted on both input and output when Opcode is ARMISD::SUBE.
9664 bool InvertCarry = Opcode == ARMISD::SUBE;
9665 SDValue OpLHS = Op.getOperand(0);
9666 SDValue OpRHS = Op.getOperand(1);
9667 SDValue OpCarryIn = valueToCarryFlag(Op.getOperand(2), DAG, InvertCarry);
9668
9669 SDLoc DL(Op);
9670
// The new node's second result is typed i32 and is converted just below.
9671 SDValue Result = DAG.getNode(Opcode, DL, DAG.getVTList(VT0, MVT::i32), OpLHS,
9672 OpRHS, OpCarryIn);
9673
// Signed variants report the overflow flag, unsigned ones the carry flag.
9674 SDValue OutFlag =
9675 IsSigned ? overflowFlagToValue(Result.getValue(1), VT1, DAG)
9676 : carryFlagToValue(Result.getValue(1), VT1, DAG, InvertCarry);
9677
9678 return DAG.getMergeValues({Result, OutFlag}, DL);
9679}
9680
// Emits a call to the SDIVREM/UDIVREM libcall matching Op's type (i32 or i64)
// and signedness, chained on Chain, and returns the first element of the pair
// produced by LowerCallTo.
9681SDValue ARMTargetLowering::LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG,
9682 bool Signed,
9683 SDValue &Chain) const {
9684 EVT VT = Op.getValueType();
9685 assert((VT == MVT::i32 || VT == MVT::i64) &&
9686 "unexpected type for custom lowering DIV");
9687 SDLoc dl(Op);
9688
9689 const auto &DL = DAG.getDataLayout();
9690 RTLIB::Libcall LC;
9691 if (Signed)
9692 LC = VT == MVT::i32 ? RTLIB::SDIVREM_I32 : RTLIB::SDIVREM_I64;
9693 else
9694 LC = VT == MVT::i32 ? RTLIB::UDIVREM_I32 : RTLIB::UDIVREM_I64;
9695
9696 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
9697 SDValue ES = DAG.getExternalSymbol(LCImpl, getPointerTy(DL));
9698
// NOTE(review): the declaration of Args (embedded line 9699) is missing from
// this listing.
9700
// Arguments are added in the order operand 1, then operand 0.
9701 for (auto AI : {1, 0}) {
9702 SDValue Operand = Op.getOperand(AI);
9703 Args.emplace_back(Operand,
9704 Operand.getValueType().getTypeForEVT(*DAG.getContext()));
9705 }
9706
9707 CallLoweringInfo CLI(DAG);
9708 CLI.setDebugLoc(dl).setChain(Chain).setCallee(
// NOTE(review): the leading argument(s) of setCallee (embedded line 9709) are
// missing from this listing.
9710 VT.getTypeForEVT(*DAG.getContext()), ES, std::move(Args));
9711
9712 return LowerCallTo(CLI).first;
9713}
9714
9715// This is a code size optimisation: return the original SDIV node to
9716// DAGCombiner when we don't want to expand SDIV into a sequence of
9717// instructions, and an empty node otherwise which will cause the
9718// SDIV to be expanded in DAGCombine.
9719SDValue
9720ARMTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
9721 SelectionDAG &DAG,
9722 SmallVectorImpl<SDNode *> &Created) const {
9723 // TODO: Support SREM
9724 if (N->getOpcode() != ISD::SDIV)
9725 return SDValue();
9726
9727 const auto &ST = DAG.getSubtarget<ARMSubtarget>();
9728 const bool MinSize = ST.hasMinSize();
9729 const bool HasDivide = ST.isThumb() ? ST.hasDivideInThumbMode()
9730 : ST.hasDivideInARMMode();
9731
9732 // Don't touch vector types; rewriting this may lead to scalarizing
9733 // the int divs.
9734 if (N->getOperand(0).getValueType().isVector())
9735 return SDValue();
9736
9737 // Bail if MinSize is not set, and also for both ARM and Thumb mode we need
9738 // hwdiv support for this to be really profitable.
9739 if (!(MinSize && HasDivide))
9740 return SDValue();
9741
9742 // ARM mode is a bit simpler than Thumb: we can handle large power
9743 // of 2 immediates with 1 mov instruction; no further checks required,
9744 // just return the sdiv node.
9745 if (!ST.isThumb())
9746 return SDValue(N, 0);
9747
9748 // In Thumb mode, immediates larger than 128 need a wide 4-byte MOV,
9749 // and thus lose the code size benefits of a MOVS that requires only 2.
9750 // TargetTransformInfo and 'getIntImmCodeSizeCost' could be helpful here,
9751 // but as it's doing exactly this, it's not worth the trouble to get TTI.
9752 if (Divisor.sgt(128))
9753 return SDValue();
9754
9755 return SDValue(N, 0);
9756}
9757
9758SDValue ARMTargetLowering::LowerDIV_Windows(SDValue Op, SelectionDAG &DAG,
9759 bool Signed) const {
9760 assert(Op.getValueType() == MVT::i32 &&
9761 "unexpected type for custom lowering DIV");
9762 SDLoc dl(Op);
9763
9764 SDValue DBZCHK = DAG.getNode(ARMISD::WIN__DBZCHK, dl, MVT::Other,
9765 DAG.getEntryNode(), Op.getOperand(1));
9766
9767 return LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9768}
9769
// Builds a WIN__DBZCHK node, chained on InChain, that checks operand 1 of N.
// NOTE(review): the signature line (embedded line 9770) is missing from this
// listing; the visible call site passes (DAG, SDNode *, chain SDValue).
9771 SDLoc DL(N);
9772 SDValue Op = N->getOperand(1);
// An i32 result means the operand can be checked directly.
9773 if (N->getValueType(0) == MVT::i32)
9774 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain, Op);
// Otherwise split the operand into two i32 halves and check their OR.
9775 SDValue Lo, Hi;
9776 std::tie(Lo, Hi) = DAG.SplitScalar(Op, DL, MVT::i32, MVT::i32);
9777 return DAG.getNode(ARMISD::WIN__DBZCHK, DL, MVT::Other, InChain,
9778 DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi));
9779}
9780
// Expands a 64-bit divide for Windows: emits the denominator check, lowers the
// divide to a libcall chained on that check, and appends the 64-bit result to
// Results as a BUILD_PAIR of its two 32-bit halves.
9781void ARMTargetLowering::ExpandDIV_Windows(
9782 SDValue Op, SelectionDAG &DAG, bool Signed,
// NOTE(review): the remainder of the parameter list (embedded line 9783) is
// missing from this listing; it must declare Results, which is used below.
9784 const auto &DL = DAG.getDataLayout();
9785
9786 assert(Op.getValueType() == MVT::i64 &&
9787 "unexpected type for custom lowering DIV");
9788 SDLoc dl(Op);
9789
9790 SDValue DBZCHK = WinDBZCheckDenominator(DAG, Op.getNode(), DAG.getEntryNode());
9791
9792 SDValue Result = LowerWindowsDIVLibCall(Op, DAG, Signed, DBZCHK);
9793
// Low half: plain truncate. High half: shift right by 32, then truncate.
9794 SDValue Lower = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Result);
9795 SDValue Upper = DAG.getNode(ISD::SRL, dl, MVT::i64, Result,
9796 DAG.getConstant(32, dl, getPointerTy(DL)));
9797 Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
9798
9799 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
9800}
9801
// Returns a pair of SDValues for an i32/i64 load, or a pair of empty SDValues
// when this function does not handle the load. The libcall path is taken when
// the function has minsize, the subtarget does not allow unaligned memory
// access, and the alignment is at most 2.
9802std::pair<SDValue, SDValue>
9803ARMTargetLowering::LowerAEABIUnalignedLoad(SDValue Op,
9804 SelectionDAG &DAG) const {
9805 // If we have an unaligned load from a i32 or i64 that would normally be
9806 // split into separate ldrb's, we can use the __aeabi_uread4/__aeabi_uread8
9807 // functions instead.
9808 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9809 EVT MemVT = LD->getMemoryVT();
9810 if (MemVT != MVT::i32 && MemVT != MVT::i64)
9811 return std::make_pair(SDValue(), SDValue());
9812
9813 const auto &MF = DAG.getMachineFunction();
9814 unsigned AS = LD->getAddressSpace();
9815 Align Alignment = LD->getAlign();
9816 const DataLayout &DL = DAG.getDataLayout();
9817 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
9818
9819 if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
9820 Alignment <= llvm::Align(2)) {
9821
9822 RTLIB::Libcall LC =
9823 (MemVT == MVT::i32) ? RTLIB::AEABI_UREAD4 : RTLIB::AEABI_UREAD8;
9824
9825 MakeLibCallOptions Opts;
9826 SDLoc dl(Op);
9827
// The libcall takes the base pointer and is chained on the load's chain.
9828 auto Pair = makeLibCall(DAG, LC, MemVT.getSimpleVT(), LD->getBasePtr(),
9829 Opts, dl, LD->getChain());
9830
9831 // If necessary, extend the node to 64bit
9832 if (LD->getExtensionType() != ISD::NON_EXTLOAD) {
9833 unsigned ExtType = LD->getExtensionType() == ISD::SEXTLOAD
// NOTE(review): the rest of this initializer (embedded lines 9834-9835) is
// missing from this listing.
9836 SDValue EN = DAG.getNode(ExtType, dl, LD->getValueType(0), Pair.first);
9837 Pair.first = EN;
9838 }
9839 return Pair;
9840 }
9841
9842 // Default expand to individual loads
9843 if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
9844 return expandUnalignedLoad(LD, DAG);
9845 return std::make_pair(SDValue(), SDValue());
9846}
9847
9848SDValue ARMTargetLowering::LowerAEABIUnalignedStore(SDValue Op,
9849 SelectionDAG &DAG) const {
9850 // If we have an unaligned store to a i32 or i64 that would normally be
9851 // split into separate ldrb's, we can use the __aeabi_uwrite4/__aeabi_uwrite8
9852 // functions instead.
9853 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9854 EVT MemVT = ST->getMemoryVT();
9855 if (MemVT != MVT::i32 && MemVT != MVT::i64)
9856 return SDValue();
9857
9858 const auto &MF = DAG.getMachineFunction();
9859 unsigned AS = ST->getAddressSpace();
9860 Align Alignment = ST->getAlign();
9861 const DataLayout &DL = DAG.getDataLayout();
9862 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
9863
9864 if (MF.getFunction().hasMinSize() && !AllowsUnaligned &&
9865 Alignment <= llvm::Align(2)) {
9866
9867 SDLoc dl(Op);
9868
9869 // If necessary, trunc the value to 32bit
9870 SDValue StoreVal = ST->getOperand(1);
9871 if (ST->isTruncatingStore())
9872 StoreVal = DAG.getNode(ISD::TRUNCATE, dl, MemVT, ST->getOperand(1));
9873
9874 RTLIB::Libcall LC =
9875 (MemVT == MVT::i32) ? RTLIB::AEABI_UWRITE4 : RTLIB::AEABI_UWRITE8;
9876
9877 MakeLibCallOptions Opts;
9878 auto CallResult =
9879 makeLibCall(DAG, LC, MVT::isVoid, {StoreVal, ST->getBasePtr()}, Opts,
9880 dl, ST->getChain());
9881
9882 return CallResult.second;
9883 }
9884
9885 // Default expand to individual stores
9886 if (!allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Alignment))
9887 return expandUnalignedStore(ST, DAG);
9888 return SDValue();
9889}
9890
// Lowers a load of an MVE predicate type (v2i1/v4i1/v8i1/v16i1) into an i32
// extending load followed by a PREDICATE_CAST to v16i1, extracting the
// sub-vector for the narrower types. Returns the predicate merged with the
// load's chain.
// NOTE(review): the signature line (embedded line 9891) is missing from this
// listing, so the function name and parameter list are not visible here.
9892 LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
9893 EVT MemVT = LD->getMemoryVT();
9894 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9895 MemVT == MVT::v16i1) &&
9896 "Expected a predicate type!");
9897 assert(MemVT == Op.getValueType());
9898 assert(LD->getExtensionType() == ISD::NON_EXTLOAD &&
9899 "Expected a non-extending load");
9900 assert(LD->isUnindexed() && "Expected a unindexed load");
9901
9902 // The basic MVE VLDR on a v2i1/v4i1/v8i1 actually loads the entire 16bit
9903 // predicate, with the "v4i1" bits spread out over the 16 bits loaded. We
9904 // need to make sure that 8/4/2 bits are actually loaded into the correct
9905 // place, which means loading the value and then shuffling the values into
9906 // the bottom bits of the predicate.
9907 // Equally, VLDR for an v16i1 will actually load 32bits (so will be incorrect
9908 // for BE).
9909 // Speaking of BE, apparently the rest of llvm will assume a reverse order to
9910 // a natural VMSR(load), so needs to be reversed.
9911
9912 SDLoc dl(Op);
9913 SDValue Load = DAG.getExtLoad(
9914 ISD::EXTLOAD, dl, MVT::i32, LD->getChain(), LD->getBasePtr(),
// NOTE(review): one argument line (embedded line 9915) is missing from this
// listing.
9916 LD->getMemOperand());
9917 SDValue Val = Load;
// Big-endian: bit-reverse the loaded word and shift the MemVT-sized field
// down to the bottom.
9918 if (DAG.getDataLayout().isBigEndian())
9919 Val = DAG.getNode(ISD::SRL, dl, MVT::i32,
9920 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, Load),
9921 DAG.getConstant(32 - MemVT.getSizeInBits(), dl, MVT::i32));
9922 SDValue Pred = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::v16i1, Val);
// Narrower predicate types take the sub-vector starting at element 0.
9923 if (MemVT != MVT::v16i1)
9924 Pred = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MemVT, Pred,
9925 DAG.getConstant(0, dl, MVT::i32));
9926 return DAG.getMergeValues({Pred, Load.getValue(1)}, dl);
9927}
9928
// Custom load lowering that appends replacement values to Results. A volatile,
// sufficiently aligned i64 load (on V5TE+, not Thumb1-only) becomes an
// ARMISD::LDRD node whose two i32 results are paired back into an i64. Other
// i32/i64 loads are handed to LowerAEABIUnalignedLoad. Results is left
// untouched when neither path produces a value.
9929void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
9930 SelectionDAG &DAG) const {
9931 LoadSDNode *LD = cast<LoadSDNode>(N);
9932 EVT MemVT = LD->getMemoryVT();
9933
9934 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9935 !Subtarget->isThumb1Only() && LD->isVolatile() &&
9936 LD->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
9937 assert(LD->isUnindexed() && "Loads should be unindexed at this point.");
9938 SDLoc dl(N);
// NOTE(review): the start of the statement defining Result (embedded line
// 9939) is missing from this listing.
9940 ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
9941 {LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
// Which of the two results is the low half depends on endianness.
9942 SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
9943 SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
9944 SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
// Result 2 is the chain.
9945 Results.append({Pair, Result.getValue(2)});
9946 } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
9947 auto Pair = LowerAEABIUnalignedLoad(SDValue(N, 0), DAG);
// An empty first element means the helper declined to lower the load.
9948 if (Pair.first) {
9949 Results.push_back(Pair.first);
9950 Results.push_back(Pair.second);
9951 }
9952 }
9953}
9954
// Lowers a store of an MVE predicate type (v2i1/v4i1/v8i1/v16i1): the value is
// widened to v16i1 where needed, cast to an i32 with PREDICATE_CAST, and
// written with a truncating store.
// NOTE(review): the signature line (embedded line 9955) is missing from this
// listing; the visible call site passes (Op, DAG).
9956 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9957 EVT MemVT = ST->getMemoryVT();
9958 assert((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
9959 MemVT == MVT::v16i1) &&
9960 "Expected a predicate type!");
9961 assert(MemVT == ST->getValue().getValueType());
9962 assert(!ST->isTruncatingStore() && "Expected a non-extending store");
9963 assert(ST->isUnindexed() && "Expected a unindexed store");
9964
9965 // Only store the v2i1 or v4i1 or v8i1 worth of bits, via a buildvector with
9966 // top bits unset and a scalar store.
9967 SDLoc dl(Op);
9968 SDValue Build = ST->getValue();
9969 if (MemVT != MVT::v16i1) {
// NOTE(review): the declaration of Ops (embedded line 9970) is missing from
// this listing.
// Extract each lane as an i32, in reverse lane order on big-endian.
9971 for (unsigned I = 0; I < MemVT.getVectorNumElements(); I++) {
9972 unsigned Elt = DAG.getDataLayout().isBigEndian()
9973 ? MemVT.getVectorNumElements() - I - 1
9974 : I;
9975 Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Build,
9976 DAG.getConstant(Elt, dl, MVT::i32)));
9977 }
// Pad the remaining lanes up to 16 with undef.
9978 for (unsigned I = MemVT.getVectorNumElements(); I < 16; I++)
9979 Ops.push_back(DAG.getUNDEF(MVT::i32));
9980 Build = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v16i1, Ops);
9981 }
9982 SDValue GRP = DAG.getNode(ARMISD::PREDICATE_CAST, dl, MVT::i32, Build);
// Big-endian v16i1: bit-reverse the word and shift it right by 16.
9983 if (MemVT == MVT::v16i1 && DAG.getDataLayout().isBigEndian())
9984 GRP = DAG.getNode(ISD::SRL, dl, MVT::i32,
9985 DAG.getNode(ISD::BITREVERSE, dl, MVT::i32, GRP),
9986 DAG.getConstant(16, dl, MVT::i32));
9987 return DAG.getTruncStore(
9988 ST->getChain(), dl, GRP, ST->getBasePtr(),
// NOTE(review): one argument line (embedded line 9989) is missing from this
// listing.
9990 ST->getMemOperand());
9991}
9992
9993SDValue ARMTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG,
9994 const ARMSubtarget *Subtarget) const {
9995 StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
9996 EVT MemVT = ST->getMemoryVT();
9997
9998 if (MemVT == MVT::i64 && Subtarget->hasV5TEOps() &&
9999 !Subtarget->isThumb1Only() && ST->isVolatile() &&
10000 ST->getAlign() >= Subtarget->getDualLoadStoreAlignment()) {
10001 assert(ST->isUnindexed() && "Stores should be unindexed at this point.");
10002 SDNode *N = Op.getNode();
10003 SDLoc dl(N);
10004
10005 SDValue Lo = DAG.getNode(
10006 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10007 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
10008 MVT::i32));
10009 SDValue Hi = DAG.getNode(
10010 ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
10011 DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
10012 MVT::i32));
10013
10014 return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
10015 {ST->getChain(), Lo, Hi, ST->getBasePtr()},
10016 MemVT, ST->getMemOperand());
10017 } else if (Subtarget->hasMVEIntegerOps() &&
10018 ((MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10019 MemVT == MVT::v16i1))) {
10020 return LowerPredicateStore(Op, DAG);
10021 } else if (MemVT == MVT::i32 || MemVT == MVT::i64) {
10022 return LowerAEABIUnalignedStore(Op, DAG);
10023 }
10024 return SDValue();
10025}
10026
10027static bool isZeroVector(SDValue N) {
10028 return (ISD::isBuildVectorAllZeros(N.getNode()) ||
10029 (N->getOpcode() == ARMISD::VMOVIMM &&
10030 isNullConstant(N->getOperand(0))));
10031}
10032
// Lowers a masked load so that the load itself always uses a zero vector as
// its passthru operand. If the passthru is already a zero vector the node is
// returned unchanged; otherwise a new masked load with a zero passthru is
// built and, when needed, a VSELECT re-applies the original passthru.
// NOTE(review): embedded listing lines 10033-10034 are absent from this
// excerpt, so the function signature and the declaration of `N` are not
// visible here.
10035 MVT VT = Op.getSimpleValueType();
10036 SDValue Mask = N->getMask();
10037 SDValue PassThru = N->getPassThru();
10038 SDLoc dl(Op);
10039
10040 if (isZeroVector(PassThru))
10041 return Op;
10042
10043 // MVE Masked loads use zero as the passthru value. Here we convert undef to
10044 // zero too, and other values are lowered to a select.
10045 SDValue ZeroVec = DAG.getNode(ARMISD::VMOVIMM, dl, VT,
10046 DAG.getTargetConstant(0, dl, MVT::i32));
10047 SDValue NewLoad = DAG.getMaskedLoad(
10048 VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask, ZeroVec,
10049 N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
10050 N->getExtensionType(), N->isExpandingLoad());
10051 SDValue Combo = NewLoad;
// A passthru that is a BITCAST / VECTOR_REG_CAST of a zero vector is treated
// like zero, so no select is required for it either.
10052 bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
10053 PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
10054 isZeroVector(PassThru->getOperand(0));
10055 if (!PassThru.isUndef() && !PassThruIsCastZero)
10056 Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
// Return the (possibly selected) value together with the new load's chain.
10057 return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
10058}
10059
// Expands a VECREDUCE_{FADD,FMUL,MUL,AND,OR,XOR,FMAX,FMIN} node into a tree of
// the corresponding binary operation. Returns an empty SDValue when the
// subtarget has no MVE integer ops.
// NOTE(review): embedded listing line 10060 (the first line of the signature)
// is absent from this excerpt.
10061 const ARMSubtarget *ST) {
10062 if (!ST->hasMVEIntegerOps())
10063 return SDValue();
10064
10065 SDLoc dl(Op);
// Map the reduction opcode onto the binary opcode used to combine lanes.
10066 unsigned BaseOpcode = 0;
10067 switch (Op->getOpcode()) {
10068 default: llvm_unreachable("Expected VECREDUCE opcode");
10069 case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
10070 case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
10071 case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
10072 case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
10073 case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
10074 case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
10075 case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
10076 case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
10077 }
10078
10079 SDValue Op0 = Op->getOperand(0);
10080 EVT VT = Op0.getValueType();
10081 EVT EltVT = VT.getVectorElementType();
10082 unsigned NumElts = VT.getVectorNumElements();
10083 unsigned NumActiveLanes = NumElts;
10084
10085 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10086 NumActiveLanes == 2) &&
10087 "Only expected a power 2 vector size");
10088
10089 // Combine X with Rev(X) using BaseOpcode until 4 active lanes remain. Going
10090 // down to 4 lanes allows us to easily extract vector elements from the lanes.
10091 while (NumActiveLanes > 4) {
10092 unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
10093 SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
10094 Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
10095 NumActiveLanes /= 2;
10096 }
10097
10098 SDValue Res;
10099 if (NumActiveLanes == 4) {
10100 // The remaining 4 elements (read at a stride of NumElts / 4) are combined
// pairwise with BaseOpcode, carrying over the flags of the original node.
10101 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10102 DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
10103 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10104 DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
10105 SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10106 DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
10107 SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10108 DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
10109 SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10110 SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
10111 Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
10112 } else {
// Two-lane input: combine elements 0 and 1 directly.
10113 SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10114 DAG.getConstant(0, dl, MVT::i32));
10115 SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10116 DAG.getConstant(1, dl, MVT::i32));
10117 Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
10118 }
10119
10120 // Result type may be wider than element type.
10121 if (EltVT != Op->getValueType(0))
10122 Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
10123 return Res;
10124}
10125
// Floating-point entry point for vector reductions: returns an empty SDValue
// unless the subtarget has MVE float ops, otherwise defers to LowerVecReduce.
// NOTE(review): embedded listing line 10126 (the first line of the signature)
// is absent from this excerpt.
10127 const ARMSubtarget *ST) {
10128 if (!ST->hasMVEFloatOps())
10129 return SDValue();
10130 return LowerVecReduce(Op, DAG, ST);
10131}
10132
// Expands a min/max vector reduction using the NEON pairwise min/max
// intrinsics (arm_neon_vpminu / vpmaxu / vpmins / vpmaxs). Returns an empty
// SDValue when the subtarget has no NEON.
// NOTE(review): embedded listing lines 10133, 10147, 10150, 10153, 10156,
// 10193-10194 and 10197-10198 are absent from this excerpt, so the first
// signature line and every `case` label of the two switches are not visible;
// which reduction opcode selects which intrinsic/extension cannot be read off
// this listing.
10134 const ARMSubtarget *ST) {
10135 if (!ST->hasNEON())
10136 return SDValue();
10137
10138 SDLoc dl(Op);
10139 SDValue Op0 = Op->getOperand(0);
10140 EVT VT = Op0.getValueType();
10141 EVT EltVT = VT.getVectorElementType();
10142
10143 unsigned PairwiseIntrinsic = 0;
10144 switch (Op->getOpcode()) {
10145 default:
10146 llvm_unreachable("Expected VECREDUCE opcode");
10148 PairwiseIntrinsic = Intrinsic::arm_neon_vpminu;
10149 break;
10151 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxu;
10152 break;
10154 PairwiseIntrinsic = Intrinsic::arm_neon_vpmins;
10155 break;
10157 PairwiseIntrinsic = Intrinsic::arm_neon_vpmaxs;
10158 break;
10159 }
// The intrinsic ID is passed as the first operand of INTRINSIC_WO_CHAIN.
10160 SDValue PairwiseOp = DAG.getConstant(PairwiseIntrinsic, dl, MVT::i32);
10161
10162 unsigned NumElts = VT.getVectorNumElements();
10163 unsigned NumActiveLanes = NumElts;
10164
10165 assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
10166 NumActiveLanes == 2) &&
10167 "Only expected a power 2 vector size");
10168
10169 // Split 128-bit vectors, since vpmin/max takes 2 64-bit vectors.
10170 if (VT.is128BitVector()) {
10171 SDValue Lo, Hi;
10172 std::tie(Lo, Hi) = DAG.SplitVector(Op0, dl);
10173 VT = Lo.getValueType();
10174 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Lo, Hi});
10175 NumActiveLanes /= 2;
10176 }
10177
10178 // Use pairwise reductions until one lane remains
10179 while (NumActiveLanes > 1) {
10180 Op0 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, {PairwiseOp, Op0, Op0});
10181 NumActiveLanes /= 2;
10182 }
10183
// The reduced value is read from lane 0.
10184 SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
10185 DAG.getConstant(0, dl, MVT::i32));
10186
10187 // Result type may be wider than element type.
10188 if (EltVT != Op.getValueType()) {
10189 unsigned Extend = 0;
10190 switch (Op->getOpcode()) {
10191 default:
10192 llvm_unreachable("Expected VECREDUCE opcode");
10195 Extend = ISD::ZERO_EXTEND;
10196 break;
10199 Extend = ISD::SIGN_EXTEND;
10200 break;
10201 }
10202 Res = DAG.getNode(Extend, dl, Op.getValueType(), Res);
10203 }
10204 return Res;
10205}
10206
// Decides whether an atomic load/store node can be kept as is: nodes whose
// success ordering is stronger than monotonic yield an empty SDValue, all
// others are returned unchanged.
// NOTE(review): embedded listing line 10207 (the function signature) is absent
// from this excerpt.
10208 if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getSuccessOrdering()))
10209 // Acquire/Release load/store is not legal for targets without a dmb or
10210 // equivalent available.
10211 return SDValue();
10212
10213 // Monotonic load/store is legal for all targets.
10214 return Op;
10215}
10216
// Replaces the results of a cycle-counter read with an arm_mrc intrinsic node
// (operands 15, 0, 9, 13, 0) producing a 32-bit count. Two values are appended
// to Results: an i64 BUILD_PAIR of the 32-bit count and the constant 0, and
// the chain result of the intrinsic node.
// NOTE(review): embedded listing lines 10217-10218 (the start of the
// signature) are absent from this excerpt.
10219 SelectionDAG &DAG,
10220 const ARMSubtarget *Subtarget) {
10221 SDLoc DL(N);
10222 // Under Power Management extensions, the cycle-count is:
10223 // mrc p15, #0, <Rt>, c9, c13, #0
10224 SDValue Ops[] = { N->getOperand(0), // Chain
10225 DAG.getTargetConstant(Intrinsic::arm_mrc, DL, MVT::i32),
10226 DAG.getTargetConstant(15, DL, MVT::i32),
10227 DAG.getTargetConstant(0, DL, MVT::i32),
10228 DAG.getTargetConstant(9, DL, MVT::i32),
10229 DAG.getTargetConstant(13, DL, MVT::i32),
10230 DAG.getTargetConstant(0, DL, MVT::i32)
10231 };
10232
10233 SDValue Cycles32 = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL,
10234 DAG.getVTList(MVT::i32, MVT::Other), Ops);
// Widen to i64 by pairing the 32-bit count with a constant zero.
10235 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, Cycles32,
10236 DAG.getConstant(0, DL, MVT::i32)));
10237 Results.push_back(Cycles32.getValue(1));
10238}
10239
// Builds an untyped REG_SEQUENCE machine node in the GPRPair register class,
// placing V0 in sub-register gsub_0 and V1 in sub-register gsub_1, and returns
// its first result.
// NOTE(review): embedded listing line 10240 (the first line of the signature)
// is absent from this excerpt.
10241 SDValue V1) {
10242 SDLoc dl(V0.getNode());
10243 SDValue RegClass =
10244 DAG.getTargetConstant(ARM::GPRPairRegClassID, dl, MVT::i32);
10245 SDValue SubReg0 = DAG.getTargetConstant(ARM::gsub_0, dl, MVT::i32);
10246 SDValue SubReg1 = DAG.getTargetConstant(ARM::gsub_1, dl, MVT::i32);
10247 const SDValue Ops[] = {RegClass, V0, SubReg0, V1, SubReg1};
10248 return SDValue(
10249 DAG.getMachineNode(TargetOpcode::REG_SEQUENCE, dl, MVT::Untyped, Ops), 0);
10250}
10251
// Splits V into two i32 halves and packs them into a GPR pair via
// createGPRPairNode2xi32; the halves are swapped on big-endian targets.
// NOTE(review): embedded listing line 10252 (the function signature) is absent
// from this excerpt.
10253 SDLoc dl(V.getNode());
10254 auto [VLo, VHi] = DAG.SplitScalar(V, dl, MVT::i32, MVT::i32);
10255 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10256 if (isBigEndian)
10257 std::swap(VLo, VHi);
10258 return createGPRPairNode2xi32(DAG, VLo, VHi);
10259}
10260
// Replaces the results of a 64-bit atomic compare-and-swap node with an
// ARM::CMP_SWAP_64 machine node whose operands are GPR pairs. Two values are
// appended to Results: the i64 rebuilt from the node's first result, and the
// node's chain (result index 2).
// NOTE(review): embedded listing lines 10261-10262 (the start of the
// signature, including the function name) are absent from this excerpt.
10263 SelectionDAG &DAG) {
10264 assert(N->getValueType(0) == MVT::i64 &&
10265 "AtomicCmpSwap on types less than 64 should be legal");
10266 SDValue Ops[] = {
10267 createGPRPairNode2xi32(DAG, N->getOperand(1),
10268 DAG.getUNDEF(MVT::i32)), // pointer, temp
10269 createGPRPairNodei64(DAG, N->getOperand(2)), // expected
10270 createGPRPairNodei64(DAG, N->getOperand(3)), // new
10271 N->getOperand(0), // chain in
10272 };
10273 SDNode *CmpSwap = DAG.getMachineNode(
10274 ARM::CMP_SWAP_64, SDLoc(N),
10275 DAG.getVTList(MVT::Untyped, MVT::Untyped, MVT::Other), Ops);
10276
// Carry the original node's memory operand over to the machine node.
10277 MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
10278 DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
10279
10280 bool isBigEndian = DAG.getDataLayout().isBigEndian();
10281
// Extract the two halves of result 0; on big-endian targets the low half is
// taken from gsub_1 and the high half from gsub_0.
10282 SDValue Lo =
10283 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
10284 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10285 SDValue Hi =
10286 DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
10287 SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
10288 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
10289 Results.push_back(SDValue(CmpSwap, 2));
10290}
10291
10292SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
10293 SDLoc dl(Op);
10294 EVT VT = Op.getValueType();
10295 SDValue Chain = Op.getOperand(0);
10296 SDValue LHS = Op.getOperand(1);
10297 SDValue RHS = Op.getOperand(2);
10298 ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
10299 bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
10300
10301 // If we don't have instructions of this float type then soften to a libcall
10302 // and use SETCC instead.
10303 if (isUnsupportedFloatingType(LHS.getValueType())) {
10304 softenSetCCOperands(DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS,
10305 Chain, IsSignaling);
10306 if (!RHS.getNode()) {
10307 RHS = DAG.getConstant(0, dl, LHS.getValueType());
10308 CC = ISD::SETNE;
10309 }
10310 SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
10311 DAG.getCondCode(CC));
10312 return DAG.getMergeValues({Result, Chain}, dl);
10313 }
10314
10315 ARMCC::CondCodes CondCode, CondCode2;
10316 FPCCToARMCC(CC, CondCode, CondCode2);
10317
10318 SDValue True = DAG.getConstant(1, dl, VT);
10319 SDValue False = DAG.getConstant(0, dl, VT);
10320 SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
10321 SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
10322 SDValue Result = getCMOV(dl, VT, False, True, ARMcc, Cmp, DAG);
10323 if (CondCode2 != ARMCC::AL) {
10324 ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
10325 Result = getCMOV(dl, VT, Result, True, ARMcc, Cmp, DAG);
10326 }
10327 return DAG.getMergeValues({Result, Chain}, dl);
10328}
10329
10330SDValue ARMTargetLowering::LowerSPONENTRY(SDValue Op, SelectionDAG &DAG) const {
10331 MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
10332
10333 EVT VT = getPointerTy(DAG.getDataLayout());
10334 int FI = MFI.CreateFixedObject(4, 0, false);
10335 return DAG.getFrameIndex(FI, VT);
10336}
10337
10338SDValue ARMTargetLowering::LowerFP_TO_BF16(SDValue Op,
10339 SelectionDAG &DAG) const {
10340 SDLoc DL(Op);
10341 MakeLibCallOptions CallOptions;
10342 MVT SVT = Op.getOperand(0).getSimpleValueType();
10343 RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, MVT::bf16);
10344 SDValue Res =
10345 makeLibCall(DAG, LC, MVT::f32, Op.getOperand(0), CallOptions, DL).first;
10346 return DAG.getBitcast(MVT::i32, Res);
10347}
10348
// Custom lowering for three-way comparison nodes (ISD::SCMP is treated as
// signed, anything else as unsigned). The result is built as an i32 and then
// sign-extended or truncated to the node's value type when that differs.
// Thumb1 unsigned comparisons use a subtract-with-carry sequence; every other
// case uses a flag-setting subtract (or add) followed by two conditional moves.
// NOTE(review): embedded listing line 10418 is absent from this excerpt (see
// the note inside the body).
10349SDValue ARMTargetLowering::LowerCMP(SDValue Op, SelectionDAG &DAG) const {
10350 SDLoc dl(Op);
10351 SDValue LHS = Op.getOperand(0);
10352 SDValue RHS = Op.getOperand(1);
10353
10354 // Determine if this is signed or unsigned comparison
10355 bool IsSigned = (Op.getOpcode() == ISD::SCMP);
10356
10357 // Special case for Thumb1 UCMP only
10358 if (!IsSigned && Subtarget->isThumb1Only()) {
10359 // For Thumb unsigned comparison, use this sequence:
10360 // subs r2, r0, r1 ; r2 = LHS - RHS, sets flags
10361 // sbc r2, r2 ; r2 = r2 - r2 - !carry
10362 // cmp r1, r0 ; compare RHS with LHS
10363 // sbc r1, r1 ; r1 = r1 - r1 - !carry
10364 // subs r0, r2, r1 ; r0 = r2 - r1 (final result)
10365
10366 // First subtraction: LHS - RHS
10367 SDValue Sub1WithFlags = DAG.getNode(
10368 ARMISD::SUBC, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10369 SDValue Sub1Result = Sub1WithFlags.getValue(0);
10370 SDValue Flags1 = Sub1WithFlags.getValue(1);
10371
10372 // SUBE: Sub1Result - Sub1Result - !carry
10373 // This gives 0 if LHS >= RHS (unsigned), -1 if LHS < RHS (unsigned)
10374 SDValue Sbc1 =
10375 DAG.getNode(ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT),
10376 Sub1Result, Sub1Result, Flags1);
10377 SDValue Sbc1Result = Sbc1.getValue(0);
10378
10379 // Second comparison: RHS vs LHS (reverse comparison)
10380 SDValue CmpFlags = DAG.getNode(ARMISD::CMP, dl, FlagsVT, RHS, LHS);
10381
10382 // SUBE: RHS - RHS - !carry
10383 // This gives 0 if RHS <= LHS (unsigned), -1 if RHS > LHS (unsigned)
10384 SDValue Sbc2 = DAG.getNode(
10385 ARMISD::SUBE, dl, DAG.getVTList(MVT::i32, FlagsVT), RHS, RHS, CmpFlags);
10386 SDValue Sbc2Result = Sbc2.getValue(0);
10387
10388 // Final subtraction: Sbc1Result - Sbc2Result (no flags needed)
10389 SDValue Result =
10390 DAG.getNode(ISD::SUB, dl, MVT::i32, Sbc1Result, Sbc2Result);
10391 if (Op.getValueType() != MVT::i32)
10392 Result = DAG.getSExtOrTrunc(Result, dl, Op.getValueType());
10393
10394 return Result;
10395 }
10396
10397 // For the ARM assembly pattern:
10398 // subs r0, r0, r1 ; subtract RHS from LHS and set flags
10399 // movgt r0, #1 ; if LHS > RHS, set result to 1
10400 // ; (GT for signed, HI for unsigned)
10401 // mvnlt r0, #0 ; if LHS < RHS, set result to -1
10402 // ; (LT for signed, LO for unsigned)
// ; if LHS == RHS, result remains 0 from the subs
10403
10404 // Optimization: if RHS is a subtraction against 0, use ADDC instead of SUBC
10405 unsigned Opcode = ARMISD::SUBC;
10406
10407 // Check if RHS is a subtraction against 0: (0 - X)
10408 if (RHS.getOpcode() == ISD::SUB) {
10409 SDValue SubLHS = RHS.getOperand(0);
10410 SDValue SubRHS = RHS.getOperand(1);
10411
10412 // Check if it's 0 - X
10413 if (isNullConstant(SubLHS)) {
10414 bool CanUseAdd = false;
10415 if (IsSigned) {
10416 // For SCMP: only if X is known to never be INT_MIN (to avoid overflow)
// NOTE(review): embedded listing line 10418 is absent here, so the part of
// this condition between computeKnownBits(SubRHS) and .isMinSignedValue()
// is not visible in this excerpt.
10417 if (RHS->getFlags().hasNoSignedWrap() || !DAG.computeKnownBits(SubRHS)
10419 .isMinSignedValue()) {
10420 CanUseAdd = true;
10421 }
10422 } else {
10423 // For UCMP: only if X is known to never be zero
10424 if (DAG.isKnownNeverZero(SubRHS)) {
10425 CanUseAdd = true;
10426 }
10427 }
10428
10429 if (CanUseAdd) {
10430 Opcode = ARMISD::ADDC;
10431 RHS = SubRHS; // Replace RHS with X, so we do LHS + X instead of
10432 // LHS - (0 - X)
10433 }
10434 }
10435 }
10436
10437 // Generate the operation with flags
10438 SDValue OpWithFlags =
10439 DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, FlagsVT), LHS, RHS);
10440
10441 SDValue OpResult = OpWithFlags.getValue(0);
10442 SDValue Flags = OpWithFlags.getValue(1);
10443
10444 // Constants for conditional moves
10445 SDValue One = DAG.getConstant(1, dl, MVT::i32);
10446 SDValue MinusOne = DAG.getAllOnesConstant(dl, MVT::i32);
10447
10448 // Select condition codes based on signed vs unsigned
10449 ARMCC::CondCodes GTCond = IsSigned ? ARMCC::GT : ARMCC::HI;
10450 ARMCC::CondCodes LTCond = IsSigned ? ARMCC::LT : ARMCC::LO;
10451
10452 // First conditional move: if greater than, set to 1
10453 SDValue GTCondValue = DAG.getConstant(GTCond, dl, MVT::i32);
10454 SDValue Result1 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, OpResult, One,
10455 GTCondValue, Flags);
10456
10457 // Second conditional move: if less than, set to -1
10458 SDValue LTCondValue = DAG.getConstant(LTCond, dl, MVT::i32);
10459 SDValue Result2 = DAG.getNode(ARMISD::CMOV, dl, MVT::i32, Result1, MinusOne,
10460 LTCondValue, Flags);
10461
10462 if (Op.getValueType() != MVT::i32)
10463 Result2 = DAG.getSExtOrTrunc(Result2, dl, Op.getValueType());
10464
10465 return Result2;
10466}
10467
// Central dispatch for custom lowering: switches on the node's opcode and
// forwards to the matching Lower*/Expand* helper. Opcodes without a case hit
// llvm_unreachable.
// NOTE(review): embedded listing lines 10468, 10487-10488, 10491, 10585-10588,
// 10590-10593, 10600, 10604, 10606 and 10621 are absent from this excerpt, so
// the function signature and several `case` labels are not visible; some of
// the return statements below therefore appear without their labels.
10469 LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
10470 switch (Op.getOpcode()) {
10471 default: llvm_unreachable("Don't know how to custom lower this!");
10472 case ISD::WRITE_REGISTER: return LowerWRITE_REGISTER(Op, DAG);
10473 case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
10474 case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
10475 case ISD::GlobalAddress: return LowerGlobalAddress(Op, DAG);
10476 case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG);
10477 case ISD::SELECT: return LowerSELECT(Op, DAG);
10478 case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
10479 case ISD::BRCOND: return LowerBRCOND(Op, DAG);
10480 case ISD::BR_CC: return LowerBR_CC(Op, DAG);
10481 case ISD::BR_JT: return LowerBR_JT(Op, DAG);
10482 case ISD::VASTART: return LowerVASTART(Op, DAG);
10483 case ISD::ATOMIC_FENCE: return LowerATOMIC_FENCE(Op, DAG, Subtarget);
10484 case ISD::PREFETCH: return LowerPREFETCH(Op, DAG, Subtarget);
10485 case ISD::SINT_TO_FP:
10486 case ISD::UINT_TO_FP: return LowerINT_TO_FP(Op, DAG);
10489 case ISD::FP_TO_SINT:
10490 case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
10492 case ISD::FP_TO_UINT_SAT: return LowerFP_TO_INT_SAT(Op, DAG, Subtarget);
10493 case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
10494 case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
10495 case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG);
10496 case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG);
10497 case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG);
10498 case ISD::EH_SJLJ_SETUP_DISPATCH: return LowerEH_SJLJ_SETUP_DISPATCH(Op, DAG);
10499 case ISD::INTRINSIC_VOID: return LowerINTRINSIC_VOID(Op, DAG, Subtarget);
10500 case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG,
10501 Subtarget);
10502 case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG, Subtarget);
10503 case ISD::SHL:
10504 case ISD::SRL:
10505 case ISD::SRA: return LowerShift(Op.getNode(), DAG, Subtarget);
10506 case ISD::SREM: return LowerREM(Op.getNode(), DAG);
10507 case ISD::UREM: return LowerREM(Op.getNode(), DAG);
10508 case ISD::SHL_PARTS: return LowerShiftLeftParts(Op, DAG);
10509 case ISD::SRL_PARTS:
10510 case ISD::SRA_PARTS: return LowerShiftRightParts(Op, DAG);
10511 case ISD::CTTZ:
10512 case ISD::CTTZ_ZERO_POISON: return LowerCTTZ(Op.getNode(), DAG, Subtarget);
10513 case ISD::CTPOP: return LowerCTPOP(Op.getNode(), DAG, Subtarget);
10514 case ISD::SETCC: return LowerVSETCC(Op, DAG, Subtarget);
10515 case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
10516 case ISD::ConstantFP: return LowerConstantFP(Op, DAG, Subtarget);
10517 case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG, Subtarget);
10518 case ISD::VECTOR_SHUFFLE: return LowerVECTOR_SHUFFLE(Op, DAG, Subtarget);
10519 case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG, Subtarget);
10520 case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
10521 case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG, Subtarget);
10522 case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG, Subtarget);
10523 case ISD::TRUNCATE: return LowerTruncate(Op.getNode(), DAG, Subtarget);
10524 case ISD::SIGN_EXTEND:
10525 case ISD::ZERO_EXTEND: return LowerVectorExtend(Op.getNode(), DAG, Subtarget);
10526 case ISD::GET_ROUNDING: return LowerGET_ROUNDING(Op, DAG);
10527 case ISD::SET_ROUNDING: return LowerSET_ROUNDING(Op, DAG);
10528 case ISD::SET_FPMODE:
10529 return LowerSET_FPMODE(Op, DAG);
10530 case ISD::RESET_FPMODE:
10531 return LowerRESET_FPMODE(Op, DAG);
10532 case ISD::MUL: return LowerMUL(Op, DAG);
// On Windows targets, non-vector divisions take the LowerDIV_Windows path.
10533 case ISD::SDIV:
10534 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10535 !Op.getValueType().isVector())
10536 return LowerDIV_Windows(Op, DAG, /* Signed */ true);
10537 return LowerSDIV(Op, DAG, Subtarget);
10538 case ISD::UDIV:
10539 if (getTargetMachine().getTargetTriple().isOSWindows() &&
10540 !Op.getValueType().isVector())
10541 return LowerDIV_Windows(Op, DAG, /* Signed */ false);
10542 return LowerUDIV(Op, DAG, Subtarget);
10543 case ISD::UADDO_CARRY:
10544 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::ADDE, false /*unsigned*/);
10545 case ISD::USUBO_CARRY:
10546 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::SUBE, false /*unsigned*/);
10547 case ISD::SADDO_CARRY:
10548 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::ADDE, true /*signed*/);
10549 case ISD::SSUBO_CARRY:
10550 return LowerADDSUBO_CARRY(Op, DAG, ARMISD::SUBE, true /*signed*/);
10551 case ISD::UADDO:
10552 case ISD::USUBO:
10553 case ISD::UMULO:
10554 case ISD::SADDO:
10555 case ISD::SSUBO:
10556 case ISD::SMULO:
10557 return LowerALUO(Op, DAG);
10558 case ISD::SADDSAT:
10559 case ISD::SSUBSAT:
10560 case ISD::UADDSAT:
10561 case ISD::USUBSAT:
10562 return LowerADDSUBSAT(Op, DAG, Subtarget);
// Loads: MVE predicate types go to LowerPredicateLoad; everything else is
// offered to LowerAEABIUnalignedLoad, and an empty SDValue is returned when
// that produces no replacement.
10563 case ISD::LOAD: {
10564 auto *LD = cast<LoadSDNode>(Op);
10565 EVT MemVT = LD->getMemoryVT();
10566 if (Subtarget->hasMVEIntegerOps() &&
10567 (MemVT == MVT::v2i1 || MemVT == MVT::v4i1 || MemVT == MVT::v8i1 ||
10568 MemVT == MVT::v16i1))
10569 return LowerPredicateLoad(Op, DAG);
10570
10571 auto Pair = LowerAEABIUnalignedLoad(Op, DAG);
10572 if (Pair.first)
10573 return DAG.getMergeValues({Pair.first, Pair.second}, SDLoc(Pair.first));
10574 return SDValue();
10575 }
10576 case ISD::STORE:
10577 return LowerSTORE(Op, DAG, Subtarget);
10578 case ISD::MLOAD:
10579 return LowerMLOAD(Op, DAG);
10580 case ISD::VECREDUCE_MUL:
10581 case ISD::VECREDUCE_AND:
10582 case ISD::VECREDUCE_OR:
10583 case ISD::VECREDUCE_XOR:
10584 return LowerVecReduce(Op, DAG, Subtarget);
10589 return LowerVecReduceF(Op, DAG, Subtarget);
10594 return LowerVecReduceMinMax(Op, DAG, Subtarget);
10595 case ISD::ATOMIC_LOAD:
10596 case ISD::ATOMIC_STORE:
10597 return LowerAtomicLoadStore(Op, DAG);
10598 case ISD::SDIVREM:
10599 case ISD::UDIVREM: return LowerDivRem(Op, DAG);
10601 if (getTargetMachine().getTargetTriple().isOSWindows())
10602 return LowerDYNAMIC_STACKALLOC(Op, DAG);
10603 llvm_unreachable("Don't know how to custom lower this!");
10605 case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
10607 case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
10608 case ISD::STRICT_FSETCC:
10609 case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
10610 case ISD::SPONENTRY:
10611 return LowerSPONENTRY(Op, DAG);
10612 case ISD::FP_TO_BF16:
10613 return LowerFP_TO_BF16(Op, DAG);
10614 case ARMISD::WIN__DBZCHK: return SDValue();
10615 case ISD::UCMP:
10616 case ISD::SCMP:
10617 return LowerCMP(Op, DAG);
10618 case ISD::ABS:
10619 return LowerABS(Op, DAG);
// Strict rounding-to-integer nodes on f16/bf16 operands: extend the operand to
// f32 with STRICT_FP_EXTEND first, then re-emit the same opcode on the
// extended value, chained after the extend.
10620 case ISD::STRICT_LROUND:
10622 case ISD::STRICT_LRINT:
10623 case ISD::STRICT_LLRINT: {
10624 assert((Op.getOperand(1).getValueType() == MVT::f16 ||
10625 Op.getOperand(1).getValueType() == MVT::bf16) &&
10626 "Expected custom lowering of rounding operations only for f16");
10627 SDLoc DL(Op);
10628 SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, DL, {MVT::f32, MVT::Other},
10629 {Op.getOperand(0), Op.getOperand(1)});
10630 return DAG.getNode(Op.getOpcode(), DL, {Op.getValueType(), MVT::Other},
10631 {Ext.getValue(1), Ext.getValue(0)});
10632 }
10633 }
10634}
10635
// Replaces the result of one of the arm_smlald / arm_smlaldx / arm_smlsld /
// arm_smlsldx intrinsics with the matching ARMISD node. The 64-bit operand 3
// is split into two i32 halves for the node, and its two i32 results are
// recombined into a single i64 appended to Results. Any other intrinsic ID
// leaves Results untouched.
// NOTE(review): embedded listing line 10636 (the first line of the signature)
// is absent from this excerpt.
10637 SelectionDAG &DAG) {
10638 unsigned IntNo = N->getConstantOperandVal(0);
10639 unsigned Opc = 0;
10640 if (IntNo == Intrinsic::arm_smlald)
10641 Opc = ARMISD::SMLALD;
10642 else if (IntNo == Intrinsic::arm_smlaldx)
10643 Opc = ARMISD::SMLALDX;
10644 else if (IntNo == Intrinsic::arm_smlsld)
10645 Opc = ARMISD::SMLSLD;
10646 else if (IntNo == Intrinsic::arm_smlsldx)
10647 Opc = ARMISD::SMLSLDX;
10648 else
10649 return;
10650
10651 SDLoc dl(N);
10652 SDValue Lo, Hi;
10653 std::tie(Lo, Hi) = DAG.SplitScalar(N->getOperand(3), dl, MVT::i32, MVT::i32);
10654
10655 SDValue LongMul = DAG.getNode(Opc, dl,
10656 DAG.getVTList(MVT::i32, MVT::i32),
10657 N->getOperand(1), N->getOperand(2),
10658 Lo, Hi);
10659 Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
10660 LongMul.getValue(0), LongMul.getValue(1)));
10661}
10662
10663/// ReplaceNodeResults - Replace the results of node with an illegal result
10664/// type with new values built out of custom code.
// Most cases compute a single replacement value in `Res`, which is appended to
// Results after the switch if it is non-null; a few cases fill Results
// themselves and return early.
// NOTE(review): embedded listing lines 10665-10666, 10673, 10700, 10709-10710,
// 10712 and 10727-10728 are absent from this excerpt, so the start of the
// signature, the body of the READ_REGISTER case and several `case` labels are
// not visible here.
10667 SelectionDAG &DAG) const {
10668 SDValue Res;
10669 switch (N->getOpcode()) {
10670 default:
10671 llvm_unreachable("Don't know how to custom expand this!");
10672 case ISD::READ_REGISTER:
10674 break;
10675 case ISD::BITCAST:
10676 Res = ExpandBITCAST(N, DAG, Subtarget);
10677 break;
10678 case ISD::SRL:
10679 case ISD::SRA:
10680 case ISD::SHL:
10681 Res = Expand64BitShift(N, DAG, Subtarget);
10682 break;
10683 case ISD::SREM:
10684 case ISD::UREM:
10685 Res = LowerREM(N, DAG);
10686 break;
// DivRem produces two values; both are pushed directly.
10687 case ISD::SDIVREM:
10688 case ISD::UDIVREM:
10689 Res = LowerDivRem(SDValue(N, 0), DAG);
10690 assert(Res.getNumOperands() == 2 && "DivRem needs two values");
10691 Results.push_back(Res.getValue(0));
10692 Results.push_back(Res.getValue(1));
10693 return;
10694 case ISD::SADDSAT:
10695 case ISD::SSUBSAT:
10696 case ISD::UADDSAT:
10697 case ISD::USUBSAT:
10698 Res = LowerADDSUBSAT(SDValue(N, 0), DAG, Subtarget);
10699 break;
10701 ReplaceREADCYCLECOUNTER(N, Results, DAG, Subtarget);
10702 return;
10703 case ISD::UDIV:
10704 case ISD::SDIV:
10705 assert(getTargetMachine().getTargetTriple().isOSWindows() &&
10706 "can only expand DIV on Windows");
10707 return ExpandDIV_Windows(SDValue(N, 0), DAG, N->getOpcode() == ISD::SDIV,
10708 Results);
10711 return;
10713 return ReplaceLongIntrinsic(N, Results, DAG);
// LowerLOAD appends to Results itself; Res stays null for this case.
10714 case ISD::LOAD:
10715 LowerLOAD(N, Results, DAG);
10716 break;
10717 case ISD::STORE:
10718 Res = LowerAEABIUnalignedStore(SDValue(N, 0), DAG);
10719 break;
10720 case ISD::TRUNCATE:
10721 Res = LowerTruncate(N, DAG, Subtarget);
10722 break;
10723 case ISD::SIGN_EXTEND:
10724 case ISD::ZERO_EXTEND:
10725 Res = LowerVectorExtend(N, DAG, Subtarget);
10726 break;
10729 Res = LowerFP_TO_INT_SAT(SDValue(N, 0), DAG, Subtarget);
10730 break;
10731 }
10732 if (Res.getNode())
10733 Results.push_back(Res);
10734}
10735
10736//===----------------------------------------------------------------------===//
10737// ARM Scheduler Hooks
10738//===----------------------------------------------------------------------===//
10739
10740/// SetupEntryBlockForSjLj - Insert code into the entry block that creates and
10741/// registers the function context.
// The emitted sequence loads the dispatch block's address from a constant-pool
// entry, makes it PC-relative-resolved with a PICADD-style instruction, and
// stores it at offset 36 from frame index FI. Three instruction sequences are
// provided: Thumb2, Thumb1 and ARM.
// NOTE(review): many embedded listing lines are absent from this excerpt
// (10743, 10752-10753, 10761, 10770-10771, 10774-10775, 10786, 10788, 10794,
// 10805, 10816, 10818, 10828, 10834, 10844, 10852, 10855, 10860, 10866), so
// one parameter, the definitions of `AFI`, `MCP` and `CPV`, both memory-operand
// initialisers and the tails of most BuildMI chains are not visible here.
10742void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
10744 MachineBasicBlock *DispatchBB,
10745 int FI) const {
10746 assert(!Subtarget->isROPI() && !Subtarget->isRWPI() &&
10747 "ROPI/RWPI not currently supported with SjLj");
10748 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10749 DebugLoc dl = MI.getDebugLoc();
10750 MachineFunction *MF = MBB->getParent();
10751 MachineRegisterInfo *MRI = &MF->getRegInfo();
10754 const Function &F = MF->getFunction();
10755
10756 bool isThumb = Subtarget->isThumb();
10757 bool isThumb2 = Subtarget->isThumb2();
10758
// Constant-pool entry referring to DispatchBB; the PC adjustment is 4 in
// Thumb modes and 8 otherwise.
10759 unsigned PCLabelId = AFI->createPICLabelUId();
10760 unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
10762 ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
10763 unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
10764
10765 const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
10766 : &ARM::GPRRegClass;
10767
10768 // Grab constant pool and fixed stack memory operands.
10769 MachineMemOperand *CPMMO =
10772
10773 MachineMemOperand *FIMMOSt =
10776
10777 // Load the address of the dispatch MBB into the jump buffer.
10778 if (isThumb2) {
10779 // Incoming value: jbuf
10780 // ldr.n r5, LCPI1_1
10781 // orr r5, r5, #1
10782 // add r5, pc
10783 // str r5, [$jbuf, #+4] ; &jbuf[1]
10784 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10785 BuildMI(*MBB, MI, dl, TII->get(ARM::t2LDRpci), NewVReg1)
10787 .addMemOperand(CPMMO)
10789 // Set the low bit because of thumb mode.
10790 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10791 BuildMI(*MBB, MI, dl, TII->get(ARM::t2ORRri), NewVReg2)
10792 .addReg(NewVReg1, RegState::Kill)
10793 .addImm(0x01)
10795 .add(condCodeOp());
10796 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10797 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg3)
10798 .addReg(NewVReg2, RegState::Kill)
10799 .addImm(PCLabelId);
10800 BuildMI(*MBB, MI, dl, TII->get(ARM::t2STRi12))
10801 .addReg(NewVReg3, RegState::Kill)
10802 .addFrameIndex(FI)
10803 .addImm(36) // &jbuf[1] :: pc
10804 .addMemOperand(FIMMOSt)
10806 } else if (isThumb) {
10807 // Incoming value: jbuf
10808 // ldr.n r1, LCPI1_4
10809 // add r1, pc
10810 // mov r2, #1
10811 // orrs r1, r2
10812 // add r2, $jbuf, #+4 ; &jbuf[1]
10813 // str r1, [r2]
10814 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10815 BuildMI(*MBB, MI, dl, TII->get(ARM::tLDRpci), NewVReg1)
10817 .addMemOperand(CPMMO)
10819 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10820 BuildMI(*MBB, MI, dl, TII->get(ARM::tPICADD), NewVReg2)
10821 .addReg(NewVReg1, RegState::Kill)
10822 .addImm(PCLabelId);
10823 // Set the low bit because of thumb mode.
10824 Register NewVReg3 = MRI->createVirtualRegister(TRC);
10825 BuildMI(*MBB, MI, dl, TII->get(ARM::tMOVi8), NewVReg3)
10826 .addReg(ARM::CPSR, RegState::Define)
10827 .addImm(1)
10829 Register NewVReg4 = MRI->createVirtualRegister(TRC);
10830 BuildMI(*MBB, MI, dl, TII->get(ARM::tORR), NewVReg4)
10831 .addReg(ARM::CPSR, RegState::Define)
10832 .addReg(NewVReg2, RegState::Kill)
10833 .addReg(NewVReg3, RegState::Kill)
10835 Register NewVReg5 = MRI->createVirtualRegister(TRC);
10836 BuildMI(*MBB, MI, dl, TII->get(ARM::tADDframe), NewVReg5)
10837 .addFrameIndex(FI)
10838 .addImm(36); // &jbuf[1] :: pc
10839 BuildMI(*MBB, MI, dl, TII->get(ARM::tSTRi))
10840 .addReg(NewVReg4, RegState::Kill)
10841 .addReg(NewVReg5, RegState::Kill)
10842 .addImm(0)
10843 .addMemOperand(FIMMOSt)
10845 } else {
10846 // Incoming value: jbuf
10847 // ldr r1, LCPI1_1
10848 // add r1, pc, r1
10849 // str r1, [$jbuf, #+4] ; &jbuf[1]
10850 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10851 BuildMI(*MBB, MI, dl, TII->get(ARM::LDRi12), NewVReg1)
10853 .addImm(0)
10854 .addMemOperand(CPMMO)
10856 Register NewVReg2 = MRI->createVirtualRegister(TRC);
10857 BuildMI(*MBB, MI, dl, TII->get(ARM::PICADD), NewVReg2)
10858 .addReg(NewVReg1, RegState::Kill)
10859 .addImm(PCLabelId)
10861 BuildMI(*MBB, MI, dl, TII->get(ARM::STRi12))
10862 .addReg(NewVReg2, RegState::Kill)
10863 .addFrameIndex(FI)
10864 .addImm(36) // &jbuf[1] :: pc
10865 .addMemOperand(FIMMOSt)
10867 }
10868}
10869
/// Build the SjLj exception dispatch code for the function containing \p MI.
///
/// Visible steps:
///  1. Collect, per call-site number, the EH-pad blocks registered for it.
///  2. Create an inline jump table over those landing pads.
///  3. Append three new blocks to the function: DispatchBB (marked as an EH
///     pad), DispContBB and TrapBB (a single trap instruction).
///  4. Call SetupEntryBlockForSjLj for \p MBB with the dispatch block.
///  5. In DispatchBB, load a value from the function-context frame index,
///     compare it with the number of landing pads and conditionally branch to
///     TrapBB; in DispContBB, jump through the table.
///  6. Redirect every invoke block's EH-pad successors to DispatchBB, clear
///     the EH-pad flag on the former landing pads, and erase \p MI.
///
/// NOTE(review): this listing is missing a number of source lines (several
/// builder chains have no visible terminating operand, and `MBBLPads` is used
/// without a visible declaration). Comments describe only what is visible.
10870void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
10871 MachineBasicBlock *MBB) const {
10872 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
10873 DebugLoc dl = MI.getDebugLoc();
10874 MachineFunction *MF = MBB->getParent();
10875 MachineRegisterInfo *MRI = &MF->getRegInfo();
10876 MachineFrameInfo &MFI = MF->getFrameInfo();
10877 int FI = MFI.getFunctionContextIndex();
10878
10879 const TargetRegisterClass *TRC = Subtarget->isThumb() ? &ARM::tGPRRegClass
10880 : &ARM::GPRnopcRegClass;
10881
10882 // Get a mapping of the call site numbers to all of the landing pads they're
10883 // associated with.
10884 DenseMap<unsigned, SmallVector<MachineBasicBlock*, 2>> CallSiteNumToLPad;
10885 unsigned MaxCSNum = 0;
10886 for (MachineBasicBlock &BB : *MF) {
10887 if (!BB.isEHPad())
10888 continue;
10889
10890 // FIXME: We should assert that the EH_LABEL is the first MI in the landing
10891 // pad.
10892 for (MachineInstr &II : BB) {
10893 if (!II.isEHLabel())
10894 continue;
10895
10896 MCSymbol *Sym = II.getOperand(0).getMCSymbol();
10897 if (!MF->hasCallSiteLandingPad(Sym)) continue;
10898
10899 SmallVectorImpl<unsigned> &CallSiteIdxs = MF->getCallSiteLandingPad(Sym);
10900 for (unsigned Idx : CallSiteIdxs) {
10901 CallSiteNumToLPad[Idx].push_back(&BB);
10902 MaxCSNum = std::max(MaxCSNum, Idx);
10903 }
// Only the first EH label with registered call sites is used per block.
10904 break;
10905 }
10906 }
10907
10908 // Get an ordered list of the machine basic blocks for the jump table.
10909 std::vector<MachineBasicBlock*> LPadList;
10910 SmallPtrSet<MachineBasicBlock*, 32> InvokeBBs;
10911 LPadList.reserve(CallSiteNumToLPad.size());
// Call-site numbers are walked from 1 through MaxCSNum, so the table order
// follows the call-site numbering; predecessors of each pad are remembered
// as the invoke blocks to rewrite later.
10912 for (unsigned I = 1; I <= MaxCSNum; ++I) {
10913 SmallVectorImpl<MachineBasicBlock*> &MBBList = CallSiteNumToLPad[I];
10914 for (MachineBasicBlock *MBB : MBBList) {
10915 LPadList.push_back(MBB);
10916 InvokeBBs.insert_range(MBB->predecessors());
10917 }
10918 }
10919
10920 assert(!LPadList.empty() &&
10921 "No landing pad destinations for the dispatch jump table!");
10922
10923 // Create the jump table and associated information.
10924 MachineJumpTableInfo *JTI =
10925 MF->getOrCreateJumpTableInfo(MachineJumpTableInfo::EK_Inline);
10926 unsigned MJTI = JTI->createJumpTableIndex(LPadList);
10927
10928 // Create the MBBs for the dispatch code.
10929
10930 // Shove the dispatch's address into the return slot in the function context.
10931 MachineBasicBlock *DispatchBB = MF->CreateMachineBasicBlock();
10932 DispatchBB->setIsEHPad();
10933
10934 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
10935
// TrapBB contains a single trap instruction (tTRAP for Thumb, TRAP otherwise).
10936 BuildMI(TrapBB, dl, TII->get(Subtarget->isThumb() ? ARM::tTRAP : ARM::TRAP));
10937 DispatchBB->addSuccessor(TrapBB);
10938
10939 MachineBasicBlock *DispContBB = MF->CreateMachineBasicBlock();
10940 DispatchBB->addSuccessor(DispContBB);
10941
10942 // Insert the new MBBs at the end of the function.
10943 MF->insert(MF->end(), DispatchBB);
10944 MF->insert(MF->end(), DispContBB);
10945 MF->insert(MF->end(), TrapBB);
10946
10947 // Insert code into the entry block that creates and registers the function
10948 // context.
10949 SetupEntryBlockForSjLj(MI, MBB, DispatchBB, FI);
10950
// NOTE(review): the arguments of this getMachineMemOperand call are not
// visible in this listing.
10951 MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
10954
10955 MachineInstrBuilder MIB;
10956 MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
10957
10958 const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
10959 const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
10960
10961 // Add a register mask with no preserved registers. This results in all
10962 // registers being marked as clobbered. This can't work if the dispatch block
10963 // is in a Thumb1 function and is linked with ARM code which uses the FP
10964 // registers, as there is no way to preserve the FP registers in Thumb1 mode.
10966
10967 bool IsPositionIndependent = isPositionIndependent();
10968 unsigned NumLPads = LPadList.size();
// --- Thumb2 lowering ---
10969 if (Subtarget->isThumb2()) {
10970 Register NewVReg1 = MRI->createVirtualRegister(TRC);
10971 BuildMI(DispatchBB, dl, TII->get(ARM::t2LDRi12), NewVReg1)
10972 .addFrameIndex(FI)
10973 .addImm(4)
10974 .addMemOperand(FIMMOLd)
10976
// Small landing-pad counts are compared as an immediate; larger counts are
// first materialized in a register (MOVi16, plus MOVTi16 when the upper
// half is non-zero) and compared register-to-register.
10977 if (NumLPads < 256) {
10978 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPri))
10979 .addReg(NewVReg1)
10980 .addImm(LPadList.size())
10982 } else {
10983 Register VReg1 = MRI->createVirtualRegister(TRC);
10984 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVi16), VReg1)
10985 .addImm(NumLPads & 0xFFFF)
10987
10988 unsigned VReg2 = VReg1;
10989 if ((NumLPads & 0xFFFF0000) != 0) {
10990 VReg2 = MRI->createVirtualRegister(TRC);
10991 BuildMI(DispatchBB, dl, TII->get(ARM::t2MOVTi16), VReg2)
10992 .addReg(VReg1)
10993 .addImm(NumLPads >> 16)
10995 }
10996
10997 BuildMI(DispatchBB, dl, TII->get(ARM::t2CMPrr))
10998 .addReg(NewVReg1)
10999 .addReg(VReg2)
11001 }
11002
// Conditional branch to the trap block; the condition-code operand is not
// visible in this listing.
11003 BuildMI(DispatchBB, dl, TII->get(ARM::t2Bcc))
11004 .addMBB(TrapBB)
11006 .addReg(ARM::CPSR);
11007
11008 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11009 BuildMI(DispContBB, dl, TII->get(ARM::t2LEApcrelJT), NewVReg3)
11010 .addJumpTableIndex(MJTI)
11012
11013 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11014 BuildMI(DispContBB, dl, TII->get(ARM::t2ADDrs), NewVReg4)
11015 .addReg(NewVReg3, RegState::Kill)
11016 .addReg(NewVReg1)
11019 .add(condCodeOp());
11020
11021 BuildMI(DispContBB, dl, TII->get(ARM::t2BR_JT))
11022 .addReg(NewVReg4, RegState::Kill)
11023 .addReg(NewVReg1)
11024 .addJumpTableIndex(MJTI);
// --- Thumb (non-Thumb2) lowering ---
11025 } else if (Subtarget->isThumb()) {
11026 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11027 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRspi), NewVReg1)
11028 .addFrameIndex(FI)
11029 .addImm(1)
11030 .addMemOperand(FIMMOLd)
11032
// Counts below 256 use an immediate compare; otherwise the count is placed
// in the constant pool and loaded with tLDRpci before a register compare.
11033 if (NumLPads < 256) {
11034 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPi8))
11035 .addReg(NewVReg1)
11036 .addImm(NumLPads)
11038 } else {
11039 MachineConstantPool *ConstantPool = MF->getConstantPool();
11040 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11041 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11042
11043 // MachineConstantPool wants an explicit alignment.
11044 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11045 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11046
11047 Register VReg1 = MRI->createVirtualRegister(TRC);
11048 BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
11049 .addReg(VReg1, RegState::Define)
11052 BuildMI(DispatchBB, dl, TII->get(ARM::tCMPr))
11053 .addReg(NewVReg1)
11054 .addReg(VReg1)
11056 }
11057
11058 BuildMI(DispatchBB, dl, TII->get(ARM::tBcc))
11059 .addMBB(TrapBB)
11061 .addReg(ARM::CPSR);
11062
// Shift the loaded value left by 2 and add the jump-table base address to
// form the address of the table entry, then load the entry.
11063 Register NewVReg2 = MRI->createVirtualRegister(TRC);
11064 BuildMI(DispContBB, dl, TII->get(ARM::tLSLri), NewVReg2)
11065 .addReg(ARM::CPSR, RegState::Define)
11066 .addReg(NewVReg1)
11067 .addImm(2)
11069
11070 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11071 BuildMI(DispContBB, dl, TII->get(ARM::tLEApcrelJT), NewVReg3)
11072 .addJumpTableIndex(MJTI)
11074
11075 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11076 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg4)
11077 .addReg(ARM::CPSR, RegState::Define)
11078 .addReg(NewVReg2, RegState::Kill)
11079 .addReg(NewVReg3)
11081
11082 MachineMemOperand *JTMMOLd =
11083 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11085
11086 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11087 BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
11088 .addReg(NewVReg4, RegState::Kill)
11089 .addImm(0)
11090 .addMemOperand(JTMMOLd)
11092
// In position-independent mode the loaded entry is added to the table base
// (NewVReg3) before branching; otherwise the loaded value is used directly.
11093 unsigned NewVReg6 = NewVReg5;
11094 if (IsPositionIndependent) {
11095 NewVReg6 = MRI->createVirtualRegister(TRC);
11096 BuildMI(DispContBB, dl, TII->get(ARM::tADDrr), NewVReg6)
11097 .addReg(ARM::CPSR, RegState::Define)
11098 .addReg(NewVReg5, RegState::Kill)
11099 .addReg(NewVReg3)
11101 }
11102
11103 BuildMI(DispContBB, dl, TII->get(ARM::tBR_JTr))
11104 .addReg(NewVReg6, RegState::Kill)
11105 .addJumpTableIndex(MJTI);
// --- ARM-mode lowering ---
11106 } else {
11107 Register NewVReg1 = MRI->createVirtualRegister(TRC);
11108 BuildMI(DispatchBB, dl, TII->get(ARM::LDRi12), NewVReg1)
11109 .addFrameIndex(FI)
11110 .addImm(4)
11111 .addMemOperand(FIMMOLd)
11113
// Three ways to compare against the landing-pad count: immediate compare
// (< 256), MOVi16-materialized register when v6T2 ops are available and the
// count fits 16 bits, or a constant-pool load otherwise.
11114 if (NumLPads < 256) {
11115 BuildMI(DispatchBB, dl, TII->get(ARM::CMPri))
11116 .addReg(NewVReg1)
11117 .addImm(NumLPads)
11119 } else if (Subtarget->hasV6T2Ops() && isUInt<16>(NumLPads)) {
11120 Register VReg1 = MRI->createVirtualRegister(TRC);
11121 BuildMI(DispatchBB, dl, TII->get(ARM::MOVi16), VReg1)
11122 .addImm(NumLPads & 0xFFFF)
11124
11125 unsigned VReg2 = VReg1;
11126 if ((NumLPads & 0xFFFF0000) != 0) {
11127 VReg2 = MRI->createVirtualRegister(TRC);
11128 BuildMI(DispatchBB, dl, TII->get(ARM::MOVTi16), VReg2)
11129 .addReg(VReg1)
11130 .addImm(NumLPads >> 16)
11132 }
11133
11134 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11135 .addReg(NewVReg1)
11136 .addReg(VReg2)
11138 } else {
11139 MachineConstantPool *ConstantPool = MF->getConstantPool();
11140 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11141 const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
11142
11143 // MachineConstantPool wants an explicit alignment.
11144 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11145 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11146
11147 Register VReg1 = MRI->createVirtualRegister(TRC);
11148 BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
11149 .addReg(VReg1, RegState::Define)
11151 .addImm(0)
11153 BuildMI(DispatchBB, dl, TII->get(ARM::CMPrr))
11154 .addReg(NewVReg1)
11155 .addReg(VReg1, RegState::Kill)
11157 }
11158
11159 BuildMI(DispatchBB, dl, TII->get(ARM::Bcc))
11160 .addMBB(TrapBB)
11162 .addReg(ARM::CPSR);
11163
11164 Register NewVReg3 = MRI->createVirtualRegister(TRC);
11165 BuildMI(DispContBB, dl, TII->get(ARM::MOVsi), NewVReg3)
11166 .addReg(NewVReg1)
11169 .add(condCodeOp());
11170 Register NewVReg4 = MRI->createVirtualRegister(TRC);
11171 BuildMI(DispContBB, dl, TII->get(ARM::LEApcrelJT), NewVReg4)
11172 .addJumpTableIndex(MJTI)
11174
11175 MachineMemOperand *JTMMOLd =
11176 MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
11178 Register NewVReg5 = MRI->createVirtualRegister(TRC);
11179 BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
11180 .addReg(NewVReg3, RegState::Kill)
11181 .addReg(NewVReg4)
11182 .addImm(0)
11183 .addMemOperand(JTMMOLd)
11185
// Position-independent code branches with BR_JTadd (entry plus table base);
// otherwise BR_JTr branches on the loaded entry alone.
11186 if (IsPositionIndependent) {
11187 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTadd))
11188 .addReg(NewVReg5, RegState::Kill)
11189 .addReg(NewVReg4)
11190 .addJumpTableIndex(MJTI);
11191 } else {
11192 BuildMI(DispContBB, dl, TII->get(ARM::BR_JTr))
11193 .addReg(NewVReg5, RegState::Kill)
11194 .addJumpTableIndex(MJTI);
11195 }
11196 }
11197
11198 // Add the jump table entries as successors to the MBB.
// SeenMBBs ensures each landing pad is added as a successor only once, even
// if it appears several times in LPadList.
11199 SmallPtrSet<MachineBasicBlock*, 8> SeenMBBs;
11200 for (MachineBasicBlock *CurMBB : LPadList) {
11201 if (SeenMBBs.insert(CurMBB).second)
11202 DispContBB->addSuccessor(CurMBB);
11203 }
11204
11205 // N.B. the order the invoke BBs are processed in doesn't matter here.
11206 const MCPhysReg *SavedRegs = RI.getCalleeSavedRegs(MF);
// NOTE(review): the declaration of MBBLPads (used below) is not visible in
// this listing.
11208 for (MachineBasicBlock *BB : InvokeBBs) {
11209
11210 // Remove the landing pad successor from the invoke block and replace it
11211 // with the new dispatch block.
11212 SmallVector<MachineBasicBlock*, 4> Successors(BB->successors());
11213 while (!Successors.empty()) {
11214 MachineBasicBlock *SMBB = Successors.pop_back_val();
11215 if (SMBB->isEHPad()) {
11216 BB->removeSuccessor(SMBB);
11217 MBBLPads.push_back(SMBB);
11218 }
11219 }
11220
// The new edge to the dispatch block gets zero probability, after which the
// successor probabilities are renormalized.
11221 BB->addSuccessor(DispatchBB, BranchProbability::getZero());
11222 BB->normalizeSuccProbs();
11223
11224 // Find the invoke call and mark all of the callee-saved registers as
11225 // 'implicit defined' so that they're spilled. This prevents code from
11226 // moving instructions to before the EH block, where they will never be
11227 // executed.
// The block is scanned backwards, so the last call instruction is the one
// that gets processed (the loop breaks after the first call found).
11229 II = BB->rbegin(), IE = BB->rend(); II != IE; ++II) {
11230 if (!II->isCall()) continue;
11231
// Record every register already mentioned by an operand of the call.
11232 DenseSet<unsigned> DefRegs;
11234 OI = II->operands_begin(), OE = II->operands_end();
11235 OI != OE; ++OI) {
11236 if (!OI->isReg()) continue;
11237 DefRegs.insert(OI->getReg());
11238 }
11239
11240 MachineInstrBuilder MIB(*MF, &*II);
11241
// Walk the zero-terminated callee-saved list, skipping registers outside
// the classes accepted for the current subtarget mode.
// NOTE(review): the statement guarded by the final DefRegs check is not
// visible in this listing.
11242 for (unsigned i = 0; SavedRegs[i] != 0; ++i) {
11243 unsigned Reg = SavedRegs[i];
11244 if (Subtarget->isThumb2() &&
11245 !ARM::tGPRRegClass.contains(Reg) &&
11246 !ARM::hGPRRegClass.contains(Reg))
11247 continue;
11248 if (Subtarget->isThumb1Only() && !ARM::tGPRRegClass.contains(Reg))
11249 continue;
11250 if (!Subtarget->isThumb() && !ARM::GPRRegClass.contains(Reg))
11251 continue;
11252 if (!DefRegs.contains(Reg))
11254 }
11255
11256 break;
11257 }
11258 }
11259
11260 // Mark all former landing pads as non-landing pads. The dispatch is the only
11261 // landing pad now.
11262 for (MachineBasicBlock *MBBLPad : MBBLPads)
11263 MBBLPad->setIsEHPad(false);
11264
11265 // The instruction is gone now.
11266 MI.eraseFromParent();
11267}
11268
/// Return the first successor of MBB that is different from Succ.
/// Reaching the end of the successor list without finding one is treated as
/// unreachable (the caller is expected to pass a block with two successors).
/// NOTE(review): the line carrying the return type, name and parameter list
/// is not present in this listing.
11269static
11271 for (MachineBasicBlock *S : MBB->successors())
11272 if (S != Succ)
11273 return S;
11274 llvm_unreachable("Expecting a BB with two successors!");
11275}
11276
11277/// Return the load opcode for a given load size. If load size >= 8,
11278/// neon opcode will be returned.
11279static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2) {
11280 if (LdSize >= 8)
11281 return LdSize == 16 ? ARM::VLD1q32wb_fixed
11282 : LdSize == 8 ? ARM::VLD1d32wb_fixed : 0;
11283 if (IsThumb1)
11284 return LdSize == 4 ? ARM::tLDRi
11285 : LdSize == 2 ? ARM::tLDRHi
11286 : LdSize == 1 ? ARM::tLDRBi : 0;
11287 if (IsThumb2)
11288 return LdSize == 4 ? ARM::t2LDR_POST
11289 : LdSize == 2 ? ARM::t2LDRH_POST
11290 : LdSize == 1 ? ARM::t2LDRB_POST : 0;
11291 return LdSize == 4 ? ARM::LDR_POST_IMM
11292 : LdSize == 2 ? ARM::LDRH_POST
11293 : LdSize == 1 ? ARM::LDRB_POST_IMM : 0;
11294}
11295
11296/// Return the store opcode for a given store size. If store size >= 8,
11297/// neon opcode will be returned.
11298static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2) {
11299 if (StSize >= 8)
11300 return StSize == 16 ? ARM::VST1q32wb_fixed
11301 : StSize == 8 ? ARM::VST1d32wb_fixed : 0;
11302 if (IsThumb1)
11303 return StSize == 4 ? ARM::tSTRi
11304 : StSize == 2 ? ARM::tSTRHi
11305 : StSize == 1 ? ARM::tSTRBi : 0;
11306 if (IsThumb2)
11307 return StSize == 4 ? ARM::t2STR_POST
11308 : StSize == 2 ? ARM::t2STRH_POST
11309 : StSize == 1 ? ARM::t2STRB_POST : 0;
11310 return StSize == 4 ? ARM::STR_POST_IMM
11311 : StSize == 2 ? ARM::STRH_POST
11312 : StSize == 1 ? ARM::STRB_POST_IMM : 0;
11313}
11314
11315/// Emit a post-increment load operation with given size. The instructions
11316/// will be added to BB at Pos.
///
/// The opcode comes from getLdOpcode(LdSize, IsThumb1, IsThumb2) and must be
/// non-zero. \p Data receives the loaded value, \p AddrIn is the address read
/// from, and \p AddrOut is defined by the emitted code. Four operand layouts
/// are produced: NEON (LdSize >= 8), Thumb1, Thumb2 and ARM.
///
/// NOTE(review): the first line of the signature and the last operand line of
/// each builder chain are not present in this listing.
11318 const TargetInstrInfo *TII, const DebugLoc &dl,
11319 unsigned LdSize, unsigned Data, unsigned AddrIn,
11320 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11321 unsigned LdOpc = getLdOpcode(LdSize, IsThumb1, IsThumb2);
11322 assert(LdOpc != 0 && "Should have a load opcode");
// NEON case: AddrOut is an extra def; the immediate operand is 0.
11323 if (LdSize >= 8) {
11324 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11325 .addReg(AddrOut, RegState::Define)
11326 .addReg(AddrIn)
11327 .addImm(0)
// Thumb1 case: a load at offset 0 followed by a separate tADDi8 that
// computes AddrOut from AddrIn and LdSize.
11329 } else if (IsThumb1) {
11330 // load + update AddrIn
11331 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11332 .addReg(AddrIn)
11333 .addImm(0)
11335 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11336 .add(t1CondCodeOp())
11337 .addReg(AddrIn)
11338 .addImm(LdSize)
// Thumb2 case: a single instruction with LdSize as the immediate operand.
11340 } else if (IsThumb2) {
11341 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11342 .addReg(AddrOut, RegState::Define)
11343 .addReg(AddrIn)
11344 .addImm(LdSize)
// ARM case: like Thumb2 but with an additional register operand set to 0.
11346 } else { // arm
11347 BuildMI(*BB, Pos, dl, TII->get(LdOpc), Data)
11348 .addReg(AddrOut, RegState::Define)
11349 .addReg(AddrIn)
11350 .addReg(0)
11351 .addImm(LdSize)
11353 }
11354}
11355
11356/// Emit a post-increment store operation with given size. The instructions
11357/// will be added to BB at Pos.
///
/// The opcode comes from getStOpcode(StSize, IsThumb1, IsThumb2) and must be
/// non-zero. \p Data is the value stored, \p AddrIn is the address written
/// to, and \p AddrOut is defined by the emitted code. Four operand layouts
/// are produced: NEON (StSize >= 8), Thumb1, Thumb2 and ARM.
///
/// NOTE(review): the first line of the signature and the last operand line of
/// each builder chain are not present in this listing.
11359 const TargetInstrInfo *TII, const DebugLoc &dl,
11360 unsigned StSize, unsigned Data, unsigned AddrIn,
11361 unsigned AddrOut, bool IsThumb1, bool IsThumb2) {
11362 unsigned StOpc = getStOpcode(StSize, IsThumb1, IsThumb2);
11363 assert(StOpc != 0 && "Should have a store opcode");
// NEON case: AddrOut is the def; operands are the address, an immediate 0
// and then the data register.
11364 if (StSize >= 8) {
11365 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11366 .addReg(AddrIn)
11367 .addImm(0)
11368 .addReg(Data)
// Thumb1 case: a store at offset 0 followed by a separate tADDi8 that
// computes AddrOut from AddrIn and StSize.
11370 } else if (IsThumb1) {
11371 // store + update AddrIn
11372 BuildMI(*BB, Pos, dl, TII->get(StOpc))
11373 .addReg(Data)
11374 .addReg(AddrIn)
11375 .addImm(0)
11377 BuildMI(*BB, Pos, dl, TII->get(ARM::tADDi8), AddrOut)
11378 .add(t1CondCodeOp())
11379 .addReg(AddrIn)
11380 .addImm(StSize)
// Thumb2 case: a single instruction with StSize as the immediate operand.
11382 } else if (IsThumb2) {
11383 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11384 .addReg(Data)
11385 .addReg(AddrIn)
11386 .addImm(StSize)
// ARM case: like Thumb2 but with an additional register operand set to 0.
11388 } else { // arm
11389 BuildMI(*BB, Pos, dl, TII->get(StOpc), AddrOut)
11390 .addReg(Data)
11391 .addReg(AddrIn)
11392 .addReg(0)
11393 .addImm(StSize)
11395 }
11396}
11397
/// Expand the struct-byval copy pseudo \p MI located in \p BB.
///
/// A copy unit size is chosen from the alignment operand (1, 2, 4, or 8/16
/// with NEON). Copies no larger than Subtarget->getMaxInlineSizeThreshold()
/// are emitted as straight-line post-increment load/store pairs before \p MI
/// and \p BB is returned. Larger copies are emitted as a loop: a new loop
/// block and exit block are created, the loop copies one unit per iteration,
/// and the exit block (which is returned) copies the remaining bytes one at
/// a time. \p MI is erased in both cases.
///
/// NOTE(review): the return-type line, the declaration of `It`, and a few
/// other source lines are not present in this listing.
11399ARMTargetLowering::EmitStructByval(MachineInstr &MI,
11400 MachineBasicBlock *BB) const {
11401 // This pseudo instruction has 4 operands: dst, src, size, alignment
11402 // We expand it to a loop if size > Subtarget->getMaxInlineSizeThreshold().
11403 // Otherwise, we will generate unrolled scalar copies.
11404 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11405 const BasicBlock *LLVM_BB = BB->getBasicBlock();
11407
11408 Register dest = MI.getOperand(0).getReg();
11409 Register src = MI.getOperand(1).getReg();
11410 unsigned SizeVal = MI.getOperand(2).getImm();
11411 unsigned Alignment = MI.getOperand(3).getImm();
11412 DebugLoc dl = MI.getDebugLoc();
11413
11414 MachineFunction *MF = BB->getParent();
11415 MachineRegisterInfo &MRI = MF->getRegInfo();
11416 unsigned UnitSize = 0;
11417 const TargetRegisterClass *TRC = nullptr;
11418 const TargetRegisterClass *VecTRC = nullptr;
11419
11420 bool IsThumb1 = Subtarget->isThumb1Only();
11421 bool IsThumb2 = Subtarget->isThumb2();
11422 bool IsThumb = Subtarget->isThumb();
11423
// Pick the copy unit: odd alignment -> 1 byte, alignment with bit 1 set ->
// 2 bytes, otherwise 16 or 8 bytes via NEON when allowed and the size is
// large enough, falling back to 4 bytes.
11424 if (Alignment & 1) {
11425 UnitSize = 1;
11426 } else if (Alignment & 2) {
11427 UnitSize = 2;
11428 } else {
11429 // Check whether we can use NEON instructions.
11430 if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
11431 Subtarget->hasNEON()) {
11432 if ((Alignment % 16 == 0) && SizeVal >= 16)
11433 UnitSize = 16;
11434 else if ((Alignment % 8 == 0) && SizeVal >= 8)
11435 UnitSize = 8;
11436 }
11437 // Can't use NEON instructions.
11438 if (UnitSize == 0)
11439 UnitSize = 4;
11440 }
11441
11442 // Select the correct opcode and register class for unit size load/store
11443 bool IsNeon = UnitSize >= 8;
11444 TRC = IsThumb ? &ARM::tGPRRegClass : &ARM::GPRRegClass;
11445 if (IsNeon)
11446 VecTRC = UnitSize == 16 ? &ARM::DPairRegClass
11447 : UnitSize == 8 ? &ARM::DPRRegClass
11448 : nullptr;
11449
// LoopSize is the part of the copy that is a whole number of units;
// BytesLeft is the remainder, copied byte by byte afterwards.
11450 unsigned BytesLeft = SizeVal % UnitSize;
11451 unsigned LoopSize = SizeVal - BytesLeft;
11452
11453 if (SizeVal <= Subtarget->getMaxInlineSizeThreshold()) {
11454 // Use LDR and STR to copy.
11455 // [scratch, srcOut] = LDR_POST(srcIn, UnitSize)
11456 // [destOut] = STR_POST(scratch, destIn, UnitSize)
11457 unsigned srcIn = src;
11458 unsigned destIn = dest;
11459 for (unsigned i = 0; i < LoopSize; i+=UnitSize) {
11460 Register srcOut = MRI.createVirtualRegister(TRC);
11461 Register destOut = MRI.createVirtualRegister(TRC);
11462 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11463 emitPostLd(BB, MI, TII, dl, UnitSize, scratch, srcIn, srcOut,
11464 IsThumb1, IsThumb2);
11465 emitPostSt(BB, MI, TII, dl, UnitSize, scratch, destIn, destOut,
11466 IsThumb1, IsThumb2);
11467 srcIn = srcOut;
11468 destIn = destOut;
11469 }
11470
11471 // Handle the leftover bytes with LDRB and STRB.
11472 // [scratch, srcOut] = LDRB_POST(srcIn, 1)
11473 // [destOut] = STRB_POST(scratch, destIn, 1)
11474 for (unsigned i = 0; i < BytesLeft; i++) {
11475 Register srcOut = MRI.createVirtualRegister(TRC);
11476 Register destOut = MRI.createVirtualRegister(TRC);
11477 Register scratch = MRI.createVirtualRegister(TRC);
11478 emitPostLd(BB, MI, TII, dl, 1, scratch, srcIn, srcOut,
11479 IsThumb1, IsThumb2);
11480 emitPostSt(BB, MI, TII, dl, 1, scratch, destIn, destOut,
11481 IsThumb1, IsThumb2);
11482 srcIn = srcOut;
11483 destIn = destOut;
11484 }
11485 MI.eraseFromParent(); // The instruction is gone now.
11486 return BB;
11487 }
11488
11489 // Expand the pseudo op to a loop.
11490 // thisMBB:
11491 // ...
11492 // movw varEnd, # --> with thumb2
11493 // movt varEnd, #
11494 // ldrcp varEnd, idx --> without thumb2
11495 // fallthrough --> loopMBB
11496 // loopMBB:
11497 // PHI varPhi, varEnd, varLoop
11498 // PHI srcPhi, src, srcLoop
11499 // PHI destPhi, dst, destLoop
11500 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11501 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11502 // subs varLoop, varPhi, #UnitSize
11503 // bne loopMBB
11504 // fallthrough --> exitMBB
11505 // exitMBB:
11506 // epilogue to handle left-over bytes
11507 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11508 // [destOut] = STRB_POST(scratch, destLoop, 1)
11509 MachineBasicBlock *loopMBB = MF->CreateMachineBasicBlock(LLVM_BB);
11510 MachineBasicBlock *exitMBB = MF->CreateMachineBasicBlock(LLVM_BB);
// NOTE(review): `It` is declared on a line not present in this listing.
11511 MF->insert(It, loopMBB);
11512 MF->insert(It, exitMBB);
11513
11514 // Set the call frame size on entry to the new basic blocks.
11515 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
11516 loopMBB->setCallFrameSize(CallFrameSize);
11517 exitMBB->setCallFrameSize(CallFrameSize);
11518
11519 // Transfer the remainder of BB and its successor edges to exitMBB.
11520 exitMBB->splice(exitMBB->begin(), BB,
11521 std::next(MachineBasicBlock::iterator(MI)), BB->end());
11523
11524 // Load an immediate to varEnd.
// Three strategies: a 32-bit move-immediate pseudo when movt is usable, the
// Thumb tMOVi32imm pseudo for execute-only code, or a constant-pool load.
11525 Register varEnd = MRI.createVirtualRegister(TRC);
11526 if (Subtarget->useMovt()) {
11527 BuildMI(BB, dl, TII->get(IsThumb ? ARM::t2MOVi32imm : ARM::MOVi32imm),
11528 varEnd)
11529 .addImm(LoopSize);
11530 } else if (Subtarget->genExecuteOnly()) {
11531 assert(IsThumb && "Non-thumb expected to have used movt");
11532 BuildMI(BB, dl, TII->get(ARM::tMOVi32imm), varEnd).addImm(LoopSize);
11533 } else {
11534 MachineConstantPool *ConstantPool = MF->getConstantPool();
11535 Type *Int32Ty = Type::getInt32Ty(MF->getFunction().getContext());
11536 const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
11537
11538 // MachineConstantPool wants an explicit alignment.
11539 Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
11540 unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
11541 MachineMemOperand *CPMMO =
11544
11545 if (IsThumb)
11546 BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
11547 .addReg(varEnd, RegState::Define)
11550 .addMemOperand(CPMMO);
11551 else
11552 BuildMI(*BB, MI, dl, TII->get(ARM::LDRcp))
11553 .addReg(varEnd, RegState::Define)
11555 .addImm(0)
11557 .addMemOperand(CPMMO);
11558 }
11559 BB->addSuccessor(loopMBB);
11560
11561 // Generate the loop body:
11562 // varPhi = PHI(varLoop, varEnd)
11563 // srcPhi = PHI(srcLoop, src)
11564 // destPhi = PHI(destLoop, dst)
11565 MachineBasicBlock *entryBB = BB;
11566 BB = loopMBB;
11567 Register varLoop = MRI.createVirtualRegister(TRC);
11568 Register varPhi = MRI.createVirtualRegister(TRC);
11569 Register srcLoop = MRI.createVirtualRegister(TRC);
11570 Register srcPhi = MRI.createVirtualRegister(TRC);
11571 Register destLoop = MRI.createVirtualRegister(TRC);
11572 Register destPhi = MRI.createVirtualRegister(TRC);
11573
// Each PHI takes the loop-carried value from loopMBB and the initial value
// from the entry block.
11574 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), varPhi)
11575 .addReg(varLoop).addMBB(loopMBB)
11576 .addReg(varEnd).addMBB(entryBB);
11577 BuildMI(BB, dl, TII->get(ARM::PHI), srcPhi)
11578 .addReg(srcLoop).addMBB(loopMBB)
11579 .addReg(src).addMBB(entryBB);
11580 BuildMI(BB, dl, TII->get(ARM::PHI), destPhi)
11581 .addReg(destLoop).addMBB(loopMBB)
11582 .addReg(dest).addMBB(entryBB);
11583
11584 // [scratch, srcLoop] = LDR_POST(srcPhi, UnitSize)
11585 // [destLoop] = STR_POST(scratch, destPhi, UnitSize)
11586 Register scratch = MRI.createVirtualRegister(IsNeon ? VecTRC : TRC);
11587 emitPostLd(BB, BB->end(), TII, dl, UnitSize, scratch, srcPhi, srcLoop,
11588 IsThumb1, IsThumb2);
11589 emitPostSt(BB, BB->end(), TII, dl, UnitSize, scratch, destPhi, destLoop,
11590 IsThumb1, IsThumb2);
11591
11592 // Decrement loop variable by UnitSize.
11593 if (IsThumb1) {
11594 BuildMI(*BB, BB->end(), dl, TII->get(ARM::tSUBi8), varLoop)
11595 .add(t1CondCodeOp())
11596 .addReg(varPhi)
11597 .addImm(UnitSize)
11599 } else {
11600 MachineInstrBuilder MIB =
11601 BuildMI(*BB, BB->end(), dl,
11602 TII->get(IsThumb2 ? ARM::t2SUBri : ARM::SUBri), varLoop);
11603 MIB.addReg(varPhi)
11604 .addImm(UnitSize)
11606 .add(condCodeOp());
// Rewrite operand 5 (added via condCodeOp() above) into a def of CPSR so
// the subtraction sets the flags read by the branch that follows.
11607 MIB->getOperand(5).setReg(ARM::CPSR);
11608 MIB->getOperand(5).setIsDef(true);
11609 }
// Branch back to the loop header while the result is non-zero (NE).
11610 BuildMI(*BB, BB->end(), dl,
11611 TII->get(IsThumb1 ? ARM::tBcc : IsThumb2 ? ARM::t2Bcc : ARM::Bcc))
11612 .addMBB(loopMBB).addImm(ARMCC::NE).addReg(ARM::CPSR);
11613
11614 // loopMBB can loop back to loopMBB or fall through to exitMBB.
11615 BB->addSuccessor(loopMBB);
11616 BB->addSuccessor(exitMBB);
11617
11618 // Add epilogue to handle BytesLeft.
11619 BB = exitMBB;
// All epilogue copies are inserted before the instruction that was first
// in exitMBB when this iterator was taken.
11620 auto StartOfExit = exitMBB->begin();
11621
11622 // [scratch, srcOut] = LDRB_POST(srcLoop, 1)
11623 // [destOut] = STRB_POST(scratch, destLoop, 1)
11624 unsigned srcIn = srcLoop;
11625 unsigned destIn = destLoop;
11626 for (unsigned i = 0; i < BytesLeft; i++) {
11627 Register srcOut = MRI.createVirtualRegister(TRC);
11628 Register destOut = MRI.createVirtualRegister(TRC);
11629 Register scratch = MRI.createVirtualRegister(TRC);
11630 emitPostLd(BB, StartOfExit, TII, dl, 1, scratch, srcIn, srcOut,
11631 IsThumb1, IsThumb2);
11632 emitPostSt(BB, StartOfExit, TII, dl, 1, scratch, destIn, destOut,
11633 IsThumb1, IsThumb2);
11634 srcIn = srcOut;
11635 destIn = destOut;
11636 }
11637
11638 MI.eraseFromParent(); // The instruction is gone now.
11639 return BB;
11640}
11641
/// Expand the stack-probe pseudo \p MI in \p MBB into a call to the
/// STACK_PROBE libcall followed by `SP = SP - R4`.
///
/// Asserts that the target is Windows and the subtarget is Thumb2, and
/// reports a fatal usage error if no libcall implementation is available.
/// The call sequence depends on the code model. \p MI is erased and \p MBB
/// is returned.
///
/// NOTE(review): the return-type line and several operand lines of the call
/// builders are not present in this listing.
11643ARMTargetLowering::EmitLowered__chkstk(MachineInstr &MI,
11644 MachineBasicBlock *MBB) const {
11645 const TargetMachine &TM = getTargetMachine();
11646 const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
11647 DebugLoc DL = MI.getDebugLoc();
11648
11649 assert(TM.getTargetTriple().isOSWindows() &&
11650 "__chkstk is only supported on Windows");
11651 assert(Subtarget->isThumb2() && "Windows on ARM requires Thumb-2 mode");
11652
11653 // __chkstk takes the number of words to allocate on the stack in R4, and
11654 // returns the stack adjustment in number of bytes in R4. This will not
11655 // clobber any other registers (other than the obvious lr).
11656 //
11657 // Although, technically, IP should be considered a register which may be
11658 // clobbered, the call itself will not touch it. Windows on ARM is a pure
11659 // thumb-2 environment, so there is no interworking required. As a result, we
11660 // do not expect a veneer to be emitted by the linker, clobbering IP.
11661 //
11662 // Each module receives its own copy of __chkstk, so no import thunk is
11663 // required, again, ensuring that IP is not clobbered.
11664 //
11665 // Finally, although some linkers may theoretically provide a trampoline for
11666 // out of range calls (which is quite common due to a 32M range limitation of
11667 // branches for Thumb), we can generate the long-call version via
11668 // -mcmodel=large, alleviating the need for the trampoline which may clobber
11669 // IP.
11670
11671 RTLIB::LibcallImpl ChkStkLibcall = getLibcallImpl(RTLIB::STACK_PROBE);
11672 if (ChkStkLibcall == RTLIB::Unsupported)
11673 reportFatalUsageError("no available implementation of __chkstk");
11674
11675 const char *ChkStk = getLibcallImplName(ChkStkLibcall).data();
11676 switch (TM.getCodeModel()) {
11677 case CodeModel::Tiny:
11678 llvm_unreachable("Tiny code model not available on ARM.");
// Small, Medium and Kernel share one path: a tBL to the external symbol.
11679 case CodeModel::Small:
11680 case CodeModel::Medium:
11681 case CodeModel::Kernel:
11682 BuildMI(*MBB, MI, DL, TII.get(ARM::tBL))
11684 .addExternalSymbol(ChkStk)
11687 .addReg(ARM::R12,
11689 .addReg(ARM::CPSR,
11691 break;
// Large: the symbol address is first materialized into a virtual register
// with t2MOVi32imm.
// NOTE(review): the instruction that consumes Reg is not visible in this
// listing; only its trailing R12/CPSR operands remain.
11692 case CodeModel::Large: {
11693 MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
11694 Register Reg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11695
11696 BuildMI(*MBB, MI, DL, TII.get(ARM::t2MOVi32imm), Reg)
11697 .addExternalSymbol(ChkStk);
11703 .addReg(ARM::R12,
11705 .addReg(ARM::CPSR,
11707 break;
11708 }
11709 }
11710
// Apply the adjustment: SP = SP - R4, killing both inputs.
11711 BuildMI(*MBB, MI, DL, TII.get(ARM::t2SUBrr), ARM::SP)
11712 .addReg(ARM::SP, RegState::Kill)
11713 .addReg(ARM::R4, RegState::Kill)
11716 .add(condCodeOp());
11717
11718 MI.eraseFromParent();
11719 return MBB;
11720}
11721
/// Expand the divide-by-zero check pseudo \p MI in \p MBB.
///
/// Everything after \p MI is moved into a new continuation block placed
/// right after \p MBB, and a new trap block holding `t__brkdiv0` is appended
/// to the function. Before \p MI, operand 0 is compared against 0 and a
/// conditional branch to the trap block is emitted. \p MI is erased and the
/// continuation block is returned.
///
/// NOTE(review): the return-type line and the condition-code operands of the
/// compare and branch are not present in this listing; presumably the branch
/// is taken when the operand equals zero — confirm against the full source.
11723ARMTargetLowering::EmitLowered__dbzchk(MachineInstr &MI,
11724 MachineBasicBlock *MBB) const {
11725 DebugLoc DL = MI.getDebugLoc();
11726 MachineFunction *MF = MBB->getParent();
11727 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11728
// Split: ContBB receives the instructions that followed MI.
11729 MachineBasicBlock *ContBB = MF->CreateMachineBasicBlock();
11730 MF->insert(++MBB->getIterator(), ContBB);
11731 ContBB->splice(ContBB->begin(), MBB,
11732 std::next(MachineBasicBlock::iterator(MI)), MBB->end());
11734 MBB->addSuccessor(ContBB);
11735
// TrapBB is appended at the end of the function.
11736 MachineBasicBlock *TrapBB = MF->CreateMachineBasicBlock();
11737 BuildMI(TrapBB, DL, TII->get(ARM::t__brkdiv0));
11738 MF->push_back(TrapBB);
11739 MBB->addSuccessor(TrapBB);
11740
// Compare the checked value with zero, then branch on the resulting flags.
11741 BuildMI(*MBB, MI, DL, TII->get(ARM::tCMPi8))
11742 .addReg(MI.getOperand(0).getReg())
11743 .addImm(0)
11745 BuildMI(*MBB, MI, DL, TII->get(ARM::t2Bcc))
11746 .addMBB(TrapBB)
11748 .addReg(ARM::CPSR);
11749
11750 MI.eraseFromParent();
11751 return ContBB;
11752}
11753
11754// The CPSR operand of SelectItr might be missing a kill marker
11755// because there were multiple uses of CPSR, and ISel didn't know
11756// which to mark. Figure out whether SelectItr should have had a
11757// kill marker, and set it if it should. Returns the correct kill
11758// marker value.
//
// Returns false (leaving SelectItr untouched) when a later instruction in BB
// reads CPSR before any redefinition, or when the end of BB is reached and
// CPSR is live into a successor. Otherwise marks CPSR as killed on SelectItr
// and returns true.
//
// NOTE(review): the first lines of the signature are not present in this
// listing.
11761 const TargetRegisterInfo* TRI) {
11762 // Scan forward through BB for a use/def of CPSR.
11763 MachineBasicBlock::iterator miI(std::next(SelectItr));
11764 for (MachineBasicBlock::iterator miE = BB->end(); miI != miE; ++miI) {
11765 const MachineInstr& mi = *miI;
// The read check comes first, so an instruction that both reads and
// defines CPSR counts as a use.
11766 if (mi.readsRegister(ARM::CPSR, /*TRI=*/nullptr))
11767 return false;
11768 if (mi.definesRegister(ARM::CPSR, /*TRI=*/nullptr))
11769 break; // Should have kill-flag - update below.
11770 }
11771
11772 // If we hit the end of the block, check whether CPSR is live into a
11773 // successor.
11774 if (miI == BB->end()) {
11775 for (MachineBasicBlock *Succ : BB->successors())
11776 if (Succ->isLiveIn(ARM::CPSR))
11777 return false;
11778 }
11779
11780 // We found a def, or hit the end of the basic block and CPSR wasn't live
11781 // out. SelectMI should have a kill flag on CPSR.
11782 SelectItr->addRegisterKilled(ARM::CPSR, TRI);
11783 return true;
11784}
11785
11786/// Adds logic in loop entry MBB to calculate loop iteration count and adds
11787/// t2WhileLoopSetup and t2WhileLoopStart to generate WLS loop
///
/// All instructions are appended to TpEntry. The iteration count is computed
/// from OpSizeReg as (n + 15) >> 4 and fed to t2WhileLoopSetup;
/// t2WhileLoopStart targets TpExit and an unconditional t2B targets
/// TpLoopBody. Returns the register defined by t2WhileLoopSetup.
///
/// NOTE(review): the first line of the signature and a few operand lines are
/// not present in this listing.
11789 MachineBasicBlock *TpLoopBody,
11790 MachineBasicBlock *TpExit, Register OpSizeReg,
11791 const TargetInstrInfo *TII, DebugLoc Dl,
11792 MachineRegisterInfo &MRI) {
11793 // Calculates loop iteration count = ceil(n/16) = (n + 15) >> 4.
11794 Register AddDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11795 BuildMI(TpEntry, Dl, TII->get(ARM::t2ADDri), AddDestReg)
11796 .addUse(OpSizeReg)
11797 .addImm(15)
11799 .addReg(0);
11800
11801 Register LsrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11802 BuildMI(TpEntry, Dl, TII->get(ARM::t2LSRri), LsrDestReg)
11803 .addUse(AddDestReg, RegState::Kill)
11804 .addImm(4)
11806 .addReg(0);
11807
// The iteration count is created in the GPRlr register class.
11808 Register TotalIterationsReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11809 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopSetup), TotalIterationsReg)
11810 .addUse(LsrDestReg, RegState::Kill);
11811
// t2WhileLoopStart carries TpExit as its block operand.
11812 BuildMI(TpEntry, Dl, TII->get(ARM::t2WhileLoopStart))
11813 .addUse(TotalIterationsReg)
11814 .addMBB(TpExit);
11815
// Unconditional branch into the loop body.
11816 BuildMI(TpEntry, Dl, TII->get(ARM::t2B))
11817 .addMBB(TpLoopBody)
11819
11820 return TotalIterationsReg;
11821}
11822
11823/// Adds logic in the loopBody MBB to generate MVE_VCTP, t2LoopDec and
11824/// t2LoopEnd. These are used by later passes to generate tail predicated
11825/// loops.
///
/// When IsMemcpy is false no load is emitted and OpSrcReg itself is used as
/// the value operand of the store. The load and store are built with an
/// immediate of 16, the predication counter is reduced by 16 and the loop
/// counter by 1 on every pass through the body.
11826static void genTPLoopBody(MachineBasicBlock *TpLoopBody,
11827 MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit,
11828 const TargetInstrInfo *TII, DebugLoc Dl,
11829 MachineRegisterInfo &MRI, Register OpSrcReg,
11830 Register OpDestReg, Register ElementCountReg,
11831 Register TotalIterationsReg, bool IsMemcpy) {
11832 // First insert 4 PHI nodes for: Current pointer to Src (if memcpy), Dest
11833 // array, loop iteration counter, predication counter.
// Every PHI takes its initial value on the edge from TpEntry and its updated
// value on the back edge from TpLoopBody.
11834
11835 Register SrcPhiReg, CurrSrcReg;
11836 if (IsMemcpy) {
11837 // Current position in the src array
11838 SrcPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11839 CurrSrcReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11840 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), SrcPhiReg)
11841 .addUse(OpSrcReg)
11842 .addMBB(TpEntry)
11843 .addUse(CurrSrcReg)
11844 .addMBB(TpLoopBody);
11845 }
11846
11847 // Current position in the dest array
11848 Register DestPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11849 Register CurrDestReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11850 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), DestPhiReg)
11851 .addUse(OpDestReg)
11852 .addMBB(TpEntry)
11853 .addUse(CurrDestReg)
11854 .addMBB(TpLoopBody);
11855
11856 // Current loop counter
11857 Register LoopCounterPhiReg = MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11858 Register RemainingLoopIterationsReg =
11859 MRI.createVirtualRegister(&ARM::GPRlrRegClass);
11860 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), LoopCounterPhiReg)
11861 .addUse(TotalIterationsReg)
11862 .addMBB(TpEntry)
11863 .addUse(RemainingLoopIterationsReg)
11864 .addMBB(TpLoopBody);
11865
11866 // Predication counter
11867 Register PredCounterPhiReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11868 Register RemainingElementsReg = MRI.createVirtualRegister(&ARM::rGPRRegClass);
11869 BuildMI(TpLoopBody, Dl, TII->get(ARM::PHI), PredCounterPhiReg)
11870 .addUse(ElementCountReg)
11871 .addMBB(TpEntry)
11872 .addUse(RemainingElementsReg)
11873 .addMBB(TpLoopBody);
11874
11875 // Pass predication counter to VCTP
11876 Register VccrReg = MRI.createVirtualRegister(&ARM::VCCRRegClass);
11877 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VCTP8), VccrReg)
11878 .addUse(PredCounterPhiReg)
11880 .addReg(0)
11881 .addReg(0);
11882
// RemainingElementsReg = PredCounterPhiReg - 16; it feeds the predication
// counter PHI on the back edge.
11883 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2SUBri), RemainingElementsReg)
11884 .addUse(PredCounterPhiReg)
11885 .addImm(16)
11887 .addReg(0);
11888
11889 // VLDRB (only if memcpy) and VSTRB instructions, predicated using VPR
11890 Register SrcValueReg;
11891 if (IsMemcpy) {
11892 SrcValueReg = MRI.createVirtualRegister(&ARM::MQPRRegClass);
11893 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VLDRBU8_post))
11894 .addDef(CurrSrcReg)
11895 .addDef(SrcValueReg)
11896 .addReg(SrcPhiReg)
11897 .addImm(16)
11899 .addUse(VccrReg)
11900 .addReg(0);
11901 } else
// Not a memcpy: the incoming OpSrcReg is stored directly.
11902 SrcValueReg = OpSrcReg;
11903
11904 BuildMI(TpLoopBody, Dl, TII->get(ARM::MVE_VSTRBU8_post))
11905 .addDef(CurrDestReg)
11906 .addUse(SrcValueReg)
11907 .addReg(DestPhiReg)
11908 .addImm(16)
11910 .addUse(VccrReg)
11911 .addReg(0);
11912
11913 // Add the pseudoInstrs for decrementing the loop counter and marking the
11914 // end: t2LoopDec and t2LoopEnd
11915 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopDec), RemainingLoopIterationsReg)
11916 .addUse(LoopCounterPhiReg)
11917 .addImm(1);
11918
11919 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2LoopEnd))
11920 .addUse(RemainingLoopIterationsReg)
11921 .addMBB(TpLoopBody);
11922
// Explicit branch to the exit block after the loop-end pseudo.
11923 BuildMI(TpLoopBody, Dl, TII->get(ARM::t2B))
11924 .addMBB(TpExit)
11926}
11927
11929 // KCFI is supported in all ARM/Thumb modes, so the answer is always true.
11930 return true;
11931}
11932
11936 const TargetInstrInfo *TII) const {
// Builds a KCFI_CHECK pseudo at MBBI for the call instruction MBBI refers to.
// The pseudo takes the call's target register and the call's CFI type as
// operands; the ARM, Thumb1 or Thumb2 variant is picked from the subtarget.
// Returns the newly built instruction.
11937 assert(MBBI->isCall() && MBBI->getCFIType() &&
11938 "Invalid call instruction for a KCFI check");
11939
// Locate the operand that holds the call target; its index depends on the
// call opcode.
11940 MachineOperand *TargetOp = nullptr;
11941 switch (MBBI->getOpcode()) {
11942 // ARM mode opcodes
11943 case ARM::BLX:
11944 case ARM::BLX_pred:
11945 case ARM::BLX_noip:
11946 case ARM::BLX_pred_noip:
11947 case ARM::BX_CALL:
11948 TargetOp = &MBBI->getOperand(0);
11949 break;
11950 case ARM::TCRETURNri:
11951 case ARM::TCRETURNrinotr12:
11952 case ARM::TAILJMPr:
11953 case ARM::TAILJMPr4:
11954 TargetOp = &MBBI->getOperand(0);
11955 break;
11956 // Thumb mode opcodes (Thumb1 and Thumb2)
11957 // Note: Most Thumb call instructions have predicate operands before the
11958 // target register. Format: tBLXr pred, predreg, target_register, ...
11959 case ARM::tBLXr: // Thumb1/Thumb2: BLX register (requires V5T)
11960 case ARM::tBLXr_noip: // Thumb1/Thumb2: BLX register, no IP clobber
11961 case ARM::tBX_CALL: // Thumb1 only: BX call (push LR, BX)
11962 TargetOp = &MBBI->getOperand(2);
11963 break;
11964 // Tail call instructions don't have predicates, target is operand 0
11965 case ARM::tTAILJMPr: // Thumb1/Thumb2: Tail call via register
11966 TargetOp = &MBBI->getOperand(0);
11967 break;
11968 default:
11969 llvm_unreachable("Unexpected CFI call opcode");
11970 }
11971
11972 assert(TargetOp && TargetOp->isReg() && "Invalid target operand");
// Clear the renamable flag on the target operand, whose register the check
// below also reads.
11973 TargetOp->setIsRenamable(false);
11974
11975 // Select the appropriate KCFI_CHECK variant based on the instruction set
11976 unsigned KCFICheckOpcode;
11977 if (Subtarget->isThumb()) {
11978 if (Subtarget->isThumb2()) {
11979 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb2;
11980 } else {
11981 KCFICheckOpcode = ARM::KCFI_CHECK_Thumb1;
11982 }
11983 } else {
11984 KCFICheckOpcode = ARM::KCFI_CHECK_ARM;
11985 }
11986
11987 return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(KCFICheckOpcode))
11988 .addReg(TargetOp->getReg())
11989 .addImm(MBBI->getCFIType())
11990 .getInstr();
11991}
11992
11995 MachineBasicBlock *BB) const {
// Dispatches on MI's opcode to expand a pseudo instruction. A case either
// retargets MI in place with setDesc(), builds replacement instructions and
// erases MI, or hands off to an Emit* helper. Each case returns a
// MachineBasicBlock: BB itself, or, where the expansion splits the block, the
// block that received the instructions that followed MI.
11996 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
11997 DebugLoc dl = MI.getDebugLoc();
11998 bool isThumb2 = Subtarget->isThumb2();
11999 switch (MI.getOpcode()) {
12000 default: {
12001 MI.print(errs());
12002 llvm_unreachable("Unexpected instr type to insert");
12003 }
12004
12005 // Thumb1 post-indexed loads are really just single-register LDMs.
12006 case ARM::tLDR_postidx: {
12007 MachineOperand Def(MI.getOperand(1));
12008 BuildMI(*BB, MI, dl, TII->get(ARM::tLDMIA_UPD))
12009 .add(Def) // Rn_wb
12010 .add(MI.getOperand(2)) // Rn
12011 .add(MI.getOperand(3)) // PredImm
12012 .add(MI.getOperand(4)) // PredReg
12013 .add(MI.getOperand(0)) // Rt
12014 .cloneMemRefs(MI);
12015 MI.eraseFromParent();
12016 return BB;
12017 }
12018
12019 case ARM::MVE_MEMCPYLOOPINST:
12020 case ARM::MVE_MEMSETLOOPINST: {
12021
12022 // Transformation below expands MVE_MEMCPYLOOPINST/MVE_MEMSETLOOPINST Pseudo
12023 // into a Tail Predicated (TP) Loop. It adds the instructions to calculate
12024 // the iteration count (= ceil(size_in_bytes/16)) in the TP entry block and
12025 // adds the relevant instructions in the TP loop Body for generation of a
12026 // WLSTP loop.
12027
12028 // Below is relevant portion of the CFG after the transformation.
12029 // The Machine Basic Blocks are shown along with branch conditions (in
12030 // brackets). Note that TP entry/exit MBBs depict the entry/exit of this
12031 // portion of the CFG and may not necessarily be the entry/exit of the
12032 // function.
12033
12034 // (Relevant) CFG after transformation:
12035 // TP entry MBB
12036 // |
12037 // |-----------------|
12038 // (n <= 0) (n > 0)
12039 // | |
12040 // | TP loop Body MBB<--|
12041 // | | |
12042 // \ |___________|
12043 // \ /
12044 // TP exit MBB
12045
12046 MachineFunction *MF = BB->getParent();
12047 MachineFunctionProperties &Properties = MF->getProperties();
12048 MachineRegisterInfo &MRI = MF->getRegInfo();
12049
// Pseudo operands: 0 = destination, 1 = source, 2 = size.
12050 Register OpDestReg = MI.getOperand(0).getReg();
12051 Register OpSrcReg = MI.getOperand(1).getReg();
12052 Register OpSizeReg = MI.getOperand(2).getReg();
12053
12054 // Allocate the required MBBs and add to parent function.
12055 MachineBasicBlock *TpEntry = BB;
12056 MachineBasicBlock *TpLoopBody = MF->CreateMachineBasicBlock();
12057 MachineBasicBlock *TpExit;
12058
12059 MF->push_back(TpLoopBody);
12060
12061 // If any instructions are present in the current block after
12062 // MVE_MEMCPYLOOPINST or MVE_MEMSETLOOPINST, split the current block and
12063 // move the instructions into the newly created exit block. If there are no
12064 // instructions add an explicit branch to the FallThrough block and then
12065 // split.
12066 //
12067 // The split is required for two reasons:
12068 // 1) A terminator(t2WhileLoopStart) will be placed at that site.
12069 // 2) Since a TPLoopBody will be added later, any phis in successive blocks
12070 // need to be updated. splitAt() already handles this.
12071 TpExit = BB->splitAt(MI, false);
12072 if (TpExit == BB) {
12073 assert(BB->canFallThrough() && "Exit Block must be Fallthrough of the "
12074 "block containing memcpy/memset Pseudo");
12075 TpExit = BB->getFallThrough();
12076 BuildMI(BB, dl, TII->get(ARM::t2B))
12077 .addMBB(TpExit)
12079 TpExit = BB->splitAt(MI, false);
12080 }
12081
12082 // Add logic for iteration count
12083 Register TotalIterationsReg =
12084 genTPEntry(TpEntry, TpLoopBody, TpExit, OpSizeReg, TII, dl, MRI);
12085
12086 // Add the vectorized (and predicated) loads/store instructions
12087 bool IsMemcpy = MI.getOpcode() == ARM::MVE_MEMCPYLOOPINST;
12088 genTPLoopBody(TpLoopBody, TpEntry, TpExit, TII, dl, MRI, OpSrcReg,
12089 OpDestReg, OpSizeReg, TotalIterationsReg, IsMemcpy);
12090
12091 // Required to avoid conflict with the MachineVerifier during testing.
12092 Properties.resetNoPHIs();
12093
12094 // Connect the blocks
12095 TpEntry->addSuccessor(TpLoopBody);
12096 TpLoopBody->addSuccessor(TpLoopBody);
12097 TpLoopBody->addSuccessor(TpExit);
12098
12099 // Reorder for a more natural layout
12100 TpLoopBody->moveAfter(TpEntry);
12101 TpExit->moveAfter(TpLoopBody);
12102
12103 // Finally, remove the memcpy/memset Pseudo Instruction
12104 MI.eraseFromParent();
12105
12106 // Return the exit block as it may contain other instructions requiring a
12107 // custom inserter
12108 return TpExit;
12109 }
12110
12111 // The Thumb2 pre-indexed stores have the same MI operands, they just
12112 // define them differently in the .td files from the isel patterns, so
12113 // they need pseudos.
12114 case ARM::t2STR_preidx:
12115 MI.setDesc(TII->get(ARM::t2STR_PRE));
12116 return BB;
12117 case ARM::t2STRB_preidx:
12118 MI.setDesc(TII->get(ARM::t2STRB_PRE));
12119 return BB;
12120 case ARM::t2STRH_preidx:
12121 MI.setDesc(TII->get(ARM::t2STRH_PRE));
12122 return BB;
12123
12124 case ARM::STRi_preidx:
12125 case ARM::STRBi_preidx: {
12126 unsigned NewOpc = MI.getOpcode() == ARM::STRi_preidx ? ARM::STR_PRE_IMM
12127 : ARM::STRB_PRE_IMM;
12128 // Decode the offset.
12129 unsigned Offset = MI.getOperand(4).getImm();
12130 bool isSub = ARM_AM::getAM2Op(Offset) == ARM_AM::sub;
12132 if (isSub)
12133 Offset = -Offset;
12134
// Operand 3 of the pseudo is not copied; the decoded offset is added as an
// immediate in its place.
12135 MachineMemOperand *MMO = *MI.memoperands_begin();
12136 BuildMI(*BB, MI, dl, TII->get(NewOpc))
12137 .add(MI.getOperand(0)) // Rn_wb
12138 .add(MI.getOperand(1)) // Rt
12139 .add(MI.getOperand(2)) // Rn
12140 .addImm(Offset) // offset (skip GPR==zero_reg)
12141 .add(MI.getOperand(5)) // pred
12142 .add(MI.getOperand(6))
12143 .addMemOperand(MMO);
12144 MI.eraseFromParent();
12145 return BB;
12146 }
12147 case ARM::STRr_preidx:
12148 case ARM::STRBr_preidx:
12149 case ARM::STRH_preidx: {
12150 unsigned NewOpc;
12151 switch (MI.getOpcode()) {
12152 default: llvm_unreachable("unexpected opcode!");
12153 case ARM::STRr_preidx: NewOpc = ARM::STR_PRE_REG; break;
12154 case ARM::STRBr_preidx: NewOpc = ARM::STRB_PRE_REG; break;
12155 case ARM::STRH_preidx: NewOpc = ARM::STRH_PRE; break;
12156 }
// The replacement takes every operand of the pseudo unchanged.
12157 MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(NewOpc));
12158 for (const MachineOperand &MO : MI.operands())
12159 MIB.add(MO);
12160 MI.eraseFromParent();
12161 return BB;
12162 }
12163
12164 case ARM::tMOVCCr_pseudo: {
12165 // To "insert" a SELECT_CC instruction, we actually have to insert the
12166 // diamond control-flow pattern. The incoming instruction knows the
12167 // destination vreg to set, the condition code register to branch on, the
12168 // true/false values to select between, and a branch opcode to use.
12169 const BasicBlock *LLVM_BB = BB->getBasicBlock();
12171
12172 // thisMBB:
12173 // ...
12174 // TrueVal = ...
12175 // cmpTY ccX, r1, r2
12176 // bCC sinkMBB
12177 // fallthrough --> copy0MBB
12178 MachineBasicBlock *thisMBB = BB;
12179 MachineFunction *F = BB->getParent();
12180 MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
12181 MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
12182 F->insert(It, copy0MBB);
12183 F->insert(It, sinkMBB);
12184
12185 // Set the call frame size on entry to the new basic blocks.
12186 unsigned CallFrameSize = TII->getCallFrameSizeAt(MI);
12187 copy0MBB->setCallFrameSize(CallFrameSize);
12188 sinkMBB->setCallFrameSize(CallFrameSize);
12189
12190 // Check whether CPSR is live past the tMOVCCr_pseudo.
12191 const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
12192 if (!MI.killsRegister(ARM::CPSR, /*TRI=*/nullptr) &&
12193 !checkAndUpdateCPSRKill(MI, thisMBB, TRI)) {
12194 copy0MBB->addLiveIn(ARM::CPSR);
12195 sinkMBB->addLiveIn(ARM::CPSR);
12196 }
12197
12198 // Transfer the remainder of BB and its successor edges to sinkMBB.
12199 sinkMBB->splice(sinkMBB->begin(), BB,
12200 std::next(MachineBasicBlock::iterator(MI)), BB->end());
12202
12203 BB->addSuccessor(copy0MBB);
12204 BB->addSuccessor(sinkMBB);
12205
// Conditional branch to sinkMBB, using the condition code and register taken
// from operands 3 and 4 of the pseudo.
12206 BuildMI(BB, dl, TII->get(ARM::tBcc))
12207 .addMBB(sinkMBB)
12208 .addImm(MI.getOperand(3).getImm())
12209 .addReg(MI.getOperand(4).getReg());
12210
12211 // copy0MBB:
12212 // %FalseValue = ...
12213 // # fallthrough to sinkMBB
12214 BB = copy0MBB;
12215
12216 // Update machine-CFG edges
12217 BB->addSuccessor(sinkMBB);
12218
12219 // sinkMBB:
12220 // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
12221 // ...
12222 BB = sinkMBB;
12223 BuildMI(*BB, BB->begin(), dl, TII->get(ARM::PHI), MI.getOperand(0).getReg())
12224 .addReg(MI.getOperand(1).getReg())
12225 .addMBB(copy0MBB)
12226 .addReg(MI.getOperand(2).getReg())
12227 .addMBB(thisMBB);
12228
12229 MI.eraseFromParent(); // The pseudo instruction is gone now.
12230 return BB;
12231 }
12232
12233 case ARM::BCCi64:
12234 case ARM::BCCZi64: {
12235 // If there is an unconditional branch to the other successor, remove it.
12236 BB->erase(std::next(MachineBasicBlock::iterator(MI)), BB->end());
12237
12238 // Compare both parts that make up the double comparison separately for
12239 // equality.
12240 bool RHSisZero = MI.getOpcode() == ARM::BCCZi64;
12241
12242 Register LHS1 = MI.getOperand(1).getReg();
12243 Register LHS2 = MI.getOperand(2).getReg();
12244 if (RHSisZero) {
12245 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12246 .addReg(LHS1)
12247 .addImm(0)
12249 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPri : ARM::CMPri))
12250 .addReg(LHS2).addImm(0)
12251 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12252 } else {
12253 Register RHS1 = MI.getOperand(3).getReg();
12254 Register RHS2 = MI.getOperand(4).getReg();
12255 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12256 .addReg(LHS1)
12257 .addReg(RHS1)
12259 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2CMPrr : ARM::CMPrr))
12260 .addReg(LHS2).addReg(RHS2)
12261 .addImm(ARMCC::EQ).addReg(ARM::CPSR);
12262 }
12263
// The emitted branch always tests EQ; for an NE condition the two targets
// are exchanged instead.
12264 MachineBasicBlock *destMBB = MI.getOperand(RHSisZero ? 3 : 5).getMBB();
12265 MachineBasicBlock *exitMBB = OtherSucc(BB, destMBB);
12266 if (MI.getOperand(0).getImm() == ARMCC::NE)
12267 std::swap(destMBB, exitMBB);
12268
12269 BuildMI(BB, dl, TII->get(isThumb2 ? ARM::t2Bcc : ARM::Bcc))
12270 .addMBB(destMBB).addImm(ARMCC::EQ).addReg(ARM::CPSR);
12271 if (isThumb2)
12272 BuildMI(BB, dl, TII->get(ARM::t2B))
12273 .addMBB(exitMBB)
12275 else
12276 BuildMI(BB, dl, TII->get(ARM::B)) .addMBB(exitMBB);
12277
12278 MI.eraseFromParent(); // The pseudo instruction is gone now.
12279 return BB;
12280 }
12281
// The setjmp pseudos are left untouched here.
12282 case ARM::Int_eh_sjlj_setjmp:
12283 case ARM::Int_eh_sjlj_setjmp_nofp:
12284 case ARM::tInt_eh_sjlj_setjmp:
12285 case ARM::t2Int_eh_sjlj_setjmp:
12286 case ARM::t2Int_eh_sjlj_setjmp_nofp:
12287 return BB;
12288
12289 case ARM::Int_eh_sjlj_setup_dispatch:
12290 EmitSjLjDispatchBlock(MI, BB);
12291 return BB;
12292 case ARM::COPY_STRUCT_BYVAL_I32:
12293 ++NumLoopByVals;
12294 return EmitStructByval(MI, BB);
12295 case ARM::WIN__CHKSTK:
12296 return EmitLowered__chkstk(MI, BB);
12297 case ARM::WIN__DBZCHK:
12298 return EmitLowered__dbzchk(MI, BB);
12299 }
12300}
12301
12302/// Attaches vregs to MEMCPY that it will use as scratch registers
12303/// when it is expanded into LDM/STM. This is done as a post-isel lowering
12304/// instead of as a custom inserter because we need the use list from the SDNode.
///
/// Operands 0 and 1 of MI are flagged dead when the matching result value of
/// Node has no uses. The number of scratch registers is read from the
/// immediate in operand 4; each is a fresh virtual register in tGPR on
/// Thumb1-only subtargets and in GPR otherwise.
12305static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget,
12306 MachineInstr &MI, const SDNode *Node) {
12307 bool isThumb1 = Subtarget->isThumb1Only();
12308
12309 MachineFunction *MF = MI.getParent()->getParent();
12310 MachineRegisterInfo &MRI = MF->getRegInfo();
12311 MachineInstrBuilder MIB(*MF, MI);
12312
12313 // If the new dst/src is unused mark it as dead.
12314 if (!Node->hasAnyUseOfValue(0)) {
12315 MI.getOperand(0).setIsDead(true);
12316 }
12317 if (!Node->hasAnyUseOfValue(1)) {
12318 MI.getOperand(1).setIsDead(true);
12319 }
12320
12321 // The MEMCPY both defines and kills the scratch registers.
12322 for (unsigned I = 0; I != MI.getOperand(4).getImm(); ++I) {
12323 Register TmpReg = MRI.createVirtualRegister(isThumb1 ? &ARM::tGPRRegClass
12324 : &ARM::GPRRegClass);
12326 }
12327}
12328
12330 SDNode *Node) const {
// Post-isel fix-up for MI. A MEMCPY gets its scratch registers attached and
// nothing else. For any other opcode the optional cc_out operand is
// reconciled with an implicit CPSR def, after first renaming the opcode if
// convertAddSubFlagsOpcode() supplies a replacement.
12331 if (MI.getOpcode() == ARM::MEMCPY) {
12332 attachMEMCPYScratchRegs(Subtarget, MI, Node);
12333 return;
12334 }
12335
12336 const MCInstrDesc *MCID = &MI.getDesc();
12337 // Adjust potentially 's' setting instructions after isel, i.e. ADC, SBC, RSB,
12338 // RSC. Coming out of isel, they have an implicit CPSR def, but the optional
12339 // operand is still set to noreg. If needed, set the optional operand's
12340 // register to CPSR, and remove the redundant implicit def.
12341 //
12342 // e.g. ADCS (..., implicit-def CPSR) -> ADC (... opt:def CPSR).
12343
12344 // Rename pseudo opcodes.
12345 unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
12346 unsigned ccOutIdx;
12347 if (NewOpc) {
12348 const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
12349 MCID = &TII->get(NewOpc);
12350
12351 assert(MCID->getNumOperands() ==
12352 MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
12353 && "converted opcode should be the same except for cc_out"
12354 " (and, on Thumb1, pred)");
12355
12356 MI.setDesc(*MCID);
12357
12358 // Add the optional cc_out operand
12359 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
12360
12361 // On Thumb1, move all input operands to the end, then add the predicate
12362 if (Subtarget->isThumb1Only()) {
// Repeatedly re-append operand 1 and remove the original, which rotates the
// inputs behind the cc_out operand added above.
12363 for (unsigned c = MCID->getNumOperands() - 4; c--;) {
12364 MI.addOperand(MI.getOperand(1));
12365 MI.removeOperand(1);
12366 }
12367
12368 // Restore the ties
12369 for (unsigned i = MI.getNumOperands(); i--;) {
12370 const MachineOperand& op = MI.getOperand(i);
12371 if (op.isReg() && op.isUse()) {
12372 int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
12373 if (DefIdx != -1)
12374 MI.tieOperands(DefIdx, i);
12375 }
12376 }
12377
12379 MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
// On Thumb1 the cc_out operand sits at index 1 rather than last.
12380 ccOutIdx = 1;
12381 } else
12382 ccOutIdx = MCID->getNumOperands() - 1;
12383 } else
12384 ccOutIdx = MCID->getNumOperands() - 1;
12385
12386 // Any ARM instruction that sets the 's' bit should specify an optional
12387 // "cc_out" operand in the last operand position.
12388 if (!MI.hasOptionalDef() || !MCID->operands()[ccOutIdx].isOptionalDef()) {
12389 assert(!NewOpc && "Optional cc_out operand required");
12390 return;
12391 }
12392 // Look for an implicit def of CPSR added by MachineInstr ctor. Remove it
12393 // since we already have an optional CPSR def.
12394 bool definesCPSR = false;
12395 bool deadCPSR = false;
// Only operands past the descriptor's declared operand count are scanned.
12396 for (unsigned i = MCID->getNumOperands(), e = MI.getNumOperands(); i != e;
12397 ++i) {
12398 const MachineOperand &MO = MI.getOperand(i);
12399 if (MO.isReg() && MO.isDef() && MO.getReg() == ARM::CPSR) {
12400 definesCPSR = true;
12401 if (MO.isDead())
12402 deadCPSR = true;
12403 MI.removeOperand(i);
12404 break;
12405 }
12406 }
12407 if (!definesCPSR) {
12408 assert(!NewOpc && "Optional cc_out operand required");
12409 return;
12410 }
12411 assert(deadCPSR == !Node->hasAnyUseOfValue(1) && "inconsistent dead flag");
12412 if (deadCPSR) {
12413 assert(!MI.getOperand(ccOutIdx).getReg() &&
12414 "expect uninitialized optional cc_out operand");
12415 // Thumb1 instructions must have the S bit even if the CPSR is dead.
12416 if (!Subtarget->isThumb1Only())
12417 return;
12418 }
12419
12420 // If this instruction was defined with an optional CPSR def and its dag node
12421 // had a live implicit CPSR def, then activate the optional CPSR def.
12422 MachineOperand &MO = MI.getOperand(ccOutIdx);
12423 MO.setReg(ARM::CPSR);
12424 MO.setIsDef(true);
12425}
12426
12427//===----------------------------------------------------------------------===//
12428// ARM Optimization Hooks
12429//===----------------------------------------------------------------------===//
12430
12431// Helper function that checks whether N is a null constant or, when AllOnes
// is set, an all ones constant.
12432static inline bool isZeroOrAllOnes(SDValue N, bool AllOnes) {
12434}
12435
12436// Return true if N is conditionally 0 or all ones.
12437// Detects these expressions where cc is an i1 value:
12438//
12439// (select cc 0, y) [AllOnes=0]
12440// (select cc y, 0) [AllOnes=0]
12441// (zext cc) [AllOnes=0]
12442// (sext cc) [AllOnes=0/1]
12443// (select cc -1, y) [AllOnes=1]
12444// (select cc y, -1) [AllOnes=1]
12445//
12446// Invert is set when N is the null/all ones constant when CC is false.
12447// OtherOp is set to the alternative value of N.
//
// CC receives the condition operand. For ISD::SELECT it is written even when
// the function goes on to return false; Invert and OtherOp are only written
// on the paths that return true.
12449 SDValue &CC, bool &Invert,
12450 SDValue &OtherOp,
12451 SelectionDAG &DAG) {
12452 switch (N->getOpcode()) {
12453 default: return false;
12454 case ISD::SELECT: {
12455 CC = N->getOperand(0);
12456 SDValue N1 = N->getOperand(1);
12457 SDValue N2 = N->getOperand(2);
// The true operand is checked first, so it wins if both operands match.
12458 if (isZeroOrAllOnes(N1, AllOnes)) {
12459 Invert = false;
12460 OtherOp = N2;
12461 return true;
12462 }
12463 if (isZeroOrAllOnes(N2, AllOnes)) {
12464 Invert = true;
12465 OtherOp = N1;
12466 return true;
12467 }
12468 return false;
12469 }
12470 case ISD::ZERO_EXTEND:
12471 // (zext cc) can never be the all ones value.
12472 if (AllOnes)
12473 return false;
12474 [[fallthrough]];
12475 case ISD::SIGN_EXTEND: {
12476 SDLoc dl(N);
12477 EVT VT = N->getValueType(0);
12478 CC = N->getOperand(0);
// Only an extension of an i1 SETCC is accepted.
12479 if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
12480 return false;
12481 Invert = !AllOnes;
12482 if (AllOnes)
12483 // When looking for an AllOnes constant, N is an sext, and the 'other'
12484 // value is 0.
12485 OtherOp = DAG.getConstant(0, dl, VT);
12486 else if (N->getOpcode() == ISD::ZERO_EXTEND)
12487 // When looking for a 0 constant, N can be zext or sext.
12488 OtherOp = DAG.getConstant(1, dl, VT);
12489 else
12490 OtherOp = DAG.getAllOnesConstant(dl, VT);
12491 return true;
12492 }
12493 }
12494}
12495
12496// Combine a constant select operand into its use:
12497//
12498// (add (select cc, 0, c), x) -> (select cc, x, (add, x, c))
12499// (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
12500// (and (select cc, -1, c), x) -> (select cc, x, (and, x, c)) [AllOnes=1]
12501// (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
12502// (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
12503//
12504// The transform is rejected if the select doesn't have a constant operand that
12505// is null, or all ones when AllOnes is set.
12506//
12507// Also recognize sext/zext from i1:
12508//
12509// (add (zext cc), x) -> (select cc (add x, 1), x)
12510// (add (sext cc), x) -> (select cc (add x, -1), x)
12511//
12512// These transformations eventually create predicated instructions.
12513//
12514// @param N The node to transform.
12515// @param Slct The N operand that is a select.
12516// @param OtherOp The other N operand (x above).
12517// @param DCI Context.
12518// @param AllOnes Require the select constant to be all ones instead of null.
12519// @returns The new node, or SDValue() on failure.
12520static
12523 bool AllOnes = false) {
12524 SelectionDAG &DAG = DCI.DAG;
12525 EVT VT = N->getValueType(0);
12526 SDValue NonConstantVal;
12527 SDValue CCOp;
12528 bool SwapSelectOps;
12529 if (!isConditionalZeroOrAllOnes(Slct.getNode(), AllOnes, CCOp, SwapSelectOps,
12530 NonConstantVal, DAG))
12531 return SDValue();
12532
12533 // Slct is now known to be the desired identity constant when CC is true.
12534 SDValue TrueVal = OtherOp;
// The non-identity arm applies N's opcode to OtherOp and the select's other
// value, with OtherOp as the first operand.
12535 SDValue FalseVal = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
12536 OtherOp, NonConstantVal);
12537 // Unless SwapSelectOps says CC should be false.
12538 if (SwapSelectOps)
12539 std::swap(TrueVal, FalseVal);
12540
12541 return DAG.getNode(ISD::SELECT, SDLoc(N), VT,
12542 CCOp, TrueVal, FalseVal);
12543}
12544
12545// Attempt combineSelectAndUse on each operand of a commutative operator N.
// An operand is only tried as the select when its node has a single use.
// Operand 0 is tried first and the first successful result is returned; an
// empty SDValue is returned if neither attempt succeeds.
12546static
12549 SDValue N0 = N->getOperand(0);
12550 SDValue N1 = N->getOperand(1);
12551 if (N0.getNode()->hasOneUse())
12552 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI, AllOnes))
12553 return Result;
12554 if (N1.getNode()->hasOneUse())
12555 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI, AllOnes))
12556 return Result;
12557 return SDValue();
12558}
12559
// Returns true if N is an ARMISD::VUZP node, or an ARMISD::VTRN node whose
// first result type is v2i32; false otherwise.
12561 // VUZP shuffle node.
12562 if (N->getOpcode() == ARMISD::VUZP)
12563 return true;
12564
12565 // "VUZP" on i32 is an alias for VTRN.
12566 if (N->getOpcode() == ARMISD::VTRN && N->getValueType(0) == MVT::v2i32)
12567 return true;
12568
12569 return false;
12570}
12571
12574 const ARMSubtarget *Subtarget) {
// Replaces an add of the two results of one unzip node with an
// arm_neon_vpadd intrinsic node applied to that node's two inputs. Returns an
// empty SDValue when the pattern or the type does not match.
12575 // Look for ADD(VUZP.0, VUZP.1).
// N0 and N1 must come from the same node but be different results of it.
12576 if (!IsVUZPShuffleNode(N0.getNode()) || N0.getNode() != N1.getNode() ||
12577 N0 == N1)
12578 return SDValue();
12579
12580 // Make sure the ADD is a 64-bit add; there is no 128-bit VPADD.
12581 if (!N->getValueType(0).is64BitVector())
12582 return SDValue();
12583
12584 // Generate vpadd.
12585 SelectionDAG &DAG = DCI.DAG;
12586 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12587 SDLoc dl(N);
12588 SDNode *Unzip = N0.getNode();
12589 EVT VT = N->getValueType(0);
12590
// Operand list: intrinsic ID constant, then the unzip node's two inputs.
12592 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpadd, dl,
12593 TLI.getPointerTy(DAG.getDataLayout())));
12594 Ops.push_back(Unzip->getOperand(0));
12595 Ops.push_back(Unzip->getOperand(1));
12596
12597 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12598}
12599
12602 const ARMSubtarget *Subtarget) {
// Replaces an add of two same-kind extensions of the two results of one
// unzip node with an arm_neon_vpaddls/vpaddlu intrinsic node applied to the
// concatenation of that node's two inputs. Returns an empty SDValue when the
// pattern does not match.
12603 // Check for two extended operands.
12604 if (!(N0.getOpcode() == ISD::SIGN_EXTEND &&
12605 N1.getOpcode() == ISD::SIGN_EXTEND) &&
12606 !(N0.getOpcode() == ISD::ZERO_EXTEND &&
12607 N1.getOpcode() == ISD::ZERO_EXTEND))
12608 return SDValue();
12609
12610 SDValue N00 = N0.getOperand(0);
12611 SDValue N10 = N1.getOperand(0);
12612
12613 // Look for ADD(SEXT(VUZP.0), SEXT(VUZP.1))
// The extended values must be different results of the same node.
12614 if (!IsVUZPShuffleNode(N00.getNode()) || N00.getNode() != N10.getNode() ||
12615 N00 == N10)
12616 return SDValue();
12617
12618 // We only recognize Q register paddl here; this can't be reached until
12619 // after type legalization.
12620 if (!N00.getValueType().is64BitVector() ||
12622 return SDValue();
12623
12624 // Generate vpaddl.
12625 SelectionDAG &DAG = DCI.DAG;
12626 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12627 SDLoc dl(N);
12628 EVT VT = N->getValueType(0);
12629
12631 // Form vpaddl.sN or vpaddl.uN depending on the kind of extension.
12632 unsigned Opcode;
12633 if (N0.getOpcode() == ISD::SIGN_EXTEND)
12634 Opcode = Intrinsic::arm_neon_vpaddls;
12635 else
12636 Opcode = Intrinsic::arm_neon_vpaddlu;
12637 Ops.push_back(DAG.getConstant(Opcode, dl,
12638 TLI.getPointerTy(DAG.getDataLayout())));
// The intrinsic's input is the concatenation of the unzip node's two inputs:
// same element type as N00, twice as many elements as the result VT.
12639 EVT ElemTy = N00.getValueType().getVectorElementType();
12640 unsigned NumElts = VT.getVectorNumElements();
12641 EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), ElemTy, NumElts * 2);
12642 SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ConcatVT,
12643 N00.getOperand(0), N00.getOperand(1));
12644 Ops.push_back(Concat);
12645
12646 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, Ops);
12647}
12648
12649// FIXME: This function shouldn't be necessary; if we lower BUILD_VECTOR in
12650// an appropriate manner, we end up with ADD(VUZP(ZEXT(N))), which is
12651// much easier to match.
//
// Matches an add of two BUILD_VECTORs whose elements are extracted from the
// even and the odd lanes of one vector, and replaces it with an
// arm_neon_vpaddls intrinsic node on that vector followed by an ANY_EXTEND or
// TRUNCATE to the add's type. Returns an empty SDValue when nothing matches.
12652static SDValue
12655 const ARMSubtarget *Subtarget) {
12656 // Only perform optimization if after legalize, and if NEON is available. We
12657 // also expect both operands to be BUILD_VECTORs.
12658 if (DCI.isBeforeLegalize() || !Subtarget->hasNEON()
12659 || N0.getOpcode() != ISD::BUILD_VECTOR
12660 || N1.getOpcode() != ISD::BUILD_VECTOR)
12661 return SDValue();
12662
12663 // Check output type since VPADDL operand elements can only be 8, 16, or 32.
12664 EVT VT = N->getValueType(0);
12665 if (!VT.isInteger() || VT.getVectorElementType() == MVT::i64)
12666 return SDValue();
12667
12668 // Check that the vector operands are of the right form.
12669 // N0 and N1 are BUILD_VECTOR nodes with N number of EXTRACT_VECTOR
12670 // operands, where N is the size of the formed vector.
12671 // Each EXTRACT_VECTOR should have the same input vector and odd or even
12672 // index such that we have a pair wise add pattern.
12673
12674 // Grab the vector that all EXTRACT_VECTOR nodes should be referencing.
12676 return SDValue();
12677 SDValue Vec = N0->getOperand(0)->getOperand(0);
12678 SDNode *V = Vec.getNode();
12679 unsigned nextIndex = 0;
12680
12681 // For each operand of the ADD (both are BUILD_VECTORs), check
12682 // that each of its operands is an EXTRACT_VECTOR with
12683 // the same vector and appropriate index.
12684 for (unsigned i = 0, e = N0->getNumOperands(); i != e; ++i) {
12687
12688 SDValue ExtVec0 = N0->getOperand(i);
12689 SDValue ExtVec1 = N1->getOperand(i);
12690
12691 // First operand is the vector, verify it's the same.
12692 if (V != ExtVec0->getOperand(0).getNode() ||
12693 V != ExtVec1->getOperand(0).getNode())
12694 return SDValue();
12695
12696 // Second is the constant, verify it's correct.
12699
12700 // For the constant, we want to see all the even or all the odd.
// N0's element must come from lane nextIndex and N1's from lane nextIndex+1.
12701 if (!C0 || !C1 || C0->getZExtValue() != nextIndex
12702 || C1->getZExtValue() != nextIndex+1)
12703 return SDValue();
12704
12705 // Increment index.
12706 nextIndex+=2;
12707 } else
12708 return SDValue();
12709 }
12710
12711 // Don't generate vpaddl+vmovn; we'll match it to vpadd later. Also make sure
12712 // we're using the entire input vector, otherwise there's a size/legality
12713 // mismatch somewhere.
12714 if (nextIndex != Vec.getValueType().getVectorNumElements() ||
12716 return SDValue();
12717
12718 // Create VPADDL node.
12719 SelectionDAG &DAG = DCI.DAG;
12720 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
12721
12722 SDLoc dl(N);
12723
12724 // Build operand list.
// The signed variant (vpaddls) is used unconditionally here.
12726 Ops.push_back(DAG.getConstant(Intrinsic::arm_neon_vpaddls, dl,
12727 TLI.getPointerTy(DAG.getDataLayout())));
12728
12729 // Input is the vector.
12730 Ops.push_back(Vec);
12731
12732 // Get widened type and narrowed type.
12733 MVT widenType;
12734 unsigned numElem = VT.getVectorNumElements();
12735
// The intrinsic's result has elements twice as wide as the input lanes and
// as many elements as the add's type.
12736 EVT inputLaneType = Vec.getValueType().getVectorElementType();
12737 switch (inputLaneType.getSimpleVT().SimpleTy) {
12738 case MVT::i8: widenType = MVT::getVectorVT(MVT::i16, numElem); break;
12739 case MVT::i16: widenType = MVT::getVectorVT(MVT::i32, numElem); break;
12740 case MVT::i32: widenType = MVT::getVectorVT(MVT::i64, numElem); break;
12741 default:
12742 llvm_unreachable("Invalid vector element type for padd optimization.");
12743 }
12744
12745 SDValue tmp = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, widenType, Ops);
// Convert to the add's type: ANY_EXTEND if VT is wider than the intrinsic's
// result, TRUNCATE otherwise.
12746 unsigned ExtOp = VT.bitsGT(tmp.getValueType()) ? ISD::ANY_EXTEND : ISD::TRUNCATE;
12747 return DAG.getNode(ExtOp, dl, VT, tmp);
12748}
12749
// NOTE(review): listing line 12750 (the signature) is missing from this view.
// Presumably this is findMUL_LOHI(SDValue V), which is called later in this
// file - confirm.
// Returns V itself when its opcode is ISD::UMUL_LOHI or ISD::SMUL_LOHI, and an
// empty SDValue otherwise.
12751 if (V->getOpcode() == ISD::UMUL_LOHI ||
12752 V->getOpcode() == ISD::SMUL_LOHI)
12753 return V;
12754 return SDValue();
12755}
12756
// Tries to fold an ADDC/ADDE pair that accumulates a MUL into one of the
// ARMISD::SMLALBB/BT/TB/TT nodes. On success the uses of both add nodes are
// rewritten to the new node's two results and SDValue(AddcNode, 0) is returned;
// otherwise an empty SDValue is returned.
// NOTE(review): listing line 12758 (a parameter line, presumably declaring DCI)
// is missing from this view.
12757static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
12759 const ARMSubtarget *Subtarget) {
12760 if (!Subtarget->hasBaseDSP())
12761 return SDValue();
12762
12763 // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
12764 // accumulate the product into a 64-bit value. The 16-bit values will
12765 // be sign extended somehow or SRA'd into 32-bit values. Pattern matched:
12766 // (addc (mul 16bit, 16bit), lo) with (adde (sra mul, 31), hi)
// The MUL may be either operand of the ADDC; the other operand is Lo.
12767 SDValue Mul = AddcNode->getOperand(0);
12768 SDValue Lo = AddcNode->getOperand(1);
12769 if (Mul.getOpcode() != ISD::MUL) {
12770 Lo = AddcNode->getOperand(0);
12771 Mul = AddcNode->getOperand(1);
12772 if (Mul.getOpcode() != ISD::MUL)
12773 return SDValue();
12774 }
12775
// Likewise, the SRA may be either operand of the ADDE; the other one is Hi.
12776 SDValue SRA = AddeNode->getOperand(0);
12777 SDValue Hi = AddeNode->getOperand(1);
12778 if (SRA.getOpcode() != ISD::SRA) {
12779 SRA = AddeNode->getOperand(1);
12780 Hi = AddeNode->getOperand(0);
12781 if (SRA.getOpcode() != ISD::SRA)
12782 return SDValue();
12783 }
// The SRA must shift by the constant 31 ...
12784 if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
12785 if (Const->getZExtValue() != 31)
12786 return SDValue();
12787 } else
12788 return SDValue();
12789
// ... and it must shift the very same MUL that feeds the ADDC.
12790 if (SRA.getOperand(0) != Mul)
12791 return SDValue();
12792
12793 SelectionDAG &DAG = DCI.DAG;
12794 SDLoc dl(AddcNode);
12795 unsigned Opcode = 0;
12796 SDValue Op0;
12797 SDValue Op1;
12798
// Pick the opcode from how each MUL operand passes isS16 / isSRA16. For an
// isSRA16 operand, the value being shifted is used as the new operand.
12799 if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
12800 Opcode = ARMISD::SMLALBB;
12801 Op0 = Mul.getOperand(0);
12802 Op1 = Mul.getOperand(1);
12803 } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
12804 Opcode = ARMISD::SMLALBT;
12805 Op0 = Mul.getOperand(0);
12806 Op1 = Mul.getOperand(1).getOperand(0);
12807 } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
12808 Opcode = ARMISD::SMLALTB;
12809 Op0 = Mul.getOperand(0).getOperand(0);
12810 Op1 = Mul.getOperand(1);
12811 } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
12812 Opcode = ARMISD::SMLALTT;
12813 Op0 = Mul->getOperand(0).getOperand(0);
12814 Op1 = Mul->getOperand(1).getOperand(0);
12815 }
12816
// None of the four operand shapes matched: Op0/Op1 were never assigned.
12817 if (!Op0 || !Op1)
12818 return SDValue();
12819
12820 SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
12821 Op0, Op1, Lo, Hi);
12822 // Replace the uses of the ADD nodes with the SMLAL node's values
// (result 0 replaces the ADDC value, result 1 replaces the ADDE value).
12823 SDValue HiMLALResult(SMLAL.getNode(), 1);
12824 SDValue LoMLALResult(SMLAL.getNode(), 0);
12825
12826 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
12827 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
12828
12829 // Return original node to notify the driver to stop replacing.
12830 SDValue resNode(AddcNode, 0);
12831 return resNode;
12832}
12833
// NOTE(review): listing lines 12834-12835 (the start of the signature) are
// missing from this view. Presumably this is AddCombineTo64bitMLAL, which is
// called later in this file with (AddeNode, DCI, Subtarget) - confirm.
12836 const ARMSubtarget *Subtarget) {
12837 // Look for multiply add opportunities.
12838 // The pattern is an ISD::UMUL_LOHI followed by two add nodes, where
12839 // each add node consumes a value from ISD::UMUL_LOHI and there is
12840 // a carry link (operand 2 of the ADDE) from the first add to the second add.
12841 // If we find this pattern, we can replace the U/SMUL_LOHI, ADDC, and ADDE by
12842 // a S/UMLAL instruction.
12843 //                  UMUL_LOHI
12844 //                 / :lo    \ :hi
12845 //                V          \          [no multiline comment]
12846 //   loAdd ->  ADDC         |
12847 //                 \ :carry /
12848 //                  V      V
12849 //                    ADDE   <- hiAdd
12850 //
12851 // In the special case where only the higher part of a signed result is used
12852 // and the add to the low part of the result of ISD::SMUL_LOHI adds or subtracts
12853 // a constant with the exact value of 0x80000000, we recognize we are dealing
12854 // with a "rounded multiply and add" (or subtract) and transform it into
12855 // either a ARMISD::SMMLAR or ARMISD::SMMLSR respectively.
12856
12857 assert((AddeSubeNode->getOpcode() == ARMISD::ADDE ||
12858 AddeSubeNode->getOpcode() == ARMISD::SUBE) &&
12859 "Expect an ADDE or SUBE");
12860
12861 assert(AddeSubeNode->getNumOperands() == 3 &&
12862 AddeSubeNode->getOperand(2).getValueType() == MVT::i32 &&
12863 "ADDE node has the wrong inputs");
12864
12865 // Check that we are chained to the right ADDC or SUBC node
// (ADDE must be fed by ADDC, SUBE by SUBC).
12866 SDNode *AddcSubcNode = AddeSubeNode->getOperand(2).getNode();
12867 if ((AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12868 AddcSubcNode->getOpcode() != ARMISD::ADDC) ||
12869 (AddeSubeNode->getOpcode() == ARMISD::SUBE &&
12870 AddcSubcNode->getOpcode() != ARMISD::SUBC))
12871 return SDValue();
12872
12873 SDValue AddcSubcOp0 = AddcSubcNode->getOperand(0);
12874 SDValue AddcSubcOp1 = AddcSubcNode->getOperand(1);
12875
12876 // Bail out if both ADDC/SUBC operands come from the same node.
12877 if (AddcSubcOp0.getNode() == AddcSubcOp1.getNode())
12878 return SDValue();
12879
12880 assert(AddcSubcNode->getNumValues() == 2 &&
12881 AddcSubcNode->getValueType(0) == MVT::i32 &&
12882 "Expect ADDC with two result values. First: i32");
12883
12884 // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
12885 // may be a SMLAL which multiplies two 16-bit values.
12886 if (AddeSubeNode->getOpcode() == ARMISD::ADDE &&
12887 AddcSubcOp0->getOpcode() != ISD::UMUL_LOHI &&
12888 AddcSubcOp0->getOpcode() != ISD::SMUL_LOHI &&
12889 AddcSubcOp1->getOpcode() != ISD::UMUL_LOHI &&
12890 AddcSubcOp1->getOpcode() != ISD::SMUL_LOHI)
12891 return AddCombineTo64BitSMLAL16(AddcSubcNode, AddeSubeNode, DCI, Subtarget);
12892
12893 // Check for the triangle shape.
12894 SDValue AddeSubeOp0 = AddeSubeNode->getOperand(0);
12895 SDValue AddeSubeOp1 = AddeSubeNode->getOperand(1);
12896
12897 // Make sure that the ADDE/SUBE operands are not coming from the same node.
12898 if (AddeSubeOp0.getNode() == AddeSubeOp1.getNode())
12899 return SDValue();
12900
12901 // Find the MUL_LOHI node among the ADDE/SUBE's operands, trying operand 0
// first and remembering which side it was found on.
12902 bool IsLeftOperandMUL = false;
12903 SDValue MULOp = findMUL_LOHI(AddeSubeOp0);
12904 if (MULOp == SDValue())
12905 MULOp = findMUL_LOHI(AddeSubeOp1);
12906 else
12907 IsLeftOperandMUL = true;
12908 if (MULOp == SDValue())
12909 return SDValue();
12910
12911 // Figure out the right opcode: signed multiply -> SMLAL, otherwise UMLAL.
12912 unsigned Opc = MULOp->getOpcode();
12913 unsigned FinalOpc = (Opc == ISD::SMUL_LOHI) ? ARMISD::SMLAL : ARMISD::UMLAL;
12914
12915 // Figure out the high and low input values to the MLAL node.
12916 SDValue *HiAddSub = nullptr;
12917 SDValue *LoMul = nullptr;
12918 SDValue *LowAddSub = nullptr;
12919
12920 // Ensure that ADDE/SUBE is from high result of ISD::xMUL_LOHI.
12921 if ((AddeSubeOp0 != MULOp.getValue(1)) && (AddeSubeOp1 != MULOp.getValue(1)))
12922 return SDValue();
12923
// The high addend is whichever ADDE/SUBE operand is not the multiply.
12924 if (IsLeftOperandMUL)
12925 HiAddSub = &AddeSubeOp1;
12926 else
12927 HiAddSub = &AddeSubeOp0;
12928
12929 // Ensure that LoMul and LowAddSub are taken from the correct ISD::xMUL_LOHI
12930 // node, whose low result is fed to the ADDC/SUBC we are checking.
12931
12932 if (AddcSubcOp0 == MULOp.getValue(0)) {
12933 LoMul = &AddcSubcOp0;
12934 LowAddSub = &AddcSubcOp1;
12935 }
12936 if (AddcSubcOp1 == MULOp.getValue(0)) {
12937 LoMul = &AddcSubcOp1;
12938 LowAddSub = &AddcSubcOp0;
12939 }
12940
// Neither ADDC/SUBC operand is the low result of the multiply.
12941 if (!LoMul)
12942 return SDValue();
12943
12944 // If HiAddSub is the ADDC/SUBC node itself, or the ADDC/SUBC is a predecessor
12945 // of HiAddSub, the replacement below will create a cycle.
12946 if (AddcSubcNode == HiAddSub->getNode() ||
12947 AddcSubcNode->isPredecessorOf(HiAddSub->getNode()))
12948 return SDValue();
12949
12950 // Create the merged node.
12951 SelectionDAG &DAG = DCI.DAG;
12952
12953 // Start building operand list with the two multiply operands.
// NOTE(review): listing line 12954 (the declaration of Ops) is missing from
// this view.
12955 Ops.push_back(LoMul->getOperand(0));
12956 Ops.push_back(LoMul->getOperand(1));
12957
12958 // Check whether we can use SMMLAR, SMMLSR or SMMULR instead. For this to be
12959 // the case, we must be doing signed multiplication and only use the higher
12960 // part of the result of the MLAL, furthermore the LowAddSub must be a constant
12961 // addition or subtraction with the value of 0x80000000.
12962 if (Subtarget->hasV6Ops() && Subtarget->hasDSP() && Subtarget->useMulOps() &&
12963 FinalOpc == ARMISD::SMLAL && !AddeSubeNode->hasAnyUseOfValue(1) &&
12964 LowAddSub->getNode()->getOpcode() == ISD::Constant &&
12965 static_cast<ConstantSDNode *>(LowAddSub->getNode())->getZExtValue() ==
12966 0x80000000) {
12967 Ops.push_back(*HiAddSub);
12968 if (AddcSubcNode->getOpcode() == ARMISD::SUBC) {
12969 FinalOpc = ARMISD::SMMLSR;
12970 } else {
12971 FinalOpc = ARMISD::SMMLAR;
12972 }
// Only the ADDE/SUBE value is replaced here; the new node has a single i32
// result.
12973 SDValue NewNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode), MVT::i32, Ops);
12974 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), NewNode);
12975
12976 return SDValue(AddeSubeNode, 0);
12977 } else if (AddcSubcNode->getOpcode() == ARMISD::SUBC)
12978 // SMMLS is generated during instruction selection and the rest of this
12979 // function can not handle the case where AddcSubcNode is a SUBC.
12980 return SDValue();
12981
12982 // Finish building the operand list for {U/S}MLAL
12983 Ops.push_back(*LowAddSub);
12984 Ops.push_back(*HiAddSub);
12985
12986 SDValue MLALNode = DAG.getNode(FinalOpc, SDLoc(AddcSubcNode),
12987 DAG.getVTList(MVT::i32, MVT::i32), Ops);
12988
12989 // Replace the uses of the ADD nodes with the MLAL node's values
// (result 1 replaces the ADDE value, result 0 replaces the ADDC value).
12990 SDValue HiMLALResult(MLALNode.getNode(), 1);
12991 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeSubeNode, 0), HiMLALResult);
12992
12993 SDValue LoMLALResult(MLALNode.getNode(), 0);
12994 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcSubcNode, 0), LoMLALResult);
12995
12996 // Return original node to notify the driver to stop replacing.
12997 return SDValue(AddeSubeNode, 0);
12998}
12999
// NOTE(review): listing lines 13000-13001 (the start of the signature) are
// missing from this view. Presumably this is AddCombineTo64bitUMAAL, which is
// called later in this file with (N, DCI, Subtarget) - confirm.
13002 const ARMSubtarget *Subtarget) {
13003 // UMAAL is similar to UMLAL except that it adds two unsigned values.
13004 // While trying to combine for the other MLAL nodes, first search for the
13005 // chance to use UMAAL. Check if Addc uses a node which has already
13006 // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
13007 // as the addend, and it's handled in PerformUMLALCombine.
13008
// Without v6 + DSP, only the plain MLAL combine is attempted.
13009 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13010 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13011
13012 // Check that operand 2 of the ADDE is an ADDC node.
13013 SDNode* AddcNode = AddeNode->getOperand(2).getNode();
13014 if (AddcNode->getOpcode() != ARMISD::ADDC)
13015 return SDValue();
13016
13017 // Find the UMLAL feeding the ADDC (on either side); the ADDC's other operand
// becomes AddHi. If there is no UMLAL, fall back to the MLAL combine.
13018 SDNode *UmlalNode = nullptr;
13019 SDValue AddHi;
13020 if (AddcNode->getOperand(0).getOpcode() == ARMISD::UMLAL) {
13021 UmlalNode = AddcNode->getOperand(0).getNode();
13022 AddHi = AddcNode->getOperand(1);
13023 } else if (AddcNode->getOperand(1).getOpcode() == ARMISD::UMLAL) {
13024 UmlalNode = AddcNode->getOperand(1).getNode();
13025 AddHi = AddcNode->getOperand(0);
13026 } else {
13027 return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
13028 }
13029
13030 // The UMLAL's operand 3 must be zero, and the ADDE must add that same UMLAL
13031 // to zero (in either operand order).
13032 if (!isNullConstant(UmlalNode->getOperand(3)))
13033 return SDValue();
13034
13035 if ((isNullConstant(AddeNode->getOperand(0)) &&
13036 AddeNode->getOperand(1).getNode() == UmlalNode) ||
13037 (AddeNode->getOperand(0).getNode() == UmlalNode &&
13038 isNullConstant(AddeNode->getOperand(1)))) {
13039 SelectionDAG &DAG = DCI.DAG;
// UMAAL operands: the UMLAL's first three operands, then AddHi.
13040 SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
13041 UmlalNode->getOperand(2), AddHi };
13042 SDValue UMAAL = DAG.getNode(ARMISD::UMAAL, SDLoc(AddcNode),
13043 DAG.getVTList(MVT::i32, MVT::i32), Ops);
13044
13045 // Replace the uses of the ADD nodes with the UMAAL node's values.
13046 DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), SDValue(UMAAL.getNode(), 1));
13047 DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
13048
13049 // Return original node to notify the driver to stop replacing.
13050 return SDValue(AddeNode, 0);
13051 }
13052 return SDValue();
13053}
13054
// NOTE(review): listing line 13055 (the start of the signature) is missing from
// this view. Presumably this is PerformUMLALCombine, with parameters N and DAG
// as used below - confirm.
// Rewrites N into an ARMISD::UMAAL when N's operands 2 and 3 are an ADDC and an
// ADDE of two zeros that takes that ADDC as its operand 2. Requires v6 + DSP.
13056 const ARMSubtarget *Subtarget) {
13057 if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
13058 return SDValue();
13059
13060 // Check that we have a pair of ADDC and ADDE as operands.
13061 // Both addends of the ADDE must be zero, and its operand 2 must be that ADDC.
13062 SDNode* AddcNode = N->getOperand(2).getNode();
13063 SDNode* AddeNode = N->getOperand(3).getNode();
13064 if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
13065 (AddeNode->getOpcode() == ARMISD::ADDE) &&
13066 isNullConstant(AddeNode->getOperand(0)) &&
13067 isNullConstant(AddeNode->getOperand(1)) &&
13068 (AddeNode->getOperand(2).getNode() == AddcNode))
// UMAAL operands: N's first two operands, then the two ADDC addends.
13069 return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
13070 DAG.getVTList(MVT::i32, MVT::i32),
13071 {N->getOperand(0), N->getOperand(1),
13072 AddcNode->getOperand(0), AddcNode->getOperand(1)});
13073 else
13074 return SDValue();
13075}
13076
// NOTE(review): listing lines 13077-13078 (the start of the signature) are
// missing from this view, and the function name is not visible anywhere in this
// chunk; presumably the ADDC/SUBC combine (PerformAddcSubcCombine) - confirm.
// Combines on an ARMISD::ADDC / ARMISD::SUBC node N; returns an empty SDValue
// when nothing applies.
13079 const ARMSubtarget *Subtarget) {
13080 SelectionDAG &DAG(DCI.DAG);
13081
// Only applies to a SUBC whose second result (value 1) has users.
13082 if (N->getOpcode() == ARMISD::SUBC && N->hasAnyUseOfValue(1)) {
13083 // (SUBC (ADDE 0, 0, C), 1) -> C
13084 SDValue LHS = N->getOperand(0);
13085 SDValue RHS = N->getOperand(1);
13086 if (LHS->getOpcode() == ARMISD::ADDE &&
13087 isNullConstant(LHS->getOperand(0)) &&
13088 isNullConstant(LHS->getOperand(1)) && isOneConstant(RHS)) {
13089 return DCI.CombineTo(N, SDValue(N, 0), LHS->getOperand(2));
13090 }
13091 }
13092
13093 if (Subtarget->isThumb1Only()) {
13094 SDValue RHS = N->getOperand(1);
// NOTE(review): listing line 13095 (which introduces C and opens the scope
// closed further down) is missing from this view.
13096 int32_t imm = C->getSExtValue();
// A negative immediate is negated and the opcode flipped (ADDC <-> SUBC).
// The minimum int value is excluded from this rewrite.
13097 if (imm < 0 && imm > std::numeric_limits<int>::min()) {
13098 SDLoc DL(N);
13099 RHS = DAG.getConstant(-imm, DL, MVT::i32);
13100 unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
13101 : ARMISD::ADDC;
13102 return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
13103 }
13104 }
13105 }
13106
13107 return SDValue();
13108}
13109
// NOTE(review): listing lines 13110-13111 (the start of the signature) are
// missing from this view. Presumably this is PerformAddeSubeCombine, which is
// called later in this file for Thumb1 with (N, DCI, Subtarget) - confirm.
// On Thumb1, rewrites an ADDE/SUBE with a negative constant RHS into the
// opposite opcode with the bitwise-inverted constant. On other subtargets,
// defers to AddCombineTo64bitMLAL when operand 1 is an SMUL_LOHI.
13112 const ARMSubtarget *Subtarget) {
13113 if (Subtarget->isThumb1Only()) {
13114 SelectionDAG &DAG = DCI.DAG;
13115 SDValue RHS = N->getOperand(1);
// NOTE(review): listing line 13116 (which introduces C and opens the scope
// closed further down) is missing from this view.
13117 int64_t imm = C->getSExtValue();
13118 if (imm < 0) {
13119 SDLoc DL(N);
13120
13121 // The with-carry-in form matches bitwise not instead of the negation.
13122 // Effectively, the inverse interpretation of the carry flag already
13123 // accounts for part of the negation.
13124 RHS = DAG.getConstant(~imm, DL, MVT::i32);
13125
13126 unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
13127 : ARMISD::ADDE;
13128 return DAG.getNode(Opcode, DL, N->getVTList(),
13129 N->getOperand(0), RHS, N->getOperand(2));
13130 }
13131 }
13132 } else if (N->getOperand(1)->getOpcode() == ISD::SMUL_LOHI) {
13133 return AddCombineTo64bitMLAL(N, DCI, Subtarget);
13134 }
13135 return SDValue();
13136}
13137
// NOTE(review): listing lines 13138-13139 (the start of the signature) are
// missing from this view, and the function name is not visible anywhere in this
// chunk; presumably the SELECT/SELECT_CC combine (PerformSELECTCombine) -
// confirm.
// With MVE integer ops, folds a select between a scalar and a
// VECREDUCE_{U,S}{MIN,MAX} of a v16i8/v8i16/v4i32 vector, where the select
// condition compares those same two values, into a single
// ARMISD::VMINV{u,s}/VMAXV{u,s} node.
13140 const ARMSubtarget *Subtarget) {
13141 if (!Subtarget->hasMVEIntegerOps())
13142 return SDValue();
13143
13144 SDLoc dl(N);
13145 SDValue SetCC;
13146 SDValue LHS;
13147 SDValue RHS;
13148 ISD::CondCode CC;
13149 SDValue TrueVal;
13150 SDValue FalseVal;
13151
// Extract the compared values, the condition code and the selected values
// from either a SELECT-of-SETCC or a SELECT_CC.
13152 if (N->getOpcode() == ISD::SELECT &&
13153 N->getOperand(0)->getOpcode() == ISD::SETCC) {
13154 SetCC = N->getOperand(0);
13155 LHS = SetCC->getOperand(0);
13156 RHS = SetCC->getOperand(1);
13157 CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
13158 TrueVal = N->getOperand(1);
13159 FalseVal = N->getOperand(2);
13160 } else if (N->getOpcode() == ISD::SELECT_CC) {
13161 LHS = N->getOperand(0);
13162 RHS = N->getOperand(1);
13163 CC = cast<CondCodeSDNode>(N->getOperand(4))->get();
13164 TrueVal = N->getOperand(2);
13165 FalseVal = N->getOperand(3);
13166 } else {
13167 return SDValue();
13168 }
13169
// Choose the reduction opcode from the VECREDUCE kind and the condition code.
// For the "opposite" condition code the selected values are swapped.
13170 unsigned int Opcode = 0;
13171 if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMIN ||
13172 FalseVal->getOpcode() == ISD::VECREDUCE_UMIN) &&
13173 (CC == ISD::SETULT || CC == ISD::SETUGT)) {
13174 Opcode = ARMISD::VMINVu;
13175 if (CC == ISD::SETUGT)
13176 std::swap(TrueVal, FalseVal);
13177 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMIN ||
13178 FalseVal->getOpcode() == ISD::VECREDUCE_SMIN) &&
13179 (CC == ISD::SETLT || CC == ISD::SETGT)) {
13180 Opcode = ARMISD::VMINVs;
13181 if (CC == ISD::SETGT)
13182 std::swap(TrueVal, FalseVal);
13183 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_UMAX ||
13184 FalseVal->getOpcode() == ISD::VECREDUCE_UMAX) &&
13185 (CC == ISD::SETUGT || CC == ISD::SETULT)) {
13186 Opcode = ARMISD::VMAXVu;
13187 if (CC == ISD::SETULT)
13188 std::swap(TrueVal, FalseVal);
13189 } else if ((TrueVal->getOpcode() == ISD::VECREDUCE_SMAX ||
13190 FalseVal->getOpcode() == ISD::VECREDUCE_SMAX) &&
13191 (CC == ISD::SETGT || CC == ISD::SETLT)) {
13192 Opcode = ARMISD::VMAXVs;
13193 if (CC == ISD::SETLT)
13194 std::swap(TrueVal, FalseVal);
13195 } else
13196 return SDValue();
13197
13198 // Normalise to the right hand side being the vector reduction
// NOTE(review): listing lines 13200-13203 (the case labels of this switch) are
// missing from this view.
13199 switch (TrueVal->getOpcode()) {
13204 std::swap(LHS, RHS);
13205 std::swap(TrueVal, FalseVal);
13206 break;
13207 }
13208
13209 EVT VectorType = FalseVal->getOperand(0).getValueType();
13210
// Only these three vector types are accepted as the reduced vector.
13211 if (VectorType != MVT::v16i8 && VectorType != MVT::v8i16 &&
13212 VectorType != MVT::v4i32)
13213 return SDValue();
13214
13215 EVT VectorScalarType = VectorType.getVectorElementType();
13216
13217 // The values being selected must also be the ones being compared
13218 if (TrueVal != LHS || FalseVal != RHS)
13219 return SDValue();
13220
13221 EVT LeftType = LHS->getValueType(0);
13222 EVT RightType = RHS->getValueType(0);
13223
13224 // The types must match the reduced type too
13225 if (LeftType != VectorScalarType || RightType != VectorScalarType)
13226 return SDValue();
13227
13228 // Legalise the scalar to an i32
13229 if (VectorScalarType != MVT::i32)
13230 LHS = DCI.DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, LHS);
13231
13232 // Generate the reduction as an i32 for legalisation purposes. Its operands
// are the scalar and the vector that was being reduced.
13233 auto Reduction =
13234 DCI.DAG.getNode(Opcode, dl, MVT::i32, LHS, RHS->getOperand(0));
13235
13236 // The result isn't actually an i32 so truncate it back to its original type
13237 if (VectorScalarType != MVT::i32)
13238 Reduction = DCI.DAG.getNode(ISD::TRUNCATE, dl, VectorScalarType, Reduction);
13239
13240 return Reduction;
13241}
13242
13243// A special combine for the vqdmulh family of instructions. This is one of the
13244// potential set of patterns that could match this instruction. The base pattern
13245// you would expect to be min(max(ashr(mul(mul(sext(x), 2), sext(y)), 16))).
13246// This matches the different form min(max(ashr(mul(mul(sext(x), sext(y)), 2), 16))),
13247// which llvm will have optimized to min(ashr(mul(sext(x), sext(y)), 15))) as
13248// the max is unnecessary.
// NOTE(review): listing line 13249 (the signature) is missing from this view.
// Presumably this is PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG), which
// is called later in this file with (N, DCI.DAG) - confirm.
13250 EVT VT = N->getValueType(0);
13251 SDValue Shft;
13252 ConstantSDNode *Clamp;
13253
13254 if (!VT.isVector() || VT.getScalarSizeInBits() > 64)
13255 return SDValue();
13256
// Find the shifted value and the clamp constant of the (possibly expanded) smin.
13257 if (N->getOpcode() == ISD::SMIN) {
13258 Shft = N->getOperand(0);
13259 Clamp = isConstOrConstSplat(N->getOperand(1));
13260 } else if (N->getOpcode() == ISD::VSELECT) {
13261 // Detect a SMIN, which for an i64 node will be a vselect/setcc, not a smin.
13262 SDValue Cmp = N->getOperand(0);
13263 if (Cmp.getOpcode() != ISD::SETCC ||
13264 cast<CondCodeSDNode>(Cmp.getOperand(2))->get() != ISD::SETLT ||
13265 Cmp.getOperand(0) != N->getOperand(1) ||
13266 Cmp.getOperand(1) != N->getOperand(2))
13267 return SDValue();
13268 Shft = N->getOperand(1);
13269 Clamp = isConstOrConstSplat(N->getOperand(2));
13270 } else
13271 return SDValue();
13272
13273 if (!Clamp)
13274 return SDValue();
13275
// The clamp value selects the narrow element type and the shift amount that
// must accompany it.
13276 MVT ScalarType;
13277 int ShftAmt = 0;
13278 switch (Clamp->getSExtValue()) {
13279 case (1 << 7) - 1:
13280 ScalarType = MVT::i8;
13281 ShftAmt = 7;
13282 break;
13283 case (1 << 15) - 1:
13284 ScalarType = MVT::i16;
13285 ShftAmt = 15;
13286 break;
13287 case (1ULL << 31) - 1:
13288 ScalarType = MVT::i32;
13289 ShftAmt = 31;
13290 break;
13291 default:
13292 return SDValue();
13293 }
13294
13295 if (Shft.getOpcode() != ISD::SRA)
13296 return SDValue();
// NOTE(review): listing line 13297 (the definition of N1) is missing from this
// view.
13298 if (!N1 || N1->getSExtValue() != ShftAmt)
13299 return SDValue();
13300
13301 SDValue Mul = Shft.getOperand(0);
13302 if (Mul.getOpcode() != ISD::MUL)
13303 return SDValue();
13304
// Both multiply operands must be sign extensions from the same vector type,
// whose element type is ScalarType, and the result element must be at least
// twice as wide as ScalarType.
13305 SDValue Ext0 = Mul.getOperand(0);
13306 SDValue Ext1 = Mul.getOperand(1);
13307 if (Ext0.getOpcode() != ISD::SIGN_EXTEND ||
13308 Ext1.getOpcode() != ISD::SIGN_EXTEND)
13309 return SDValue();
13310 EVT VecVT = Ext0.getOperand(0).getValueType();
13311 if (!VecVT.isPow2VectorType() || VecVT.getVectorNumElements() == 1)
13312 return SDValue();
13313 if (Ext1.getOperand(0).getValueType() != VecVT ||
13314 VecVT.getScalarType() != ScalarType ||
13315 VT.getScalarSizeInBits() < ScalarType.getScalarSizeInBits() * 2)
13316 return SDValue();
13317
13318 SDLoc DL(Mul);
// Number of ScalarType lanes in a 128-bit vector.
13319 unsigned LegalLanes = 128 / (ShftAmt + 1);
13320 EVT LegalVecVT = MVT::getVectorVT(ScalarType, LegalLanes);
13321 // For types smaller than legal vectors, extend to a legal type and only use
13322 // the needed lanes.
13323 if (VecVT.getSizeInBits() < 128) {
// NOTE(review): listing line 13325 (the start of the ExtVecVT initializer) is
// missing from this view.
13324 EVT ExtVecVT =
13326 VecVT.getVectorNumElements());
13327 SDValue Inp0 =
13328 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext0.getOperand(0));
13329 SDValue Inp1 =
13330 DAG.getNode(ISD::ANY_EXTEND, DL, ExtVecVT, Ext1.getOperand(0));
13331 Inp0 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp0);
13332 Inp1 = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, LegalVecVT, Inp1);
13333 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13334 SDValue Trunc = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, ExtVecVT, VQDMULH);
13335 Trunc = DAG.getNode(ISD::TRUNCATE, DL, VecVT, Trunc);
13336 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Trunc);
13337 }
13338
13339 // For larger types, split into legal sized chunks.
13340 assert(VecVT.getSizeInBits() % 128 == 0 && "Expected a power2 type");
13341 unsigned NumParts = VecVT.getSizeInBits() / 128;
// NOTE(review): listing line 13342 (the declaration of Parts) is missing from
// this view.
13343 for (unsigned I = 0; I < NumParts; ++I) {
13344 SDValue Inp0 =
13345 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext0.getOperand(0),
13346 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13347 SDValue Inp1 =
13348 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LegalVecVT, Ext1.getOperand(0),
13349 DAG.getVectorIdxConstant(I * LegalLanes, DL));
13350 SDValue VQDMULH = DAG.getNode(ARMISD::VQDMULH, DL, LegalVecVT, Inp0, Inp1);
13351 Parts.push_back(VQDMULH);
13352 }
// Reassemble the chunks and sign-extend back to the original result type.
13353 return DAG.getNode(ISD::SIGN_EXTEND, DL, VT,
13354 DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Parts));
13355}
13356
// NOTE(review): listing lines 13357-13358 (the start of the signature) are
// missing from this view, and the function name is not visible anywhere in this
// chunk; presumably the VSELECT combine (PerformVSELECTCombine) - confirm.
// MVE-only combines on a vselect node N: constant-folds an all-zero or 0xffff
// predicate, tries the VQDMULH combine, and swaps the arms of
// vselect(not(cond), lhs, rhs).
13359 const ARMSubtarget *Subtarget) {
13360 if (!Subtarget->hasMVEIntegerOps())
13361 return SDValue();
13362
13363 // Constant fold vselect 0, A, B -> B
13364 // and vselect 0xffff, A, B -> A
13365 if (N->getOperand(0).getOpcode() == ARMISD::PREDICATE_CAST &&
13366 isa<ConstantSDNode>(N->getOperand(0).getOperand(0))) {
13367 unsigned C = N->getOperand(0).getConstantOperandVal(0);
13368 if (C == 0)
13369 return N->getOperand(2);
13370 if (C == 0xffff)
13371 return N->getOperand(1);
13372 }
13373
13374 if (SDValue V = PerformVQDMULHCombine(N, DCI.DAG))
13375 return V;
13376
13377 // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
13378 //
13379 // We need to re-implement this optimization here as the implementation in the
13380 // Target-Independent DAGCombiner does not handle the kind of constant we make
13381 // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
13382 // good reason, allowing truncation there would break other targets).
13383 //
13384 // Currently, this is only done for MVE, as it's the only target that benefits
13385 // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
13386 if (N->getOperand(0).getOpcode() != ISD::XOR)
13387 return SDValue();
13388 SDValue XOR = N->getOperand(0);
13389
13390 // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
13391 // It is important to check with truncation allowed as the BUILD_VECTORs we
13392 // generate in those situations will truncate their operands.
13393 ConstantSDNode *Const =
13394 isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
13395 /*AllowTruncation*/ true);
13396 if (!Const || !Const->isOne())
13397 return SDValue();
13398
13399 // Rewrite into vselect(cond, rhs, lhs).
13400 SDValue Cond = XOR->getOperand(0);
13401 SDValue LHS = N->getOperand(1);
13402 SDValue RHS = N->getOperand(2);
13403 EVT Type = N->getValueType(0);
13404 return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
13405}
13406
13407// Convert vsetcc([0,1,2,..], splat(n), ult) -> vctp n
// NOTE(review): listing lines 13408-13409 (the start of the signature) are
// missing from this view, and the function name is not visible anywhere in this
// chunk; presumably PerformVSetCCToVCTPCombine - confirm.
13410 const ARMSubtarget *Subtarget) {
13411 SDValue Op0 = N->getOperand(0);
13412 SDValue Op1 = N->getOperand(1);
13413 ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
13414 EVT VT = N->getValueType(0);
13415
// NOTE(review): listing line 13417 (the rest of this condition) is missing from
// this view.
13416 if (!Subtarget->hasMVEIntegerOps() ||
13418 return SDValue();
13419
// Canonicalise (a uge b) into (b ult a) so only SETULT needs handling below.
13420 if (CC == ISD::SETUGE) {
13421 std::swap(Op0, Op1);
13422 CC = ISD::SETULT;
13423 }
13424
// NOTE(review): listing line 13426 (the rest of this condition) is missing from
// this view.
13425 if (CC != ISD::SETULT || VT.getScalarSizeInBits() != 1 ||
13427 return SDValue();
13428
13429 // Check first operand is BuildVector of 0,1,2,... (undef elements are skipped).
// NOTE(review): listing line 13432 (the middle of this condition) is missing
// from this view.
13430 for (unsigned I = 0; I < VT.getVectorNumElements(); I++) {
13431 if (!Op0.getOperand(I).isUndef() &&
13433 Op0.getConstantOperandVal(I) == I))
13434 return SDValue();
13435 }
13436
13437 // The second operand must be a splat; Op1S is the splatted scalar.
13438 SDValue Op1S = DCI.DAG.getSplatValue(Op1);
13439 if (!Op1S)
13440 return SDValue();
13441
// Pick the vctp intrinsic that matches the number of predicate lanes.
13442 unsigned Opc;
13443 switch (VT.getVectorNumElements()) {
13444 case 2:
13445 Opc = Intrinsic::arm_mve_vctp64;
13446 break;
13447 case 4:
13448 Opc = Intrinsic::arm_mve_vctp32;
13449 break;
13450 case 8:
13451 Opc = Intrinsic::arm_mve_vctp16;
13452 break;
13453 case 16:
13454 Opc = Intrinsic::arm_mve_vctp8;
13455 break;
13456 default:
13457 return SDValue();
13458 }
13459
// The intrinsic takes the splatted scalar zero-extended or truncated to i32.
13460 SDLoc DL(N);
13461 return DCI.DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
13462 DCI.DAG.getConstant(Opc, DL, MVT::i32),
13463 DCI.DAG.getZExtOrTrunc(Op1S, DL, MVT::i32));
13464}
13465
13466/// PerformADDECombine - Target-specific dag combine transform from
13467/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
13468/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
// NOTE(review): listing lines 13469-13470 (the start of the signature) are
// missing from this view.
13471 const ARMSubtarget *Subtarget) {
13472 // Only ARM and Thumb2 support UMLAL/SMLAL, so Thumb1 takes the separate
// ADDE/SUBE combine instead.
13473 if (Subtarget->isThumb1Only())
13474 return PerformAddeSubeCombine(N, DCI, Subtarget);
13475
13476 // Only perform the checks after legalize when the pattern is available.
13477 if (DCI.isBeforeLegalize()) return SDValue();
13478
13479 return AddCombineTo64bitUMAAL(N, DCI, Subtarget);
13480}
13481
13482/// PerformADDCombineWithOperands - Try DAG combinations for an ADD with
13483/// operands N0 and N1. This is a helper for PerformADDCombine that is
13484/// called with the default operands, and if that fails, with commuted
13485/// operands. Returns the first successful combine, or an empty SDValue.
// NOTE(review): listing lines 13486-13487 (the start of the signature) are
// missing from this view.
13488 const ARMSubtarget *Subtarget){
13489 // Attempt to create vpadd for this add.
13490 if (SDValue Result = AddCombineToVPADD(N, N0, N1, DCI, Subtarget))
13491 return Result;
13492
13493 // Attempt to create vpaddl for this add.
13494 if (SDValue Result = AddCombineVUZPToVPADDL(N, N0, N1, DCI, Subtarget))
13495 return Result;
13496 if (SDValue Result = AddCombineBUILD_VECTORToVPADDL(N, N0, N1, DCI,
13497 Subtarget))
13498 return Result;
13499
13500 // fold (add (select cc, 0, c), x) -> (select cc, x, (add x, c))
// Only attempted when N0 has a single use.
13501 if (N0.getNode()->hasOneUse())
13502 if (SDValue Result = combineSelectAndUse(N, N0, N1, DCI))
13503 return Result;
13504 return SDValue();
13505}
13506
13508 EVT VT = N->getValueType(0);
13509 SDValue N0 = N->getOperand(0);
13510 SDValue N1 = N->getOperand(1);
13511 SDLoc dl(N);
13512
13513 auto IsVecReduce = [](SDValue Op) {
13514 switch (Op.getOpcode()) {
13515 case ISD::VECREDUCE_ADD:
13516 case ARMISD::VADDVs:
13517 case ARMISD::VADDVu:
13518 case ARMISD::VMLAVs:
13519 case ARMISD::VMLAVu:
13520 return true;
13521 }
13522 return false;
13523 };
13524
13525 auto DistrubuteAddAddVecReduce = [&](SDValue N0, SDValue N1) {
13526 // Distribute add(X, add(vecreduce(Y), vecreduce(Z))) ->
13527 // add(add(X, vecreduce(Y)), vecreduce(Z))
13528 // to make better use of vaddva style instructions.
13529 if (VT == MVT::i32 && N1.getOpcode() == ISD::ADD && !IsVecReduce(N0) &&
13530 IsVecReduce(N1.getOperand(0)) && IsVecReduce(N1.getOperand(1)) &&
13531 !isa<ConstantSDNode>(N0) && N1->hasOneUse()) {
13532 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0, N1.getOperand(0));
13533 return DAG.getNode(ISD::ADD, dl, VT, Add0, N1.getOperand(1));
13534 }
13535 // And turn add(add(A, reduce(B)), add(C, reduce(D))) ->
13536 // add(add(add(A, C), reduce(B)), reduce(D))
13537 if (VT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
13538 N1.getOpcode() == ISD::ADD && N0->hasOneUse() && N1->hasOneUse()) {
13539 unsigned N0RedOp = 0;
13540 if (!IsVecReduce(N0.getOperand(N0RedOp))) {
13541 N0RedOp = 1;
13542 if (!IsVecReduce(N0.getOperand(N0RedOp)))
13543 return SDValue();
13544 }
13545
13546 unsigned N1RedOp = 0;
13547 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13548 N1RedOp = 1;
13549 if (!IsVecReduce(N1.getOperand(N1RedOp)))
13550 return SDValue();
13551
13552 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, N0.getOperand(1 - N0RedOp),
13553 N1.getOperand(1 - N1RedOp));
13554 SDValue Add1 =
13555 DAG.getNode(ISD::ADD, dl, VT, Add0, N0.getOperand(N0RedOp));
13556 return DAG.getNode(ISD::ADD, dl, VT, Add1, N1.getOperand(N1RedOp));
13557 }
13558 return SDValue();
13559 };
13560 if (SDValue R = DistrubuteAddAddVecReduce(N0, N1))
13561 return R;
13562 if (SDValue R = DistrubuteAddAddVecReduce(N1, N0))
13563 return R;
13564
13565 // Distribute add(vecreduce(load(Y)), vecreduce(load(Z)))
13566 // Or add(add(X, vecreduce(load(Y))), vecreduce(load(Z)))
13567 // by ascending load offsets. This can help cores prefetch if the order of
13568 // loads is more predictable.
13569 auto DistrubuteVecReduceLoad = [&](SDValue N0, SDValue N1, bool IsForward) {
13570 // Check if two reductions are known to load data where one is before/after
13571 // another. Return negative if N0 loads data before N1, positive if N1 is
13572 // before N0 and 0 otherwise if nothing is known.
13573 auto IsKnownOrderedLoad = [&](SDValue N0, SDValue N1) {
13574 // Look through to the first operand of a MUL, for the VMLA case.
13575 // Currently only looks at the first operand, in the hope they are equal.
13576 if (N0.getOpcode() == ISD::MUL)
13577 N0 = N0.getOperand(0);
13578 if (N1.getOpcode() == ISD::MUL)
13579 N1 = N1.getOperand(0);
13580
13581 // Return true if the two operands are loads to the same object and the
13582 // offset of the first is known to be less than the offset of the second.
13583 LoadSDNode *Load0 = dyn_cast<LoadSDNode>(N0);
13584 LoadSDNode *Load1 = dyn_cast<LoadSDNode>(N1);
13585 if (!Load0 || !Load1 || Load0->getChain() != Load1->getChain() ||
13586 !Load0->isSimple() || !Load1->isSimple() || Load0->isIndexed() ||
13587 Load1->isIndexed())
13588 return 0;
13589
13590 auto BaseLocDecomp0 = BaseIndexOffset::match(Load0, DAG);
13591 auto BaseLocDecomp1 = BaseIndexOffset::match(Load1, DAG);
13592
13593 if (!BaseLocDecomp0.getBase() ||
13594 BaseLocDecomp0.getBase() != BaseLocDecomp1.getBase() ||
13595 !BaseLocDecomp0.hasValidOffset() || !BaseLocDecomp1.hasValidOffset())
13596 return 0;
13597 if (BaseLocDecomp0.getOffset() < BaseLocDecomp1.getOffset())
13598 return -1;
13599 if (BaseLocDecomp0.getOffset() > BaseLocDecomp1.getOffset())
13600 return 1;
13601 return 0;
13602 };
13603
13604 SDValue X;
13605 if (N0.getOpcode() == ISD::ADD && N0->hasOneUse()) {
13606 if (IsVecReduce(N0.getOperand(0)) && IsVecReduce(N0.getOperand(1))) {
13607 int IsBefore = IsKnownOrderedLoad(N0.getOperand(0).getOperand(0),
13608 N0.getOperand(1).getOperand(0));
13609 if (IsBefore < 0) {
13610 X = N0.getOperand(0);
13611 N0 = N0.getOperand(1);
13612 } else if (IsBefore > 0) {
13613 X = N0.getOperand(1);
13614 N0 = N0.getOperand(0);
13615 } else
13616 return SDValue();
13617 } else if (IsVecReduce(N0.getOperand(0))) {
13618 X = N0.getOperand(1);
13619 N0 = N0.getOperand(0);
13620 } else if (IsVecReduce(N0.getOperand(1))) {
13621 X = N0.getOperand(0);
13622 N0 = N0.getOperand(1);
13623 } else
13624 return SDValue();
13625 } else if (IsForward && IsVecReduce(N0) && IsVecReduce(N1) &&
13626 IsKnownOrderedLoad(N0.getOperand(0), N1.getOperand(0)) < 0) {
13627 // Note this is backward to how you would expect. We create
13628 // add(reduce(load + 16), reduce(load + 0)) so that the
13629 // add(reduce(load+16), X) is combined into VADDVA(X, load+16)), leaving
13630 // the X as VADDV(load + 0)
13631 return DAG.getNode(ISD::ADD, dl, VT, N1, N0);
13632 } else
13633 return SDValue();
13634
13635 if (!IsVecReduce(N0) || !IsVecReduce(N1))
13636 return SDValue();
13637
13638 if (IsKnownOrderedLoad(N1.getOperand(0), N0.getOperand(0)) >= 0)
13639 return SDValue();
13640
13641 // Switch from add(add(X, N0), N1) to add(add(X, N1), N0)
13642 SDValue Add0 = DAG.getNode(ISD::ADD, dl, VT, X, N1);
13643 return DAG.getNode(ISD::ADD, dl, VT, Add0, N0);
13644 };
13645 if (SDValue R = DistrubuteVecReduceLoad(N0, N1, true))
13646 return R;
13647 if (SDValue R = DistrubuteVecReduceLoad(N1, N0, false))
13648 return R;
13649 return SDValue();
13650}
13651
// NOTE(review): the first line of this function's signature, and listing lines
// 13657 and 13693, are absent from this listing. Judging by the call in
// PerformADDCombine this is PerformADDVecReduce(N, DAG, Subtarget); the stray
// `return R;` and the `Ops` container used below depend on the missing lines
// -- confirm against the upstream file.
//
// Folds an i64 ADD whose operand is a BUILD_PAIR of both i32 results of an MVE
// long reduction (VADDLV*/VMLALV*, plain or accumulating) into the
// accumulating form of that reduction. Yields an empty SDValue when the
// subtarget lacks MVE integer ops, the result type is not i64, or none of the
// patterns below match.
13653 const ARMSubtarget *Subtarget) {
13654 if (!Subtarget->hasMVEIntegerOps())
13655 return SDValue();
13656
13658 return R;
13659
13660 EVT VT = N->getValueType(0);
13661 SDValue N0 = N->getOperand(0);
13662 SDValue N1 = N->getOperand(1);
13663 SDLoc dl(N);
13664
13665 if (VT != MVT::i64)
13666 return SDValue();
13667
13668 // We are looking for a i64 add of a VADDLVx. Due to these being i64's, this
13669 // will look like:
13670 // t1: i32,i32 = ARMISD::VADDLVs x
13671 // t2: i64 = build_pair t1, t1:1
13672 // t3: i64 = add t2, y
13673 // Otherwise we try to push the add up above VADDLVAx, to potentially allow
13674 // the add to be simplified separately.
13675 // We also need to check for sext / zext and commutative adds.
// MakeVecReduce(Opcode, OpcodeA, NA, NB): NB is the candidate reduction pair,
// NA the other addend. Opcode is the plain reduction, OpcodeA its accumulating
// counterpart; the result is always built with OpcodeA.
13676 auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
13677 SDValue NB) {
13678 if (NB->getOpcode() != ISD::BUILD_PAIR)
13679 return SDValue();
// Both halves of the pair must be result 0 and result 1 of the same node.
13680 SDValue VecRed = NB->getOperand(0);
13681 if ((VecRed->getOpcode() != Opcode && VecRed->getOpcode() != OpcodeA) ||
13682 VecRed.getResNo() != 0 ||
13683 NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
13684 return SDValue();
13685
13686 if (VecRed->getOpcode() == OpcodeA) {
13687 // add(NA, VADDLVA(Inp), Y) -> VADDLVA(add(NA, Inp), Y)
13688 SDValue Inp = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
13689 VecRed.getOperand(0), VecRed.getOperand(1));
13690 NA = DAG.getNode(ISD::ADD, dl, MVT::i64, Inp, NA);
13691 }
13692
// The (possibly updated) 64-bit accumulator is split into two i32 halves that
// become the first two operands of the new node.
13694 std::tie(Ops[0], Ops[1]) = DAG.SplitScalar(NA, dl, MVT::i32, MVT::i32);
13695
// For an accumulating input, operands 0 and 1 are the old accumulator halves
// (already folded into NA above), so only the remaining operands are copied.
13696 unsigned S = VecRed->getOpcode() == OpcodeA ? 2 : 0;
13697 for (unsigned I = S, E = VecRed.getNumOperands(); I < E; I++)
13698 Ops.push_back(VecRed->getOperand(I));
13699 SDValue Red =
13700 DAG.getNode(OpcodeA, dl, DAG.getVTList({MVT::i32, MVT::i32}), Ops);
13701 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
13702 SDValue(Red.getNode(), 1));
13703 };
13704
// Try each signed/unsigned, unpredicated/predicated (p) variant of VADDLV and
// VMLALV, with the reduction on either side of the add.
13705 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
13706 return M;
13707 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
13708 return M;
13709 if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
13710 return M;
13711 if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
13712 return M;
13713 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
13714 return M;
13715 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
13716 return M;
13717 if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
13718 return M;
13719 if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
13720 return M;
13721 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
13722 return M;
13723 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
13724 return M;
13725 if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
13726 return M;
13727 if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
13728 return M;
13729 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N0, N1))
13730 return M;
13731 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N0, N1))
13732 return M;
13733 if (SDValue M = MakeVecReduce(ARMISD::VMLALVps, ARMISD::VMLALVAps, N1, N0))
13734 return M;
13735 if (SDValue M = MakeVecReduce(ARMISD::VMLALVpu, ARMISD::VMLALVApu, N1, N0))
13736 return M;
13737 return SDValue();
13738}
13739
// NOTE(review): the signature line naming this method (listing line 13741) is
// absent from this listing; the visible parameters are a shift node N and a
// CombineLevel Level -- confirm the name against the upstream file.
//
// Tells the combiner whether commuting the shift N with its first operand is
// desirable. Summary of the visible logic:
//   - false if the shifted value has other users, or it is a SIGN_EXTEND whose
//     source has other users;
//   - true before type legalization, and always true for SRA/SRL;
//   - for SHL on Thumb1-only: false when the inner ADD/AND/OR/XOR has a
//     constant below 256 (or, for ADD, a negative constant above -256),
//     true otherwise;
//   - for SHL on every other subtarget: false.
13740bool
13742 CombineLevel Level) const {
13743 assert((N->getOpcode() == ISD::SHL || N->getOpcode() == ISD::SRA ||
13744 N->getOpcode() == ISD::SRL) &&
13745 "Expected shift op");
13746
13747 SDValue ShiftLHS = N->getOperand(0);
13748 if (!ShiftLHS->hasOneUse())
13749 return false;
13750
13751 if (ShiftLHS.getOpcode() == ISD::SIGN_EXTEND &&
13752 !ShiftLHS.getOperand(0)->hasOneUse())
13753 return false;
13754
13755 if (Level == BeforeLegalizeTypes)
13756 return true;
13757
13758 if (N->getOpcode() != ISD::SHL)
13759 return true;
13760
13761 if (Subtarget->isThumb1Only()) {
13762 // Avoid making expensive immediates by commuting shifts. (This logic
13763 // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
13764 // for free.)
// NOTE(review): this test is redundant -- every non-SHL opcode has already
// returned true a few lines above, so the branch below cannot be taken.
13765 if (N->getOpcode() != ISD::SHL)
13766 return true;
13767 SDValue N1 = N->getOperand(0);
13768 if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
13769 N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
13770 return true;
13771 if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
// Keep small unsigned constants (< 256) unshifted.
13772 if (Const->getAPIntValue().ult(256))
13773 return false;
// Likewise for small negative addends in (-256, 0).
13774 if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
13775 Const->getAPIntValue().sgt(-256))
13776 return false;
13777 }
13778 return true;
13779 }
13780
13781 // Turn off commute-with-shift transform after legalization, so it doesn't
13782 // conflict with PerformSHLSimplify. (We could try to detect when
13783 // PerformSHLSimplify would trigger more precisely, but it isn't
13784 // really necessary.)
13785 return false;
13786}
13787
// NOTE(review): the signature line naming this method (listing line 13788) is
// absent from this listing -- confirm the name against the upstream file.
//
// N must be XOR(SHL/SRL(x, c), mask). Returns true only when both the XOR
// constant and the shift amount are constants and the XOR constant is a
// shifted mask covering exactly the bits the shift can leave populated:
//   - SHL: bits [ShiftAmt, BitWidth);
//   - SRL: bits [0, BitWidth - ShiftAmt).
// Returns false in every other case.
13789 const SDNode *N) const {
13790 assert(N->getOpcode() == ISD::XOR &&
13791 (N->getOperand(0).getOpcode() == ISD::SHL ||
13792 N->getOperand(0).getOpcode() == ISD::SRL) &&
13793 "Expected XOR(SHIFT) pattern");
13794
13795 // Only commute if the entire NOT mask is a hidden shifted mask.
13796 auto *XorC = dyn_cast<ConstantSDNode>(N->getOperand(1));
13797 auto *ShiftC = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
13798 if (XorC && ShiftC) {
13799 unsigned MaskIdx, MaskLen;
13800 if (XorC->getAPIntValue().isShiftedMask(MaskIdx, MaskLen)) {
13801 unsigned ShiftAmt = ShiftC->getZExtValue();
13802 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
13803 if (N->getOperand(0).getOpcode() == ISD::SHL)
13804 return MaskIdx == ShiftAmt && MaskLen == (BitWidth - ShiftAmt);
// SRL case: the mask must start at bit 0.
13805 return MaskIdx == 0 && MaskLen == (BitWidth - ShiftAmt);
13806 }
13807 }
13808
13809 return false;
13810}
13811
// NOTE(review): the signature line naming this method (listing line 13812) is
// absent from this listing -- confirm the name against the upstream file.
//
// N must be SHL(SRL(..)) or SRL(SHL(..)). Returns true on every subtarget that
// is not Thumb1-only; on Thumb1-only it returns true only when the scalar
// width of N's result exceeds 32 bits.
13813 const SDNode *N) const {
13814 assert(((N->getOpcode() == ISD::SHL &&
13815 N->getOperand(0).getOpcode() == ISD::SRL) ||
13816 (N->getOpcode() == ISD::SRL &&
13817 N->getOperand(0).getOpcode() == ISD::SHL)) &&
13818 "Expected shift-shift mask");
13819
13820 if (!Subtarget->isThumb1Only())
13821 return true;
13822
13823 EVT VT = N->getValueType(0);
13824 if (VT.getScalarSizeInBits() > 32)
13825 return true;
13826
13827 return false;
13828}
13829
// NOTE(review): the signature line naming this method (listing line 13830) is
// absent from this listing -- confirm the name against the upstream file.
//
// Returns true only when the subtarget has MVE integer ops, VT is a legal
// type, and SelectOpcode is ISD::VSELECT. BinOpcode, X and Y are not consulted.
13831 unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X,
13832 SDValue Y) const {
13833 return Subtarget->hasMVEIntegerOps() && isTypeLegal(VT) &&
13834 SelectOpcode == ISD::VSELECT;
13835}
13836
// NOTE(review): the whole signature of this function (listing line 13837) is
// absent from this listing; only a value type `VT` is visibly used -- confirm
// the name and parameters against the upstream file.
//
// Without NEON and without MVE integer ops the answer is true, except on
// Thumb1-only where it is true only for scalar widths of at most 32 bits.
// With either vector extension the answer is whether VT is a scalar integer.
13838 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps()) {
13839 if (Subtarget->isThumb1Only())
13840 return VT.getScalarSizeInBits() <= 32;
13841 return true;
13842 }
13843 return VT.isScalarInteger();
13844}
13845
// NOTE(review): the first signature line (listing line 13846), which declares
// `Op` and `FPVT`, is absent from this listing -- confirm the name and
// parameter types against the upstream file.
//
// Returns false unless Op is legal or custom for VT and FPVT is a simple type.
// Otherwise the answer depends on the floating-point type:
//   f16, f32      -> requires the VFP2 base;
//   f64           -> requires FP64;
//   v4f32, v8f16  -> requires MVE float ops;
//   anything else -> false.
13847 EVT VT) const {
13848 if (!isOperationLegalOrCustom(Op, VT) || !FPVT.isSimple())
13849 return false;
13850
13851 switch (FPVT.getSimpleVT().SimpleTy) {
13852 case MVT::f16:
13853 return Subtarget->hasVFP2Base();
13854 case MVT::f32:
13855 return Subtarget->hasVFP2Base();
13856 case MVT::f64:
13857 return Subtarget->hasFP64();
13858 case MVT::v4f32:
13859 case MVT::v8f16:
13860 return Subtarget->hasMVEFloatOps();
13861 default:
13862 return false;
13863 }
13864}
13865
// NOTE(review): the first signature lines (listing lines 13866-13867) are
// absent from this listing. The callers in this file invoke it as
// PerformSHLSimplify(N, DCI, Subtarget) -- confirm against the upstream file.
//
// Rewrites  op(shl(x, c2), c1')  (op in ADD/OR/XOR/AND, both constants) into
// shl(op(x, c1' >> c2), c2) when that keeps both immediates small and every
// user of N could absorb the shift itself. Returns the new SHL node, or an
// empty SDValue when the transform does not apply.
13868 const ARMSubtarget *ST) {
13869 // Allow the generic combiner to identify potential bswaps.
13870 if (DCI.isBeforeLegalize())
13871 return SDValue();
13872
13873 // DAG combiner will fold:
13874 // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
13875 // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
13876 // Other code patterns that can also be modified have the following form:
13877 // b + ((a << 1) | 510)
13878 // b + ((a << 1) & 510)
13879 // b + ((a << 1) ^ 510)
13880 // b + ((a << 1) + 510)
13881
13882 // Many instructions can perform the shift for free, but it requires both
13883 // the operands to be registers. If c1 << c2 is too large, a mov immediate
13884 // instruction will be needed. So, unfold back to the original pattern if:
13885 // - if c1 and c2 are small enough that they don't require mov imms.
13886 // - the user(s) of the node can perform an shl
13887
13888 // No shifted operands for 16-bit instructions.
// NOTE(review): isThumb1Only() presumably implies isThumb(), which would make
// the first test redundant -- confirm.
13889 if (ST->isThumb() && ST->isThumb1Only())
13890 return SDValue();
13891
13892 // Check that all the users could perform the shl themselves.
13893 for (auto *U : N->users()) {
13894 switch(U->getOpcode()) {
13895 default:
13896 return SDValue();
13897 case ISD::SUB:
13898 case ISD::ADD:
13899 case ISD::AND:
13900 case ISD::OR:
13901 case ISD::XOR:
13902 case ISD::SETCC:
13903 case ARMISD::CMP:
13904 // Check that the user isn't already using a constant because there
13905 // aren't any instructions that support an immediate operand and a
13906 // shifted operand.
13907 if (isa<ConstantSDNode>(U->getOperand(0)) ||
13908 isa<ConstantSDNode>(U->getOperand(1)))
13909 return SDValue();
13910
13911 // Check that it's not already using a shift.
13912 if (U->getOperand(0).getOpcode() == ISD::SHL ||
13913 U->getOperand(1).getOpcode() == ISD::SHL)
13914 return SDValue();
13915 break;
13916 }
13917 }
13918
13919 if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR &&
13920 N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND)
13921 return SDValue();
13922
// Only operand 0 is inspected for the SHL; the constant must be operand 1.
13923 if (N->getOperand(0).getOpcode() != ISD::SHL)
13924 return SDValue();
13925
13926 SDValue SHL = N->getOperand(0);
13927
13928 auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
13929 auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
13930 if (!C1ShlC2 || !C2)
13931 return SDValue();
13932
13933 APInt C2Int = C2->getAPIntValue();
13934 APInt C1Int = C1ShlC2->getAPIntValue();
13935 unsigned C2Width = C2Int.getBitWidth();
// Reject shift amounts that are not smaller than the constant's bit width.
13936 if (C2Int.uge(C2Width))
13937 return SDValue();
13938 uint64_t C2Value = C2Int.getZExtValue();
13939
13940 // Check that performing a lshr will not lose any information.
13941 APInt Mask = APInt::getHighBitsSet(C2Width, C2Width - C2Value);
13942 if ((C1Int & Mask) != C1Int)
13943 return SDValue();
13944
13945 // Shift the first constant.
13946 C1Int.lshrInPlace(C2Int);
13947
13948 // The immediates are encoded as an 8-bit value that can be rotated.
// LargeImm: true when the span from the lowest to the highest set bit is wider
// than 8 bits.
13949 auto LargeImm = [](const APInt &Imm) {
13950 unsigned Zeros = Imm.countl_zero() + Imm.countr_zero();
13951 return Imm.getBitWidth() - Zeros > 8;
13952 };
13953
13954 if (LargeImm(C1Int) || LargeImm(C2Int))
13955 return SDValue();
13956
13957 SelectionDAG &DAG = DCI.DAG;
13958 SDLoc dl(N);
13959 SDValue X = SHL.getOperand(0);
13960 SDValue BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
13961 DAG.getConstant(C1Int, dl, MVT::i32));
13962 // Shift left to compensate for the lshr of C1Int.
13963 SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
13964
13965 LLVM_DEBUG(dbgs() << "Simplify shl use:\n"; SHL.getOperand(0).dump();
13966 SHL.dump(); N->dump());
13967 LLVM_DEBUG(dbgs() << "Into:\n"; X.dump(); BinOp.dump(); Res.dump());
13968 return Res;
13969}
13970
13971
13972 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
13973 ///
// NOTE(review): the first signature lines (listing lines 13974-13975) are
// absent from this listing; N, DCI and Subtarget are the names used below.
//
// Tries, in order: PerformSHLSimplify, PerformADDVecReduce, then
// PerformADDCombineWithOperands with the operands as given and finally with
// them swapped. The first non-empty result is returned.
13976 const ARMSubtarget *Subtarget) {
13977 SDValue N0 = N->getOperand(0);
13978 SDValue N1 = N->getOperand(1);
13979
13980 // Only works one way, because it needs an immediate operand.
13981 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
13982 return Result;
13983
13984 if (SDValue Result = PerformADDVecReduce(N, DCI.DAG, Subtarget))
13985 return Result;
13986
13987 // First try with the default operand order.
13988 if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
13989 return Result;
13990
13991 // If that didn't work, try again with the operands commuted.
13992 return PerformADDCombineWithOperands(N, N1, N0, DCI, Subtarget);
13993}
13994
13995 // Combine (sub 0, (csinc X, Y, CC)) -> (csinv -X, Y, CC)
13996 // providing -X is as cheap as X (currently, just a constant).
// NOTE(review): the signature (listing line 13997) and the statement that
// defines `X` (listing line 14004) are absent from this listing. The caller
// uses PerformSubCSINCCombine(N, DCI.DAG) -- confirm against the upstream file.
//
// Only fires for an i32 SUB whose first operand is the constant 0 and whose
// second operand is a single-use ARMISD::CSINC.
13998 if (N->getValueType(0) != MVT::i32 || !isNullConstant(N->getOperand(0)))
13999 return SDValue();
14000 SDValue CSINC = N->getOperand(1);
14001 if (CSINC.getOpcode() != ARMISD::CSINC || !CSINC.hasOneUse())
14002 return SDValue();
14003
14005 if (!X)
14006 return SDValue();
14007
// Build CSINV(0 - X, Y, <remaining CSINC operands unchanged>).
14008 return DAG.getNode(ARMISD::CSINV, SDLoc(N), MVT::i32,
14009 DAG.getNode(ISD::SUB, SDLoc(N), MVT::i32, N->getOperand(0),
14010 CSINC.getOperand(0)),
14011 CSINC.getOperand(1), CSINC.getOperand(2),
14012 CSINC.getOperand(3));
14013}
14014
// NOTE(review): the signature (listing line 14015) and the condition guarding
// the first `return 0` (listing line 14017) are absent from this listing. The
// caller uses getNegationCost(SDValue) and sums two results -- confirm against
// the upstream file.
//
// Estimates the cost of negating Op:
//    0 -> negation is free;
//   -1 -> negating removes an instruction (Op is already `0 - x`);
//    1 -> an extra SUB would be needed.
14016 // Free to negate.
14018 return 0;
14019
14020 // Will save one instruction.
14021 if (Op.getOpcode() == ISD::SUB && isNullConstant(Op.getOperand(0)))
14022 return -1;
14023
14024 // Can freely negate by converting sra <-> srl.
// Applies only to a single-use shift by (bit width - 1), i.e. one that
// isolates the top bit.
14025 if (Op.getOpcode() == ISD::SRA || Op.getOpcode() == ISD::SRL) {
14026 ConstantSDNode *ShiftAmt = dyn_cast<ConstantSDNode>(Op.getOperand(1));
14027 if (Op.hasOneUse() && ShiftAmt &&
14028 ShiftAmt->getZExtValue() == Op.getValueType().getScalarSizeInBits() - 1)
14029 return 0;
14030 }
14031
14032 // Will have to create sub.
14033 return 1;
14034}
14035
14036 // Try to fold
14037 //
14038 // (neg (cmov X, Y)) -> (cmov (neg X), (neg Y))
14039 //
14040 // The folding helps cmov to be matched with csneg without generating
14041 // redundant neg instruction.
// NOTE(review): the signature (listing line 14042) is absent from this
// listing. The caller uses performNegCMovCombine(N, DCI.DAG) -- confirm
// against the upstream file.
//
// N must be an ISD::SUB. The fold applies only when N is `0 - cmov`, the CMOV
// has a single use, and the summed negation cost of its two value operands is
// not positive. The last two CMOV operands are forwarded unchanged.
14043 assert(N->getOpcode() == ISD::SUB);
14044 if (!isNullConstant(N->getOperand(0)))
14045 return SDValue();
14046
14047 SDValue CMov = N->getOperand(1);
14048 if (CMov.getOpcode() != ARMISD::CMOV || !CMov->hasOneUse())
14049 return SDValue();
14050
14051 SDValue N0 = CMov.getOperand(0);
14052 SDValue N1 = CMov.getOperand(1);
14053
14054 // Only perform the fold if we actually save something.
14055 if (getNegationCost(N0) + getNegationCost(N1) > 0)
14056 return SDValue();
14057
14058 SDLoc DL(N);
14059 EVT VT = CMov.getValueType();
14060
14061 SDValue N0N = DAG.getNegative(N0, DL, VT);
14062 SDValue N1N = DAG.getNegative(N1, DL, VT);
14063 return DAG.getNode(ARMISD::CMOV, DL, VT, N0N, N1N, CMov.getOperand(2),
14064 CMov.getOperand(3));
14065}
14066
14067 /// PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
14068 ///
// NOTE(review): the first signature lines (listing lines 14069-14070) are
// absent from this listing; N, DCI and Subtarget are the names used below.
//
// Tries the scalar folds first (select folding, sub-of-CSINC, neg-of-CMOV).
// The final fold needs MVE integer ops and a vector result type.
14071 const ARMSubtarget *Subtarget) {
14072 SDValue N0 = N->getOperand(0);
14073 SDValue N1 = N->getOperand(1);
14074
14075 // fold (sub x, (select cc, 0, c)) -> (select cc, x, (sub, x, c))
14076 if (N1.getNode()->hasOneUse())
14077 if (SDValue Result = combineSelectAndUse(N, N1, N0, DCI))
14078 return Result;
14079
14080 if (SDValue R = PerformSubCSINCCombine(N, DCI.DAG))
14081 return R;
14082
14083 if (SDValue Val = performNegCMovCombine(N, DCI.DAG))
14084 return Val;
14085
14086 if (!Subtarget->hasMVEIntegerOps() || !N->getValueType(0).isVector())
14087 return SDValue();
14088
14089 // Fold (sub (ARMvmovImm 0), (ARMvdup x)) -> (ARMvdup (sub 0, x))
14090 // so that we can readily pattern match more mve instructions which can use
14091 // a scalar operand.
14092 SDValue VDup = N->getOperand(1);
14093 if (VDup->getOpcode() != ARMISD::VDUP)
14094 return SDValue();
14095
// The zero vector may be wrapped in a BITCAST; look through it.
14096 SDValue VMov = N->getOperand(0);
14097 if (VMov->getOpcode() == ISD::BITCAST)
14098 VMov = VMov->getOperand(0);
14099
14100 if (VMov->getOpcode() != ARMISD::VMOVIMM || !isZeroVector(VMov))
14101 return SDValue();
14102
// Negate the scalar as an i32 and splat the result.
14103 SDLoc dl(N);
14104 SDValue Negate = DCI.DAG.getNode(ISD::SUB, dl, MVT::i32,
14105 DCI.DAG.getConstant(0, dl, MVT::i32),
14106 VDup->getOperand(0));
14107 return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0), Negate);
14108}
14109
14110 /// PerformVMULCombine
14111 /// Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the
14112 /// special multiplier accumulator forwarding.
14113 /// vmul d3, d0, d2
14114 /// vmla d3, d1, d2
14115 /// is faster than
14116 /// vadd d3, d0, d1
14117 /// vmul d3, d3, d2
14118 // However, for (A + B) * (A + B),
14119 // vadd d2, d0, d1
14120 // vmul d3, d0, d2
14121 // vmla d3, d1, d2
14122 // is slower than
14123 // vadd d2, d0, d1
14124 // vmul d3, d2, d2
// NOTE(review): the first signature lines (listing lines 14125-14126) are
// absent from this listing. The caller uses PerformVMULCombine(N, DCI,
// Subtarget) -- confirm against the upstream file.
//
// Does nothing unless the subtarget reports VMLx forwarding.
14127 const ARMSubtarget *Subtarget) {
14128 if (!Subtarget->hasVMLxForwarding())
14129 return SDValue();
14130
14131 SelectionDAG &DAG = DCI.DAG;
14132 SDValue N0 = N->getOperand(0);
14133 SDValue N1 = N->getOperand(1);
// Normalise so that N0 is the add/sub operand and N1 the other multiplicand.
14134 unsigned Opcode = N0.getOpcode();
14135 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14136 Opcode != ISD::FADD && Opcode != ISD::FSUB) {
14137 Opcode = N1.getOpcode();
14138 if (Opcode != ISD::ADD && Opcode != ISD::SUB &&
14139 Opcode != ISD::FADD && Opcode != ISD::FSUB)
14140 return SDValue();
14141 std::swap(N0, N1);
14142 }
14143
// The (A + B) * (A + B) case described above: distributing would be slower.
14144 if (N0 == N1)
14145 return SDValue();
14146
14147 EVT VT = N->getValueType(0);
14148 SDLoc DL(N);
14149 SDValue N00 = N0->getOperand(0);
14150 SDValue N01 = N0->getOperand(1);
// NOTE(review): the two products are always built as ISD::MUL, even when
// Opcode is FADD/FSUB -- confirm this path is only reached with integer
// operands.
14151 return DAG.getNode(Opcode, DL, VT,
14152 DAG.getNode(ISD::MUL, DL, VT, N00, N1),
14153 DAG.getNode(ISD::MUL, DL, VT, N01, N1));
14154}
14155
// NOTE(review): the first signature line (listing line 14156) is absent from
// this listing. The caller uses PerformMVEVMULLCombine(N, DAG, Subtarget) --
// confirm against the upstream file.
//
// For a v2i64 multiply whose operands are both sign-extended-from-32-bit, or
// both zero-extended-from-32-bit, emits ARMISD::VMULLs / ARMISD::VMULLu on the
// un-extended sources reinterpreted as v4i32. Returns an empty SDValue for any
// other type or operand shape.
14157 const ARMSubtarget *Subtarget) {
14158 EVT VT = N->getValueType(0);
14159 if (VT != MVT::v2i64)
14160 return SDValue();
14161
14162 SDValue N0 = N->getOperand(0);
14163 SDValue N1 = N->getOperand(1);
14164
// IsSignExt: returns the source of a SIGN_EXTEND_INREG from 32-bit elements,
// or an empty SDValue.
14165 auto IsSignExt = [&](SDValue Op) {
14166 if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
14167 return SDValue();
14168 EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
14169 if (VT.getScalarSizeInBits() == 32)
14170 return Op->getOperand(0);
14171 return SDValue();
14172 };
// IsZeroExt: returns the masked value when Op is an AND with the
// (-1, 0, -1, 0) v4i32 mask, or an empty SDValue.
14173 auto IsZeroExt = [&](SDValue Op) {
14174 // Zero extends are a little more awkward. At the point we are matching
14175 // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
14176 // That might be before or after a bitcast depending on how the and is
14177 // placed. Because this has to look through bitcasts, it is currently only
14178 // supported on LE.
14179 if (!Subtarget->isLittle())
14180 return SDValue();
14181
14182 SDValue And = Op;
14183 if (And->getOpcode() == ISD::BITCAST)
14184 And = And->getOperand(0);
14185 if (And->getOpcode() != ISD::AND)
14186 return SDValue();
// Only operand 1 of the AND is examined as the mask.
14187 SDValue Mask = And->getOperand(1);
14188 if (Mask->getOpcode() == ISD::BITCAST)
14189 Mask = Mask->getOperand(0);
14190
14191 if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
14192 Mask.getValueType() != MVT::v4i32)
14193 return SDValue();
14194 if (isAllOnesConstant(Mask->getOperand(0)) &&
14195 isNullConstant(Mask->getOperand(1)) &&
14196 isAllOnesConstant(Mask->getOperand(2)) &&
14197 isNullConstant(Mask->getOperand(3)))
14198 return And->getOperand(0);
14199 return SDValue();
14200 };
14201
14202 SDLoc dl(N);
// Both operands must use the same kind of extension.
14203 if (SDValue Op0 = IsSignExt(N0)) {
14204 if (SDValue Op1 = IsSignExt(N1)) {
14205 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14206 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14207 return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
14208 }
14209 }
14210 if (SDValue Op0 = IsZeroExt(N0)) {
14211 if (SDValue Op1 = IsZeroExt(N1)) {
14212 SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
14213 SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
14214 return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
14215 }
14216 }
14217
14218 return SDValue();
14219}
14220
// NOTE(review): the first signature lines (listing lines 14221-14222) are
// absent from this listing; N, DCI and Subtarget are the names used below --
// confirm the function name against the upstream file.
//
// Dispatches multiply combines by type:
//   - v2i64 with MVE integer ops -> PerformMVEVMULLCombine;
//   - 64/128-bit vectors (after legalization, not Thumb1) -> PerformVMULCombine;
//   - i32 multiply by a constant of the form +/-(2^N +/- 1) << S -> shift and
//     add/sub sequence, installed via DCI.CombineTo.
// In the i32 case the function itself returns an empty SDValue after calling
// CombineTo.
14223 const ARMSubtarget *Subtarget) {
14224 SelectionDAG &DAG = DCI.DAG;
14225
14226 EVT VT = N->getValueType(0);
14227 if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
14228 return PerformMVEVMULLCombine(N, DAG, Subtarget);
14229
14230 if (Subtarget->isThumb1Only())
14231 return SDValue();
14232
14233 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14234 return SDValue();
14235
14236 if (VT.is64BitVector() || VT.is128BitVector())
14237 return PerformVMULCombine(N, DCI, Subtarget);
14238 if (VT != MVT::i32)
14239 return SDValue();
14240
14241 ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14242 if (!C)
14243 return SDValue();
14244
// Factor the trailing zero bits out of the multiplier; they are re-applied as
// a final SHL below.
14245 int64_t MulAmt = C->getSExtValue();
14246 unsigned ShiftAmt = llvm::countr_zero<uint64_t>(MulAmt);
14247
// Keep the shift amount within 0..31.
14248 ShiftAmt = ShiftAmt & (32 - 1);
14249 SDValue V = N->getOperand(0);
14250 SDLoc DL(N);
14251
14252 SDValue Res;
14253 MulAmt >>= ShiftAmt;
14254
14255 if (MulAmt >= 0) {
14256 if (llvm::has_single_bit<uint32_t>(MulAmt - 1)) {
14257 // (mul x, 2^N + 1) => (add (shl x, N), x)
14258 Res = DAG.getNode(ISD::ADD, DL, VT,
14259 V,
14260 DAG.getNode(ISD::SHL, DL, VT,
14261 V,
14262 DAG.getConstant(Log2_32(MulAmt - 1), DL,
14263 MVT::i32)));
14264 } else if (llvm::has_single_bit<uint32_t>(MulAmt + 1)) {
14265 // (mul x, 2^N - 1) => (sub (shl x, N), x)
14266 Res = DAG.getNode(ISD::SUB, DL, VT,
14267 DAG.getNode(ISD::SHL, DL, VT,
14268 V,
14269 DAG.getConstant(Log2_32(MulAmt + 1), DL,
14270 MVT::i32)),
14271 V);
14272 } else
14273 return SDValue();
14274 } else {
14275 uint64_t MulAmtAbs = -MulAmt;
14276 if (llvm::has_single_bit<uint32_t>(MulAmtAbs + 1)) {
14277 // (mul x, -(2^N - 1)) => (sub x, (shl x, N))
14278 Res = DAG.getNode(ISD::SUB, DL, VT,
14279 V,
14280 DAG.getNode(ISD::SHL, DL, VT,
14281 V,
14282 DAG.getConstant(Log2_32(MulAmtAbs + 1), DL,
14283 MVT::i32)));
14284 } else if (llvm::has_single_bit<uint32_t>(MulAmtAbs - 1)) {
14285 // (mul x, -(2^N + 1)) => - (add (shl x, N), x)
14286 Res = DAG.getNode(ISD::ADD, DL, VT,
14287 V,
14288 DAG.getNode(ISD::SHL, DL, VT,
14289 V,
14290 DAG.getConstant(Log2_32(MulAmtAbs - 1), DL,
14291 MVT::i32)));
14292 Res = DAG.getNode(ISD::SUB, DL, VT,
14293 DAG.getConstant(0, DL, MVT::i32), Res);
14294 } else
14295 return SDValue();
14296 }
14297
// Re-apply the power-of-two factor removed above.
14298 if (ShiftAmt != 0)
14299 Res = DAG.getNode(ISD::SHL, DL, VT,
14300 Res, DAG.getConstant(ShiftAmt, DL, MVT::i32));
14301
14302 // Do not add new nodes to DAG combiner worklist.
14303 DCI.CombineTo(N, Res, false);
14304 return SDValue();
14305}
14306
// NOTE(review): the first signature lines (listing lines 14307-14308) and the
// statement defining `N01C` (listing line 14335) are absent from this listing.
// The caller uses CombineANDShift(N, DCI, Subtarget) -- confirm against the
// upstream file.
//
// For an i32 `and (shl|srl x, c2), c1` with a single-use shift and constant
// operands, tries to replace the mask with a pair of shifts, or to move the
// mask before the shift when that constant is cheaper to materialize. Returns
// the replacement node, or an empty SDValue if nothing applies.
14309 const ARMSubtarget *Subtarget) {
14310 // Allow DAGCombine to pattern-match before we touch the canonical form.
14311 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
14312 return SDValue();
14313
14314 if (N->getValueType(0) != MVT::i32)
14315 return SDValue();
14316
14317 ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N->getOperand(1));
14318 if (!N1C)
14319 return SDValue();
14320
14321 uint32_t C1 = (uint32_t)N1C->getZExtValue();
14322 // Don't transform uxtb/uxth.
14323 if (C1 == 255 || C1 == 65535)
14324 return SDValue();
14325
14326 SDNode *N0 = N->getOperand(0).getNode();
14327 if (!N0->hasOneUse())
14328 return SDValue();
14329
14330 if (N0->getOpcode() != ISD::SHL && N0->getOpcode() != ISD::SRL)
14331 return SDValue();
14332
14333 bool LeftShift = N0->getOpcode() == ISD::SHL;
14334
14336 if (!N01C)
14337 return SDValue();
14338
// Shift amounts of 0 or >= 32 are not handled.
14339 uint32_t C2 = (uint32_t)N01C->getZExtValue();
14340 if (!C2 || C2 >= 32)
14341 return SDValue();
14342
14343 // Clear irrelevant bits in the mask.
14344 if (LeftShift)
14345 C1 &= (-1U << C2);
14346 else
14347 C1 &= (-1U >> C2);
14348
14349 SelectionDAG &DAG = DCI.DAG;
14350 SDLoc DL(N);
14351
14352 // We have a pattern of the form "(and (shl x, c2) c1)" or
14353 // "(and (srl x, c2) c1)", where c1 is a shifted mask. Try to
14354 // transform to a pair of shifts, to save materializing c1.
14355
14356 // First pattern: right shift, then mask off leading bits.
14357 // FIXME: Use demanded bits?
14358 if (!LeftShift && isMask_32(C1)) {
14359 uint32_t C3 = llvm::countl_zero(C1);
14360 if (C2 < C3) {
14361 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14362 DAG.getConstant(C3 - C2, DL, MVT::i32));
14363 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14364 DAG.getConstant(C3, DL, MVT::i32));
14365 }
14366 }
14367
14368 // First pattern, reversed: left shift, then mask off trailing bits.
14369 if (LeftShift && isMask_32(~C1)) {
14370 uint32_t C3 = llvm::countr_zero(C1);
14371 if (C2 < C3) {
// Note: the local is named SHL but holds an SRL node here.
14372 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14373 DAG.getConstant(C3 - C2, DL, MVT::i32));
14374 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14375 DAG.getConstant(C3, DL, MVT::i32));
14376 }
14377 }
14378
14379 // Second pattern: left shift, then mask off leading bits.
14380 // FIXME: Use demanded bits?
14381 if (LeftShift && isShiftedMask_32(C1)) {
14382 uint32_t Trailing = llvm::countr_zero(C1);
14383 uint32_t C3 = llvm::countl_zero(C1);
14384 if (Trailing == C2 && C2 + C3 < 32) {
14385 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
14386 DAG.getConstant(C2 + C3, DL, MVT::i32));
14387 return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
14388 DAG.getConstant(C3, DL, MVT::i32));
14389 }
14390 }
14391
14392 // Second pattern, reversed: right shift, then mask off trailing bits.
14393 // FIXME: Handle other patterns of known/demanded bits.
14394 if (!LeftShift && isShiftedMask_32(C1)) {
14395 uint32_t Leading = llvm::countl_zero(C1);
14396 uint32_t C3 = llvm::countr_zero(C1);
14397 if (Leading == C2 && C2 + C3 < 32) {
// Note: the local is named SHL but holds an SRL node here.
14398 SDValue SHL = DAG.getNode(ISD::SRL, DL, MVT::i32, N0->getOperand(0),
14399 DAG.getConstant(C2 + C3, DL, MVT::i32));
14400 return DAG.getNode(ISD::SHL, DL, MVT::i32, SHL,
14401 DAG.getConstant(C3, DL, MVT::i32));
14402 }
14403 }
14404
14405 // Transform "(and (shl x, c2) c1)" into "(shl (and x, c1>>c2), c2)"
14406 // if "c1 >> c2" is a cheaper immediate than "c1"
14407 if (LeftShift &&
14408 HasLowerConstantMaterializationCost(C1 >> C2, C1, Subtarget)) {
14409
14410 SDValue And = DAG.getNode(ISD::AND, DL, MVT::i32, N0->getOperand(0),
14411 DAG.getConstant(C1 >> C2, DL, MVT::i32));
14412 return DAG.getNode(ISD::SHL, DL, MVT::i32, And,
14413 DAG.getConstant(C2, DL, MVT::i32));
14414 }
14415
14416 return SDValue();
14417}
14418
// NOTE(review): the first signature lines (listing lines 14419-14420) are
// absent from this listing; N, DCI and Subtarget are the names used below --
// confirm the function name against the upstream file.
//
// AND combines, tried in order:
//   1. vector AND with a constant splat -> immediate-form VBIC (NEON or MVE);
//   2. outside Thumb1: select folding, then PerformSHLSimplify;
//   3. on Thumb1 only: CombineANDShift.
// Illegal types and the MVE predicate types (v2i1..v16i1) are rejected up
// front.
14421 const ARMSubtarget *Subtarget) {
14422 // Attempt to use immediate-form VBIC
14423 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14424 SDLoc dl(N);
14425 EVT VT = N->getValueType(0);
14426 SelectionDAG &DAG = DCI.DAG;
14427
14428 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v2i1 ||
14429 VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1)
14430 return SDValue();
14431
14432 APInt SplatBits, SplatUndef;
14433 unsigned SplatBitSize;
14434 bool HasAnyUndefs;
14435 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14436 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14437 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14438 SplatBitSize == 64) {
14439 EVT VbicVT;
// VBIC clears the bits set in its immediate, so encode the complement of the
// AND mask.
14440 SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
14441 SplatUndef.getZExtValue(), SplatBitSize,
14442 DAG, dl, VbicVT, VT, OtherModImm);
14443 if (Val.getNode()) {
// Operate in the type chosen for the immediate, then cast back to VT.
14444 SDValue Input =
14445 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VbicVT, N->getOperand(0));
14446 SDValue Vbic = DAG.getNode(ARMISD::VBICIMM, dl, VbicVT, Input, Val);
14447 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vbic);
14448 }
14449 }
14450 }
14451
14452 if (!Subtarget->isThumb1Only()) {
14453 // fold (and (select cc, -1, c), x) -> (select cc, x, (and, x, c))
14454 if (SDValue Result = combineSelectAndUseCommutative(N, true, DCI))
14455 return Result;
14456
14457 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14458 return Result;
14459 }
14460
14461 if (Subtarget->isThumb1Only())
14462 if (SDValue Result = CombineANDShift(N, DCI, Subtarget))
14463 return Result;
14464
14465 return SDValue();
14466}
14467
14468 // Try combining OR nodes to SMULWB, SMULWT.
// NOTE(review): the first signature lines (listing lines 14469-14470) are
// absent from this listing; OR, DCI and Subtarget are the names used below --
// confirm the function name against the upstream file.
//
// Matches  or(srl(lo, 16), shl(hi, 16))  where lo/hi are the two results of
// one SMUL_LOHI and one multiplicand is a 16-bit signed value (SMULWB) or an
// `sra x, 16` (SMULWT). On success all uses of OR are redirected to the new
// node and SDValue(OR, 0) is returned; otherwise an empty SDValue.
14471 const ARMSubtarget *Subtarget) {
// Needs v6 ops; in Thumb mode it additionally needs Thumb2 and DSP.
14472 if (!Subtarget->hasV6Ops() ||
14473 (Subtarget->isThumb() &&
14474 (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
14475 return SDValue();
14476
14477 SDValue SRL = OR->getOperand(0);
14478 SDValue SHL = OR->getOperand(1);
14479
// Accept the two shifts in either operand order.
14480 if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
14481 SRL = OR->getOperand(1);
14482 SHL = OR->getOperand(0);
14483 }
14484 if (!isSRL16(SRL) || !isSHL16(SHL))
14485 return SDValue();
14486
14487 // The first operands to the shifts need to be the two results from the
14488 // same smul_lohi node.
14489 if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
14490 SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
14491 return SDValue();
14492
// The SRL must consume result 0 and the SHL result 1.
14493 SDNode *SMULLOHI = SRL.getOperand(0).getNode();
14494 if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
14495 SHL.getOperand(0) != SDValue(SMULLOHI, 1))
14496 return SDValue();
14497
14498 // Now we have:
14499 // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
14500 // For SMUL[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
14501 // For SMULWB the 16-bit value will be sign extended somehow.
14502 // For SMULWT only the SRA is required.
14503 // Check both sides of SMUL_LOHI
14504 SDValue OpS16 = SMULLOHI->getOperand(0);
14505 SDValue OpS32 = SMULLOHI->getOperand(1);
14506
14507 SelectionDAG &DAG = DCI.DAG;
// If operand 0 is not the 16-bit candidate, try the operands swapped.
14508 if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
14509 OpS16 = OpS32;
14510 OpS32 = SMULLOHI->getOperand(0);
14511 }
14512
14513 SDLoc dl(OR);
14514 unsigned Opcode = 0;
14515 if (isS16(OpS16, DAG))
14516 Opcode = ARMISD::SMULWB;
14517 else if (isSRA16(OpS16)) {
14518 Opcode = ARMISD::SMULWT;
// Use the unshifted source of the SRA as the operand.
14519 OpS16 = OpS16->getOperand(0);
14520 }
14521 else
14522 return SDValue();
14523
14524 SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
14525 DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
14526 return SDValue(OR, 0);
14527}
14528
14531 const ARMSubtarget *Subtarget) {
// Tries to replace the i32 OR node N with an ARMISD::BFI node. When a BFI is
// formed it is installed through DCI.CombineTo and SDValue(N, 0) is returned;
// in every other case an empty SDValue is returned.
// NOTE(review): the opening of this function's declaration is not visible in
// this listing; the call in PerformORCombine invokes it as
// PerformORCombineToBFI(N, DCI, Subtarget) after checking that operand 0 of N
// is a single-use ISD::AND.
14532 // BFI is only available on V6T2+
14533 if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
14534 return SDValue();
14535
14536 EVT VT = N->getValueType(0);
14537 SDValue N0 = N->getOperand(0);
14538 SDValue N1 = N->getOperand(1);
14539 SelectionDAG &DAG = DCI.DAG;
14540 SDLoc DL(N);
14541 // 1) or (and A, mask), val => ARMbfi A, val, mask
14542 // iff (val & ~mask) == val
14543 //
14544 // 2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
14545 // 2a) iff isBitFieldInvertedMask(mask) && isBitFieldInvertedMask(~mask2)
14546 // && mask == ~mask2
14547 // 2b) iff isBitFieldInvertedMask(~mask) && isBitFieldInvertedMask(mask2)
14548 // && ~mask == mask2
14549 // (i.e., copy a bitfield value into another bitfield of the same width)
// A third pattern, (or (and (shl A, #shamt), mask), B), is handled at the end
// of the function.
14550
14551 if (VT != MVT::i32)
14552 return SDValue();
14553
14554 SDValue N00 = N0.getOperand(0);
14555
14556 // The value and the mask need to be constants so we can verify this is
14557 // actually a bitfield set. If the mask is 0xffff, we can do better
14558 // via a movt instruction, so don't use BFI in that case.
14559 SDValue MaskOp = N0.getOperand(1);
// NOTE(review): the definition of MaskC is not visible in this listing;
// presumably it is MaskOp viewed as a constant node - confirm.
14561 if (!MaskC)
14562 return SDValue();
14563 unsigned Mask = MaskC->getZExtValue();
14564 if (Mask == 0xffff)
14565 return SDValue();
14566 SDValue Res;
14567 // Case (1): or (and A, mask), val => ARMbfi A, val, mask
// NOTE(review): the definition of N1C is not visible in this listing;
// presumably it is N1 viewed as a constant node - confirm.
14569 if (N1C) {
14570 unsigned Val = N1C->getZExtValue();
// Give up if Val has any bit set that the AND mask keeps.
14571 if ((Val & ~Mask) != Val)
14572 return SDValue();
14573
14574 if (ARM::isBitFieldInvertedMask(Mask)) {
// Move the value down by the index of the lowest bit cleared in Mask.
14575 Val >>= llvm::countr_zero(~Mask);
14576
14577 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00,
14578 DAG.getConstant(Val, DL, MVT::i32),
14579 DAG.getConstant(Mask, DL, MVT::i32));
14580
14581 DCI.CombineTo(N, Res, false);
14582 // Return value from the original node to inform the combiner that N is
14583 // now dead.
14584 return SDValue(N, 0);
14585 }
14586 } else if (N1.getOpcode() == ISD::AND) {
14587 // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
// NOTE(review): the definition of N11C is not visible in this listing;
// presumably it is operand 1 of N1 viewed as a constant node - confirm.
14589 if (!N11C)
14590 return SDValue();
14591 unsigned Mask2 = N11C->getZExtValue();
14592
14593 // Mask and ~Mask2 (or reverse) must be equivalent for the BFI pattern
14594 // as is to match.
14595 if (ARM::isBitFieldInvertedMask(Mask) &&
14596 (Mask == ~Mask2)) {
14597 // The pack halfword instruction works better for masks that fit it,
14598 // so use that when it's available.
14599 if (Subtarget->hasDSP() &&
14600 (Mask == 0xffff || Mask == 0xffff0000))
14601 return SDValue();
14602 // 2a
14603 unsigned amt = llvm::countr_zero(Mask2);
14604 Res = DAG.getNode(ISD::SRL, DL, VT, N1.getOperand(0),
14605 DAG.getConstant(amt, DL, MVT::i32));
14606 Res = DAG.getNode(ARMISD::BFI, DL, VT, N00, Res,
14607 DAG.getConstant(Mask, DL, MVT::i32));
14608 DCI.CombineTo(N, Res, false);
14609 // Return value from the original node to inform the combiner that N is
14610 // now dead.
14611 return SDValue(N, 0);
14612 } else if (ARM::isBitFieldInvertedMask(~Mask) &&
14613 (~Mask == Mask2)) {
14614 // The pack halfword instruction works better for masks that fit it,
14615 // so use that when it's available.
14616 if (Subtarget->hasDSP() &&
14617 (Mask2 == 0xffff || Mask2 == 0xffff0000))
14618 return SDValue();
14619 // 2b
// Here the roles are swapped: the field is taken from A (N00) and inserted
// into B (operand 0 of N1) using Mask2.
14620 unsigned lsb = llvm::countr_zero(Mask);
14621 Res = DAG.getNode(ISD::SRL, DL, VT, N00,
14622 DAG.getConstant(lsb, DL, MVT::i32));
14623 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1.getOperand(0), Res,
14624 DAG.getConstant(Mask2, DL, MVT::i32));
14625 DCI.CombineTo(N, Res, false);
14626 // Return value from the original node to inform the combiner that N is
14627 // now dead.
14628 return SDValue(N, 0);
14629 }
14630 }
14631
// NOTE(review): the last clause of the condition below (and its closing
// parenthesis / opening brace) is not visible in this listing.
14632 if (DAG.MaskedValueIsZero(N1, MaskC->getAPIntValue()) &&
14633 N00.getOpcode() == ISD::SHL && isa<ConstantSDNode>(N00.getOperand(1)) &&
14635 // Case (3): or (and (shl A, #shamt), mask), B => ARMbfi B, A, ~mask
14636 // where lsb(mask) == #shamt and masked bits of B are known zero.
14637 SDValue ShAmt = N00.getOperand(1);
14638 unsigned ShAmtC = ShAmt->getAsZExtVal();
14639 unsigned LSB = llvm::countr_zero(Mask);
14640 if (ShAmtC != LSB)
14641 return SDValue();
14642
14643 Res = DAG.getNode(ARMISD::BFI, DL, VT, N1, N00.getOperand(0),
14644 DAG.getConstant(~Mask, DL, MVT::i32));
14645
14646 DCI.CombineTo(N, Res, false);
14647 // Return value from the original node to inform the combiner that N is
14648 // now dead.
14649 return SDValue(N, 0);
14650 }
14651
14652 return SDValue();
14653}
14654
14655static bool isValidMVECond(unsigned CC, bool IsFloat) {
14656 switch (CC) {
14657 case ARMCC::EQ:
14658 case ARMCC::NE:
14659 case ARMCC::LE:
14660 case ARMCC::GT:
14661 case ARMCC::GE:
14662 case ARMCC::LT:
14663 return true;
14664 case ARMCC::HS:
14665 case ARMCC::HI:
14666 return !IsFloat;
14667 default:
14668 return false;
14669 };
14670}
14671
// Reads the condition code stored on a compare node: constant operand 2 for
// ARMISD::VCMP and constant operand 1 for ARMISD::VCMPZ, cast to
// ARMCC::CondCodes. Any other opcode is treated as unreachable.
// NOTE(review): the signature line of this function is not visible in this
// listing.
14673 if (N->getOpcode() == ARMISD::VCMP)
14674 return (ARMCC::CondCodes)N->getConstantOperandVal(2);
14675 else if (N->getOpcode() == ARMISD::VCMPZ)
14676 return (ARMCC::CondCodes)N->getConstantOperandVal(1);
14677 else
14678 llvm_unreachable("Not a VCMP/VCMPZ!");
14679}
14680
// Asks isValidMVECond whether CC is acceptable, treating the compare as
// floating point when operand 0 of N has a floating-point value type.
// NOTE(review): the signature and the definition of CC are not visible in this
// listing; presumably this is the CanInvertMVEVCMP helper called by the OR/XOR
// combines and CC is the inverted condition of N - confirm.
14683 return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
14684}
14685
14687 const ARMSubtarget *Subtarget) {
// NOTE(review): the opening of this function's declaration is not visible in
// this listing; PerformORCombine calls it as PerformORCombine_i1(N, DAG,
// Subtarget) for MVE predicate vector types.
14688 // Try to invert "or A, B" -> "not (and ~A, ~B)", as the "and" is easier to
14689 // chain together with predicates
14690 EVT VT = N->getValueType(0);
14691 SDLoc DL(N);
14692 SDValue N0 = N->getOperand(0);
14693 SDValue N1 = N->getOperand(1);
14694
// An operand counts as freely invertible only when it is a VCMP/VCMPZ node
// that CanInvertMVEVCMP accepts.
14695 auto IsFreelyInvertable = [&](SDValue V) {
14696 if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
14697 return CanInvertMVEVCMP(V);
14698 return false;
14699 };
14700
14701 // At least one operand must be freely invertible.
14702 if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
14703 return SDValue();
14704
// Build not(and(not N0, not N1)), which is equivalent to or(N0, N1).
14705 SDValue NewN0 = DAG.getLogicalNOT(DL, N0, VT);
14706 SDValue NewN1 = DAG.getLogicalNOT(DL, N1, VT);
14707 SDValue And = DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
14708 return DAG.getLogicalNOT(DL, And, VT);
14709}
14710
14711// Try to form a NEON shift-{right, left}-and-insert (VSRI/VSLI) from:
14712// (or (and X, splat (i32 C1)), (srl Y, splat (i32 C2))) -> VSRI X, Y, #C2
14713// (or (and X, splat (i32 C1)), (shl Y, splat (i32 C2))) -> VSLI X, Y, #C2
14714// where C1 is a mask that preserves the bits not written by the shift/insert,
14715// i.e. C1 has exactly its top C2 bits set for VSRI and exactly its low C2
// bits set (`C1 == (1 << C2) - 1`) for VSLI.
// The shifts are matched in their ARMISD::VSHRuIMM / ARMISD::VSHLIMM form.
// Returns the new VSRIIMM/VSLIIMM node, or an empty SDValue if AndOp/ShiftOp
// do not match.
// NOTE(review): the first line of this function's declaration is not visible
// in this listing; callers pass (DAG, AndOp, ShiftOp, VT, dl).
14717 SDValue ShiftOp, EVT VT,
14718 SDLoc dl) {
14719 // Match (and X, Mask)
14720 if (AndOp.getOpcode() != ISD::AND)
14721 return SDValue();
14722
14723 SDValue X = AndOp.getOperand(0);
14724 SDValue Mask = AndOp.getOperand(1);
14725
14726 ConstantSDNode *MaskC = isConstOrConstSplat(Mask, false, true);
14727 if (!MaskC)
14728 return SDValue();
// Narrow the constant to the mask's scalar element width before comparing.
14729 APInt MaskBits =
14730 MaskC->getAPIntValue().trunc(Mask.getScalarValueSizeInBits());
14731
14732 // Match shift (srl/shl Y, CntVec)
14733 int64_t Cnt = 0;
14734 bool IsShiftRight = false;
14735 SDValue Y;
14736
14737 if (ShiftOp.getOpcode() == ARMISD::VSHRuIMM) {
14738 IsShiftRight = true;
14739 Y = ShiftOp.getOperand(0);
14740 Cnt = ShiftOp.getConstantOperandVal(1);
14741 } else if (ShiftOp.getOpcode() == ARMISD::VSHLIMM) {
14742 Y = ShiftOp.getOperand(0);
14743 Cnt = ShiftOp.getConstantOperandVal(1);
14744 } else {
14745 return SDValue();
14746 }
14747
// The AND mask must keep exactly the Cnt bits of X that the shifted Y leaves
// vacant: the top Cnt bits for a right shift, the low Cnt bits for a left
// shift.
14748 unsigned ElemBits = VT.getScalarSizeInBits();
14749 APInt RequiredMask = IsShiftRight
14750 ? APInt::getHighBitsSet(ElemBits, (unsigned)Cnt)
14751 : APInt::getLowBitsSet(ElemBits, (unsigned)Cnt);
14752 if (MaskBits != RequiredMask)
14753 return SDValue();
14754
14755 unsigned Opc = IsShiftRight ? ARMISD::VSRIIMM : ARMISD::VSLIIMM;
14756 return DAG.getNode(Opc, dl, VT, X, Y, DAG.getConstant(Cnt, dl, MVT::i32));
14757}
14758
14759/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
///
/// The folds are attempted in this order: predicate-vector inversion
/// (PerformORCombine_i1), immediate-form VORR, select folding, SMULWB/SMULWT,
/// VSRI/VSLI shift-insert, VBSP bit select, BFI, SHL simplification and
/// finally folding into a CSINC. An empty SDValue is returned when nothing
/// applies or when the result type is not legal.
// NOTE(review): the first line of this function's declaration is not visible
// in this listing.
14761 const ARMSubtarget *Subtarget) {
14762 // Attempt to use immediate-form VORR
14763 BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
14764 SDLoc dl(N);
14765 EVT VT = N->getValueType(0);
14766 SelectionDAG &DAG = DCI.DAG;
14767
14768 if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14769 return SDValue();
14770
// MVE i1 predicate vectors are handled entirely by PerformORCombine_i1.
14771 if (Subtarget->hasMVEIntegerOps() && (VT == MVT::v2i1 || VT == MVT::v4i1 ||
14772 VT == MVT::v8i1 || VT == MVT::v16i1))
14773 return PerformORCombine_i1(N, DAG, Subtarget);
14774
14775 APInt SplatBits, SplatUndef;
14776 unsigned SplatBitSize;
14777 bool HasAnyUndefs;
14778 if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
14779 BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
14780 if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
14781 SplatBitSize == 64) {
14782 EVT VorrVT;
14783 SDValue Val =
14784 isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
14785 SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
14786 if (Val.getNode()) {
// The VORRIMM is built in VorrVT, so cast the input to that type and the
// result back to VT.
14787 SDValue Input =
14788 DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VorrVT, N->getOperand(0));
14789 SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
14790 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Vorr);
14791 }
14792 }
14793 }
14794
14795 if (!Subtarget->isThumb1Only()) {
14796 // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
14797 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14798 return Result;
14799 if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
14800 return Result;
14801 }
14802
14803 SDValue N0 = N->getOperand(0);
14804 SDValue N1 = N->getOperand(1);
14805
14806 // (or (and X, C1), (srl Y, C2)) -> VSRI X, Y, #C2
14807 // (or (and X, C1), (shl Y, C2)) -> VSLI X, Y, #C2
14808 if (VT.isVector() &&
14809 ((Subtarget->hasNEON() && DAG.getTargetLoweringInfo().isTypeLegal(VT)) ||
14810 (Subtarget->hasMVEIntegerOps() &&
14811 (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32)))) {
// OR is commutative, so try both operand orders.
14812 if (SDValue ShiftInsert =
14813 PerformORCombineToShiftInsert(DAG, N0, N1, VT, dl))
14814 return ShiftInsert;
14815
14816 if (SDValue ShiftInsert =
14817 PerformORCombineToShiftInsert(DAG, N1, N0, VT, dl))
14818 return ShiftInsert;
14819 }
14820
14821 // (or (and B, A), (and C, ~A)) => (VBSP A, B, C) when A is a constant.
// NOTE(review): the remainder of the condition below (and its opening brace)
// is not visible in this listing.
14822 if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
14824
14825 // The code below optimizes (or (and X, Y), Z).
14826 // The AND operand needs to have a single user to make these optimizations
14827 // profitable.
14828 if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
14829 return SDValue();
14830
14831 APInt SplatUndef;
14832 unsigned SplatBitSize;
14833 bool HasAnyUndefs;
14834
14835 APInt SplatBits0, SplatBits1;
// NOTE(review): the definitions of BVN0 and BVN1 are not visible in this
// listing; presumably they are the second operands of N0 and N1 viewed as
// build-vector nodes - confirm.
14838 // Ensure that the second operand of both ands are constants
14839 if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
14840 HasAnyUndefs) && !HasAnyUndefs) {
14841 if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
14842 HasAnyUndefs) && !HasAnyUndefs) {
14843 // Ensure that the bit width of the constants are the same and that
14844 // the splat arguments are logical inverses as per the pattern we
14845 // are trying to simplify.
14846 if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
14847 SplatBits0 == ~SplatBits1) {
14848 // Canonicalize the vector type to make instruction selection
14849 // simpler.
14850 EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
14851 SDValue Result = DAG.getNode(ARMISD::VBSP, dl, CanonicalVT,
14852 N0->getOperand(1),
14853 N0->getOperand(0),
14854 N1->getOperand(0));
14855 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Result);
14856 }
14857 }
14858 }
14859 }
14860
14861 // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
14862 // reasonable.
14863 if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
14864 if (SDValue Res = PerformORCombineToBFI(N, DCI, Subtarget))
14865 return Res;
14866 }
14867
14868 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14869 return Result;
14870
14871 // (or x, (csinc 0, 0, cc)) -> (csinc x, 0, cc)
14872 // providing that the x is 0 or 1.
// Accept the CSINC on either side of the OR.
14873 SDValue CSINC = N1;
14874 SDValue Other = N0;
14875 if (CSINC.getOpcode() != ARMISD::CSINC)
14876 std::swap(CSINC, Other);
// NOTE(review): the last clause of the condition below is not visible in this
// listing; per the comment above it presumably establishes that Other is 0 or
// 1 - confirm.
14877 if (CSINC.getOpcode() == ARMISD::CSINC &&
14878 isNullConstant(CSINC.getOperand(0)) &&
14879 isNullConstant(CSINC.getOperand(1)) &&
14881 return DAG.getNode(ARMISD::CSINC, dl, VT, Other, CSINC.getOperand(1),
14882 CSINC.getOperand(2), CSINC.getOperand(3));
14883
14884 return SDValue();
14885}
14886
14889 const ARMSubtarget *Subtarget) {
// Combines applied to a node N with two operands and a legal result type:
// select folding and SHL simplification (not on Thumb1-only targets), and,
// with MVE integer ops, folding "xor (vcmp/vcmpz), true" into a compare with
// a different condition code. Returns an empty SDValue when nothing applies.
// NOTE(review): the opening of this function's declaration is not visible in
// this listing; the inline comments indicate N is an XOR.
14890 EVT VT = N->getValueType(0);
14891 SelectionDAG &DAG = DCI.DAG;
14892
14893 if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
14894 return SDValue();
14895
14896 if (!Subtarget->isThumb1Only()) {
14897 // fold (xor (select cc, 0, c), x) -> (select cc, x, (xor, x, c))
14898 if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
14899 return Result;
14900
14901 if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
14902 return Result;
14903 }
14904
14905 if (Subtarget->hasMVEIntegerOps()) {
14906 // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
14907 SDValue N0 = N->getOperand(0);
14908 SDValue N1 = N->getOperand(1);
14909 const TargetLowering *TLI = Subtarget->getTargetLowering();
14910 if (TLI->isConstTrueVal(N1) &&
14911 (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
14912 if (CanInvertMVEVCMP(N0)) {
14913 SDLoc DL(N0);
// NOTE(review): the definitions of CC and Ops are not visible in this
// listing; presumably CC is the opposite of N0's condition code - confirm.
14915
// Rebuild the compare with the same operands (VCMPZ has no second operand)
// and CC as its condition.
14917 Ops.push_back(N0->getOperand(0));
14918 if (N0->getOpcode() == ARMISD::VCMP)
14919 Ops.push_back(N0->getOperand(1));
14920 Ops.push_back(DAG.getConstant(CC, DL, MVT::i32));
14921 return DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
14922 }
14923 }
14924 }
14925
14926 return SDValue();
14927}
14928
14929// ParseBFI - given a BFI instruction in N, extract the "from" value (Rn) and return it,
14930// and fill in FromMask and ToMask with (consecutive) bits in "from" to be extracted and
14931// their position in "to" (Rd).
14932static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask) {
14933 assert(N->getOpcode() == ARMISD::BFI);
14934
14935 SDValue From = N->getOperand(1);
14936 ToMask = ~N->getConstantOperandAPInt(2);
14937 FromMask = APInt::getLowBitsSet(ToMask.getBitWidth(), ToMask.popcount());
14938
14939 // If the Base came from a SHR #C, we can deduce that it is really testing bit
14940 // #C in the base of the SHR.
14941 if (From->getOpcode() == ISD::SRL &&
14942 isa<ConstantSDNode>(From->getOperand(1))) {
14943 APInt Shift = From->getConstantOperandAPInt(1);
14944 assert(Shift.getLimitedValue() < 32 && "Shift too large!");
14945 FromMask <<= Shift.getLimitedValue(31);
14946 From = From->getOperand(0);
14947 }
14948
14949 return From;
14950}
14951
14952// If A and B contain one contiguous set of bits, does A | B == A . B?
14953//
14954// Neither A nor B must be zero.
14955static bool BitsProperlyConcatenate(const APInt &A, const APInt &B) {
14956 unsigned LastActiveBitInA = A.countr_zero();
14957 unsigned FirstActiveBitInB = B.getBitWidth() - B.countl_zero() - 1;
14958 return LastActiveBitInA - 1 == FirstActiveBitInB;
14959}
14960
// Returns operand 0 of N when it is itself a BFI that reads from the same
// value as N, writes a disjoint set of destination bits, and whose "to" and
// "from" bit ranges are both directly adjacent to N's. Otherwise returns an
// empty SDValue.
// NOTE(review): the signature line of this function is not visible in this
// listing; PerformBFICombine calls it as FindBFIToCombineWith(N).
14962 // We have a BFI in N. Find a BFI it can combine with, if one exists.
14963 APInt ToMask, FromMask;
14964 SDValue From = ParseBFI(N, ToMask, FromMask);
14965 SDValue To = N->getOperand(0);
14966
14967 SDValue V = To;
14968 if (V.getOpcode() != ARMISD::BFI)
14969 return SDValue();
14970
14971 APInt NewToMask, NewFromMask;
14972 SDValue NewFrom = ParseBFI(V.getNode(), NewToMask, NewFromMask);
14973 if (NewFrom != From)
14974 return SDValue();
14975
14976 // Do the written bits conflict with any we've seen so far?
14977 if ((NewToMask & ToMask).getBoolValue())
14978 // Conflicting bits.
14979 return SDValue();
14980
14981 // Are the new bits contiguous when combined with the old bits?
// Either BFI may supply the upper half, so test both orders.
14982 if (BitsProperlyConcatenate(ToMask, NewToMask) &&
14983 BitsProperlyConcatenate(FromMask, NewFromMask))
14984 return V;
14985 if (BitsProperlyConcatenate(NewToMask, ToMask) &&
14986 BitsProperlyConcatenate(NewFromMask, FromMask))
14987 return V;
14988
14989 return SDValue();
14990}
14991
// Combines for an ARMISD::BFI node N (operands: base, inserted value, inverted
// mask): drop a redundant AND on the inserted value, merge N with an adjacent
// BFI found by FindBFIToCombineWith, or reorder two nested BFIs so the lower
// insertion is innermost. Returns the replacement node or an empty SDValue.
// NOTE(review): the signature line of this function is not visible in this
// listing.
14993 SDValue N0 = N->getOperand(0);
14994 SDValue N1 = N->getOperand(1);
14995
14996 if (N1.getOpcode() == ISD::AND) {
14997 // (bfi A, (and B, Mask1), Mask2) -> (bfi A, B, Mask2) iff
14998 // the bits being cleared by the AND are not demanded by the BFI.
// NOTE(review): the definition of N11C is not visible in this listing;
// presumably it is operand 1 of N1 viewed as a constant node - confirm.
15000 if (!N11C)
15001 return SDValue();
15002 unsigned InvMask = N->getConstantOperandVal(2);
15003 unsigned LSB = llvm::countr_zero(~InvMask);
15004 unsigned Width = llvm::bit_width<unsigned>(~InvMask) - LSB;
15005 assert(Width <
15006 static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
15007 "undefined behavior");
// Mask has the low Width bits set: the bits of the inserted value that the
// BFI actually reads.
15008 unsigned Mask = (1u << Width) - 1;
15009 unsigned Mask2 = N11C->getZExtValue();
// The AND is redundant when it keeps every bit the BFI reads.
15010 if ((Mask & (~Mask2)) == 0)
15011 return DAG.getNode(ARMISD::BFI, SDLoc(N), N->getValueType(0),
15012 N->getOperand(0), N1.getOperand(0), N->getOperand(2));
15013 return SDValue();
15014 }
15015
15016 // Look for another BFI to combine with.
15017 if (SDValue CombineBFI = FindBFIToCombineWith(N)) {
15018 // We've found a BFI.
15019 APInt ToMask1, FromMask1;
15020 SDValue From1 = ParseBFI(N, ToMask1, FromMask1);
15021
15022 APInt ToMask2, FromMask2;
15023 SDValue From2 = ParseBFI(CombineBFI.getNode(), ToMask2, FromMask2);
15024 assert(From1 == From2);
15025 (void)From2;
15026
15027 // Create a new BFI, combining the two together.
15028 APInt NewFromMask = FromMask1 | FromMask2;
15029 APInt NewToMask = ToMask1 | ToMask2;
15030
15031 EVT VT = N->getValueType(0);
15032 SDLoc dl(N);
15033
// If the combined source field does not start at bit 0, shift the source
// down so that it does.
15034 if (NewFromMask[0] == 0)
15035 From1 = DAG.getNode(ISD::SRL, dl, VT, From1,
15036 DAG.getConstant(NewFromMask.countr_zero(), dl, VT));
15037 return DAG.getNode(ARMISD::BFI, dl, VT, CombineBFI.getOperand(0), From1,
15038 DAG.getConstant(~NewToMask, dl, VT));
15039 }
15040
15041 // Reassociate BFI(BFI (A, B, M1), C, M2) to BFI(BFI (A, C, M2), B, M1) so
15042 // that lower bit insertions are performed first, providing that M1 and M2
15043 // do not overlap. This can allow multiple BFI instructions to be combined
15044 // together by the other folds above.
15045 if (N->getOperand(0).getOpcode() == ARMISD::BFI) {
15046 APInt ToMask1 = ~N->getConstantOperandAPInt(2);
15047 APInt ToMask2 = ~N0.getConstantOperandAPInt(2);
15048
// Only swap when the inner BFI has a single use, the written bits are
// disjoint, and the outer BFI does not write higher bits than the inner one.
15049 if (!N0.hasOneUse() || (ToMask1 & ToMask2) != 0 ||
15050 ToMask1.countl_zero() < ToMask2.countl_zero())
15051 return SDValue();
15052
15053 EVT VT = N->getValueType(0);
15054 SDLoc dl(N);
15055 SDValue BFI1 = DAG.getNode(ARMISD::BFI, dl, VT, N0.getOperand(0),
15056 N->getOperand(1), N->getOperand(2));
15057 return DAG.getNode(ARMISD::BFI, dl, VT, BFI1, N0.getOperand(1),
15058 N0.getOperand(2));
15059 }
15060
15061 return SDValue();
15062}
15063
15064// Check that N is CMPZ(CSINC(0, 0, CC, X)),
15065// or CMPZ(CMOV(1, 0, CC, X))
15066// return X if valid.
// A CMOV with the constants swapped, CMOV(0, 1, CC, X), is accepted as well.
// An empty SDValue is returned when nothing matches.
// NOTE(review): the signature line and the statements that set the CC output
// in each matching branch are not visible in this listing.
15068 if (Cmp->getOpcode() != ARMISD::CMPZ || !isNullConstant(Cmp->getOperand(1)))
15069 return SDValue();
15070 SDValue CSInc = Cmp->getOperand(0);
15071
15072 // Ignore any `And 1` nodes that may not yet have been removed. We are
15073 // looking for a value that produces 1/0, so these have no effect on the
15074 // code.
15075 while (CSInc.getOpcode() == ISD::AND &&
15076 isa<ConstantSDNode>(CSInc.getOperand(1)) &&
15077 CSInc.getConstantOperandVal(1) == 1 && CSInc->hasOneUse())
15078 CSInc = CSInc.getOperand(0);
15079
// Each accepted form must have a single use; operand 3 is the X returned to
// the caller.
15080 if (CSInc.getOpcode() == ARMISD::CSINC &&
15081 isNullConstant(CSInc.getOperand(0)) &&
15082 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15084 return CSInc.getOperand(3);
15085 }
15086 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(0)) &&
15087 isNullConstant(CSInc.getOperand(1)) && CSInc->hasOneUse()) {
15089 return CSInc.getOperand(3);
15090 }
15091 if (CSInc.getOpcode() == ARMISD::CMOV && isOneConstant(CSInc.getOperand(1)) &&
15092 isNullConstant(CSInc.getOperand(0)) && CSInc->hasOneUse()) {
15095 return CSInc.getOperand(3);
15096 }
15097 return SDValue();
15098}
15099
// Returns the value found by IsCMPZCSINC for N when the condition it reports
// is ARMCC::EQ; otherwise returns an empty SDValue.
// NOTE(review): the signature line and the declaration of Cond are not
// visible in this listing.
15101 // Given CMPZ(CSINC(C, 0, 0, EQ), 0), we can just use C directly. As in
15102 // t92: flags = ARMISD::CMPZ t74, 0
15103 // t93: i32 = ARMISD::CSINC 0, 0, 1, t92
15104 // t96: flags = ARMISD::CMPZ t93, 0
15105 // t114: i32 = ARMISD::CSINV 0, 0, 0, t96
15107 if (SDValue C = IsCMPZCSINC(N, Cond))
15108 if (Cond == ARMCC::EQ)
15109 return C;
15110 return SDValue();
15111}
15112
// Rebuilds N (same opcode, same first two operands, i32 result) so that it
// uses the flags found by IsCMPZCSINC on operand 3, when N's own condition
// (constant operand 2) is EQ or NE. Returns an empty SDValue otherwise.
// NOTE(review): the signature line, the declaration of Cond and the tail of
// the NE case's getNode call are not visible in this listing.
15114 // Fold away an unnecessary CMPZ/CSINC
15115 // CSXYZ A, B, C1 (CMPZ (CSINC 0, 0, C2, D), 0) ->
15116 // if C1==EQ -> CSXYZ A, B, C2, D
15117 // if C1==NE -> CSXYZ A, B, NOT(C2), D
15119 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
15120 if (N->getConstantOperandVal(2) == ARMCC::EQ)
15121 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15122 N->getOperand(1),
15123 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
15124 if (N->getConstantOperandVal(2) == ARMCC::NE)
15125 return DAG.getNode(
15126 N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
15127 N->getOperand(1),
15129 }
15130 return SDValue();
15131}
15132
15133/// PerformVMOVRRDCombine - Target-specific dag combine xforms for
15134/// ARMISD::VMOVRRD.
///
/// Three folds are tried on the f64 operand of N: forwarding the two inputs
/// of a VMOVDRR, splitting a frame-index f64 load into two i32 loads, and
/// picking the two i32 lanes out of a v4i32 build_vector / insert chain that
/// is extracted as an f64 element. Returns an empty SDValue when none apply.
// NOTE(review): the first lines of this function's declaration are not
// visible in this listing.
15137 const ARMSubtarget *Subtarget) {
15138 // vmovrrd(vmovdrr x, y) -> x,y
15139 SDValue InDouble = N->getOperand(0);
15140 if (InDouble.getOpcode() == ARMISD::VMOVDRR && Subtarget->hasFP64())
15141 return DCI.CombineTo(N, InDouble.getOperand(0), InDouble.getOperand(1));
15142
15143 // vmovrrd(load f64) -> (load i32), (load i32)
15144 SDNode *InNode = InDouble.getNode();
15145 if (ISD::isNormalLoad(InNode) && InNode->hasOneUse() &&
15146 InNode->getValueType(0) == MVT::f64 &&
15147 InNode->getOperand(1).getOpcode() == ISD::FrameIndex &&
15148 !cast<LoadSDNode>(InNode)->isVolatile()) {
15149 // TODO: Should this be done for non-FrameIndex operands?
15150 LoadSDNode *LD = cast<LoadSDNode>(InNode);
15151
15152 SelectionDAG &DAG = DCI.DAG;
15153 SDLoc DL(LD);
15154 SDValue BasePtr = LD->getBasePtr();
// First word: loaded from the original address.
15155 SDValue NewLD1 =
15156 DAG.getLoad(MVT::i32, DL, LD->getChain(), BasePtr, LD->getPointerInfo(),
15157 LD->getAlign(), LD->getMemOperand()->getFlags());
15158
15159 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
15160 DAG.getConstant(4, DL, MVT::i32));
15161
// Second word: loaded from address + 4, with the alignment reduced
// accordingly.
15162 SDValue NewLD2 = DAG.getLoad(MVT::i32, DL, LD->getChain(), OffsetPtr,
15163 LD->getPointerInfo().getWithOffset(4),
15164 commonAlignment(LD->getAlign(), 4),
15165 LD->getMemOperand()->getFlags());
15166
// Users of the original load's chain result now depend on the second new
// load's chain.
15167 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewLD2.getValue(1));
15168 if (DCI.DAG.getDataLayout().isBigEndian())
15169 std::swap (NewLD1, NewLD2);
15170 SDValue Result = DCI.CombineTo(N, NewLD1, NewLD2);
15171 return Result;
15172 }
15173
15174 // VMOVRRD(extract(..(build_vector(a, b, c, d)))) -> a,b or c,d
15175 // VMOVRRD(extract(insert_vector(insert_vector(.., a, l1), b, l2))) -> a,b
15176 if (InDouble.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15177 isa<ConstantSDNode>(InDouble.getOperand(1))) {
15178 SDValue BV = InDouble.getOperand(0);
15179 // Look up through any nop bitcasts and vector_reg_casts. bitcasts may
15180 // change lane order under big endian.
// BVSwap ends up describing the innermost cast that was looked through.
15181 bool BVSwap = BV.getOpcode() == ISD::BITCAST;
15182 while (
15183 (BV.getOpcode() == ISD::BITCAST ||
15184 BV.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
15185 (BV.getValueType() == MVT::v2f64 || BV.getValueType() == MVT::v2i64)) {
15186 BVSwap = BV.getOpcode() == ISD::BITCAST;
15187 BV = BV.getOperand(0);
15188 }
15189 if (BV.getValueType() != MVT::v4i32)
15190 return SDValue();
15191
15192 // Handle buildvectors, pulling out the correct lane depending on
15193 // endianness.
// f64 element 1 corresponds to i32 lanes 2 and 3; anything else uses lanes 0
// and 1.
15194 unsigned Offset = InDouble.getConstantOperandVal(1) == 1 ? 2 : 0;
15195 if (BV.getOpcode() == ISD::BUILD_VECTOR) {
15196 SDValue Op0 = BV.getOperand(Offset);
15197 SDValue Op1 = BV.getOperand(Offset + 1);
15198 if (!Subtarget->isLittle() && BVSwap)
15199 std::swap(Op0, Op1);
15200
15201 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15202 }
15203
15204 // A chain of insert_vectors, grabbing the correct value of the chain of
15205 // inserts.
// The walk goes from the outermost insert inwards, so the "!Op0"/"!Op1"
// checks keep the most recent insert into each lane.
15206 SDValue Op0, Op1;
15207 while (BV.getOpcode() == ISD::INSERT_VECTOR_ELT) {
15208 if (isa<ConstantSDNode>(BV.getOperand(2))) {
15209 if (BV.getConstantOperandVal(2) == Offset && !Op0)
15210 Op0 = BV.getOperand(1);
15211 if (BV.getConstantOperandVal(2) == Offset + 1 && !Op1)
15212 Op1 = BV.getOperand(1);
15213 }
15214 BV = BV.getOperand(0);
15215 }
15216 if (!Subtarget->isLittle() && BVSwap)
15217 std::swap(Op0, Op1);
15218 if (Op0 && Op1)
15219 return DCI.DAG.getMergeValues({Op0, Op1}, SDLoc(N));
15220 }
15221
15222 return SDValue();
15223}
15224
15225/// PerformVMOVDRRCombine - Target-specific dag combine xforms for
15226/// ARMISD::VMOVDRR. This is also used for BUILD_VECTORs with 2 operands.
///
/// Returns a BITCAST of X to N's value type when N's two operands are results
/// 0 and 1 of the same VMOVRRD(X) node, and an empty SDValue otherwise.
// NOTE(review): the signature line of this function is not visible in this
// listing; PerformBUILD_VECTORCombine calls it as PerformVMOVDRRCombine(N,
// DAG).
15228 // N=vmovrrd(X); vmovdrr(N:0, N:1) -> bit_convert(X)
15229 SDValue Op0 = N->getOperand(0);
15230 SDValue Op1 = N->getOperand(1);
// Look through one level of bitcast on each operand.
15231 if (Op0.getOpcode() == ISD::BITCAST)
15232 Op0 = Op0.getOperand(0);
15233 if (Op1.getOpcode() == ISD::BITCAST)
15234 Op1 = Op1.getOperand(0);
15235 if (Op0.getOpcode() == ARMISD::VMOVRRD &&
15236 Op0.getNode() == Op1.getNode() &&
15237 Op0.getResNo() == 0 && Op1.getResNo() == 1)
15238 return DAG.getNode(ISD::BITCAST, SDLoc(N),
15239 N->getValueType(0), Op0.getOperand(0));
15240 return SDValue();
15241}
15242
// Combines for a VMOVhr node N: cancel against a VMOVrh operand, read a
// bitcast f32 CopyFromReg directly in N's type, turn a one-use i16 load into a
// load of N's type, and finally simplify the operand given that only its low
// 16 bits are demanded.
// NOTE(review): the declaration lines of this function are not visible in this
// listing.
15245 SDValue Op0 = N->getOperand(0);
15246
15247 // VMOVhr (VMOVrh (X)) -> X
15248 if (Op0->getOpcode() == ARMISD::VMOVrh)
15249 return Op0->getOperand(0);
15250
15251 // FullFP16: half values are passed in S-registers, and we don't
15252 // need any of the bitcast and moves:
15253 //
15254 // t2: f32,ch1,gl1? = CopyFromReg ch, Register:f32 %0, gl?
15255 // t5: i32 = bitcast t2
15256 // t18: f16 = ARMISD::VMOVhr t5
15257 // =>
15258 // tN: f16,ch2,gl2? = CopyFromReg ch, Register::f32 %0, gl?
15259 if (Op0->getOpcode() == ISD::BITCAST) {
15260 SDValue Copy = Op0->getOperand(0);
15261 if (Copy.getValueType() == MVT::f32 &&
15262 Copy->getOpcode() == ISD::CopyFromReg) {
// A third operand on the CopyFromReg is its glue input.
15263 bool HasGlue = Copy->getNumOperands() == 3;
15264 SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1),
15265 HasGlue ? Copy->getOperand(2) : SDValue()};
15266 EVT OutTys[] = {N->getValueType(0), MVT::Other, MVT::Glue};
// NOTE(review): the line that starts the node-creation call below is not
// visible in this listing.
15267 SDValue NewCopy =
15269 DCI.DAG.getVTList(ArrayRef(OutTys, HasGlue ? 3 : 2)),
15270 ArrayRef(Ops, HasGlue ? 3 : 2));
15271
15272 // Update Users, Chains, and Potential Glue.
15273 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), NewCopy.getValue(0));
15274 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(1), NewCopy.getValue(1));
15275 if (HasGlue)
15276 DCI.DAG.ReplaceAllUsesOfValueWith(Copy.getValue(2),
15277 NewCopy.getValue(2));
15278
15279 return NewCopy;
15280 }
15281 }
15282
15283 // fold (VMOVhr (load x)) -> (load (f16*)x)
15284 if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
15285 if (LN0->hasOneUse() && LN0->isUnindexed() &&
15286 LN0->getMemoryVT() == MVT::i16) {
15287 SDValue Load =
15288 DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
15289 LN0->getBasePtr(), LN0->getMemOperand());
// Redirect both the value users of N and the chain users of the old load.
15290 DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15291 DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
15292 return Load;
15293 }
15294 }
15295
15296 // Only the bottom 16 bits of the source register are used.
15297 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15298 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15299 if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
15300 return SDValue(N, 0);
15301
15302 return SDValue();
15303}
15304
// Combines for a VMOVrh node N: fold a floating-point constant operand to an
// integer constant, turn a one-use normal load into an i16 zero-extending
// load, and turn an extract_vector_elt operand into ARMISD::VGETLANEu.
// NOTE(review): the signature line of this function is not visible in this
// listing.
15306 SDValue N0 = N->getOperand(0);
15307 EVT VT = N->getValueType(0);
15308
15309 // fold (VMOVrh (fpconst x)) -> const x
// NOTE(review): the `if` line that opens this block and defines C is not
// visible in this listing.
15311 APFloat V = C->getValueAPF();
15312 return DAG.getConstant(V.bitcastToAPInt().getZExtValue(), SDLoc(N), VT);
15313 }
15314
15315 // fold (VMOVrh (load x)) -> (zextload (i16*)x)
15316 if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
15317 LoadSDNode *LN0 = cast<LoadSDNode>(N0);
15318
15319 SDValue Load =
15320 DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
15321 LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
// Redirect both the value users of N and the chain users of the old load.
15322 DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
15323 DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
15324 return Load;
15325 }
15326
15327 // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
// NOTE(review): the second clause of the condition below is not visible in
// this listing.
15328 if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15330 return DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
15331 N0->getOperand(1));
15332
15333 return SDValue();
15334}
15335
15336/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
15337/// are normal, non-volatile loads. If so, it is profitable to bitcast an
15338/// i64 vector to have f64 elements, since the value can then be loaded
15339/// directly into a VFP register.
// NOTE(review): the signature line of this function is not visible in this
// listing; PerformBUILD_VECTORCombine calls it as hasNormalLoadOperand(N).
// One operand is inspected per element of N's result vector type.
15341 unsigned NumElts = N->getValueType(0).getVectorNumElements();
15342 for (unsigned i = 0; i < NumElts; ++i) {
15343 SDNode *Elt = N->getOperand(i).getNode();
15344 if (ISD::isNormalLoad(Elt) && !cast<LoadSDNode>(Elt)->isVolatile())
15345 return true;
15346 }
15347 return false;
15348}
15349
15350/// PerformBUILD_VECTORCombine - Target-specific dag combine xforms for
15351/// ISD::BUILD_VECTOR.
///
/// Two-operand nodes are first offered to PerformVMOVDRRCombine. Otherwise, a
/// vector of i64 elements with at least one normal non-volatile load operand
/// is rebuilt as an f64 build vector whose result is bitcast back to the
/// original type. Returns an empty SDValue when neither applies.
// NOTE(review): the first lines of this function's declaration are not
// visible in this listing.
15354 const ARMSubtarget *Subtarget) {
15355 // build_vector(N=ARMISD::VMOVRRD(X), N:1) -> bit_convert(X):
15356 // VMOVRRD is introduced when legalizing i64 types. It forces the i64 value
15357 // into a pair of GPRs, which is fine when the value is used as a scalar,
15358 // but if the i64 value is converted to a vector, we need to undo the VMOVRRD.
15359 SelectionDAG &DAG = DCI.DAG;
15360 if (N->getNumOperands() == 2)
15361 if (SDValue RV = PerformVMOVDRRCombine(N, DAG))
15362 return RV;
15363
15364 // Load i64 elements as f64 values so that type legalization does not split
15365 // them up into i32 values.
15366 EVT VT = N->getValueType(0);
15367 if (VT.getVectorElementType() != MVT::i64 || !hasNormalLoadOperand(N))
15368 return SDValue();
15369 SDLoc dl(N);
// NOTE(review): the declaration of Ops is not visible in this listing.
15371 unsigned NumElts = VT.getVectorNumElements();
15372 for (unsigned i = 0; i < NumElts; ++i) {
15373 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(i));
15374 Ops.push_back(V);
15375 // Make the DAGCombiner fold the bitcast.
15376 DCI.AddToWorklist(V.getNode());
15377 }
15378 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64, NumElts);
15379 SDValue BV = DAG.getBuildVector(FloatVT, dl, Ops);
15380 return DAG.getNode(ISD::BITCAST, dl, VT, BV);
15381}
15382
15383/// Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
15384static SDValue
15386 // ARMISD::BUILD_VECTOR is introduced when legalizing ISD::BUILD_VECTOR.
15387 // At that time, we may have inserted bitcasts from integer to float.
15388 // If these bitcasts have survived DAGCombine, change the lowering of this
15389 // BUILD_VECTOR in something more vector friendly, i.e., that does not
15390 // force to use floating point types.
15391
15392 // Make sure we can change the type of the vector.
15393 // This is possible iff:
15394 // 1. The vector is only used in a bitcast to an integer type. I.e.,
15395 // 1.1. Vector is used only once.
15396 // 1.2. Use is a bit convert to an integer type.
15397 // 2. The size of its operands are 32-bits (64-bits are not legal).
15398 EVT VT = N->getValueType(0);
15399 EVT EltVT = VT.getVectorElementType();
15400
15401 // Check 1.1. and 2.
15402 if (EltVT.getSizeInBits() != 32 || !N->hasOneUse())
15403 return SDValue();
15404
15405 // By construction, the input type must be float.
15406 assert(EltVT == MVT::f32 && "Unexpected type!");
15407
15408 // Check 1.2.
15409 SDNode *Use = *N->user_begin();
15410 if (Use->getOpcode() != ISD::BITCAST ||
15411 Use->getValueType(0).isFloatingPoint())
15412 return SDValue();
15413
15414 // Check profitability.
15415 // Model is, if more than half of the relevant operands are bitcast from
15416 // i32, turn the build_vector into a sequence of insert_vector_elt.
15417 // Relevant operands are everything that is not statically
15418 // (i.e., at compile time) bitcasted.
15419 unsigned NumOfBitCastedElts = 0;
15420 unsigned NumElts = VT.getVectorNumElements();
15421 unsigned NumOfRelevantElts = NumElts;
15422 for (unsigned Idx = 0; Idx < NumElts; ++Idx) {
15423 SDValue Elt = N->getOperand(Idx);
15424 if (Elt->getOpcode() == ISD::BITCAST) {
15425 // Assume only bit cast to i32 will go away.
15426 if (Elt->getOperand(0).getValueType() == MVT::i32)
15427 ++NumOfBitCastedElts;
15428 } else if (Elt.isUndef() || isa<ConstantSDNode>(Elt))
15429 // Constants are statically casted, thus do not count them as
15430 // relevant operands.
15431 --NumOfRelevantElts;
15432 }
15433
15434 // Check if more than half of the elements require a non-free bitcast.
15435 if (NumOfBitCastedElts <= NumOfRelevantElts / 2)
15436 return SDValue();
15437
15438 SelectionDAG &DAG = DCI.DAG;
15439 // Create the new vector type.
15440 EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
15441 // Check if the type is legal.
15442 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15443 if (!TLI.isTypeLegal(VecVT))
15444 return SDValue();
15445
15446 // Combine:
15447 // ARMISD::BUILD_VECTOR E1, E2, ..., EN.
15448 // => BITCAST INSERT_VECTOR_ELT
15449 // (INSERT_VECTOR_ELT (...), (BITCAST EN-1), N-1),
15450 // (BITCAST EN), N.
15451 SDValue Vec = DAG.getUNDEF(VecVT);
15452 SDLoc dl(N);
15453 for (unsigned Idx = 0 ; Idx < NumElts; ++Idx) {
15454 SDValue V = N->getOperand(Idx);
15455 if (V.isUndef())
15456 continue;
15457 if (V.getOpcode() == ISD::BITCAST &&
15458 V->getOperand(0).getValueType() == MVT::i32)
15459 // Fold obvious case.
15460 V = V.getOperand(0);
15461 else {
15462 V = DAG.getNode(ISD::BITCAST, SDLoc(V), MVT::i32, V);
15463 // Make the DAGCombiner fold the bitcasts.
15464 DCI.AddToWorklist(V.getNode());
15465 }
15466 SDValue LaneIdx = DAG.getConstant(Idx, dl, MVT::i32);
15467 Vec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecVT, Vec, V, LaneIdx);
15468 }
15469 Vec = DAG.getNode(ISD::BITCAST, dl, VT, Vec);
15470 // Make the DAGCombiner fold the bitcasts.
15471 DCI.AddToWorklist(Vec.getNode());
15472 return Vec;
15473}
15474
// DAG combine for an ARMISD::PREDICATE_CAST node: collapses a cast of a cast,
// moves a bitwise-not of an i32 source outside the cast, and otherwise tries
// to simplify an i32 source given that only its low 16 bits are demanded.
// Returns an empty SDValue when nothing applies.
// NOTE(review): the line carrying the function name and parameter list is
// missing from this listing; the body reads the node `N` and `DCI`.
15475static SDValue
15477 EVT VT = N->getValueType(0);
15478 SDValue Op = N->getOperand(0);
15479 SDLoc dl(N);
15480
15481 // PREDICATE_CAST(PREDICATE_CAST(x)) == PREDICATE_CAST(x)
15482 if (Op->getOpcode() == ARMISD::PREDICATE_CAST) {
15483 // If the valuetypes are the same, we can remove the cast entirely.
15484 if (Op->getOperand(0).getValueType() == VT)
15485 return Op->getOperand(0);
15486 return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
15487 }
15488
15489 // Turn pred_cast(xor x, -1) into xor(pred_cast x, -1), in order to produce
15490 // more VPNOT which might get folded as else predicates.
15491 if (Op.getValueType() == MVT::i32 && isBitwiseNot(Op)) {
15492 SDValue X =
15493 DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
// 65535 has exactly the low 16 bits set, i.e. the bits the cast reads from
// its i32 source (see the demanded-bits mask below).
15494 SDValue C = DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT,
15495 DCI.DAG.getConstant(65535, dl, MVT::i32));
15496 return DCI.DAG.getNode(ISD::XOR, dl, VT, X, C);
15497 }
15498
15499 // Only the bottom 16 bits of the source register are used.
15500 if (Op.getValueType() == MVT::i32) {
15501 APInt DemandedMask = APInt::getLowBitsSet(32, 16);
15502 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
15503 if (TLI.SimplifyDemandedBits(Op, DemandedMask, DCI))
15504 return SDValue(N, 0);
15505 }
15506 return SDValue();
15507}
15508
// DAG combine for an ARMISD::VECTOR_REG_CAST node. On little-endian subtargets
// the node is replaced by a plain ISD::BITCAST; otherwise no-op casts, casts
// of undef and casts of casts are folded. Returns an empty SDValue when
// nothing applies.
// NOTE(review): the first line of the signature (function name and leading
// parameters) is missing from this listing; the body reads `N`, `DAG`, `ST`.
15510 const ARMSubtarget *ST) {
15511 EVT VT = N->getValueType(0);
15512 SDValue Op = N->getOperand(0);
15513 SDLoc dl(N);
15514
15515 // Under Little endian, a VECTOR_REG_CAST is equivalent to a BITCAST
15516 if (ST->isLittle())
15517 return DAG.getNode(ISD::BITCAST, dl, VT, Op);
15518
// Everything below is only reached when ST->isLittle() is false.
15519 // VT VECTOR_REG_CAST (VT Op) -> Op
15520 if (Op.getValueType() == VT)
15521 return Op;
15522 // VECTOR_REG_CAST undef -> undef
15523 if (Op.isUndef())
15524 return DAG.getUNDEF(VT);
15525
15526 // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
15527 if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
15528 // If the valuetypes are the same, we can remove the cast entirely.
15529 if (Op->getOperand(0).getValueType() == VT)
15530 return Op->getOperand(0);
15531 return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
15532 }
15533
15534 return SDValue();
15535}
15536
// DAG combine for a vector compare node with operands (Op0, Op1, cond).
// Only active when the subtarget has MVE integer ops. Folds compares against
// a zero vector into ARMISD::VCMPZ and moves a VDUP operand to the right-hand
// side, swapping the condition code where needed. Returns an empty SDValue
// when nothing applies.
// NOTE(review): the first line of the signature (function name and leading
// parameters) is missing from this listing; the body reads `N`, `DAG`,
// `Subtarget`.
15538 const ARMSubtarget *Subtarget) {
15539 if (!Subtarget->hasMVEIntegerOps())
15540 return SDValue();
15541
15542 EVT VT = N->getValueType(0);
15543 SDValue Op0 = N->getOperand(0);
15544 SDValue Op1 = N->getOperand(1);
// Operand 2 is the condition code, stored as a constant.
15545 ARMCC::CondCodes Cond = (ARMCC::CondCodes)N->getConstantOperandVal(2);
15546 SDLoc dl(N);
15547
15548 // vcmp X, 0, cc -> vcmpz X, cc
15549 if (isZeroVector(Op1))
15550 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op0, N->getOperand(2));
15551
// The operand-swapping folds are only attempted when the swapped condition
// passes isValidMVECond for this result type.
15552 unsigned SwappedCond = getSwappedCondition(Cond);
15553 if (isValidMVECond(SwappedCond, VT.isFloatingPoint())) {
15554 // vcmp 0, X, cc -> vcmpz X, reversed(cc)
15555 if (isZeroVector(Op0))
15556 return DAG.getNode(ARMISD::VCMPZ, dl, VT, Op1,
15557 DAG.getConstant(SwappedCond, dl, MVT::i32));
15558 // vcmp vdup(Y), X, cc -> vcmp X, vdup(Y), reversed(cc)
15559 if (Op0->getOpcode() == ARMISD::VDUP && Op1->getOpcode() != ARMISD::VDUP)
15560 return DAG.getNode(ARMISD::VCMP, dl, VT, Op1, Op0,
15561 DAG.getConstant(SwappedCond, dl, MVT::i32));
15562 }
15563
15564 return SDValue();
15565}
15566
15567/// PerformInsertEltCombine - Target-specific dag combine xforms for
15568/// ISD::INSERT_VECTOR_ELT.
/// When the vector has i64 elements and the inserted element is a normal,
/// non-volatile load, the insert is redone on an f64 vector (bitcasting the
/// vector and the element) and the result is bitcast back to the original
/// type. Returns an empty SDValue otherwise.
/// NOTE(review): the signature lines and the continuation line of the
/// FloatVT initialiser are missing from this listing.
15571 // Bitcast an i64 load inserted into a vector to f64.
15572 // Otherwise, the i64 value will be legalized to a pair of i32 values.
15573 EVT VT = N->getValueType(0);
15574 SDNode *Elt = N->getOperand(1).getNode();
15575 if (VT.getVectorElementType() != MVT::i64 ||
15576 !ISD::isNormalLoad(Elt) || cast<LoadSDNode>(Elt)->isVolatile())
15577 return SDValue();
15578
15579 SelectionDAG &DAG = DCI.DAG;
15580 SDLoc dl(N);
15581 EVT FloatVT = EVT::getVectorVT(*DAG.getContext(), MVT::f64,
15583 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, N->getOperand(0));
15584 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::f64, N->getOperand(1));
15585 // Make the DAGCombiner fold the bitcasts.
15586 DCI.AddToWorklist(Vec.getNode());
15587 DCI.AddToWorklist(V.getNode());
// The lane index operand (operand 2) is reused unchanged.
15588 SDValue InsElt = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, FloatVT,
15589 Vec, V, N->getOperand(2));
15590 return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
15591}
15592
15593// Convert a pair of extracts from the same base vector to a VMOVRRD. Either
15594// directly or bitcast to an integer if the original is a float vector.
15595// extract(x, n); extract(x, n+1) -> VMOVRRD(extract v2f64 x, n/2)
15596// bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD(extract x, n/2)
// N is the node producing the i32 value for the even lane n. On success the
// node for lane n+1 is replaced with the second VMOVRRD result via
// DCI.CombineTo and the VMOVRRD value is returned; otherwise an empty SDValue
// is returned.
// NOTE(review): the line carrying the function name and parameter list, and
// one line of the EXTRACT_VECTOR_ELT condition below, are missing from this
// listing.
15597static SDValue
15599 EVT VT = N->getValueType(0);
15600 SDLoc dl(N);
15601
// Only run after DAG legalization, for i32 results, and when f64 is legal.
15602 if (!DCI.isAfterLegalizeDAG() || VT != MVT::i32 ||
15603 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(MVT::f64))
15604 return SDValue();
15605
// Look through a bitcast from f32 to reach the underlying extract.
15606 SDValue Ext = SDValue(N, 0);
15607 if (Ext.getOpcode() == ISD::BITCAST &&
15608 Ext.getOperand(0).getValueType() == MVT::f32)
15609 Ext = Ext.getOperand(0);
// The extracted lane index must be even.
15610 if (Ext.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
15612 Ext.getConstantOperandVal(1) % 2 != 0)
15613 return SDValue();
// Leave the extract alone when its single user is an int-to-fp conversion.
15614 if (Ext->hasOneUse() && (Ext->user_begin()->getOpcode() == ISD::SINT_TO_FP ||
15615 Ext->user_begin()->getOpcode() == ISD::UINT_TO_FP))
15616 return SDValue();
15617
15618 SDValue Op0 = Ext.getOperand(0);
15619 EVT VecVT = Op0.getValueType();
15620 unsigned ResNo = Op0.getResNo();
15621 unsigned Lane = Ext.getConstantOperandVal(1);
// Only 4-element source vectors are handled.
15622 if (VecVT.getVectorNumElements() != 4)
15623 return SDValue();
15624
15625 // Find another extract, of Lane + 1
// The ResNo comparison makes sure the other extract reads the same result
// value of Op0's node, not just the same node.
15626 auto OtherIt = find_if(Op0->users(), [&](SDNode *V) {
15627 return V->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15628 isa<ConstantSDNode>(V->getOperand(1)) &&
15629 V->getConstantOperandVal(1) == Lane + 1 &&
15630 V->getOperand(0).getResNo() == ResNo;
15631 });
15632 if (OtherIt == Op0->users().end())
15633 return SDValue();
15634
15635 // For float extracts, we need to be converting to a i32 for both vector
15636 // lanes.
15637 SDValue OtherExt(*OtherIt, 0);
15638 if (OtherExt.getValueType() != MVT::i32) {
15639 if (!OtherExt->hasOneUse() ||
15640 OtherExt->user_begin()->getOpcode() != ISD::BITCAST ||
15641 OtherExt->user_begin()->getValueType(0) != MVT::i32)
15642 return SDValue();
// From here on OtherExt refers to the i32 bitcast, which is what gets replaced.
15643 OtherExt = SDValue(*OtherExt->user_begin(), 0);
15644 }
15645
15646 // Convert the type to a f64 and extract with a VMOVRRD.
15647 SDValue F64 = DCI.DAG.getNode(
15648 ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
15649 DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v2f64, Op0),
15650 DCI.DAG.getConstant(Ext.getConstantOperandVal(1) / 2, dl, MVT::i32));
15651 SDValue VMOVRRD =
15652 DCI.DAG.getNode(ARMISD::VMOVRRD, dl, {MVT::i32, MVT::i32}, F64);
15653
// Result 1 of the VMOVRRD replaces the lane n+1 value; result 0 (returned)
// replaces N.
15654 DCI.CombineTo(OtherExt.getNode(), SDValue(VMOVRRD.getNode(), 1));
15655 return VMOVRRD;
15656}
15657
// DAG combine for an extract-element node (operand 0 is the vector, operand 1
// the lane). Tries, in order: extract of a VDUP, extract of an
// ARMISD::BUILD_VECTOR, extract through a bitcast of a v2f64 build vector of
// VMOVDRR, pairing with a neighbouring extract into VMOVRRD, extract of an
// MVETRUNC, and extract through bitcast(BUILD_VECTOR(extract(bitcast))).
// Returns an empty SDValue when nothing applies.
// NOTE(review): this listing is missing the first signature lines and several
// body lines (parts of two `if` conditions and the initialisers of `Vec` and
// `SubIdx`); comments below describe only what is visible.
15660 const ARMSubtarget *ST) {
15661 SDValue Op0 = N->getOperand(0);
15662 EVT VT = N->getValueType(0);
15663 SDLoc dl(N);
15664
15665 // extract (vdup x) -> x
15666 if (Op0->getOpcode() == ARMISD::VDUP) {
15667 SDValue X = Op0->getOperand(0);
// When the scalar type differs from the requested type, convert with the
// matching move/bitcast node instead of returning X directly.
15668 if (VT == MVT::f16 && X.getValueType() == MVT::i32)
15669 return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
15670 if (VT == MVT::i32 && X.getValueType() == MVT::f16)
15671 return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
15672 if (VT == MVT::f32 && X.getValueType() == MVT::i32)
15673 return DCI.DAG.getNode(ISD::BITCAST, dl, VT, X);
15674
// Otherwise peel bitcasts off X until a value of type VT is found.
15675 while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
15676 X = X->getOperand(0);
15677 if (X.getValueType() == VT)
15678 return X;
15679 }
15680
15681 // extract ARM_BUILD_VECTOR -> x
15682 if (Op0->getOpcode() == ARMISD::BUILD_VECTOR &&
15683 isa<ConstantSDNode>(N->getOperand(1)) &&
15684 N->getConstantOperandVal(1) < Op0.getNumOperands()) {
15685 return Op0.getOperand(N->getConstantOperandVal(1));
15686 }
15687
15688 // extract(bitcast(BUILD_VECTOR(VMOVDRR(a, b), ..))) -> a or b
15689 if (Op0.getValueType() == MVT::v4i32 &&
15690 isa<ConstantSDNode>(N->getOperand(1)) &&
15691 Op0.getOpcode() == ISD::BITCAST &&
15693 Op0.getOperand(0).getValueType() == MVT::v2f64) {
15694 SDValue BV = Op0.getOperand(0);
15695 unsigned Offset = N->getConstantOperandVal(1);
// i32 lanes 0-1 map to build-vector operand 0, the remaining lanes to
// operand 1.
15696 SDValue MOV = BV.getOperand(Offset < 2 ? 0 : 1);
// Which VMOVDRR operand a lane maps to is flipped when the subtarget is not
// little-endian.
15697 if (MOV.getOpcode() == ARMISD::VMOVDRR)
15698 return MOV.getOperand(ST->isLittle() ? Offset % 2 : 1 - Offset % 2);
15699 }
15700
15701 // extract x, n; extract x, n+1 -> VMOVRRD x
15702 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
15703 return R;
15704
15705 // extract (MVETrunc(x)) -> extract x
15706 if (Op0->getOpcode() == ARMISD::MVETRUNC) {
15707 unsigned Idx = N->getConstantOperandVal(1);
// NOTE(review): the initialiser expressions of Vec and SubIdx are missing
// from this listing. Vec is used below as the MVETRUNC operand index and
// SubIdx as the lane within that operand.
15708 unsigned Vec =
15710 unsigned SubIdx =
15712 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
15713 DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
15714 }
15715
15716 // extract(bitcast(BUILD_VECTOR(extract(bitcast(a)), ..))) -> extract(a)
15717 if (ST->isLittle() && Op0.getOpcode() == ISD::BITCAST &&
15719 isa<ConstantSDNode>(N->getOperand(1)) &&
15722 unsigned Lane = N->getConstantOperandVal(1);
15723 EVT ExtVT = Op0.getValueType();
15724 EVT BVVT = Op0.getOperand(0).getValueType();
// Scale the extracted lane by the ratio of element counts to get the
// build-vector operand index.
15725 unsigned BVLane =
15726 (Lane * BVVT.getVectorNumElements()) / ExtVT.getVectorNumElements();
15727 assert(BVLane < Op0.getOperand(0).getNumOperands());
15728 SDValue Ext = Op0.getOperand(0).getOperand(BVLane);
15729 if (Ext.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
15730 Ext.getOperand(0).getOpcode() == ISD::BITCAST &&
15732 Ext.getOperand(0).getOperand(0).getValueType() == ExtVT) {
15733 unsigned InnerLane = Ext.getConstantOperandVal(1);
// BVSubLane is Lane's offset from the first ExtVT lane of build-vector
// element BVLane; FinalLane rescales InnerLane the same way and adds that
// offset back.
15734 unsigned BVSubLane = Lane - (BVLane * ExtVT.getVectorNumElements()) /
15735 BVVT.getVectorNumElements();
15736 unsigned FinalLane = (InnerLane * ExtVT.getVectorNumElements()) /
15737 BVVT.getVectorNumElements() +
15738 BVSubLane;
15739 return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT,
15740 Ext.getOperand(0).getOperand(0),
15741 DCI.DAG.getConstant(FinalLane, dl, MVT::i32));
15742 }
15743 }
15744
15745 return SDValue();
15746}
15747
// DAG combine for a sign-extend-in-register node: when the operand is an
// ARMISD::VGETLANEu and the extended-from type (operand 1) equals the scalar
// type of the lane's source vector, emit ARMISD::VGETLANEs on the same vector
// and lane instead. Returns an empty SDValue otherwise.
// NOTE(review): the signature line is missing from this listing; the body
// reads `N` and `DAG`.
15749 SDValue Op = N->getOperand(0);
15750 EVT VT = N->getValueType(0);
15751
15752 // sext_inreg(VGETLANEu) -> VGETLANEs
15753 if (Op.getOpcode() == ARMISD::VGETLANEu &&
15754 cast<VTSDNode>(N->getOperand(1))->getVT() ==
15755 Op.getOperand(0).getValueType().getScalarType())
15756 return DAG.getNode(ARMISD::VGETLANEs, SDLoc(N), VT, Op.getOperand(0),
15757 Op.getOperand(1));
15758
15759 return SDValue();
15760}
15761
// DAG combine for an insert-subvector node (operands: vector, subvector,
// constant index). When the subvector is exactly half the width of a legal
// fixed-length vector and is inserted at element 0 or at the midpoint, the
// insert is rewritten as CONCAT_VECTORS of the subvector and the untouched
// half extracted from the original vector. Returns an empty SDValue otherwise.
// NOTE(review): the line carrying the function name and parameter list, and
// one line of the legality condition below, are missing from this listing.
15762static SDValue
15764 SDValue Vec = N->getOperand(0);
15765 SDValue SubVec = N->getOperand(1);
15766 uint64_t IdxVal = N->getConstantOperandVal(2);
15767 EVT VecVT = Vec.getValueType();
15768 EVT SubVT = SubVec.getValueType();
15769
15770 // Only do this for legal fixed vector types.
15771 if (!VecVT.isFixedLengthVector() ||
15772 !DCI.DAG.getTargetLoweringInfo().isTypeLegal(VecVT) ||
15774 return SDValue();
15775
15776 // Ignore widening patterns.
15777 if (IdxVal == 0 && Vec.isUndef())
15778 return SDValue();
15779
15780 // Subvector must be half the width and an "aligned" insertion.
15781 unsigned NumSubElts = SubVT.getVectorNumElements();
15782 if ((SubVT.getSizeInBits() * 2) != VecVT.getSizeInBits() ||
15783 (IdxVal != 0 && IdxVal != NumSubElts))
15784 return SDValue();
15785
15786 // Fold insert_subvector -> concat_vectors
15787 // insert_subvector(Vec,Sub,lo) -> concat_vectors(Sub,extract(Vec,hi))
15788 // insert_subvector(Vec,Sub,hi) -> concat_vectors(extract(Vec,lo),Sub)
15789 SDLoc DL(N);
15790 SDValue Lo, Hi;
15791 if (IdxVal == 0) {
15792 Lo = SubVec;
15793 Hi = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15794 DCI.DAG.getVectorIdxConstant(NumSubElts, DL));
15795 } else {
15796 Lo = DCI.DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, Vec,
15797 DCI.DAG.getVectorIdxConstant(0, DL));
15798 Hi = SubVec;
15799 }
15800 return DCI.DAG.getNode(ISD::CONCAT_VECTORS, DL, VecVT, Lo, Hi);
15801}
15802
15803// shuffle(MVETrunc(x, y)) -> VMOVN(x, y)
// Applies only when the first shuffle operand is an ARMISD::MVETRUNC and the
// second is undef. The shuffle mask is tested with isVMOVNTruncMask; the two
// outcomes differ only in the order of the VMOVN operands. Returns an empty
// SDValue when the pattern does not match.
// NOTE(review): the first signature line is missing from this listing; `N` is
// used as a shuffle node (N->getMask()).
15805 SelectionDAG &DAG) {
15806 SDValue Trunc = N->getOperand(0);
15807 EVT VT = Trunc.getValueType();
15808 if (Trunc.getOpcode() != ARMISD::MVETRUNC || !N->getOperand(1).isUndef())
15809 return SDValue();
15810
15811 SDLoc DL(Trunc);
// Both MVETRUNC inputs are re-cast to the result type VT before feeding VMOVN.
15812 if (isVMOVNTruncMask(N->getMask(), VT, false))
15813 return DAG.getNode(
15814 ARMISD::VMOVN, DL, VT,
15815 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15816 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15817 DAG.getConstant(1, DL, MVT::i32));
15818 else if (isVMOVNTruncMask(N->getMask(), VT, true))
15819 return DAG.getNode(
15820 ARMISD::VMOVN, DL, VT,
15821 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(1)),
15822 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, Trunc.getOperand(0)),
15823 DAG.getConstant(1, DL, MVT::i32));
15824 return SDValue();
15825}
15826
15827/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
15828/// ISD::VECTOR_SHUFFLE.
/// Besides the early-out on an earlier combine result `R`, this rewrites
/// shuffle(concat(v1, undef), concat(v2, undef)) into
/// shuffle(concat(v1, v2), undef) with a translated mask, provided all
/// involved types are legal. Returns an empty SDValue otherwise.
/// NOTE(review): the signature lines, the statement producing `R`, and the
/// declaration of `SVN` are missing from this listing.
15831 return R;
15832
15833 // The LLVM shufflevector instruction does not require the shuffle mask
15834 // length to match the operand vector length, but ISD::VECTOR_SHUFFLE does
15835 // have that requirement. When translating to ISD::VECTOR_SHUFFLE, if the
15836 // operands do not match the mask length, they are extended by concatenating
15837 // them with undef vectors. That is probably the right thing for other
15838 // targets, but for NEON it is better to concatenate two double-register
15839 // size vector operands into a single quad-register size vector. Do that
15840 // transformation here:
15841 // shuffle(concat(v1, undef), concat(v2, undef)) ->
15842 // shuffle(concat(v1, v2), undef)
15843 SDValue Op0 = N->getOperand(0);
15844 SDValue Op1 = N->getOperand(1);
15845 if (Op0.getOpcode() != ISD::CONCAT_VECTORS ||
15846 Op1.getOpcode() != ISD::CONCAT_VECTORS ||
15847 Op0.getNumOperands() != 2 ||
15848 Op1.getNumOperands() != 2)
15849 return SDValue();
15850 SDValue Concat0Op1 = Op0.getOperand(1);
15851 SDValue Concat1Op1 = Op1.getOperand(1);
15852 if (!Concat0Op1.isUndef() || !Concat1Op1.isUndef())
15853 return SDValue();
15854 // Skip the transformation if any of the types are illegal.
15855 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
15856 EVT VT = N->getValueType(0);
15857 if (!TLI.isTypeLegal(VT) ||
15858 !TLI.isTypeLegal(Concat0Op1.getValueType()) ||
15859 !TLI.isTypeLegal(Concat1Op1.getValueType()))
15860 return SDValue();
15861
15862 SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT,
15863 Op0.getOperand(0), Op1.getOperand(0));
15864 // Translate the shuffle mask.
15865 SmallVector<int, 16> NewMask;
15866 unsigned NumElts = VT.getVectorNumElements();
15867 unsigned HalfElts = NumElts/2;
// Old indices [0, HalfElts) select from v1 and keep their value; old indices
// [NumElts, NumElts + HalfElts) select from v2 and move to
// [HalfElts, NumElts). Every other index (including negative ones) referred
// to an undef half and becomes -1.
15869 for (unsigned n = 0; n < NumElts; ++n) {
15870 int MaskElt = SVN->getMaskElt(n);
15871 int NewElt = -1;
15872 if (MaskElt < (int)HalfElts)
15873 NewElt = MaskElt;
15874 else if (MaskElt >= (int)NumElts && MaskElt < (int)(NumElts + HalfElts))
15875 NewElt = HalfElts + MaskElt - NumElts;
15876 NewMask.push_back(NewElt);
15877 }
15878 return DAG.getVectorShuffle(VT, SDLoc(N), NewConcat,
15879 DAG.getUNDEF(VT), NewMask);
15880}
15881
15882/// Load/store instruction that can be merged with a base address
15883/// update
// NOTE(review): the struct header and its leading members are missing from
// this listing. Elsewhere in this file the aggregate is initialised as
// {N, isIntrinsic, isStore, AddrOpIdx} and read through those member names.
// Index of the address operand within N's operand list (read as
// N->getOperand(Target.AddrOpIdx)).
15888 unsigned AddrOpIdx;
15889};
15890
// NOTE(review): the struct header and the declarations of the first two
// members are missing from this listing. Elsewhere in this file the aggregate
// is initialised as {User, Inc, ConstInc} and read as User.N, User.Inc and
// User.ConstInc.
15892 /// Instruction that updates a pointer
15894 /// Pointer increment operand
15896 /// Pointer increment value if it is a constant, or 0 otherwise
15897 unsigned ConstInc;
15898};
15899
// Returns true when folding the pointer update `User` into the load/store `N`
// is safe, i.e. neither node is found to be a predecessor of the other within
// the search budget.
// NOTE(review): the signature line and the declarations of `Visited` and
// `Worklist` are missing from this listing.
15901 // Check that the add is independent of the load/store.
15902 // Otherwise, folding it would create a cycle. Search through Addr
15903 // as well, since the User may not be a direct user of Addr and
15904 // only share a base pointer.
15907 Worklist.push_back(N);
15908 Worklist.push_back(User);
// Upper bound handed to hasPredecessorHelper for each of the two searches.
15909 const unsigned MaxSteps = 1024;
15910 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
15911 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
15912 return false;
15913 return true;
15914}
15915
// Tries to merge the pointer update described by `User` into the load/store
// described by `Target`, producing the matching ARMISD::*_UPD node.
//  - Target: the memory node, whether it is an intrinsic or a store, and the
//    index of its address operand.
//  - User: the node updating the pointer, the increment operand, and the
//    constant increment (0 when not constant).
//  - SimpleConstIncOnly: when true, only accept an increment equal to the
//    number of bytes accessed.
// On success both nodes are replaced through DCI.CombineTo and true is
// returned; false means nothing was changed.
// NOTE(review): the first signature line, the last signature line and the
// declaration of `Ops` are missing from this listing.
15917 struct BaseUpdateUser &User,
15918 bool SimpleConstIncOnly,
15920 SelectionDAG &DAG = DCI.DAG;
15921 SDNode *N = Target.N;
15922 MemSDNode *MemN = cast<MemSDNode>(N);
15923 SDLoc dl(N);
15924
15925 // Find the new opcode for the updating load/store.
15926 bool isLoadOp = true;
15927 bool isLaneOp = false;
15928 // Workaround for vst1x and vld1x intrinsics which do not have alignment
15929 // as an operand.
15930 bool hasAlignment = true;
15931 unsigned NewOpc = 0;
15932 unsigned NumVecs = 0;
15933 if (Target.isIntrinsic) {
// For intrinsic nodes operand 1 holds the intrinsic ID.
15934 unsigned IntNo = N->getConstantOperandVal(1);
15935 switch (IntNo) {
15936 default:
15937 llvm_unreachable("unexpected intrinsic for Neon base update");
15938 case Intrinsic::arm_neon_vld1:
15939 NewOpc = ARMISD::VLD1_UPD;
15940 NumVecs = 1;
15941 break;
15942 case Intrinsic::arm_neon_vld2:
15943 NewOpc = ARMISD::VLD2_UPD;
15944 NumVecs = 2;
15945 break;
15946 case Intrinsic::arm_neon_vld3:
15947 NewOpc = ARMISD::VLD3_UPD;
15948 NumVecs = 3;
15949 break;
15950 case Intrinsic::arm_neon_vld4:
15951 NewOpc = ARMISD::VLD4_UPD;
15952 NumVecs = 4;
15953 break;
15954 case Intrinsic::arm_neon_vld1x2:
15955 NewOpc = ARMISD::VLD1x2_UPD;
15956 NumVecs = 2;
15957 hasAlignment = false;
15958 break;
15959 case Intrinsic::arm_neon_vld1x3:
15960 NewOpc = ARMISD::VLD1x3_UPD;
15961 NumVecs = 3;
15962 hasAlignment = false;
15963 break;
15964 case Intrinsic::arm_neon_vld1x4:
15965 NewOpc = ARMISD::VLD1x4_UPD;
15966 NumVecs = 4;
15967 hasAlignment = false;
15968 break;
15969 case Intrinsic::arm_neon_vld2dup:
15970 NewOpc = ARMISD::VLD2DUP_UPD;
15971 NumVecs = 2;
15972 break;
15973 case Intrinsic::arm_neon_vld3dup:
15974 NewOpc = ARMISD::VLD3DUP_UPD;
15975 NumVecs = 3;
15976 break;
15977 case Intrinsic::arm_neon_vld4dup:
15978 NewOpc = ARMISD::VLD4DUP_UPD;
15979 NumVecs = 4;
15980 break;
15981 case Intrinsic::arm_neon_vld2lane:
15982 NewOpc = ARMISD::VLD2LN_UPD;
15983 NumVecs = 2;
15984 isLaneOp = true;
15985 break;
15986 case Intrinsic::arm_neon_vld3lane:
15987 NewOpc = ARMISD::VLD3LN_UPD;
15988 NumVecs = 3;
15989 isLaneOp = true;
15990 break;
15991 case Intrinsic::arm_neon_vld4lane:
15992 NewOpc = ARMISD::VLD4LN_UPD;
15993 NumVecs = 4;
15994 isLaneOp = true;
15995 break;
15996 case Intrinsic::arm_neon_vst1:
15997 NewOpc = ARMISD::VST1_UPD;
15998 NumVecs = 1;
15999 isLoadOp = false;
16000 break;
16001 case Intrinsic::arm_neon_vst2:
16002 NewOpc = ARMISD::VST2_UPD;
16003 NumVecs = 2;
16004 isLoadOp = false;
16005 break;
16006 case Intrinsic::arm_neon_vst3:
16007 NewOpc = ARMISD::VST3_UPD;
16008 NumVecs = 3;
16009 isLoadOp = false;
16010 break;
16011 case Intrinsic::arm_neon_vst4:
16012 NewOpc = ARMISD::VST4_UPD;
16013 NumVecs = 4;
16014 isLoadOp = false;
16015 break;
16016 case Intrinsic::arm_neon_vst2lane:
16017 NewOpc = ARMISD::VST2LN_UPD;
16018 NumVecs = 2;
16019 isLoadOp = false;
16020 isLaneOp = true;
16021 break;
16022 case Intrinsic::arm_neon_vst3lane:
16023 NewOpc = ARMISD::VST3LN_UPD;
16024 NumVecs = 3;
16025 isLoadOp = false;
16026 isLaneOp = true;
16027 break;
16028 case Intrinsic::arm_neon_vst4lane:
16029 NewOpc = ARMISD::VST4LN_UPD;
16030 NumVecs = 4;
16031 isLoadOp = false;
16032 isLaneOp = true;
16033 break;
16034 case Intrinsic::arm_neon_vst1x2:
16035 NewOpc = ARMISD::VST1x2_UPD;
16036 NumVecs = 2;
16037 isLoadOp = false;
16038 hasAlignment = false;
16039 break;
16040 case Intrinsic::arm_neon_vst1x3:
16041 NewOpc = ARMISD::VST1x3_UPD;
16042 NumVecs = 3;
16043 isLoadOp = false;
16044 hasAlignment = false;
16045 break;
16046 case Intrinsic::arm_neon_vst1x4:
16047 NewOpc = ARMISD::VST1x4_UPD;
16048 NumVecs = 4;
16049 isLoadOp = false;
16050 hasAlignment = false;
16051 break;
16052 }
16053 } else {
// Non-intrinsic nodes: the VLDxDUP opcodes keep isLaneOp set, the generic
// LOAD/STORE cases clear it again.
16054 isLaneOp = true;
16055 switch (N->getOpcode()) {
16056 default:
16057 llvm_unreachable("unexpected opcode for Neon base update");
16058 case ARMISD::VLD1DUP:
16059 NewOpc = ARMISD::VLD1DUP_UPD;
16060 NumVecs = 1;
16061 break;
16062 case ARMISD::VLD2DUP:
16063 NewOpc = ARMISD::VLD2DUP_UPD;
16064 NumVecs = 2;
16065 break;
16066 case ARMISD::VLD3DUP:
16067 NewOpc = ARMISD::VLD3DUP_UPD;
16068 NumVecs = 3;
16069 break;
16070 case ARMISD::VLD4DUP:
16071 NewOpc = ARMISD::VLD4DUP_UPD;
16072 NumVecs = 4;
16073 break;
16074 case ISD::LOAD:
16075 NewOpc = ARMISD::VLD1_UPD;
16076 NumVecs = 1;
16077 isLaneOp = false;
16078 break;
16079 case ISD::STORE:
16080 NewOpc = ARMISD::VST1_UPD;
16081 NumVecs = 1;
16082 isLaneOp = false;
16083 isLoadOp = false;
16084 break;
16085 }
16086 }
16087
16088 // Find the size of memory referenced by the load/store.
// Loads take the type from result 0; intrinsic stores from the operand right
// after the address; generic stores from operand 1.
16089 EVT VecTy;
16090 if (isLoadOp) {
16091 VecTy = N->getValueType(0);
16092 } else if (Target.isIntrinsic) {
16093 VecTy = N->getOperand(Target.AddrOpIdx + 1).getValueType();
16094 } else {
16095 assert(Target.isStore &&
16096 "Node has to be a load, a store, or an intrinsic!");
16097 VecTy = N->getOperand(1).getValueType();
16098 }
16099
16100 bool isVLDDUPOp =
16101 NewOpc == ARMISD::VLD1DUP_UPD || NewOpc == ARMISD::VLD2DUP_UPD ||
16102 NewOpc == ARMISD::VLD3DUP_UPD || NewOpc == ARMISD::VLD4DUP_UPD;
16103
// Lane and dup operations access one element per vector, so the byte count
// is divided by the element count.
16104 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16105 if (isLaneOp || isVLDDUPOp)
16106 NumBytes /= VecTy.getVectorNumElements();
16107
16108 if (NumBytes >= 3 * 16 && User.ConstInc != NumBytes) {
16109 // VLD3/4 and VST3/4 for 128-bit vectors are implemented with two
16110 // separate instructions that make it harder to use a non-constant update.
16111 return false;
16112 }
16113
16114 if (SimpleConstIncOnly && User.ConstInc != NumBytes)
16115 return false;
16116
16117 if (!isValidBaseUpdate(N, User.N))
16118 return false;
16119
16120 // OK, we found an ADD we can fold into the base update.
16121 // Now, create a _UPD node, taking care of not breaking alignment.
16122
16123 EVT AlignedVecTy = VecTy;
16124 Align Alignment = MemN->getAlign();
16125
16126 // If this is a less-than-standard-aligned load/store, change the type to
16127 // match the standard alignment.
16128 // The alignment is overlooked when selecting _UPD variants; and it's
16129 // easier to introduce bitcasts here than fix that.
16130 // There are 3 ways to get to this base-update combine:
16131 // - intrinsics: they are assumed to be properly aligned (to the standard
16132 // alignment of the memory type), so we don't need to do anything.
16133 // - ARMISD::VLDx nodes: they are only generated from the aforementioned
16134 // intrinsics, so, likewise, there's nothing to do.
16135 // - generic load/store instructions: the alignment is specified as an
16136 // explicit operand, rather than implicitly as the standard alignment
16137 // of the memory type (like the intrinsics). We need to change the
16138 // memory type to match the explicit alignment. That way, we don't
16139 // generate non-standard-aligned ARMISD::VLDx nodes.
16140 if (isa<LSBaseSDNode>(N)) {
16141 if (Alignment.value() < VecTy.getScalarSizeInBits() / 8) {
// Use an integer element as wide as the actual alignment and keep the total
// byte size the same.
16142 MVT EltTy = MVT::getIntegerVT(Alignment.value() * 8);
16143 assert(NumVecs == 1 && "Unexpected multi-element generic load/store.");
16144 assert(!isLaneOp && "Unexpected generic load/store lane.");
16145 unsigned NumElts = NumBytes / (EltTy.getSizeInBits() / 8);
16146 AlignedVecTy = MVT::getVectorVT(EltTy, NumElts);
16147 }
16148 // Don't set an explicit alignment on regular load/stores that we want
16149 // to transform to VLD/VST 1_UPD nodes.
16150 // This matches the behavior of regular load/stores, which only get an
16151 // explicit alignment if the MMO alignment is larger than the standard
16152 // alignment of the memory type.
16153 // Intrinsics, however, always get an explicit alignment, set to the
16154 // alignment of the MMO.
16155 Alignment = Align(1);
16156 }
16157
16158 // Create the new updating load/store node.
16159 // First, create an SDVTList for the new updating node's results.
// Results are: the loaded vectors (loads only; NumVecs is at most 4 above),
// then the i32 updated pointer, then the chain - hence at most 6 entries.
16160 EVT Tys[6];
16161 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16162 unsigned n;
16163 for (n = 0; n < NumResultVecs; ++n)
16164 Tys[n] = AlignedVecTy;
16165 Tys[n++] = MVT::i32;
16166 Tys[n] = MVT::Other;
16167 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16168
16169 // Then, gather the new node's operands.
16171 Ops.push_back(N->getOperand(0)); // incoming chain
16172 Ops.push_back(N->getOperand(Target.AddrOpIdx));
16173 Ops.push_back(User.Inc);
16174
16175 if (StoreSDNode *StN = dyn_cast<StoreSDNode>(N)) {
16176 // Try to match the intrinsic's signature
16177 Ops.push_back(StN->getValue());
16178 } else {
16179 // Loads (and of course intrinsics) match the intrinsics' signature,
16180 // so just add all but the alignment operand.
16181 unsigned LastOperand =
16182 hasAlignment ? N->getNumOperands() - 1 : N->getNumOperands();
16183 for (unsigned i = Target.AddrOpIdx + 1; i < LastOperand; ++i)
16184 Ops.push_back(N->getOperand(i));
16185 }
16186
16187 // For all node types, the alignment operand is always the last one.
16188 Ops.push_back(DAG.getConstant(Alignment.value(), dl, MVT::i32));
16189
16190 // If this is a non-standard-aligned STORE, the penultimate operand is the
16191 // stored value. Bitcast it to the aligned type.
16192 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::STORE) {
16193 SDValue &StVal = Ops[Ops.size() - 2];
16194 StVal = DAG.getNode(ISD::BITCAST, dl, AlignedVecTy, StVal);
16195 }
16196
// The new node reuses the original memory operand; lane operations use the
// element type as the memory VT.
16197 EVT LoadVT = isLaneOp ? VecTy.getVectorElementType() : AlignedVecTy;
16198 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, LoadVT,
16199 MemN->getMemOperand());
16200
16201 // Update the uses.
16202 SmallVector<SDValue, 5> NewResults;
16203 for (unsigned i = 0; i < NumResultVecs; ++i)
16204 NewResults.push_back(SDValue(UpdN.getNode(), i));
16205
16206 // If this is a non-standard-aligned LOAD, the first result is the loaded
16207 // value. Bitcast it to the expected result type.
16208 if (AlignedVecTy != VecTy && N->getOpcode() == ISD::LOAD) {
16209 SDValue &LdVal = NewResults[0];
16210 LdVal = DAG.getNode(ISD::BITCAST, dl, VecTy, LdVal);
16211 }
16212
16213 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
// N is replaced by the vector results plus chain; the pointer-update node is
// replaced by the i32 result at index NumResultVecs.
16214 DCI.CombineTo(N, NewResults);
16215 DCI.CombineTo(User.N, SDValue(UpdN.getNode(), NumResultVecs));
16216
16217 return true;
16218}
16219
16220// If (opcode ptr inc) is an ADD-like instruction, return the
16221// increment value. Otherwise return 0.
// A return value of 0 therefore also covers a non-constant `Inc` and an OR
// whose operands may share set bits.
// NOTE(review): the declaration of `CInc` is missing from this listing; it is
// used below as a pointer that is null when no constant is available and
// otherwise provides getZExtValue().
16222static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr,
16223 SDValue Inc, const SelectionDAG &DAG) {
16225 if (!CInc)
16226 return 0;
16227
16228 switch (Opcode) {
// ARMISD::VLD1_UPD is handled exactly like ISD::ADD here.
16229 case ARMISD::VLD1_UPD:
16230 case ISD::ADD:
16231 return CInc->getZExtValue();
16232 case ISD::OR: {
16233 if (DAG.haveNoCommonBitsSet(Ptr, Inc)) {
16234 // (OR ptr inc) is the same as (ADD ptr inc)
16235 return CInc->getZExtValue();
16236 }
16237 return 0;
16238 }
16239 default:
16240 return 0;
16241 }
16242}
16243
// If `N` is an ADD/OR whose operand 1 is a constant, or an ARMISD::VLD1_UPD
// whose operand 2 is a constant, stores the pointer operand in *Ptr and the
// constant operand in *CInc and returns true. Returns false, leaving both
// outputs untouched, in every other case.
// NOTE(review): the signature line is missing from this listing; `Ptr` and
// `CInc` are written through as pointers to SDValue.
16245 switch (N->getOpcode()) {
16246 case ISD::ADD:
16247 case ISD::OR: {
16248 if (isa<ConstantSDNode>(N->getOperand(1))) {
16249 *Ptr = N->getOperand(0);
16250 *CInc = N->getOperand(1);
16251 return true;
16252 }
16253 return false;
16254 }
16255 case ARMISD::VLD1_UPD: {
// For VLD1_UPD the pointer is operand 1 and the increment operand 2.
16256 if (isa<ConstantSDNode>(N->getOperand(2))) {
16257 *Ptr = N->getOperand(1);
16258 *CInc = N->getOperand(2);
16259 return true;
16260 }
16261 return false;
16262 }
16263 default:
16264 return false;
16265 }
16266}
16267
16268/// CombineBaseUpdate - Target-specific DAG combine function for VLDDUP,
16269/// NEON load/store intrinsics, and generic vector load/stores, to merge
16270/// base address updates.
16271/// For generic load/stores, the memory type is assumed to be a vector.
16272/// The caller is assumed to have checked legality.
/// Always returns an empty SDValue: a successful merge is applied inside
/// TryCombineBaseUpdate through DCI.CombineTo.
/// NOTE(review): the signature lines and the declaration of `BaseUpdates` are
/// missing from this listing.
16275 const bool isIntrinsic = (N->getOpcode() == ISD::INTRINSIC_VOID ||
16276 N->getOpcode() == ISD::INTRINSIC_W_CHAIN);
16277 const bool isStore = N->getOpcode() == ISD::STORE;
// Intrinsics and stores carry the address as operand 2, everything else as
// operand 1.
16278 const unsigned AddrOpIdx = ((isIntrinsic || isStore) ? 2 : 1);
16279 BaseUpdateTarget Target = {N, isIntrinsic, isStore, AddrOpIdx};
16280
16281 // Limit the number of possible base-updates we look at to prevent degenerate
16282 // cases.
16283 unsigned MaxBaseUpdates = ArmMaxBaseUpdatesToCheck;
16284
16285 SDValue Addr = N->getOperand(AddrOpIdx);
16286
16288
16289 // Search for a use of the address operand that is an increment.
16290 for (SDUse &Use : Addr->uses()) {
16291 SDNode *User = Use.getUser();
// Skip uses of a different result of Addr's node and non-binary users.
16292 if (Use.getResNo() != Addr.getResNo() || User->getNumOperands() != 2)
16293 continue;
16294
// Inc is whichever operand of the binary user is not the address use.
16295 SDValue Inc = User->getOperand(Use.getOperandNo() == 1 ? 0 : 1);
16296 unsigned ConstInc =
16297 getPointerConstIncrement(User->getOpcode(), Addr, Inc, DCI.DAG);
16298
// Keep constant ADD-like updates and any ISD::ADD (ConstInc is 0 for a
// non-constant increment).
16299 if (ConstInc || User->getOpcode() == ISD::ADD) {
16300 BaseUpdates.push_back({User, Inc, ConstInc});
16301 if (BaseUpdates.size() >= MaxBaseUpdates)
16302 break;
16303 }
16304 }
16305
16306 // If the address is a constant pointer increment itself, find
16307 // another constant increment that has the same base operand
16308 SDValue Base;
16309 SDValue CInc;
16310 if (findPointerConstIncrement(Addr.getNode(), &Base, &CInc)) {
16311 unsigned Offset =
16312 getPointerConstIncrement(Addr->getOpcode(), Base, CInc, DCI.DAG);
16313 if (Offset) {
16314 for (SDUse &Use : Base->uses()) {
16315
16316 SDNode *User = Use.getUser();
16317 if (Use.getResNo() != Base.getResNo() || User == Addr.getNode() ||
16318 User->getNumOperands() != 2)
16319 continue;
16320
16321 SDValue UserInc = User->getOperand(Use.getOperandNo() == 0 ? 1 : 0);
16322 unsigned UserOffset =
16323 getPointerConstIncrement(User->getOpcode(), Base, UserInc, DCI.DAG);
16324
// Only users that lie strictly beyond Addr (Base + Offset) qualify.
16325 if (!UserOffset || UserOffset <= Offset)
16326 continue;
16327
// Base + UserOffset == Addr + (UserOffset - Offset), so record the user as
// an update of Addr by that difference.
16328 unsigned NewConstInc = UserOffset - Offset;
16329 SDValue NewInc = DCI.DAG.getConstant(NewConstInc, SDLoc(N), MVT::i32);
16330 BaseUpdates.push_back({User, NewInc, NewConstInc});
16331 if (BaseUpdates.size() >= MaxBaseUpdates)
16332 break;
16333 }
16334 }
16335 }
16336
16337 // Try to fold the load/store with an update that matches memory
16338 // access size. This should work well for sequential loads.
16339 unsigned NumValidUpd = BaseUpdates.size();
16340 for (unsigned I = 0; I < NumValidUpd; I++) {
16341 BaseUpdateUser &User = BaseUpdates[I];
16342 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/true, DCI))
16343 return SDValue();
16344 }
16345
16346 // Try to fold with other users. Non-constant updates are considered
16347 // first, and constant updates are sorted to not break a sequence of
16348 // strided accesses (if there is any).
// Non-constant updates have ConstInc == 0 and therefore sort to the front.
16349 llvm::stable_sort(BaseUpdates,
16350 [](const BaseUpdateUser &LHS, const BaseUpdateUser &RHS) {
16351 return LHS.ConstInc < RHS.ConstInc;
16352 });
16353 for (BaseUpdateUser &User : BaseUpdates) {
16354 if (TryCombineBaseUpdate(Target, User, /*SimpleConstIncOnly=*/false, DCI))
16355 return SDValue();
16356 }
16357 return SDValue();
16358}
16359
16362 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16363 return SDValue();
16364
16365 return CombineBaseUpdate(N, DCI);
16366}
16367
16370 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
16371 return SDValue();
16372
16373 SelectionDAG &DAG = DCI.DAG;
16374 SDValue Addr = N->getOperand(2);
16375 MemSDNode *MemN = cast<MemSDNode>(N);
16376 SDLoc dl(N);
16377
16378 // For the stores, where there are multiple intrinsics we only actually want
16379 // to post-inc the last of the them.
16380 unsigned IntNo = N->getConstantOperandVal(1);
16381 if (IntNo == Intrinsic::arm_mve_vst2q && N->getConstantOperandVal(5) != 1)
16382 return SDValue();
16383 if (IntNo == Intrinsic::arm_mve_vst4q && N->getConstantOperandVal(7) != 3)
16384 return SDValue();
16385
16386 // Search for a use of the address operand that is an increment.
16387 for (SDUse &Use : Addr->uses()) {
16388 SDNode *User = Use.getUser();
16389 if (User->getOpcode() != ISD::ADD || Use.getResNo() != Addr.getResNo())
16390 continue;
16391
16392 // Check that the add is independent of the load/store. Otherwise, folding
16393 // it would create a cycle. We can avoid searching through Addr as it's a
16394 // predecessor to both.
16397 Visited.insert(Addr.getNode());
16398 Worklist.push_back(N);
16399 Worklist.push_back(User);
16400 const unsigned MaxSteps = 1024;
16401 if (SDNode::hasPredecessorHelper(N, Visited, Worklist, MaxSteps) ||
16402 SDNode::hasPredecessorHelper(User, Visited, Worklist, MaxSteps))
16403 continue;
16404
16405 // Find the new opcode for the updating load/store.
16406 bool isLoadOp = true;
16407 unsigned NewOpc = 0;
16408 unsigned NumVecs = 0;
16409 switch (IntNo) {
16410 default:
16411 llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
16412 case Intrinsic::arm_mve_vld2q:
16413 NewOpc = ARMISD::VLD2_UPD;
16414 NumVecs = 2;
16415 break;
16416 case Intrinsic::arm_mve_vld4q:
16417 NewOpc = ARMISD::VLD4_UPD;
16418 NumVecs = 4;
16419 break;
16420 case Intrinsic::arm_mve_vst2q:
16421 NewOpc = ARMISD::VST2_UPD;
16422 NumVecs = 2;
16423 isLoadOp = false;
16424 break;
16425 case Intrinsic::arm_mve_vst4q:
16426 NewOpc = ARMISD::VST4_UPD;
16427 NumVecs = 4;
16428 isLoadOp = false;
16429 break;
16430 }
16431
16432 // Find the size of memory referenced by the load/store.
16433 EVT VecTy;
16434 if (isLoadOp) {
16435 VecTy = N->getValueType(0);
16436 } else {
16437 VecTy = N->getOperand(3).getValueType();
16438 }
16439
16440 unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
16441
16442 // If the increment is a constant, it must match the memory ref size.
16443 SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
16445 if (!CInc || CInc->getZExtValue() != NumBytes)
16446 continue;
16447
16448 // Create the new updating load/store node.
16449 // First, create an SDVTList for the new updating node's results.
16450 EVT Tys[6];
16451 unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
16452 unsigned n;
16453 for (n = 0; n < NumResultVecs; ++n)
16454 Tys[n] = VecTy;
16455 Tys[n++] = MVT::i32;
16456 Tys[n] = MVT::Other;
16457 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumResultVecs + 2));
16458
16459 // Then, gather the new node's operands.
16461 Ops.push_back(N->getOperand(0)); // incoming chain
16462 Ops.push_back(N->getOperand(2)); // ptr
16463 Ops.push_back(Inc);
16464
16465 for (unsigned i = 3; i < N->getNumOperands(); ++i)
16466 Ops.push_back(N->getOperand(i));
16467
16468 SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
16469 MemN->getMemOperand());
16470
16471 // Update the uses.
16472 SmallVector<SDValue, 5> NewResults;
16473 for (unsigned i = 0; i < NumResultVecs; ++i)
16474 NewResults.push_back(SDValue(UpdN.getNode(), i));
16475
16476 NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
16477 DCI.CombineTo(N, NewResults);
16478 DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
16479
16480 break;
16481 }
16482
16483 return SDValue();
16484}
16485
16486/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
16487/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
16488/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
16489/// return true.
16491 SelectionDAG &DAG = DCI.DAG;
16492 EVT VT = N->getValueType(0);
16493 // vldN-dup instructions only support 64-bit vectors for N > 1.
16494 if (!VT.is64BitVector())
16495 return false;
16496
16497 // Check if the VDUPLANE operand is a vldN-dup intrinsic.
16498 SDNode *VLD = N->getOperand(0).getNode();
16499 if (VLD->getOpcode() != ISD::INTRINSIC_W_CHAIN)
16500 return false;
16501 unsigned NumVecs = 0;
16502 unsigned NewOpc = 0;
16503 unsigned IntNo = VLD->getConstantOperandVal(1);
16504 if (IntNo == Intrinsic::arm_neon_vld2lane) {
16505 NumVecs = 2;
16506 NewOpc = ARMISD::VLD2DUP;
16507 } else if (IntNo == Intrinsic::arm_neon_vld3lane) {
16508 NumVecs = 3;
16509 NewOpc = ARMISD::VLD3DUP;
16510 } else if (IntNo == Intrinsic::arm_neon_vld4lane) {
16511 NumVecs = 4;
16512 NewOpc = ARMISD::VLD4DUP;
16513 } else {
16514 return false;
16515 }
16516
16517 // First check that all the vldN-lane uses are VDUPLANEs and that the lane
16518 // numbers match the load.
16519 unsigned VLDLaneNo = VLD->getConstantOperandVal(NumVecs + 3);
16520 for (SDUse &Use : VLD->uses()) {
16521 // Ignore uses of the chain result.
16522 if (Use.getResNo() == NumVecs)
16523 continue;
16524 SDNode *User = Use.getUser();
16525 if (User->getOpcode() != ARMISD::VDUPLANE ||
16526 VLDLaneNo != User->getConstantOperandVal(1))
16527 return false;
16528 }
16529
16530 // Create the vldN-dup node.
16531 EVT Tys[5];
16532 unsigned n;
16533 for (n = 0; n < NumVecs; ++n)
16534 Tys[n] = VT;
16535 Tys[n] = MVT::Other;
16536 SDVTList SDTys = DAG.getVTList(ArrayRef(Tys, NumVecs + 1));
16537 SDValue Ops[] = { VLD->getOperand(0), VLD->getOperand(2) };
16539 SDValue VLDDup = DAG.getMemIntrinsicNode(NewOpc, SDLoc(VLD), SDTys,
16540 Ops, VLDMemInt->getMemoryVT(),
16541 VLDMemInt->getMemOperand());
16542
16543 // Update the uses.
16544 for (SDUse &Use : VLD->uses()) {
16545 unsigned ResNo = Use.getResNo();
16546 // Ignore uses of the chain result.
16547 if (ResNo == NumVecs)
16548 continue;
16549 DCI.CombineTo(Use.getUser(), SDValue(VLDDup.getNode(), ResNo));
16550 }
16551
16552 // Now the vldN-lane intrinsic is dead except for its chain result.
16553 // Update uses of the chain.
16554 std::vector<SDValue> VLDDupResults;
16555 for (unsigned n = 0; n < NumVecs; ++n)
16556 VLDDupResults.push_back(SDValue(VLDDup.getNode(), n));
16557 VLDDupResults.push_back(SDValue(VLDDup.getNode(), NumVecs));
16558 DCI.CombineTo(VLD, VLDDupResults);
16559
16560 return true;
16561}
16562
16563/// PerformVDUPLANECombine - Target-specific dag combine xforms for
16564/// ARMISD::VDUPLANE.
16567 const ARMSubtarget *Subtarget) {
16568 SDValue Op = N->getOperand(0);
16569 EVT VT = N->getValueType(0);
16570
16571 // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
16572 if (Subtarget->hasMVEIntegerOps()) {
16573 EVT ExtractVT = VT.getVectorElementType();
16574 // We need to ensure we are creating a legal type.
16575 if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
16576 ExtractVT = MVT::i32;
16577 SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
16578 N->getOperand(0), N->getOperand(1));
16579 return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
16580 }
16581
16582 // If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
16583 // of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
16584 if (CombineVLDDUP(N, DCI))
16585 return SDValue(N, 0);
16586
16587 // If the source is already a VMOVIMM or VMVNIMM splat, the VDUPLANE is
16588 // redundant. Ignore bit_converts for now; element sizes are checked below.
16589 while (Op.getOpcode() == ISD::BITCAST)
16590 Op = Op.getOperand(0);
16591 if (Op.getOpcode() != ARMISD::VMOVIMM && Op.getOpcode() != ARMISD::VMVNIMM)
16592 return SDValue();
16593
16594 // Make sure the VMOV element size is not bigger than the VDUPLANE elements.
16595 unsigned EltSize = Op.getScalarValueSizeInBits();
16596 // The canonical VMOV for a zero vector uses a 32-bit element size.
16597 unsigned Imm = Op.getConstantOperandVal(0);
16598 unsigned EltBits;
16599 if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
16600 EltSize = 8;
16601 if (EltSize > VT.getScalarSizeInBits())
16602 return SDValue();
16603
16604 return DCI.DAG.getNode(ISD::BITCAST, SDLoc(N), VT, Op);
16605}
16606
16607/// PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
16609 const ARMSubtarget *Subtarget) {
16610 SDValue Op = N->getOperand(0);
16611 SDLoc dl(N);
16612
16613 if (Subtarget->hasMVEIntegerOps()) {
16614 // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
16615 // need to come from a GPR.
16616 if (Op.getValueType() == MVT::f32)
16617 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16618 DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
16619 else if (Op.getValueType() == MVT::f16)
16620 return DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
16621 DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
16622 }
16623
16624 if (!Subtarget->hasNEON())
16625 return SDValue();
16626
16627 // Match VDUP(LOAD) -> VLD1DUP.
16628 // We match this pattern here rather than waiting for isel because the
16629 // transform is only legal for unindexed loads.
16630 LoadSDNode *LD = dyn_cast<LoadSDNode>(Op.getNode());
16631 if (LD && Op.hasOneUse() && LD->isUnindexed() &&
16632 LD->getMemoryVT() == N->getValueType(0).getVectorElementType()) {
16633 SDValue Ops[] = {LD->getOperand(0), LD->getOperand(1),
16634 DAG.getConstant(LD->getAlign().value(), SDLoc(N), MVT::i32)};
16635 SDVTList SDTys = DAG.getVTList(N->getValueType(0), MVT::Other);
16636 SDValue VLDDup =
16638 LD->getMemoryVT(), LD->getMemOperand());
16639 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), VLDDup.getValue(1));
16640 return VLDDup;
16641 }
16642
16643 return SDValue();
16644}
16645
16648 const ARMSubtarget *Subtarget) {
16649 EVT VT = N->getValueType(0);
16650
16651 // If this is a legal vector load, try to combine it into a VLD1_UPD.
16652 if (Subtarget->hasNEON() && ISD::isNormalLoad(N) && VT.isVector() &&
16654 return CombineBaseUpdate(N, DCI);
16655
16656 return SDValue();
16657}
16658
16659// Optimize trunc store (of multiple scalars) to shuffle and store. First,
16660// pack all of the elements in one place. Next, store to memory in fewer
16661// chunks.
16663 SelectionDAG &DAG) {
16664 SDValue StVal = St->getValue();
16665 EVT VT = StVal.getValueType();
16666 if (!St->isTruncatingStore() || !VT.isVector())
16667 return SDValue();
16668 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
16669 EVT StVT = St->getMemoryVT();
16670 unsigned NumElems = VT.getVectorNumElements();
16671 assert(StVT != VT && "Cannot truncate to the same type");
16672 unsigned FromEltSz = VT.getScalarSizeInBits();
16673 unsigned ToEltSz = StVT.getScalarSizeInBits();
16674
16675 // From, To sizes and ElemCount must be pow of two
16676 if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
16677 return SDValue();
16678
16679 // We are going to use the original vector elt for storing.
16680 // Accumulated smaller vector elements must be a multiple of the store size.
16681 if (0 != (NumElems * FromEltSz) % ToEltSz)
16682 return SDValue();
16683
16684 unsigned SizeRatio = FromEltSz / ToEltSz;
16685 assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
16686
16687 // Create a type on which we perform the shuffle.
16688 EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
16689 NumElems * SizeRatio);
16690 assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
16691
16692 SDLoc DL(St);
16693 SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
16694 SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
16695 for (unsigned i = 0; i < NumElems; ++i)
16696 ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
16697 : i * SizeRatio;
16698
16699 // Can't shuffle using an illegal type.
16700 if (!TLI.isTypeLegal(WideVecVT))
16701 return SDValue();
16702
16703 SDValue Shuff = DAG.getVectorShuffle(
16704 WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
16705 // At this point all of the data is stored at the bottom of the
16706 // register. We now need to save it to mem.
16707
16708 // Find the largest store unit
16709 MVT StoreType = MVT::i8;
16710 for (MVT Tp : MVT::integer_valuetypes()) {
16711 if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
16712 StoreType = Tp;
16713 }
16714 // Didn't find a legal store type.
16715 if (!TLI.isTypeLegal(StoreType))
16716 return SDValue();
16717
16718 // Bitcast the original vector into a vector of store-size units
16719 EVT StoreVecVT =
16720 EVT::getVectorVT(*DAG.getContext(), StoreType,
16721 VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
16722 assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
16723 SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
16725 SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
16726 TLI.getPointerTy(DAG.getDataLayout()));
16727 SDValue BasePtr = St->getBasePtr();
16728
16729 // Perform one or more big stores into memory.
16730 unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
16731 for (unsigned I = 0; I < E; I++) {
16732 SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
16733 ShuffWide, DAG.getIntPtrConstant(I, DL));
16734 SDValue Ch =
16735 DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
16736 St->getAlign(), St->getMemOperand()->getFlags());
16737 BasePtr =
16738 DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
16739 Chains.push_back(Ch);
16740 }
16741 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
16742}
16743
16744// Try taking a single vector store from an fpround (which would otherwise turn
16745// into an expensive buildvector) and splitting it into a series of narrowing
16746// stores.
16748 SelectionDAG &DAG) {
16749 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16750 return SDValue();
16751 SDValue Trunc = St->getValue();
16752 if (Trunc->getOpcode() != ISD::FP_ROUND)
16753 return SDValue();
16754 EVT FromVT = Trunc->getOperand(0).getValueType();
16755 EVT ToVT = Trunc.getValueType();
16756 if (!ToVT.isVector())
16757 return SDValue();
16759 EVT ToEltVT = ToVT.getVectorElementType();
16760 EVT FromEltVT = FromVT.getVectorElementType();
16761
16762 if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
16763 return SDValue();
16764
16765 unsigned NumElements = 4;
16766 if (FromVT.getVectorNumElements() % NumElements != 0)
16767 return SDValue();
16768
16769 // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
16770 // use the VMOVN over splitting the store. We are looking for patterns of:
16771 // !rev: 0 N 1 N+1 2 N+2 ...
16772 // rev: N 0 N+1 1 N+2 2 ...
16773 // The shuffle may either be a single source (in which case N = NumElts/2) or
16774 // two inputs extended with concat to the same size (in which case N =
16775 // NumElts).
16776 auto isVMOVNShuffle = [&](ShuffleVectorSDNode *SVN, bool Rev) {
16777 ArrayRef<int> M = SVN->getMask();
16778 unsigned NumElts = ToVT.getVectorNumElements();
16779 if (SVN->getOperand(1).isUndef())
16780 NumElts /= 2;
16781
16782 unsigned Off0 = Rev ? NumElts : 0;
16783 unsigned Off1 = Rev ? 0 : NumElts;
16784
16785 for (unsigned I = 0; I < NumElts; I += 2) {
16786 if (M[I] >= 0 && M[I] != (int)(Off0 + I / 2))
16787 return false;
16788 if (M[I + 1] >= 0 && M[I + 1] != (int)(Off1 + I / 2))
16789 return false;
16790 }
16791
16792 return true;
16793 };
16794
16795 if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc.getOperand(0)))
16796 if (isVMOVNShuffle(Shuffle, false) || isVMOVNShuffle(Shuffle, true))
16797 return SDValue();
16798
16799 LLVMContext &C = *DAG.getContext();
16800 SDLoc DL(St);
16801 // Details about the old store
16802 SDValue Ch = St->getChain();
16803 SDValue BasePtr = St->getBasePtr();
16804 Align Alignment = St->getBaseAlign();
16806 AAMDNodes AAInfo = St->getAAInfo();
16807
16808 // We split the store into slices of NumElements. fp16 trunc stores are vcvt
16809 // and then stored as truncating integer stores.
16810 EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
16811 EVT NewToVT = EVT::getVectorVT(
16812 C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
16813
16815 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
16816 unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
16817 SDValue NewPtr =
16818 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16819
16820 SDValue Extract =
16821 DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
16822 DAG.getConstant(i * NumElements, DL, MVT::i32));
16823
16824 SDValue FPTrunc =
16825 DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
16826 Extract, DAG.getConstant(0, DL, MVT::i32));
16827 Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
16828
16829 SDValue Store = DAG.getTruncStore(
16830 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16831 NewToVT, Alignment, MMOFlags, AAInfo);
16832 Stores.push_back(Store);
16833 }
16834 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16835}
16836
16837// Try taking a single vector store from an MVETRUNC (which would otherwise turn
16838// into an expensive buildvector) and splitting it into a series of narrowing
16839// stores.
16841 SelectionDAG &DAG) {
16842 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16843 return SDValue();
16844 SDValue Trunc = St->getValue();
16845 if (Trunc->getOpcode() != ARMISD::MVETRUNC)
16846 return SDValue();
16847 EVT FromVT = Trunc->getOperand(0).getValueType();
16848 EVT ToVT = Trunc.getValueType();
16849
16850 LLVMContext &C = *DAG.getContext();
16851 SDLoc DL(St);
16852 // Details about the old store
16853 SDValue Ch = St->getChain();
16854 SDValue BasePtr = St->getBasePtr();
16855 Align Alignment = St->getBaseAlign();
16857 AAMDNodes AAInfo = St->getAAInfo();
16858
16859 EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
16860 FromVT.getVectorNumElements());
16861
16863 for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
16864 unsigned NewOffset =
16865 i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
16866 SDValue NewPtr =
16867 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
16868
16869 SDValue Extract = Trunc.getOperand(i);
16870 SDValue Store = DAG.getTruncStore(
16871 Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
16872 NewToVT, Alignment, MMOFlags, AAInfo);
16873 Stores.push_back(Store);
16874 }
16875 return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
16876}
16877
16878// Given a floating point store from an extracted vector, with an integer
16879// VGETLANE that already exists, store the existing VGETLANEu directly. This can
16880// help reduce fp register pressure, doesn't require the fp extract and allows
16881// use of more integer post-inc stores not available with vstr.
16883 if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
16884 return SDValue();
16885 SDValue Extract = St->getValue();
16886 EVT VT = Extract.getValueType();
16887 // For now only uses f16. This may be useful for f32 too, but that will
16888 // be bitcast(extract), not the VGETLANEu we currently check here.
16889 if (VT != MVT::f16 || Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
16890 return SDValue();
16891
16892 SDNode *GetLane =
16893 DAG.getNodeIfExists(ARMISD::VGETLANEu, DAG.getVTList(MVT::i32),
16894 {Extract.getOperand(0), Extract.getOperand(1)});
16895 if (!GetLane)
16896 return SDValue();
16897
16898 LLVMContext &C = *DAG.getContext();
16899 SDLoc DL(St);
16900 // Create a new integer store to replace the existing floating point version.
16901 SDValue Ch = St->getChain();
16902 SDValue BasePtr = St->getBasePtr();
16903 Align Alignment = St->getBaseAlign();
16905 AAMDNodes AAInfo = St->getAAInfo();
16906 EVT NewToVT = EVT::getIntegerVT(C, VT.getSizeInBits());
16907 SDValue Store = DAG.getTruncStore(Ch, DL, SDValue(GetLane, 0), BasePtr,
16908 St->getPointerInfo(), NewToVT, Alignment,
16909 MMOFlags, AAInfo);
16910
16911 return Store;
16912}
16913
16914/// PerformSTORECombine - Target-specific dag combine xforms for
16915/// ISD::STORE.
16918 const ARMSubtarget *Subtarget) {
16920 if (St->isVolatile())
16921 return SDValue();
16922 SDValue StVal = St->getValue();
16923 EVT VT = StVal.getValueType();
16924
16925 if (Subtarget->hasNEON())
16926 if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
16927 return Store;
16928
16929 if (Subtarget->hasMVEFloatOps())
16930 if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
16931 return NewToken;
16932
16933 if (Subtarget->hasMVEIntegerOps()) {
16934 if (SDValue NewChain = PerformExtractFpToIntStores(St, DCI.DAG))
16935 return NewChain;
16936 if (SDValue NewToken =
16938 return NewToken;
16939 }
16940
16941 if (!ISD::isNormalStore(St))
16942 return SDValue();
16943
16944 // Split a store of a VMOVDRR into two integer stores to avoid mixing NEON and
16945 // ARM stores of arguments in the same cache line.
16946 if (StVal.getOpcode() == ARMISD::VMOVDRR && StVal->hasOneUse()) {
16947 SelectionDAG &DAG = DCI.DAG;
16948 bool isBigEndian = DAG.getDataLayout().isBigEndian();
16949 SDLoc DL(St);
16950 SDValue BasePtr = St->getBasePtr();
16951 SDValue NewST1 =
16952 DAG.getStore(St->getChain(), DL, StVal.getOperand(isBigEndian ? 1 : 0),
16953 BasePtr, St->getPointerInfo(), St->getBaseAlign(),
16954 St->getMemOperand()->getFlags());
16955
16956 SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i32, BasePtr,
16957 DAG.getConstant(4, DL, MVT::i32));
16958 return DAG.getStore(NewST1.getValue(0), DL,
16959 StVal.getOperand(isBigEndian ? 0 : 1), OffsetPtr,
16961 St->getBaseAlign(), St->getMemOperand()->getFlags());
16962 }
16963
16964 if (StVal.getValueType() == MVT::i64 &&
16966 // Bitcast an i64 store extracted from a vector to f64.
16967 // Otherwise, the i64 value will be legalized to a pair of i32 values.
16968 SelectionDAG &DAG = DCI.DAG;
16969 SDLoc dl(StVal);
16970 SDValue IntVec = StVal.getOperand(0);
16971 EVT FloatVT =
16972 EVT::getVectorVT(*DAG.getContext(), MVT::f64,
16974 SDValue Vec = DAG.getNode(ISD::BITCAST, dl, FloatVT, IntVec);
16975 SDValue ExtElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Vec,
16976 StVal.getOperand(1));
16977 dl = SDLoc(N);
16978 SDValue V = DAG.getNode(ISD::BITCAST, dl, MVT::i64, ExtElt);
16979 // Make the DAGCombiner fold the bitcasts.
16980 DCI.AddToWorklist(Vec.getNode());
16981 DCI.AddToWorklist(ExtElt.getNode());
16982 DCI.AddToWorklist(V.getNode());
16983 return DAG.getStore(St->getChain(), dl, V, St->getBasePtr(),
16984 St->getPointerInfo(), St->getAlign(),
16985 St->getMemOperand()->getFlags(), St->getAAInfo());
16986 }
16987
16988 // If this is a legal vector store, try to combine it into a VST1_UPD.
16989 if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
16991 return CombineBaseUpdate(N, DCI);
16992
16993 return SDValue();
16994}
16995
16996/// PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD)
16997/// can replace combinations of VMUL and VCVT (floating-point to integer)
16998/// when the VMUL has a constant operand that is a power of 2.
16999///
17000/// Example (assume d17 = <float 8.000000e+00, float 8.000000e+00>):
17001/// vmul.f32 d16, d17, d16
17002/// vcvt.s32.f32 d16, d16
17003/// becomes:
17004/// vcvt.s32.f32 d16, d16, #3
17006 const ARMSubtarget *Subtarget) {
17007 if (!Subtarget->hasNEON())
17008 return SDValue();
17009
17010 SDValue Op = N->getOperand(0);
17011 if (!Op.getValueType().isVector() || !Op.getValueType().isSimple() ||
17012 Op.getOpcode() != ISD::FMUL)
17013 return SDValue();
17014
17015 SDValue ConstVec = Op->getOperand(1);
17016 if (!isa<BuildVectorSDNode>(ConstVec))
17017 return SDValue();
17018
17019 MVT FloatTy = Op.getSimpleValueType().getVectorElementType();
17020 uint32_t FloatBits = FloatTy.getSizeInBits();
17021 MVT IntTy = N->getSimpleValueType(0).getVectorElementType();
17022 uint32_t IntBits = IntTy.getSizeInBits();
17023 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17024 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17025 // These instructions only exist converting from f32 to i32. We can handle
17026 // smaller integers by generating an extra truncate, but larger ones would
17027 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17028 // these instructions only support v2i32/v4i32 types.
17029 return SDValue();
17030 }
17031
17032 BitVector UndefElements;
17034 int32_t C = BV->getConstantFPSplatPow2ToLog2Int(&UndefElements, 33);
17035 if (C == -1 || C == 0 || C > 32)
17036 return SDValue();
17037
17038 SDLoc dl(N);
17039 bool isSigned = N->getOpcode() == ISD::FP_TO_SINT;
17040 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfp2fxs :
17041 Intrinsic::arm_neon_vcvtfp2fxu;
17042 SDValue FixConv = DAG.getNode(
17043 ISD::INTRINSIC_WO_CHAIN, dl, NumLanes == 2 ? MVT::v2i32 : MVT::v4i32,
17044 DAG.getConstant(IntrinsicOpcode, dl, MVT::i32), Op->getOperand(0),
17045 DAG.getConstant(C, dl, MVT::i32));
17046
17047 if (IntBits < FloatBits)
17048 FixConv = DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), FixConv);
17049
17050 return FixConv;
17051}
17052
17054 const ARMSubtarget *Subtarget) {
17055 if (!Subtarget->hasMVEFloatOps())
17056 return SDValue();
17057
17058 // Turn (fadd x, (vselect c, y, -0.0)) into (vselect c, (fadd x, y), x)
17059 // The second form can be more easily turned into a predicated vadd, and
17060 // possibly combined into a fma to become a predicated vfma.
17061 SDValue Op0 = N->getOperand(0);
17062 SDValue Op1 = N->getOperand(1);
17063 EVT VT = N->getValueType(0);
17064 SDLoc DL(N);
17065
17066 // The identity element for a fadd is -0.0 or +0.0 when the nsz flag is set,
17067 // which these VMOV's represent.
17068 auto isIdentitySplat = [&](SDValue Op, bool NSZ) {
17069 if (Op.getOpcode() != ISD::BITCAST ||
17070 Op.getOperand(0).getOpcode() != ARMISD::VMOVIMM)
17071 return false;
17072 uint64_t ImmVal = Op.getOperand(0).getConstantOperandVal(0);
17073 if (VT == MVT::v4f32 && (ImmVal == 1664 || (ImmVal == 0 && NSZ)))
17074 return true;
17075 if (VT == MVT::v8f16 && (ImmVal == 2688 || (ImmVal == 0 && NSZ)))
17076 return true;
17077 return false;
17078 };
17079
17080 if (Op0.getOpcode() == ISD::VSELECT && Op1.getOpcode() != ISD::VSELECT)
17081 std::swap(Op0, Op1);
17082
17083 if (Op1.getOpcode() != ISD::VSELECT)
17084 return SDValue();
17085
17086 SDNodeFlags FaddFlags = N->getFlags();
17087 bool NSZ = FaddFlags.hasNoSignedZeros();
17088 if (!isIdentitySplat(Op1.getOperand(2), NSZ))
17089 return SDValue();
17090
17091 SDValue FAdd =
17092 DAG.getNode(ISD::FADD, DL, VT, Op0, Op1.getOperand(1), FaddFlags);
17093 return DAG.getNode(ISD::VSELECT, DL, VT, Op1.getOperand(0), FAdd, Op0, FaddFlags);
17094}
17095
17097 SDValue LHS = N->getOperand(0);
17098 SDValue RHS = N->getOperand(1);
17099 EVT VT = N->getValueType(0);
17100 SDLoc DL(N);
17101
17102 if (!N->getFlags().hasAllowReassociation())
17103 return SDValue();
17104
17105 // Combine fadd(a, vcmla(b, c, d)) -> vcmla(fadd(a, b), b, c)
17106 auto ReassocComplex = [&](SDValue A, SDValue B) {
17107 if (A.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
17108 return SDValue();
17109 unsigned Opc = A.getConstantOperandVal(0);
17110 if (Opc != Intrinsic::arm_mve_vcmlaq)
17111 return SDValue();
17112 SDValue VCMLA = DAG.getNode(
17113 ISD::INTRINSIC_WO_CHAIN, DL, VT, A.getOperand(0), A.getOperand(1),
17114 DAG.getNode(ISD::FADD, DL, VT, A.getOperand(2), B, N->getFlags()),
17115 A.getOperand(3), A.getOperand(4));
17116 VCMLA->setFlags(A->getFlags());
17117 return VCMLA;
17118 };
17119 if (SDValue R = ReassocComplex(LHS, RHS))
17120 return R;
17121 if (SDValue R = ReassocComplex(RHS, LHS))
17122 return R;
17123
17124 return SDValue();
17125}
17126
17128 const ARMSubtarget *Subtarget) {
17129 if (SDValue S = PerformFAddVSelectCombine(N, DAG, Subtarget))
17130 return S;
17131 if (SDValue S = PerformFADDVCMLACombine(N, DAG))
17132 return S;
17133 return SDValue();
17134}
17135
17136/// PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD)
17137/// can replace combinations of VCVT (integer to floating-point) and VMUL
17138/// when the VMUL has a constant operand that is a power of 2.
17139///
17140/// Example (assume d17 = <float 0.125, float 0.125>):
17141/// vcvt.f32.s32 d16, d16
17142/// vmul.f32 d16, d16, d17
17143/// becomes:
17144/// vcvt.f32.s32 d16, d16, #3
// NOTE(review): the opening line of this signature (return type, function
// name and leading parameters) is absent from this listing -- the embedded
// numbering skips 17145. Confirm it against the upstream file.
//
// Body summary: matches a node N whose operand 0 is a SINT_TO_FP/UINT_TO_FP
// and whose operand 1 is a constant FP splat that is the exact reciprocal of
// 2^C (1 <= C <= 32). On a match it emits the NEON fixed-point conversion
// intrinsic on the integer input with C as a constant operand. Returns an
// empty SDValue whenever the pattern does not match.
17146 const ARMSubtarget *Subtarget) {
17147 if (!Subtarget->hasNEON())
17148 return SDValue();
17149
17150 SDValue Op = N->getOperand(0);
17151 unsigned OpOpcode = Op.getNode()->getOpcode();
  // Only simple vector results fed by an integer-to-FP conversion qualify.
17152 if (!N->getValueType(0).isVector() || !N->getValueType(0).isSimple() ||
17153 (OpOpcode != ISD::SINT_TO_FP && OpOpcode != ISD::UINT_TO_FP))
17154 return SDValue();
17155
17156 SDValue ConstVec = N->getOperand(1);
17157 if (!isa<BuildVectorSDNode>(ConstVec))
17158 return SDValue();
17159
17160 MVT FloatTy = N->getSimpleValueType(0).getVectorElementType();
17161 uint32_t FloatBits = FloatTy.getSizeInBits();
17162 MVT IntTy = Op.getOperand(0).getSimpleValueType().getVectorElementType();
17163 uint32_t IntBits = IntTy.getSizeInBits();
17164 unsigned NumLanes = Op.getValueType().getVectorNumElements();
17165 if (FloatBits != 32 || IntBits > 32 || (NumLanes != 4 && NumLanes != 2)) {
17166 // These instructions only exist converting from i32 to f32. We can handle
17167 // smaller integers by generating an extra extend, but larger ones would
17168 // be lossy. We also can't handle anything other than 2 or 4 lanes, since
17169 // these instructions only support v2i32/v4i32 types.
17170 return SDValue();
17171 }
17172
  // The constant must be a splat whose reciprocal is exactly representable.
17173 ConstantFPSDNode *CN = isConstOrConstSplatFP(ConstVec, true);
17174 APFloat Recip(0.0f);
17175 if (!CN || !CN->getValueAPF().getExactInverse(&Recip))
17176 return SDValue();
17177
  // The reciprocal must convert to an integer without any rounding.
17178 bool IsExact;
17179 APSInt IntVal(33);
17180 if (Recip.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
17181 APFloat::opOK ||
17182 !IsExact)
17183 return SDValue();
17184
  // C is log2 of the reciprocal; reject non-powers-of-two (-1), a reciprocal
  // of 1 (C == 0) and anything above 32.
17185 int32_t C = IntVal.exactLogBase2();
17186 if (C == -1 || C == 0 || C > 32)
17187 return SDValue();
17188
17189 SDLoc DL(N);
17190 bool isSigned = OpOpcode == ISD::SINT_TO_FP;
17191 SDValue ConvInput = Op.getOperand(0);
17192 if (IntBits < FloatBits)
  // NOTE(review): the start of this statement is missing from the listing
  // (embedded line 17193 is skipped). Presumably it assigns ConvInput a sign-
  // or zero-extension to the 32-bit lane type named below -- confirm upstream.
17194 NumLanes == 2 ? MVT::v2i32 : MVT::v4i32, ConvInput);
17195
17196 unsigned IntrinsicOpcode = isSigned ? Intrinsic::arm_neon_vcvtfxs2fp
17197 : Intrinsic::arm_neon_vcvtfxu2fp;
17198 return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
17199 DAG.getConstant(IntrinsicOpcode, DL, MVT::i32), ConvInput,
17200 DAG.getConstant(C, DL, MVT::i32));
17201}
17202
// NOTE(review): the opening line of this signature is absent from this
// listing (embedded numbering skips 17203). Confirm it against upstream.
//
// Combine for ISD::VECREDUCE_ADD (asserted below), active only when MVE
// integer ops are available. It first splits a reduce of an add into two
// reduces, then tries, in order, the unpredicated and predicated
// multiply-accumulate patterns, the unpredicated and predicated add patterns,
// and finally repairs a zext that hides a sext of a squared value. Returns an
// empty SDValue when nothing matches.
17204 const ARMSubtarget *ST) {
17205 if (!ST->hasMVEIntegerOps())
17206 return SDValue();
17207
17208 assert(N->getOpcode() == ISD::VECREDUCE_ADD);
17209 EVT ResVT = N->getValueType(0);
17210 SDValue N0 = N->getOperand(0);
17211 SDLoc dl(N);
17212
17213 // Try to turn vecreduce_add(add(x, y)) into vecreduce(x) + vecreduce(y)
17214 if (ResVT == MVT::i32 && N0.getOpcode() == ISD::ADD &&
17215 (N0.getValueType() == MVT::v4i32 || N0.getValueType() == MVT::v8i16 ||
17216 N0.getValueType() == MVT::v16i8)) {
17217 SDValue Red0 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(0));
17218 SDValue Red1 = DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, N0.getOperand(1));
17219 return DAG.getNode(ISD::ADD, dl, ResVT, Red0, Red1);
17220 }
17221
17222 // We are looking for something that will have illegal types if left alone,
17223 // but that we can convert to a single instruction under MVE. For example
17224 // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
17225 // or
17226 // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
17227
17228 // The legal cases are:
17229 // VADDV u/s 8/16/32
17230 // VMLAV u/s 8/16/32
17231 // VADDLV u/s 32
17232 // VMLALV u/s 16/32
17233
17234 // If the input vector is smaller than legal (v4i8/v4i16 for example) we can
17235 // extend it and use v4i32 instead.
  // True when A has the same lane count as one of ExtTypes and is no wider.
17236 auto ExtTypeMatches = [](SDValue A, ArrayRef<MVT> ExtTypes) {
17237 EVT AVT = A.getValueType();
17238 return any_of(ExtTypes, [&](MVT Ty) {
17239 return AVT.getVectorNumElements() == Ty.getVectorNumElements() &&
17240 AVT.bitsLE(Ty);
17241 });
17242 };
  // Widens A with ExtendCode when it is not already a 128-bit vector.
  // NOTE(review): the target-type expression is incomplete in this listing
  // (embedded lines 17248 and 17250 are skipped) -- confirm upstream.
17243 auto ExtendIfNeeded = [&](SDValue A, unsigned ExtendCode) {
17244 EVT AVT = A.getValueType();
17245 if (!AVT.is128BitVector())
17246 A = DAG.getNode(
17247 ExtendCode, dl,
17249 *DAG.getContext(),
17251 A);
17252 return A;
17253 };
  // Matches reduce(ext(A)); returns the (possibly widened) A, or an empty
  // SDValue on mismatch.
17254 auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
17255 if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
17256 return SDValue();
17257 SDValue A = N0->getOperand(0);
17258 if (ExtTypeMatches(A, ExtTypes))
17259 return ExtendIfNeeded(A, ExtendCode);
17260 return SDValue();
17261 };
  // Predicated form of IsVADDV: N0 is a VSELECT whose operand 0 becomes Mask
  // and whose operand 1 is the extend.
  // NOTE(review): the last clause of the guard below is missing from this
  // listing (embedded line 17265 is skipped); presumably it constrains the
  // select's remaining operand -- confirm upstream.
17262 auto IsPredVADDV = [&](MVT RetTy, unsigned ExtendCode,
17263 ArrayRef<MVT> ExtTypes, SDValue &Mask) {
17264 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17266 return SDValue();
17267 Mask = N0->getOperand(0);
17268 SDValue Ext = N0->getOperand(1);
17269 if (Ext->getOpcode() != ExtendCode)
17270 return SDValue();
17271 SDValue A = Ext->getOperand(0);
17272 if (ExtTypeMatches(A, ExtTypes))
17273 return ExtendIfNeeded(A, ExtendCode);
17274 return SDValue();
17275 };
17276 auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17277 SDValue &A, SDValue &B) {
17278 // For a vmla we are trying to match a larger pattern:
17279 // ExtA = sext/zext A
17280 // ExtB = sext/zext B
17281 // Mul = mul ExtA, ExtB
17282 // vecreduce.add Mul
17283 // There might also be an extra extend between the mul and the addreduce, so
17284 // long as the bitwidth is high enough to make them equivalent (for example
17285 // original v8i16 might be mul at v8i32 and the reduce happens at v8i64).
17286 if (ResVT != RetTy)
17287 return false;
17288 SDValue Mul = N0;
17289 if (Mul->getOpcode() == ExtendCode &&
17290 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17291 ResVT.getScalarSizeInBits())
17292 Mul = Mul->getOperand(0);
17293 if (Mul->getOpcode() != ISD::MUL)
17294 return false;
17295 SDValue ExtA = Mul->getOperand(0);
17296 SDValue ExtB = Mul->getOperand(1);
17297 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17298 return false;
  // Note that A and B are overwritten here even if the type check below fails.
17299 A = ExtA->getOperand(0);
17300 B = ExtB->getOperand(0);
17301 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17302 A = ExtendIfNeeded(A, ExtendCode);
17303 B = ExtendIfNeeded(B, ExtendCode);
17304 return true;
17305 }
17306 return false;
17307 };
17308 auto IsPredVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
17309 SDValue &A, SDValue &B, SDValue &Mask) {
17310 // Same as the pattern above with a select for the zero predicated lanes
17311 // ExtA = sext/zext A
17312 // ExtB = sext/zext B
17313 // Mul = mul ExtA, ExtB
17314 // N0 = select Mask, Mul, 0
17315 // vecreduce.add N0
  // NOTE(review): the last clause of the guard below is missing from this
  // listing (embedded line 17317 is skipped); per the pattern above it
  // presumably checks that the select's false operand is zero -- confirm.
17316 if (ResVT != RetTy || N0->getOpcode() != ISD::VSELECT ||
17318 return false;
17319 Mask = N0->getOperand(0);
17320 SDValue Mul = N0->getOperand(1);
17321 if (Mul->getOpcode() == ExtendCode &&
17322 Mul->getOperand(0).getScalarValueSizeInBits() * 2 >=
17323 ResVT.getScalarSizeInBits())
17324 Mul = Mul->getOperand(0);
17325 if (Mul->getOpcode() != ISD::MUL)
17326 return false;
17327 SDValue ExtA = Mul->getOperand(0);
17328 SDValue ExtB = Mul->getOperand(1);
17329 if (ExtA->getOpcode() != ExtendCode || ExtB->getOpcode() != ExtendCode)
17330 return false;
17331 A = ExtA->getOperand(0);
17332 B = ExtB->getOperand(0);
17333 if (ExtTypeMatches(A, ExtTypes) && ExtTypeMatches(B, ExtTypes)) {
17334 A = ExtendIfNeeded(A, ExtendCode);
17335 B = ExtendIfNeeded(B, ExtendCode);
17336 return true;
17337 }
17338 return false;
17339 };
  // Builds a two-result (i32, i32) reduction node and pairs the halves into
  // an i64 with BUILD_PAIR.
17340 auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
17341 // Split illegal MVT::v16i8->i64 vector reductions into two legal v8i16->i64
17342 // reductions. The operands are extended with MVEEXT, but as they are
17343 // reductions the lane orders do not matter. MVEEXT may be combined with
17344 // loads to produce two extending loads, or else they will be expanded to
17345 // VREV/VMOVL.
17346 EVT VT = Ops[0].getValueType();
17347 if (VT == MVT::v16i8) {
17348 assert((Opcode == ARMISD::VMLALVs || Opcode == ARMISD::VMLALVu) &&
17349 "Unexpected illegal long reduction opcode");
17350 bool IsUnsigned = Opcode == ARMISD::VMLALVu;
17351
17352 SDValue Ext0 =
17353 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17354 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[0]);
17355 SDValue Ext1 =
17356 DAG.getNode(IsUnsigned ? ARMISD::MVEZEXT : ARMISD::MVESEXT, dl,
17357 DAG.getVTList(MVT::v8i16, MVT::v8i16), Ops[1]);
17358
  // First half starts the accumulation; the accumulating opcode then adds
  // the second half of each extended operand onto that result.
17359 SDValue MLA0 = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
17360 Ext0, Ext1);
17361 SDValue MLA1 =
17362 DAG.getNode(IsUnsigned ? ARMISD::VMLALVAu : ARMISD::VMLALVAs, dl,
17363 DAG.getVTList(MVT::i32, MVT::i32), MLA0, MLA0.getValue(1),
17364 Ext0.getValue(1), Ext1.getValue(1));
17365 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, MLA1, MLA1.getValue(1));
17366 }
17367 SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
17368 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
17369 SDValue(Node.getNode(), 1));
17370 };
17371
17372 SDValue A, B;
17373 SDValue Mask;
  // Unpredicated multiply-accumulate reductions. i16 results are computed at
  // i32 and truncated.
17374 if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17375 return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
17376 if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
17377 return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
17378 if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17379 A, B))
17380 return Create64bitNode(ARMISD::VMLALVs, {A, B});
17381 if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v16i8, MVT::v8i16, MVT::v4i32},
17382 A, B))
17383 return Create64bitNode(ARMISD::VMLALVu, {A, B});
17384 if (IsVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B))
17385 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17386 DAG.getNode(ARMISD::VMLAVs, dl, MVT::i32, A, B));
17387 if (IsVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B))
17388 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17389 DAG.getNode(ARMISD::VMLAVu, dl, MVT::i32, A, B));
17390
  // Predicated multiply-accumulate reductions. The i64 forms do not accept
  // v16i8 inputs here, unlike the unpredicated ones above.
17391 if (IsPredVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17392 Mask))
17393 return DAG.getNode(ARMISD::VMLAVps, dl, ResVT, A, B, Mask);
17394 if (IsPredVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B,
17395 Mask))
17396 return DAG.getNode(ARMISD::VMLAVpu, dl, ResVT, A, B, Mask);
17397 if (IsPredVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17398 Mask))
17399 return Create64bitNode(ARMISD::VMLALVps, {A, B, Mask});
17400 if (IsPredVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B,
17401 Mask))
17402 return Create64bitNode(ARMISD::VMLALVpu, {A, B, Mask});
17403 if (IsPredVMLAV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, A, B, Mask))
17404 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17405 DAG.getNode(ARMISD::VMLAVps, dl, MVT::i32, A, B, Mask));
17406 if (IsPredVMLAV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, A, B, Mask))
17407 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17408 DAG.getNode(ARMISD::VMLAVpu, dl, MVT::i32, A, B, Mask));
17409
  // Unpredicated add reductions.
17410 if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
17411 return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
17412 if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
17413 return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
17414 if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
17415 return Create64bitNode(ARMISD::VADDLVs, {A});
17416 if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
17417 return Create64bitNode(ARMISD::VADDLVu, {A});
17418 if (SDValue A = IsVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}))
17419 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17420 DAG.getNode(ARMISD::VADDVs, dl, MVT::i32, A));
17421 if (SDValue A = IsVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}))
17422 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17423 DAG.getNode(ARMISD::VADDVu, dl, MVT::i32, A));
17424
  // Predicated add reductions.
17425 if (SDValue A = IsPredVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17426 return DAG.getNode(ARMISD::VADDVps, dl, ResVT, A, Mask);
17427 if (SDValue A = IsPredVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, Mask))
17428 return DAG.getNode(ARMISD::VADDVpu, dl, ResVT, A, Mask);
17429 if (SDValue A = IsPredVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}, Mask))
17430 return Create64bitNode(ARMISD::VADDLVps, {A, Mask});
17431 if (SDValue A = IsPredVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}, Mask))
17432 return Create64bitNode(ARMISD::VADDLVpu, {A, Mask});
17433 if (SDValue A = IsPredVADDV(MVT::i16, ISD::SIGN_EXTEND, {MVT::v16i8}, Mask))
17434 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17435 DAG.getNode(ARMISD::VADDVps, dl, MVT::i32, A, Mask));
17436 if (SDValue A = IsPredVADDV(MVT::i16, ISD::ZERO_EXTEND, {MVT::v16i8}, Mask))
17437 return DAG.getNode(ISD::TRUNCATE, dl, ResVT,
17438 DAG.getNode(ARMISD::VADDVpu, dl, MVT::i32, A, Mask));
17439
17440 // Some complications. We can get a case where the two inputs of the mul are
17441 // the same, then the output sext will have been helpfully converted to a
17442 // zext. Turn it back.
17443 SDValue Op = N0;
17444 if (Op->getOpcode() == ISD::VSELECT)
17445 Op = Op->getOperand(1);
17446 if (Op->getOpcode() == ISD::ZERO_EXTEND &&
17447 Op->getOperand(0)->getOpcode() == ISD::MUL) {
17448 SDValue Mul = Op->getOperand(0);
17449 if (Mul->getOperand(0) == Mul->getOperand(1) &&
17450 Mul->getOperand(0)->getOpcode() == ISD::SIGN_EXTEND) {
17451 SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, N0->getValueType(0), Mul);
  // Re-wrap in the original select when the zext was found beneath one.
17452 if (Op != N0)
17453 Ext = DAG.getNode(ISD::VSELECT, dl, N0->getValueType(0),
17454 N0->getOperand(0), Ext, N0->getOperand(2));
17455 return DAG.getNode(ISD::VECREDUCE_ADD, dl, ResVT, Ext);
17456 }
17457 }
17458
17459 return SDValue();
17460}
17461
17462// Looks for vaddv(shuffle) or vmlav(shuffle, shuffle), with a shuffle where all
17463// the lanes are used. Due to the reduction being commutative the shuffle can be
17464// removed.
// NOTE(review): the signature line is absent from this listing (embedded
// numbering skips 17465). The body uses an SDNode *N and a SelectionDAG DAG.
  // The vector operand is operand 0 when that operand is a vector, otherwise
  // operand 2.
17466 unsigned VecOp = N->getOperand(0).getValueType().isVector() ? 0 : 2;
17467 auto *Shuf = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp));
17468 if (!Shuf || !Shuf->getOperand(1).isUndef())
17469 return SDValue();
17470
17471 // Check all elements are used once in the mask.
17472 ArrayRef<int> Mask = Shuf->getMask();
17473 APInt SetElts(Mask.size(), 0);
17474 for (int E : Mask) {
17475 if (E < 0 || E >= (int)Mask.size())
17476 return SDValue();
17477 SetElts.setBit(E);
17478 }
17479 if (!SetElts.isAllOnes())
17480 return SDValue();
17481
  // When a second vector operand follows, it must be a single-input shuffle
  // with the same mask.
17482 if (N->getNumOperands() != VecOp + 1) {
17483 auto *Shuf2 = dyn_cast<ShuffleVectorSDNode>(N->getOperand(VecOp + 1));
17484 if (!Shuf2 || !Shuf2->getOperand(1).isUndef() || Shuf2->getMask() != Mask)
17485 return SDValue();
17486 }
17487
  // NOTE(review): `Ops` is used below but its declaration is not present in
  // this listing (embedded line 17488 is skipped); presumably a container of
  // SDValue -- confirm upstream.
  // Rebuild the node with every vector operand replaced by its operand 0;
  // non-vector operands are kept as they are.
17489 for (SDValue Op : N->ops()) {
17490 if (Op.getValueType().isVector())
17491 Ops.push_back(Op.getOperand(0));
17492 else
17493 Ops.push_back(Op);
17494 }
17495 return DAG.getNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops);
17496}
17497
// NOTE(review): the signature of this function is absent from this listing
// (embedded numbering skips 17498-17499). The body uses an SDNode *N and a
// DAG combiner info object DCI.
//
// Simplifies a VMOVN-style node with operands (Op0, Op1, IsTop): folds undef
// inputs, merges with a bottom VQMOVN feeding operand 1, and otherwise asks
// the generic demanded-elements simplifier to prune lanes that are not read.
17500 SDValue Op0 = N->getOperand(0);
17501 SDValue Op1 = N->getOperand(1);
17502 unsigned IsTop = N->getConstantOperandVal(2);
17503
17504 // VMOVNT a undef -> a
17505 // VMOVNB a undef -> a
17506 // VMOVNB undef a -> a
17507 if (Op1->isUndef())
17508 return Op0;
17509 if (Op0->isUndef() && !IsTop)
17510 return Op1;
17511
17512 // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
17513 // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
17514 if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
17515 Op1->getOpcode() == ARMISD::VQMOVNu) &&
17516 Op1->getConstantOperandVal(2) == 0)
17517 return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
17518 Op0, Op1->getOperand(1), N->getOperand(2));
17519
17520 // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
17521 // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
17522 // into the top or bottom lanes.
17523 unsigned NumElts = N->getValueType(0).getVectorNumElements();
  // Splatting a 2-bit pattern gives an alternating lane mask: low bit set
  // selects even lanes, high bit set selects odd lanes.
17524 APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
17525 APInt Op0DemandedElts =
17526 IsTop ? Op1DemandedElts
17527 : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
17528
17529 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
  // Returning N itself reports that an operand was simplified in place.
17530 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17531 return SDValue(N, 0);
17532 if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, DCI))
17533 return SDValue(N, 0);
17534
17535 return SDValue();
17536}
17537
// NOTE(review): the signature of this function is absent from this listing
// (embedded numbering skips 17538-17539). The body uses an SDNode *N and a
// DAG combiner info object DCI.
//
// Restricts the demanded lanes of operand 0 to every other lane (even lanes
// when IsTop is set, odd lanes otherwise) and lets the generic simplifier act
// on that. Returns N when operand 0 was simplified, else an empty SDValue.
17540 SDValue Op0 = N->getOperand(0);
17541 unsigned IsTop = N->getConstantOperandVal(2);
17542
17543 unsigned NumElts = N->getValueType(0).getVectorNumElements();
17544 APInt Op0DemandedElts =
17545 APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
17546 : APInt::getHighBitsSet(2, 1));
17547
17548 const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
17549 if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, DCI))
17550 return SDValue(N, 0);
17551 return SDValue();
17552}
17553
// NOTE(review): the signature of this function is absent from this listing
// (embedded numbering skips 17554-17555). The body uses an SDNode *N and a
// DAG combiner info object DCI.
//
// Hoists matching single-input shuffles out of a binary node: when both
// operands are shuffles with equal masks and undef second inputs, the
// operation is applied to the unshuffled inputs and the result is shuffled
// once. Returns an empty SDValue otherwise.
17556 EVT VT = N->getValueType(0);
17557 SDValue LHS = N->getOperand(0);
17558 SDValue RHS = N->getOperand(1);
17559
17560 auto *Shuf0 = dyn_cast<ShuffleVectorSDNode>(LHS);
17561 auto *Shuf1 = dyn_cast<ShuffleVectorSDNode>(RHS);
17562 // Turn VQDMULH(shuffle, shuffle) -> shuffle(VQDMULH)
  // Only done when at least one shuffle has a single use, or both operands
  // are the same value.
17563 if (Shuf0 && Shuf1 && Shuf0->getMask().equals(Shuf1->getMask()) &&
17564 LHS.getOperand(1).isUndef() && RHS.getOperand(1).isUndef() &&
17565 (LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
17566 SDLoc DL(N);
17567 SDValue NewBinOp = DCI.DAG.getNode(N->getOpcode(), DL, VT,
17568 LHS.getOperand(0), RHS.getOperand(0));
  // Reuse the existing undef operand as the new shuffle's second input.
17569 SDValue UndefV = LHS.getOperand(1);
17570 return DCI.DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
17571 }
17572 return SDValue();
17573}
17574
// NOTE(review): the signature of this function is absent from this listing
// (embedded numbering skips 17575). The body uses an SDNode *N and a
// SelectionDAG DAG.
//
// Normalizes constant shift amounts on ARMISD::LSLL / ARMISD::LSRL nodes with
// operands (Op0, Op1, amount): a zero shift is replaced by its two inputs,
// and a negative amount in [-32, -1] becomes the opposite shift by the
// positive amount.
17576 SDLoc DL(N);
17577 SDValue Op0 = N->getOperand(0);
17578 SDValue Op1 = N->getOperand(1);
17579
17580 // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
17581 // uses of the intrinsics.
17582 if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
17583 int ShiftAmt = C->getSExtValue();
17584 if (ShiftAmt == 0) {
  // Uses are rewritten directly here, so an empty SDValue is returned
  // rather than a replacement node.
17585 SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
17586 DAG.ReplaceAllUsesWith(N, Merge.getNode());
17587 return SDValue();
17588 }
17589
17590 if (ShiftAmt >= -32 && ShiftAmt < 0) {
17591 unsigned NewOpcode =
17592 N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
17593 SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
17594 DAG.getConstant(-ShiftAmt, DL, MVT::i32));
17595 DAG.ReplaceAllUsesWith(N, NewShift.getNode());
17596 return NewShift;
17597 }
17598 }
17599
17600 return SDValue();
17601}
17602
17603/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
// NOTE(review): the opening line of this signature is absent from this
// listing (embedded numbering skips 17604). Confirm it against upstream.
//
// Dispatches on the intrinsic ID held in operand 0 of N. NEON shift
// intrinsics with constant counts become ARMISD immediate-shift nodes, vbsl
// becomes ARMISD::VBSP, several MVE intrinsics have the unused high bits of
// their scalar operand simplified away, and the MVE addv/addlv intrinsics
// become ARMISD reduction nodes. Returns an empty SDValue when no
// replacement node is produced.
17605 DAGCombinerInfo &DCI) const {
17606 SelectionDAG &DAG = DCI.DAG;
17607 unsigned IntNo = N->getConstantOperandVal(0);
17608 switch (IntNo) {
17609 default:
17610 // Don't do anything for most intrinsics.
17611 break;
17612
17613 // Vector shifts: check for immediate versions and lower them.
17614 // Note: This is done during DAG combining instead of DAG legalizing because
17615 // the build_vectors for 64-bit vector element shift counts are generally
17616 // not legal, and it is hard to see their values after they get legalized to
17617 // loads from a constant pool.
17618 case Intrinsic::arm_neon_vshifts:
17619 case Intrinsic::arm_neon_vshiftu:
17620 case Intrinsic::arm_neon_vrshifts:
17621 case Intrinsic::arm_neon_vrshiftu:
17622 case Intrinsic::arm_neon_vrshiftn:
17623 case Intrinsic::arm_neon_vqshifts:
17624 case Intrinsic::arm_neon_vqshiftu:
17625 case Intrinsic::arm_neon_vqshiftsu:
17626 case Intrinsic::arm_neon_vqshiftns:
17627 case Intrinsic::arm_neon_vqshiftnu:
17628 case Intrinsic::arm_neon_vqshiftnsu:
17629 case Intrinsic::arm_neon_vqrshiftns:
17630 case Intrinsic::arm_neon_vqrshiftnu:
17631 case Intrinsic::arm_neon_vqrshiftnsu: {
17632 EVT VT = N->getOperand(1).getValueType();
17633 int64_t Cnt;
17634 unsigned VShiftOpc = 0;
17635
  // First switch: validate the shift count (operand 2) and extract it into
  // Cnt. Some intrinsics bail out on a non-immediate count; others treat it
  // as unreachable.
17636 switch (IntNo) {
17637 case Intrinsic::arm_neon_vshifts:
17638 case Intrinsic::arm_neon_vshiftu:
17639 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt)) {
17640 VShiftOpc = ARMISD::VSHLIMM;
17641 break;
17642 }
17643 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt)) {
17644 VShiftOpc = (IntNo == Intrinsic::arm_neon_vshifts ? ARMISD::VSHRsIMM
17645 : ARMISD::VSHRuIMM);
17646 break;
17647 }
17648 return SDValue();
17649
17650 case Intrinsic::arm_neon_vrshifts:
17651 case Intrinsic::arm_neon_vrshiftu:
17652 if (isVShiftRImm(N->getOperand(2), VT, false, true, Cnt))
17653 break;
17654 return SDValue();
17655
17656 case Intrinsic::arm_neon_vqshifts:
17657 case Intrinsic::arm_neon_vqshiftu:
17658 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17659 break;
17660 return SDValue();
17661
17662 case Intrinsic::arm_neon_vqshiftsu:
17663 if (isVShiftLImm(N->getOperand(2), VT, false, Cnt))
17664 break;
17665 llvm_unreachable("invalid shift count for vqshlu intrinsic");
17666
17667 case Intrinsic::arm_neon_vrshiftn:
17668 case Intrinsic::arm_neon_vqshiftns:
17669 case Intrinsic::arm_neon_vqshiftnu:
17670 case Intrinsic::arm_neon_vqshiftnsu:
17671 case Intrinsic::arm_neon_vqrshiftns:
17672 case Intrinsic::arm_neon_vqrshiftnu:
17673 case Intrinsic::arm_neon_vqrshiftnsu:
17674 // Narrowing shifts require an immediate right shift.
17675 if (isVShiftRImm(N->getOperand(2), VT, true, true, Cnt))
17676 break;
17677 llvm_unreachable("invalid shift count for narrowing vector shift "
17678 "intrinsic");
17679
17680 default:
17681 llvm_unreachable("unhandled vector shift");
17682 }
17683
  // Second switch: choose the ARMISD immediate-shift opcode for the
  // intrinsic.
17684 switch (IntNo) {
17685 case Intrinsic::arm_neon_vshifts:
17686 case Intrinsic::arm_neon_vshiftu:
17687 // Opcode already set above.
17688 break;
17689 case Intrinsic::arm_neon_vrshifts:
17690 VShiftOpc = ARMISD::VRSHRsIMM;
17691 break;
17692 case Intrinsic::arm_neon_vrshiftu:
17693 VShiftOpc = ARMISD::VRSHRuIMM;
17694 break;
17695 case Intrinsic::arm_neon_vrshiftn:
17696 VShiftOpc = ARMISD::VRSHRNIMM;
17697 break;
17698 case Intrinsic::arm_neon_vqshifts:
17699 VShiftOpc = ARMISD::VQSHLsIMM;
17700 break;
17701 case Intrinsic::arm_neon_vqshiftu:
17702 VShiftOpc = ARMISD::VQSHLuIMM;
17703 break;
17704 case Intrinsic::arm_neon_vqshiftsu:
17705 VShiftOpc = ARMISD::VQSHLsuIMM;
17706 break;
17707 case Intrinsic::arm_neon_vqshiftns:
17708 VShiftOpc = ARMISD::VQSHRNsIMM;
17709 break;
17710 case Intrinsic::arm_neon_vqshiftnu:
17711 VShiftOpc = ARMISD::VQSHRNuIMM;
17712 break;
17713 case Intrinsic::arm_neon_vqshiftnsu:
17714 VShiftOpc = ARMISD::VQSHRNsuIMM;
17715 break;
17716 case Intrinsic::arm_neon_vqrshiftns:
17717 VShiftOpc = ARMISD::VQRSHRNsIMM;
17718 break;
17719 case Intrinsic::arm_neon_vqrshiftnu:
17720 VShiftOpc = ARMISD::VQRSHRNuIMM;
17721 break;
17722 case Intrinsic::arm_neon_vqrshiftnsu:
17723 VShiftOpc = ARMISD::VQRSHRNsuIMM;
17724 break;
17725 }
17726
17727 SDLoc dl(N);
17728 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17729 N->getOperand(1), DAG.getConstant(Cnt, dl, MVT::i32));
17730 }
17731
  // Shift-and-insert: the count is operand 3; operands 1 and 2 are passed
  // through to the new node.
17732 case Intrinsic::arm_neon_vshiftins: {
17733 EVT VT = N->getOperand(1).getValueType();
17734 int64_t Cnt;
17735 unsigned VShiftOpc = 0;
17736
17737 if (isVShiftLImm(N->getOperand(3), VT, false, Cnt))
17738 VShiftOpc = ARMISD::VSLIIMM;
17739 else if (isVShiftRImm(N->getOperand(3), VT, false, true, Cnt))
17740 VShiftOpc = ARMISD::VSRIIMM;
17741 else {
17742 llvm_unreachable("invalid shift count for vsli/vsri intrinsic");
17743 }
17744
17745 SDLoc dl(N);
17746 return DAG.getNode(VShiftOpc, dl, N->getValueType(0),
17747 N->getOperand(1), N->getOperand(2),
17748 DAG.getConstant(Cnt, dl, MVT::i32));
17749 }
17750
17751 case Intrinsic::arm_neon_vqrshifts:
17752 case Intrinsic::arm_neon_vqrshiftu:
17753 // No immediate versions of these to check for.
17754 break;
17755
17756 case Intrinsic::arm_neon_vbsl: {
17757 SDLoc dl(N);
17758 return DAG.getNode(ARMISD::VBSP, dl, N->getValueType(0), N->getOperand(1),
17759 N->getOperand(2), N->getOperand(3));
17760 }
17761 case Intrinsic::arm_mve_vqdmlah:
17762 case Intrinsic::arm_mve_vqdmlash:
17763 case Intrinsic::arm_mve_vqrdmlah:
17764 case Intrinsic::arm_mve_vqrdmlash:
17765 case Intrinsic::arm_mve_vmla_n_predicated:
17766 case Intrinsic::arm_mve_vmlas_n_predicated:
17767 case Intrinsic::arm_mve_vqdmlah_predicated:
17768 case Intrinsic::arm_mve_vqdmlash_predicated:
17769 case Intrinsic::arm_mve_vqrdmlah_predicated:
17770 case Intrinsic::arm_mve_vqrdmlash_predicated: {
17771 // These intrinsics all take an i32 scalar operand which is narrowed to the
17772 // size of a single lane of the vector type they return. So we don't need
17773 // any bits of that operand above that point, which allows us to eliminate
17774 // uxth/sxth.
17775 unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
17776 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
  // Returns an empty SDValue whether or not the operand was simplified; no
  // replacement node is built for these intrinsics.
17777 if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
17778 return SDValue();
17779 break;
17780 }
17781
17782 case Intrinsic::arm_mve_minv:
17783 case Intrinsic::arm_mve_maxv:
17784 case Intrinsic::arm_mve_minav:
17785 case Intrinsic::arm_mve_maxav:
17786 case Intrinsic::arm_mve_minv_predicated:
17787 case Intrinsic::arm_mve_maxv_predicated:
17788 case Intrinsic::arm_mve_minav_predicated:
17789 case Intrinsic::arm_mve_maxav_predicated: {
17790 // These intrinsics all take an i32 scalar operand which is narrowed to the
17791 // size of a single lane of the vector type they take as the other input.
17792 unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
17793 APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
17794 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
17795 return SDValue();
17796 break;
17797 }
17798
17799 case Intrinsic::arm_mve_addv: {
17800 // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
17801 // which allow PerformADDVecReduce to turn it into VADDLV when possible.
17802 bool Unsigned = N->getConstantOperandVal(2);
17803 unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
17804 return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
17805 }
17806
17807 case Intrinsic::arm_mve_addlv:
17808 case Intrinsic::arm_mve_addlv_predicated: {
17809 // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
17810 // which recombines the two outputs into an i64
17811 bool Unsigned = N->getConstantOperandVal(2);
17812 unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
17813 (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
17814 (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
17815
  // NOTE(review): `Ops` is used below but its declaration is not present in
  // this listing (embedded line 17816 is skipped); presumably a container of
  // SDValue -- confirm upstream.
17817 for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
17818 if (i != 2) // skip the unsigned flag
17819 Ops.push_back(N->getOperand(i));
17820
17821 SDLoc dl(N);
17822 SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
17823 return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
17824 val.getValue(1));
17825 }
17826 }
17827
17828 return SDValue();
17829}
17830
// NOTE(review): the signature of this member function is absent from this
// listing (embedded numbering skips 17831). The body takes an SDValue Y and
// reads the Subtarget member.
//
// Scalars defer to hasAndNotCompare(Y). Vectors are accepted when they are
// 128-bit under MVE integer ops, or 64/128-bit under NEON; the MVE check
// takes precedence when both features are present.
17832 EVT VT = Y.getValueType();
17833 if (!VT.isVector())
17834 return hasAndNotCompare(Y);
17835 if (Subtarget->hasMVEIntegerOps())
17836 return VT.is128BitVector();
17837 if (Subtarget->hasNEON())
17838 return VT.is64BitVector() || VT.is128BitVector();
17839 return false;
17840}
17841
17842/// PerformShiftCombine - Checks for immediate versions of vector shifts and
17843/// lowers them. As with the vector shift intrinsics, this is done during DAG
17844/// combining instead of DAG legalizing because the build_vectors for 64-bit
17845/// vector element shift counts are generally not legal, and it is hard to see
17846/// their values after they get legalized to loads from a constant pool.
// NOTE(review): the first two lines of this signature are absent from this
// listing (embedded numbering skips 17847-17848). The body uses an SDNode *N
// and a DAG combiner info object DCI in addition to ST.
//
// Besides the vector lowering described above, the body begins with a
// Thumb1-only scalar rewrite of (shl (and x, mask), amt).
17849 const ARMSubtarget *ST) {
17850 SelectionDAG &DAG = DCI.DAG;
17851 EVT VT = N->getValueType(0);
17852
17853 if (ST->isThumb1Only() && N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
17854 N->getOperand(0)->getOpcode() == ISD::AND &&
17855 N->getOperand(0)->hasOneUse()) {
17856 if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
17857 return SDValue();
17858 // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
17859 // usually show up because instcombine prefers to canonicalize it to
17860 // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
17861 // out of GEP lowering in some cases.
17862 SDValue N0 = N->getOperand(0);
17863 ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
17864 if (!ShiftAmtNode)
17865 return SDValue();
17866 uint32_t ShiftAmt = static_cast<uint32_t>(ShiftAmtNode->getZExtValue());
17867 ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
17868 if (!AndMaskNode)
17869 return SDValue();
17870 uint32_t AndMask = static_cast<uint32_t>(AndMaskNode->getZExtValue());
17871 // Don't transform uxtb/uxth.
17872 if (AndMask == 255 || AndMask == 65535)
17873 return SDValue();
  // For a contiguous low-bit mask, emit (srl (shl x, MaskedBits),
  // MaskedBits - ShiftAmt), where MaskedBits is the mask's leading-zero
  // count. Only done when MaskedBits exceeds ShiftAmt.
17874 if (isMask_32(AndMask)) {
17875 uint32_t MaskedBits = llvm::countl_zero(AndMask);
17876 if (MaskedBits > ShiftAmt) {
17877 SDLoc DL(N);
17878 SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
17879 DAG.getConstant(MaskedBits, DL, MVT::i32));
17880 return DAG.getNode(
17881 ISD::SRL, DL, MVT::i32, SHL,
17882 DAG.getConstant(MaskedBits - ShiftAmt, DL, MVT::i32));
17883 }
17884 }
17885 }
17886
17887 // Nothing to be done for scalar shifts.
17888 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
17889 if (!VT.isVector() || !TLI.isTypeLegal(VT))
17890 return SDValue();
  // The immediate-shift rewrite below is skipped when MVE integer ops are
  // available.
17891 if (ST->hasMVEIntegerOps())
17892 return SDValue();
17893
17894 int64_t Cnt;
17895
17896 switch (N->getOpcode()) {
17897 default: llvm_unreachable("unexpected shift opcode");
17898
17899 case ISD::SHL:
17900 if (isVShiftLImm(N->getOperand(1), VT, false, Cnt)) {
17901 SDLoc dl(N);
17902 return DAG.getNode(ARMISD::VSHLIMM, dl, VT, N->getOperand(0),
17903 DAG.getConstant(Cnt, dl, MVT::i32));
17904 }
17905 break;
17906
17907 case ISD::SRA:
17908 case ISD::SRL:
17909 if (isVShiftRImm(N->getOperand(1), VT, false, false, Cnt)) {
17910 unsigned VShiftOpc =
17911 (N->getOpcode() == ISD::SRA ? ARMISD::VSHRsIMM : ARMISD::VSHRuIMM);
17912 SDLoc dl(N);
17913 return DAG.getNode(VShiftOpc, dl, VT, N->getOperand(0),
17914 DAG.getConstant(Cnt, dl, MVT::i32));
17915 }
17916 }
17917 return SDValue();
17918}
17919
17920// Look for a sign/zero/fpextend extend of a larger than legal load. This can be
17921// split into multiple extending loads, which are simpler to deal with than an
17922// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
17923// to convert the type to an f32.
// NOTE(review): several lines of this function are absent from this listing:
// the signature (17924), the definition of `LD` (17928), line 17936, the
// declarations of `Loads` and `Chains` (17968-17969) and the declaration of
// `Extends` (17985). Confirm each against the upstream file.
17925 SDValue N0 = N->getOperand(0);
17926 if (N0.getOpcode() != ISD::LOAD)
17927 return SDValue();
  // Only a simple, single-use, unindexed, non-extending load is split.
17929 if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
17930 LD->getExtensionType() != ISD::NON_EXTLOAD)
17931 return SDValue();
17932 EVT FromVT = LD->getValueType(0);
17933 EVT ToVT = N->getValueType(0);
17934 if (!ToVT.isVector())
17935 return SDValue();
17937 EVT ToEltVT = ToVT.getVectorElementType();
17938 EVT FromEltVT = FromVT.getVectorElementType();
17939
  // Only i8->i32 and f16->f32 are handled, four lanes per piece. The source
  // lane count must be a multiple of four; an integer source with exactly
  // four lanes is left alone.
17940 unsigned NumElements = 0;
17941 if (ToEltVT == MVT::i32 && FromEltVT == MVT::i8)
17942 NumElements = 4;
17943 if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
17944 NumElements = 4;
17945 if (NumElements == 0 ||
17946 (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
17947 FromVT.getVectorNumElements() % NumElements != 0 ||
17948 !isPowerOf2_32(NumElements))
17949 return SDValue();
17950
17951 LLVMContext &C = *DAG.getContext();
17952 SDLoc DL(LD);
17953 // Details about the old load
17954 SDValue Ch = LD->getChain();
17955 SDValue BasePtr = LD->getBasePtr();
17956 Align Alignment = LD->getBaseAlign();
17957 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
17958 AAMDNodes AAInfo = LD->getAAInfo();
17959
  // Anything other than SIGN_EXTEND, including the fp-extend case, uses a
  // zero-extending load. Memory and result types are integer vectors of the
  // source and destination element widths.
17960 ISD::LoadExtType NewExtType =
17961 N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
17962 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
17963 EVT NewFromVT = EVT::getVectorVT(
17964 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
17965 EVT NewToVT = EVT::getVectorVT(
17966 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
17967
  // Emit one extending load per NumElements-lane piece, each at the byte
  // offset of its piece, collecting the values and their output chains.
17970 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
17971 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
17972 SDValue NewPtr =
17973 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
17974
17975 SDValue NewLoad =
17976 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
17977 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
17978 Alignment, MMOFlags, AAInfo);
17979 Loads.push_back(NewLoad);
17980 Chains.push_back(SDValue(NewLoad.getNode(), 1));
17981 }
17982
17983 // f16 sources were loaded as integer lanes: reinterpret each piece as v8f16
  // and widen it to v4f32 with an ARMISD::VCVTL node.
17984 if (FromEltVT == MVT::f16) {
17986
17987 for (unsigned i = 0; i < Loads.size(); i++) {
17988 SDValue LoadBC =
17989 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
17990 SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
17991 DAG.getConstant(0, DL, MVT::i32));
17992 Extends.push_back(FPExt);
17993 }
17994
17995 Loads = Extends;
17996 }
17997
  // Redirect users of the old load's chain to a TokenFactor of the new
  // chains, then concatenate the pieces into the requested result type.
17998 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
17999 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18000 return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
18001}
18002
18003/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
18004/// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
// NOTE(review): the opening line of this signature is absent from this
// listing (embedded numbering skips 18005); lines 18016 and 18050 are also
// missing and are flagged where they occur below.
//
// Tries three rewrites in order: an extend of an 8/16-bit vector lane extract
// becomes an ARMISD::VGETLANEs/VGETLANEu node; under MVE, an extend of a wide
// load is split into extending loads; and under MVE, an extend of an i1
// BUILD_VECTOR is rebuilt from per-element extends.
18006 const ARMSubtarget *ST) {
18007 SDValue N0 = N->getOperand(0);
18008 EVT VT = N->getValueType(0);
18009 SDLoc DL(N);
18010
18011 // Check for sign- and zero-extensions of vector extract operations of 8- and
18012 // 16-bit vector elements. NEON and MVE support these directly. They are
18013 // handled during DAG combining because type legalization will promote them
18014 // to 32-bit types and it is messy to recognize the operations after that.
  // NOTE(review): the rest of this condition and the opening brace are
  // missing from the listing (embedded line 18016 is skipped); per the
  // comment above it presumably tests N0 for a vector element extract --
  // confirm upstream.
18015 if ((ST->hasNEON() || ST->hasMVEIntegerOps()) &&
18017 SDValue Vec = N0.getOperand(0);
18018 SDValue Lane = N0.getOperand(1);
18019 EVT EltVT = N0.getValueType();
18020 const TargetLowering &TLI = DAG.getTargetLoweringInfo();
18021
18022 if (VT == MVT::i32 &&
18023 (EltVT == MVT::i8 || EltVT == MVT::i16) &&
18024 TLI.isTypeLegal(Vec.getValueType()) &&
18025 isa<ConstantSDNode>(Lane)) {
18026
  // ANY_EXTEND shares the unsigned lane read with ZERO_EXTEND.
18027 unsigned Opc = 0;
18028 switch (N->getOpcode()) {
18029 default: llvm_unreachable("unexpected opcode");
18030 case ISD::SIGN_EXTEND:
18031 Opc = ARMISD::VGETLANEs;
18032 break;
18033 case ISD::ZERO_EXTEND:
18034 case ISD::ANY_EXTEND:
18035 Opc = ARMISD::VGETLANEu;
18036 break;
18037 }
18038 return DAG.getNode(Opc, DL, VT, Vec, Lane);
18039 }
18040 }
18041
18042 if (ST->hasMVEIntegerOps())
18043 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
18044 return NewLoad;
18045
18046 // Combine sext(buildvector(..)) to buildvector(sext(..)) to help avoid
18047 // difficult to lower i1 buildvector.
18048 if (ST->hasMVEIntegerOps() && N0.getValueType().getScalarSizeInBits() == 1 &&
18049 N0.getOpcode() == ISD::BUILD_VECTOR && VT.getScalarSizeInBits() <= 32) {
  // NOTE(review): `Ops` is used below but its declaration is not present in
  // this listing (embedded line 18050 is skipped); presumably a container of
  // SDValue -- confirm upstream.
18051 for (unsigned I = 0; I < N0.getNumOperands(); I++) {
18052 SDValue InReg = N0.getOperand(I);
  // Normalize each element first: mask to bit 0 for zero-extension, or
  // sign-extend in register from i1 for sign-extension.
18053 if (N->getOpcode() == ISD::ZERO_EXTEND)
18054 InReg = DAG.getNode(ISD::AND, DL, InReg.getValueType(), InReg,
18055 DAG.getConstant(1, DL, InReg.getValueType()));
18056 else if (N->getOpcode() == ISD::SIGN_EXTEND)
18057 InReg = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, InReg.getValueType(),
18058 InReg, DAG.getValueType(MVT::i1));
18059 SDValue Ext = DAG.getNode(N->getOpcode(), DL, MVT::i32, InReg);
18060 Ops.push_back(Ext);
18061 }
18062 return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Ops);
18063 }
18064
18065 return SDValue();
18066}
18067
// Tries PerformSplittingToWideningLoad on N when the subtarget has MVE
// floating-point ops. Returns that result if it is non-empty, and an empty
// SDValue otherwise.
// NOTE(review): source line 18068 (the function name and leading parameters)
// is not present in this listing.
18069 const ARMSubtarget *ST) {
18070 if (ST->hasMVEFloatOps())
18071 if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
18072 return NewLoad;
18073
18074 return SDValue();
18075}
18076
18077// Lower smin(smax(x, C1), C2) to ssat or usat, if they have saturating
18078// constant bounds.
//
// Both nestings, smin(smax(x, Lo), Hi) and smax(smin(x, Hi), Lo), are accepted
// for i32 values with constant bounds. An empty SDValue is returned when the
// pattern does not match or the subtarget check below rejects the target.
// NOTE(review): source line 18079, the first line of the signature, is not
// present in this listing.
18080 const ARMSubtarget *Subtarget) {
      // Continue only for Thumb2, or for non-Thumb code with v6 ops.
18081 if ((Subtarget->isThumb() || !Subtarget->hasV6Ops()) &&
18082 !Subtarget->isThumb2())
18083 return SDValue();
18084
18085 EVT VT = Op.getValueType();
18086 SDValue Op0 = Op.getOperand(0);
18087
18088 if (VT != MVT::i32 ||
18089 (Op0.getOpcode() != ISD::SMIN && Op0.getOpcode() != ISD::SMAX) ||
18090 !isa<ConstantSDNode>(Op.getOperand(1)) ||
      // NOTE(review): source line 18091 (the last clause of this condition) is
      // not present in this listing.
18092 return SDValue();
18093
      // Name the outer and inner nodes so that Min is the SMIN and Max is the
      // SMAX, whichever way round they are nested.
18094 SDValue Min = Op;
18095 SDValue Max = Op0;
18096 SDValue Input = Op0.getOperand(0);
18097 if (Min.getOpcode() == ISD::SMAX)
18098 std::swap(Min, Max);
18099
18100 if (Min.getOpcode() != ISD::SMIN || Max.getOpcode() != ISD::SMAX)
18101 return SDValue();
18102
      // MinC is the constant of the SMIN (the upper bound of the clamp) and
      // MaxC the constant of the SMAX (the lower bound). Give up when the lower
      // bound is above the upper bound.
18103 APInt MinC = Min.getConstantOperandAPInt(1);
18104 APInt MaxC = Max.getConstantOperandAPInt(1);
18105 if (MaxC.sgt(MinC))
18106 return SDValue();
18107
18108 SDLoc DL(Op);
18109
18110 // A clamp whose bounds are already a saturation range maps to a single
18111 // SSAT / USAT.
      // Upper bound 2^k - 1 with lower bound ~(2^k - 1) gives SSAT; upper bound
      // 2^k - 1 with lower bound 0 gives USAT. k is passed as the second operand.
18112 if ((MinC + 1).isPowerOf2()) {
18113 if (MinC == ~MaxC)
18114 return DAG.getNode(ARMISD::SSAT, DL, VT, Input,
18115 DAG.getConstant(MinC.countr_one(), DL, VT));
18116 if (MaxC == 0)
18117 return DAG.getNode(ARMISD::USAT, DL, VT, Input,
18118 DAG.getConstant(MinC.countr_one(), DL, VT));
18119 }
18120
18121 // For power-of-two clamp widths, convert the range to be zero-centered,
18122 // apply SSAT, and convert the result back.
18123 //
18124 // Width = Hi - Lo + 1
18125 // Center = Lo + Width / 2
18126 // Result = ssat(X - Center) + Center
18127 //
18128 // The idea is to shift the input so that the clamp range is centered
18129 // around zero, apply ssat, and then shift the result back.
18130 //
18131 // For example clamp(X, -118, 137) -> Width = 256, Center = 10, so it becomes
18132 // ssat(X - 10, 8) + 10
18133
      // In the formulas above Hi is MinC and Lo is MaxC. A width of one is
      // rejected along with widths that are not a power of two.
18134 APInt Width = MinC - MaxC + 1;
18135 if (!Width.isPowerOf2() || Width.isOne())
18136 return SDValue();
18137 unsigned SatBit = Width.logBase2() - 1; // ssat to SatBit + 1 signed bits
18138 APInt Center = MaxC + Width.lshr(1);
18139
18140 // The rewrite is only valid when X - Center does not overflow;
18141 SDValue NegC = DAG.getConstant(-Center, DL, VT);
      // NOTE(review): source line 18142, the condition guarding the return
      // below, is not present in this listing.
18143 return SDValue();
18144
      // Build ssat(Input + (-Center), SatBit) + Center.
18145 SDValue Shifted = DAG.getNode(ISD::ADD, DL, VT, Input, NegC);
18146 SDValue Sat = DAG.getNode(ARMISD::SSAT, DL, VT, Shifted,
18147 DAG.getConstant(SatBit, DL, VT));
18148 return DAG.getNode(ISD::ADD, DL, VT, Sat, DAG.getConstant(Center, DL, VT));
18149}
18150
18151/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
18152/// saturates.
///
/// i32 nodes are forwarded to PerformMinMaxToSatCombine. For other types MVE
/// integer ops are required: PerformVQDMULHCombine is tried first, and then
/// v4i32 / v8i16 nodes that clamp to the signed or unsigned range of the
/// half-width element are rewritten to ARMISD::VQMOVNs / ARMISD::VQMOVNu.
/// Returns an empty SDValue when nothing matches.
// NOTE(review): source line 18153, the first line of the signature, is not
// present in this listing.
18154 const ARMSubtarget *ST) {
18155 EVT VT = N->getValueType(0);
18156 SDValue N0 = N->getOperand(0);
18157
18158 if (VT == MVT::i32)
18159 return PerformMinMaxToSatCombine(SDValue(N, 0), DAG, ST);
18160
18161 if (!ST->hasMVEIntegerOps())
18162 return SDValue();
18163
18164 if (SDValue V = PerformVQDMULHCombine(N, DAG))
18165 return V;
18166
18167 if (VT != MVT::v4i32 && VT != MVT::v8i16)
18168 return SDValue();
18169
      // True when the two nodes are an smin/smax pair (in either order) whose
      // splat constants are the maximum and minimum of the signed half-width
      // element: 2^15 - 1 and its complement for v4i32, 2^7 - 1 and its
      // complement for v8i16.
18170 auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
18171 // Check one is a smin and the other is a smax
18172 if (Min->getOpcode() != ISD::SMIN)
18173 std::swap(Min, Max);
18174 if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
18175 return false;
18176
18177 APInt SaturateC;
18178 if (VT == MVT::v4i32)
18179 SaturateC = APInt(32, (1 << 15) - 1, true);
18180 else //if (VT == MVT::v8i16)
18181 SaturateC = APInt(16, (1 << 7) - 1, true);
18182
18183 APInt MinC, MaxC;
18184 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18185 MinC != SaturateC)
18186 return false;
18187 if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
18188 MaxC != ~SaturateC)
18189 return false;
18190 return true;
18191 };
18192
18193 if (IsSignedSaturate(N, N0.getNode())) {
18194 SDLoc DL(N);
      // HalfVT is the vector type with twice as many half-width lanes; ExtVT is
      // the narrow type the result is sign-extended from.
18195 MVT ExtVT, HalfVT;
18196 if (VT == MVT::v4i32) {
18197 HalfVT = MVT::v8i16;
18198 ExtVT = MVT::v4i16;
18199 } else { // if (VT == MVT::v8i16)
18200 HalfVT = MVT::v16i8;
18201 ExtVT = MVT::v8i8;
18202 }
18203
18204 // Create a VQMOVNB with undef top lanes, then sign extend into the top
18205 // half. That extend will hopefully be removed if only the bottom bits are
18206 // demanded (through a truncating store, for example).
18207 SDValue VQMOVN =
18208 DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
18209 N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
18210 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18211 return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
18212 DAG.getValueType(ExtVT));
18213 }
18214
      // True when the node is a umin against a splat of the unsigned half-width
      // maximum.
18215 auto IsUnsignedSaturate = [&](SDNode *Min) {
18216 // For unsigned, we just need to check for <= 0xffff (v4i32) or <= 0xff (v8i16)
18217 if (Min->getOpcode() != ISD::UMIN)
18218 return false;
18219
18220 APInt SaturateC;
18221 if (VT == MVT::v4i32)
18222 SaturateC = APInt(32, (1 << 16) - 1, true);
18223 else //if (VT == MVT::v8i16)
18224 SaturateC = APInt(16, (1 << 8) - 1, true);
18225
18226 APInt MinC;
18227 if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
18228 MinC != SaturateC)
18229 return false;
18230 return true;
18231 };
18232
18233 if (IsUnsignedSaturate(N)) {
18234 SDLoc DL(N);
18235 MVT HalfVT;
      // ExtConst masks off everything above the half-width element.
18236 unsigned ExtConst;
18237 if (VT == MVT::v4i32) {
18238 HalfVT = MVT::v8i16;
18239 ExtConst = 0x0000FFFF;
18240 } else { //if (VT == MVT::v8i16)
18241 HalfVT = MVT::v16i8;
18242 ExtConst = 0x00FF;
18243 }
18244
18245 // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
18246 // an AND. That extend will hopefully be removed if only the bottom bits are
18247 // demanded (through a truncating store, for example).
18248 SDValue VQMOVN =
18249 DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
18250 DAG.getConstant(0, DL, MVT::i32));
18251 SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
18252 return DAG.getNode(ISD::AND, DL, VT, Bitcast,
18253 DAG.getConstant(ExtConst, DL, VT));
18254 }
18255
18256 return SDValue();
18257}
18258
// Returns a pointer to the APInt held by the constant node C when that value
// is a power of two. Returns nullptr when C is null or the value is not a
// power of two.
// NOTE(review): source lines 18259-18260 (the signature and the definition of
// C) are not present in this listing.
18261 if (!C)
18262 return nullptr;
18263 const APInt *CV = &C->getAPIntValue();
18264 return CV->isPowerOf2() ? CV : nullptr;
18265}
18266
// NOTE(review): source line 18267, the function signature, is not present in
// this listing. The body reads a node named CMOV, a SelectionDAG named DAG and
// a Subtarget pointer.
18268 // If we have a CMOV, OR and AND combination such as:
18269 // if (x & CN)
18270 // y |= CM;
18271 //
18272 // And:
18273 // * CN is a single bit;
18274 // * All bits covered by CM are known zero in y
18275 //
18276 // Then we can convert this into a sequence of BFI instructions. This will
18277 // always be a win if CM is a single bit, will always be no worse than the
18278 // TST&OR sequence if CM is two bits, and for thumb will be no worse if CM is
18279 // three bits (due to the extra IT instruction).
18280
      // CMOV operands as read here: 0 and 1 are the two values, 2 is the
      // condition code, 3 is the compare.
18281 SDValue Op0 = CMOV->getOperand(0);
18282 SDValue Op1 = CMOV->getOperand(1);
18283 auto CC = CMOV->getConstantOperandAPInt(2).getLimitedValue();
18284 SDValue CmpZ = CMOV->getOperand(3);
18285
18286 // The compare must be against zero.
18287 if (!isNullConstant(CmpZ->getOperand(1)))
18288 return SDValue();
18289
18290 assert(CmpZ->getOpcode() == ARMISD::CMPZ);
      // The compared value must be (and X, single-bit constant).
18291 SDValue And = CmpZ->getOperand(0);
18292 if (And->getOpcode() != ISD::AND)
18293 return SDValue();
18294 const APInt *AndC = isPowerOf2Constant(And->getOperand(1));
18295 if (!AndC)
18296 return SDValue();
18297 SDValue X = And->getOperand(0);
18298
18299 if (CC == ARMCC::EQ) {
18300 // We're performing an "equal to zero" compare. Swap the operands so we
18301 // canonicalize on a "not equal to zero" compare.
18302 std::swap(Op0, Op1);
18303 } else {
18304 assert(CC == ARMCC::NE && "How can a CMPZ node not be EQ or NE?");
18305 }
18306
18307 if (Op1->getOpcode() != ISD::OR)
18308 return SDValue();
18309
      // NOTE(review): source line 18310, which defines OrC, is not present in
      // this listing.
18311 if (!OrC)
18312 return SDValue();
18313 SDValue Y = Op1->getOperand(0);
18314
      // The other CMOV value must be the same Y that is being ORed.
18315 if (Op0 != Y)
18316 return SDValue();
18317
18318 // Now, is it profitable to continue?
      // Allow at most three set bits in the OR constant on Thumb, two otherwise.
18319 APInt OrCI = OrC->getAPIntValue();
18320 unsigned Heuristic = Subtarget->isThumb() ? 3 : 2;
18321 if (OrCI.popcount() > Heuristic)
18322 return SDValue();
18323
18324 // Lastly, can we determine that the bits defined by OrCI
18325 // are zero in Y?
18326 KnownBits Known = DAG.computeKnownBits(Y);
18327 if ((OrCI & Known.Zero) != OrCI)
18328 return SDValue();
18329
18330 // OK, we can do the combine.
18331 SDValue V = Y;
18332 SDLoc dl(X);
18333 EVT VT = X.getValueType();
18334 unsigned BitInX = AndC->logBase2();
18335
18336 if (BitInX != 0) {
18337 // We must shift X first.
18338 X = DAG.getNode(ISD::SRL, dl, VT, X,
18339 DAG.getConstant(BitInX, dl, VT));
18340 }
18341
      // Emit one BFI per set bit of the OR constant, chaining the result
      // through V.
18342 for (unsigned BitInY = 0, NumActiveBits = OrCI.getActiveBits();
18343 BitInY < NumActiveBits; ++BitInY) {
18344 if (OrCI[BitInY] == 0)
18345 continue;
18346 APInt Mask(VT.getSizeInBits(), 0);
18347 Mask.setBit(BitInY);
18348 V = DAG.getNode(ARMISD::BFI, dl, VT, V, X,
18349 // Confusingly, the operand is an *inverted* mask.
18350 DAG.getConstant(~Mask, dl, VT));
18351 }
18352
18353 return V;
18354}
18355
18356// Given N, the value controlling the conditional branch, search for the loop
18357// intrinsic, returning it, along with how the value is used. We need to handle
18358// patterns such as the following:
18359// (brcond (xor (setcc (loop.decrement), 0, ne), 1), exit)
18360// (brcond (setcc (loop.decrement), 0, eq), exit)
18361// (brcond (setcc (loop.decrement), 0, ne), header)
//
// CC, Imm and Negate are updated while walking down through N:
//  - an XOR with the constant 1 flips Negate and recurses into operand 0;
//  - a SETCC against the constant 0 or 1 stores that constant in Imm and the
//    condition code in CC, then recurses into operand 0;
//  - the final case returns N when its intrinsic id (operand 1) is
//    test_start_loop_iterations or loop_decrement_reg.
// Every other shape yields an empty SDValue.
// NOTE(review): source line 18362, the first line of the signature, is not
// present in this listing.
18363 bool &Negate) {
18364 switch (N->getOpcode()) {
18365 default:
18366 break;
18367 case ISD::XOR: {
18368 if (!isa<ConstantSDNode>(N.getOperand(1)))
18369 return SDValue();
18370 if (!cast<ConstantSDNode>(N.getOperand(1))->isOne())
18371 return SDValue();
18372 Negate = !Negate;
18373 return SearchLoopIntrinsic(N.getOperand(0), CC, Imm, Negate);
18374 }
18375 case ISD::SETCC: {
18376 auto *Const = dyn_cast<ConstantSDNode>(N.getOperand(1));
18377 if (!Const)
18378 return SDValue();
18379 if (Const->isZero())
18380 Imm = 0;
18381 else if (Const->isOne())
18382 Imm = 1;
18383 else
18384 return SDValue();
18385 CC = cast<CondCodeSDNode>(N.getOperand(2))->get();
18386 return SearchLoopIntrinsic(N->getOperand(0), CC, Imm, Negate);
18387 }
      // NOTE(review): source line 18388, the case label that opens this block,
      // is not present in this listing.
18389 unsigned IntOp = N.getConstantOperandVal(1);
18390 if (IntOp != Intrinsic::test_start_loop_iterations &&
18391 IntOp != Intrinsic::loop_decrement_reg)
18392 return SDValue();
18393 return N;
18394 }
18395 }
18396 return SDValue();
18397}
18398
// Rewrites a BRCOND or BR_CC whose condition comes from a hardware-loop
// intrinsic into ARMISD::WLS (for test.start.loop.iterations) or
// ARMISD::LOOP_DEC + ARMISD::LE (for loop.decrement.reg). Returns an empty
// SDValue when no such intrinsic is found via SearchLoopIntrinsic.
// NOTE(review): source lines 18399-18400 (the start of the signature) are not
// present in this listing. The body reads N and DCI in addition to ST.
18401 const ARMSubtarget *ST) {
18402
18403 // The hwloop intrinsics that we're interested in are used for control-flow,
18404 // either for entering or exiting the loop:
18405 // - test.start.loop.iterations will test whether its operand is zero. If it
18406 // is zero, the following branch should not enter the loop.
18407 // - loop.decrement.reg also tests whether its operand is zero. If it is
18408 // zero, the following branch should not branch back to the beginning of
18409 // the loop.
18410 // So here, we need to check how the brcond is using the result of each
18411 // of the intrinsics to ensure that we're branching to the right place at the
18412 // right time.
18413
18414 ISD::CondCode CC;
18415 SDValue Cond;
18416 int Imm = 1;
18417 bool Negate = false;
18418 SDValue Chain = N->getOperand(0);
18419 SDValue Dest;
18420
      // Normalise both branch forms to (CC, Cond, Imm, Dest). A plain BRCOND is
      // treated as "Cond == 1"; a BR_CC must compare against the constant 0 or 1.
18421 if (N->getOpcode() == ISD::BRCOND) {
18422 CC = ISD::SETEQ;
18423 Cond = N->getOperand(1);
18424 Dest = N->getOperand(2);
18425 } else {
18426 assert(N->getOpcode() == ISD::BR_CC && "Expected BRCOND or BR_CC!");
18427 CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
18428 Cond = N->getOperand(2);
18429 Dest = N->getOperand(4);
18430 if (auto *Const = dyn_cast<ConstantSDNode>(N->getOperand(3))) {
18431 if (!Const->isOne() && !Const->isZero())
18432 return SDValue();
18433 Imm = Const->getZExtValue();
18434 } else
18435 return SDValue();
18436 }
18437
18438 SDValue Int = SearchLoopIntrinsic(Cond, CC, Imm, Negate);
18439 if (!Int)
18440 return SDValue();
18441
18442 if (Negate)
18443 CC = ISD::getSetCCInverse(CC, /* Integer inverse */ MVT::i32);
18444
      // (CC, Imm) pairs for which the branch is taken when the counter is zero.
18445 auto IsTrueIfZero = [](ISD::CondCode CC, int Imm) {
18446 return (CC == ISD::SETEQ && Imm == 0) ||
18447 (CC == ISD::SETNE && Imm == 1) ||
18448 (CC == ISD::SETLT && Imm == 1) ||
18449 (CC == ISD::SETULT && Imm == 1);
18450 };
18451
      // (CC, Imm) pairs for which the branch is not taken when the counter is
      // zero.
18452 auto IsFalseIfZero = [](ISD::CondCode CC, int Imm) {
18453 return (CC == ISD::SETEQ && Imm == 1) ||
18454 (CC == ISD::SETNE && Imm == 0) ||
18455 (CC == ISD::SETGT && Imm == 0) ||
18456 (CC == ISD::SETUGT && Imm == 0) ||
18457 (CC == ISD::SETGE && Imm == 1) ||
18458 (CC == ISD::SETUGE && Imm == 1);
18459 };
18460
18461 assert((IsTrueIfZero(CC, Imm) || IsFalseIfZero(CC, Imm)) &&
18462 "unsupported condition");
18463
18464 SDLoc dl(Int);
18465 SelectionDAG &DAG = DCI.DAG;
18466 SDValue Elements = Int.getOperand(2);
18467 unsigned IntOp = Int->getConstantOperandVal(1);
      // The conditional branch is expected to have exactly one user, an
      // unconditional BR; its target is the other successor.
18468 assert((N->hasOneUse() && N->user_begin()->getOpcode() == ISD::BR) &&
18469 "expected single br user");
18470 SDNode *Br = *N->user_begin();
18471 SDValue OtherTarget = Br->getOperand(1);
18472
18473 // Update the unconditional branch to branch to the given Dest.
18474 auto UpdateUncondBr = [](SDNode *Br, SDValue Dest, SelectionDAG &DAG) {
18475 SDValue NewBrOps[] = { Br->getOperand(0), Dest };
18476 SDValue NewBr = DAG.getNode(ISD::BR, SDLoc(Br), MVT::Other, NewBrOps);
18477 DAG.ReplaceAllUsesOfValueWith(SDValue(Br, 0), NewBr);
18478 };
18479
18480 if (IntOp == Intrinsic::test_start_loop_iterations) {
18481 SDValue Res;
18482 SDValue Setup = DAG.getNode(ARMISD::WLSSETUP, dl, MVT::i32, Elements);
18483 // We expect this 'instruction' to branch when the counter is zero.
18484 if (IsTrueIfZero(CC, Imm)) {
18485 SDValue Ops[] = {Chain, Setup, Dest};
18486 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18487 } else {
18488 // The logic is the reverse of what we need for WLS, so find the other
18489 // basic block target: the target of the following br.
18490 UpdateUncondBr(Br, Dest, DAG);
18491
18492 SDValue Ops[] = {Chain, Setup, OtherTarget};
18493 Res = DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
18494 }
18495 // Update LR count to the new value
18496 DAG.ReplaceAllUsesOfValueWith(Int.getValue(0), Setup);
18497 // Update chain
18498 DAG.ReplaceAllUsesOfValueWith(Int.getValue(2), Int.getOperand(0));
18499 return Res;
18500 } else {
      // loop.decrement.reg: operand 3 of the intrinsic is passed on to LOOP_DEC
      // as a target constant.
18501 SDValue Size =
18502 DAG.getTargetConstant(Int.getConstantOperandVal(3), dl, MVT::i32);
18503 SDValue Args[] = { Int.getOperand(0), Elements, Size, };
18504 SDValue LoopDec = DAG.getNode(ARMISD::LOOP_DEC, dl,
18505 DAG.getVTList(MVT::i32, MVT::Other), Args);
18506 DAG.ReplaceAllUsesWith(Int.getNode(), LoopDec.getNode());
18507
18508 // We expect this instruction to branch when the count is not zero.
18509 SDValue Target = IsFalseIfZero(CC, Imm) ? Dest : OtherTarget;
18510
18511 // Update the unconditional branch to target the loop preheader if we've
18512 // found the condition has been reversed.
18513 if (Target == OtherTarget)
18514 UpdateUncondBr(Br, Dest, DAG);
18515
      // Join the LOOP_DEC chain result with the incoming chain before emitting
      // the LE branch.
18516 Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
18517 SDValue(LoopDec.getNode(), 1), Chain);
18518
18519 SDValue EndArgs[] = { Chain, SDValue(LoopDec.getNode(), 0), Target };
18520 return DAG.getNode(ARMISD::LE, dl, MVT::Other, EndArgs);
18521 }
      // Not reached: both branches of the if/else above return.
18522 return SDValue();
18523}
18524
18525/// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
///
/// Only one fold is performed: a branch on "ne" of a CMPZ against zero whose
/// input is (and (cmov 0 1 CC Flags) 1) is replaced by a branch directly on
/// CC and Flags. Returns an empty SDValue otherwise.
18526SDValue
// NOTE(review): source line 18527, the rest of the signature, is not present
// in this listing.
18528 SDValue Cmp = N->getOperand(3);
18529 if (Cmp.getOpcode() != ARMISD::CMPZ)
18530 // Only looking at NE cases.
18531 return SDValue();
18532
18533 SDLoc dl(N);
18534 SDValue LHS = Cmp.getOperand(0);
18535 SDValue RHS = Cmp.getOperand(1);
18536 SDValue Chain = N->getOperand(0);
18537 SDValue BB = N->getOperand(1);
18538 SDValue ARMcc = N->getOperand(2);
      // NOTE(review): source line 18539, which presumably defines CC from
      // ARMcc, is not present in this listing.
18540
18541 // (brcond Chain BB ne (cmpz (and (cmov 0 1 CC Flags) 1) 0))
18542 // -> (brcond Chain BB CC Flags)
      // Both the AND and the inner CMOV must have a single use.
18543 if (CC == ARMCC::NE && LHS.getOpcode() == ISD::AND && LHS->hasOneUse() &&
18544 LHS->getOperand(0)->getOpcode() == ARMISD::CMOV &&
18545 LHS->getOperand(0)->hasOneUse() &&
18546 isNullConstant(LHS->getOperand(0)->getOperand(0)) &&
18547 isOneConstant(LHS->getOperand(0)->getOperand(1)) &&
18548 isOneConstant(LHS->getOperand(1)) && isNullConstant(RHS)) {
18549 return DAG.getNode(ARMISD::BRCOND, dl, MVT::Other, Chain, BB,
18550 LHS->getOperand(0)->getOperand(2),
18551 LHS->getOperand(0)->getOperand(3));
18552 }
18553
18554 return SDValue();
18555}
18556
18557/// PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
///
/// The CMOV operands are read as (FalseVal, TrueVal, ARMcc, Cmp). Several
/// independent folds are tried; some return immediately, others set Res,
/// which is returned at the end (possibly wrapped in an AssertZext). An empty
/// SDValue is returned when nothing applies.
18558SDValue
// NOTE(review): source line 18559, the rest of the signature, is not present
// in this listing.
18560 SDLoc dl(N);
18561 EVT VT = N->getValueType(0);
18562 SDValue FalseVal = N->getOperand(0);
18563 SDValue TrueVal = N->getOperand(1);
18564 SDValue ARMcc = N->getOperand(2);
18565 SDValue Cmp = N->getOperand(3);
18566
18567 // Try to form CSINV etc.
      // matchCSET supplies the opcode and says whether the condition has to be
      // inverted; the returned value is used for both data operands.
18568 unsigned Opcode;
18569 bool InvertCond;
18570 if (SDValue CSetOp =
18571 matchCSET(Opcode, InvertCond, TrueVal, FalseVal, Subtarget)) {
18572 if (InvertCond) {
18573 ARMCC::CondCodes CondCode =
18574 (ARMCC::CondCodes)cast<const ConstantSDNode>(ARMcc)->getZExtValue();
18575 CondCode = ARMCC::getOppositeCondition(CondCode);
18576 ARMcc = DAG.getConstant(CondCode, SDLoc(ARMcc), MVT::i32);
18577 }
18578 return DAG.getNode(Opcode, dl, VT, CSetOp, CSetOp, ARMcc, Cmp);
18579 }
18580
18581 if (Cmp.getOpcode() != ARMISD::CMPZ)
18582 // Only looking at EQ and NE cases.
18583 return SDValue();
18584
18585 SDValue LHS = Cmp.getOperand(0);
18586 SDValue RHS = Cmp.getOperand(1);
      // NOTE(review): source line 18587, which presumably defines CC from
      // ARMcc, is not present in this listing.
18588
18589 // BFI is only available on V6T2+.
18590 if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops()) {
      // NOTE(review): source line 18591, which defines R, is not present in
      // this listing.
18592 if (R)
18593 return R;
18594 }
18595
18596 // Simplify
18597 // mov r1, r0
18598 // cmp r1, x
18599 // mov r0, y
18600 // moveq r0, x
18601 // to
18602 // cmp r0, x
18603 // movne r0, y
18604 //
18605 // mov r1, r0
18606 // cmp r1, x
18607 // mov r0, x
18608 // movne r0, y
18609 // to
18610 // cmp r0, x
18611 // movne r0, y
18612 /// FIXME: Turn this into a target neutral optimization?
18613 SDValue Res;
18614 if (CC == ARMCC::NE && FalseVal == RHS && FalseVal != LHS) {
18615 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, TrueVal, ARMcc, Cmp);
18616 } else if (CC == ARMCC::EQ && TrueVal == RHS) {
      // This local ARMcc shadows the outer one; it is passed to getARMCmp
      // together with ISD::SETNE and then used for the new CMOV.
18617 SDValue ARMcc;
18618 SDValue NewCmp = getARMCmp(LHS, RHS, ISD::SETNE, ARMcc, DAG, dl);
18619 Res = DAG.getNode(ARMISD::CMOV, dl, VT, LHS, FalseVal, ARMcc, NewCmp);
18620 }
18621
18622 // (cmov F T ne (cmpz (cmov 0 1 CC Flags) 0))
18623 // -> (cmov F T CC Flags)
18624 if (CC == ARMCC::NE && LHS.getOpcode() == ARMISD::CMOV && LHS->hasOneUse() &&
18625 isNullConstant(LHS->getOperand(0)) && isOneConstant(LHS->getOperand(1)) &&
18626 isNullConstant(RHS)) {
18627 return DAG.getNode(ARMISD::CMOV, dl, VT, FalseVal, TrueVal,
18628 LHS->getOperand(2), LHS->getOperand(3));
18629 }
18630
      // Everything below applies to integer results only.
18631 if (!VT.isInteger())
18632 return SDValue();
18633
18634 // Fold away an unnecessary CMPZ/CMOV
18635 // CMOV A, B, C1, (CMPZ (CMOV 1, 0, C2, D), 0) ->
18636 // if C1==EQ -> CMOV A, B, C2, D
18637 // if C1==NE -> CMOV A, B, NOT(C2), D
18638 if (N->getConstantOperandVal(2) == ARMCC::EQ ||
18639 N->getConstantOperandVal(2) == ARMCC::NE) {
      // NOTE(review): source line 18640, which declares Cond, is not present
      // in this listing.
18641 if (SDValue C = IsCMPZCSINC(N->getOperand(3).getNode(), Cond)) {
18642 if (N->getConstantOperandVal(2) == ARMCC::NE)
      // NOTE(review): source line 18643, the statement controlled by the if
      // above, is not present in this listing.
18644 return DAG.getNode(N->getOpcode(), SDLoc(N), MVT::i32, N->getOperand(0),
18645 N->getOperand(1),
18646 DAG.getConstant(Cond, SDLoc(N), MVT::i32), C);
18647 }
18648 }
18649
18650 // Materialize a boolean comparison for integers so we can avoid branching.
18651 if (isNullConstant(FalseVal)) {
18652 if (CC == ARMCC::EQ && isOneConstant(TrueVal)) {
18653 if (!Subtarget->isThumb1Only() && Subtarget->hasV5TOps()) {
18654 // If x == y then x - y == 0 and ARM's CLZ will return 32, shifting it
18655 // right 5 bits will make that 32 be 1, otherwise it will be 0.
18656 // CMOV 0, 1, ==, (CMPZ x, y) -> SRL (CTLZ (SUB x, y)), 5
18657 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18658 Res = DAG.getNode(ISD::SRL, dl, VT, DAG.getNode(ISD::CTLZ, dl, VT, Sub),
18659 DAG.getConstant(5, dl, MVT::i32));
18660 } else {
18661 // CMOV 0, 1, ==, (CMPZ x, y) ->
18662 // (UADDO_CARRY (SUB x, y), t:0, 1 - t:1)
18663 // where t = (USUBO 0, (SUB x, y))
18664 //
18665 // The USUBO computes 0 - (x - y) and this will give a borrow when
18666 // x != y. In other words, a carry C == 1 when x == y, C == 0
18667 // otherwise.
18668 // The final UADDO_CARRY computes
18669 // x - y + (0 - (x - y)) + C == C
18670 SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, LHS, RHS);
18671 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
      // FalseVal is the constant zero in this branch.
18672 SDValue Neg = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, Sub);
18673 // ISD::USUBO returns a borrow but we want the carry here
18674 // actually.
18675 SDValue Carry =
18676 DAG.getNode(ISD::SUB, dl, MVT::i32,
18677 DAG.getConstant(1, dl, MVT::i32), Neg.getValue(1));
18678 Res = DAG.getNode(ISD::UADDO_CARRY, dl, VTs, Sub, Neg, Carry);
18679 }
18680 } else if (CC == ARMCC::NE && !isNullConstant(RHS) &&
18681 (!Subtarget->isThumb1Only() || isPowerOf2Constant(TrueVal))) {
18682 // This seems pointless but will allow us to combine it further below.
18683 // CMOV 0, z, !=, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18684 SDValue Sub =
18685 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18686 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, TrueVal, ARMcc,
18687 Sub.getValue(1));
      // Record the SUBC as the false value so the Thumb1 fold below can match
      // it.
18688 FalseVal = Sub;
18689 }
18690 } else if (isNullConstant(TrueVal)) {
18691 if (CC == ARMCC::EQ && !isNullConstant(RHS) &&
18692 (!Subtarget->isThumb1Only() || isPowerOf2Constant(FalseVal))) {
18693 // This seems pointless but will allow us to combine it further below
18694 // Note that we change == for != as this is the dual for the case above.
18695 // CMOV z, 0, ==, (CMPZ x, y) -> CMOV (SUBC x, y), z, !=, (SUBC x, y):1
18696 SDValue Sub =
18697 DAG.getNode(ARMISD::SUBC, dl, DAG.getVTList(VT, MVT::i32), LHS, RHS);
18698 Res = DAG.getNode(ARMISD::CMOV, dl, VT, Sub, FalseVal,
18699 DAG.getConstant(ARMCC::NE, dl, MVT::i32),
18700 Sub.getValue(1));
18701 FalseVal = Sub;
18702 }
18703 }
18704
18705 // On Thumb1, the DAG above may be further combined if z is a power of 2
18706 // (z == 2 ^ K).
18707 // CMOV (SUBC x, y), z, !=, (SUBC x, y):1 ->
18708 // t1 = (USUBO (SUB x, y), 1)
18709 // t2 = (USUBO_CARRY (SUB x, y), t1:0, t1:1)
18710 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18711 //
18712 // This also handles the special case of comparing against zero; it's
18713 // essentially, the same pattern, except there's no SUBC:
18714 // CMOV x, z, !=, (CMPZ x, 0) ->
18715 // t1 = (USUBO x, 1)
18716 // t2 = (USUBO_CARRY x, t1:0, t1:1)
18717 // Result = if K != 0 then (SHL t2:0, K) else t2:0
18718 const APInt *TrueConst;
18719 if (Subtarget->isThumb1Only() && CC == ARMCC::NE &&
18720 ((FalseVal.getOpcode() == ARMISD::SUBC && FalseVal.getOperand(0) == LHS &&
18721 FalseVal.getOperand(1) == RHS) ||
18722 (FalseVal == LHS && isNullConstant(RHS))) &&
18723 (TrueConst = isPowerOf2Constant(TrueVal))) {
18724 SDVTList VTs = DAG.getVTList(VT, MVT::i32);
18725 unsigned ShiftAmount = TrueConst->logBase2();
      // For K != 0 the subtraction uses the constant 1 and the result is
      // shifted left by K afterwards; for K == 0 TrueVal is already 1.
18726 if (ShiftAmount)
18727 TrueVal = DAG.getConstant(1, dl, VT);
18728 SDValue Subc = DAG.getNode(ISD::USUBO, dl, VTs, FalseVal, TrueVal);
18729 Res = DAG.getNode(ISD::USUBO_CARRY, dl, VTs, FalseVal, Subc,
18730 Subc.getValue(1));
18731
18732 if (ShiftAmount)
18733 Res = DAG.getNode(ISD::SHL, dl, VT, Res,
18734 DAG.getConstant(ShiftAmount, dl, MVT::i32));
18735 }
18736
18737 if (Res.getNode()) {
18738 KnownBits Known = DAG.computeKnownBits(SDValue(N,0));
18739 // Capture demanded bits information that would be otherwise lost.
      // Only the exact masks for an i1, i8 or i16 zero-extended value are
      // recognised.
18740 if (Known.Zero == 0xfffffffe)
18741 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18742 DAG.getValueType(MVT::i1));
18743 else if (Known.Zero == 0xffffff00)
18744 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18745 DAG.getValueType(MVT::i8));
18746 else if (Known.Zero == 0xffff0000)
18747 Res = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Res,
18748 DAG.getValueType(MVT::i16));
18749 }
18750
18751 return Res;
18752}
18753
// Combines applied to a bitcast node N (operand 0 is the source value, the
// node's result type is the destination type):
//  - under MVE, a bitcast of a VDUP with the same element width becomes a
//    VDUP of the destination type;
//  - on big-endian targets, a bitcast of VMOVIMM / VMVNIMM / VMOVFPIMM to a
//    type with elements at least as wide becomes a VECTOR_REG_CAST;
//  - otherwise PerformExtractEltToVMOVRRD is tried.
// Returns an empty SDValue when none applies.
// NOTE(review): source lines 18754-18755 (the start of the signature) are not
// present in this listing. The body reads N and DCI in addition to ST.
18756 const ARMSubtarget *ST) {
18757 SelectionDAG &DAG = DCI.DAG;
18758 SDValue Src = N->getOperand(0);
18759 EVT DstVT = N->getValueType(0);
18760
18761 // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
18762 if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
18763 EVT SrcVT = Src.getValueType();
18764 if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
18765 return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
18766 }
18767
18768 // We may have a bitcast of something that has already had this bitcast
18769 // combine performed on it, so skip past any VECTOR_REG_CASTs.
      // The cast is skipped only when its input elements are no wider than its
      // output elements.
18770 if (Src.getOpcode() == ARMISD::VECTOR_REG_CAST &&
18771 Src.getOperand(0).getValueType().getScalarSizeInBits() <=
18772 Src.getValueType().getScalarSizeInBits())
18773 Src = Src.getOperand(0);
18774
18775 // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
18776 // would be generated is at least the width of the element type.
18777 EVT SrcVT = Src.getValueType();
18778 if ((Src.getOpcode() == ARMISD::VMOVIMM ||
18779 Src.getOpcode() == ARMISD::VMVNIMM ||
18780 Src.getOpcode() == ARMISD::VMOVFPIMM) &&
18781 SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
18782 DAG.getDataLayout().isBigEndian())
18783 return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
18784
18785 // bitcast(extract(x, n)); bitcast(extract(x, n+1)) -> VMOVRRD x
18786 if (SDValue R = PerformExtractEltToVMOVRRD(N, DCI))
18787 return R;
18788
18789 return SDValue();
18790}
18791
18792// Some combines for the MVETrunc truncations legalizer helper. Also lowers the
18793// node into stack operations after legalizeOps.
//
// Combines, in order: all-undef operands fold to undef; two nested MVETRUNC
// operands are flattened into one four-operand MVETRUNC; a pair of shuffles of
// the same two inputs may become a VMOVN; operands that are all build vectors,
// shuffles or bitcasts of build vectors are expanded into a single build
// vector. If none applies and DCI.isAfterLegalizeDAG() is true, the node is
// lowered to truncating stores into a stack temporary followed by one load.
// NOTE(review): source lines 18794-18795 (the signature) are not present in
// this listing. The body reads N and DCI.
18796 SelectionDAG &DAG = DCI.DAG;
18797 EVT VT = N->getValueType(0);
18798 SDLoc DL(N);
18799
18800 // MVETrunc(Undef, Undef) -> Undef
18801 if (all_of(N->ops(), [](SDValue Op) { return Op.isUndef(); }))
18802 return DAG.getUNDEF(VT);
18803
18804 // MVETrunc(MVETrunc a b, MVETrunc c, d) -> MVETrunc
18805 if (N->getNumOperands() == 2 &&
18806 N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
18807 N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
18808 return DAG.getNode(ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
18809 N->getOperand(0).getOperand(1),
18810 N->getOperand(1).getOperand(0),
18811 N->getOperand(1).getOperand(1));
18812
18813 // MVETrunc(shuffle, shuffle) -> VMOVN
18814 if (N->getNumOperands() == 2 &&
18815 N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
18816 N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
18817 auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
18818 auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
18819
      // Both shuffles must read the same two source vectors.
18820 if (S0->getOperand(0) == S1->getOperand(0) &&
18821 S0->getOperand(1) == S1->getOperand(1)) {
18822 // Construct complete shuffle mask
18823 SmallVector<int, 8> Mask(S0->getMask());
18824 Mask.append(S1->getMask().begin(), S1->getMask().end());
18825
      // The two checks differ only in the boolean passed to isVMOVNTruncMask
      // and in the order of the two VMOVN inputs.
18826 if (isVMOVNTruncMask(Mask, VT, false))
18827 return DAG.getNode(
18828 ARMISD::VMOVN, DL, VT,
18829 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18830 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18831 DAG.getConstant(1, DL, MVT::i32));
18832 if (isVMOVNTruncMask(Mask, VT, true))
18833 return DAG.getNode(
18834 ARMISD::VMOVN, DL, VT,
18835 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
18836 DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
18837 DAG.getConstant(1, DL, MVT::i32));
18838 }
18839 }
18840
18841 // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
18842 // truncate to a buildvector to allow the generic optimisations to kick in.
18843 if (all_of(N->ops(), [](SDValue Op) {
18844 return Op.getOpcode() == ISD::BUILD_VECTOR ||
18845 Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
18846 (Op.getOpcode() == ISD::BITCAST &&
18847 Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
18848 })) {
      // Extract every lane of every operand as an i32, in operand order, and
      // rebuild the result from those scalars.
18849 SmallVector<SDValue, 8> Extracts;
18850 for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
18851 SDValue O = N->getOperand(Op);
18852 for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
18853 SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
18854 DAG.getConstant(i, DL, MVT::i32));
18855 Extracts.push_back(Ext);
18856 }
18857 }
18858 return DAG.getBuildVector(VT, DL, Extracts);
18859 }
18860
18861 // If we are late in the legalization process and nothing has optimised
18862 // the trunc to anything better, lower it to a stack store and reload,
18863 // performing the truncation whilst keeping the lanes in the correct order:
18864 // VSTRH.32 a, stack; VSTRH.32 b, stack+8; VLDRW.32 stack;
18865 if (!DCI.isAfterLegalizeDAG())
18866 return SDValue();
18867
      // A 16-byte, 4-byte-aligned stack slot holds the whole result vector.
18868 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
18869 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
18870 int NumIns = N->getNumOperands();
18871 assert((NumIns == 2 || NumIns == 4) &&
18872 "Expected 2 or 4 inputs to an MVETrunc");
      // Each operand is stored as a vector with 1/2 (two inputs) or 1/4 (four
      // inputs) of the result's element count.
18873 EVT StoreVT = VT.getHalfNumVectorElementsVT(*DAG.getContext());
18874 if (N->getNumOperands() == 4)
18875 StoreVT = StoreVT.getHalfNumVectorElementsVT(*DAG.getContext());
18876
18877 SmallVector<SDValue> Chains;
18878 for (int I = 0; I < NumIns; I++) {
      // Operand I is stored at byte offset I * 16 / NumIns within the slot.
18879 SDValue Ptr = DAG.getNode(
18880 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
18881 DAG.getConstant(I * 16 / NumIns, DL, StackPtr.getValueType()));
      // NOTE(review): source line 18882, the start of the MPI declaration for
      // this store, is not present in this listing.
18883 DAG.getMachineFunction(), SPFI, I * 16 / NumIns);
18884 SDValue Ch = DAG.getTruncStore(DAG.getEntryNode(), DL, N->getOperand(I),
18885 Ptr, MPI, StoreVT, Align(4));
18886 Chains.push_back(Ch);
18887 }
18888
      // The reload is chained after all of the stores.
18889 SDValue Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18890 MachinePointerInfo MPI =
      // NOTE(review): source line 18891, the initializer of MPI, is not present
      // in this listing.
18892 return DAG.getLoad(VT, DL, Chain, StackPtr, MPI, Align(4));
18893}
18894
18895// Take a MVEEXT(load x) and split that into (extload x, extload x+8).
// On success the new loads are returned as merged values and the chain result
// of the original load is redirected to a TokenFactor of the new load chains;
// otherwise an empty SDValue is returned.
18897 SelectionDAG &DAG) {
18898 SDValue N0 = N->getOperand(0);
// NOTE(review): the declaration of LD is not visible in this listing;
// presumably it is the load node behind N0 - confirm against the full file.
// Only simple, unindexed loads whose value has a single use are split.
18900 if (!LD || !LD->isSimple() || !N0.hasOneUse() || LD->isIndexed())
18901 return SDValue();
18902
18903 EVT FromVT = LD->getMemoryVT();
18904 EVT ToVT = N->getValueType(0);
18905 if (!ToVT.isVector())
18906 return SDValue();
18907 assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements() * 2);
18908 EVT ToEltVT = ToVT.getVectorElementType();
18909 EVT FromEltVT = FromVT.getVectorElementType();
18910
// Lanes per new load: 4 when extending to i32, 8 when extending i8 to i16.
18911 unsigned NumElements = 0;
18912 if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
18913 NumElements = 4;
18914 if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
18915 NumElements = 8;
18916 assert(NumElements != 0);
18917
// If the load already extends, it must be an any-extend or extend the same way
// as this node; otherwise give up.
18918 ISD::LoadExtType NewExtType =
18919 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
18920 if (LD->getExtensionType() != ISD::NON_EXTLOAD &&
18921 LD->getExtensionType() != ISD::EXTLOAD &&
18922 LD->getExtensionType() != NewExtType)
18923 return SDValue();
18924
18925 LLVMContext &C = *DAG.getContext();
18926 SDLoc DL(LD);
18927 // Details about the old load
18928 SDValue Ch = LD->getChain();
18929 SDValue BasePtr = LD->getBasePtr();
18930 Align Alignment = LD->getBaseAlign();
18931 MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
18932 AAMDNodes AAInfo = LD->getAAInfo();
18933
// The new loads are unindexed, so their offset operand is undef.
18934 SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
18935 EVT NewFromVT = EVT::getVectorVT(
18936 C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
18937 EVT NewToVT = EVT::getVectorVT(
18938 C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
18939
// Emit one extending load per NumElements-lane slice of the original memory
// type, each at its own byte offset from the original base pointer.
// NOTE(review): the declarations of Loads and Chains are not visible in this
// listing.
18942 for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
18943 unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
18944 SDValue NewPtr =
18945 DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::getFixed(NewOffset));
18946
18947 SDValue NewLoad =
18948 DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
18949 LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
18950 Alignment, MMOFlags, AAInfo);
18951 Loads.push_back(NewLoad);
18952 Chains.push_back(SDValue(NewLoad.getNode(), 1));
18953 }
18954
// Users of the old load's chain now depend on all of the new loads.
18955 SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
18956 DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
18957 return DAG.getMergeValues(Loads, DL);
18958}
18959
18960// Perform combines for MVEEXT. If it has not been optimized to anything better
18961// before lowering, it gets converted to stack store and extloads performing the
18962// extend whilst still keeping the same lane ordering.
18965 SelectionDAG &DAG = DCI.DAG;
18966 EVT VT = N->getValueType(0);
18967 SDLoc DL(N);
18968 assert(N->getNumValues() == 2 && "Expected MVEEXT with 2 elements");
18969 assert((VT == MVT::v4i32 || VT == MVT::v8i16) && "Unexpected MVEEXT type");
18970
// Extend(V): reinterpret V in the result type, then sign- or zero-extend each
// lane in place from ExtVT, according to the opcode of N.
18971 EVT ExtVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
18972 *DAG.getContext());
18973 auto Extend = [&](SDValue V) {
18974 SDValue VVT = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, V);
18975 return N->getOpcode() == ARMISD::MVESEXT
18976 ? DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, VVT,
18977 DAG.getValueType(ExtVT))
18978 : DAG.getZeroExtendInReg(VVT, DL, ExtVT);
18979 };
18980
18981 // MVEEXT(VDUP) -> SIGN_EXTEND_INREG(VDUP)
// Both results are the same extended value.
18982 if (N->getOperand(0).getOpcode() == ARMISD::VDUP) {
18983 SDValue Ext = Extend(N->getOperand(0));
18984 return DAG.getMergeValues({Ext, Ext}, DL);
18985 }
18986
18987 // MVEEXT(shuffle) -> SIGN_EXTEND_INREG/ZERO_EXTEND_INREG
18988 if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(N->getOperand(0))) {
18989 ArrayRef<int> Mask = SVN->getMask();
18990 assert(Mask.size() == 2 * VT.getVectorNumElements());
18991 assert(Mask.size() == SVN->getValueType(0).getVectorNumElements());
18992 unsigned Rev = VT == MVT::v4i32 ? ARMISD::VREV32 : ARMISD::VREV16;
18993 SDValue Op0 = SVN->getOperand(0);
18994 SDValue Op1 = SVN->getOperand(1);
18995
// True if every non-negative mask entry in the half starting at Start selects
// element Idx * 2 + Offset; negative entries are accepted unconditionally.
18996 auto CheckInregMask = [&](int Start, int Offset) {
18997 for (int Idx = 0, E = VT.getVectorNumElements(); Idx < E; ++Idx)
18998 if (Mask[Start + Idx] >= 0 && Mask[Start + Idx] != Idx * 2 + Offset)
18999 return false;
19000 return true;
19001 };
// Each half starts out as N's own result so that an unmatched half is left
// untouched. Offsets 0 / 1 pick the even / odd elements of Op0; offsets
// Mask.size() / Mask.size() + 1 do the same for Op1. The odd cases go through
// a Rev node first.
19002 SDValue V0 = SDValue(N, 0);
19003 SDValue V1 = SDValue(N, 1);
19004 if (CheckInregMask(0, 0))
19005 V0 = Extend(Op0);
19006 else if (CheckInregMask(0, 1))
19007 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
19008 else if (CheckInregMask(0, Mask.size()))
19009 V0 = Extend(Op1);
19010 else if (CheckInregMask(0, Mask.size() + 1))
19011 V0 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
19012
19013 if (CheckInregMask(VT.getVectorNumElements(), Mask.size()))
19014 V1 = Extend(Op1);
19015 else if (CheckInregMask(VT.getVectorNumElements(), Mask.size() + 1))
19016 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op1));
19017 else if (CheckInregMask(VT.getVectorNumElements(), 0))
19018 V1 = Extend(Op0);
19019 else if (CheckInregMask(VT.getVectorNumElements(), 1))
19020 V1 = Extend(DAG.getNode(Rev, DL, SVN->getValueType(0), Op0));
19021
// Only rewrite when at least one half was actually replaced.
19022 if (V0.getNode() != N || V1.getNode() != N)
19023 return DAG.getMergeValues({V0, V1}, DL);
19024 }
19025
19026 // MVEEXT(load) -> extload, extload
// NOTE(review): the statement that produces L is not visible in this listing.
19027 if (N->getOperand(0)->getOpcode() == ISD::LOAD)
19029 return L;
19030
// The stack fallback below is only used once DAG legalization has finished.
19031 if (!DCI.isAfterLegalizeDAG())
19032 return SDValue();
19033
19034 // Lower to a stack store and reload:
19035 // VSTRW.32 a, stack; VLDRH.32 stack; VLDRH.32 stack+8;
19036 SDValue StackPtr = DAG.CreateStackTemporary(TypeSize::getFixed(16), Align(4));
19037 int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
19038 int NumOuts = N->getNumValues();
19039 assert((NumOuts == 2 || NumOuts == 4) &&
19040 "Expected 2 or 4 outputs to an MVEEXT");
19041 EVT LoadVT = N->getOperand(0).getValueType().getHalfNumVectorElementsVT(
19042 *DAG.getContext());
// NOTE(review): this tests the operand count although NumOuts counts results
// and only operand 0 is read in this block - confirm the intended condition.
19043 if (N->getNumOperands() == 4)
19044 LoadVT = LoadVT.getHalfNumVectorElementsVT(*DAG.getContext());
19045
19046 MachinePointerInfo MPI =
19048 SDValue Chain = DAG.getStore(DAG.getEntryNode(), DL, N->getOperand(0),
19049 StackPtr, MPI, Align(4));
19050
// One extending load per result, each reading its own 16 / NumOuts byte slice
// of the stack slot.
19052 for (int I = 0; I < NumOuts; I++) {
19053 SDValue Ptr = DAG.getNode(
19054 ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
19055 DAG.getConstant(I * 16 / NumOuts, DL, StackPtr.getValueType()));
19057 DAG.getMachineFunction(), SPFI, I * 16 / NumOuts);
19058 SDValue Load = DAG.getExtLoad(
19059 N->getOpcode() == ARMISD::MVESEXT ? ISD::SEXTLOAD : ISD::ZEXTLOAD, DL,
19060 VT, Chain, Ptr, MPI, LoadVT, Align(4));
19061 Loads.push_back(Load);
19062 }
19063
19064 return DAG.getMergeValues(Loads, DL);
19065}
19066
19068 DAGCombinerInfo &DCI) const {
// Dispatch on the opcode of N to the matching Perform*Combine helper. Opcodes
// without a helper, and cases that end in `break`, fall through to the final
// `return SDValue()`.
// NOTE(review): several case labels are not visible in this listing.
19069 switch (N->getOpcode()) {
19070 default: break;
19071 case ISD::SELECT_CC:
19072 case ISD::SELECT: return PerformSELECTCombine(N, DCI, Subtarget);
19073 case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
19074 case ISD::SETCC: return PerformVSetCCToVCTPCombine(N, DCI, Subtarget);
19075 case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
19076 case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
19077 case ISD::ADD: return PerformADDCombine(N, DCI, Subtarget);
19078 case ISD::SUB: return PerformSUBCombine(N, DCI, Subtarget);
19079 case ISD::MUL: return PerformMULCombine(N, DCI, Subtarget);
19080 case ISD::OR: return PerformORCombine(N, DCI, Subtarget);
19081 case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget);
19082 case ISD::AND: return PerformANDCombine(N, DCI, Subtarget);
19083 case ISD::BRCOND:
19084 case ISD::BR_CC: return PerformHWLoopCombine(N, DCI, Subtarget);
19085 case ARMISD::ADDC:
19086 case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget);
19087 case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget);
19088 case ARMISD::BFI: return PerformBFICombine(N, DCI.DAG);
19089 case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
19090 case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
19091 case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
19092 case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI.DAG);
19093 case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
19094 case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
19097 return PerformExtractEltCombine(N, DCI, Subtarget);
19101 case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
19102 case ARMISD::VDUP: return PerformVDUPCombine(N, DCI.DAG, Subtarget);
19103 case ISD::FP_TO_SINT:
19104 case ISD::FP_TO_UINT:
19105 return PerformVCVTCombine(N, DCI.DAG, Subtarget);
19106 case ISD::FADD:
19107 return PerformFADDCombine(N, DCI.DAG, Subtarget);
19108 case ISD::FMUL:
19109 return PerformVMulVCTPCombine(N, DCI.DAG, Subtarget);
19111 return PerformIntrinsicCombine(N, DCI);
19112 case ISD::SHL:
19113 case ISD::SRA:
19114 case ISD::SRL:
19115 return PerformShiftCombine(N, DCI, Subtarget);
19116 case ISD::SIGN_EXTEND:
19117 case ISD::ZERO_EXTEND:
19118 case ISD::ANY_EXTEND:
19119 return PerformExtendCombine(N, DCI.DAG, Subtarget);
19120 case ISD::FP_EXTEND:
19121 return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
19122 case ISD::SMIN:
19123 case ISD::UMIN:
19124 case ISD::SMAX:
19125 case ISD::UMAX:
19126 return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
19127 case ARMISD::CMOV:
19128 return PerformCMOVCombine(N, DCI.DAG);
19129 case ARMISD::BRCOND:
19130 return PerformBRCONDCombine(N, DCI.DAG);
19131 case ARMISD::CMPZ:
19132 return PerformCMPZCombine(N, DCI.DAG);
19133 case ARMISD::CSINC:
19134 case ARMISD::CSINV:
19135 case ARMISD::CSNEG:
19136 return PerformCSETCombine(N, DCI.DAG);
19137 case ISD::LOAD:
19138 return PerformLOADCombine(N, DCI, Subtarget);
19139 case ARMISD::VLD1DUP:
19140 case ARMISD::VLD2DUP:
19141 case ARMISD::VLD3DUP:
19142 case ARMISD::VLD4DUP:
19143 return PerformVLDCombine(N, DCI);
19145 return PerformARMBUILD_VECTORCombine(N, DCI);
19146 case ISD::BITCAST:
19147 return PerformBITCASTCombine(N, DCI, Subtarget);
19148 case ARMISD::PREDICATE_CAST:
19149 return PerformPREDICATE_CASTCombine(N, DCI);
19150 case ARMISD::VECTOR_REG_CAST:
19151 return PerformVECTOR_REG_CASTCombine(N, DCI.DAG, Subtarget);
19152 case ARMISD::MVETRUNC:
19153 return PerformMVETruncCombine(N, DCI);
19154 case ARMISD::MVESEXT:
19155 case ARMISD::MVEZEXT:
19156 return PerformMVEExtCombine(N, DCI);
19157 case ARMISD::VCMP:
19158 return PerformVCMPCombine(N, DCI.DAG, Subtarget);
19159 case ISD::VECREDUCE_ADD:
19160 return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
19161 case ARMISD::VADDVs:
19162 case ARMISD::VADDVu:
19163 case ARMISD::VADDLVs:
19164 case ARMISD::VADDLVu:
19165 case ARMISD::VADDLVAs:
19166 case ARMISD::VADDLVAu:
19167 case ARMISD::VMLAVs:
19168 case ARMISD::VMLAVu:
19169 case ARMISD::VMLALVs:
19170 case ARMISD::VMLALVu:
19171 case ARMISD::VMLALVAs:
19172 case ARMISD::VMLALVAu:
19173 return PerformReduceShuffleCombine(N, DCI.DAG);
19174 case ARMISD::VMOVN:
19175 return PerformVMOVNCombine(N, DCI);
19176 case ARMISD::VQMOVNs:
19177 case ARMISD::VQMOVNu:
19178 return PerformVQMOVNCombine(N, DCI);
19179 case ARMISD::VQDMULH:
19180 return PerformVQDMULHCombine(N, DCI);
19181 case ARMISD::ASRL:
19182 case ARMISD::LSRL:
19183 case ARMISD::LSLL:
19184 return PerformLongShiftCombine(N, DCI.DAG);
// The cases below have no dedicated helper: they ask SimplifyDemandedBits to
// simplify an operand given that only its low or high 16 (or low 8) bits are
// demanded. An empty SDValue is returned when a simplification was made.
19185 case ARMISD::SMULWB: {
// Only the low 16 bits of operand 1 are demanded.
19186 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19187 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19188 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19189 return SDValue();
19190 break;
19191 }
19192 case ARMISD::SMULWT: {
// Only the high 16 bits of operand 1 are demanded.
19193 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19194 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19195 if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
19196 return SDValue();
19197 break;
19198 }
19199 case ARMISD::SMLALBB:
19200 case ARMISD::QADD16b:
19201 case ARMISD::QSUB16b:
19202 case ARMISD::UQADD16b:
19203 case ARMISD::UQSUB16b: {
// Only the low 16 bits of operands 0 and 1 are demanded.
19204 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19205 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
19206 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19207 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19208 return SDValue();
19209 break;
19210 }
19211 case ARMISD::SMLALBT: {
// Low 16 bits of operand 0, high 16 bits of operand 1.
19212 unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
19213 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19214 unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
19215 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19216 if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
19217 (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
19218 return SDValue();
19219 break;
19220 }
19221 case ARMISD::SMLALTB: {
// High 16 bits of operand 0, low 16 bits of operand 1.
19222 unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
19223 APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
19224 unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
19225 APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
19226 if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
19227 (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
19228 return SDValue();
19229 break;
19230 }
19231 case ARMISD::SMLALTT: {
// Only the high 16 bits of operands 0 and 1 are demanded.
19232 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19233 APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
19234 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19235 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19236 return SDValue();
19237 break;
19238 }
19239 case ARMISD::QADD8b:
19240 case ARMISD::QSUB8b:
19241 case ARMISD::UQADD8b:
19242 case ARMISD::UQSUB8b: {
// Only the low 8 bits of operands 0 and 1 are demanded.
19243 unsigned BitWidth = N->getValueType(0).getSizeInBits();
19244 APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 8);
19245 if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
19246 (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
19247 return SDValue();
19248 break;
19249 }
19250 case ARMISD::VBSP:
// When operands 1 and 2 are the same value, the node folds to that value.
19251 if (N->getOperand(1) == N->getOperand(2))
19252 return N->getOperand(1);
19253 return SDValue();
// NOTE(review): the case labels introducing this nested switch are not visible
// in this listing. It dispatches on constant operand 1, compared against
// Intrinsic:: IDs.
19256 switch (N->getConstantOperandVal(1)) {
19257 case Intrinsic::arm_neon_vld1:
19258 case Intrinsic::arm_neon_vld1x2:
19259 case Intrinsic::arm_neon_vld1x3:
19260 case Intrinsic::arm_neon_vld1x4:
19261 case Intrinsic::arm_neon_vld2:
19262 case Intrinsic::arm_neon_vld3:
19263 case Intrinsic::arm_neon_vld4:
19264 case Intrinsic::arm_neon_vld2lane:
19265 case Intrinsic::arm_neon_vld3lane:
19266 case Intrinsic::arm_neon_vld4lane:
19267 case Intrinsic::arm_neon_vld2dup:
19268 case Intrinsic::arm_neon_vld3dup:
19269 case Intrinsic::arm_neon_vld4dup:
19270 case Intrinsic::arm_neon_vst1:
19271 case Intrinsic::arm_neon_vst1x2:
19272 case Intrinsic::arm_neon_vst1x3:
19273 case Intrinsic::arm_neon_vst1x4:
19274 case Intrinsic::arm_neon_vst2:
19275 case Intrinsic::arm_neon_vst3:
19276 case Intrinsic::arm_neon_vst4:
19277 case Intrinsic::arm_neon_vst2lane:
19278 case Intrinsic::arm_neon_vst3lane:
19279 case Intrinsic::arm_neon_vst4lane:
19280 return PerformVLDCombine(N, DCI);
19281 case Intrinsic::arm_mve_vld2q:
19282 case Intrinsic::arm_mve_vld4q:
19283 case Intrinsic::arm_mve_vst2q:
19284 case Intrinsic::arm_mve_vst4q:
19285 return PerformMVEVLDCombine(N, DCI);
19286 default: break;
19287 }
19288 break;
19289 }
19290 return SDValue();
19291}
19292
19294 EVT VT) const {
// True only for an f32 load or store; every other opcode/type pair is
// rejected.
19295 return (VT == MVT::f32) && (Opc == ISD::LOAD || Opc == ISD::STORE);
19296}
19297
19299 Align Alignment,
19301 unsigned *Fast) const {
// Returns true when an access of type VT is supported at the given alignment.
// When Fast is non-null it is written on every path that returns true.
19302 // Depends what it gets converted into if the type is weird.
19303 if (!VT.isSimple())
19304 return false;
19305
19306 // The AllowsUnaligned flag models the SCTLR.A setting in ARM cpus
19307 bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
19308 auto Ty = VT.getSimpleVT().SimpleTy;
19309
19310 if (Ty == MVT::i8 || Ty == MVT::i16 || Ty == MVT::i32) {
19311 // Unaligned access can use (for example) LDRB, LDRH, LDR
19312 if (AllowsUnaligned) {
// Reported as fast only when the subtarget has v7 ops.
19313 if (Fast)
19314 *Fast = Subtarget->hasV7Ops();
19315 return true;
19316 }
19317 }
19318
19319 if (Ty == MVT::f64 || Ty == MVT::v2f64) {
19320 // For any little-endian targets with neon, we can support unaligned ld/st
19321 // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
19322 // A big-endian target may also explicitly support unaligned accesses
19323 if (Subtarget->hasNEON() && (AllowsUnaligned || Subtarget->isLittle())) {
19324 if (Fast)
19325 *Fast = 1;
19326 return true;
19327 }
19328 }
19329
// Everything below applies only to subtargets with MVE integer ops.
19330 if (!Subtarget->hasMVEIntegerOps())
19331 return false;
19332
19333 // These are for predicates
19334 if ((Ty == MVT::v16i1 || Ty == MVT::v8i1 || Ty == MVT::v4i1 ||
19335 Ty == MVT::v2i1)) {
19336 if (Fast)
19337 *Fast = 1;
19338 return true;
19339 }
19340
19341 // These are for truncated stores/narrowing loads. They are fine so long as
19342 // the alignment is at least the size of the item being loaded
19343 if ((Ty == MVT::v4i8 || Ty == MVT::v8i8 || Ty == MVT::v4i16) &&
19344 Alignment >= VT.getScalarSizeInBits() / 8) {
19345 if (Fast)
19346 *Fast = true;
19347 return true;
19348 }
19349
19350 // In little-endian MVE, the store instructions VSTRB.U8, VSTRH.U16 and
19351 // VSTRW.U32 all store the vector register in exactly the same format, and
19352 // differ only in the range of their immediate offset field and the required
19353 // alignment. So there is always a store that can be used, regardless of
19354 // actual type.
19355 //
19356 // For big endian, that is not the case. But can still emit a (VSTRB.U8;
19357 // VREV64.8) pair and get the same effect. This will likely be better than
19358 // aligning the vector through the stack.
19359 if (Ty == MVT::v16i8 || Ty == MVT::v8i16 || Ty == MVT::v8f16 ||
19360 Ty == MVT::v4i32 || Ty == MVT::v4f32 || Ty == MVT::v2i64 ||
19361 Ty == MVT::v2f64) {
19362 if (Fast)
19363 *Fast = 1;
19364 return true;
19365 }
19366
19367 return false;
19368}
19369
19371 LLVMContext &Context, const MemOp &Op,
19372 const AttributeList &FuncAttributes) const {
// Picks a wide NEON type for memcpy / zero-memset operations when possible and
// otherwise returns MVT::Other to defer the choice.
19373 // See if we can use NEON instructions for this...
19374 if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
19375 !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
19376 unsigned Fast;
// Prefer v2f64 for operations of at least 16 bytes, then f64 for at least 8
// bytes. Each needs either natural alignment or a misaligned access that is
// reported as fast.
// NOTE(review): parts of both conditions are not visible in this listing.
19377 if (Op.size() >= 16 &&
19378 (Op.isAligned(Align(16)) ||
19379 (allowsMisalignedMemoryAccesses(MVT::v2f64, 0, Align(1),
19381 Fast))) {
19382 return MVT::v2f64;
19383 } else if (Op.size() >= 8 &&
19384 (Op.isAligned(Align(8)) ||
19386 MVT::f64, 0, Align(1), MachineMemOperand::MONone, &Fast) &&
19387 Fast))) {
19388 return MVT::f64;
19389 }
19390 }
19391
19392 // Let the target-independent logic figure it out.
19393 return MVT::Other;
19394}
19395
19396// 64-bit integers are split into their high and low parts and held in two
19397// different registers, so the trunc is free since the low register can just
19398// be used.
19399bool ARMTargetLowering::isTruncateFree(Type *SrcTy, Type *DstTy) const {
19400 if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
19401 return false;
19402 unsigned SrcBits = SrcTy->getPrimitiveSizeInBits();
19403 unsigned DestBits = DstTy->getPrimitiveSizeInBits();
19404 return (SrcBits == 64 && DestBits == 32);
19405}
19406
// Vectors and non-integer types are never free to truncate here; only the
// scalar 64-bit to 32-bit integer case is.
19408 if (SrcVT.isVector() || DstVT.isVector() || !SrcVT.isInteger() ||
19409 !DstVT.isInteger())
19410 return false;
19411 unsigned SrcBits = SrcVT.getSizeInBits();
19412 unsigned DestBits = DstVT.getSizeInBits();
19413 return (SrcBits == 64 && DestBits == 32);
19414}
19415
// Only values produced directly by a load are considered.
19417 if (Val.getOpcode() != ISD::LOAD)
19418 return false;
19419
// Both the loaded type and VT2 must be simple integer types.
19420 EVT VT1 = Val.getValueType();
19421 if (!VT1.isSimple() || !VT1.isInteger() ||
19422 !VT2.isSimple() || !VT2.isInteger())
19423 return false;
19424
19425 switch (VT1.getSimpleVT().SimpleTy) {
19426 default: break;
19427 case MVT::i1:
19428 case MVT::i8:
19429 case MVT::i16:
19430 // 8-bit and 16-bit loads implicitly zero-extend to 32-bits.
19431 return true;
19432 }
19433
19434 return false;
19435}
19436
// Non-simple types are never reported as free.
19438 if (!VT.isSimple())
19439 return false;
19440
19441 // There are quite a few FP16 instructions (e.g. VNMLA, VNMLS, etc.) that
19442 // negate values directly (fneg is free). So, we don't want to let the DAG
19443 // combiner rewrite fneg into xors and some other instructions. For f16 and
19444 // FullFP16 argument passing, some bitcast nodes may be introduced,
19445 // triggering this DAG combine rewrite, so we are avoiding that with this.
19446 switch (VT.getSimpleVT().SimpleTy) {
19447 default: break;
19448 case MVT::f16:
// f16 is the only type that can report true, and only with FullFP16.
19449 return Subtarget->hasFullFP16();
19450 }
19451
19452 return false;
19453}
19454
// Without MVE integer ops no type is suggested (nullptr).
19456 if (!Subtarget->hasMVEIntegerOps())
19457 return nullptr;
19458 Type *SVIType = SVI->getType();
19459 Type *ScalarType = SVIType->getScalarType();
19460
// Map float to i32 and half to i16; any other scalar type yields nullptr.
19461 if (ScalarType->isFloatTy())
19462 return Type::getInt32Ty(SVIType->getContext());
19463 if (ScalarType->isHalfTy())
19464 return Type::getInt16Ty(SVIType->getContext());
19465 return nullptr;
19466}
19467
19469 EVT VT = ExtVal.getValueType();
19470
// The extended type itself must be legal.
19471 if (!isTypeLegal(VT))
19472 return false;
19473
// Extending an expanding masked load is rejected.
19474 if (auto *Ld = dyn_cast<MaskedLoadSDNode>(ExtVal.getOperand(0))) {
19475 if (Ld->isExpandingLoad())
19476 return false;
19477 }
19478
// With MVE integer ops the answer is always yes from here on.
19479 if (Subtarget->hasMVEIntegerOps())
19480 return true;
19481
19482 // Don't create a loadext if we can fold the extension into a wide/long
19483 // instruction.
19484 // If there's more than one user instruction, the loadext is desirable no
19485 // matter what. There can be two uses by the same instruction.
19486 if (ExtVal->use_empty() ||
19487 !ExtVal->user_begin()->isOnlyUserOf(ExtVal.getNode()))
19488 return true;
19489
// A single user that is an add, sub, shl or VSHLIMM makes the loadext
// undesirable.
19490 SDNode *U = *ExtVal->user_begin();
19491 if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB ||
19492 U->getOpcode() == ISD::SHL || U->getOpcode() == ARMISD::VSHLIMM))
19493 return false;
19494
19495 return true;
19496}
19497
// Both types must be integers and Ty1 must map to a legal value type.
19499 if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
19500 return false;
19501
19502 if (!isTypeLegal(EVT::getEVT(Ty1)))
19503 return false;
19504
19505 assert(Ty1->getPrimitiveSizeInBits() <= 64 && "i128 is probably not a noop");
19506
19507 // Assuming the caller doesn't have a zeroext or signext return parameter,
19508 // truncation all the way down to i1 is valid.
19509 return true;
19510}
19511
19512/// isFMAFasterThanFMulAndFAdd - Return true if an FMA operation is faster
19513/// than a pair of fmul and fadd instructions. fmuladd intrinsics will be
19514/// expanded to FMAs when this method returns true, otherwise fmuladd is
19515/// expanded to fmul + fadd.
19516///
19517/// ARM supports both fused and unfused multiply-add operations; we already
19518/// lower a pair of fmul and fadd to the latter so it's not clear that there
19519/// would be a gain or that the gain would be worthwhile enough to risk
19520/// correctness bugs.
19521///
19522/// For MVE, we set this to true as it helps simplify the need for some
19523/// patterns (and we don't have the non-fused floating point instruction).
19524bool ARMTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
19525 EVT VT) const {
19526 if (Subtarget->useSoftFloat())
19527 return false;
19528
19529 if (!VT.isSimple())
19530 return false;
19531
19532 switch (VT.getSimpleVT().SimpleTy) {
19533 case MVT::v4f32:
19534 case MVT::v8f16:
19535 return Subtarget->hasMVEFloatOps();
19536 case MVT::f16:
19537 return Subtarget->useFPVFMx16();
19538 case MVT::f32:
19539 return Subtarget->useFPVFMx();
19540 case MVT::f64:
19541 return Subtarget->useFPVFMx64();
19542 default:
19543 break;
19544 }
19545
19546 return false;
19547}
19548
19549static bool isLegalT1AddressImmediate(int64_t V, EVT VT) {
19550 if (V < 0)
19551 return false;
19552
19553 unsigned Scale = 1;
19554 switch (VT.getSimpleVT().SimpleTy) {
19555 case MVT::i1:
19556 case MVT::i8:
19557 // Scale == 1;
19558 break;
19559 case MVT::i16:
19560 // Scale == 2;
19561 Scale = 2;
19562 break;
19563 default:
19564 // On thumb1 we load most things (i32, i64, floats, etc) with a LDR
19565 // Scale == 4;
19566 Scale = 4;
19567 break;
19568 }
19569
19570 if ((V & (Scale - 1)) != 0)
19571 return false;
19572 return isUInt<5>(V / Scale);
19573}
19574
19575static bool isLegalT2AddressImmediate(int64_t V, EVT VT,
19576 const ARMSubtarget *Subtarget) {
19577 if (!VT.isInteger() && !VT.isFloatingPoint())
19578 return false;
19579 if (VT.isVector() && Subtarget->hasNEON())
19580 return false;
19581 if (VT.isVector() && VT.isFloatingPoint() && Subtarget->hasMVEIntegerOps() &&
19582 !Subtarget->hasMVEFloatOps())
19583 return false;
19584
19585 bool IsNeg = false;
19586 if (V < 0) {
19587 IsNeg = true;
19588 V = -V;
19589 }
19590
19591 unsigned NumBytes = std::max((unsigned)VT.getSizeInBits() / 8, 1U);
19592
19593 // MVE: size * imm7
19594 if (VT.isVector() && Subtarget->hasMVEIntegerOps()) {
19595 switch (VT.getSimpleVT().getVectorElementType().SimpleTy) {
19596 case MVT::i32:
19597 case MVT::f32:
19598 return isShiftedUInt<7,2>(V);
19599 case MVT::i16:
19600 case MVT::f16:
19601 return isShiftedUInt<7,1>(V);
19602 case MVT::i8:
19603 return isUInt<7>(V);
19604 default:
19605 return false;
19606 }
19607 }
19608
19609 // half VLDR: 2 * imm8
19610 if (VT.isFloatingPoint() && NumBytes == 2 && Subtarget->hasFPRegs16())
19611 return isShiftedUInt<8, 1>(V);
19612 // VLDR and LDRD: 4 * imm8
19613 if ((VT.isFloatingPoint() && Subtarget->hasVFP2Base()) || NumBytes == 8)
19614 return isShiftedUInt<8, 2>(V);
19615
19616 if (NumBytes == 1 || NumBytes == 2 || NumBytes == 4) {
19617 // + imm12 or - imm8
19618 if (IsNeg)
19619 return isUInt<8>(V);
19620 return isUInt<12>(V);
19621 }
19622
19623 return false;
19624}
19625
19626/// isLegalAddressImmediate - Return true if the integer value can be used
19627/// as the offset of the target addressing mode for load / store of the
19628/// given type.
19629static bool isLegalAddressImmediate(int64_t V, EVT VT,
19630 const ARMSubtarget *Subtarget) {
19631 if (V == 0)
19632 return true;
19633
19634 if (!VT.isSimple())
19635 return false;
19636
19637 if (Subtarget->isThumb1Only())
19638 return isLegalT1AddressImmediate(V, VT);
19639 else if (Subtarget->isThumb2())
19640 return isLegalT2AddressImmediate(V, VT, Subtarget);
19641
19642 // ARM mode.
19643 if (V < 0)
19644 V = - V;
19645 switch (VT.getSimpleVT().SimpleTy) {
19646 default: return false;
19647 case MVT::i1:
19648 case MVT::i8:
19649 case MVT::i32:
19650 // +- imm12
19651 return isUInt<12>(V);
19652 case MVT::i16:
19653 // +- imm8
19654 return isUInt<8>(V);
19655 case MVT::f32:
19656 case MVT::f64:
19657 if (!Subtarget->hasVFP2Base()) // FIXME: NEON?
19658 return false;
19659 return isShiftedUInt<8, 2>(V);
19660 }
19661}
19662
19664 EVT VT) const {
// Decides whether the scale in AM.Scale is acceptable for an access of type
// VT. Negative scales are always rejected.
19665 int Scale = AM.Scale;
19666 if (Scale < 0)
19667 return false;
19668
19669 switch (VT.getSimpleVT().SimpleTy) {
19670 default: return false;
19671 case MVT::i1:
19672 case MVT::i8:
19673 case MVT::i16:
19674 case MVT::i32:
19675 if (Scale == 1)
19676 return true;
19677 // r + r << imm
// The low bit is cleared before the comparison, so odd scales 3, 5 and 9 are
// accepted alongside 2, 4 and 8.
19678 Scale = Scale & ~1;
19679 return Scale == 2 || Scale == 4 || Scale == 8;
19680 case MVT::i64:
19681 // FIXME: What are we trying to model here? ldrd doesn't have an r + r
19682 // version in Thumb mode.
19683 // r + r
19684 if (Scale == 1)
19685 return true;
19686 // r * 2 (this can be lowered to r + r).
19687 if (!AM.HasBaseReg && Scale == 2)
19688 return true;
19689 return false;
19690 case MVT::isVoid:
19691 // Note, we allow "void" uses (basically, uses that aren't loads or
19692 // stores), because arm allows folding a scale into many arithmetic
19693 // operations. This should be made more precise and revisited later.
19694
19695 // Allow r << imm, but the imm has to be a multiple of two.
19696 if (Scale & 1) return false;
19697 return isPowerOf2_32(Scale);
19698 }
19699}
19700
19702 EVT VT) const {
// Decides whether the scale in AM.Scale is acceptable on Thumb1. VT is not
// consulted by the visible code.
19703 const int Scale = AM.Scale;
19704
19705 // Negative scales are not supported in Thumb1.
19706 if (Scale < 0)
19707 return false;
19708
19709 // Thumb1 addressing modes do not support register scaling excepting the
19710 // following cases:
19711 // 1. Scale == 1 means no scaling.
19712 // 2. Scale == 2 this can be lowered to r + r if there is no base register.
19713 return (Scale == 1) || (!AM.HasBaseReg && Scale == 2);
19714}
19715
19716/// isLegalAddressingMode - Return true if the addressing mode represented
19717/// by AM is legal for this target, for a load/store of the specified type.
19719 const AddrMode &AM, Type *Ty,
19720 unsigned AS, Instruction *I) const {
// The immediate offset is checked first, for every form of AM.
19721 EVT VT = getValueType(DL, Ty, true);
19722 if (!isLegalAddressImmediate(AM.BaseOffs, VT, Subtarget))
19723 return false;
19724
19725 // Can never fold addr of global into load/store.
19726 if (AM.BaseGV)
19727 return false;
19728
19729 switch (AM.Scale) {
19730 case 0: // no scale reg, must be "r+i" or "r", or "i".
19731 break;
19732 default:
19733 // ARM doesn't support any R+R*scale+imm addr modes.
19734 if (AM.BaseOffs)
19735 return false;
19736
19737 if (!VT.isSimple())
19738 return false;
19739
// Thumb1 and Thumb2 have their own scaled-addressing rules.
19740 if (Subtarget->isThumb1Only())
19741 return isLegalT1ScaledAddressingMode(AM, VT);
19742
19743 if (Subtarget->isThumb2())
19744 return isLegalT2ScaledAddressingMode(AM, VT);
19745
// ARM mode from here on.
19746 int Scale = AM.Scale;
19747 switch (VT.getSimpleVT().SimpleTy) {
19748 default: return false;
19749 case MVT::i1:
19750 case MVT::i8:
19751 case MVT::i32:
// Only the magnitude of the scale matters for these types.
19752 if (Scale < 0) Scale = -Scale;
19753 if (Scale == 1)
19754 return true;
19755 // r + r << imm
19756 return isPowerOf2_32(Scale & ~1);
19757 case MVT::i16:
19758 case MVT::i64:
19759 // r +/- r
19760 if (Scale == 1 || (AM.HasBaseReg && Scale == -1))
19761 return true;
19762 // r * 2 (this can be lowered to r + r).
19763 if (!AM.HasBaseReg && Scale == 2)
19764 return true;
19765 return false;
19766
19767 case MVT::isVoid:
19768 // Note, we allow "void" uses (basically, uses that aren't loads or
19769 // stores), because arm allows folding a scale into many arithmetic
19770 // operations. This should be made more precise and revisited later.
19771
19772 // Allow r << imm, but the imm has to be a multiple of two.
19773 if (Scale & 1) return false;
19774 return isPowerOf2_32(Scale);
19775 }
19776 }
19777 return true;
19778}
19779
19780/// isLegalICmpImmediate - Return true if the specified immediate is legal
19781/// icmp immediate, that is the target has icmp instructions which can compare
19782/// a register against the immediate without having to materialize the
19783/// immediate into a register.
19785 // Thumb2 and ARM modes can use cmn for negative immediates.
// In those modes the immediate is accepted if either it or its 32-bit negation
// is encodable.
19786 if (!Subtarget->isThumb())
19787 return ARM_AM::getSOImmVal((uint32_t)Imm) != -1 ||
19788 ARM_AM::getSOImmVal(-(uint32_t)Imm) != -1;
19789 if (Subtarget->isThumb2())
19790 return ARM_AM::getT2SOImmVal((uint32_t)Imm) != -1 ||
19791 ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
19792 // Thumb1 doesn't have cmn, and only 8-bit immediates.
19793 return Imm >= 0 && Imm <= 255;
19794}
19795
19796/// isLegalAddImmediate - Return true if the specified immediate is a legal add
19797/// *or sub* immediate, that is the target has add or sub instructions which can
19798/// add a register with the immediate without having to materialize the
19799/// immediate into a register.
19801 // Same encoding for add/sub, just flip the sign.
// Only the magnitude of the immediate is tested below.
19802 uint64_t AbsImm = AbsoluteValue(Imm);
19803 if (!Subtarget->isThumb())
19804 return ARM_AM::getSOImmVal(AbsImm) != -1;
19805 if (Subtarget->isThumb2())
19806 return ARM_AM::getT2SOImmVal(AbsImm) != -1;
19807 // Thumb1 only has 8-bit unsigned immediate.
19808 return AbsImm <= 255;
19809}
19810
19811// Return false to prevent folding
19812// (mul (add r, c0), c1) -> (add (mul r, c1), c0*c1) in DAGCombine,
19813// if the folding leads to worse code.
19815 SDValue ConstNode) const {
19816 // Let the DAGCombiner decide for vector types and large types.
19817 const EVT VT = AddNode.getValueType();
19818 if (VT.isVector() || VT.getScalarSizeInBits() > 32)
19819 return true;
19820
19821 // It is worse if c0 is legal add immediate, while c1*c0 is not
19822 // and has to be composed by at least two instructions.
// c0 is operand 1 of AddNode and c1 is ConstNode; both are cast to
// ConstantSDNode unconditionally. CA is their product, c0 * c1.
19823 const ConstantSDNode *C0Node = cast<ConstantSDNode>(AddNode.getOperand(1));
19824 const ConstantSDNode *C1Node = cast<ConstantSDNode>(ConstNode);
19825 const int64_t C0 = C0Node->getSExtValue();
19826 APInt CA = C0Node->getAPIntValue() * C1Node->getAPIntValue();
// NOTE(review): the condition guarding the next return is not visible in this
// listing.
19828 return true;
// Reject the fold when materializing the product costs more than 1.
19829 if (ConstantMaterializationCost((unsigned)CA.getZExtValue(), Subtarget) > 1)
19830 return false;
19831
19832 // Default to true and let the DAGCombiner decide.
19833 return true;
19834}
19835
// NOTE(review): listing line 19836 is absent from this extract; it presumably
// held the start of this helper's signature (declaring `Ptr` and `VT`). From
// the call sites elsewhere in this file this looks like
// getARMIndexedAddressParts -- confirm against the original file.
//
// Splits an ADD/SUB address node into Base and Offset for an indexed load or
// store, and reports through isInc whether the offset is added or subtracted.
// Returns false when the node is not an ADD/SUB or VT is not handled.
19837 bool isSEXTLoad, SDValue &Base,
19838 SDValue &Offset, bool &isInc,
19839 SelectionDAG &DAG) {
19840 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19841 return false;
19842
19843 if (VT == MVT::i16 || ((VT == MVT::i8 || VT == MVT::i1) && isSEXTLoad)) {
19844 // AddressingMode 3
19845 Base = Ptr->getOperand(0);
// NOTE(review): listing line 19846 is absent; it presumably introduced `RHS`
// as the constant second operand of Ptr and opened the scope closed below.
19847 int RHSC = (int)RHS->getZExtValue();
// A small negative constant (-255..-1) is turned into a decrement by the
// negated, positive amount.
19848 if (RHSC < 0 && RHSC > -256) {
19849 assert(Ptr->getOpcode() == ISD::ADD);
19850 isInc = false;
19851 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19852 return true;
19853 }
19854 }
// Otherwise use the second operand unchanged as the offset.
19855 isInc = (Ptr->getOpcode() == ISD::ADD);
19856 Offset = Ptr->getOperand(1);
19857 return true;
19858 } else if (VT == MVT::i32 || VT == MVT::i8 || VT == MVT::i1) {
19859 // AddressingMode 2
// NOTE(review): listing line 19860 is absent; it presumably introduced `RHS`
// in the same way as in the branch above.
19861 int RHSC = (int)RHS->getZExtValue();
// Same negation trick as above, with a wider range (-0xFFF..-1).
19862 if (RHSC < 0 && RHSC > -0x1000) {
19863 assert(Ptr->getOpcode() == ISD::ADD);
19864 isInc = false;
19865 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19866 Base = Ptr->getOperand(0);
19867 return true;
19868 }
19869 }
19870
19871 if (Ptr->getOpcode() == ISD::ADD) {
19872 isInc = true;
19873 ARM_AM::ShiftOpc ShOpcVal=
// NOTE(review): listing line 19874 is absent; it presumably held the
// initializer of ShOpcVal. Judging by the swap below it classifies operand 0
// as a shift or not -- confirm against the original file.
// When a shift is detected the operands are swapped, so operand 0 becomes the
// offset and operand 1 the base.
19875 if (ShOpcVal != ARM_AM::no_shift) {
19876 Base = Ptr->getOperand(1);
19877 Offset = Ptr->getOperand(0);
19878 } else {
19879 Base = Ptr->getOperand(0);
19880 Offset = Ptr->getOperand(1);
19881 }
19882 return true;
19883 }
19884
// Only ISD::SUB can reach this point, so isInc evaluates to false here.
19885 isInc = (Ptr->getOpcode() == ISD::ADD);
19886 Base = Ptr->getOperand(0);
19887 Offset = Ptr->getOperand(1);
19888 return true;
19889 }
19890
19891 // FIXME: Use VLDM / VSTM to emulate indexed FP load / store.
19892 return false;
19893}
19894
// NOTE(review): listing line 19895 is absent from this extract; it presumably
// held the start of this helper's signature (declaring `Ptr` and `VT`). From
// the call sites elsewhere in this file this looks like
// getT2IndexedAddressParts -- confirm against the original file.
//
// Accepts only an ADD/SUB whose offset is a nonzero constant with magnitude
// below 0x100; Offset is always produced as a positive constant and isInc
// carries the direction.
19896 bool isSEXTLoad, SDValue &Base,
19897 SDValue &Offset, bool &isInc,
19898 SelectionDAG &DAG) {
19899 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19900 return false;
19901
19902 Base = Ptr->getOperand(0);
// NOTE(review): listing line 19903 is absent; it presumably introduced `RHS`
// as the constant second operand of Ptr and opened the scope closed below.
19904 int RHSC = (int)RHS->getZExtValue();
19905 if (RHSC < 0 && RHSC > -0x100) { // 8 bits.
19906 assert(Ptr->getOpcode() == ISD::ADD);
19907 isInc = false;
19908 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19909 return true;
19910 } else if (RHSC > 0 && RHSC < 0x100) { // 8 bit, no zero.
19911 isInc = Ptr->getOpcode() == ISD::ADD;
19912 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19913 return true;
19914 }
19915 }
19916
// Note that Base has already been overwritten on this failure path.
19917 return false;
19918}
19919
// Splits an ADD/SUB-by-constant address into Base and Offset for an MVE
// vector load or store. The accepted offset range and scale depend on VT,
// Alignment, and whether the access type may be changed (little-endian and
// unmasked). Returns false if no form matches.
19920static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
19921 bool isSEXTLoad, bool IsMasked, bool isLE,
// NOTE(review): listing line 19922 is absent from this extract; it presumably
// declared the `Base` and `Offset` output parameters used below.
19923 bool &isInc, SelectionDAG &DAG) {
19924 if (Ptr->getOpcode() != ISD::ADD && Ptr->getOpcode() != ISD::SUB)
19925 return false;
19926 if (!isa<ConstantSDNode>(Ptr->getOperand(1)))
19927 return false;
19928
19929 // We allow LE non-masked loads to change the type (for example use a vldrb.8
19930 // as opposed to a vldrw.32). This can allow extra addressing modes or
19931 // alignments for what is otherwise an equivalent instruction.
19932 bool CanChangeType = isLE && !IsMasked;
19933
// NOTE(review): listing line 19934 is absent; it presumably introduced `RHS`
// as the constant second operand of Ptr (already checked with isa<> above).
19935 int RHSC = (int)RHS->getZExtValue();
19936
// IsInRange accepts a nonzero offset that is a multiple of Scale and whose
// magnitude is strictly below Limit * Scale. On success it sets isInc and
// Offset (always a positive constant) as a side effect.
19937 auto IsInRange = [&](int RHSC, int Limit, int Scale) {
19938 if (RHSC < 0 && RHSC > -Limit * Scale && RHSC % Scale == 0) {
19939 assert(Ptr->getOpcode() == ISD::ADD);
19940 isInc = false;
19941 Offset = DAG.getConstant(-RHSC, SDLoc(Ptr), RHS->getValueType(0));
19942 return true;
19943 } else if (RHSC > 0 && RHSC < Limit * Scale && RHSC % Scale == 0) {
19944 isInc = Ptr->getOpcode() == ISD::ADD;
19945 Offset = DAG.getConstant(RHSC, SDLoc(Ptr), RHS->getValueType(0));
19946 return true;
19947 }
19948 return false;
19949 };
19950
19951 // Try to find a matching instruction based on s/zext, Alignment, Offset and
19952 // (in BE/masked) type.
19953 Base = Ptr->getOperand(0);
// The narrow vector types (v4i16, v4i8, v8i8) are matched by exact type only.
19954 if (VT == MVT::v4i16) {
19955 if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
19956 return true;
19957 } else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
19958 if (IsInRange(RHSC, 0x80, 1))
19959 return true;
// Remaining types: try scale 4, then 2, then 1. A scale is eligible either
// because VT matches that element width or because CanChangeType is set.
19960 } else if (Alignment >= 4 &&
19961 (CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
19962 IsInRange(RHSC, 0x80, 4))
19963 return true;
19964 else if (Alignment >= 2 &&
19965 (CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
19966 IsInRange(RHSC, 0x80, 2))
19967 return true;
19968 else if ((CanChangeType || VT == MVT::v16i8) && IsInRange(RHSC, 0x80, 1))
19969 return true;
19970 return false;
19971}
19972
19973/// getPreIndexedAddressParts - returns true by value, base pointer and
19974/// offset pointer and addressing mode by reference if the node's address
19975/// can be legally represented as pre-indexed load / store address.
19976bool
// NOTE(review): listing lines 19977 and 19979 are absent from this extract;
// they presumably held the function name with the `N` and `Base` parameters,
// and the `AM` parameter, respectively. Confirm against the original file.
19978 SDValue &Offset,
19980 SelectionDAG &DAG) const {
// Pre-indexed forms are never produced for Thumb1-only subtargets.
19981 if (Subtarget->isThumb1Only())
19982 return false;
19983
19984 EVT VT;
19985 SDValue Ptr;
19986 Align Alignment;
19987 unsigned AS = 0;
19988 bool isSEXTLoad = false;
19989 bool IsMasked = false;
// Collect pointer, memory type, alignment and address space from whichever
// memory node kind N is; any other node kind is rejected.
19990 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
19991 Ptr = LD->getBasePtr();
19992 VT = LD->getMemoryVT();
19993 Alignment = LD->getAlign();
19994 AS = LD->getAddressSpace();
19995 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
19996 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
19997 Ptr = ST->getBasePtr();
19998 VT = ST->getMemoryVT();
19999 Alignment = ST->getAlign();
20000 AS = ST->getAddressSpace();
20001 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
20002 Ptr = LD->getBasePtr();
20003 VT = LD->getMemoryVT();
20004 Alignment = LD->getAlign();
20005 AS = LD->getAddressSpace();
20006 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
20007 IsMasked = true;
// NOTE(review): listing line 20008 is absent; it presumably opened the branch
// for a masked store and introduced `ST` (this branch sets IsMasked and reads
// no extension type).
20009 Ptr = ST->getBasePtr();
20010 VT = ST->getMemoryVT();
20011 Alignment = ST->getAlign();
20012 AS = ST->getAddressSpace();
20013 IsMasked = true;
20014 } else
20015 return false;
20016
20017 unsigned Fast = 0;
20018 if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment,
// NOTE(review): listing line 20019 is absent; it presumably held the remaining
// arguments of the call above (likely including `Fast`) and opened the block
// that is closed below.
20020 // Only generate post-increment or pre-increment forms when a real
20021 // hardware instruction exists for them. Do not emit postinc/preinc
20022 // if the operation will end up as a libcall.
20023 return false;
20024 }
20025
20026 bool isInc;
20027 bool isLegal = false;
// Vector accesses need MVE integer ops; scalar accesses use the Thumb2 or the
// ARM helper depending on the subtarget.
20028 if (VT.isVector())
20029 isLegal = Subtarget->hasMVEIntegerOps() &&
// NOTE(review): listing line 20030 is absent; it presumably named the helper
// being called with the arguments below (the post-indexed counterpart in this
// file calls getMVEIndexedAddressParts with the same argument shape).
20031 Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
20032 Subtarget->isLittle(), Base, Offset, isInc, DAG);
20033 else {
20034 if (Subtarget->isThumb2())
20035 isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
20036 Offset, isInc, DAG);
20037 else
20038 isLegal = getARMIndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
20039 Offset, isInc, DAG);
20040 }
20041 if (!isLegal)
20042 return false;
20043
20044 AM = isInc ? ISD::PRE_INC : ISD::PRE_DEC;
20045 return true;
20046}
20047
20048/// getPostIndexedAddressParts - returns true by value, base pointer and
20049/// offset pointer and addressing mode by reference if this node can be
20050/// combined with a load / store to form a post-indexed load / store.
// NOTE(review): listing lines 20051 and 20054 are absent from this extract;
// they presumably held the function name with the `N` and `Op` parameters,
// and the `AM` parameter, respectively. Confirm against the original file.
20052 SDValue &Base,
20053 SDValue &Offset,
20055 SelectionDAG &DAG) const {
20056 EVT VT;
20057 SDValue Ptr;
20058 Align Alignment;
// isNonExt has no initializer; each accepted node kind below assigns it
// before it is read.
20059 bool isSEXTLoad = false, isNonExt;
20060 bool IsMasked = false;
20061 if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
20062 VT = LD->getMemoryVT();
20063 Ptr = LD->getBasePtr();
20064 Alignment = LD->getAlign();
20065 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
20066 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
20067 } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
20068 VT = ST->getMemoryVT();
20069 Ptr = ST->getBasePtr();
20070 Alignment = ST->getAlign();
20071 isNonExt = !ST->isTruncatingStore();
20072 } else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
20073 VT = LD->getMemoryVT();
20074 Ptr = LD->getBasePtr();
20075 Alignment = LD->getAlign();
20076 isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
20077 isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
20078 IsMasked = true;
// NOTE(review): listing line 20079 is absent; it presumably opened the branch
// for a masked store and introduced `ST`.
20080 VT = ST->getMemoryVT();
20081 Ptr = ST->getBasePtr();
20082 Alignment = ST->getAlign();
20083 isNonExt = !ST->isTruncatingStore();
20084 IsMasked = true;
20085 } else
20086 return false;
20087
20088 if (Subtarget->isThumb1Only()) {
20089 // Thumb-1 can do a limited post-inc load or store as an updating LDM. It
20090 // must be non-extending/truncating, i32, with an offset of 4.
20091 assert(Op->getValueType(0) == MVT::i32 && "Non-i32 post-inc op?!");
20092 if (Op->getOpcode() != ISD::ADD || !isNonExt)
20093 return false;
20094 auto *RHS = dyn_cast<ConstantSDNode>(Op->getOperand(1));
20095 if (!RHS || RHS->getZExtValue() != 4)
20096 return false;
// The access must also be at least 4-byte aligned.
20097 if (Alignment < Align(4))
20098 return false;
20099
20100 Offset = Op->getOperand(1);
20101 Base = Op->getOperand(0);
20102 AM = ISD::POST_INC;
20103 return true;
20104 }
20105
20106 bool isInc;
20107 bool isLegal = false;
// Here the increment node Op itself is decomposed, not the memory node's own
// base pointer.
20108 if (VT.isVector())
20109 isLegal = Subtarget->hasMVEIntegerOps() &&
20110 getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
20111 Subtarget->isLittle(), Base, Offset,
20112 isInc, DAG);
20113 else {
20114 if (Subtarget->isThumb2())
20115 isLegal = getT2IndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
20116 isInc, DAG);
20117 else
20118 isLegal = getARMIndexedAddressParts(Op, VT, isSEXTLoad, Base, Offset,
20119 isInc, DAG);
20120 }
20121 if (!isLegal)
20122 return false;
20123
20124 if (Ptr != Base) {
20125 // Swap base ptr and offset to catch more post-index load / store when
20126 // it's legal. In Thumb2 mode, offset must be an immediate.
20127 if (Ptr == Offset && Op->getOpcode() == ISD::ADD &&
20128 !Subtarget->isThumb2())
// NOTE(review): listing line 20129 is absent; per the comment above it
// presumably held the statement that swaps Base and Offset.
20130
20131 // Post-indexed load / store update the base pointer.
20132 if (Ptr != Base)
20133 return false;
20134 }
20135
20136 AM = isInc ? ISD::POST_INC : ISD::POST_DEC;
20137 return true;
20138}
20139
// NOTE(review): listing line 20140 is absent from this extract; it presumably
// held the start of the signature (function name and the `Op` parameter).
// Judging by the parameters this is the target hook that computes known bits
// for ARM-specific nodes -- confirm against the original file.
//
// Fills Known with the bits of Op that are provably zero or one. Opcodes not
// listed in the switch leave Known fully unknown.
20141 KnownBits &Known,
20142 const APInt &DemandedElts,
20143 const SelectionDAG &DAG,
20144 unsigned Depth) const {
20145 unsigned BitWidth = Known.getBitWidth();
// Start from "nothing known"; each case below only adds information.
20146 Known.resetAll();
20147 switch (Op.getOpcode()) {
20148 default: break;
20149 case ARMISD::ADDC:
20150 case ARMISD::ADDE:
20151 case ARMISD::SUBC:
20152 case ARMISD::SUBE:
20153 // Special cases when we convert a carry to a boolean.
20154 if (Op.getResNo() == 0) {
20155 SDValue LHS = Op.getOperand(0);
20156 SDValue RHS = Op.getOperand(1);
20157 // (ADDE 0, 0, C) will give us a single bit.
20158 if (Op->getOpcode() == ARMISD::ADDE && isNullConstant(LHS) &&
20159 isNullConstant(RHS)) {
// NOTE(review): listing line 20160 is absent; per the comment above it
// presumably marked every bit except the lowest as known zero.
20161 return;
20162 }
20163 }
20164 break;
20165 case ARMISD::CMOV: {
20166 // Bits are known zero/one if known on the LHS and RHS.
20167 Known = DAG.computeKnownBits(Op.getOperand(0), Depth+1);
// If nothing is known about the first operand the intersection cannot
// contribute anything, so the second query is skipped.
20168 if (Known.isUnknown())
20169 return;
20170
20171 KnownBits KnownRHS = DAG.computeKnownBits(Op.getOperand(1), Depth+1);
20172 Known = Known.intersectWith(KnownRHS);
20173 return;
20174 }
// NOTE(review): listing line 20175 is absent; it presumably held the `case`
// label (and opening brace) for the intrinsic node handled below.
20176 Intrinsic::ID IntID =
20177 static_cast<Intrinsic::ID>(Op->getConstantOperandVal(1));
20178 switch (IntID) {
20179 default: return;
20180 case Intrinsic::arm_ldaex:
20181 case Intrinsic::arm_ldrex: {
// Bits above the memory type's scalar width are marked as known zero.
20182 EVT VT = cast<MemIntrinsicSDNode>(Op)->getMemoryVT();
20183 unsigned MemBits = VT.getScalarSizeInBits();
20184 Known.Zero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
20185 return;
20186 }
20187 }
// Every path through the inner switch returns, so control never falls through
// into the next case label.
20188 }
20189 case ARMISD::BFI: {
20190 // Conservatively, we can recurse down the first operand
20191 // and just mask out all affected bits.
20192 Known = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20193
20194 // The operand to BFI is already a mask suitable for removing the bits it
20195 // sets.
20196 const APInt &Mask = Op.getConstantOperandAPInt(2);
20197 Known.Zero &= Mask;
20198 Known.One &= Mask;
20199 return;
20200 }
20201 case ARMISD::VGETLANEs:
20202 case ARMISD::VGETLANEu: {
20203 const SDValue &SrcSV = Op.getOperand(0);
20204 EVT VecVT = SrcSV.getValueType();
20205 assert(VecVT.isVector() && "VGETLANE expected a vector type");
20206 const unsigned NumSrcElts = VecVT.getVectorNumElements();
20207 ConstantSDNode *Pos = cast<ConstantSDNode>(Op.getOperand(1).getNode());
20208 assert(Pos->getAPIntValue().ult(NumSrcElts) &&
20209 "VGETLANE index out of bounds");
20210 unsigned Idx = Pos->getZExtValue();
// Query only the one source lane that is being extracted.
20211 APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
20212 Known = DAG.computeKnownBits(SrcSV, DemandedElt, Depth + 1);
20213
20214 EVT VT = Op.getValueType();
20215 const unsigned DstSz = VT.getScalarSizeInBits();
20216 const unsigned SrcSz = VecVT.getVectorElementType().getSizeInBits();
// SrcSz is only read by the asserts; the void cast keeps it "used" when the
// asserts are compiled out.
20217 (void)SrcSz;
20218 assert(SrcSz == Known.getBitWidth());
20219 assert(DstSz > SrcSz);
// Widen the lane's known bits to the result width, sign- or zero-extending to
// match the opcode.
20220 if (Op.getOpcode() == ARMISD::VGETLANEs)
20221 Known = Known.sext(DstSz);
20222 else {
20223 Known = Known.zext(DstSz);
20224 }
20225 assert(DstSz == Known.getBitWidth());
20226 break;
20227 }
20228 case ARMISD::VMOVrh: {
// The 16-bit operand's known bits are zero-extended to 32 bits.
20229 KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20230 assert(KnownOp.getBitWidth() == 16);
20231 Known = KnownOp.zext(32);
20232 break;
20233 }
20234 case ARMISD::CSINC:
20235 case ARMISD::CSINV:
20236 case ARMISD::CSNEG: {
20237 KnownBits KnownOp0 = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
20238 KnownBits KnownOp1 = DAG.computeKnownBits(Op->getOperand(1), Depth + 1);
20239
20240 // The result is either:
20241 // CSINC: KnownOp0 or KnownOp1 + 1
20242 // CSINV: KnownOp0 or ~KnownOp1
20243 // CSNEG: KnownOp0 or KnownOp1 * -1
20244 if (Op.getOpcode() == ARMISD::CSINC)
20245 KnownOp1 =
20246 KnownBits::add(KnownOp1, KnownBits::makeConstant(APInt(32, 1)));
20247 else if (Op.getOpcode() == ARMISD::CSINV)
// Swapping the Zero and One sets models a bitwise NOT.
20248 std::swap(KnownOp1.Zero, KnownOp1.One);
20249 else if (Op.getOpcode() == ARMISD::CSNEG)
20250 KnownOp1 = KnownBits::mul(KnownOp1,
// NOTE(review): listing line 20251 is absent; per the comment above it
// presumably supplied the constant -1 as the second factor and closed the
// call.
20252
// Either input may be selected, so keep only what both agree on.
20253 Known = KnownOp0.intersectWith(KnownOp1);
20254 break;
20255 }
20256 case ARMISD::VORRIMM:
20257 case ARMISD::VBICIMM: {
// Operand 1 is an encoded immediate; decoding yields the value and the
// element width it applies to.
20258 unsigned Encoded = Op.getConstantOperandVal(1);
20259 unsigned DecEltBits = 0;
20260 uint64_t DecodedVal = ARM_AM::decodeVMOVModImm(Encoded, DecEltBits);
20261
20262 unsigned EltBits = Op.getScalarValueSizeInBits();
20263 if (EltBits != DecEltBits) {
20264 // Be conservative: only update Known when EltBits == DecEltBits.
20265 // This is believed to always be true for VORRIMM/VBICIMM today, but if
20266 // that changes in the future, doing nothing here is safer than risking
20267 // subtle bugs.
20268 break;
20269 }
20270
20271 KnownBits KnownLHS = DAG.computeKnownBits(Op.getOperand(0), Depth + 1);
20272 bool IsVORR = Op.getOpcode() == ARMISD::VORRIMM;
20273 APInt Imm(DecEltBits, DecodedVal);
20274
// VORRIMM: immediate bits become known one. VBICIMM: immediate bits become
// known zero. All other bits keep what is known about operand 0.
20275 Known.One = IsVORR ? (KnownLHS.One | Imm) : (KnownLHS.One & ~Imm);
20276 Known.Zero = IsVORR ? (KnownLHS.Zero & ~Imm) : (KnownLHS.Zero | Imm);
20277 break;
20278 }
20279 }
20280}
20281
20282static bool isLegalLogicalImmediate(unsigned Imm,
20283 const ARMSubtarget *Subtarget) {
20284 if (!Subtarget->isThumb())
20285 return ARM_AM::getSOImmVal(Imm) != -1;
20286 if (Subtarget->isThumb2())
20287 return ARM_AM::getT2SOImmVal(Imm) != -1;
20288 // Thumb1 only has 8-bit unsigned immediate.
20289 return Imm <= 255;
20290}
20291
20292/// Refine i32 AND/OR/XOR with a constant RHS using demanded bits: replace the
20293/// immediate with an equivalent constant that ARM/Thumb can encode as a
20294/// logical immediate (or that selects better lowering), without changing the
20295/// computed result on those demanded bits.
///
/// Returns true when a replacement candidate was chosen, including the case
/// where the candidate equals the current immediate and nothing is rewritten;
/// returns false when the node is left alone.
20296static bool optimizeLogicalImm(SDValue Op, unsigned Imm,
20297 const APInt &DemandedBits,
20298 const ARMSubtarget *Subtarget,
// NOTE(review): listing line 20299 is absent from this extract; it presumably
// declared the `TLO` parameter used below and opened the function body.
20300
// An all-zero or all-one immediate is never rewritten here.
20301 if (Imm == 0 || Imm == ~0U)
20302 return false;
20303
20304 unsigned Opc = Op.getOpcode();
20305 unsigned Demanded = DemandedBits.getZExtValue();
20306 EVT VT = Op.getValueType();
20307
// ShrunkImm clears every undemanded bit; ExpandedImm sets every undemanded
// bit. Together they bound the set of acceptable replacement constants.
20308 unsigned ShrunkImm = Imm & Demanded;
20309 unsigned ExpandedImm = Imm | ~Demanded;
20310
// A candidate is acceptable when it contains every bit of ShrunkImm and no
// bit outside ExpandedImm.
20311 auto IsLegalImm = [ShrunkImm, ExpandedImm](unsigned CandidateImm) -> bool {
20312 return (ShrunkImm & CandidateImm) == ShrunkImm &&
20313 (~ExpandedImm & CandidateImm) == 0;
20314 };
// UseImm rebuilds the node with NewImm as the constant operand. It returns
// true without touching the DAG when NewImm is already the current immediate.
20315 auto UseImm = [Imm, Opc, Op, VT, &TLO](unsigned NewImm) -> bool {
20316 if (NewImm == Imm)
20317 return true;
20318 SDLoc DL(Op);
20319 SDValue NewC = TLO.DAG.getConstant(NewImm, DL, VT);
20320 SDValue NewOp =
20321 TLO.DAG.getNode(Opc, DL, VT, Op.getOperand(0), NewC, Op->getFlags());
20322 return TLO.CombineTo(Op, NewOp);
20323 };
20324
20325 // Shrunk immediate is 0: AND becomes zero; OR/XOR with 0 leaves the other
20326 // operand (still valid on demanded bits).
20327 if (ShrunkImm == 0) {
20328 ++NumOptimizedImms;
20329 return UseImm(ShrunkImm);
20330 }
20331
20332 // If the immediate is all ones: for AND this removes the operation; for
20333 // OR/XOR it remains a transform valid on demanded bits. (Target-independent
20334 // shrink may not fold this, so keep it to avoid obscure combine loops.)
20335 if (ExpandedImm == ~0U) {
20336 ++NumOptimizedImms;
20337 return UseImm(ExpandedImm);
20338 }
20339
20340 // Thumb1: prefer 0xFF / 0xFFFF when they fit the demanded-bit envelope so
20341 // lowering can match uxtb / uxth (AND immediates only; OR/XOR do not use
20342 // that). Run this before strict ShrunkImm: a tight 8-bit ShrunkImm can be
20343 // legal while 0xFF still matches the envelope and yields better isel (uxtb).
// NOTE(review): the comment above says "Thumb1", but the guard below only
// tests the opcode and hasV6Ops(); confirm whether a Thumb1 check is intended.
20344 if (Opc == ISD::AND && Subtarget->hasV6Ops()) {
20345 if (IsLegalImm(0xFF)) {
20346 ++NumOptimizedImms;
20347 return UseImm(0xFF);
20348 }
20349
20350 if (IsLegalImm(0xFFFF)) {
20351 ++NumOptimizedImms;
20352 return UseImm(0xFFFF);
20353 }
20354 }
20355
20356 // Don't optimize if it is legal.
20357 if (isLegalLogicalImmediate(Imm, Subtarget))
20358 return false;
20359
20360 // FIXME: Check for BIC being legal causes infinite loop due to target
20361 // independent DAG combine undoing this.
20362
20363 // Prefer strict shrink when ShrunkImm encodes for this target, before
20364 // complement expansion.
20365 if (isLegalLogicalImmediate(ShrunkImm, Subtarget)) {
20366 ++NumOptimizedImms;
20367 return UseImm(ShrunkImm);
20368 }
20369
20370 // Complement expansion: if all undemanded bits are already one, ExpandedImm
20371 // is Imm with every non-demanded bit set. When (~ExpandedImm) < 256, the
20372 // complement fits in an 8-bit unsigned value, i.e. bits 8-31 of ExpandedImm
20373 // are all ones; only the low byte may differ from ~0. Use that expanded
20374 // constant so isel sees a mask shape that fits logical-immediate patterns.
20375 if ((~ExpandedImm) < 256) {
20376 ++NumOptimizedImms;
20377 return UseImm(ExpandedImm);
20378 }
20379
20380 // FIXME: The check for v6 is because this interferes with some ubfx
20381 // optimizations.
20382 if (Opc == ISD::AND && isLegalLogicalImmediate(~ExpandedImm, Subtarget) &&
20383 !Subtarget->hasV6Ops()) {
20384 ++NumOptimizedImms;
20385 return UseImm(ExpandedImm);
20386 }
20387
20388 // Potential improvements:
20389 //
20390 // We could try to recognize lsls+lsrs or lsrs+lsls pairs here.
20391 // We could try to prefer Thumb1 immediates which can be lowered to a
20392 // two-instruction sequence.
20393
20394 return false;
20395}
20396
// NOTE(review): listing line 20397 is absent from this extract; it presumably
// held the return type and qualified function name. Confirm against the
// original file.
//
// Entry point for shrinking the constant operand of a scalar 32-bit
// AND/OR/XOR based on the demanded bits; the actual choice of constant is
// delegated to optimizeLogicalImm.
20398 SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
20399 TargetLoweringOpt &TLO) const {
20400 // Delay this optimization to as late as possible.
20401 if (!TLO.LegalOps)
20402 return false;
20403
20404 EVT VT = Op.getValueType();
20405
20406 // Ignore vectors.
20407 if (VT.isVector())
20408 return false;
20409
20410 unsigned Size = VT.getSizeInBits();
20411
// Only 32-bit values are handled.
20412 if (Size != 32)
20413 return false;
20414
20415 // Exit early if we demand all bits.
20416 if (DemandedBits.isAllOnes())
20417 return false;
20418
// Restrict to the three bitwise opcodes; anything else is rejected.
20419 switch (Op.getOpcode()) {
20420 default:
20421 return false;
20422 case ISD::AND:
20423 case ISD::OR:
20424 case ISD::XOR:
20425 break;
20426 }
// The second operand must be a constant for there to be anything to shrink.
20427 ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
20428 if (!C)
20429 return false;
20430 unsigned Imm = C->getZExtValue();
20431 return optimizeLogicalImm(Op, Imm, DemandedBits, Subtarget, TLO);
20432}
20433
// NOTE(review): listing line 20434 is absent from this extract; it presumably
// held the return type and qualified function name. Confirm against the
// original file.
//
// Target-specific simplifications driven by the demanded bits for a few
// ARMISD opcodes.
20435 SDValue Op, const APInt &OriginalDemandedBits,
20436 const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
20437 unsigned Depth) const {
20438 unsigned Opc = Op.getOpcode();
20439
20440 switch (Opc) {
20441 case ARMISD::ASRL:
20442 case ARMISD::LSRL: {
20443 // If this is result 0 and the other result is unused, see if the demand
20444 // bits allow us to shrink this long shift into a standard small shift in
20445 // the opposite direction.
20446 if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
20447 isa<ConstantSDNode>(Op->getOperand(2))) {
20448 unsigned ShAmt = Op->getConstantOperandVal(2);
// The rewrite applies only when every demanded bit lies in the top ShAmt bits
// of the result; the node is then replaced by operand 1 shifted left by
// (32 - ShAmt).
20449 if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(APInt::getAllOnes(32)
20450 << (32 - ShAmt)))
20451 return TLO.CombineTo(
20452 Op, TLO.DAG.getNode(
20453 ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
20454 TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
20455 }
20456 break;
20457 }
20458 case ARMISD::VBICIMM: {
20459 SDValue Op0 = Op.getOperand(0);
20460 unsigned ModImm = Op.getConstantOperandVal(1);
20461 unsigned EltBits = 0;
20462 uint64_t Mask = ARM_AM::decodeVMOVModImm(ModImm, EltBits);
// If none of the decoded mask bits are demanded, the node is replaced by its
// first operand.
20463 if ((OriginalDemandedBits & Mask) == 0)
20464 return TLO.CombineTo(Op, Op0);
20465 }
20466 }
20467
// NOTE(review): listing line 20468 is absent; it presumably began the return
// statement that forwards the arguments below to a fallback implementation
// (likely the base class) -- confirm against the original file.
20469 Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
20470}
20471
20472//===----------------------------------------------------------------------===//
20473// ARM Inline Assembly Support
20474//===----------------------------------------------------------------------===//
20475
20476const char *ARMTargetLowering::LowerXConstraint(EVT ConstraintVT) const {
20477 // At this point, we have to lower this constraint to something else, so we
20478 // lower it to an "r" or "w". However, by doing this we will force the result
20479 // to be in register, while the X constraint is much more permissive.
20480 //
20481 // Although we are correct (we are free to emit anything, without
20482 // constraints), we might break use cases that would expect us to be more
20483 // efficient and emit something else.
20484 if (!Subtarget->hasVFP2Base())
20485 return "r";
20486 if (ConstraintVT.isFloatingPoint())
20487 return "w";
20488 if (ConstraintVT.isVector() && Subtarget->hasNEON() &&
20489 (ConstraintVT.getSizeInBits() == 64 ||
20490 ConstraintVT.getSizeInBits() == 128))
20491 return "w";
20492
20493 return "r";
20494}
20495
20496/// getConstraintType - Given a constraint letter, return the type of
20497/// constraint it is for this target.
// NOTE(review): listing lines 20498-20499 are absent from this extract; they
// presumably held the function signature declaring `Constraint` and opened
// the body. Confirm against the original file.
//
// One- and two-character constraints recognised below are classified here;
// everything else is forwarded to TargetLowering::getConstraintType.
20500 unsigned S = Constraint.size();
20501 if (S == 1) {
20502 switch (Constraint[0]) {
20503 default: break;
20504 case 'l': return C_RegisterClass;
20505 case 'w': return C_RegisterClass;
20506 case 'h': return C_RegisterClass;
20507 case 'x': return C_RegisterClass;
20508 case 't': return C_RegisterClass;
20509 case 'j': return C_Immediate; // Constant for movw.
20510 // An address with a single base register. Due to the way we
20511 // currently handle addresses it is the same as an 'r' memory constraint.
20512 case 'Q': return C_Memory;
20513 }
20514 } else if (S == 2) {
// For two-character constraints only the first character is inspected.
20515 switch (Constraint[0]) {
20516 default: break;
20517 case 'T': return C_RegisterClass;
20518 // All 'U+' constraints are addresses.
20519 case 'U': return C_Memory;
20520 }
20521 }
20522 return TargetLowering::getConstraintType(Constraint);
20523}
20524
20525/// Examine constraint type and operand type and determine a weight value.
20526/// This object must already have been set up with the operand type
20527/// and the current alternative constraint selected.
// NOTE(review): listing lines 20528-20529 are absent from this extract; they
// presumably held the return type and qualified function name. Confirm
// against the original file.
20530 AsmOperandInfo &info, const char *constraint) const {
// NOTE(review): listing line 20531 is absent; it presumably declared and
// initialised the local `weight` that is assigned and returned below.
20532 Value *CallOperandVal = info.CallOperandVal;
20533 // If we don't have a value, we can't do a match,
20534 // but allow it at the lowest weight.
20535 if (!CallOperandVal)
20536 return CW_Default;
20537 Type *type = CallOperandVal->getType();
20538 // Look at the constraint type.
// Only the first character of the constraint string is examined.
20539 switch (*constraint) {
20540 default:
// NOTE(review): listing line 20541 is absent; it presumably assigned `weight`
// from a generic fallback for letters not handled here.
20542 break;
20543 case 'l':
// 'l' on an integer operand: a higher-priority weight is chosen in Thumb mode
// than otherwise. Non-integer operands keep the initial weight.
20544 if (type->isIntegerTy()) {
20545 if (Subtarget->isThumb())
20546 weight = CW_SpecificReg;
20547 else
20548 weight = CW_Register;
20549 }
20550 break;
20551 case 'w':
// 'w' only raises the weight for floating-point operands.
20552 if (type->isFloatingPointTy())
20553 weight = CW_Register;
20554 break;
20555 }
20556 return weight;
20557}
20558
20559static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT) {
20560 if (PR == 0 || VT == MVT::Other)
20561 return false;
20562 if (ARM::SPRRegClass.contains(PR))
20563 return VT != MVT::f32 && VT != MVT::f16 && VT != MVT::i32;
20564 if (ARM::DPRRegClass.contains(PR))
20565 return VT != MVT::f64 && !VT.is64BitVector();
20566 return false;
20567}
20568
// Pair of (register number, register class) used as the result type when
// resolving inline-asm register constraints.
20569using RCPair = std::pair<unsigned, const TargetRegisterClass *>;
20570
// NOTE(review): listing line 20571 is absent from this extract; it presumably
// held the return type and qualified function name. Confirm against the
// original file.
//
// Maps an inline-asm register constraint to a (register, register class)
// pair. Constraint letters handled here return register 0 together with a
// class; anything unmatched falls through to the named-register checks and
// then to the generic TargetLowering implementation.
20572 const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const {
20573 switch (Constraint.size()) {
20574 case 1:
20575 // GCC ARM Constraint Letters
20576 switch (Constraint[0]) {
20577 case 'l': // Low regs or general regs.
20578 if (Subtarget->isThumb())
20579 return RCPair(0U, &ARM::tGPRRegClass);
20580 return RCPair(0U, &ARM::GPRRegClass);
20581 case 'h': // High regs or no regs.
20582 if (Subtarget->isThumb())
20583 return RCPair(0U, &ARM::hGPRRegClass);
20584 break;
20585 case 'r':
20586 if (Subtarget->isThumb1Only())
20587 return RCPair(0U, &ARM::tGPRRegClass);
20588 return RCPair(0U, &ARM::GPRRegClass);
// 'w', 'x' and 't' all pick a register class by value type: one class for the
// listed 16/32-bit scalar types, one for 64-bit types and one for 128-bit
// types. They differ only in which register classes they name.
20589 case 'w':
20590 if (VT == MVT::Other)
20591 break;
20592 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20593 return RCPair(0U, &ARM::SPRRegClass);
20594 if (VT.getSizeInBits() == 64)
20595 return RCPair(0U, &ARM::DPRRegClass);
20596 if (VT.getSizeInBits() == 128)
20597 return RCPair(0U, &ARM::QPRRegClass);
20598 break;
20599 case 'x':
20600 if (VT == MVT::Other)
20601 break;
20602 if (VT == MVT::f32 || VT == MVT::f16 || VT == MVT::bf16)
20603 return RCPair(0U, &ARM::SPR_8RegClass);
20604 if (VT.getSizeInBits() == 64)
20605 return RCPair(0U, &ARM::DPR_8RegClass);
20606 if (VT.getSizeInBits() == 128)
20607 return RCPair(0U, &ARM::QPR_8RegClass);
20608 break;
20609 case 't':
20610 if (VT == MVT::Other)
20611 break;
// Unlike 'w' and 'x', 't' also accepts i32 in the SPR class.
20612 if (VT == MVT::f32 || VT == MVT::i32 || VT == MVT::f16 || VT == MVT::bf16)
20613 return RCPair(0U, &ARM::SPRRegClass);
20614 if (VT.getSizeInBits() == 64)
20615 return RCPair(0U, &ARM::DPR_VFP2RegClass);
20616 if (VT.getSizeInBits() == 128)
20617 return RCPair(0U, &ARM::QPR_VFP2RegClass);
20618 break;
20619 }
20620 break;
20621
20622 case 2:
// Two-character constraints: "Te" and "To" select the even and odd tGPR
// register classes.
20623 if (Constraint[0] == 'T') {
20624 switch (Constraint[1]) {
20625 default:
20626 break;
20627 case 'e':
20628 return RCPair(0U, &ARM::tGPREvenRegClass);
20629 case 'o':
20630 return RCPair(0U, &ARM::tGPROddRegClass);
20631 }
20632 }
20633 break;
20634
20635 default:
20636 break;
20637 }
20638
// Explicit register names handled specially, compared case-insensitively.
20639 if (StringRef("{cc}").equals_insensitive(Constraint))
20640 return std::make_pair(unsigned(ARM::CPSR), &ARM::CCRRegClass);
20641
20642 // r14 is an alias of lr.
20643 if (StringRef("{r14}").equals_insensitive(Constraint))
20644 return std::make_pair(unsigned(ARM::LR), getRegClassFor(MVT::i32));
20645
// Generic lookup; the result is discarded if the chosen physical register
// cannot hold a value of type VT.
20646 auto RCP = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
20647 if (isIncompatibleReg(RCP.first, VT))
20648 return {0, nullptr};
20649 return RCP;
20650}
20651
20652/// LowerAsmOperandForConstraint - Lower the specified operand into the Ops
20653/// vector. If it is invalid, don't add anything to Ops.
// NOTE(review): listing line 20654 is absent from this extract; it presumably
// held the return type, qualified function name and the `Op` parameter.
// Confirm against the original file.
//
// For the immediate constraint letters handled below, the constant is
// range-checked against the rules for the current subtarget. A valid value is
// appended to Ops as a target constant; an invalid one returns without
// touching Ops. Other constraints go to the TargetLowering implementation.
20655 StringRef Constraint,
20656 std::vector<SDValue> &Ops,
20657 SelectionDAG &DAG) const {
20658 SDValue Result;
20659
20660 // Currently only support length 1 constraints.
20661 if (Constraint.size() != 1)
20662 return;
20663
20664 char ConstraintLetter = Constraint[0];
20665 switch (ConstraintLetter) {
20666 default: break;
20667 case 'j':
20668 case 'I': case 'J': case 'K': case 'L':
20669 case 'M': case 'N': case 'O':
// NOTE(review): listing line 20670 is absent; it presumably introduced `C` as
// the operand viewed as a constant node (null when Op is not a constant).
20671 if (!C)
20672 return;
20673
20674 int64_t CVal64 = C->getSExtValue();
20675 int CVal = (int) CVal64;
20676 // None of these constraints allow values larger than 32 bits. Check
20677 // that the value fits in an int.
20678 if (CVal != CVal64)
20679 return;
20680
// In the inner switch, `break` means the value was accepted and `return`
// means it was rejected.
20681 switch (ConstraintLetter) {
20682 case 'j':
20683 // Constant suitable for movw, must be between 0 and
20684 // 65535.
20685 if (Subtarget->hasV6T2Ops() || (Subtarget->hasV8MBaselineOps()))
20686 if (CVal >= 0 && CVal <= 65535)
20687 break;
20688 return;
20689 case 'I':
20690 if (Subtarget->isThumb1Only()) {
20691 // This must be a constant between 0 and 255, for ADD
20692 // immediates.
20693 if (CVal >= 0 && CVal <= 255)
20694 break;
20695 } else if (Subtarget->isThumb2()) {
20696 // A constant that can be used as an immediate value in a
20697 // data-processing instruction.
20698 if (ARM_AM::getT2SOImmVal(CVal) != -1)
20699 break;
20700 } else {
20701 // A constant that can be used as an immediate value in a
20702 // data-processing instruction.
20703 if (ARM_AM::getSOImmVal(CVal) != -1)
20704 break;
20705 }
20706 return;
20707
20708 case 'J':
20709 if (Subtarget->isThumb1Only()) {
20710 // This must be a constant between -255 and -1, for negated ADD
20711 // immediates. This can be used in GCC with an "n" modifier that
20712 // prints the negated value, for use with SUB instructions. It is
20713 // not useful otherwise but is implemented for compatibility.
20714 if (CVal >= -255 && CVal <= -1)
20715 break;
20716 } else {
20717 // This must be a constant between -4095 and 4095. This is suitable
20718 // for use as the immediate offset field in LDR and STR instructions
20719 // such as LDR r0,[r1,#offset].
20720 if (CVal >= -4095 && CVal <= 4095)
20721 break;
20722 }
20723 return;
20724
20725 case 'K':
20726 if (Subtarget->isThumb1Only()) {
20727 // A 32-bit value where only one byte has a nonzero value. Exclude
20728 // zero to match GCC. This constraint is used by GCC internally for
20729 // constants that can be loaded with a move/shift combination.
20730 // It is not useful otherwise but is implemented for compatibility.
20731 if (CVal != 0 && ARM_AM::isThumbImmShiftedVal(CVal))
20732 break;
20733 } else if (Subtarget->isThumb2()) {
20734 // A constant whose bitwise inverse can be used as an immediate
20735 // value in a data-processing instruction. This can be used in GCC
20736 // with a "B" modifier that prints the inverted value, for use with
20737 // BIC and MVN instructions. It is not useful otherwise but is
20738 // implemented for compatibility.
20739 if (ARM_AM::getT2SOImmVal(~CVal) != -1)
20740 break;
20741 } else {
20742 // A constant whose bitwise inverse can be used as an immediate
20743 // value in a data-processing instruction. This can be used in GCC
20744 // with a "B" modifier that prints the inverted value, for use with
20745 // BIC and MVN instructions. It is not useful otherwise but is
20746 // implemented for compatibility.
20747 if (ARM_AM::getSOImmVal(~CVal) != -1)
20748 break;
20749 }
20750 return;
20751
20752 case 'L':
20753 if (Subtarget->isThumb1Only()) {
20754 // This must be a constant between -7 and 7,
20755 // for 3-operand ADD/SUB immediate instructions.
// NOTE(review): the comment above states the range as -7..7, but the test
// below uses `CVal < 7` and therefore rejects 7. Confirm which bound is
// intended (GCC's ARM constraint definitions are the reference to check).
20756 if (CVal >= -7 && CVal < 7)
20757 break;
20758 } else if (Subtarget->isThumb2()) {
20759 // A constant whose negation can be used as an immediate value in a
20760 // data-processing instruction. This can be used in GCC with an "n"
20761 // modifier that prints the negated value, for use with SUB
20762 // instructions. It is not useful otherwise but is implemented for
20763 // compatibility.
20764 if (ARM_AM::getT2SOImmVal(-CVal) != -1)
20765 break;
20766 } else {
20767 // A constant whose negation can be used as an immediate value in a
20768 // data-processing instruction. This can be used in GCC with an "n"
20769 // modifier that prints the negated value, for use with SUB
20770 // instructions. It is not useful otherwise but is implemented for
20771 // compatibility.
20772 if (ARM_AM::getSOImmVal(-CVal) != -1)
20773 break;
20774 }
20775 return;
20776
20777 case 'M':
20778 if (Subtarget->isThumb1Only()) {
20779 // This must be a multiple of 4 between 0 and 1020, for
20780 // ADD sp + immediate.
20781 if ((CVal >= 0 && CVal <= 1020) && ((CVal & 3) == 0))
20782 break;
20783 } else {
20784 // A power of two or a constant between 0 and 32. This is used in
20785 // GCC for the shift amount on shifted register operands, but it is
20786 // useful in general for any shift amounts.
20787 if ((CVal >= 0 && CVal <= 32) || ((CVal & (CVal - 1)) == 0))
20788 break;
20789 }
20790 return;
20791
20792 case 'N':
// 'N' is only accepted on Thumb1-only subtargets; otherwise it is rejected.
20793 if (Subtarget->isThumb1Only()) {
20794 // This must be a constant between 0 and 31, for shift amounts.
20795 if (CVal >= 0 && CVal <= 31)
20796 break;
20797 }
20798 return;
20799
20800 case 'O':
// 'O' is likewise only accepted on Thumb1-only subtargets.
20801 if (Subtarget->isThumb1Only()) {
20802 // This must be a multiple of 4 between -508 and 508, for
20803 // ADD/SUB sp = sp + immediate.
20804 if ((CVal >= -508 && CVal <= 508) && ((CVal & 3) == 0))
20805 break;
20806 }
20807 return;
20808 }
// The value passed its check: materialise it as a signed target constant of
// the operand's type.
20809 Result = DAG.getSignedTargetConstant(CVal, SDLoc(Op), Op.getValueType());
20810 break;
20811 }
20812
20813 if (Result.getNode()) {
20814 Ops.push_back(Result);
20815 return;
20816 }
20817 return TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
20818}
20819
20820static RTLIB::Libcall getDivRemLibcall(
20821 const SDNode *N, MVT::SimpleValueType SVT) {
20822 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20823 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20824 "Unhandled Opcode in getDivRemLibcall");
20825 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20826 N->getOpcode() == ISD::SREM;
20827 RTLIB::Libcall LC;
20828 switch (SVT) {
20829 default: llvm_unreachable("Unexpected request for libcall!");
20830 case MVT::i8: LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
20831 case MVT::i16: LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
20832 case MVT::i32: LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
20833 case MVT::i64: LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
20834 }
20835 return LC;
20836}
20837
20839 const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget) {
20840 assert((N->getOpcode() == ISD::SDIVREM || N->getOpcode() == ISD::UDIVREM ||
20841 N->getOpcode() == ISD::SREM || N->getOpcode() == ISD::UREM) &&
20842 "Unhandled Opcode in getDivRemArgList");
20843 bool isSigned = N->getOpcode() == ISD::SDIVREM ||
20844 N->getOpcode() == ISD::SREM;
20846 for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
20847 EVT ArgVT = N->getOperand(i).getValueType();
20848 Type *ArgTy = ArgVT.getTypeForEVT(*Context);
20849 TargetLowering::ArgListEntry Entry(N->getOperand(i), ArgTy);
20850 Entry.IsSExt = isSigned;
20851 Entry.IsZExt = !isSigned;
20852 Args.push_back(Entry);
20853 }
20854 if (Subtarget->getTargetTriple().isOSWindows() && Args.size() >= 2)
20855 std::swap(Args[0], Args[1]);
20856 return Args;
20857}
20858
// Custom lowering for a combined divide/remainder node (ISD::SDIVREM or
// ISD::UDIVREM, asserted below). Three strategies are tried in order:
//   1. i64 with a constant divisor: expandDIVREMByConstant on i32 parts, with
//      both results rebuilt via BUILD_PAIR and returned as MERGE_VALUES.
//   2. i32 on a subtarget with a hardware divide for the current mode:
//      DIV + MUL + SUB, returned as MERGE_VALUES {Div, Rem}.
//   3. Otherwise: a call to the divrem libcall chosen by getDivRemLibcall;
//      the first element of LowerCallTo's result pair is returned.
// NOTE(review): this listing has gaps (source lines 20872, 20911 and
// 20929-20930 are absent), so `Result` and `Args` are used without a visible
// declaration and the CLI call chain has no visible terminator. Confirm
// against the upstream file before relying on this text.
20859 SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
20860 assert((Subtarget->isTargetAEABI() || Subtarget->isTargetAndroid() ||
20861 Subtarget->isTargetGNUAEABI() || Subtarget->isTargetMuslAEABI() ||
20862 Subtarget->isTargetFuchsia() || Subtarget->isTargetWindows()) &&
20863 "Register-based DivRem lowering only");
20864 unsigned Opcode = Op->getOpcode();
20865 assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
20866 "Invalid opcode for Div/Rem lowering");
20867 bool isSigned = (Opcode == ISD::SDIVREM);
20868 EVT VT = Op->getValueType(0);
20869 SDLoc dl(Op);
20870
// Strategy 1: 64-bit division by a constant. On success `Result` holds four
// i32 parts; parts [0],[1] form the first result value and [2],[3] the second.
20871 if (VT == MVT::i64 && isa<ConstantSDNode>(Op.getOperand(1))) {
20873 if (expandDIVREMByConstant(Op.getNode(), Result, MVT::i32, DAG)) {
20874 SDValue Res0 =
20875 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[0], Result[1]);
20876 SDValue Res1 =
20877 DAG.getNode(ISD::BUILD_PAIR, dl, VT, Result[2], Result[3]);
20878 return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(),
20879 {Res0, Res1});
20880 }
20881 }
20882
20883 Type *Ty = VT.getTypeForEVT(*DAG.getContext());
20884
20885 // If the target has hardware divide, use divide + multiply + subtract:
20886 // div = a / b
20887 // rem = a - b * div
20888 // return {div, rem}
20889 // This should be lowered into UDIV/SDIV + MLS later on.
// The hardware-divide query depends on whether we are in Thumb or ARM mode.
20890 bool hasDivide = Subtarget->isThumb() ? Subtarget->hasDivideInThumbMode()
20891 : Subtarget->hasDivideInARMMode();
20892 if (hasDivide && Op->getValueType(0).isSimple() &&
20893 Op->getSimpleValueType(0) == MVT::i32) {
20894 unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
20895 const SDValue Dividend = Op->getOperand(0);
20896 const SDValue Divisor = Op->getOperand(1);
20897 SDValue Div = DAG.getNode(DivOpcode, dl, VT, Dividend, Divisor);
20898 SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Div, Divisor);
20899 SDValue Rem = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
20900
20901 SDValue Values[2] = {Div, Rem};
20902 return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(VT, VT), Values);
20903 }
20904
// Strategy 3: fall back to the runtime library helper for this type.
20905 RTLIB::Libcall LC = getDivRemLibcall(Op.getNode(),
20906 VT.getSimpleVT().SimpleTy);
20907 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
20908
20909 SDValue InChain = DAG.getEntryNode();
20910
// NOTE(review): the start of this statement (source line 20911) is missing;
// the two lines below are trailing arguments of a call that presumably
// initialises `Args` -- confirm.
20912 DAG.getContext(),
20913 Subtarget);
20914
20915 SDValue Callee =
20916 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20917
// The helper's return type is modelled as a struct of two values of type Ty.
20918 Type *RetTy = StructType::get(Ty, Ty);
20919
// On Windows the chain is first routed through WinDBZCheckDenominator
// (presumably a divide-by-zero check on the denominator -- confirm).
20920 if (getTM().getTargetTriple().isOSWindows())
20921 InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain);
20922
20923 TargetLowering::CallLoweringInfo CLI(DAG);
20924 CLI.setDebugLoc(dl)
20925 .setChain(InChain)
20926 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20927 Callee, std::move(Args))
20928 .setInRegister()
// NOTE(review): source lines 20929-20930 (the rest of this call chain and its
// terminating ';') are missing from this listing.
20931
20932 std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
20933 return CallInfo.first;
20934}
20935
20936 // Lowers REM using divmod helpers
20937 // see RTABI section 4.2/4.3
// For i64 with a constant divisor, expandDIVREMByConstant is tried first and
// its two i32 parts are joined with BUILD_PAIR. Otherwise the node is lowered
// to a call to the divrem libcall chosen by getDivRemLibcall, and operand 1 of
// the call's result node is returned as the remainder.
// NOTE(review): this listing has gaps (source lines 20942, 20969 and
// 20984-20985 are absent), so `Result` and `Args` are used without a visible
// declaration and `isSigned` has no visible use. Confirm against upstream.
20938 SDValue ARMTargetLowering::LowerREM(SDNode *N, SelectionDAG &DAG) const {
20939 EVT VT = N->getValueType(0);
20940
20941 if (VT == MVT::i64 && isa<ConstantSDNode>(N->getOperand(1))) {
20943 if (expandDIVREMByConstant(N, Result, MVT::i32, DAG))
20944 return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), N->getValueType(0),
20945 Result[0], Result[1]);
20946 }
20947
20948 // Build return types (div and rem)
20949 std::vector<Type*> RetTyParams;
20950 Type *RetTyElement;
20951
// Map the simple value type to the matching IR integer type; anything other
// than i8/i16/i32/i64 is unreachable.
20952 switch (VT.getSimpleVT().SimpleTy) {
20953 default: llvm_unreachable("Unexpected request for libcall!");
20954 case MVT::i8: RetTyElement = Type::getInt8Ty(*DAG.getContext()); break;
20955 case MVT::i16: RetTyElement = Type::getInt16Ty(*DAG.getContext()); break;
20956 case MVT::i32: RetTyElement = Type::getInt32Ty(*DAG.getContext()); break;
20957 case MVT::i64: RetTyElement = Type::getInt64Ty(*DAG.getContext()); break;
20958 }
20959
// The helper returns two values of the same type, modelled as a struct type.
20960 RetTyParams.push_back(RetTyElement);
20961 RetTyParams.push_back(RetTyElement);
20962 ArrayRef<Type*> ret = ArrayRef<Type*>(RetTyParams);
20963 Type *RetTy = StructType::get(*DAG.getContext(), ret);
20964
20965 RTLIB::Libcall LC = getDivRemLibcall(N, N->getValueType(0).getSimpleVT().
20966 SimpleTy);
20967 RTLIB::LibcallImpl LCImpl = DAG.getLibcalls().getLibcallImpl(LC);
20968 SDValue InChain = DAG.getEntryNode();
// NOTE(review): the start of this statement (source line 20969) is missing;
// the line below is the trailing argument of a call that presumably
// initialises `Args` -- confirm.
20970 Subtarget);
20971 bool isSigned = N->getOpcode() == ISD::SREM;
20972
20973 SDValue Callee =
20974 DAG.getExternalSymbol(LCImpl, getPointerTy(DAG.getDataLayout()));
20975
// On Windows the chain is first routed through WinDBZCheckDenominator
// (presumably a divide-by-zero check on the denominator -- confirm).
20976 if (getTM().getTargetTriple().isOSWindows())
20977 InChain = WinDBZCheckDenominator(DAG, N, InChain);
20978
20979 // Lower call
20980 CallLoweringInfo CLI(DAG);
20981 CLI.setChain(InChain)
20982 .setCallee(DAG.getLibcalls().getLibcallImplCallingConv(LCImpl), RetTy,
20983 Callee, std::move(Args))
// NOTE(review): source lines 20984-20985 of this call chain are missing from
// this listing.
20986 .setDebugLoc(SDLoc(N));
20987 std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
20988
20989 // Return second (rem) result operand (first contains div)
20990 SDNode *ResNode = CallResult.first.getNode();
20991 assert(ResNode->getNumOperands() == 2 && "divmod should return two operands");
20992 return ResNode->getOperand(1);
20993}
20994
// Custom lowering for a dynamic stack allocation; asserted to be reached only
// when the target triple's OS is Windows. Operand 0 of Op is the incoming
// chain, operand 1 the allocation size and operand 2 a constant alignment.
// Returns the merged pair {new stack pointer, chain}.
// NOTE(review): source line 21005, which opens the `if` condition ending in
// "no-stack-arg-probe", is missing from this listing.
20995 SDValue
20996 ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
20997 assert(getTM().getTargetTriple().isOSWindows() &&
20998 "unsupported target platform");
20999 SDLoc DL(Op);
21000
21001 // Get the inputs.
21002 SDValue Chain = Op.getOperand(0);
21003 SDValue Size = Op.getOperand(1);
21004
// First path (guarded by the partially visible "no-stack-arg-probe"
// condition): adjust SP directly. Subtract Size from SP, optionally mask the
// result with -alignment, then write it back to SP.
21006 "no-stack-arg-probe")) {
21007 MaybeAlign Align =
21008 cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
21009 SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
21010 Chain = SP.getValue(1);
21011 SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
21012 if (Align)
21013 SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
21014 DAG.getSignedConstant(-Align->value(), DL, MVT::i32));
21015 Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
21016 SDValue Ops[2] = { SP, Chain };
21017 return DAG.getMergeValues(Ops, DL);
21018 }
21019
// Second path: go through the WIN__CHKSTK node. The size is shifted right by
// two (bytes to 4-byte words) and passed in R4.
21020 SDValue Words = DAG.getNode(ISD::SRL, DL, MVT::i32, Size,
21021 DAG.getConstant(2, DL, MVT::i32));
21022
21023 SDValue Glue;
21024 Chain = DAG.getCopyToReg(Chain, DL, ARM::R4, Words, Glue);
21025 Glue = Chain.getValue(1);
21026
21027 SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
21028 Chain = DAG.getNode(ARMISD::WIN__CHKSTK, DL, NodeTys, Chain, Glue);
21029
// Read SP back after the WIN__CHKSTK node; that value is the allocation result.
21030 SDValue NewSP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
21031 Chain = NewSP.getValue(1);
21032
21033 SDValue Ops[2] = { NewSP, Chain };
21034 return DAG.getMergeValues(Ops, DL);
21035}
21036
// Custom lowering for FP_EXTEND and its strict variant. For a strict node,
// operand 0 is the chain and operand 1 the source value; otherwise operand 0
// is the source. A 32 -> 64 extension with FP64 available is kept as a node.
// Everything else is widened one step at a time (16 -> 32, then 32 -> 64),
// each step using a node when the subtarget supports it and a libcall
// otherwise. Strict nodes return the merged pair {value, chain}.
// NOTE(review): source line 21056 is missing from this listing; `Result` in
// the strict 32 -> 64 path is used without a visible declaration.
21037 SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
21038 bool IsStrict = Op->isStrictFPOpcode();
21039 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
21040 const unsigned DstSz = Op.getValueType().getSizeInBits();
21041 const unsigned SrcSz = SrcVal.getValueType().getSizeInBits();
21042 assert(DstSz > SrcSz && DstSz <= 64 && SrcSz >= 16 &&
21043 "Unexpected type for custom-lowering FP_EXTEND");
21044
21045 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
21046 "With both FP DP and 16, any FP conversion is legal!");
21047
21048 assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
21049 "With FP16, 16 to 32 conversion is legal!");
21050
21051 // Converting from 32 -> 64 is valid if we have FP64.
21052 if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
21053 // FIXME: Remove this when we have strict fp instruction selection patterns
21054 if (IsStrict) {
21055 SDLoc Loc(Op);
21057 Loc, Op.getValueType(), SrcVal);
21058 return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
21059 }
21060 return Op;
21061 }
21062
21063 // Either we are converting from 16 -> 64, without FP16 and/or
21064 // FP.double-precision or without Armv8-fp. So we must do it in two
21065 // steps.
21066 // Or we are converting from 32 -> 64 without fp.double-precision or 16 -> 32
21067 // without FP16. So we must do a function call.
21068 SDLoc Loc(Op);
21069 RTLIB::Libcall LC;
21070 MakeLibCallOptions CallOptions;
21071 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
// Each iteration doubles the width: Sz == 16 performs f16 -> f32 (gated on
// hasFP16), Sz == 32 performs f32 -> f64 (gated on hasFP64). SrcVal and Chain
// are threaded through the steps.
21072 for (unsigned Sz = SrcSz; Sz <= 32 && Sz < DstSz; Sz *= 2) {
21073 bool Supported = (Sz == 16 ? Subtarget->hasFP16() : Subtarget->hasFP64());
21074 MVT SrcVT = (Sz == 16 ? MVT::f16 : MVT::f32);
21075 MVT DstVT = (Sz == 16 ? MVT::f32 : MVT::f64);
21076 if (Supported) {
21077 if (IsStrict) {
21078 SrcVal = DAG.getNode(ISD::STRICT_FP_EXTEND, Loc,
21079 {DstVT, MVT::Other}, {Chain, SrcVal});
21080 Chain = SrcVal.getValue(1);
21081 } else {
21082 SrcVal = DAG.getNode(ISD::FP_EXTEND, Loc, DstVT, SrcVal);
21083 }
21084 } else {
// No instruction for this step: call the extension routine from RTLIB::getFPEXT.
21085 LC = RTLIB::getFPEXT(SrcVT, DstVT);
21086 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
21087 "Unexpected type for custom-lowering FP_EXTEND");
21088 std::tie(SrcVal, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
21089 Loc, Chain);
21090 }
21091 }
21092
21093 return IsStrict ? DAG.getMergeValues({SrcVal, Chain}, Loc) : SrcVal;
21094}
21095
// Custom lowering for FP_ROUND and its strict variant. For a strict node,
// operand 0 is the chain and operand 1 the source value; otherwise operand 0
// is the source. A bf16 destination is handled first. A 32-bit source with
// FP16 available is returned unchanged. Every other case becomes a libcall
// from RTLIB::getFPROUND. Strict nodes return the merged pair {value, chain}.
// NOTE(review): source line 21130 is missing from this listing; `Result` is
// used without a visible declaration.
21096 SDValue ARMTargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
21097 bool IsStrict = Op->isStrictFPOpcode();
21098
21099 SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
21100 EVT SrcVT = SrcVal.getValueType();
21101 EVT DstVT = Op.getValueType();
21102
// bf16 destination: keep the node only for an f32 source with BF16 available;
// otherwise return an empty SDValue.
21103 if (DstVT == MVT::bf16) {
21104 if (Subtarget->hasBF16() && SrcVT == MVT::f32)
21105 return Op;
21106 return SDValue();
21107 }
21108
21109 const unsigned DstSz = Op.getValueType().getSizeInBits();
21110 const unsigned SrcSz = SrcVT.getSizeInBits();
// DstSz is only read by the assert below; the void cast keeps it referenced
// when asserts are compiled out.
21111 (void)DstSz;
21112 assert(DstSz < SrcSz && SrcSz <= 64 && DstSz >= 16 &&
21113 "Unexpected type for custom-lowering FP_ROUND");
21114
21115 assert((!Subtarget->hasFP64() || !Subtarget->hasFPARMv8Base()) &&
21116 "With both FP DP and 16, any FP conversion is legal!");
21117
21118 SDLoc Loc(Op);
21119
21120 // Instruction from 32 -> 16 if hasFP16 is valid
21121 if (SrcSz == 32 && Subtarget->hasFP16())
21122 return Op;
21123
21124 // Lib call from 32 -> 16 / 64 -> [32, 16]
21125 RTLIB::Libcall LC = RTLIB::getFPROUND(SrcVT, DstVT);
21126 assert(LC != RTLIB::UNKNOWN_LIBCALL &&
21127 "Unexpected type for custom-lowering FP_ROUND");
21128 MakeLibCallOptions CallOptions;
21129 SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
21131 std::tie(Result, Chain) = makeLibCall(DAG, LC, DstVT, SrcVal, CallOptions,
21132 Loc, Chain);
21133 return IsStrict ? DAG.getMergeValues({Result, Chain}, Loc) : Result;
21134}
21135
21136bool
21138 // The ARM target isn't yet aware of offsets.
21139 return false;
21140}
21141
21143 if (v == 0xffffffff)
21144 return false;
21145
21146 // there can be 1's on either or both "outsides", all the "inside"
21147 // bits must be 0's
21148 return isShiftedMask_32(~v);
21149}
21150
21151/// isFPImmLegal - Returns true if the target can instruction select the
21152/// specified FP immediate natively. If false, the legalizer will
21153/// materialize the FP immediate as a load from a constant pool.
21155 bool ForCodeSize) const {
21156 if (!Subtarget->hasVFP3Base())
21157 return false;
21158 if (VT == MVT::f16 && Subtarget->hasFullFP16())
21159 return ARM_AM::getFP16Imm(Imm) != -1;
21160 if (VT == MVT::f32 && Subtarget->hasFullFP16() &&
21161 ARM_AM::getFP32FP16Imm(Imm) != -1)
21162 return true;
21163 if (VT == MVT::f32)
21164 return ARM_AM::getFP32Imm(Imm) != -1;
21165 if (VT == MVT::f64 && Subtarget->hasFP64())
21166 return ARM_AM::getFP64Imm(Imm) != -1;
21167 return false;
21168}
21169
21170/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
21171/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
21172/// specified in the intrinsic calls.
21175 MachineFunction &MF, unsigned Intrinsic) const {
21176 IntrinsicInfo Info;
21177 switch (Intrinsic) {
21178 case Intrinsic::arm_neon_vld1:
21179 case Intrinsic::arm_neon_vld2:
21180 case Intrinsic::arm_neon_vld3:
21181 case Intrinsic::arm_neon_vld4:
21182 case Intrinsic::arm_neon_vld2lane:
21183 case Intrinsic::arm_neon_vld3lane:
21184 case Intrinsic::arm_neon_vld4lane:
21185 case Intrinsic::arm_neon_vld2dup:
21186 case Intrinsic::arm_neon_vld3dup:
21187 case Intrinsic::arm_neon_vld4dup: {
21188 Info.opc = ISD::INTRINSIC_W_CHAIN;
21189 // Conservatively set memVT to the entire set of vectors loaded.
21190 auto &DL = I.getDataLayout();
21191 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
21192 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21193 Info.ptrVal = I.getArgOperand(0);
21194 Info.offset = 0;
21195 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
21196 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
21197 // volatile loads with NEON intrinsics not supported
21198 Info.flags = MachineMemOperand::MOLoad;
21199 Infos.push_back(Info);
21200 return;
21201 }
21202 case Intrinsic::arm_neon_vld1x2:
21203 case Intrinsic::arm_neon_vld1x3:
21204 case Intrinsic::arm_neon_vld1x4: {
21205 Info.opc = ISD::INTRINSIC_W_CHAIN;
21206 // Conservatively set memVT to the entire set of vectors loaded.
21207 auto &DL = I.getDataLayout();
21208 uint64_t NumElts = DL.getTypeSizeInBits(I.getType()) / 64;
21209 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21210 Info.ptrVal = I.getArgOperand(I.arg_size() - 1);
21211 Info.offset = 0;
21212 Info.align = I.getParamAlign(I.arg_size() - 1).valueOrOne();
21213 // volatile loads with NEON intrinsics not supported
21214 Info.flags = MachineMemOperand::MOLoad;
21215 Infos.push_back(Info);
21216 return;
21217 }
21218 case Intrinsic::arm_neon_vst1:
21219 case Intrinsic::arm_neon_vst2:
21220 case Intrinsic::arm_neon_vst3:
21221 case Intrinsic::arm_neon_vst4:
21222 case Intrinsic::arm_neon_vst2lane:
21223 case Intrinsic::arm_neon_vst3lane:
21224 case Intrinsic::arm_neon_vst4lane: {
21225 Info.opc = ISD::INTRINSIC_VOID;
21226 // Conservatively set memVT to the entire set of vectors stored.
21227 auto &DL = I.getDataLayout();
21228 unsigned NumElts = 0;
21229 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21230 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21231 if (!ArgTy->isVectorTy())
21232 break;
21233 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21234 }
21235 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21236 Info.ptrVal = I.getArgOperand(0);
21237 Info.offset = 0;
21238 Value *AlignArg = I.getArgOperand(I.arg_size() - 1);
21239 Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
21240 // volatile stores with NEON intrinsics not supported
21241 Info.flags = MachineMemOperand::MOStore;
21242 Infos.push_back(Info);
21243 return;
21244 }
21245 case Intrinsic::arm_neon_vst1x2:
21246 case Intrinsic::arm_neon_vst1x3:
21247 case Intrinsic::arm_neon_vst1x4: {
21248 Info.opc = ISD::INTRINSIC_VOID;
21249 // Conservatively set memVT to the entire set of vectors stored.
21250 auto &DL = I.getDataLayout();
21251 unsigned NumElts = 0;
21252 for (unsigned ArgI = 1, ArgE = I.arg_size(); ArgI < ArgE; ++ArgI) {
21253 Type *ArgTy = I.getArgOperand(ArgI)->getType();
21254 if (!ArgTy->isVectorTy())
21255 break;
21256 NumElts += DL.getTypeSizeInBits(ArgTy) / 64;
21257 }
21258 Info.memVT = EVT::getVectorVT(I.getType()->getContext(), MVT::i64, NumElts);
21259 Info.ptrVal = I.getArgOperand(0);
21260 Info.offset = 0;
21261 Info.align = I.getParamAlign(0).valueOrOne();
21262 // volatile stores with NEON intrinsics not supported
21263 Info.flags = MachineMemOperand::MOStore;
21264 Infos.push_back(Info);
21265 return;
21266 }
21267 case Intrinsic::arm_mve_vld2q:
21268 case Intrinsic::arm_mve_vld4q: {
21269 Info.opc = ISD::INTRINSIC_W_CHAIN;
21270 // Conservatively set memVT to the entire set of vectors loaded.
21271 Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
21272 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
21273 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21274 Info.ptrVal = I.getArgOperand(0);
21275 Info.offset = 0;
21276 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21277 // volatile loads with MVE intrinsics not supported
21278 Info.flags = MachineMemOperand::MOLoad;
21279 Infos.push_back(Info);
21280 return;
21281 }
21282 case Intrinsic::arm_mve_vst2q:
21283 case Intrinsic::arm_mve_vst4q: {
21284 Info.opc = ISD::INTRINSIC_VOID;
21285 // Conservatively set memVT to the entire set of vectors stored.
21286 Type *VecTy = I.getArgOperand(1)->getType();
21287 unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
21288 Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
21289 Info.ptrVal = I.getArgOperand(0);
21290 Info.offset = 0;
21291 Info.align = Align(VecTy->getScalarSizeInBits() / 8);
21292 // volatile stores with MVE intrinsics not supported
21293 Info.flags = MachineMemOperand::MOStore;
21294 Infos.push_back(Info);
21295 return;
21296 }
21297 case Intrinsic::arm_mve_vldr_gather_base:
21298 case Intrinsic::arm_mve_vldr_gather_base_predicated: {
21299 Info.opc = ISD::INTRINSIC_W_CHAIN;
21300 Info.ptrVal = nullptr;
21301 Info.memVT = MVT::getVT(I.getType());
21302 Info.align = Align(1);
21303 Info.flags |= MachineMemOperand::MOLoad;
21304 Infos.push_back(Info);
21305 return;
21306 }
21307 case Intrinsic::arm_mve_vldr_gather_base_wb:
21308 case Intrinsic::arm_mve_vldr_gather_base_wb_predicated: {
21309 Info.opc = ISD::INTRINSIC_W_CHAIN;
21310 Info.ptrVal = nullptr;
21311 Info.memVT = MVT::getVT(I.getType()->getContainedType(0));
21312 Info.align = Align(1);
21313 Info.flags |= MachineMemOperand::MOLoad;
21314 Infos.push_back(Info);
21315 return;
21316 }
21317 case Intrinsic::arm_mve_vldr_gather_offset:
21318 case Intrinsic::arm_mve_vldr_gather_offset_predicated: {
21319 Info.opc = ISD::INTRINSIC_W_CHAIN;
21320 Info.ptrVal = nullptr;
21321 MVT DataVT = MVT::getVT(I.getType());
21322 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
21323 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21324 DataVT.getVectorNumElements());
21325 Info.align = Align(1);
21326 Info.flags |= MachineMemOperand::MOLoad;
21327 Infos.push_back(Info);
21328 return;
21329 }
21330 case Intrinsic::arm_mve_vstr_scatter_base:
21331 case Intrinsic::arm_mve_vstr_scatter_base_predicated: {
21332 Info.opc = ISD::INTRINSIC_VOID;
21333 Info.ptrVal = nullptr;
21334 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21335 Info.align = Align(1);
21336 Info.flags |= MachineMemOperand::MOStore;
21337 Infos.push_back(Info);
21338 return;
21339 }
21340 case Intrinsic::arm_mve_vstr_scatter_base_wb:
21341 case Intrinsic::arm_mve_vstr_scatter_base_wb_predicated: {
21342 Info.opc = ISD::INTRINSIC_W_CHAIN;
21343 Info.ptrVal = nullptr;
21344 Info.memVT = MVT::getVT(I.getArgOperand(2)->getType());
21345 Info.align = Align(1);
21346 Info.flags |= MachineMemOperand::MOStore;
21347 Infos.push_back(Info);
21348 return;
21349 }
21350 case Intrinsic::arm_mve_vstr_scatter_offset:
21351 case Intrinsic::arm_mve_vstr_scatter_offset_predicated: {
21352 Info.opc = ISD::INTRINSIC_VOID;
21353 Info.ptrVal = nullptr;
21354 MVT DataVT = MVT::getVT(I.getArgOperand(2)->getType());
21355 unsigned MemSize = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
21356 Info.memVT = MVT::getVectorVT(MVT::getIntegerVT(MemSize),
21357 DataVT.getVectorNumElements());
21358 Info.align = Align(1);
21359 Info.flags |= MachineMemOperand::MOStore;
21360 Infos.push_back(Info);
21361 return;
21362 }
21363 case Intrinsic::arm_ldaex:
21364 case Intrinsic::arm_ldrex: {
21365 auto &DL = I.getDataLayout();
21366 Type *ValTy = I.getParamElementType(0);
21367 Info.opc = ISD::INTRINSIC_W_CHAIN;
21368 Info.memVT = MVT::getVT(ValTy);
21369 Info.ptrVal = I.getArgOperand(0);
21370 Info.offset = 0;
21371 Info.align = DL.getABITypeAlign(ValTy);
21373 Infos.push_back(Info);
21374 return;
21375 }
21376 case Intrinsic::arm_stlex:
21377 case Intrinsic::arm_strex: {
21378 auto &DL = I.getDataLayout();
21379 Type *ValTy = I.getParamElementType(1);
21380 Info.opc = ISD::INTRINSIC_W_CHAIN;
21381 Info.memVT = MVT::getVT(ValTy);
21382 Info.ptrVal = I.getArgOperand(1);
21383 Info.offset = 0;
21384 Info.align = DL.getABITypeAlign(ValTy);
21386 Infos.push_back(Info);
21387 return;
21388 }
21389 case Intrinsic::arm_stlexd:
21390 case Intrinsic::arm_strexd:
21391 Info.opc = ISD::INTRINSIC_W_CHAIN;
21392 Info.memVT = MVT::i64;
21393 Info.ptrVal = I.getArgOperand(2);
21394 Info.offset = 0;
21395 Info.align = Align(8);
21397 Infos.push_back(Info);
21398 return;
21399
21400 case Intrinsic::arm_ldaexd:
21401 case Intrinsic::arm_ldrexd:
21402 Info.opc = ISD::INTRINSIC_W_CHAIN;
21403 Info.memVT = MVT::i64;
21404 Info.ptrVal = I.getArgOperand(0);
21405 Info.offset = 0;
21406 Info.align = Align(8);
21408 Infos.push_back(Info);
21409 return;
21410
21411 default:
21412 break;
21413 }
21414}
21415
21416/// Returns true if it is beneficial to convert a load of a constant
21417/// to just the constant itself.
21419 Type *Ty) const {
21420 assert(Ty->isIntegerTy());
21421
21422 unsigned Bits = Ty->getPrimitiveSizeInBits();
21423 if (Bits == 0 || Bits > 32)
21424 return false;
21425 return true;
21426}
21427
21429 unsigned Index) const {
21431 return false;
21432
21433 return (Index == 0 || Index == ResVT.getVectorNumElements());
21434}
21435
21437 ARM_MB::MemBOpt Domain) const {
21438 // First, if the target has no DMB, see what fallback we can use.
21439 if (!Subtarget->hasDataBarrier()) {
21440 // Some ARMv6 cpus can support data barriers with an mcr instruction.
21441 // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
21442 // here.
21443 if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
21444 Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
21445 Builder.getInt32(0), Builder.getInt32(7),
21446 Builder.getInt32(10), Builder.getInt32(5)};
21447 return Builder.CreateIntrinsicWithoutFolding(Intrinsic::arm_mcr, args);
21448 }
21449 // Instead of using barriers, atomic accesses on these subtargets use
21450 // libcalls.
21451 llvm_unreachable("makeDMB on a target so old that it has no barriers");
21452 } else {
21453 // Only a full system barrier exists in the M-class architectures.
21454 Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
21455 Constant *CDomain = Builder.getInt32(Domain);
21456 return Builder.CreateIntrinsicWithoutFolding(Intrinsic::arm_dmb, CDomain);
21457 }
21458}
21459
21460// Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html
21462 Instruction *Inst,
21463 AtomicOrdering Ord) const {
21464 switch (Ord) {
21467 llvm_unreachable("Invalid fence: unordered/non-atomic");
21470 return nullptr; // Nothing to do
21472 if (!Inst->hasAtomicStore())
21473 return nullptr; // Nothing to do
21474 [[fallthrough]];
21477 if (Subtarget->preferISHSTBarriers())
21478 return makeDMB(Builder, ARM_MB::ISHST);
21479 // FIXME: add a comment with a link to documentation justifying this.
21480 else
21481 return makeDMB(Builder, ARM_MB::ISH);
21482 }
21483 llvm_unreachable("Unknown fence ordering in emitLeadingFence");
21484}
21485
21487 Instruction *Inst,
21488 AtomicOrdering Ord) const {
21489 switch (Ord) {
21492 llvm_unreachable("Invalid fence: unordered/not-atomic");
21495 return nullptr; // Nothing to do
21499 return makeDMB(Builder, ARM_MB::ISH);
21500 }
21501 llvm_unreachable("Unknown fence ordering in emitTrailingFence");
21502}
21503
21504// Loads and stores less than 64-bits are already atomic; ones above that
21505// are doomed anyway, so defer to the default libcall and blame the OS when
21506// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21507// anything for those.
21510 bool has64BitAtomicStore;
21511 if (Subtarget->isMClass())
21512 has64BitAtomicStore = false;
21513 else if (Subtarget->isThumb())
21514 has64BitAtomicStore = Subtarget->hasV7Ops();
21515 else
21516 has64BitAtomicStore = Subtarget->hasV6Ops();
21517
21518 unsigned Size = SI->getValueOperand()->getType()->getPrimitiveSizeInBits();
21519 return Size == 64 && has64BitAtomicStore ? AtomicExpansionKind::Expand
21521}
21522
21523// Loads and stores less than 64-bits are already atomic; ones above that
21524// are doomed anyway, so defer to the default libcall and blame the OS when
21525// things go wrong. Cortex M doesn't have ldrexd/strexd though, so don't emit
21526// anything for those.
21527// FIXME: ldrd and strd are atomic if the CPU has LPAE (e.g. A15 has that
21528// guarantee, see DDI0406C ARM architecture reference manual,
21529// sections A8.8.72-74 LDRD)
21532 bool has64BitAtomicLoad;
21533 if (Subtarget->isMClass())
21534 has64BitAtomicLoad = false;
21535 else if (Subtarget->isThumb())
21536 has64BitAtomicLoad = Subtarget->hasV7Ops();
21537 else
21538 has64BitAtomicLoad = Subtarget->hasV6Ops();
21539
21540 unsigned Size = LI->getType()->getPrimitiveSizeInBits();
21541 return (Size == 64 && has64BitAtomicLoad) ? AtomicExpansionKind::LLOnly
21543}
21544
21545// For the real atomic operations, we have ldrex/strex up to 32 bits,
21546// and up to 64 bits on the non-M profiles
21549 if (AI->isFloatingPointOperation())
21551
21552 unsigned Size = AI->getType()->getPrimitiveSizeInBits();
21553 bool hasAtomicRMW;
21554 if (Subtarget->isMClass())
21555 hasAtomicRMW = Subtarget->hasV8MBaselineOps();
21556 else if (Subtarget->isThumb())
21557 hasAtomicRMW = Subtarget->hasV7Ops();
21558 else
21559 hasAtomicRMW = Subtarget->hasV6Ops();
21560 if (Size <= (Subtarget->isMClass() ? 32U : 64U) && hasAtomicRMW) {
21561 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21562 // implement atomicrmw without spilling. If the target address is also on
21563 // the stack and close enough to the spill slot, this can lead to a
21564 // situation where the monitor always gets cleared and the atomic operation
21565 // can never succeed. So at -O0 lower this operation to a CAS loop.
21566 if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
21569 }
21571}
21572
21573// Similar to shouldExpandAtomicRMWInIR, ldrex/strex can be used up to 32
21574// bits, and up to 64 bits on the non-M profiles.
21577 const AtomicCmpXchgInst *AI) const {
21578 // At -O0, fast-regalloc cannot cope with the live vregs necessary to
21579 // implement cmpxchg without spilling. If the address being exchanged is also
21580 // on the stack and close enough to the spill slot, this can lead to a
21581 // situation where the monitor always gets cleared and the atomic operation
21582 // can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
21583 unsigned Size = AI->getOperand(1)->getType()->getPrimitiveSizeInBits();
21584 bool HasAtomicCmpXchg;
21585 if (Subtarget->isMClass())
21586 HasAtomicCmpXchg = Subtarget->hasV8MBaselineOps();
21587 else if (Subtarget->isThumb())
21588 HasAtomicCmpXchg = Subtarget->hasV7Ops();
21589 else
21590 HasAtomicCmpXchg = Subtarget->hasV6Ops();
21591 if (getTargetMachine().getOptLevel() != CodeGenOptLevel::None &&
21592 HasAtomicCmpXchg && Size <= (Subtarget->isMClass() ? 32U : 64U))
21595}
21596
21598 const Instruction *I) const {
21599 return InsertFencesForAtomic;
21600}
21601
21603 // ROPI/RWPI are not supported currently.
21604 return !Subtarget->isROPI() && !Subtarget->isRWPI();
21605}
21606
21608 Module &M, const LibcallLoweringInfo &Libcalls) const {
21609 // MSVC CRT provides functionalities for stack protection.
21610 RTLIB::LibcallImpl SecurityCheckCookieLibcall =
21611 Libcalls.getLibcallImpl(RTLIB::SECURITY_CHECK_COOKIE);
21612
21613 RTLIB::LibcallImpl SecurityCookieVar =
21614 Libcalls.getLibcallImpl(RTLIB::STACK_CHECK_GUARD);
21615 if (SecurityCheckCookieLibcall != RTLIB::Unsupported &&
21616 SecurityCookieVar != RTLIB::Unsupported) {
21617 // MSVC CRT has a global variable holding security cookie.
21618 M.getOrInsertGlobal(getLibcallImplName(SecurityCookieVar),
21619 PointerType::getUnqual(M.getContext()));
21620
21621 // MSVC CRT has a function to validate security cookie.
21622 FunctionCallee SecurityCheckCookie =
21623 M.getOrInsertFunction(getLibcallImplName(SecurityCheckCookieLibcall),
21624 Type::getVoidTy(M.getContext()),
21625 PointerType::getUnqual(M.getContext()));
21626 if (Function *F = dyn_cast<Function>(SecurityCheckCookie.getCallee()))
21627 F->addParamAttr(0, Attribute::AttrKind::InReg);
21628 }
21629
21631}
21632
21634 unsigned &Cost) const {
21635 // If we do not have NEON, vector types are not natively supported.
21636 if (!Subtarget->hasNEON())
21637 return false;
21638
21639 // Floating point values and vector values map to the same register file.
21640 // Therefore, although we could do a store extract of a vector type, it is
21641 // better to leave these as floats, since we have more freedom in the
21642 // addressing modes for those.
21643 if (VectorTy->isFPOrFPVectorTy())
21644 return false;
21645
21646 // If the index is unknown at compile time, this is very expensive to lower
21647 // and it is not possible to combine the store with the extract.
21648 if (!isa<ConstantInt>(Idx))
21649 return false;
21650
21651 assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
21652 unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedValue();
21653 // We can do a store + vector extract on any vector that fits perfectly in a D
21654 // or Q register.
21655 if (BitWidth == 64 || BitWidth == 128) {
21656 Cost = 0;
21657 return true;
21658 }
21659 return false;
21660}
21661
21663 SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
21664 UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const {
21665 unsigned Opcode = Op.getOpcode();
21666 switch (Opcode) {
21667 case ARMISD::VORRIMM:
21668 case ARMISD::VBICIMM:
21669 return false;
21670 }
21672 Op, DemandedElts, DAG, Kind, ConsiderFlags, Depth);
21673}
21674
21676 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21677}
21678
21680 return Subtarget->hasV5TOps() && !Subtarget->isThumb1Only();
21681}
21682
21684 const Instruction &AndI) const {
21685 if (!Subtarget->hasV7Ops())
21686 return false;
21687
21688 // Sink the `and` instruction only if the mask would fit into a modified
21689 // immediate operand.
21691 if (!Mask || Mask->getValue().getBitWidth() > 32u)
21692 return false;
21693 auto MaskVal = unsigned(Mask->getValue().getZExtValue());
21694 return (Subtarget->isThumb2() ? ARM_AM::getT2SOImmVal(MaskVal)
21695 : ARM_AM::getSOImmVal(MaskVal)) != -1;
21696}
21697
21700 SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const {
21701 if (Subtarget->hasMinSize() && !getTM().getTargetTriple().isOSWindows())
21704 ExpansionFactor);
21705}
21706
21708 Value *Addr,
21709 AtomicOrdering Ord) const {
21710 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21711 bool IsAcquire = isAcquireOrStronger(Ord);
21712
21713 // Since i64 isn't legal and intrinsics don't get type-lowered, the ldrexd
21714 // intrinsic must return {i32, i32} and we have to recombine them into a
21715 // single i64 here.
21716 if (ValueTy->getPrimitiveSizeInBits() == 64) {
21718 IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
21719
21720 Value *LoHi =
21721 Builder.CreateIntrinsic(Int, Addr, /*FMFSource=*/nullptr, "lohi");
21722
21723 Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
21724 Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
21725 if (!Subtarget->isLittle())
21726 std::swap (Lo, Hi);
21727 Lo = Builder.CreateZExt(Lo, ValueTy, "lo64");
21728 Hi = Builder.CreateZExt(Hi, ValueTy, "hi64");
21729 return Builder.CreateOr(
21730 Lo, Builder.CreateShl(Hi, ConstantInt::get(ValueTy, 32)), "val64");
21731 }
21732
21733 Type *Tys[] = { Addr->getType() };
21734 Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
21735 CallInst *CI = Builder.CreateIntrinsicWithoutFolding(Int, Tys, Addr);
21736
21737 CI->addParamAttr(
21738 0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
21739 return Builder.CreateTruncOrBitCast(CI, ValueTy);
21740}
21741
21743 IRBuilderBase &Builder) const {
21744 if (!Subtarget->hasV7Ops())
21745 return;
21746 Builder.CreateIntrinsic(Intrinsic::arm_clrex, {});
21747}
21748
21750 Value *Val, Value *Addr,
21751 AtomicOrdering Ord) const {
21752 Module *M = Builder.GetInsertBlock()->getParent()->getParent();
21753 bool IsRelease = isReleaseOrStronger(Ord);
21754
21755 // Since the intrinsics must have legal type, the i64 intrinsics take two
21756 // parameters: "i32, i32". We must marshal Val into the appropriate form
21757 // before the call.
21758 if (Val->getType()->getPrimitiveSizeInBits() == 64) {
21760 IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
21761 Type *Int32Ty = Type::getInt32Ty(M->getContext());
21762
21763 Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
21764 Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
21765 if (!Subtarget->isLittle())
21766 std::swap(Lo, Hi);
21767 return Builder.CreateIntrinsic(Int, {Lo, Hi, Addr});
21768 }
21769
21770 Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
21771 Type *Tys[] = { Addr->getType() };
21773
21774 CallInst *CI = Builder.CreateCall(
21775 Strex, {Builder.CreateZExtOrBitCast(
21776 Val, Strex->getFunctionType()->getParamType(0)),
21777 Addr});
21778 CI->addParamAttr(1, Attribute::get(M->getContext(), Attribute::ElementType,
21779 Val->getType()));
21780 return CI;
21781}
21782
21783
21785 return Subtarget->isMClass();
21786}
21787
21788/// A helper function for determining the number of interleaved accesses we
21789/// will generate when lowering accesses of the given type.
21790unsigned
21792 const DataLayout &DL) const {
21793 return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
21794}
21795
21797 unsigned Factor, FixedVectorType *VecTy, Align Alignment,
21798 const DataLayout &DL) const {
21799
21800 unsigned VecSize = DL.getTypeSizeInBits(VecTy);
21801 unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
21802
21803 if (!Subtarget->hasNEON() && !Subtarget->hasMVEIntegerOps())
21804 return false;
21805
21806 // Ensure the vector doesn't have f16 elements. Even though we could do an
21807 // i16 vldN, we can't hold the f16 vectors and will end up converting via
21808 // f32.
21809 if (Subtarget->hasNEON() && VecTy->getElementType()->isHalfTy())
21810 return false;
21811 if (Subtarget->hasMVEIntegerOps() && Factor == 3)
21812 return false;
21813
21814 // Ensure the number of vector elements is greater than 1.
21815 if (VecTy->getNumElements() < 2)
21816 return false;
21817
21818 // Ensure the element type is legal.
21819 if (ElSize != 8 && ElSize != 16 && ElSize != 32)
21820 return false;
21821 // And the alignment is high enough under MVE.
21822 if (Subtarget->hasMVEIntegerOps() && Alignment < ElSize / 8)
21823 return false;
21824
21825 // Ensure the total vector size is 64 or a multiple of 128. Types larger than
21826 // 128 will be split into multiple interleaved accesses.
21827 if (Subtarget->hasNEON() && VecSize == 64)
21828 return true;
21829 return VecSize % 128 == 0;
21830}
21831
21833 if (Subtarget->hasNEON())
21834 return 4;
21835 if (Subtarget->hasMVEIntegerOps())
21838}
21839
21840/// Lower an interleaved load into a vldN intrinsic.
21841///
21842/// E.g. Lower an interleaved load (Factor = 2):
21843/// %wide.vec = load <8 x i32>, <8 x i32>* %ptr, align 4
21844/// %v0 = shuffle %wide.vec, undef, <0, 2, 4, 6> ; Extract even elements
21845/// %v1 = shuffle %wide.vec, undef, <1, 3, 5, 7> ; Extract odd elements
21846///
21847/// Into:
21848/// %vld2 = { <4 x i32>, <4 x i32> } call llvm.arm.neon.vld2(%ptr, 4)
21849/// %vec0 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
21850/// %vec1 = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
21852 Instruction *Load, Value *Mask, ArrayRef<ShuffleVectorInst *> Shuffles,
21853 ArrayRef<unsigned> Indices, unsigned Factor, const APInt &GapMask) const {
21854 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
21855 "Invalid interleave factor");
21856 assert(!Shuffles.empty() && "Empty shufflevector input");
21857 assert(Shuffles.size() == Indices.size() &&
21858 "Unmatched number of shufflevectors and indices");
21859
21860 auto *LI = dyn_cast<LoadInst>(Load);
21861 if (!LI)
21862 return false;
21863 assert(!Mask && GapMask.popcount() == Factor && "Unexpected mask on a load");
21864
21865 auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
21866 Type *EltTy = VecTy->getElementType();
21867
21868 const DataLayout &DL = LI->getDataLayout();
21869 Align Alignment = LI->getAlign();
21870
21871 // Skip if we have neither NEON nor MVE, and skip illegal vector types. We
21872 // can "legalize" wide vector types into multiple interleaved accesses as
21873 // long as the vector size is a multiple of 128 bits.
21874 if (!isLegalInterleavedAccessType(Factor, VecTy, Alignment, DL))
21875 return false;
21876
21877 unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
21878
21879 // A pointer vector can not be the return type of the ldN intrinsics. Need to
21880 // load integer vectors first and then convert to pointer vectors.
21881 if (EltTy->isPointerTy())
21882 VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
21883
21884 IRBuilder<> Builder(LI);
21885
21886 // The base address of the load.
21887 Value *BaseAddr = LI->getPointerOperand();
21888
21889 if (NumLoads > 1) {
21890 // If we're going to generate more than one load, reset the sub-vector type
21891 // to something legal.
21892 VecTy = FixedVectorType::get(VecTy->getElementType(),
21893 VecTy->getNumElements() / NumLoads);
21894 }
21895
21896 assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
21897
21898 auto createLoadIntrinsic = [&](Value *BaseAddr) {
21899 if (Subtarget->hasNEON()) {
21900 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21901 Type *Tys[] = {VecTy, PtrTy};
21902 static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
21903 Intrinsic::arm_neon_vld3,
21904 Intrinsic::arm_neon_vld4};
21905
21907 Ops.push_back(BaseAddr);
21908 Ops.push_back(Builder.getInt32(LI->getAlign().value()));
21909
21910 return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
21911 /*FMFSource=*/nullptr, "vldN");
21912 } else {
21913 assert((Factor == 2 || Factor == 4) &&
21914 "expected interleave factor of 2 or 4 for MVE");
21915 Intrinsic::ID LoadInts =
21916 Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
21917 Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
21918 Type *Tys[] = {VecTy, PtrTy};
21919
21921 Ops.push_back(BaseAddr);
21922 return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
21923 "vldN");
21924 }
21925 };
21926
21927 // Holds sub-vectors extracted from the load intrinsic return values. The
21928 // sub-vectors are associated with the shufflevector instructions they will
21929 // replace.
21931
21932 for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
21933 // If we're generating more than one load, compute the base address of
21934 // subsequent loads as an offset from the previous.
21935 if (LoadCount > 0)
21936 BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
21937 VecTy->getNumElements() * Factor);
21938
21939 Value *VldN = createLoadIntrinsic(BaseAddr);
21940
21941 // Replace uses of each shufflevector with the corresponding vector loaded
21942 // by ldN.
21943 for (unsigned i = 0; i < Shuffles.size(); i++) {
21944 ShuffleVectorInst *SV = Shuffles[i];
21945 unsigned Index = Indices[i];
21946
21947 Value *SubVec = Builder.CreateExtractValue(VldN, Index);
21948
21949 // Convert the integer vector to pointer vector if the element is pointer.
21950 if (EltTy->isPointerTy())
21951 SubVec = Builder.CreateIntToPtr(
21952 SubVec,
21954
21955 SubVecs[SV].push_back(SubVec);
21956 }
21957 }
21958
21959 // Replace uses of the shufflevector instructions with the sub-vectors
21960 // returned by the load intrinsic. If a shufflevector instruction is
21961 // associated with more than one sub-vector, those sub-vectors will be
21962 // concatenated into a single wide vector.
21963 for (ShuffleVectorInst *SVI : Shuffles) {
21964 auto &SubVec = SubVecs[SVI];
21965 auto *WideVec =
21966 SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
21967 SVI->replaceAllUsesWith(WideVec);
21968 }
21969
21970 return true;
21971}
21972
21973/// Lower an interleaved store into a vstN intrinsic.
21974///
21975/// E.g. Lower an interleaved store (Factor = 3):
21976/// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
21977/// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
21978/// store <12 x i32> %i.vec, <12 x i32>* %ptr, align 4
21979///
21980/// Into:
21981/// %sub.v0 = shuffle <8 x i32> %v0, <8 x i32> %v1, <0, 1, 2, 3>
21982/// %sub.v1 = shuffle <8 x i32> %v0, <8 x i32> %v1, <4, 5, 6, 7>
21983/// %sub.v2 = shuffle <8 x i32> %v0, <8 x i32> %v1, <8, 9, 10, 11>
21984/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
21985///
21986/// Note that the new shufflevectors will be removed and we'll only generate one
21987/// vst3 instruction in CodeGen.
21988///
21989/// Example for a more general valid mask (Factor 3). Lower:
21990/// %i.vec = shuffle <32 x i32> %v0, <32 x i32> %v1,
21991/// <4, 32, 16, 5, 33, 17, 6, 34, 18, 7, 35, 19>
21992/// store <12 x i32> %i.vec, <12 x i32>* %ptr
21993///
21994/// Into:
21995/// %sub.v0 = shuffle <32 x i32> %v0, <32 x i32> %v1, <4, 5, 6, 7>
21996/// %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> %v1, <32, 33, 34, 35>
21997/// %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> %v1, <16, 17, 18, 19>
21998/// call void llvm.arm.neon.vst3(%ptr, %sub.v0, %sub.v1, %sub.v2, 4)
22000 Value *LaneMask,
22001 ShuffleVectorInst *SVI,
22002 unsigned Factor,
22003 const APInt &GapMask) const {
22004 assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
22005 "Invalid interleave factor");
22006 auto *SI = dyn_cast<StoreInst>(Store);
22007 if (!SI)
22008 return false;
22009 assert(!LaneMask && GapMask.popcount() == Factor &&
22010 "Unexpected mask on store");
22011
22012 auto *VecTy = cast<FixedVectorType>(SVI->getType());
22013 assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
22014
22015 unsigned LaneLen = VecTy->getNumElements() / Factor;
22016 Type *EltTy = VecTy->getElementType();
22017 auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
22018
22019 const DataLayout &DL = SI->getDataLayout();
22020 Align Alignment = SI->getAlign();
22021
22022 // Skip if we have neither NEON nor MVE, and skip illegal vector types. We
22023 // can "legalize" wide vector types into multiple interleaved accesses as
22024 // long as the vector size is a multiple of 128 bits.
22025 if (!isLegalInterleavedAccessType(Factor, SubVecTy, Alignment, DL))
22026 return false;
22027
22028 unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
22029
22030 Value *Op0 = SVI->getOperand(0);
22031 Value *Op1 = SVI->getOperand(1);
22032 IRBuilder<> Builder(SI);
22033
22034 // StN intrinsics don't support pointer vectors as arguments. Convert pointer
22035 // vectors to integer vectors.
22036 if (EltTy->isPointerTy()) {
22037 Type *IntTy = DL.getIntPtrType(EltTy);
22038
22039 // Convert to the corresponding integer vector.
22040 auto *IntVecTy =
22042 Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
22043 Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
22044
22045 SubVecTy = FixedVectorType::get(IntTy, LaneLen);
22046 }
22047
22048 // The base address of the store.
22049 Value *BaseAddr = SI->getPointerOperand();
22050
22051 if (NumStores > 1) {
22052 // If we're going to generate more than one store, reset the lane length
22053 // and sub-vector type to something legal.
22054 LaneLen /= NumStores;
22055 SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
22056 }
22057
22058 assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
22059
22060 auto Mask = SVI->getShuffleMask();
22061
22062 auto createStoreIntrinsic = [&](Value *BaseAddr,
22063 SmallVectorImpl<Value *> &Shuffles) {
22064 if (Subtarget->hasNEON()) {
22065 static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
22066 Intrinsic::arm_neon_vst3,
22067 Intrinsic::arm_neon_vst4};
22068 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
22069 Type *Tys[] = {PtrTy, SubVecTy};
22070
22072 Ops.push_back(BaseAddr);
22073 append_range(Ops, Shuffles);
22074 Ops.push_back(Builder.getInt32(SI->getAlign().value()));
22075 Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
22076 } else {
22077 assert((Factor == 2 || Factor == 4) &&
22078 "expected interleave factor of 2 or 4 for MVE");
22079 Intrinsic::ID StoreInts =
22080 Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
22081 Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
22082 Type *Tys[] = {PtrTy, SubVecTy};
22083
22085 Ops.push_back(BaseAddr);
22086 append_range(Ops, Shuffles);
22087 for (unsigned F = 0; F < Factor; F++) {
22088 Ops.push_back(Builder.getInt32(F));
22089 Builder.CreateIntrinsic(StoreInts, Tys, Ops);
22090 Ops.pop_back();
22091 }
22092 }
22093 };
22094
22095 for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
22096 // If we're generating more than one store, we compute the base address of
22097 // subsequent stores as an offset from the previous.
22098 if (StoreCount > 0)
22099 BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
22100 BaseAddr, LaneLen * Factor);
22101
22102 SmallVector<Value *, 4> Shuffles;
22103
22104 // Split the shufflevector operands into sub vectors for the new vstN call.
22105 for (unsigned i = 0; i < Factor; i++) {
22106 unsigned IdxI = StoreCount * LaneLen * Factor + i;
22107 if (Mask[IdxI] >= 0) {
22108 Shuffles.push_back(Builder.CreateShuffleVector(
22109 Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
22110 } else {
22111 unsigned StartMask = 0;
22112 for (unsigned j = 1; j < LaneLen; j++) {
22113 unsigned IdxJ = StoreCount * LaneLen * Factor + j;
22114 if (Mask[IdxJ * Factor + IdxI] >= 0) {
22115 StartMask = Mask[IdxJ * Factor + IdxI] - IdxJ;
22116 break;
22117 }
22118 }
22119 // Note: If all elements in a chunk are undefs, StartMask=0!
22120 // Note: Filling undef gaps with random elements is ok, since
22121 // those elements were being written anyway (with undefs).
22122 // In the case of all undefs we're defaulting to using elems from 0
22123 // Note: StartMask cannot be negative, it's checked in
22124 // isReInterleaveMask
22125 Shuffles.push_back(Builder.CreateShuffleVector(
22126 Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
22127 }
22128 }
22129
22130 createStoreIntrinsic(BaseAddr, Shuffles);
22131 }
22132 return true;
22133}
22134
22142
22144 uint64_t &Members) {
22145 if (auto *ST = dyn_cast<StructType>(Ty)) {
22146 for (unsigned i = 0; i < ST->getNumElements(); ++i) {
22147 uint64_t SubMembers = 0;
22148 if (!isHomogeneousAggregate(ST->getElementType(i), Base, SubMembers))
22149 return false;
22150 Members += SubMembers;
22151 }
22152 } else if (auto *AT = dyn_cast<ArrayType>(Ty)) {
22153 uint64_t SubMembers = 0;
22154 if (!isHomogeneousAggregate(AT->getElementType(), Base, SubMembers))
22155 return false;
22156 Members += SubMembers * AT->getNumElements();
22157 } else if (Ty->isFloatTy()) {
22158 if (Base != HA_UNKNOWN && Base != HA_FLOAT)
22159 return false;
22160 Members = 1;
22161 Base = HA_FLOAT;
22162 } else if (Ty->isDoubleTy()) {
22163 if (Base != HA_UNKNOWN && Base != HA_DOUBLE)
22164 return false;
22165 Members = 1;
22166 Base = HA_DOUBLE;
22167 } else if (auto *VT = dyn_cast<VectorType>(Ty)) {
22168 Members = 1;
22169 switch (Base) {
22170 case HA_FLOAT:
22171 case HA_DOUBLE:
22172 return false;
22173 case HA_VECT64:
22174 return VT->getPrimitiveSizeInBits().getFixedValue() == 64;
22175 case HA_VECT128:
22176 return VT->getPrimitiveSizeInBits().getFixedValue() == 128;
22177 case HA_UNKNOWN:
22178 switch (VT->getPrimitiveSizeInBits().getFixedValue()) {
22179 case 64:
22180 Base = HA_VECT64;
22181 return true;
22182 case 128:
22183 Base = HA_VECT128;
22184 return true;
22185 default:
22186 return false;
22187 }
22188 }
22189 }
22190
22191 return (Members > 0 && Members <= 4);
22192}
22193
22194/// Return the correct alignment for the current calling convention.
22196 Type *ArgTy, const DataLayout &DL) const {
22197 const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
22198 if (!ArgTy->isVectorTy())
22199 return ABITypeAlign;
22200
22201 // Avoid over-aligning vector parameters. It would require realigning the
22202 // stack and waste space for no real benefit.
22203 MaybeAlign StackAlign = DL.getStackAlignment();
22204 assert(StackAlign && "data layout string is missing stack alignment");
22205 return std::min(ABITypeAlign, *StackAlign);
22206}
22207
22208/// Return true if a type is an AAPCS-VFP homogeneous aggregate or one of
22209/// [N x i32] or [N x i64]. This allows front-ends to skip emitting padding when
22210/// passing according to AAPCS rules.
22212 Type *Ty, CallingConv::ID CallConv, bool isVarArg,
22213 const DataLayout &DL) const {
22214 if (getEffectiveCallingConv(CallConv, isVarArg) !=
22216 return false;
22217
22219 uint64_t Members = 0;
22220 bool IsHA = isHomogeneousAggregate(Ty, Base, Members);
22221 LLVM_DEBUG(dbgs() << "isHA: " << IsHA << " "; Ty->dump());
22222
22223 bool IsIntArray = Ty->isArrayTy() && Ty->getArrayElementType()->isIntegerTy();
22224 return IsHA || IsIntArray;
22225}
22226
22228 const Constant *PersonalityFn) const {
22229 // Platforms which do not use SjLj EH may return values in these registers
22230 // via the personality function.
22232 return EM == ExceptionHandling::SjLj ? Register() : ARM::R0;
22233}
22234
22236 const Constant *PersonalityFn) const {
22237 // Platforms which do not use SjLj EH may return values in these registers
22238 // via the personality function.
22240 return EM == ExceptionHandling::SjLj ? Register() : ARM::R1;
22241}
22242
/// Mark the function that contains \p Entry as using split callee-saved
/// register handling.
///
/// The only effect is to set the IsSplitCSR flag on that function's
/// ARMFunctionInfo.
///
/// \param Entry A basic block of the function to mark; only its parent
///              function is consulted.
22243void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
22244 // Update IsSplitCSR in ARMFunctionInfo.
22245 ARMFunctionInfo *AFI = Entry->getParent()->getInfo<ARMFunctionInfo>();
22246 AFI->setIsSplitCSR(true);
22247}
22248
/// Preserve the "callee-saved via copy" registers of a function with explicit
/// COPY instructions.
///
/// For each register in the list returned by getCalleeSavedRegsViaCopy() for
/// the function containing \p Entry, this creates a fresh virtual register,
/// marks the physical register live-in to \p Entry, emits a COPY from the
/// physical register into the virtual register in \p Entry, and emits a COPY
/// back into the physical register before the first terminator of every block
/// in \p Exits. If the register list is null, nothing is changed.
///
/// \param Entry The block that receives the saving copies.
/// \param Exits The blocks that receive the restoring copies.
22249void ARMTargetLowering::insertCopiesSplitCSR(
22250 MachineBasicBlock *Entry,
22251 const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
22252 const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
22253 const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
// No registers are handled via copy for this function: nothing to do.
22254 if (!IStart)
22255 return;
22256
22257 const TargetInstrInfo *TII = Subtarget->getInstrInfo();
22258 MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
// Insertion point for the saving copies: the start of the entry block,
// captured once before any copy is added.
22259 MachineBasicBlock::iterator MBBI = Entry->begin();
// Walk the register list until its zero terminator entry.
22260 for (const MCPhysReg *I = IStart; *I; ++I) {
// Pick the register class for the new virtual register. Only GPR and DPR
// registers are handled; anything else is treated as unreachable.
22261 const TargetRegisterClass *RC = nullptr;
22262 if (ARM::GPRRegClass.contains(*I))
22263 RC = &ARM::GPRRegClass;
22264 else if (ARM::DPRRegClass.contains(*I))
22265 RC = &ARM::DPRRegClass;
22266 else
22267 llvm_unreachable("Unexpected register class in CSRsViaCopy!");
22268
22269 Register NewVR = MRI->createVirtualRegister(RC);
22270 // Create copy from CSR to a virtual register.
22271 // FIXME: this currently does not emit CFI pseudo-instructions, it works
22272 // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
22273 // nounwind. If we want to generalize this later, we may need to emit
22274 // CFI pseudo-instructions.
22275 assert(Entry->getParent()->getFunction().hasFnAttribute(
22276 Attribute::NoUnwind) &&
22277 "Function should be nounwind in insertCopiesSplitCSR!");
// The physical register is read by the copy below, so it must be live-in.
22278 Entry->addLiveIn(*I);
22279 BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
22280 .addReg(*I);
22281
22282 // Insert the copy-back instructions right before the terminator.
22283 for (auto *Exit : Exits)
22284 BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
22285 TII->get(TargetOpcode::COPY), *I)
22286 .addReg(NewVR);
22287 }
22288}
22289
22294
22296 return Subtarget->hasMVEIntegerOps();
22297}
22298
22301 auto *VTy = dyn_cast<FixedVectorType>(Ty);
22302 if (!VTy)
22303 return false;
22304
22305 auto *ScalarTy = VTy->getScalarType();
22306 unsigned NumElements = VTy->getNumElements();
22307
22308 unsigned VTyWidth = VTy->getScalarSizeInBits() * NumElements;
22309 if (VTyWidth < 128 || !llvm::isPowerOf2_32(VTyWidth))
22310 return false;
22311
22312 // Both VCADD and VCMUL/VCMLA support the same types, F16 and F32
22313 if (ScalarTy->isHalfTy() || ScalarTy->isFloatTy())
22314 return Subtarget->hasMVEFloatOps();
22315
22317 return false;
22318
22319 return Subtarget->hasMVEIntegerOps() &&
22320 (ScalarTy->isIntegerTy(8) || ScalarTy->isIntegerTy(16) ||
22321 ScalarTy->isIntegerTy(32));
22322}
22323
22325 static const MCPhysReg RCRegs[] = {ARM::FPSCR_RM};
22326 return RCRegs;
22327}
22328
22331 ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB,
22332 Value *Accumulator) const {
22333
22335
22336 unsigned TyWidth = Ty->getScalarSizeInBits() * Ty->getNumElements();
22337
22338 assert(TyWidth >= 128 && "Width of vector type must be at least 128 bits");
22339
22340 if (TyWidth > 128) {
22341 int Stride = Ty->getNumElements() / 2;
22342 auto SplitSeq = llvm::seq<int>(0, Ty->getNumElements());
22343 auto SplitSeqVec = llvm::to_vector(SplitSeq);
22344 ArrayRef<int> LowerSplitMask(&SplitSeqVec[0], Stride);
22345 ArrayRef<int> UpperSplitMask(&SplitSeqVec[Stride], Stride);
22346
22347 auto *LowerSplitA = B.CreateShuffleVector(InputA, LowerSplitMask);
22348 auto *LowerSplitB = B.CreateShuffleVector(InputB, LowerSplitMask);
22349 auto *UpperSplitA = B.CreateShuffleVector(InputA, UpperSplitMask);
22350 auto *UpperSplitB = B.CreateShuffleVector(InputB, UpperSplitMask);
22351 Value *LowerSplitAcc = nullptr;
22352 Value *UpperSplitAcc = nullptr;
22353
22354 if (Accumulator) {
22355 LowerSplitAcc = B.CreateShuffleVector(Accumulator, LowerSplitMask);
22356 UpperSplitAcc = B.CreateShuffleVector(Accumulator, UpperSplitMask);
22357 }
22358
22359 auto *LowerSplitInt = createComplexDeinterleavingIR(
22360 B, OperationType, Rotation, LowerSplitA, LowerSplitB, LowerSplitAcc);
22361 auto *UpperSplitInt = createComplexDeinterleavingIR(
22362 B, OperationType, Rotation, UpperSplitA, UpperSplitB, UpperSplitAcc);
22363
22364 ArrayRef<int> JoinMask(&SplitSeqVec[0], Ty->getNumElements());
22365 return B.CreateShuffleVector(LowerSplitInt, UpperSplitInt, JoinMask);
22366 }
22367
22368 auto *IntTy = Type::getInt32Ty(B.getContext());
22369
22370 ConstantInt *ConstRotation = nullptr;
22371 if (OperationType == ComplexDeinterleavingOperation::CMulPartial) {
22372 ConstRotation = ConstantInt::get(IntTy, (int)Rotation);
22373
22374 if (Accumulator)
22375 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmlaq, Ty,
22376 {ConstRotation, Accumulator, InputB, InputA});
22377 return B.CreateIntrinsic(Intrinsic::arm_mve_vcmulq, Ty,
22378 {ConstRotation, InputB, InputA});
22379 }
22380
22381 if (OperationType == ComplexDeinterleavingOperation::CAdd) {
22382 // 1 means the value is not halved.
22383 auto *ConstHalving = ConstantInt::get(IntTy, 1);
22384
22386 ConstRotation = ConstantInt::get(IntTy, 0);
22388 ConstRotation = ConstantInt::get(IntTy, 1);
22389
22390 if (!ConstRotation)
22391 return nullptr; // Invalid rotation for arm_mve_vcaddq
22392
22393 return B.CreateIntrinsic(Intrinsic::arm_mve_vcaddq, Ty,
22394 {ConstHalving, ConstRotation, InputA, InputB});
22395 }
22396
22397 return nullptr;
22398}
static bool isAddSubSExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt)
isVShiftRImm - Check if this is a valid build_vector for the immediate operand of a vector shift righ...
static bool isExtendedBUILD_VECTOR(SDValue N, SelectionDAG &DAG, bool isSigned)
static SDValue carryFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG, bool Invert)
static SDValue overflowFlagToValue(SDValue Glue, EVT VT, SelectionDAG &DAG)
static bool isZeroExtended(SDValue N, SelectionDAG &DAG)
return SDValue()
static bool isCMN(SDValue Op, ISD::CondCode CC, SelectionDAG &DAG)
static const MCPhysReg GPRArgRegs[]
static SDValue valueToCarryFlag(SDValue Value, SelectionDAG &DAG, bool Invert)
static SDValue GeneratePerfectShuffle(unsigned ID, SDValue V1, SDValue V2, unsigned PFEntry, SDValue LHS, SDValue RHS, SelectionDAG &DAG, const SDLoc &DL)
GeneratePerfectShuffle - Given an entry in the perfect-shuffle table, emit the specified operations t...
constexpr MVT FlagsVT
Value type used for NZCV flags.
static bool getVShiftImm(SDValue Op, unsigned ElementBits, int64_t &Cnt)
getVShiftImm - Check if this is a valid build_vector for the immediate operand of a vector shift oper...
static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm, const APInt &Demanded, TargetLowering::TargetLoweringOpt &TLO, unsigned NewOpc)
static bool isSafeSignedCMN(SDValue Op, SelectionDAG &DAG)
static SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG)
static bool isSignExtended(SDValue N, SelectionDAG &DAG)
static bool isAddSubZExt(SDValue N, SelectionDAG &DAG)
static bool isVShiftLImm(SDValue Op, EVT VT, bool isLong, int64_t &Cnt)
isVShiftLImm - Check if this is a valid build_vector for the immediate operand of a vector shift left...
static bool canGuaranteeTCO(CallingConv::ID CC, bool GuaranteeTailCalls)
Return true if the calling convention is one that we can guarantee TCO for.
unsigned RegSize
assert(UImm &&(UImm !=~static_cast< T >(0)) &&"Invalid immediate!")
amdgpu aa AMDGPU Address space based Alias Analysis Wrapper
static bool isConstant(const MachineInstr &MI)
constexpr LLT F64
constexpr LLT S1
This file declares a class to represent arbitrary precision floating point values and provide a varie...
This file implements a class to represent arbitrary precision integral constant values and operations...
static SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG)
static bool isStore(int Opcode)
static bool isThumb(const MCSubtargetInfo &STI)
static SDValue PerformExtractEltToVMOVRRD(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isIncompatibleReg(const MCPhysReg &PR, MVT VT)
static SDValue PerformVQDMULHCombine(SDNode *N, SelectionDAG &DAG)
static SDValue LowerBUILD_VECTOR_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddRequiredExtensionForVMULL(SDValue N, SelectionDAG &DAG, const EVT &OrigTy, const EVT &ExtTy, unsigned ExtOpcode)
AddRequiredExtensionForVMULL - Add a sign/zero extension to extend the total value size to 64 bits.
static cl::opt< unsigned > ConstpoolPromotionMaxSize("arm-promote-constant-max-size", cl::Hidden, cl::desc("Maximum size of constant to promote into a constant pool"), cl::init(64))
static bool isZeroOrAllOnes(SDValue N, bool AllOnes)
static SDValue LowerINSERT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isVTBLMask(ArrayRef< int > M, EVT VT)
static SDValue PerformSUBCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSUBCombine - Target-specific dag combine xforms for ISD::SUB.
static cl::opt< bool > EnableConstpoolPromotion("arm-promote-constant", cl::Hidden, cl::desc("Enable / disable promotion of unnamed_addr constants into " "constant pools"), cl::init(false))
static SDValue PerformFAddVSelectCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformExtractFpToIntStores(StoreSDNode *St, SelectionDAG &DAG)
static SDValue PerformVDUPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVDUPCombine - Target-specific dag combine xforms for ARMISD::VDUP.
static SDValue PerformExtractEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static const APInt * isPowerOf2Constant(SDValue V)
static SDValue PerformVCVTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVCVTCombine - VCVT (floating-point to fixed-point, Advanced SIMD) can replace combinations of ...
static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorFP_TO_INT(SDValue Op, SelectionDAG &DAG)
static SDValue LowerVECTOR_SHUFFLEUsingOneOff(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static bool isValidMVECond(unsigned CC, bool IsFloat)
static SDValue PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC)
IntCCToARMCC - Convert a DAG integer condition code to an ARM CC.
static SDValue PerformSTORECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformSTORECombine - Target-specific dag combine xforms for ISD::STORE.
static SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isGTorGE(ISD::CondCode CC)
static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a vldN-lane (N > 1) intrinsic,...
static SDValue ParseBFI(SDNode *N, APInt &ToMask, APInt &FromMask)
static bool isReverseMask(ArrayRef< int > M, EVT VT)
static bool isVZIP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVZIP_v_undef_Mask - Special case of isVZIPMask for canonical form of "vector_shuffle v,...
static SDValue PerformSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVECTOR_REG_CASTCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVMulVCTPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
PerformVMulVCTPCombine - VCVT (fixed-point to floating-point, Advanced SIMD) can replace combinations...
static SDValue createGPRPairNode2xi32(SelectionDAG &DAG, SDValue V0, SDValue V1)
static SDValue bitcastf32Toi32(SDValue Op, SelectionDAG &DAG)
static bool findPointerConstIncrement(SDNode *N, SDValue *Ptr, SDValue *CInc)
static bool isVTRNMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool CanInvertMVEVCMP(SDValue N)
static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG)
static SDValue AddCombineToVPADD(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformShiftCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
PerformShiftCombine - Checks for immediate versions of vector shifts and lowers them.
static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode, ARMCC::CondCodes &CondCode2)
FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
static void ExpandREAD_REGISTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static EVT getVectorTyFromPredicateVector(EVT VT)
static SDValue PerformFADDVCMLACombine(SDNode *N, SelectionDAG &DAG)
static SDValue handleCMSEValue(const SDValue &Value, const ISD::InputArg &Arg, SelectionDAG &DAG, const SDLoc &DL)
static SDValue PerformARMBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
Target-specific dag combine xforms for ARMISD::BUILD_VECTOR.
static bool isSRL16(const SDValue &Op)
static SDValue PerformVMOVrhCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformLOADCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue IsCMPZCSINC(SDNode *Cmp, ARMCC::CondCodes &CC)
static unsigned getPointerConstIncrement(unsigned Opcode, SDValue Ptr, SDValue Inc, const SelectionDAG &DAG)
static SDValue combineSelectAndUseCommutative(SDNode *N, bool AllOnes, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static Register genTPEntry(MachineBasicBlock *TpEntry, MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpExit, Register OpSizeReg, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI)
Adds logic in loop entry MBB to calculate loop iteration count and adds t2WhileLoopSetup and t2WhileL...
static SDValue createGPRPairNodei64(SelectionDAG &DAG, SDValue V)
static bool isLTorLE(ISD::CondCode CC)
static SDValue PerformVCMPCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerSDIV_v4i16(SDValue N0, SDValue N1, const SDLoc &dl, SelectionDAG &DAG)
static SDValue performNegCMovCombine(SDNode *N, SelectionDAG &DAG)
static EVT getExtensionTo64Bits(const EVT &OrigVT)
static SDValue PerformBITCASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue AddCombineTo64bitMLAL(SDNode *AddeSubeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerWRITE_REGISTER(SDValue Op, SelectionDAG &DAG)
static bool checkAndUpdateCPSRKill(MachineBasicBlock::iterator SelectItr, MachineBasicBlock *BB, const TargetRegisterInfo *TRI)
static SDValue PerformCMPZCombine(SDNode *N, SelectionDAG &DAG)
static bool hasNormalLoadOperand(SDNode *N)
hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node are normal,...
static SDValue PerformInsertEltCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
PerformInsertEltCombine - Target-specific dag combine xforms for ISD::INSERT_VECTOR_ELT.
static SDValue PerformVDUPLANECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVDUPLANECombine - Target-specific dag combine xforms for ARMISD::VDUPLANE.
static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static cl::opt< unsigned > ConstpoolPromotionMaxTotal("arm-promote-constant-max-total", cl::Hidden, cl::desc("Maximum size of ALL constants to promote into a constant pool"), cl::init(128))
static SDValue LowerTruncatei1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static RTLIB::Libcall getDivRemLibcall(const SDNode *N, MVT::SimpleValueType SVT)
static SDValue SkipLoadExtensionForVMULL(LoadSDNode *LD, SelectionDAG &DAG)
SkipLoadExtensionForVMULL - return a load of the original vector size that does not do any sign/zero ...
static SDValue AddCombineVUZPToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombineWithOperands - Try DAG combinations for an ADD with operands N0 and N1.
static SDValue PromoteMVEPredVector(SDLoc dl, SDValue Pred, EVT VT, SelectionDAG &DAG)
static SDValue matchCSET(unsigned &Opcode, bool &InvertCond, SDValue TrueVal, SDValue FalseVal, const ARMSubtarget *Subtarget)
static bool isVZIPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformORCombineToSMULWBT(SDNode *OR, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isVTRN_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVTRN_v_undef_Mask - Special case of isVTRNMask for canonical form of "vector_shuffle v,...
static SDValue LowerUDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue FindBFIToCombineWith(SDNode *N)
static SDValue LowerADDSUBSAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static void checkVSELConstraints(ISD::CondCode CC, ARMCC::CondCodes &CondCode, bool &swpCmpOps, bool &swpVselOps)
static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isS16(const SDValue &Op, SelectionDAG &DAG)
static bool isSRA16(const SDValue &Op)
static SDValue AddCombineBUILD_VECTORToVPADDL(SDNode *N, SDValue N0, SDValue N1, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerVECTOR_SHUFFLEUsingMovs(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue LowerInterruptReturn(SmallVectorImpl< SDValue > &RetOps, const SDLoc &DL, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getInvertedARMCondCode(SDValue ARMcc, SelectionDAG &DAG)
static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, const SDLoc &dl, SelectionDAG &DAG)
static void expandf64Toi32(SDValue Op, SelectionDAG &DAG, SDValue &RetVal1, SDValue &RetVal2)
static SDValue LowerCONCAT_VECTORS_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static bool isSHL16(const SDValue &Op)
static bool isVEXTMask(ArrayRef< int > M, EVT VT, bool &ReverseVEXT, unsigned &Imm)
static SDValue PerformMVEVLDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
cl::opt< unsigned > ArmMaxBaseUpdatesToCheck("arm-max-base-updates-to-check", cl::Hidden, cl::desc("Maximum number of base-updates to check generating postindex."), cl::init(64))
static bool isTruncMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue PerformADDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
static unsigned getLdOpcode(unsigned LdSize, bool IsThumb1, bool IsThumb2)
Return the load opcode for a given load size.
static SDValue LowerADDSUBO_CARRY(SDValue Op, SelectionDAG &DAG, unsigned Opcode, bool IsSigned)
static bool isLegalT2AddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
static bool isLegalMVEShuffleOp(unsigned PFEntry)
static SDValue PerformSignExtendInregCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformShuffleVMOVNCombine(ShuffleVectorSDNode *N, SelectionDAG &DAG)
static bool isVUZPMask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG)
PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for ISD::VECTOR_SHUFFLE.
static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG)
SkipExtensionForVMULL - For a node that is a SIGN_EXTEND, ZERO_EXTEND, ANY_EXTEND,...
static int getNegationCost(SDValue Op)
static bool isVMOVNTruncMask(ArrayRef< int > M, EVT ToVT, bool rev)
static SDValue PerformVQMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static MachineBasicBlock * OtherSucc(MachineBasicBlock *MBB, MachineBasicBlock *Succ)
static SDValue LowerVecReduceMinMax(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformAddcSubcCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformVSELECTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static TargetLowering::ArgListTy getDivRemArgList(const SDNode *N, LLVMContext *Context, const ARMSubtarget *Subtarget)
static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue getZeroVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl)
getZeroVector - Returns a vector of specified type with all zero elements.
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static ARMCC::CondCodes getVCMPCondCode(SDValue N)
static cl::opt< bool > ARMInterworking("arm-interworking", cl::Hidden, cl::desc("Enable / disable ARM interworking (for debugging only)"), cl::init(true))
static void ReplaceREADCYCLECOUNTER(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformORCombineToBFI(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes, SDValue &CC, bool &Invert, SDValue &OtherOp, SelectionDAG &DAG)
static SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformVSetCCToVCTPCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerBUILD_VECTORToVIDUP(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isZeroVector(SDValue N)
static SDValue PerformAddeSubeCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void ReplaceCMP_SWAP_64Results(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG)
static bool isLowerSaturate(const SDValue LHS, const SDValue RHS, const SDValue TrueVal, const SDValue FalseVal, const ISD::CondCode CC, const SDValue K)
static bool isLegalLogicalImmediate(unsigned Imm, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG)
static void emitPostSt(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned StSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment store operation with given size.
static bool isVMOVNMask(ArrayRef< int > M, EVT VT, bool Top, bool SingleSource)
static SDValue CombineBaseUpdate(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
CombineBaseUpdate - Target-specific DAG combine function for VLDDUP, NEON load/store intrinsics,...
static SDValue LowerSaturatingConditional(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSubCSINCCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVRRDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMOVRRDCombine - Target-specific dag combine xforms for ARMISD::VMOVRRD.
static SDValue LowerFP_TO_INT_SAT(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformCSETCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformVMOVNCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue PerformInsertSubvectorCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI)
static SDValue LowerVectorExtend(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue WinDBZCheckDenominator(SelectionDAG &DAG, SDNode *N, SDValue InChain)
static SDValue LowerVECTOR_SHUFFLEv8i8(SDValue Op, ArrayRef< int > ShuffleMask, SelectionDAG &DAG)
static SDValue PerformVMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformVMULCombine Distribute (A + B) * C to (A * C) + (B * C) to take advantage of the special multi...
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG)
static SDValue PerformBFICombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformORCombine - Target-specific dag combine xforms for ISD::OR.
static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG)
static SDValue PerformTruncatingStoreCombine(StoreSDNode *St, SelectionDAG &DAG)
static unsigned SelectPairHalf(unsigned Elements, ArrayRef< int > Mask, unsigned Index)
static void emitPostLd(MachineBasicBlock *BB, MachineBasicBlock::iterator Pos, const TargetInstrInfo *TII, const DebugLoc &dl, unsigned LdSize, unsigned Data, unsigned AddrIn, unsigned AddrOut, bool IsThumb1, bool IsThumb2)
Emit a post-increment load operation with given size.
static SDValue TryDistrubutionADDVecReduce(SDNode *N, SelectionDAG &DAG)
static bool isValidBaseUpdate(SDNode *N, SDNode *User)
static SDValue IsSingleInstrConstant(SDValue N, SelectionDAG &DAG, const ARMSubtarget *ST, const SDLoc &dl)
static bool IsQRMVEInstruction(const SDNode *N, const SDNode *Op)
static SDValue PerformMinMaxToSatCombine(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformXORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment, bool isSEXTLoad, bool IsMasked, bool isLE, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
std::pair< unsigned, const TargetRegisterClass * > RCPair
static SDValue combineSelectAndUse(SDNode *N, SDValue Slct, SDValue OtherOp, TargetLowering::DAGCombinerInfo &DCI, bool AllOnes=false)
static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND, ISD::ZERO_EXTEND,...
static SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
cl::opt< unsigned > MVEMaxSupportedInterleaveFactor("mve-max-interleave-factor", cl::Hidden, cl::desc("Maximum interleave factor for MVE VLDn to generate."), cl::init(2))
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, EVT VectorVT, VMOVModImmType type)
isVMOVModifiedImm - Check if the specified splat value corresponds to a valid vector constant for a N...
static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC, SelectionDAG &DAG)
BC is a bitcast that is about to be turned into a VMOVDRR.
static SDValue promoteToConstantPool(const ARMTargetLowering *TLI, const GlobalValue *GV, SelectionDAG &DAG, EVT PtrVT, const SDLoc &dl)
static unsigned isNEONTwoResultShuffleMask(ArrayRef< int > ShuffleMask, EVT VT, unsigned &WhichResult, bool &isV_UNDEF)
Check if ShuffleMask is a NEON two-result shuffle (VZIP, VUZP, VTRN), and return the corresponding AR...
static bool BitsProperlyConcatenate(const APInt &A, const APInt &B)
static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT, bool isSEXTLoad, SDValue &Base, SDValue &Offset, bool &isInc, SelectionDAG &DAG)
static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG)
static bool TryCombineBaseUpdate(struct BaseUpdateTarget &Target, struct BaseUpdateUser &User, bool SimpleConstIncOnly, TargetLowering::DAGCombinerInfo &DCI)
static bool allUsersAreInFunction(const Value *V, const Function *F)
Return true if all users of V are within function F, looking through ConstantExprs.
static bool isSingletonVEXTMask(ArrayRef< int > M, EVT VT, unsigned &Imm)
static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG)
PerformVMOVDRRCombine - Target-specific dag combine xforms for ARMISD::VMOVDRR.
static bool isLowerSaturatingConditional(const SDValue &Op, SDValue &V, SDValue &SatK)
static bool isLegalAddressImmediate(int64_t V, EVT VT, const ARMSubtarget *Subtarget)
isLegalAddressImmediate - Return true if the integer value can be used as the offset of the target ad...
static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static bool isLegalT1AddressImmediate(int64_t V, EVT VT)
static SDValue CombineANDShift(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG)
static SDValue PerformSHLSimplify(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformADDECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformADDECombine - Target-specific dag combine transform from ARMISD::ADDC, ARMISD::ADDE,...
static SDValue PerformReduceShuffleCombine(SDNode *N, SelectionDAG &DAG)
static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformHWLoopCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *ST)
static SDValue PerformORCombineToShiftInsert(SelectionDAG &DAG, SDValue AndOp, SDValue ShiftOp, EVT VT, SDLoc dl)
static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St, SelectionDAG &DAG)
static bool isVUZP_v_undef_Mask(ArrayRef< int > M, EVT VT, unsigned &WhichResult)
isVUZP_v_undef_Mask - Special case of isVUZPMask for canonical form of "vector_shuffle v,...
static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base, uint64_t &Members)
static SDValue PerformMULCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerReverse_VECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG)
static SDValue PerformANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static SDValue PerformADDVecReduce(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue LowerPredicateStore(SDValue Op, SelectionDAG &DAG)
static SDValue SearchLoopIntrinsic(SDValue N, ISD::CondCode &CC, int &Imm, bool &Negate)
static bool canChangeToInt(SDValue Op, bool &SeenZero, const ARMSubtarget *Subtarget)
canChangeToInt - Given the fp compare operand, return true if it is suitable to morph to an integer c...
static unsigned getStOpcode(unsigned StSize, bool IsThumb1, bool IsThumb2)
Return the store opcode for a given store size.
static bool IsVUZPShuffleNode(SDNode *N)
static SDValue Expand64BitShift(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
static void attachMEMCPYScratchRegs(const ARMSubtarget *Subtarget, MachineInstr &MI, const SDNode *Node)
Attaches vregs to MEMCPY that it will use as scratch registers when it is expanded into LDM/STM.
static bool isFloatingPointZero(SDValue Op)
isFloatingPointZero - Return true if this is +0.0.
static SDValue findMUL_LOHI(SDValue V)
static SDValue LowerVECTOR_SHUFFLE_i1(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformORCombine_i1(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *Subtarget)
static SDValue PerformSplittingMVEEXTToWideningLoad(SDNode *N, SelectionDAG &DAG)
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG)
static void genTPLoopBody(MachineBasicBlock *TpLoopBody, MachineBasicBlock *TpEntry, MachineBasicBlock *TpExit, const TargetInstrInfo *TII, DebugLoc Dl, MachineRegisterInfo &MRI, Register OpSrcReg, Register OpDestReg, Register ElementCountReg, Register TotalIterationsReg, bool IsMemcpy)
Adds logic in the loopBody MBB to generate MVE_VCTP, t2DoLoopDec and t2DoLoopEnd.
static SDValue PerformBUILD_VECTORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const ARMSubtarget *Subtarget)
PerformBUILD_VECTORCombine - Target-specific dag combine xforms for ISD::BUILD_VECTOR.
static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *ST)
static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG, const ARMSubtarget *ST)
PerformMinMaxCombine - Target-specific DAG combining for creating truncating saturates.
MachineBasicBlock & MBB
MachineBasicBlock MachineBasicBlock::iterator DebugLoc DL
MachineBasicBlock MachineBasicBlock::iterator MBBI
This file defines a TargetTransformInfoImplBase conforming object specific to the ARM target machine.
Function Alias Analysis false
Function Alias Analysis Results
Atomic ordering constants.
This file contains the simple types necessary to represent the attributes associated with functions a...
#define X(NUM, ENUM, NAME)
Definition ELF.h:856
This file implements the BitVector class.
static GCRegistry::Add< ErlangGC > A("erlang", "erlang-compatible garbage collector")
static GCRegistry::Add< CoreCLRGC > E("coreclr", "CoreCLR-compatible GC")
static GCRegistry::Add< OcamlGC > B("ocaml", "ocaml 3.10-compatible GC")
static std::optional< bool > isBigEndian(const SmallDenseMap< int64_t, int64_t, 8 > &MemOffset2Idx, int64_t LowestIdx)
Given a map from byte offsets in memory to indices in a load/store, determine if that map corresponds...
This file contains the declarations for the subclasses of Constant, which represent the different fla...
static void createLoadIntrinsic(IntrinsicInst *II, LoadInst *LI, dxil::ResourceTypeInfo &RTI)
static void createStoreIntrinsic(IntrinsicInst *II, StoreInst *SI, dxil::ResourceTypeInfo &RTI)
This file defines the DenseMap class.
static bool isSigned(unsigned Opcode)
#define Check(C,...)
#define op(i)
#define im(i)
const HexagonInstrInfo * TII
IRTranslator LLVM IR MI
Module.h This file contains the declarations for the Module class.
std::pair< Value *, Value * > ShuffleOps
We are building a shuffle to create V, which is a sequence of insertelement, extractelement pairs.
static Value * LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP)
Emit the code to lower ctpop of V before the specified instruction IP.
const AbstractManglingParser< Derived, Alloc >::OperatorInfo AbstractManglingParser< Derived, Alloc >::Ops[]
#define RegName(no)
static LVOptions Options
Definition LVOptions.cpp:25
lazy value info
#define F(x, y, z)
Definition MD5.cpp:54
#define I(x, y, z)
Definition MD5.cpp:57
#define G(x, y, z)
Definition MD5.cpp:55
This file declares the MachineConstantPool class which is an abstract constant pool to keep track of ...
static DebugLoc getDebugLoc(MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI)
Return the first DebugLoc that has line number information, given a range of instructions.
Register Reg
Register const TargetRegisterInfo * TRI
Promote Memory to Register
Definition Mem2Reg.cpp:110
#define T
nvptx lower args
uint64_t High
uint64_t IntrinsicInst * II
PowerPC Reduce CR logical Operation
R600 Clause Merge
const SmallVectorImpl< MachineOperand > & Cond
static cl::opt< RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode > Mode("regalloc-enable-advisor", cl::Hidden, cl::init(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default), cl::desc("Enable regalloc advisor mode"), cl::values(clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Default, "default", "Default"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Release, "release", "precompiled"), clEnumValN(RegAllocEvictionAdvisorAnalysisLegacy::AdvisorMode::Development, "development", "for training")))
SI Lower i1 Copies
This file contains some templates that are useful if you are working with the STL at all.
static bool contains(SmallPtrSetImpl< ConstantExpr * > &Cache, ConstantExpr *Expr, Constant *C)
Definition Value.cpp:484
static cl::opt< unsigned > MaxSteps("has-predecessor-max-steps", cl::Hidden, cl::init(8192), cl::desc("DAG combiner limit number of steps when searching DAG " "for predecessor nodes"))
This file defines the SmallPtrSet class.
This file defines the SmallVector class.
This file defines the 'Statistic' class, which is designed to be an easy way to expose various metric...
#define STATISTIC(VARNAME, DESC)
Definition Statistic.h:171
This file contains some functions that are useful when dealing with strings.
This file implements the StringSwitch template, which mimics a switch() statement whose cases are str...
#define LLVM_DEBUG(...)
Definition Debug.h:119
static TableGen::Emitter::Opt Y("gen-skeleton-entry", EmitSkeleton, "Generate example skeleton entry")
static SymbolRef::Type getType(const Symbol *Sym)
Definition TapiFile.cpp:39
This file describes how to lower LLVM code to machine code.
static X86::CondCode getSwappedCondition(X86::CondCode CC)
Assuming the flags are set by MI(a,b), return the condition code if we modify the instructions such t...
static constexpr int Concat[]
Value * RHS
Value * LHS
BinaryOperator * Mul
static bool isIntrinsic(const CallBase &Call, Intrinsic::ID ID)
The Input class is used to parse a yaml document into in-memory structs and vectors.
static constexpr roundingMode rmTowardZero
Definition APFloat.h:349
LLVM_ABI bool getExactInverse(APFloat *Inv) const
If this value is normal and has an exact, normal, multiplicative inverse, store it in inv and return ...
Definition APFloat.cpp:5843
APInt bitcastToAPInt() const
Definition APFloat.h:1436
opStatus convertToInteger(MutableArrayRef< integerPart > Input, unsigned int Width, bool IsSigned, roundingMode RM, bool *IsExact) const
Definition APFloat.h:1397
Class for arbitrary precision integers.
Definition APInt.h:78
static APInt getAllOnes(unsigned numBits)
Return an APInt of a specified width with all bits set.
Definition APInt.h:235
bool isMinSignedValue() const
Determine if this is the smallest signed value.
Definition APInt.h:424
uint64_t getZExtValue() const
Get zero extended value.
Definition APInt.h:1563
unsigned popcount() const
Count the number of bits set.
Definition APInt.h:1693
LLVM_ABI APInt zextOrTrunc(unsigned width) const
Zero extend or truncate to width.
Definition APInt.cpp:1076
unsigned getActiveBits() const
Compute the number of active bits in the value.
Definition APInt.h:1535
LLVM_ABI APInt trunc(unsigned width) const
Truncate to new width.
Definition APInt.cpp:968
void setBit(unsigned BitPosition)
Set the bit whose position is given as "bitPosition" to 1.
Definition APInt.h:1353
bool sgt(const APInt &RHS) const
Signed greater than comparison.
Definition APInt.h:1208
bool isAllOnes() const
Determine if all bits are set. This is true for zero-width values.
Definition APInt.h:372
unsigned getBitWidth() const
Return the number of bits in the APInt.
Definition APInt.h:1511
bool ult(const APInt &RHS) const
Unsigned less than comparison.
Definition APInt.h:1118
unsigned countr_zero() const
Count the number of trailing zero bits.
Definition APInt.h:1662
unsigned countl_zero() const
The APInt version of std::countl_zero.
Definition APInt.h:1621
static LLVM_ABI APInt getSplat(unsigned NewLen, const APInt &V)
Return a value containing V broadcasted over NewLen bits.
Definition APInt.cpp:652
unsigned logBase2() const
Definition APInt.h:1784
uint64_t getLimitedValue(uint64_t Limit=UINT64_MAX) const
If this value is smaller than the specified limit, return it, otherwise return the limit value.
Definition APInt.h:476
bool isSubsetOf(const APInt &RHS) const
This operation checks that all bits set in this APInt are also set in RHS.
Definition APInt.h:1264
bool isPowerOf2() const
Check if this APInt's value is a power of two greater than zero.
Definition APInt.h:441
static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet)
Constructs an APInt value that has the bottom loBitsSet bits set.
Definition APInt.h:307
static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet)
Constructs an APInt value that has the top hiBitsSet bits set.
Definition APInt.h:297
bool isOne() const
Determine if this is a value of 1.
Definition APInt.h:390
static APInt getOneBitSet(unsigned numBits, unsigned BitNo)
Return an APInt with exactly one bit set in the result.
Definition APInt.h:240
int64_t getSExtValue() const
Get sign extended value.
Definition APInt.h:1585
void lshrInPlace(unsigned ShiftAmt)
Logical right-shift this APInt by ShiftAmt in place.
Definition APInt.h:865
APInt lshr(unsigned shiftAmt) const
Logical right-shift function.
Definition APInt.h:858
unsigned countr_one() const
Count the number of trailing one bits.
Definition APInt.h:1679
bool uge(const APInt &RHS) const
Unsigned greater or equal comparison.
Definition APInt.h:1228
An arbitrary precision integer that knows its signedness.
Definition APSInt.h:24
const ARMBaseRegisterInfo & getRegisterInfo() const
const uint32_t * getSjLjDispatchPreservedMask(const MachineFunction &MF) const
const MCPhysReg * getCalleeSavedRegs(const MachineFunction *MF) const override
Code Generation virtual methods...
Register getFrameRegister(const MachineFunction &MF) const override
const uint32_t * getCallPreservedMask(const MachineFunction &MF, CallingConv::ID) const override
const uint32_t * getTLSCallPreservedMask(const MachineFunction &MF) const
const uint32_t * getThisReturnPreservedMask(const MachineFunction &MF, CallingConv::ID) const
getThisReturnPreservedMask - Returns a call preserved mask specific to the case that 'returned' is on...
static ARMConstantPoolConstant * Create(const Constant *C, unsigned ID)
static ARMConstantPoolMBB * Create(LLVMContext &C, const MachineBasicBlock *mbb, unsigned ID, unsigned char PCAdj)
static ARMConstantPoolSymbol * Create(LLVMContext &C, StringRef s, unsigned ID, unsigned char PCAdj)
ARMConstantPoolValue - ARM specific constantpool value.
ARMFunctionInfo - This class is derived from MachineFunctionInfo and contains private ARM-specific in...
SmallPtrSet< const GlobalVariable *, 2 > & getGlobalsPromotedToConstantPool()
void setArgumentStackToRestore(unsigned v)
void setArgRegsSaveSize(unsigned s)
void setReturnRegsCount(unsigned s)
unsigned getArgRegsSaveSize() const
void markGlobalAsPromotedToConstantPool(const GlobalVariable *GV)
Indicate to the backend that GV has had its storage changed to inside a constant pool.
void setArgumentStackSize(unsigned size)
unsigned getArgumentStackSize() const
const Triple & getTargetTriple() const
const ARMBaseInstrInfo * getInstrInfo() const override
bool isThumb1Only() const
bool useFPVFMx() const
bool isThumb2() const
bool hasBaseDSP() const
const ARMTargetLowering * getTargetLowering() const override
const ARMBaseRegisterInfo * getRegisterInfo() const override
bool hasVFP2Base() const
bool useFPVFMx64() const
bool isLittle() const
bool useFPVFMx16() const
bool isMClass() const
bool useMulOps() const
bool shouldFoldSelectWithIdentityConstant(unsigned BinOpcode, EVT VT, unsigned SelectOpcode, SDValue X, SDValue Y) const override
Return true if pulling a binary operation into a select with an identity constant is profitable.
bool isReadOnly(const GlobalValue *GV) const
unsigned getMaxSupportedInterleaveFactor() const override
Get the maximum supported factor for interleaved memory accesses.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicLoadInIR(LoadInst *LI) const override
Returns how the given (atomic) load should be expanded by the IR-level AtomicExpand pass.
unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const
Returns the number of interleaved accesses that will be generated when lowering accesses of the given...
bool shouldInsertFencesForAtomic(const Instruction *I) const override
Whether AtomicExpandPass should automatically insert fences and reduce ordering for this atomic.
Align getABIAlignmentForCallingConv(Type *ArgTy, const DataLayout &DL) const override
Return the correct alignment for the current calling convention.
bool isDesirableToCommuteWithShift(const SDNode *N, CombineLevel Level) const override
Return true if it is profitable to move this shift by a constant amount through its operand,...
Register getExceptionPointerRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception address on entry to an ...
ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const override
Examine constraint string and operand type and determine a weight value.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS, Instruction *I=nullptr) const override
isLegalAddressingMode - Return true if the addressing mode represented by AM is legal for this target...
const ARMSubtarget * getSubtarget() const
bool isLegalT2ScaledAddressingMode(const AddrMode &AM, EVT VT) const
bool isLegalT1ScaledAddressingMode(const AddrMode &AM, EVT VT) const
Returns true if the addressing mode represented by AM is legal for the Thumb1 target,...
bool getPreIndexedAddressParts(SDNode *N, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPreIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mod...
MachineInstr * EmitKCFICheck(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &MBBI, const TargetInstrInfo *TII) const override
bool shouldAlignPointerArgs(CallInst *CI, unsigned &MinSize, Align &PrefAlign) const override
Return true if the pointer arguments to CI should be aligned by aligning the object whose address is ...
void getTgtMemIntrinsic(SmallVectorImpl< IntrinsicInfo > &Infos, const CallBase &I, MachineFunction &MF, unsigned Intrinsic) const override
getTgtMemIntrinsic - Represent NEON load and store intrinsics as MemIntrinsicNodes.
void ReplaceNodeResults(SDNode *N, SmallVectorImpl< SDValue > &Results, SelectionDAG &DAG) const override
ReplaceNodeResults - Replace the results of node with an illegal result type with new values built ou...
void emitAtomicCmpXchgNoStoreLLBalance(IRBuilderBase &Builder) const override
bool isMulAddWithConstProfitable(SDValue AddNode, SDValue ConstNode) const override
Return true if it may be profitable to transform (mul (add x, c1), c2) -> (add (mul x,...
bool isLegalAddImmediate(int64_t Imm) const override
isLegalAddImmediate - Return true if the specified immediate is legal add immediate,...
EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const override
Returns the target specific optimal type for load and store operations as a result of memset,...
Instruction * emitTrailingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
bool isFNegFree(EVT VT) const override
Return true if an fneg operation is free to the point where it is never worthwhile to replace it with...
void finalizeLowering(MachineFunction &MF) const override
Execute target specific actions to finalize target lowering.
SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const
bool isFPImmLegal(const APFloat &Imm, EVT VT, bool ForCodeSize=false) const override
isFPImmLegal - Returns true if the target can instruction select the specified FP immediate natively.
ConstraintType getConstraintType(StringRef Constraint) const override
getConstraintType - Given a constraint letter, return the type of constraint it is for this target.
bool preferIncOfAddToSubOfNot(EVT VT) const override
These two forms are equivalent: "sub y, (xor x, -1)" and "add (add x, 1), y". The variant with two add's is IR...
void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const override
Determine which of the bits specified in Mask are known to be either zero or one and return them in t...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicStoreInIR(StoreInst *SI) const override
Returns how the given (atomic) store should be expanded by the IR-level AtomicExpand pass.
SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const
PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
bool shouldFoldConstantShiftPairToMask(const SDNode *N) const override
Return true if it is profitable to fold a pair of shifts into a mask.
bool isDesirableToCommuteXorWithShift(const SDNode *N) const override
Return true if it is profitable to combine an XOR of a logical shift to create a logical shift of NOT...
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const
PerformCMOVCombine - Target-specific DAG combining for ARMISD::CMOV.
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicRMWInIR(const AtomicRMWInst *AI) const override
Returns how the IR-level AtomicExpand pass should expand the given AtomicRMW, if at all.
Value * createComplexDeinterleavingIR(IRBuilderBase &B, ComplexDeinterleavingOperation OperationType, ComplexDeinterleavingRotation Rotation, Value *InputA, Value *InputB, Value *Accumulator=nullptr) const override
Create the IR node for the given complex deinterleaving operation.
bool isComplexDeinterleavingSupported() const override
Does this target support complex deinterleaving.
SDValue PerformMVEExtCombine(SDNode *N, DAGCombinerInfo &DCI) const
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering) const override
createFastISel - This method returns a target specific FastISel object, or null if the target does no...
void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const override
Inserts necessary declarations for SSP (stack protection) purpose.
bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth) const override
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context, EVT VT) const override
getSetCCResultType - Return the value type to use for ISD::SETCC.
MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *MBB) const override
This method should be implemented by targets that mark instructions with the 'usesCustomInserter' fla...
Value * emitStoreConditional(IRBuilderBase &Builder, Value *Val, Value *Addr, AtomicOrdering Ord) const override
Perform a store-conditional operation to Addr.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override
This callback is invoked for operations that are unsupported by the target, which are registered to u...
CCAssignFn * CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const
void AdjustInstrPostInstrSelection(MachineInstr &MI, SDNode *Node) const override
This method should be implemented by targets that mark instructions with the 'hasPostISelHook' flag.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const override
Return true if EXTRACT_SUBVECTOR is cheap for this result type with this index.
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override
Return true if it's free to truncate a value of type SrcTy to type DstTy.
bool isShuffleMaskLegal(ArrayRef< int > M, EVT VT) const override
isShuffleMaskLegal - Targets can use this to indicate that they only support some VECTOR_SHUFFLE oper...
TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(const AtomicCmpXchgInst *AI) const override
Returns how the given atomic cmpxchg should be expanded by the IR-level AtomicExpand pass.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override
Returns true if it is beneficial to convert a load of a constant to just the constant itself.
bool lowerInterleavedStore(Instruction *Store, Value *Mask, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved store into a vstN intrinsic.
const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const override
getRegClassFor - Return the register class that should be used for the specified value type.
bool useLoadStackGuardNode(const Module &M) const override
If this function returns true, SelectionDAGBuilder emits a LOAD_STACK_GUARD node when it is lowering ...
bool lowerInterleavedLoad(Instruction *Load, Value *Mask, ArrayRef< ShuffleVectorInst * > Shuffles, ArrayRef< unsigned > Indices, unsigned Factor, const APInt &GapMask) const override
Lower an interleaved load into a vldN intrinsic.
std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const override
Return the largest legal super-reg register class of the register class for the specified type and it...
bool preferSelectsOverBooleanArithmetic(EVT VT) const override
Should we prefer selects to doing arithmetic on boolean types.
bool isZExtFree(SDValue Val, EVT VT2) const override
Return true if zero-extending the specific node Val to type VT2 is free (either because it's implicit...
bool isCheapToSpeculateCttz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic cttz.
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override
This method will be invoked for all target nodes and for any target-independent nodes that the target...
bool isCheapToSpeculateCtlz(Type *Ty) const override
Return true if it is cheap to speculate a call to intrinsic ctlz.
bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, TargetLoweringOpt &TLO) const override
bool hasAndNot(SDValue Y) const override
Return true if the target has a bitwise and-not operation: X = ~A & B. This can be used to simplify se...
ARMTargetLowering(const TargetMachine &TM, const ARMSubtarget &STI)
bool isComplexDeinterleavingOperationSupported(ComplexDeinterleavingOperation Operation, Type *Ty) const override
Does this target support complex deinterleaving with the given operation and type.
bool supportKCFIBundles() const override
Return true if the target supports kcfi operand bundles.
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const
PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override
If a physical register, this returns the register that receives the exception typeid on entry to a la...
Type * shouldConvertSplatType(ShuffleVectorInst *SVI) const override
Given a shuffle vector SVI representing a vector splat, return a new scalar type of size equal to SVI...
Value * emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy, Value *Addr, AtomicOrdering Ord) const override
Perform a load-linked operation on Addr, returning a "Value *" with the corresponding pointee type.
Instruction * makeDMB(IRBuilderBase &Builder, ARM_MB::MemBOpt Domain) const
bool isLegalICmpImmediate(int64_t Imm) const override
isLegalICmpImmediate - Return true if the specified immediate is legal icmp immediate,...
const char * LowerXConstraint(EVT ConstraintVT) const override
Try to replace an X constraint, which matches anything, with another that has more specific requireme...
unsigned getJumpTableEncoding() const override
Return the entry encoding for a jump table in the current function.
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override
Return true if it is profitable for dag combiner to transform a floating point op of specified opcode...
CCAssignFn * CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags, unsigned *Fast) const override
allowsMisalignedMemoryAccesses - Returns true if the target allows unaligned memory accesses of the s...
bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy, Align Alignment, const DataLayout &DL) const
Returns true if VecTy is a legal interleaved access type.
bool isVectorLoadExtDesirable(SDValue ExtVal) const override
Return true if folding a vector load into ExtVal (a sign, zero, or any extend node) is profitable.
bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override
Return true if the target can combine store(extractelement VectorTy,Idx).
bool useSoftFloat() const override
bool alignLoopsWithOptSize() const override
Should loops be aligned even when the function is marked OptSize (but not MinSize).
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const
bool allowTruncateForTailCall(Type *Ty1, Type *Ty2) const override
Return true if a truncation from Ty1 to Ty2 is permitted when deciding whether a call is in tail ...
void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const override
LowerAsmOperandForConstraint - Lower the specified operand into the Ops vector.
bool hasAndNotCompare(SDValue V) const override
Return true if the target should transform: (X & Y) == Y ---> (~X & Y) == 0 (X & Y) !...
std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const override
Given a physical register constraint (e.g.
bool shouldConvertFpToSat(unsigned Op, EVT FPVT, EVT VT) const override
Should we generate fp_to_si_sat and fp_to_ui_sat from type FPVT to type VT from min(max(fptoi)) satur...
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty, CallingConv::ID CallConv, bool isVarArg, const DataLayout &DL) const override
Returns true if an argument of type Ty needs to be passed in a contiguous block of registers in calli...
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override
Return true if folding a constant offset with the given GlobalAddress is legal.
const ARMBaseTargetMachine & getTM() const
bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override
Return if the target supports combining a chain like:
ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const override
bool getPostIndexedAddressParts(SDNode *N, SDNode *Op, SDValue &Base, SDValue &Offset, ISD::MemIndexedMode &AM, SelectionDAG &DAG) const override
getPostIndexedAddressParts - returns true by value, base pointer and offset pointer and addressing mo...
Instruction * emitLeadingFence(IRBuilderBase &Builder, Instruction *Inst, AtomicOrdering Ord) const override
Inserts in the IR a target-specific intrinsic specifying a fence.
bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const override
Return true if Op can create undef or poison from non-undef & non-poison operands.
Represent a constant reference to an array (0 or more elements consecutively in memory),...
Definition ArrayRef.h:40
size_t size() const
Get the array size.
Definition ArrayRef.h:141
bool empty() const
Check if the array is empty.
Definition ArrayRef.h:136
An instruction that atomically checks whether a specified value is in a memory location,...
An instruction that atomically reads a memory location, combines it with another value,...
bool isFloatingPointOperation() const
static LLVM_ABI Attribute get(LLVMContext &Context, AttrKind Kind, uint64_t Val=0)
Return a uniquified Attribute object.
static LLVM_ABI BaseIndexOffset match(const SDNode *N, const SelectionDAG &DAG)
Parses tree in N for base, index, offset addresses.
LLVM Basic Block Representation.
Definition BasicBlock.h:62
The address of a basic block.
Definition Constants.h:1088
static BranchProbability getZero()
A "pseudo-class" with methods for operating on BUILD_VECTORs.
LLVM_ABI bool isConstantSplat(APInt &SplatValue, APInt &SplatUndef, unsigned &SplatBitSize, bool &HasAnyUndefs, unsigned MinSplatBits=0, bool isBigEndian=false) const
Check if this is a constant splat, and if so, find the smallest element size that splats the vector.
LLVM_ABI int32_t getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements, uint32_t BitWidth) const
If this is a constant FP splat and the splatted constant FP is an exact power of 2,...
CCState - This class holds information needed while lowering arguments and return values.
void getInRegsParamInfo(unsigned InRegsParamRecordIndex, unsigned &BeginReg, unsigned &EndReg) const
unsigned getFirstUnallocated(ArrayRef< MCPhysReg > Regs) const
getFirstUnallocated - Return the index of the first unallocated register in the set,...
static LLVM_ABI bool resultsCompatible(CallingConv::ID CalleeCC, CallingConv::ID CallerCC, MachineFunction &MF, LLVMContext &C, const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn CalleeFn, CCAssignFn CallerFn)
Returns true if the results of the two calling conventions are compatible.
MCRegister AllocateReg(MCPhysReg Reg)
AllocateReg - Attempt to allocate one register.
LLVM_ABI bool CheckReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
CheckReturn - Analyze the return values of a function, returning true if the return can be performed ...
LLVM_ABI void AnalyzeReturn(const SmallVectorImpl< ISD::OutputArg > &Outs, CCAssignFn Fn)
AnalyzeReturn - Analyze the returned values of a return, incorporating info about the result values i...
unsigned getInRegsParamsProcessed() const
uint64_t getStackSize() const
Returns the size of the currently allocated portion of the stack.
void addInRegsParamInfo(unsigned RegBegin, unsigned RegEnd)
LLVM_ABI void AnalyzeFormalArguments(const SmallVectorImpl< ISD::InputArg > &Ins, CCAssignFn Fn)
AnalyzeFormalArguments - Analyze an array of argument values, incorporating info about the formals in...
unsigned getInRegsParamsCount() const
CCValAssign - Represent assignment of one arg/retval to a location.
Register getLocReg() const
LocInfo getLocInfo() const
bool needsCustom() const
int64_t getLocMemOffset() const
unsigned getValNo() const
Base class for all callable instructions (InvokeInst and CallInst). Holds everything related to callin...
LLVM_ABI bool isMustTailCall() const
Tests if this call site must be tail call optimized.
AttributeList getAttributes() const
Return the attributes for this call.
void addParamAttr(unsigned ArgNo, Attribute::AttrKind Kind)
Adds the attribute to the indicated argument.
This class represents a function call, abstracting a target machine's calling convention.
bool isTailCall() const
static Constant * get(LLVMContext &Context, ArrayRef< ElementTy > Elts)
get() constructor - Return a constant with array type with an element count and element type matching...
Definition Constants.h:878
const APFloat & getValueAPF() const
ConstantFP - Floating Point Values [float, double].
Definition Constants.h:420
This is the shared class of boolean and integer constants.
Definition Constants.h:87
uint64_t getZExtValue() const
Return the constant as a 64-bit unsigned integer value after it has been zero extended as appropriate...
Definition Constants.h:168
MachineConstantPoolValue * getMachineCPVal() const
const Constant * getConstVal() const
LLVM_ABI Type * getType() const
uint64_t getZExtValue() const
const APInt & getAPIntValue() const
int64_t getSExtValue() const
This is an important base class in LLVM.
Definition Constant.h:43
A parsed version of the target data layout string, and methods for querying it.
Definition DataLayout.h:64
bool isLittleEndian() const
Layout endianness...
Definition DataLayout.h:217
bool isBigEndian() const
Definition DataLayout.h:218
MaybeAlign getStackAlignment() const
Returns the natural stack alignment, or MaybeAlign() if one wasn't specified.
Definition DataLayout.h:250
LLVM_ABI TypeSize getTypeAllocSize(Type *Ty) const
Returns the offset in bytes between successive objects of the specified type, including alignment pad...
StringRef getInternalSymbolPrefix() const
Definition DataLayout.h:308
LLVM_ABI Align getPreferredAlign(const GlobalVariable *GV) const
Returns the preferred alignment of the specified global.
LLVM_ABI Align getPrefTypeAlign(Type *Ty) const
Returns the preferred stack/global alignment for the specified type.
A debug info location.
Definition DebugLoc.h:126
iterator find(const_arg_type_t< KeyT > Val)
Definition DenseMap.h:225
unsigned size() const
Definition DenseMap.h:174
bool empty() const
Definition DenseMap.h:173
iterator begin()
Definition DenseMap.h:139
iterator end()
Definition DenseMap.h:143
This is a fast-path instruction selection class that generates poor code and doesn't support illegal ...
Definition FastISel.h:66
Class to represent fixed width SIMD vectors.
unsigned getNumElements() const
static LLVM_ABI FixedVectorType * get(Type *ElementType, unsigned NumElts)
Definition Type.cpp:867
A handy container for a FunctionType+Callee-pointer pair, which can be passed around as a single enti...
FunctionLoweringInfo - This contains information that is global to a function that is used when lower...
Type * getParamType(unsigned i) const
Parameter type accessors.
FunctionType * getFunctionType() const
Returns the FunctionType for me.
Definition Function.h:211
bool hasMinSize() const
Optimize this function for minimum size (-Oz).
Definition Function.h:685
CallingConv::ID getCallingConv() const
getCallingConv()/setCallingConv(CC) - These methods get and set the calling convention of this functio...
Definition Function.h:272
arg_iterator arg_begin()
Definition Function.h:842
LLVMContext & getContext() const
getContext - Return a reference to the LLVMContext associated with this function.
Definition Function.cpp:353
bool hasStructRetAttr() const
Determine if the function returns a structure through first or second pointer argument.
Definition Function.h:669
const Argument * const_arg_iterator
Definition Function.h:74
bool isVarArg() const
isVarArg - Return true if this function takes a variable number of arguments.
Definition Function.h:229
bool hasFnAttribute(Attribute::AttrKind Kind) const
Return true if the function has the attribute.
Definition Function.cpp:723
const GlobalValue * getGlobal() const
bool isDSOLocal() const
bool hasExternalWeakLinkage() const
bool hasDLLImportStorageClass() const
Module * getParent()
Get the module that this global value is contained inside of...
bool isStrongDefinitionForLinker() const
Returns true if this global's definition will be the one chosen by the linker.
static bool isWeakForLinker(LinkageTypes Linkage)
Whether the definition of this global may be replaced at link time.
@ InternalLinkage
Rename collisions when linking (static functions).
Definition GlobalValue.h:60
Common base class shared among various IRBuilders.
Definition IRBuilder.h:114
This provides a uniform API for creating instructions and inserting them into a basic block: either a...
Definition IRBuilder.h:2848
LLVM_ABI bool hasAtomicStore() const LLVM_READONLY
Return true if this atomic instruction stores to memory.
This is an important class for using LLVM in a threaded context.
Definition LLVMContext.h:68
LLVM_ABI void diagnose(const DiagnosticInfo &DI)
Report a message to the currently installed diagnostic handler.
bool isUnindexed() const
Return true if this is NOT a pre/post inc/dec load/store.
bool isIndexed() const
Return true if this is a pre/post inc/dec load/store.
Tracks which library functions to use for a particular subtarget.
CallingConv::ID getLibcallImplCallingConv(RTLIB::LibcallImpl Call) const
Get the CallingConv that should be used for the specified libcall.
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Return the lowering's selection of implementation call for Call.
An instruction for reading from memory.
This class is used to represent ISD::LOAD nodes.
const SDValue & getBasePtr() const
Describe properties that are true of each instruction in the target description file.
Machine Value Type.
static MVT getFloatingPointVT(unsigned BitWidth)
static auto integer_fixedlen_vector_valuetypes()
SimpleValueType SimpleTy
uint64_t getScalarSizeInBits() const
unsigned getVectorNumElements() const
bool isInteger() const
Return true if this is an integer or a vector integer type.
static LLVM_ABI MVT getVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
static auto integer_valuetypes()
TypeSize getSizeInBits() const
Returns the size of the specified MVT in bits.
static auto fixedlen_vector_valuetypes()
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
bool isScalarInteger() const
Return true if this is an integer, not including vectors.
static MVT getVectorVT(MVT VT, unsigned NumElements)
MVT getVectorElementType() const
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
static MVT getIntegerVT(unsigned BitWidth)
static auto fp_valuetypes()
bool is64BitVector() const
Return true if this is a 64-bit vector type.
LLVM_ABI void transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB)
Transfers all the successors, as in transferSuccessors, and update PHI operands in the successor bloc...
bool isEHPad() const
Returns true if the block is a landing pad.
LLVM_ABI MachineBasicBlock * getFallThrough(bool JumpToFallThrough=true)
Return the fallthrough block if the block can implicitly transfer control to the block after it by fa...
void setCallFrameSize(unsigned N)
Set the call frame size on entry to this basic block.
const BasicBlock * getBasicBlock() const
Return the LLVM basic block that this instance corresponded to originally.
LLVM_ABI bool canFallThrough()
Return true if the block can implicitly transfer control to the block after it by falling off the end...
LLVM_ABI void addSuccessor(MachineBasicBlock *Succ, BranchProbability Prob=BranchProbability::getUnknown())
Add Succ as a successor of this MachineBasicBlock.
Instructions::iterator instr_iterator
MachineInstrBundleIterator< MachineInstr, true > reverse_iterator
LLVM_ABI MachineBasicBlock * splitAt(MachineInstr &SplitInst, bool UpdateLiveIns=true, LiveIntervals *LIS=nullptr)
Split a basic block into 2 pieces at SplitInst.
void addLiveIn(MCRegister PhysReg, LaneBitmask LaneMask=LaneBitmask::getAll())
Adds the specified register as a live in.
const MachineFunction * getParent() const
Return the MachineFunction containing this basic block.
LLVM_ABI instr_iterator erase(instr_iterator I)
Remove an instruction from the instruction list and delete it.
iterator_range< succ_iterator > successors()
iterator_range< pred_iterator > predecessors()
void splice(iterator Where, MachineBasicBlock *Other, iterator From)
Take an instruction from MBB 'Other' at the position From, and insert it into this MBB right before '...
MachineInstrBundleIterator< MachineInstr > iterator
LLVM_ABI void moveAfter(MachineBasicBlock *NewBefore)
LLVM_ABI bool isLiveIn(MCRegister Reg, LaneBitmask LaneMask=LaneBitmask::getAll()) const
Return true if the specified register is in the live in set.
void setIsEHPad(bool V=true)
Indicates the block is a landing pad.
The MachineConstantPool class keeps track of constants referenced by a function which must be spilled...
LLVM_ABI unsigned getConstantPoolIndex(const Constant *C, Align Alignment)
getConstantPoolIndex - Create a new entry in the constant pool or return an existing one.
LLVM_ABI int CreateFixedObject(uint64_t Size, int64_t SPOffset, bool IsImmutable, bool isAliased=false)
Create a new object at a fixed location on the stack.
LLVM_ABI void computeMaxCallFrameSize(MachineFunction &MF, std::vector< MachineBasicBlock::iterator > *FrameSDOps=nullptr)
Computes the maximum size of a callframe.
void setFrameAddressIsTaken(bool T)
void setHasTailCall(bool V=true)
void setReturnAddressIsTaken(bool s)
int64_t getObjectSize(int ObjectIdx) const
Return the size of the specified object.
bool hasVAStart() const
Returns true if the function calls the llvm.va_start intrinsic.
int64_t getObjectOffset(int ObjectIdx) const
Return the assigned stack offset of the specified object from the incoming stack pointer.
bool isFixedObjectIndex(int ObjectIdx) const
Returns true if the specified index corresponds to a fixed stack object.
int getFunctionContextIndex() const
Return the index for the function context object.
Properties which a MachineFunction may have at a given point in time.
unsigned getFunctionNumber() const
getFunctionNumber - Return a unique ID for the current function.
MachineMemOperand * getMachineMemOperand(MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, LLT MemTy, Align base_alignment, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr, SyncScope::ID SSID=SyncScope::System, AtomicOrdering Ordering=AtomicOrdering::NotAtomic, AtomicOrdering FailureOrdering=AtomicOrdering::NotAtomic)
getMachineMemOperand - Allocate a new MachineMemOperand.
MachineFrameInfo & getFrameInfo()
getFrameInfo - Return the frame info object for the current function.
void push_back(MachineBasicBlock *MBB)
MachineRegisterInfo & getRegInfo()
getRegInfo - Return information about the registers currently in use.
const DataLayout & getDataLayout() const
Return the DataLayout attached to the Module associated to this MF.
Function & getFunction()
Return the LLVM function that this machine code represents.
BasicBlockListType::iterator iterator
Ty * getInfo()
getInfo - Keep track of various per-function pieces of information for backends that would like to do...
MachineConstantPool * getConstantPool()
getConstantPool - Return the constant pool object for the current function.
const MachineFunctionProperties & getProperties() const
Get the function properties.
Register addLiveIn(MCRegister PReg, const TargetRegisterClass *RC)
addLiveIn - Add the specified physical register as a live-in value and create a corresponding virtual...
MachineBasicBlock * CreateMachineBasicBlock(const BasicBlock *BB=nullptr, std::optional< UniqueBBID > BBID=std::nullopt)
CreateMachineBasicBlock - Allocate a new MachineBasicBlock.
void insert(iterator MBBI, MachineBasicBlock *MBB)
const TargetMachine & getTarget() const
getTarget - Return the target machine this machine code is compiled with
const MachineInstrBuilder & addExternalSymbol(const char *FnName, unsigned TargetFlags=0) const
const MachineInstrBuilder & addUse(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register use operand.
const MachineInstrBuilder & addReg(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a new virtual register operand.
const MachineInstrBuilder & addImm(int64_t Val) const
Add a new immediate operand.
const MachineInstrBuilder & add(const MachineOperand &MO) const
const MachineInstrBuilder & addFrameIndex(int Idx) const
const MachineInstrBuilder & addConstantPoolIndex(unsigned Idx, int Offset=0, unsigned TargetFlags=0) const
const MachineInstrBuilder & addRegMask(const uint32_t *Mask) const
const MachineInstrBuilder & addJumpTableIndex(unsigned Idx, unsigned TargetFlags=0) const
const MachineInstrBuilder & addMBB(MachineBasicBlock *MBB, unsigned TargetFlags=0) const
const MachineInstrBuilder & addDef(Register RegNo, RegState Flags={}, unsigned SubReg=0) const
Add a virtual register definition operand.
const MachineInstrBuilder & cloneMemRefs(const MachineInstr &OtherMI) const
const MachineInstrBuilder & setMIFlags(unsigned Flags) const
const MachineInstrBuilder & addMemOperand(MachineMemOperand *MMO) const
MachineInstr * getInstr() const
If conversion operators fail, use this method to get the MachineInstr explicitly.
Representation of each machine instruction.
bool readsRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr reads the specified register.
bool definesRegister(Register Reg, const TargetRegisterInfo *TRI) const
Return true if the MachineInstr fully defines the specified register.
MachineOperand * mop_iterator
iterator/begin/end - Iterate over all operands of a machine instruction.
const MachineOperand & getOperand(unsigned i) const
LLVM_ABI unsigned createJumpTableIndex(const std::vector< MachineBasicBlock * > &DestBBs)
createJumpTableIndex - Create a new jump table.
@ EK_Inline
EK_Inline - Jump table entries are emitted inline at their point of use.
@ EK_BlockAddress
EK_BlockAddress - Each entry is a plain address of block, e.g.: .word LBB123.
A description of a memory reference used in the backend.
Flags
Flags values. These may be or'd together.
@ MOVolatile
The memory access is volatile.
@ MODereferenceable
The memory access is dereferenceable (i.e., doesn't trap).
@ MOLoad
The memory access reads data.
@ MONonTemporal
The memory access is non-temporal.
@ MOInvariant
The memory access always returns the same value (or traps).
@ MOStore
The memory access writes data.
Flags getFlags() const
Return the raw flags of the source value.
MachineOperand class - Representation of each machine instruction operand.
LLVM_ABI void setIsRenamable(bool Val=true)
bool isReg() const
isReg - Tests if this is a MO_Register operand.
LLVM_ABI void setReg(Register Reg)
Change the register this operand corresponds to.
static MachineOperand CreateImm(int64_t Val)
Register getReg() const
getReg - Returns the register number.
LLVM_ABI void setIsDef(bool Val=true)
Change a def to a use, or a use to a def.
static MachineOperand CreateReg(Register Reg, bool isDef, bool isImp=false, bool isKill=false, bool isDead=false, bool isUndef=false, bool isEarlyClobber=false, unsigned SubReg=0, bool isDebug=false, bool isInternalRead=false, bool isRenamable=false)
MachineRegisterInfo - Keep track of information for virtual and physical registers,...
LLVM_ABI Register createVirtualRegister(const TargetRegisterClass *RegClass, StringRef Name="")
createVirtualRegister - Create and return a new virtual register in the function with the specified r...
This class is used to represent an MLOAD node.
This class is used to represent an MSTORE node.
This SDNode is used for target intrinsics that touch memory and need an associated MachineMemOperand.
This is an abstract virtual class for memory operations.
Align getBaseAlign() const
Returns alignment and volatility of the memory access.
Align getAlign() const
bool isVolatile() const
AAMDNodes getAAInfo() const
Returns the AA info that describes the dereference.
bool isSimple() const
Returns true if the memory operation is neither atomic nor volatile.
MachineMemOperand * getMemOperand() const
Return the unique MachineMemOperand object describing the memory reference performed by operation.
const MachinePointerInfo & getPointerInfo() const
const SDValue & getChain() const
EVT getMemoryVT() const
Return the type of the in-memory value.
A Module instance is used to store all the information related to an LLVM module.
Definition Module.h:67
static PointerType * getUnqual(Type *ElementType)
This constructs a pointer to an object of the specified type in the default address space (address sp...
Wrapper class representing virtual and physical registers.
Definition Register.h:20
Wrapper class for IR location info (IR ordering and DebugLoc) to be passed into SDNode creation funct...
const DebugLoc & getDebugLoc() const
Represents one node in the SelectionDAG.
unsigned getOpcode() const
Return the SelectionDAG opcode value for this node.
bool hasOneUse() const
Return true if there is exactly one use of this node.
LLVM_ABI bool isOnlyUserOf(const SDNode *N) const
Return true if this node is the only use of N.
iterator_range< use_iterator > uses()
SDNodeFlags getFlags() const
static bool hasPredecessorHelper(const SDNode *N, SmallPtrSetImpl< const SDNode * > &Visited, SmallVectorImpl< const SDNode * > &Worklist, unsigned int MaxSteps=0, bool TopologicalPrune=false)
Returns true if N is a predecessor of any node in Worklist.
uint64_t getAsZExtVal() const
Helper method returns the zero-extended integer value of a ConstantSDNode.
bool use_empty() const
Return true if there are no uses of this node.
unsigned getNumValues() const
Return the number of values defined/returned by this operator.
unsigned getNumOperands() const
Return the number of values used by this operation.
const SDValue & getOperand(unsigned Num) const
uint64_t getConstantOperandVal(unsigned Num) const
Helper method returns the integer value of a ConstantSDNode operand.
const APInt & getConstantOperandAPInt(unsigned Num) const
Helper method returns the APInt of a ConstantSDNode operand.
bool isPredecessorOf(const SDNode *N) const
Return true if this node is a predecessor of N.
LLVM_ABI bool hasAnyUseOfValue(unsigned Value) const
Return true if there is any use of the indicated value.
EVT getValueType(unsigned ResNo) const
Return the type of a specified result.
void setCFIType(uint32_t Type)
bool isUndef() const
Returns true if the node type is UNDEF or POISON.
iterator_range< user_iterator > users()
void setFlags(SDNodeFlags NewFlags)
user_iterator user_begin() const
Provide iteration support to walk over all users of an SDNode.
Represents a use of a SDNode.
Unlike LLVM values, Selection DAG nodes may return multiple values as the result of a computation.
bool isUndef() const
SDNode * getNode() const
get the SDNode which holds the desired result
bool hasOneUse() const
Return true if there is exactly one node using value ResNo of Node.
SDValue getValue(unsigned R) const
EVT getValueType() const
Return the ValueType of the referenced return value.
const SDValue & getOperand(unsigned i) const
const APInt & getConstantOperandAPInt(unsigned i) const
uint64_t getScalarValueSizeInBits() const
unsigned getResNo() const
get the index which selects a specific result in the SDNode
uint64_t getConstantOperandVal(unsigned i) const
unsigned getOpcode() const
unsigned getNumOperands() const
This is used to represent a portion of an LLVM function in a low-level Data Dependence DAG representa...
LLVM_ABI SDValue getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl, EVT VT, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, EVT MemVT, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
SDValue getTargetGlobalAddress(const GlobalValue *GV, const SDLoc &DL, EVT VT, int64_t offset=0, unsigned TargetFlags=0)
LLVM_ABI SDValue getStackArgumentTokenFactor(SDValue Chain)
Compute a TokenFactor to force all the incoming stack arguments to be loaded from the stack.
const TargetSubtargetInfo & getSubtarget() const
SDValue getCopyToReg(SDValue Chain, const SDLoc &dl, Register Reg, SDValue N)
LLVM_ABI SDValue getMergeValues(ArrayRef< SDValue > Ops, const SDLoc &dl)
Create a MERGE_VALUES node from the given operands.
LLVM_ABI SDVTList getVTList(EVT VT)
Return an SDVTList that represents the list of values specified.
LLVM_ABI SDValue getSplatValue(SDValue V, bool LegalTypes=false)
If V is a splat vector, return its scalar source operand by extracting that element from the source v...
LLVM_ABI SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget=false, bool IsOpaque=false)
LLVM_ABI MachineSDNode * getMachineNode(unsigned Opcode, const SDLoc &dl, EVT VT)
These are used for target selectors to create a new node with specified return type(s),...
LLVM_ABI SDNode * getNodeIfExists(unsigned Opcode, SDVTList VTList, ArrayRef< SDValue > Ops, const SDNodeFlags Flags, bool AllowCommute=false)
Get the specified node if it's already available, or else return NULL.
LLVM_ABI SDValue UnrollVectorOp(SDNode *N, unsigned ResNE=0)
Utility function used by legalize and lowering to "unroll" a vector operation by splitting out the sc...
LLVM_ABI bool haveNoCommonBitsSet(SDValue A, SDValue B) const
Return true if A and B have no common bits set.
LLVM_ABI SDValue getRegister(Register Reg, EVT VT)
LLVM_ABI SDValue getLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Ptr, MachinePointerInfo PtrInfo, MaybeAlign Alignment=MaybeAlign(), MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes(), const MDNode *Ranges=nullptr)
Loads are not normal binary operators: their result type is not determined by their operands,...
LLVM_ABI SDValue getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef< SDValue > Ops, EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags Flags=MachineMemOperand::MOLoad|MachineMemOperand::MOStore, LocationSize Size=LocationSize::precise(0), const AAMDNodes &AAInfo=AAMDNodes())
Creates a MemIntrinsicNode that may produce a result and takes a list of operands.
void addNoMergeSiteInfo(const SDNode *Node, bool NoMerge)
Set NoMergeSiteInfo to be associated with Node if NoMerge is true.
std::pair< SDValue, SDValue > SplitVectorOperand(const SDNode *N, unsigned OpNo)
Split the node's operand with EXTRACT_SUBVECTOR and return the low/high part.
LLVM_ABI SDValue getNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a bitwise NOT operation as (XOR Val, -1).
const TargetLowering & getTargetLoweringInfo() const
SDValue getTargetJumpTable(int JTI, EVT VT, unsigned TargetFlags=0)
SDValue getUNDEF(EVT VT)
Return an UNDEF node. UNDEF does not have a useful SDLoc.
SDValue getCALLSEQ_END(SDValue Chain, SDValue Op1, SDValue Op2, SDValue InGlue, const SDLoc &DL)
Return a new CALLSEQ_END node, which always must have a glue result (to ensure it's not CSE'd).
SDValue getBuildVector(EVT VT, const SDLoc &DL, ArrayRef< SDValue > Ops)
Return an ISD::BUILD_VECTOR node.
LLVM_ABI SDValue getBitcast(EVT VT, SDValue V)
Return a bitcast using the SDLoc of the value operand, and casting to the provided type.
SDValue getCopyFromReg(SDValue Chain, const SDLoc &dl, Register Reg, EVT VT)
LLVM_ABI SDValue getNegative(SDValue Val, const SDLoc &DL, EVT VT)
Create negative operation as (SUB 0, Val).
LLVM_ABI void setNodeMemRefs(MachineSDNode *N, ArrayRef< MachineMemOperand * > NewMemRefs)
Mutate the specified machine node's memory references to the provided list.
LLVM_ABI SDValue getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT)
Return the expression required to zero extend the Op value assuming it was the smaller SrcTy value.
const DataLayout & getDataLayout() const
LLVM_ABI SDValue getConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
Create a ConstantSDNode wrapping a constant value.
SDValue getSignedTargetConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI SDValue getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, EVT SVT, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
LLVM_ABI void ReplaceAllUsesWith(SDValue From, SDValue To)
Modify anything using 'From' to use 'To' instead.
LLVM_ABI std::pair< SDValue, SDValue > SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the vector with EXTRACT_SUBVECTOR using the provided VTs and return the low/high part.
LLVM_ABI SDValue getStore(SDValue Chain, const SDLoc &dl, SDValue Val, SDValue Ptr, MachinePointerInfo PtrInfo, Align Alignment, MachineMemOperand::Flags MMOFlags=MachineMemOperand::MONone, const AAMDNodes &AAInfo=AAMDNodes())
Helper function to build ISD::STORE nodes.
LLVM_ABI SDValue getSignedConstant(int64_t Val, const SDLoc &DL, EVT VT, bool isTarget=false, bool isOpaque=false)
LLVM_ABI MaybeAlign InferPtrAlign(SDValue Ptr) const
Infer alignment of a load / store address.
SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, const SDLoc &DL)
Return a new CALLSEQ_START node, that starts a new call frame, in which InSize bytes are set up inside ...
LLVM_ABI SDValue getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT, SDValue Operand)
A convenience function for creating TargetInstrInfo::EXTRACT_SUBREG nodes.
SDValue getSelectCC(const SDLoc &DL, SDValue LHS, SDValue RHS, SDValue True, SDValue False, ISD::CondCode Cond, SDNodeFlags Flags=SDNodeFlags())
Helper function to make it easier to build SelectCC's if you just have an ISD::CondCode instead of an...
LLVM_ABI SDValue getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either sign-extending or trunca...
LLVM_ABI bool isKnownNeverZero(SDValue Op, unsigned Depth=0) const
Test whether the given SDValue is known to contain non-zero value(s).
LLVM_ABI SDValue getExternalSymbol(const char *Sym, EVT VT)
const TargetMachine & getTarget() const
const LibcallLoweringInfo & getLibcalls() const
LLVM_ABI SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI SDValue getValueType(EVT)
LLVM_ABI SDValue getNode(unsigned Opcode, const SDLoc &DL, EVT VT, ArrayRef< SDUse > Ops)
Gets or creates the specified node.
LLVM_ABI OverflowKind computeOverflowForSignedAdd(SDValue N0, SDValue N1) const
Determine if the result of the signed addition of 2 nodes can overflow.
SDValue getTargetConstant(uint64_t Val, const SDLoc &DL, EVT VT, bool isOpaque=false)
LLVM_ABI unsigned ComputeNumSignBits(SDValue Op, unsigned Depth=0) const
Return the number of times the sign bit of the register is replicated into the other bits.
LLVM_ABI SDValue getVectorIdxConstant(uint64_t Val, const SDLoc &DL, bool isTarget=false)
LLVM_ABI void ReplaceAllUsesOfValueWith(SDValue From, SDValue To)
Replace any uses of From with To, leaving uses of other values produced by From.getNode() alone.
MachineFunction & getMachineFunction() const
LLVM_ABI SDValue getFrameIndex(int FI, EVT VT, bool isTarget=false)
LLVM_ABI KnownBits computeKnownBits(SDValue Op, unsigned Depth=0) const
Determine which bits of Op are known to be either zero or one and return them in Known.
LLVM_ABI SDValue getRegisterMask(const uint32_t *RegMask)
LLVM_ABI SDValue getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT)
Convert Op, which must be of integer type, to the integer type VT, by either zero-extending or trunca...
LLVM_ABI SDValue getCondCode(ISD::CondCode Cond)
void addCallSiteInfo(const SDNode *Node, CallSiteInfo &&CallInfo)
Set CallSiteInfo to be associated with Node.
LLVM_ABI bool MaskedValueIsZero(SDValue Op, const APInt &Mask, unsigned Depth=0) const
Return true if 'Op & Mask' is known to be zero.
SDValue getObjectPtrOffset(const SDLoc &SL, SDValue Ptr, TypeSize Offset)
Create an add instruction with appropriate flags when used for addressing some offset of an object.
LLVMContext * getContext() const
LLVM_ABI SDValue getTargetExternalSymbol(const char *Sym, EVT VT, unsigned TargetFlags=0)
LLVM_ABI SDValue CreateStackTemporary(TypeSize Bytes, Align Alignment)
Create a stack temporary based on the size in bytes and the alignment.
SDValue getTargetConstantPool(const Constant *C, EVT VT, MaybeAlign Align=std::nullopt, int Offset=0, unsigned TargetFlags=0)
SDValue getEntryNode() const
Return the token chain corresponding to the entry of the function.
LLVM_ABI SDValue getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain, SDValue Base, SDValue Offset, SDValue Mask, SDValue Src0, EVT MemVT, MachineMemOperand *MMO, ISD::MemIndexedMode AM, ISD::LoadExtType, bool IsExpanding=false)
DenormalMode getDenormalMode(EVT VT) const
Return the current function's default denormal handling kind for the given floating point type.
LLVM_ABI std::pair< SDValue, SDValue > SplitScalar(const SDValue &N, const SDLoc &DL, const EVT &LoVT, const EVT &HiVT)
Split the scalar node with EXTRACT_ELEMENT using the provided VTs and return the low/high part.
LLVM_ABI SDValue getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1, SDValue N2, ArrayRef< int > Mask)
Return an ISD::VECTOR_SHUFFLE node.
LLVM_ABI SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT)
Create a logical NOT operation as (XOR Val, BooleanOne).
This instruction constructs a fixed permutation of two input vectors.
VectorType * getType() const
Overload to return most specific vector type.
static LLVM_ABI void getShuffleMask(const Constant *Mask, SmallVectorImpl< int > &Result)
Convert the input shuffle mask operand to a vector of integers.
static LLVM_ABI bool isIdentityMask(ArrayRef< int > Mask, int NumSrcElts)
Return true if this shuffle mask chooses elements from exactly one source vector without lane crossin...
This SDNode is used to implement the code generator support for the llvm IR shufflevector instruction...
int getMaskElt(unsigned Idx) const
ArrayRef< int > getMask() const
static LLVM_ABI bool isSplatMask(ArrayRef< int > Mask)
void insert_range(Range &&R)
std::pair< iterator, bool > insert(PtrType Ptr)
Inserts Ptr if and only if there is no element in the container equal to Ptr.
SmallPtrSet - This class implements a set which is optimized for holding SmallSize or less elements.
bool empty() const
Definition SmallSet.h:169
bool erase(const T &V)
Definition SmallSet.h:200
std::pair< const_iterator, bool > insert(const T &V)
insert - Insert an element into the set if it isn't already there.
Definition SmallSet.h:184
This class consists of common code factored out of the SmallVector class to reduce code duplication b...
iterator insert(iterator I, T &&Elt)
void resize(size_type N)
void push_back(const T &Elt)
This is a 'vector' (really, a variable-sized array), optimized for the case when the array is small.
An instruction for storing to memory.
This class is used to represent ISD::STORE nodes.
const SDValue & getBasePtr() const
const SDValue & getValue() const
bool isTruncatingStore() const
Return true if the op does a truncation before store.
Represent a constant reference to a string, i.e.
Definition StringRef.h:56
const unsigned char * bytes_end() const
Definition StringRef.h:125
constexpr size_t size() const
Get the string size.
Definition StringRef.h:144
constexpr const char * data() const
Get a pointer to the start of the string (which may not be null terminated).
Definition StringRef.h:138
const unsigned char * bytes_begin() const
Definition StringRef.h:122
static LLVM_ABI StructType * get(LLVMContext &Context, ArrayRef< Type * > Elements, bool isPacked=false)
This static method is the primary way to create a literal StructType.
Definition Type.cpp:477
TargetInstrInfo - Interface to description of machine instruction set.
Provides information about what library functions are available for the current target.
bool isOperationExpand(unsigned Op, EVT VT) const
Return true if the specified operation is illegal on this target or unlikely to be made legal with cu...
void setBooleanVectorContents(BooleanContent Ty)
Specify how the target extends the result of a vector boolean value from a vector of i1 to a wider ty...
void setOperationAction(unsigned Op, MVT VT, LegalizeAction Action)
Indicate that the specified operation does not work with the specified type and indicate what to do a...
virtual void finalizeLowering(MachineFunction &MF) const
Execute target specific actions to finalize target lowering.
void setMaxDivRemBitWidthSupported(unsigned SizeInBits)
Set the size in bits of the maximum div/rem the backend supports.
bool PredictableSelectIsExpensive
Tells the code generator that select is more expensive than a branch if the branch is usually predict...
EVT getValueType(const DataLayout &DL, Type *Ty, bool AllowUnknown=false) const
Return the EVT corresponding to this LLVM type.
unsigned MaxStoresPerMemcpyOptSize
Likewise for functions with the OptSize attribute.
virtual const TargetRegisterClass * getRegClassFor(MVT VT, bool isDivergent=false) const
Return the register class that should be used for the specified value type.
ShiftLegalizationStrategy
Return the preferred strategy to legalize this SHIFT instruction, with ExpansionFactor being the recu...
void setMinStackArgumentAlignment(Align Alignment)
Set the minimum stack alignment of an argument.
const TargetMachine & getTargetMachine() const
virtual void insertSSPDeclarations(Module &M, const LibcallLoweringInfo &Libcalls) const
Inserts necessary declarations for SSP (stack protection) purpose.
void setIndexedMaskedLoadAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked load does or does not work with the specified type and ind...
void setIndexedLoadAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed load does or does not work with the specified type and indicate w...
void setPrefLoopAlignment(Align Alignment)
Set the target's preferred loop alignment.
void setMaxAtomicSizeInBitsSupported(unsigned SizeInBits)
Set the maximum atomic operation size supported by the backend.
Sched::Preference getSchedulingPreference() const
Return target scheduling preference.
void setMinFunctionAlignment(Align Alignment)
Set the target's minimum function alignment.
unsigned MaxStoresPerMemsetOptSize
Likewise for functions with the OptSize attribute.
void setBooleanContents(BooleanContent Ty)
Specify how the target extends the result of integer and floating point boolean values from i1 to a w...
unsigned MaxStoresPerMemmove
Specify maximum number of store instructions per memmove call.
void computeRegisterProperties(const TargetRegisterInfo *TRI)
Once all of the register classes are added, this allows us to compute derived properties we expose.
unsigned MaxStoresPerMemmoveOptSize
Likewise for functions with the OptSize attribute.
void addRegisterClass(MVT VT, const TargetRegisterClass *RC)
Add the specified register class as an available regclass for the specified value type.
bool isTypeLegal(EVT VT) const
Return true if the target has native support for the specified value type.
void setIndexedStoreAction(ArrayRef< unsigned > IdxModes, MVT VT, LegalizeAction Action)
Indicate that the specified indexed store does or does not work with the specified type and indicate ...
virtual MVT getPointerTy(const DataLayout &DL, uint32_t AS=0) const
Return the pointer type for the given address space, defaults to the pointer type from the data layou...
void setPrefFunctionAlignment(Align Alignment)
Set the target's preferred function alignment.
virtual unsigned getMaxSupportedInterleaveFactor() const
Get the maximum supported factor for interleaved memory accesses.
void setIndexedMaskedStoreAction(unsigned IdxMode, MVT VT, LegalizeAction Action)
Indicate that the specified indexed masked store does or does not work with the specified type and in...
unsigned MaxStoresPerMemset
Specify maximum number of store instructions per memset call.
void setTruncStoreAction(MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified truncating store does not work with the specified type and indicate what ...
virtual ShiftLegalizationStrategy preferredShiftLegalizationStrategy(SelectionDAG &DAG, SDNode *N, unsigned ExpansionFactor) const
bool isOperationLegalOrCustom(unsigned Op, EVT VT, bool LegalOnly=false) const
Return true if the specified operation is legal on this target or can be made legal with custom lower...
virtual bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace=0, Align Alignment=Align(1), MachineMemOperand::Flags Flags=MachineMemOperand::MONone, unsigned *Fast=nullptr) const
Return true if the target supports a memory access of this type for the given address space and align...
void setStackPointerRegisterToSaveRestore(Register R)
If set to a physical register, this specifies the register that llvm.savestack/llvm....
void AddPromotedToType(unsigned Opc, MVT OrigVT, MVT DestVT)
If Opc/OrigVT is specified as being promoted, the promotion code defaults to trying a larger integer/...
AtomicExpansionKind
Enum that specifies what an atomic load/AtomicRMWInst is expanded to, if at all.
virtual std::pair< const TargetRegisterClass *, uint8_t > findRepresentativeClass(const TargetRegisterInfo *TRI, MVT VT) const
Return the largest legal super-reg register class of the register class for the specified type and it...
RTLIB::LibcallImpl getLibcallImpl(RTLIB::Libcall Call) const
Get the libcall impl routine name for the specified libcall.
static StringRef getLibcallImplName(RTLIB::LibcallImpl Call)
Get the libcall routine name for the specified libcall implementation.
void setTargetDAGCombine(ArrayRef< ISD::NodeType > NTs)
Targets should invoke this method for each target independent node that they want to provide a custom...
void setLoadExtAction(unsigned ExtType, MVT ValVT, MVT MemVT, LegalizeAction Action)
Indicate that the specified load with extension does not work with the specified type and indicate wh...
LegalizeTypeAction getTypeAction(LLVMContext &Context, EVT VT) const
Return how we should legalize values of this type, either it is already legal (return 'Legal') or we ...
std::vector< ArgListEntry > ArgListTy
unsigned MaxStoresPerMemcpy
Specify maximum number of store instructions per memcpy call.
void setSchedulingPreference(Sched::Preference Pref)
Specify the target scheduling preference.
This class defines information used to lower LLVM code to legal SelectionDAG operators that the targe...
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedEltMask, APInt &KnownUndef, APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Vector Op.
void softenSetCCOperands(SelectionDAG &DAG, EVT VT, SDValue &NewLHS, SDValue &NewRHS, ISD::CondCode &CCCode, const SDLoc &DL, const SDValue OldLHS, const SDValue OldRHS) const
Soften the operands of a comparison.
SDValue expandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG) const
Expands an unaligned store to 2 half-size stores for integer values, and possibly more for vectors.
virtual ConstraintType getConstraintType(StringRef Constraint) const
Given a constraint, return the type of constraint it is for this target.
bool parametersInCSRMatch(const MachineRegisterInfo &MRI, const uint32_t *CallerPreservedMask, const SmallVectorImpl< CCValAssign > &ArgLocs, const SmallVectorImpl< SDValue > &OutVals) const
Check whether parameters to a call that are passed in callee saved registers are the same as from the...
std::pair< SDValue, SDValue > expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const
Expands an unaligned load to 2 half-size loads for an integer, and possibly more for vectors.
virtual SDValue LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA, SelectionDAG &DAG) const
Lower TLS global address SDNode for target independent emulated TLS model.
std::pair< SDValue, SDValue > LowerCallTo(CallLoweringInfo &CLI) const
This function lowers an abstract call to a function into an actual call.
bool expandDIVREMByConstant(SDNode *N, SmallVectorImpl< SDValue > &Result, EVT HiLoVT, SelectionDAG &DAG, SDValue LL=SDValue(), SDValue LH=SDValue()) const
Attempt to expand an n-bit div/rem/divrem by constant using an n/2-bit algorithm.
bool isPositionIndependent() const
virtual ConstraintWeight getSingleConstraintMatchWeight(AsmOperandInfo &info, const char *constraint) const
Examine constraint string and operand type and determine a weight value.
SDValue buildLegalVectorShuffle(EVT VT, const SDLoc &DL, SDValue N0, SDValue N1, MutableArrayRef< int > Mask, SelectionDAG &DAG) const
Tries to build a legal vector shuffle using the provided parameters or equivalent variations.
virtual std::pair< unsigned, const TargetRegisterClass * > getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, StringRef Constraint, MVT VT) const
Given a physical register constraint (e.g.
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0, bool AssumeSingleUse=false) const
Look at Op.
virtual bool SimplifyDemandedBitsForTargetNode(SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts, KnownBits &Known, TargetLoweringOpt &TLO, unsigned Depth=0) const
Attempt to simplify any target nodes based on the demanded bits/elts, returning true on success.
TargetLowering(const TargetLowering &)=delete
bool isConstTrueVal(SDValue N) const
Return true if N is a constant or constant vector equal to the true value from getBooleanContents().
virtual ArrayRef< MCPhysReg > getRoundingControlRegisters() const
Returns a 0 terminated array of rounding control registers that can be attached to a strict FP call.
virtual bool canCreateUndefOrPoisonForTargetNode(SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG, UndefPoisonKind Kind, bool ConsiderFlags, unsigned Depth) const
Return true if Op can create undef or poison from non-undef & non-poison operands.
virtual void LowerAsmOperandForConstraint(SDValue Op, StringRef Constraint, std::vector< SDValue > &Ops, SelectionDAG &DAG) const
Lower the specified operand into the Ops vector.
std::pair< SDValue, SDValue > makeLibCall(SelectionDAG &DAG, RTLIB::LibcallImpl LibcallImpl, EVT RetVT, ArrayRef< SDValue > Ops, MakeLibCallOptions CallOptions, const SDLoc &dl, SDValue Chain=SDValue()) const
Returns a pair of (return value, chain).
void setTypeIdForCallsiteInfo(const CallBase *CB, MachineFunction &MF, MachineFunction::CallSiteInfo &CSInfo) const
Primary interface to the complete machine description for the target machine.
TLSModel::Model getTLSModel(const GlobalValue *GV) const
Returns the TLS model which should be used for the given global variable.
ExceptionHandling getExceptionModel() const
Return the ExceptionHandling to use, considering TargetOptions and the Triple's default.
const Triple & getTargetTriple() const
bool useEmulatedTLS() const
Returns true if this target uses emulated TLS.
virtual const TargetSubtargetInfo * getSubtargetImpl(const Function &) const
Virtual method implemented by subclasses that returns a reference to that target's TargetSubtargetInf...
TargetOptions Options
unsigned EnableFastISel
EnableFastISel - This flag enables fast-path instruction selection which trades away generated code q...
unsigned GuaranteedTailCallOpt
GuaranteedTailCallOpt - This flag is enabled when -tailcallopt is specified on the commandline.
TargetRegisterInfo base class - We assume that the target defines a static array of TargetRegisterDes...
virtual const TargetRegisterInfo * getRegisterInfo() const =0
Return the target's register information.
Target - Wrapper for Target specific information.
Triple - Helper class for working with autoconf configuration names.
Definition Triple.h:47
ObjectFormatType getObjectFormat() const
Get the object format for this triple.
Definition Triple.h:449
bool isOSWindows() const
Tests whether the OS is Windows.
Definition Triple.h:688
static constexpr TypeSize getFixed(ScalarTy ExactSize)
Definition TypeSize.h:343
The instances of the Type class are immutable: once they are created, they are never changed.
Definition Type.h:46
static LLVM_ABI IntegerType * getInt64Ty(LLVMContext &C)
Definition Type.cpp:310
bool isVectorTy() const
True if this is an instance of VectorType.
Definition Type.h:288
static LLVM_ABI IntegerType * getInt32Ty(LLVMContext &C)
Definition Type.cpp:309
bool isPointerTy() const
True if this is an instance of PointerType.
Definition Type.h:282
bool isFloatTy() const
Return true if this is 'float', a 32-bit IEEE fp type.
Definition Type.h:155
static LLVM_ABI Type * getVoidTy(LLVMContext &C)
Definition Type.cpp:282
static LLVM_ABI IntegerType * getInt8Ty(LLVMContext &C)
Definition Type.cpp:307
Type * getScalarType() const
If this is a vector type, return the element type, otherwise return 'this'.
Definition Type.h:368
LLVM_ABI TypeSize getPrimitiveSizeInBits() const LLVM_READONLY
Return the basic size of this type if it is a primitive type.
Definition Type.cpp:197
static LLVM_ABI IntegerType * getInt16Ty(LLVMContext &C)
Definition Type.cpp:308
bool isHalfTy() const
Return true if this is 'half', a 16-bit IEEE fp type.
Definition Type.h:144
LLVMContext & getContext() const
Return the LLVMContext in which this type was uniqued.
Definition Type.h:130
LLVM_ABI unsigned getScalarSizeInBits() const LLVM_READONLY
If this is a vector type, return the getPrimitiveSizeInBits value for the element type.
Definition Type.cpp:232
bool isFloatingPointTy() const
Return true if this is one of the floating-point types.
Definition Type.h:186
bool isIntegerTy() const
True if this is an instance of IntegerType.
Definition Type.h:257
bool isFPOrFPVectorTy() const
Return true if this is a FP type or a vector of FP.
Definition Type.h:227
A Use represents the edge between a Value definition and its users.
Definition Use.h:35
LLVM_ABI unsigned getOperandNo() const
Return the operand # of this use in its User.
Definition Use.cpp:36
User * getUser() const
Returns the User that contains this Use.
Definition Use.h:61
Value * getOperand(unsigned i) const
Definition User.h:207
unsigned getNumOperands() const
Definition User.h:229
LLVM Value Representation.
Definition Value.h:75
Type * getType() const
All values are typed, get the type of this value.
Definition Value.h:255
bool hasOneUse() const
Return true if there is exactly one use of this value.
Definition Value.h:439
Base class of all SIMD vector types.
Type * getElementType() const
std::pair< iterator, bool > insert(const ValueT &V)
Definition DenseSet.h:212
bool contains(const_arg_type_t< ValueT > V) const
Check if the set contains the given element.
Definition DenseSet.h:185
constexpr ScalarTy getFixedValue() const
Definition TypeSize.h:200
const ParentTy * getParent() const
Definition ilist_node.h:34
self_iterator getIterator()
Definition ilist_node.h:123
IteratorT end() const
#define llvm_unreachable(msg)
Marks that the current location is not supposed to be reachable.
constexpr char Align[]
Key for Kernel::Arg::Metadata::mAlign.
constexpr char Args[]
Key for Kernel::Metadata::mArgs.
static CondCodes getOppositeCondition(CondCodes CC)
Definition ARMBaseInfo.h:49
@ SECREL
Section Relative (Windows TLS).
@ SBREL
Static Base Relative.
@ GOTTPOFF
Global Offset Table, Thread Pointer Offset.
@ TPOFF
Thread Pointer Offset.
TOF
Target Operand Flag enum.
@ MO_NONLAZY
MO_NONLAZY - This is an independent flag, on a symbol operand "FOO" it represents a symbol which,...
@ MO_SBREL
MO_SBREL - On a symbol operand, this represents a static base relative relocation.
@ MO_DLLIMPORT
MO_DLLIMPORT - On a symbol operand, this represents that the reference to the symbol is for an import...
@ MO_GOT
MO_GOT - On a symbol operand, this represents a GOT relative relocation.
@ MO_COFFSTUB
MO_COFFSTUB - On a symbol operand "FOO", this indicates that the reference is actually to the "....
static ShiftOpc getShiftOpcForNode(unsigned Opcode)
int getSOImmVal(unsigned Arg)
getSOImmVal - Given a 32-bit immediate, if it is something that can fit into an shifter_operand immed...
int getFP32Imm(const APInt &Imm)
getFP32Imm - Return an 8-bit floating-point version of the 32-bit floating-point value.
uint64_t decodeVMOVModImm(unsigned ModImm, unsigned &EltBits)
decodeVMOVModImm - Decode a NEON/MVE modified immediate value into the element value and the element ...
unsigned getAM2Offset(unsigned AM2Opc)
bool isThumbImmShiftedVal(unsigned V)
isThumbImmShiftedVal - Return true if the specified value can be obtained by left shifting an 8-bit im...
int getT2SOImmVal(unsigned Arg)
getT2SOImmVal - Given a 32-bit immediate, if it is something that can fit into a Thumb-2 shifter_oper...
unsigned createVMOVModImm(unsigned OpCmode, unsigned Val)
int getFP64Imm(const APInt &Imm)
getFP64Imm - Return an 8-bit floating-point version of the 64-bit floating-point value.
int getFP16Imm(const APInt &Imm)
getFP16Imm - Return an 8-bit floating-point version of the 16-bit floating-point value.
unsigned getSORegOpc(ShiftOpc ShOp, unsigned Imm)
int getFP32FP16Imm(const APInt &Imm)
If this is a FP16Imm encoded as a fp32 value, return the 8-bit encoding for it.
AddrOpc getAM2Op(unsigned AM2Opc)
bool isBitFieldInvertedMask(unsigned v)
const unsigned FPStatusBits
FastISel * createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo, const LibcallLoweringInfo *libcallLowering)
const unsigned FPReservedBits
const unsigned RoundingBitsPos
constexpr std::underlying_type_t< E > Mask()
Get a bitmask with 1s in all places up to the high-order bit of E's largest value.
@ Entry
Definition COFF.h:862
unsigned ID
LLVM IR allows to use arbitrary numbers as calling convention identifiers.
Definition CallingConv.h:24
@ Swift
Calling convention for Swift.
Definition CallingConv.h:69
@ ARM_APCS
ARM Procedure Calling Standard (obsolete, but still used on some targets).
@ CFGuard_Check
Special calling convention on Windows for calling the Control Guard Check ICall function.
Definition CallingConv.h:82
@ PreserveMost
Used for runtime calls that preserves most registers.
Definition CallingConv.h:63
@ ARM_AAPCS
ARM Architecture Procedure Calling Standard calling convention (aka EABI).
@ CXX_FAST_TLS
Used for access functions.
Definition CallingConv.h:72
@ GHC
Used by the Glasgow Haskell Compiler (GHC).
Definition CallingConv.h:50
@ PreserveAll
Used for runtime calls that preserves (almost) all registers.
Definition CallingConv.h:66
@ Fast
Attempts to make calls as fast as possible (e.g. by passing things in registers).
Definition CallingConv.h:41
@ Tail
Attempts to make calls as fast as possible while guaranteeing that tail call optimization can always b...
Definition CallingConv.h:76
@ SwiftTail
This follows the Swift calling convention in how arguments are passed but guarantees tail calls will ...
Definition CallingConv.h:87
@ ARM_AAPCS_VFP
Same as ARM_AAPCS, but uses hard floating point ABI.
@ C
The default llvm calling convention, compatible with C.
Definition CallingConv.h:34
bool isNON_EXTLoad(const SDNode *N)
Returns true if the specified node is a non-extending load.
NodeType
ISD::NodeType enum - This enum defines the target-independent operators for a SelectionDAG.
Definition ISDOpcodes.h:41
@ SETCC
SetCC operator - This evaluates to a true value iff the condition is true.
Definition ISDOpcodes.h:827
@ MERGE_VALUES
MERGE_VALUES - This node takes multiple discrete operands and returns them all as its individual resu...
Definition ISDOpcodes.h:261
@ STACKRESTORE
STACKRESTORE has two operands, an input chain and a pointer to restore to; it returns an output chain.
@ STACKSAVE
STACKSAVE - STACKSAVE has one operand, an input chain.
@ STRICT_FSETCC
STRICT_FSETCC/STRICT_FSETCCS - Constrained versions of SETCC, used for floating-point operands only.
Definition ISDOpcodes.h:511
@ SET_FPENV
Sets the current floating-point environment.
@ MLOAD
Masked load and store - consecutive vector load and store operations with additional mask operand tha...
@ EH_SJLJ_LONGJMP
OUTCHAIN = EH_SJLJ_LONGJMP(INCHAIN, buffer) This corresponds to the eh.sjlj.longjmp intrinsic.
Definition ISDOpcodes.h:168
@ FGETSIGN
INT = FGETSIGN(FP) - Return the sign bit of the specified floating point value as an integer 0/1 valu...
Definition ISDOpcodes.h:538
@ SMUL_LOHI
SMUL_LOHI/UMUL_LOHI - Multiply two integers of type iN, producing a signed/unsigned value of type i[2...
Definition ISDOpcodes.h:275
@ INSERT_SUBVECTOR
INSERT_SUBVECTOR(VECTOR1, VECTOR2, IDX) - Returns a vector with VECTOR2 inserted into VECTOR1.
Definition ISDOpcodes.h:600
@ BSWAP
Byte Swap and Counting operators.
Definition ISDOpcodes.h:787
@ VAEND
VAEND, VASTART - VAEND and VASTART have three operands: an input chain, pointer, and a SRCVALUE.
@ ATOMIC_STORE
OUTCHAIN = ATOMIC_STORE(INCHAIN, val, ptr) This corresponds to "store atomic" instruction.
@ RESET_FPENV
Set floating-point environment to default state.
@ ADD
Simple integer binary arithmetic operators.
Definition ISDOpcodes.h:264
@ LOAD
LOAD and STORE have token chains as their first operand, then the same operands as an LLVM load/store...
@ SET_FPMODE
Sets the current dynamic floating-point control modes.
@ ANY_EXTEND
ANY_EXTEND - Used for integer types. The high bits are undefined.
Definition ISDOpcodes.h:861
@ FMA
FMA - Perform a * b + c with no intermediate rounding step.
Definition ISDOpcodes.h:518
@ FMODF
FMODF - Decomposes the operand into integral and fractional parts, each having the same type and sign...
@ FATAN2
FATAN2 - atan2, inspired by libm.
@ FSINCOSPI
FSINCOSPI - Compute both the sine and cosine times pi more accurately than FSINCOS(pi*x),...
@ INTRINSIC_VOID
OUTCHAIN = INTRINSIC_VOID(INCHAIN, INTRINSICID, arg1, arg2, ...) This node represents a target intrin...
Definition ISDOpcodes.h:220
@ EH_SJLJ_SETUP_DISPATCH
OUTCHAIN = EH_SJLJ_SETUP_DISPATCH(INCHAIN) The target initializes the dispatch table here.
Definition ISDOpcodes.h:172
@ GlobalAddress
Definition ISDOpcodes.h:88
@ SINT_TO_FP
[SU]INT_TO_FP - These operators convert integers (whose interpreted sign depends on the first letter)...
Definition ISDOpcodes.h:888
@ CONCAT_VECTORS
CONCAT_VECTORS(VECTOR0, VECTOR1, ...) - Given a number of values of vector type with the same length ...
Definition ISDOpcodes.h:584
@ VECREDUCE_FMAX
FMIN/FMAX nodes can have flags, for NaN/NoNaN variants.
@ FADD
Simple binary floating point operators.
Definition ISDOpcodes.h:417
@ ABS
ABS - Determine the unsigned absolute value of a signed integer value of the same bitwidth.
Definition ISDOpcodes.h:747
@ ATOMIC_FENCE
OUTCHAIN = ATOMIC_FENCE(INCHAIN, ordering, scope) This corresponds to the fence instruction.
@ RESET_FPMODE
Sets default dynamic floating-point control modes.
@ SDIVREM
SDIVREM/UDIVREM - Divide two integers and produce both a quotient and remainder result.
Definition ISDOpcodes.h:280
@ FP16_TO_FP
FP16_TO_FP, FP_TO_FP16 - These operators are used to perform promotions and truncation for half-preci...
@ BITCAST
BITCAST - This operator converts between integer, vector and FP values, as if the value was stored to...
@ BUILD_PAIR
BUILD_PAIR - This is the opposite of EXTRACT_ELEMENT in some ways.
Definition ISDOpcodes.h:254
@ FLDEXP
FLDEXP - ldexp, inspired by libm (op0 * 2**op1).
@ STRICT_FSQRT
Constrained versions of libm-equivalent floating point intrinsics.
Definition ISDOpcodes.h:438
@ BUILTIN_OP_END
BUILTIN_OP_END - This must be the last enum value in this list.
@ GlobalTLSAddress
Definition ISDOpcodes.h:89
@ CTLZ_ZERO_POISON
Definition ISDOpcodes.h:796
@ SET_ROUNDING
Set rounding mode.
Definition ISDOpcodes.h:983
@ SIGN_EXTEND
Conversion operators.
Definition ISDOpcodes.h:852
@ AVGCEILS
AVGCEILS/AVGCEILU - Rounding averaging add - Add two integers using an integer of type i[N+2],...
Definition ISDOpcodes.h:715
@ SCALAR_TO_VECTOR
SCALAR_TO_VECTOR(VAL) - This represents the operation of loading a scalar value into element 0 of the...
Definition ISDOpcodes.h:665
@ BR
Control flow instructions. These all have token chains.
@ VECREDUCE_FADD
These reductions have relaxed evaluation order semantics, and have a single vector operand.
@ PREFETCH
PREFETCH - This corresponds to a prefetch intrinsic.
@ FSINCOS
FSINCOS - Compute both fsin and fcos as a single operation.
@ SETCCCARRY
Like SetCC, ops #0 and #1 are the LHS and RHS operands to compare, but op #2 is a boolean indicating ...
Definition ISDOpcodes.h:835
@ FNEG
Perform various unary floating-point operations inspired by libm.
@ BR_CC
BR_CC - Conditional branch.
@ SSUBO
Same for subtraction.
Definition ISDOpcodes.h:352
@ BR_JT
BR_JT - Jumptable branch.
@ SSUBSAT
RESULT = [US]SUBSAT(LHS, RHS) - Perform saturation subtraction on 2 integers with the same bit width ...
Definition ISDOpcodes.h:374
@ SELECT
Select(COND, TRUEVAL, FALSEVAL).
Definition ISDOpcodes.h:804
@ ATOMIC_LOAD
Val, OUTCHAIN = ATOMIC_LOAD(INCHAIN, ptr) This corresponds to "load atomic" instruction.
@ UNDEF
UNDEF - An undefined node.
Definition ISDOpcodes.h:233
@ EXTRACT_ELEMENT
EXTRACT_ELEMENT - This is used to get the lower or upper (determined by a Constant,...
Definition ISDOpcodes.h:247
@ VACOPY
VACOPY - VACOPY has 5 operands: an input chain, a destination pointer, a source pointer,...
@ BasicBlock
Various leaf nodes.
Definition ISDOpcodes.h:81
@ CopyFromReg
CopyFromReg - This node indicates that the input value is a virtual or physical register that is defi...
Definition ISDOpcodes.h:230
@ SADDO
RESULT, BOOL = [SU]ADDO(LHS, RHS) - Overflow-aware nodes for addition.
Definition ISDOpcodes.h:348
@ CTLS
Count leading redundant sign bits.
Definition ISDOpcodes.h:800
@ VECREDUCE_ADD
Integer reductions may have a result type larger than the vector element type.
@ GET_ROUNDING
Returns current rounding mode: -1 Undefined 0 Round to 0 1 Round to nearest, ties to even 2 Round to ...
Definition ISDOpcodes.h:978
@ STRICT_FP_TO_FP16
@ MULHU
MULHU/MULHS - Multiply high - Multiply two integers of type iN, producing an unsigned/signed value of...
Definition ISDOpcodes.h:704
@ GET_FPMODE
Reads the current dynamic floating-point control modes.
@ STRICT_FP16_TO_FP
@ GET_FPENV
Gets the current floating-point environment.
@ SHL
Shift and rotation operations.
Definition ISDOpcodes.h:769
@ VECTOR_SHUFFLE
VECTOR_SHUFFLE(VEC1, VEC2) - Returns a vector, of the same type as VEC1/VEC2.
Definition ISDOpcodes.h:649
@ EXTRACT_SUBVECTOR
EXTRACT_SUBVECTOR(VECTOR, IDX) - Returns a subvector from VECTOR.
Definition ISDOpcodes.h:614
@ READ_REGISTER
READ_REGISTER, WRITE_REGISTER - This node represents llvm.register on the DAG, which implements the n...
Definition ISDOpcodes.h:139
@ EXTRACT_VECTOR_ELT
EXTRACT_VECTOR_ELT(VECTOR, IDX) - Returns a single element from VECTOR identified by the (potentially...
Definition ISDOpcodes.h:576
@ CopyToReg
CopyToReg - This node has three operands: a chain, a register number to set to this value,...
Definition ISDOpcodes.h:224
@ ZERO_EXTEND
ZERO_EXTEND - Used for integer types, zeroing the new bits.
Definition ISDOpcodes.h:858
@ DEBUGTRAP
DEBUGTRAP - Trap intended to get the attention of a debugger.
@ SELECT_CC
Select with condition operator - This selects between a true value and a false value (ops #2 and #3) ...
Definition ISDOpcodes.h:819
@ ATOMIC_CMP_SWAP
Val, OUTCHAIN = ATOMIC_CMP_SWAP(INCHAIN, ptr, cmp, swap) For double-word atomic operations: ValLo,...
@ FMINNUM
FMINNUM/FMAXNUM - Perform floating-point minimum maximum on two values, following IEEE-754 definition...
@ SMULO
Same for multiplication.
Definition ISDOpcodes.h:356
@ DYNAMIC_STACKALLOC
DYNAMIC_STACKALLOC - Allocate some number of bytes on the stack aligned to a specified boundary.
@ SIGN_EXTEND_INREG
SIGN_EXTEND_INREG - This operator atomically performs a SHL/SRA pair to sign extend a small value in ...
Definition ISDOpcodes.h:896
@ SMIN
[US]{MIN/MAX} - Binary minimum or maximum of signed or unsigned integers.
Definition ISDOpcodes.h:727
@ FP_EXTEND
X = FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:986
@ VSELECT
Select with a vector condition (op #0) and two vector operands (ops #1 and #2), returning a vector re...
Definition ISDOpcodes.h:813
@ UADDO_CARRY
Carry-using nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:328
@ STRICT_FROUNDEVEN
Definition ISDOpcodes.h:464
@ BF16_TO_FP
BF16_TO_FP, FP_TO_BF16 - These operators are used to perform promotions and truncation for bfloat16.
@ FRAMEADDR
FRAMEADDR, RETURNADDR - These nodes represent llvm.frameaddress and llvm.returnaddress on the DAG.
Definition ISDOpcodes.h:110
@ STRICT_FP_TO_UINT
Definition ISDOpcodes.h:478
@ STRICT_FP_ROUND
X = STRICT_FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision ...
Definition ISDOpcodes.h:500
@ STRICT_FP_TO_SINT
STRICT_FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:477
@ FMINIMUM
FMINIMUM/FMAXIMUM - NaN-propagating minimum/maximum that also treats -0.0 as less than 0.0.
@ FP_TO_SINT
FP_TO_[US]INT - Convert a floating point value to a signed or unsigned integer.
Definition ISDOpcodes.h:934
@ READCYCLECOUNTER
READCYCLECOUNTER - This corresponds to the readcyclecounter intrinsic.
@ STRICT_FP_EXTEND
X = STRICT_FP_EXTEND(Y) - Extend a smaller FP type into a larger FP type.
Definition ISDOpcodes.h:505
@ AND
Bitwise operators - logical and, logical or, logical xor.
Definition ISDOpcodes.h:739
@ TRAP
TRAP - Trapping instruction.
@ INTRINSIC_WO_CHAIN
RESULT = INTRINSIC_WO_CHAIN(INTRINSICID, arg1, arg2, ...) This node represents a target intrinsic fun...
Definition ISDOpcodes.h:205
@ SCMP
[US]CMP - 3-way comparison of signed or unsigned integers.
Definition ISDOpcodes.h:735
@ AVGFLOORS
AVGFLOORS/AVGFLOORU - Averaging add - Add two integers using an integer of type i[N+1],...
Definition ISDOpcodes.h:710
@ STRICT_FADD
Constrained versions of the binary floating point operators.
Definition ISDOpcodes.h:427
@ INSERT_VECTOR_ELT
INSERT_VECTOR_ELT(VECTOR, VAL, IDX) - Returns VECTOR with the element at IDX replaced with VAL.
Definition ISDOpcodes.h:565
@ TokenFactor
TokenFactor - This node takes multiple tokens as input and produces a single token result.
Definition ISDOpcodes.h:53
@ ATOMIC_SWAP
Val, OUTCHAIN = ATOMIC_SWAP(INCHAIN, ptr, amt) Val, OUTCHAIN = ATOMIC_LOAD_[OpName](INCHAIN,...
@ CTTZ_ZERO_POISON
Bit counting operators with a poisoned result for zero inputs.
Definition ISDOpcodes.h:795
@ FFREXP
FFREXP - frexp, extract fractional and exponent component of a floating-point value.
@ FP_ROUND
X = FP_ROUND(Y, TRUNC) - Rounding 'Y' from a larger floating point type down to the precision of the ...
Definition ISDOpcodes.h:967
@ SPONENTRY
SPONENTRY - Represents the llvm.sponentry intrinsic.
Definition ISDOpcodes.h:122
@ STRICT_FNEARBYINT
Definition ISDOpcodes.h:458
@ FP_TO_SINT_SAT
FP_TO_[US]INT_SAT - Convert floating point value in operand 0 to a signed or unsigned scalar integer ...
Definition ISDOpcodes.h:953
@ EH_SJLJ_SETJMP
RESULT, OUTCHAIN = EH_SJLJ_SETJMP(INCHAIN, buffer) This corresponds to the eh.sjlj....
Definition ISDOpcodes.h:162
@ TRUNCATE
TRUNCATE - Completely drop the high bits.
Definition ISDOpcodes.h:864
@ VAARG
VAARG - VAARG has four operands: an input chain, a pointer, a SRCVALUE, and the alignment.
@ BRCOND
BRCOND - Conditional branch.
@ SHL_PARTS
SHL_PARTS/SRA_PARTS/SRL_PARTS - These operators are used for expanded integer shift operations.
Definition ISDOpcodes.h:841
@ FCOPYSIGN
FCOPYSIGN(X, Y) - Return the value of X with the sign of Y.
Definition ISDOpcodes.h:534
@ SADDSAT
RESULT = [US]ADDSAT(LHS, RHS) - Perform saturation addition on 2 integers with the same bit width (W)...
Definition ISDOpcodes.h:365
@ ABDS
ABDS/ABDU - Absolute difference - Return the absolute difference between two numbers interpreted as s...
Definition ISDOpcodes.h:722
@ SADDO_CARRY
Carry-using overflow-aware nodes for multiple precision addition and subtraction.
Definition ISDOpcodes.h:338
@ INTRINSIC_W_CHAIN
RESULT,OUTCHAIN = INTRINSIC_W_CHAIN(INCHAIN, INTRINSICID, arg1, ...) This node represents a target in...
Definition ISDOpcodes.h:213
@ BUILD_VECTOR
BUILD_VECTOR(ELT0, ELT1, ELT2, ELT3,...) - Return a fixed-width vector with the specified,...
Definition ISDOpcodes.h:556
bool isNormalStore(const SDNode *N)
Returns true if the specified node is a non-truncating and unindexed store.
bool isZEXTLoad(const SDNode *N)
Returns true if the specified node is a ZEXTLOAD.
LLVM_ABI CondCode getSetCCInverse(CondCode Operation, EVT Type)
Return the operation corresponding to !(X op Y), where 'op' is a valid SetCC operation.
bool isEXTLoad(const SDNode *N)
Returns true if the specified node is a EXTLOAD.
LLVM_ABI CondCode getSetCCSwappedOperands(CondCode Operation)
Return the operation corresponding to (Y op X) when given the operation for (X op Y).
LLVM_ABI bool isBuildVectorAllZeros(const SDNode *N)
Return true if the specified node is a BUILD_VECTOR where all of the elements are 0 or undef.
bool isSignedIntSetCC(CondCode Code)
Return true if this is a setcc instruction that performs a signed comparison when used with integer o...
LLVM_ABI bool isConstantSplatVector(const SDNode *N, APInt &SplatValue)
Node predicates.
MemIndexedMode
MemIndexedMode enum - This enum defines the load / store indexed addressing modes.
bool isSEXTLoad(const SDNode *N)
Returns true if the specified node is a SEXTLOAD.
CondCode
ISD::CondCode enum - These are ordered carefully to make the bitfields below work out,...
LoadExtType
LoadExtType enum - This enum defines the three variants of LOADEXT (load with extension).
static const int LAST_INDEXED_MODE
bool isNormalLoad(const SDNode *N)
Returns true if the specified node is a non-extending and unindexed load.
This namespace contains an enum with a value for every intrinsic/builtin function known by LLVM.
LLVM_ABI Function * getOrInsertDeclaration(Module *M, ID id, ArrayRef< Type * > OverloadTys={})
Look up the Function declaration of the intrinsic id in the Module M.
LLVM_ABI Libcall getSINTTOFP(EVT OpVT, EVT RetVT)
getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getUINTTOFP(EVT OpVT, EVT RetVT)
getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOUINT(EVT OpVT, EVT RetVT)
getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPTOSINT(EVT OpVT, EVT RetVT)
getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPEXT(EVT OpVT, EVT RetVT)
getFPEXT - Return the FPEXT_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
LLVM_ABI Libcall getFPROUND(EVT OpVT, EVT RetVT)
getFPROUND - Return the FPROUND_*_* value for the given types, or UNKNOWN_LIBCALL if there is none.
@ SingleThread
Synchronized with respect to signal handlers executing in the same thread.
Definition LLVMContext.h:55
initializer< Ty > init(const Ty &Val)
BaseReg
Stack frame base register. Bit 0 of FREInfo.Info.
Definition SFrame.h:77
This is an optimization pass for GlobalISel generic memory operations.
auto drop_begin(T &&RangeOrContainer, size_t N=1)
Return a range covering RangeOrContainer with the first N elements excluded.
Definition STLExtras.h:315
void dump(const SparseBitVector< ElementSize > &LHS, raw_ostream &out)
bool RetFastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Low
Lower the current thread's priority such that it does not affect foreground tasks significantly.
Definition Threading.h:280
@ Offset
Definition DWP.cpp:573
@ Length
Definition DWP.cpp:573
void stable_sort(R &&Range)
Definition STLExtras.h:2116
auto find(R &&Range, const T &Val)
Provide wrappers to std::find which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1765
bool all_of(R &&range, UnaryPredicate P)
Provide wrappers to std::all_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1739
bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns true if Val1 has a lower Constant Materialization Cost than Val2.
MachineInstrBuilder BuildMI(MachineFunction &MF, const MIMetadata &MIMD, const MCInstrDesc &MCID)
Builder interface. Specify how to create the initial instruction itself.
InstructionCost Cost
LLVM_ABI bool isNullConstant(SDValue V)
Returns true if V is a constant integer zero.
RelativeUniformCounterPtr Values
Definition InstrProf.h:91
@ Implicit
Not emitted register (e.g. carry, or temporary result).
@ Dead
Unused definition.
@ Kill
The last use of a register.
@ Define
Register definition.
decltype(auto) dyn_cast(const From &Val)
dyn_cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:643
bool isStrongerThanMonotonic(AtomicOrdering AO)
int countr_one(T Value)
Count the number of ones from the least significant bit to the first zero bit.
Definition bit.h:315
bool CCAssignFn(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
CCAssignFn - This function assigns a location for Val, updating State to reflect the change.
bool CC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
constexpr bool isMask_32(uint32_t Value)
Return true if the argument is a non-empty sequence of ones starting at the least significant bit wit...
Definition MathExtras.h:255
ExceptionHandling
Definition CodeGen.h:53
@ SjLj
setjmp/longjmp based exceptions
Definition CodeGen.h:56
bool RetCC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
bool RetCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
int bit_width(T Value)
Returns the number of bits needed to represent Value if Value is nonzero.
Definition bit.h:325
void append_range(Container &C, Range &&R)
Wrapper function to append range R to container C.
Definition STLExtras.h:2208
constexpr bool isUIntN(unsigned N, uint64_t x)
Checks if an unsigned integer fits into the given (dynamic) bit width.
Definition MathExtras.h:243
bool RetCC_ARM_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI Value * concatenateVectors(IRBuilderBase &Builder, ArrayRef< Value * > Vecs)
Concatenate a list of vectors.
constexpr bool isPowerOf2_64(uint64_t Value)
Return true if the argument is a power of two > 0 (64 bit edition.)
Definition MathExtras.h:284
void shuffle(Iterator first, Iterator last, RNG &&g)
Definition STLExtras.h:1530
bool CC_ARM_APCS_GHC(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
static std::array< MachineOperand, 2 > predOps(ARMCC::CondCodes Pred, unsigned PredReg=0)
Get the operands corresponding to the given Pred value.
bool operator==(const AddressRangeValuePair &LHS, const AddressRangeValuePair &RHS)
constexpr bool isShiftedMask_32(uint32_t Value)
Return true if the argument contains a non-empty sequence of ones with the remainder zero (32 bit ver...
Definition MathExtras.h:267
LLVM_ABI ConstantFPSDNode * isConstOrConstSplatFP(SDValue N, bool AllowUndefs=false)
Returns the SDNode if it is a constant splat BuildVector or constant float.
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Value
Definition InstrProf.h:143
int countr_zero(T Val)
Count number of 0's from the least significant bit to the most stopping at the first 1.
Definition bit.h:204
bool isReleaseOrStronger(AtomicOrdering AO)
auto dyn_cast_or_null(const Y &Val)
Definition Casting.h:753
constexpr bool has_single_bit(T Value) noexcept
Definition bit.h:149
bool any_of(R &&range, UnaryPredicate P)
Provide wrappers to std::any_of which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1746
unsigned Log2_32(uint32_t Value)
Return the floor log base 2 of the specified value, -1 if the value is zero.
Definition MathExtras.h:331
int countl_zero(T Val)
Count number of 0's from the most significant bit to the least stopping at the first 1.
Definition bit.h:263
LLVM_ABI bool isBitwiseNot(SDValue V, bool AllowUndefs=false)
Returns true if V is a bitwise not operation.
MachineInstr * getImm(const MachineOperand &MO, const MachineRegisterInfo *MRI)
constexpr bool isPowerOf2_32(uint32_t Value)
Return true if the argument is a power of two > 0.
Definition MathExtras.h:279
bool FastCC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
decltype(auto) get(const PointerIntPair< PointerTy, IntBits, IntType, PtrTraits, Info > &Pair)
LLVM_ABI raw_ostream & dbgs()
dbgs() - This returns a reference to a raw_ostream for debugging messages.
Definition Debug.cpp:209
bool CC_ARM_Win32_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI void report_fatal_error(Error Err, bool gen_crash_diag=true)
Definition Error.cpp:163
constexpr uint64_t alignTo(uint64_t Size, Align A)
Returns a multiple of A needed to store Size bytes.
Definition Alignment.h:144
constexpr bool isUInt(uint64_t x)
Checks if an unsigned integer fits into the given bit width.
Definition MathExtras.h:189
SmallVector< ValueTypeFromRangeType< R >, Size > to_vector(R &&Range)
Given a range of type R, iterate the entire range and return a SmallVector with elements of the vecto...
class LLVM_GSL_OWNER SmallVector
Forward declaration of SmallVector so that calculateSmallVectorDefaultInlinedElements can reference s...
bool isa(const From &Val)
isa<X> - Return true if the parameter to the template is an instance of one of the template type argu...
Definition Casting.h:547
LLVM_ABI raw_fd_ostream & errs()
This returns a reference to a raw_ostream for standard error.
const unsigned PerfectShuffleTable[6561+1]
AtomicOrdering
Atomic ordering for LLVM's memory model.
@ Other
Any other memory.
Definition ModRef.h:68
CombineLevel
Definition DAGCombine.h:15
@ BeforeLegalizeTypes
Definition DAGCombine.h:16
unsigned ConstantMaterializationCost(unsigned Val, const ARMSubtarget *Subtarget, bool ForCodesize=false)
Returns the number of instructions required to materialize the given constant in a register,...
@ Mul
Product of integers.
@ And
Bitwise or logical AND of integers.
@ Sub
Subtraction of integers.
@ Add
Sum of integers.
@ FAdd
Sum of floats.
uint16_t MCPhysReg
An unsigned integer type large enough to represent all physical registers, but not necessarily virtua...
Definition MCRegister.h:21
RelativeUniformCounterPtr ValuesPtrExpr VTableAddr Count
Definition InstrProf.h:145
DWARFExpression::Operation Op
ArrayRef(const T &OneElt) -> ArrayRef< T >
LLVM_ABI ConstantSDNode * isConstOrConstSplat(SDValue N, bool AllowUndefs=false, bool AllowTruncation=false)
Returns the SDNode if it is a constant splat BuildVector or constant int.
constexpr U AbsoluteValue(T X)
Return the absolute value of a signed integer, converted to the corresponding unsigned integer type.
Definition MathExtras.h:592
bool isAcquireOrStronger(AtomicOrdering AO)
constexpr unsigned BitWidth
static MachineOperand t1CondCodeOp(bool isDead=false)
Get the operand corresponding to the conditional code result for Thumb1.
auto count_if(R &&Range, UnaryPredicate P)
Wrapper function around std::count_if to count the number of times an element satisfying a given pred...
Definition STLExtras.h:2019
decltype(auto) cast(const From &Val)
cast<X> - Return the argument parameter cast to the specified type.
Definition Casting.h:559
auto find_if(R &&Range, UnaryPredicate P)
Provide wrappers to std::find_if which take ranges instead of having to pass begin/end explicitly.
Definition STLExtras.h:1772
LLVM_ABI bool isOneConstant(SDValue V)
Returns true if V is a constant integer one.
UndefPoisonKind
Enumeration to track whether we are interested in Undef, Poison, or both.
Definition UndefPoison.h:20
constexpr bool isIntN(unsigned N, int64_t x)
Checks if a signed integer fits into the given (dynamic) bit width.
Definition MathExtras.h:248
Align commonAlignment(Align A, uint64_t Offset)
Returns the alignment that satisfies both alignments.
Definition Alignment.h:201
static MachineOperand condCodeOp(unsigned CCReg=0)
Get the operand corresponding to the conditional code result.
bool isVREVMask(ArrayRef< int > M, EVT VT, unsigned BlockSize)
isVREVMask - Check if a vector shuffle corresponds to a VREV instruction with the specified blocksize...
auto seq(T Begin, T End)
Iterate over an integral type from Begin up to - but not including - End.
Definition Sequence.h:305
unsigned gettBLXrOpcode(const MachineFunction &MF)
bool CC_ARM_APCS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
@ Increment
Incrementally increasing token ID.
Definition AllocToken.h:26
bool CC_ARM_AAPCS_VFP(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, Type *OrigTy, CCState &State)
LLVM_ABI llvm::SmallVector< int, 16 > createSequentialMask(unsigned Start, unsigned NumInts, unsigned NumUndefs)
Create a sequential shuffle mask.
constexpr bool isShiftedUInt(uint64_t x)
Checks if an unsigned integer is an N-bit number shifted left by S.
Definition MathExtras.h:198
unsigned convertAddSubFlagsOpcode(unsigned OldOpc)
Map pseudo instructions that imply an 'S' bit onto real opcodes.
LLVM_ABI bool isAllOnesConstant(SDValue V)
Returns true if V is an integer constant with all bits set.
LLVM_ABI void reportFatalUsageError(Error Err)
Report a fatal error that does not indicate a bug in LLVM.
Definition Error.cpp:177
void swap(llvm::BitVector &LHS, llvm::BitVector &RHS)
Implement std::swap in terms of BitVector swap.
Definition BitVector.h:862
#define N
Load/store instruction that can be merged with a base address update.
SDNode * N
Instruction that updates a pointer.
unsigned ConstInc
Pointer increment value if it is a constant, or 0 otherwise.
SDValue Inc
Pointer increment operand.
A collection of metadata nodes that might be associated with a memory access used by the alias-analys...
Definition Metadata.h:763
This struct is a compact representation of a valid (non-zero power of two) alignment.
Definition Alignment.h:39
constexpr uint64_t value() const
This is a hole in the type system and should not be abused.
Definition Alignment.h:77
static constexpr DenormalMode getIEEE()
Extended Value Type.
Definition ValueTypes.h:35
EVT changeVectorElementTypeToInteger() const
Return a vector with the same number of elements as this vector, but with the element type converted ...
Definition ValueTypes.h:90
bool isSimple() const
Test if the given EVT is simple (as opposed to being extended).
Definition ValueTypes.h:145
static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements, bool IsScalable=false)
Returns the EVT that represents a vector NumElements in length, where each element is of type VT.
Definition ValueTypes.h:70
bool bitsGT(EVT VT) const
Return true if this has more bits than VT.
Definition ValueTypes.h:307
bool bitsLT(EVT VT) const
Return true if this has fewer bits than VT.
Definition ValueTypes.h:323
bool isFloatingPoint() const
Return true if this is a FP or a vector FP type.
Definition ValueTypes.h:155
ElementCount getVectorElementCount() const
Definition ValueTypes.h:373
EVT getDoubleNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:494
TypeSize getSizeInBits() const
Return the size of the specified value type in bits.
Definition ValueTypes.h:396
unsigned getVectorMinNumElements() const
Given a vector type, return the minimum number of elements it contains.
Definition ValueTypes.h:382
uint64_t getScalarSizeInBits() const
Definition ValueTypes.h:408
bool isPow2VectorType() const
Returns true if the number of elements in the given vector type is a power of 2.
Definition ValueTypes.h:501
static LLVM_ABI EVT getEVT(Type *Ty, bool HandleUnknown=false)
Return the value type corresponding to the specified type.
EVT changeVectorElementType(LLVMContext &Context, EVT EltVT) const
Return a VT for a vector type whose attributes match ourselves with the exception of the element type...
Definition ValueTypes.h:98
MVT getSimpleVT() const
Return the SimpleValueType held in the specified simple EVT.
Definition ValueTypes.h:339
bool is128BitVector() const
Return true if this is a 128-bit vector type.
Definition ValueTypes.h:230
static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth)
Returns the EVT that represents an integer with the given number of bits.
Definition ValueTypes.h:61
uint64_t getFixedSizeInBits() const
Return the size of the specified fixed width value type in bits.
Definition ValueTypes.h:404
bool isFixedLengthVector() const
Definition ValueTypes.h:199
static EVT getFloatingPointVT(unsigned BitWidth)
Returns the EVT that represents a floating-point type with the given number of bits.
Definition ValueTypes.h:55
bool isVector() const
Return true if this is a vector value type.
Definition ValueTypes.h:176
EVT getScalarType() const
If this is a vector type, return the element type, otherwise return this.
Definition ValueTypes.h:346
LLVM_ABI Type * getTypeForEVT(LLVMContext &Context) const
This method returns an LLVM type corresponding to the specified EVT.
EVT getVectorElementType() const
Given a vector type, return the type of each element.
Definition ValueTypes.h:351
bool isScalarInteger() const
Return true if this is an integer, but not a vector.
Definition ValueTypes.h:165
unsigned getVectorNumElements() const
Given a vector type, return the number of elements it contains.
Definition ValueTypes.h:359
bool bitsLE(EVT VT) const
Return true if this has no more bits than VT.
Definition ValueTypes.h:331
EVT getHalfNumVectorElementsVT(LLVMContext &Context) const
Definition ValueTypes.h:484
bool isInteger() const
Return true if this is an integer or a vector integer type.
Definition ValueTypes.h:160
bool is64BitVector() const
Return true if this is a 64-bit vector type.
Definition ValueTypes.h:225
InputArg - This struct carries flags and type information about a single incoming (formal) argument o...
EVT ArgVT
Usually the non-legalized type of the argument, which is the EVT corresponding to the OrigTy IR type.
static KnownBits makeConstant(const APInt &C)
Create known bits from a known constant.
Definition KnownBits.h:315
bool isUnknown() const
Returns true if we don't know any bits.
Definition KnownBits.h:64
unsigned getBitWidth() const
Get the bit width of this value.
Definition KnownBits.h:44
KnownBits zext(unsigned BitWidth) const
Return known bits for a zero extension of the value we're tracking.
Definition KnownBits.h:176
void resetAll()
Resets the known state of all bits.
Definition KnownBits.h:72
static KnownBits add(const KnownBits &LHS, const KnownBits &RHS, bool NSW=false, bool NUW=false, bool SelfAdd=false)
Compute knownbits resulting from addition of LHS and RHS.
Definition KnownBits.h:361
KnownBits intersectWith(const KnownBits &RHS) const
Returns KnownBits information that is known to be true for both this and RHS.
Definition KnownBits.h:325
KnownBits sext(unsigned BitWidth) const
Return known bits for a sign extension of the value we're tracking.
Definition KnownBits.h:184
static LLVM_ABI KnownBits mul(const KnownBits &LHS, const KnownBits &RHS, bool NoUndefSelfMultiply=false)
Compute known bits resulting from multiplying LHS and RHS.
APInt getSignedMinValue() const
Return the minimal signed value possible given these KnownBits.
Definition KnownBits.h:136
Matching combinators.
SmallVector< ArgRegPair, 1 > ArgRegPairs
Vector of call argument and its forwarding register.
This class contains a discriminated union of information about pointers in memory operands,...
static LLVM_ABI MachinePointerInfo getJumpTable(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a jump table entry.
static LLVM_ABI MachinePointerInfo getStack(MachineFunction &MF, int64_t Offset, uint8_t ID=0)
Stack pointer relative access.
static LLVM_ABI MachinePointerInfo getConstantPool(MachineFunction &MF)
Return a MachinePointerInfo record that refers to the constant pool.
MachinePointerInfo getWithOffset(int64_t O) const
static LLVM_ABI MachinePointerInfo getGOT(MachineFunction &MF)
Return a MachinePointerInfo record that refers to a GOT entry.
static LLVM_ABI MachinePointerInfo getFixedStack(MachineFunction &MF, int FI, int64_t Offset=0)
Return a MachinePointerInfo record that refers to the specified FrameIndex.
This struct is a compact representation of a valid (power of two) or undefined (0) alignment.
Definition Alignment.h:106
These are IR-level optimization flags that may be propagated to SDNodes.
bool hasNoSignedZeros() const
This represents a list of ValueType's that has been intern'd by a SelectionDAG.
This represents an addressing mode of: BaseGV + BaseOffs + BaseReg + Scale*ScaleReg + ScalableOffset*...
This contains information for each constraint that we are lowering.
This structure contains all information that is necessary for lowering calls.
CallLoweringInfo & setInRegister(bool Value=true)
CallLoweringInfo & setLibCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList)
SmallVector< ISD::InputArg, 32 > Ins
CallLoweringInfo & setZExtResult(bool Value=true)
CallLoweringInfo & setDebugLoc(const SDLoc &dl)
CallLoweringInfo & setSExtResult(bool Value=true)
SmallVector< ISD::OutputArg, 32 > Outs
CallLoweringInfo & setChain(SDValue InChain)
CallLoweringInfo & setCallee(CallingConv::ID CC, Type *ResultType, SDValue Target, ArgListTy &&ArgsList, AttributeSet ResultAttrs={})
LLVM_ABI void AddToWorklist(SDNode *N)
LLVM_ABI SDValue CombineTo(SDNode *N, ArrayRef< SDValue > To, bool AddTo=true)
This structure is used to pass arguments to makeLibCall function.
A convenience struct that encapsulates a DAG, and two SDValues for returning information from TargetL...