author      Dimitry Andric <dim@FreeBSD.org>	2020-01-17 20:45:01 +0000
committer   Dimitry Andric <dim@FreeBSD.org>	2020-01-17 20:45:01 +0000
commit      706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree        4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Target
parent      7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
download    src-706b4fc47bbc608932d3b491ae19a3b9cde9497b.tar.gz
            src-706b4fc47bbc608932d3b491ae19a3b9cde9497b.zip
Vendor import of llvm-project master e26a78e70, the last commit before
the llvmorg-11-init tag, from which release/10.x was branched.
Notes:
svn path=/vendor/llvm-project/master/; revision=356843
svn path=/vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085/; revision=356844; tag=vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085
Diffstat (limited to 'llvm/lib/Target')
716 files changed, 37437 insertions, 17375 deletions
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 5b4c9e2149da..0106355b1a44 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -448,9 +448,9 @@ include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" include "AArch64SchedFalkor.td" include "AArch64SchedKryo.td" -include "AArch64SchedExynosM1.td" include "AArch64SchedExynosM3.td" include "AArch64SchedExynosM4.td" +include "AArch64SchedExynosM5.td" include "AArch64SchedThunderX.td" include "AArch64SchedThunderX2T99.td" @@ -565,8 +565,8 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76", // Note that cyclone does not fuse AES instructions, but newer apple chips do // perform the fusion and cyclone is used by default when targetting apple OSes. -def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", - "Cyclone", [ +def ProcAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7", + "Apple A7 (the CPU formerly known as Cyclone)", [ FeatureAlternateSExtLoadCVTF32Pattern, FeatureArithmeticBccFusion, FeatureArithmeticCbzFusion, @@ -582,32 +582,82 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone", FeatureZCZeroingFPWorkaround ]>; -def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1", - "Samsung Exynos-M1 processors", - [FeatureSlowPaired128, - FeatureCRC, - FeatureCrypto, - FeatureExynosCheapAsMoveHandling, - FeatureForce32BitJumpTables, - FeatureFuseAES, - FeaturePerfMon, - FeaturePostRAScheduler, - FeatureSlowMisaligned128Store, - FeatureUseRSqrt, - FeatureZCZeroingFP]>; +def ProcAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10", + "Apple A10", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureNEON, + FeaturePerfMon, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureCRC, + FeatureRDM, + FeaturePAN, + FeatureLOR, + FeatureVH, + ]>; -def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", - "Samsung Exynos-M2 processors", - [FeatureSlowPaired128, - FeatureCRC, +def ProcAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11", + "Apple A11", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureNEON, + FeaturePerfMon, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureFullFP16, + HasV8_2aOps + ]>; + +def ProcAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12", + "Apple A12", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, + FeatureCrypto, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, + FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureNEON, + FeaturePerfMon, + FeatureZCRegMove, + FeatureZCZeroing, + FeatureFullFP16, + HasV8_3aOps + ]>; + +def ProcAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13", + "Apple A13", [ + FeatureAlternateSExtLoadCVTF32Pattern, + FeatureArithmeticBccFusion, + FeatureArithmeticCbzFusion, FeatureCrypto, - FeatureExynosCheapAsMoveHandling, - FeatureForce32BitJumpTables, + FeatureDisableLatencySchedHeuristic, + FeatureFPARMv8, FeatureFuseAES, + FeatureFuseCryptoEOR, + FeatureNEON, FeaturePerfMon, - FeaturePostRAScheduler, - FeatureSlowMisaligned128Store, 
- FeatureZCZeroingFP]>; + FeatureZCRegMove, + FeatureZCZeroing, + FeatureFullFP16, + FeatureFP16FML, + FeatureSHA3, + HasV8_4aOps + ]>; def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3", "Samsung Exynos-M3 processors", @@ -815,12 +865,9 @@ def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>; def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>; def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>; def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>; -def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; -def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; -def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>; def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>; def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>; -def : ProcessorModel<"exynos-m5", ExynosM4Model, [ProcExynosM4]>; +def : ProcessorModel<"exynos-m5", ExynosM5Model, [ProcExynosM4]>; def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>; def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>; def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; @@ -834,8 +881,24 @@ def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>; // FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57. def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>; +// Support cyclone as an alias for apple-a7 so we can still LTO old bitcode. +def : ProcessorModel<"cyclone", CycloneModel, [ProcAppleA7]>; + +// iPhone and iPad CPUs +def : ProcessorModel<"apple-a7", CycloneModel, [ProcAppleA7]>; +def : ProcessorModel<"apple-a8", CycloneModel, [ProcAppleA7]>; +def : ProcessorModel<"apple-a9", CycloneModel, [ProcAppleA7]>; +def : ProcessorModel<"apple-a10", CycloneModel, [ProcAppleA10]>; +def : ProcessorModel<"apple-a11", CycloneModel, [ProcAppleA11]>; +def : ProcessorModel<"apple-a12", CycloneModel, [ProcAppleA12]>; +def : ProcessorModel<"apple-a13", CycloneModel, [ProcAppleA13]>; + +// watch CPUs. +def : ProcessorModel<"apple-s4", CycloneModel, [ProcAppleA12]>; +def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>; + // Alias for the latest Apple processor model supported by LLVM. -def : ProcessorModel<"apple-latest", CycloneModel, [ProcCyclone]>; +def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>; //===----------------------------------------------------------------------===// // Assembly parser diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp index 7ea7915c2ca6..00e321f9b850 100644 --- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp +++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp @@ -84,6 +84,7 @@ public: return MCInstLowering.lowerOperand(MO, MCOp); } + void EmitStartOfAsmFile(Module &M) override; void EmitJumpTableInfo() override; void emitJumpTableEntry(const MachineJumpTableInfo *MJTI, const MachineBasicBlock *MBB, unsigned JTI); @@ -181,8 +182,79 @@ private: } // end anonymous namespace +void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) { + if (!TM.getTargetTriple().isOSBinFormatELF()) + return; + + // Assemble feature flags that may require creation of a note section. 
+ unsigned Flags = ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI | + ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC; + + if (any_of(M, [](const Function &F) { + return !F.isDeclaration() && + !F.hasFnAttribute("branch-target-enforcement"); + })) { + Flags &= ~ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI; + } + + if ((Flags & ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI) == 0 && + any_of(M, [](const Function &F) { + return F.hasFnAttribute("branch-target-enforcement"); + })) { + errs() << "warning: some functions compiled with BTI and some compiled " + "without BTI\n" + << "warning: not setting BTI in feature flags\n"; + } + + if (any_of(M, [](const Function &F) { + if (F.isDeclaration()) + return false; + Attribute A = F.getFnAttribute("sign-return-address"); + return !A.isStringAttribute() || A.getValueAsString() == "none"; + })) { + Flags &= ~ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC; + } + + if (Flags == 0) + return; + + // Emit a .note.gnu.property section with the flags. + MCSection *Cur = OutStreamer->getCurrentSectionOnly(); + MCSection *Nt = MMI->getContext().getELFSection( + ".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC); + OutStreamer->SwitchSection(Nt); + + // Emit the note header. + EmitAlignment(Align(8)); + OutStreamer->EmitIntValue(4, 4); // data size for "GNU\0" + OutStreamer->EmitIntValue(4 * 4, 4); // Elf_Prop size + OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4); + OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name + + // Emit the PAC/BTI properties. + OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4); + OutStreamer->EmitIntValue(4, 4); // data size + OutStreamer->EmitIntValue(Flags, 4); // data + OutStreamer->EmitIntValue(0, 4); // pad + + OutStreamer->endSection(Nt); + OutStreamer->SwitchSection(Cur); +} + void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) { + const Function &F = MF->getFunction(); + if (F.hasFnAttribute("patchable-function-entry")) { + unsigned Num; + if (F.getFnAttribute("patchable-function-entry") + .getValueAsString() + .getAsInteger(10, Num)) + return; + for (; Num; --Num) + EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0)); + return; + } + EmitSled(MI, SledKind::FUNCTION_ENTER); } @@ -458,8 +530,8 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) { // linker can safely perform dead code stripping. Since LLVM never // generates code that does this, it is always safe to set. OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols); - emitStackMaps(SM); } + emitStackMaps(SM); } void AArch64AsmPrinter::EmitLOHs() { @@ -794,7 +866,11 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI) { unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes(); - SM.recordStackMap(MI); + auto &Ctx = OutStreamer.getContext(); + MCSymbol *MILabel = Ctx.createTempSymbol(); + OutStreamer.EmitLabel(MILabel); + + SM.recordStackMap(*MILabel, MI); assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!"); // Scan ahead to trim the shadow. 
@@ -820,7 +896,10 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM, // [<def>], <id>, <numBytes>, <target>, <numArgs> void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM, const MachineInstr &MI) { - SM.recordPatchPoint(MI); + auto &Ctx = OutStreamer.getContext(); + MCSymbol *MILabel = Ctx.createTempSymbol(); + OutStreamer.EmitLabel(MILabel); + SM.recordPatchPoint(*MILabel, MI); PatchPointOpers Opers(&MI); @@ -1219,7 +1298,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) { } // Force static initialization. -extern "C" void LLVMInitializeAArch64AsmPrinter() { +extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmPrinter() { RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget()); RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget()); RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target()); diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp index ed93d02aa615..76ff238234d9 100644 --- a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp @@ -160,7 +160,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler { MIRBuilder.buildConstant(OffsetReg, Offset); Register AddrReg = MRI.createGenericVirtualRegister(p0); - MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg); + MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg); MPO = MachinePointerInfo::getStack(MF, Offset); return AddrReg; @@ -815,7 +815,7 @@ bool AArch64CallLowering::lowerTailCall( // Tell the call which registers are clobbered. auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(MF, F.getCallingConv()); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC); if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv()) TRI->UpdateCustomCallPreservedMask(MF, &Mask); MIB.addRegMask(Mask); @@ -972,7 +972,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // Tell the call which registers are clobbered. auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo(); - const uint32_t *Mask = TRI->getCallPreservedMask(MF, F.getCallingConv()); + const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv); if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv()) TRI->UpdateCustomCallPreservedMask(MF, &Mask); MIB.addRegMask(Mask); @@ -1000,10 +1000,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, 0)); // Finally we can copy the returned value back into its virtual-register. In - // symmetry with the arugments, the physical register must be an + // symmetry with the arguments, the physical register must be an // implicit-define of the call instruction. 
if (!Info.OrigRet.Ty->isVoidTy()) { - CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv()); + CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv); CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn); if (!handleAssignments(MIRBuilder, InArgs, Handler)) return false; diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/llvm/lib/Target/AArch64/AArch64CallingConvention.h index 5a55d090d7c8..59939e0684ed 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.h +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.h @@ -31,6 +31,9 @@ bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); +bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State); bool CC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT, CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, CCState &State); diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index bccbbd4591ed..a0b2d7712b66 100644 --- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -75,10 +75,10 @@ def CC_AArch64_AAPCS : CallingConv<[ CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>, CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, - nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + nxv2f32, nxv4f32, nxv2f64], CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, - nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + nxv2f32, nxv4f32, nxv2f64], CCPassIndirect<i64>>, CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], @@ -155,7 +155,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>, CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16, - nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64], + nxv2f32, nxv4f32, nxv2f64], CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>, CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1], @@ -170,6 +170,13 @@ def CC_AArch64_Win64_VarArg : CallingConv<[ CCDelegateTo<CC_AArch64_AAPCS> ]>; +// Windows Control Flow Guard checks take a single argument (the target function +// address) and have no return value. +let Entry = 1 in +def CC_AArch64_Win64_CFGuard_Check : CallingConv<[ + CCIfType<[i64], CCAssignToReg<[X15]>> +]>; + // Darwin uses a calling convention which differs in only two ways // from the standard one at this level: @@ -384,6 +391,12 @@ def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, D8, D9, D10, D11, D12, D13, D14, D15)>; +// The Control Flow Guard check call uses a custom calling convention that also +// preserves X0-X8 and Q0-Q7. 
+def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, + (sequence "X%u", 0, 8), + (sequence "Q%u", 0, 7))>; + // AArch64 PCS for vector functions (VPCS) // must (additionally) preserve full Q8-Q23 registers def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, @@ -392,10 +405,10 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, // Functions taking SVE arguments or returning an SVE type // must (additionally) preserve full Z8-Z23 and predicate registers P4-P15 -def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, - X25, X26, X27, X28, LR, FP, - (sequence "Z%u", 8, 23), - (sequence "P%u", 4, 15))>; +def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23), + (sequence "P%u", 4, 15), + X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, LR, FP)>; // Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since // 'this' and the pointer return value are both passed in X0 in these cases, @@ -473,5 +486,7 @@ def CSR_AArch64_RT_MostRegs_SCS : CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>; def CSR_AArch64_AAVPCS_SCS : CalleeSavedRegs<(add CSR_AArch64_AAVPCS, X18)>; +def CSR_AArch64_SVE_AAPCS_SCS + : CalleeSavedRegs<(add CSR_AArch64_SVE_AAPCS, X18)>; def CSR_AArch64_AAPCS_SCS : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>; diff --git a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp index 48dab79b32d3..259238705965 100644 --- a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp +++ b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp @@ -20,6 +20,7 @@ #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/MC/MCContext.h" +#include "llvm/Support/Alignment.h" #include "llvm/Support/Debug.h" using namespace llvm; @@ -74,10 +75,16 @@ void AArch64CompressJumpTables::scanFunction() { BlockInfo.clear(); BlockInfo.resize(MF->getNumBlockIDs()); - int Offset = 0; + unsigned Offset = 0; for (MachineBasicBlock &MBB : *MF) { - BlockInfo[MBB.getNumber()] = Offset; - Offset += computeBlockSize(MBB); + const Align Alignment = MBB.getAlignment(); + unsigned AlignedOffset; + if (Alignment == Align::None()) + AlignedOffset = Offset; + else + AlignedOffset = alignTo(Offset, Alignment); + BlockInfo[MBB.getNumber()] = AlignedOffset; + Offset = AlignedOffset + computeBlockSize(MBB); } } diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp index a6efb115ed44..51b2ce029701 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp @@ -74,6 +74,7 @@ #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 43ae9f8ec47f..054ef8f482ca 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -33,6 +33,7 @@ #include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include 
"llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -351,8 +352,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) { } // Check for flag reads and clobbers. - MIOperands::PhysRegInfo PRI = - MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI); + PhysRegInfo PRI = AnalyzePhysRegInBundle(*I, AArch64::NZCV, TRI); if (PRI.Read) { // The ccmp doesn't produce exactly the same flags as the original diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 082e17e44d04..3b8f8a19fe49 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -110,6 +110,8 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, unsigned BitSize) { MachineInstr &MI = *MBBI; Register DstReg = MI.getOperand(0).getReg(); + uint64_t RenamableState = + MI.getOperand(0).isRenamable() ? RegState::Renamable : 0; uint64_t Imm = MI.getOperand(1).getImm(); if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) { @@ -144,7 +146,8 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, bool DstIsDead = MI.getOperand(0).isDead(); MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) .addReg(DstReg, RegState::Define | - getDeadRegState(DstIsDead && LastItem)) + getDeadRegState(DstIsDead && LastItem) | + RenamableState) .addImm(I->Op1) .addImm(I->Op2)); } break; @@ -155,7 +158,8 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB, MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode)) .addReg(DstReg, RegState::Define | - getDeadRegState(DstIsDead && LastItem)) + getDeadRegState(DstIsDead && LastItem) | + RenamableState) .addReg(DstReg) .addImm(I->Op1) .addImm(I->Op2)); @@ -692,10 +696,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, return true; } case AArch64::TAGPstack: { - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDG)) + int64_t Offset = MI.getOperand(2).getImm(); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Offset >= 0 ? AArch64::ADDG : AArch64::SUBG)) .add(MI.getOperand(0)) .add(MI.getOperand(1)) - .add(MI.getOperand(2)) + .addImm(std::abs(Offset)) .add(MI.getOperand(4)); MI.eraseFromParent(); return true; diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp index b54fc2e51bac..c1fc183b04f6 100644 --- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp +++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -42,6 +42,7 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Metadata.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index 277a3052f1e5..7e9c68f2bb30 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -348,6 +348,8 @@ CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const { return CC_AArch64_WebKit_JS; if (CC == CallingConv::GHC) return CC_AArch64_GHC; + if (CC == CallingConv::CFGuard_Check) + return CC_AArch64_Win64_CFGuard_Check; return Subtarget->isTargetDarwin() ? 
CC_AArch64_DarwinPCS : CC_AArch64_AAPCS; } @@ -3251,6 +3253,13 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (Callee && !computeCallAddress(Callee, Addr)) return false; + // The weak function target may be zero; in that case we must use indirect + // addressing via a stub on windows as it may be out of range for a + // PC-relative jump. + if (Subtarget->isTargetWindows() && Addr.getGlobalValue() && + Addr.getGlobalValue()->hasExternalWeakLinkage()) + return false; + // Handle the arguments now that we've gotten them. unsigned NumBytes; if (!processCallArgs(CLI, OutVTs, NumBytes)) @@ -3836,11 +3845,6 @@ bool AArch64FastISel::selectRet(const Instruction *I) { if (!FuncInfo.CanLowerReturn) return false; - // FIXME: in principle it could. Mostly just a case of zero extending outgoing - // pointers. - if (Subtarget->isTargetILP32()) - return false; - if (F.isVarArg()) return false; @@ -3920,6 +3924,11 @@ bool AArch64FastISel::selectRet(const Instruction *I) { return false; } + // "Callee" (i.e. value producer) zero extends pointers at function + // boundary. + if (Subtarget->isTargetILP32() && RV->getType()->isPointerTy()) + SrcReg = emitAnd_ri(MVT::i64, SrcReg, false, 0xffffffff); + // Make the copy. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg); @@ -5009,6 +5018,9 @@ std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) { /// simple cases. This is because the standard fastEmit functions don't cover /// MUL at all and ADD is lowered very inefficientily. bool AArch64FastISel::selectGetElementPtr(const Instruction *I) { + if (Subtarget->isTargetILP32()) + return false; + unsigned N = getRegForValue(I->getOperand(0)); if (!N) return false; diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 68e1e6a30224..ea3e800a1ad2 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -206,6 +206,11 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) { return DefaultSafeSPDisplacement; } +TargetStackID::Value +AArch64FrameLowering::getStackIDForScalableVectors() const { + return TargetStackID::SVEVector; +} + /// Returns the size of the entire SVE stackframe (calleesaves + spills). static StackOffset getSVEStackSize(const MachineFunction &MF) { const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); @@ -222,7 +227,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - unsigned NumBytes = AFI->getLocalStackSize(); + uint64_t NumBytes = AFI->getLocalStackSize(); return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 || getSVEStackSize(MF)); @@ -239,7 +244,7 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { if (MF.hasEHFunclets()) return true; // Retain behavior of always omitting the FP for leaf functions when possible. 
- if (MFI.hasCalls() && MF.getTarget().Options.DisableFramePointerElim(MF)) + if (MF.getTarget().Options.DisableFramePointerElim(MF)) return true; if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() || MFI.hasStackMap() || MFI.hasPatchPoint() || @@ -424,7 +429,7 @@ bool AArch64FrameLowering::canUseAsPrologue( } static bool windowsRequiresStackProbe(MachineFunction &MF, - unsigned StackSizeInBytes) { + uint64_t StackSizeInBytes) { const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); if (!Subtarget.isTargetWindows()) return false; @@ -441,15 +446,12 @@ static bool windowsRequiresStackProbe(MachineFunction &MF, } bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( - MachineFunction &MF, unsigned StackBumpBytes) const { + MachineFunction &MF, uint64_t StackBumpBytes) const { AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); const MachineFrameInfo &MFI = MF.getFrameInfo(); const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - if (MF.getFunction().hasOptSize()) - return false; - if (AFI->getLocalStackSize() == 0) return false; @@ -723,7 +725,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec( // Fixup callee-save register save/restore instructions to take into account // combined SP bump by adding the local stack size to the stack offsets. static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI, - unsigned LocalStackSize, + uint64_t LocalStackSize, bool NeedsWinCFI, bool *HasWinCFI) { if (AArch64InstrInfo::isSEHInstruction(MI)) @@ -834,6 +836,24 @@ static bool isTargetDarwin(const MachineFunction &MF) { return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin(); } +static bool isTargetWindows(const MachineFunction &MF) { + return MF.getSubtarget<AArch64Subtarget>().isTargetWindows(); +} + +// Convenience function to determine whether I is an SVE callee save. +static bool IsSVECalleeSave(MachineBasicBlock::iterator I) { + switch (I->getOpcode()) { + default: + return false; + case AArch64::STR_ZXI: + case AArch64::STR_PXI: + case AArch64::LDR_ZXI: + case AArch64::LDR_PXI: + return I->getFlag(MachineInstr::FrameSetup) || + I->getFlag(MachineInstr::FrameDestroy); + } +} + void AArch64FrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { MachineBasicBlock::iterator MBBI = MBB.begin(); @@ -844,8 +864,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, const TargetInstrInfo *TII = Subtarget.getInstrInfo(); MachineModuleInfo &MMI = MF.getMMI(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - bool needsFrameMoves = (MMI.hasDebugInfo() || F.needsUnwindTableEntry()) && - !MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); + bool needsFrameMoves = + MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool HasFP = hasFP(MF); bool NeedsWinCFI = needsWinCFI(MF); bool HasWinCFI = false; @@ -897,8 +917,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // pointer from the funclet. We only save the callee saved registers in the // funclet, which are really the callee saved registers of the parent // function, including the funclet. - int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF) - : (int)MFI.getStackSize(); + int64_t NumBytes = IsFunclet ? 
getWinEHFuncletFrameSize(MF) + : MFI.getStackSize(); if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) { assert(!HasFP && "unexpected function without stack frame but with FP"); assert(!SVEStackSize && @@ -916,15 +936,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, {-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup, false, NeedsWinCFI, &HasWinCFI); - if (!NeedsWinCFI) { + if (!NeedsWinCFI && needsFrameMoves) { // Label used to tie together the PROLOG_LABEL and the MachineMoves. MCSymbol *FrameLabel = MMI.getContext().createTempSymbol(); - // Encode the stack size of the leaf function. - unsigned CFIIndex = MF.addFrameInst( - MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); - BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) - .addCFIIndex(CFIIndex) - .setMIFlags(MachineInstr::FrameSetup); + // Encode the stack size of the leaf function. + unsigned CFIIndex = MF.addFrameInst( + MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes)); + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); } } @@ -965,7 +985,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // and pre-inc if we decided to combine the callee-save and local stack // pointer bump above. MachineBasicBlock::iterator End = MBB.end(); - while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) { + while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) && + !IsSVECalleeSave(MBBI)) { if (CombineSPBump) fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(), NeedsWinCFI, &HasWinCFI); @@ -999,7 +1020,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (HasFP) { // Only set up FP if we actually need to. - int FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0; + int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0; if (CombineSPBump) FPOffset += AFI->getLocalStackSize(); @@ -1014,7 +1035,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, } if (windowsRequiresStackProbe(MF, NumBytes)) { - uint32_t NumWords = NumBytes >> 4; + uint64_t NumWords = NumBytes >> 4; if (NeedsWinCFI) { HasWinCFI = true; // alloc_l can hold at most 256MB, so assume that NumBytes doesn't @@ -1107,7 +1128,35 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, NumBytes = 0; } - emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII, + StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {}; + MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI; + + // Process the SVE callee-saves to determine what space needs to be + // allocated. + if (AFI->getSVECalleeSavedStackSize()) { + // Find callee save instructions in frame. + CalleeSavesBegin = MBBI; + assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction"); + while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator()) + ++MBBI; + CalleeSavesEnd = MBBI; + + int64_t OffsetToFirstCalleeSaveFromSP = + MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); + StackOffset OffsetToCalleeSavesFromSP = + StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; + AllocateBefore -= OffsetToCalleeSavesFromSP; + AllocateAfter = SVEStackSize - AllocateBefore; + } + + // Allocate space for the callee saves (if any). 
+ emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP, + -AllocateBefore, TII, + MachineInstr::FrameSetup); + + // Finally allocate remaining SVE stack space. + emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP, + -AllocateAfter, TII, MachineInstr::FrameSetup); // Allocate space for the rest of the frame. @@ -1343,8 +1392,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, IsFunclet = isFuncletReturnInstr(*MBBI); } - int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF) - : MFI.getStackSize(); + int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF) + : MFI.getStackSize(); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); // All calls are tail calls in GHC calling conv, and functions have no @@ -1444,7 +1493,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock::iterator Begin = MBB.begin(); while (LastPopI != Begin) { --LastPopI; - if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) { + if (!LastPopI->getFlag(MachineInstr::FrameDestroy) || + IsSVECalleeSave(LastPopI)) { ++LastPopI; break; } else if (CombineSPBump) @@ -1476,11 +1526,53 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, NumBytes -= PrologueSaveSize; assert(NumBytes >= 0 && "Negative stack allocation size!?"); + // Process the SVE callee-saves to determine what space needs to be + // deallocated. + StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize; + MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI; + if (AFI->getSVECalleeSavedStackSize()) { + RestoreBegin = std::prev(RestoreEnd);; + while (IsSVECalleeSave(RestoreBegin) && + RestoreBegin != MBB.begin()) + --RestoreBegin; + ++RestoreBegin; + + assert(IsSVECalleeSave(RestoreBegin) && + IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction"); + + int64_t OffsetToFirstCalleeSaveFromSP = + MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex()); + StackOffset OffsetToCalleeSavesFromSP = + StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize; + DeallocateBefore = OffsetToCalleeSavesFromSP; + DeallocateAfter = SVEStackSize - DeallocateBefore; + } + // Deallocate the SVE area. - if (SVEStackSize) - if (!AFI->isStackRealigned()) - emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize, - TII, MachineInstr::FrameDestroy); + if (SVEStackSize) { + if (AFI->isStackRealigned()) { + if (AFI->getSVECalleeSavedStackSize()) + // Set SP to start of SVE area, from which the callee-save reloads + // can be done. The code below will deallocate the stack space + // space by moving FP -> SP. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP, + -SVEStackSize, TII, MachineInstr::FrameDestroy); + } else { + if (AFI->getSVECalleeSavedStackSize()) { + // Deallocate the non-SVE locals first before we can deallocate (and + // restore callee saves) from the SVE area. + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy); + NumBytes = 0; + } + + emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP, + DeallocateBefore, TII, MachineInstr::FrameDestroy); + + emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP, + DeallocateAfter, TII, MachineInstr::FrameDestroy); + } + } if (!hasFP(MF)) { bool RedZone = canUseRedZone(MF); @@ -1490,7 +1582,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, return; bool NoCalleeSaveRestore = PrologueSaveSize == 0; - int StackRestoreBytes = RedZone ? 
0 : NumBytes; + int64_t StackRestoreBytes = RedZone ? 0 : NumBytes; if (NoCalleeSaveRestore) StackRestoreBytes += AfterCSRPopSize; @@ -1582,19 +1674,20 @@ int AArch64FrameLowering::getNonLocalFrameIndexReference( return getSEHFrameIndexOffset(MF, FI); } -static StackOffset getFPOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) { const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()); unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0; - unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize(); + unsigned FPAdjust = isTargetDarwin(MF) + ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo()); return {ObjectOffset + FixedObject + FPAdjust, MVT::i8}; } -static StackOffset getStackOffset(const MachineFunction &MF, int ObjectOffset) { +static StackOffset getStackOffset(const MachineFunction &MF, int64_t ObjectOffset) { const auto &MFI = MF.getFrameInfo(); - return {ObjectOffset + (int)MFI.getStackSize(), MVT::i8}; + return {ObjectOffset + (int64_t)MFI.getStackSize(), MVT::i8}; } int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF, @@ -1611,7 +1704,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); - int ObjectOffset = MFI.getObjectOffset(FI); + int64_t ObjectOffset = MFI.getObjectOffset(FI); bool isFixed = MFI.isFixedObjectIndex(FI); bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector; return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg, @@ -1619,7 +1712,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference( } StackOffset AArch64FrameLowering::resolveFrameOffsetReference( - const MachineFunction &MF, int ObjectOffset, bool isFixed, bool isSVE, + const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE, unsigned &FrameReg, bool PreferFP, bool ForSimm) const { const auto &MFI = MF.getFrameInfo(); const auto *RegInfo = static_cast<const AArch64RegisterInfo *>( @@ -1627,10 +1720,10 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( const auto *AFI = MF.getInfo<AArch64FunctionInfo>(); const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>(); - int FPOffset = getFPOffset(MF, ObjectOffset).getBytes(); - int Offset = getStackOffset(MF, ObjectOffset).getBytes(); + int64_t FPOffset = getFPOffset(MF, ObjectOffset).getBytes(); + int64_t Offset = getStackOffset(MF, ObjectOffset).getBytes(); bool isCSR = - !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize()); + !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI)); const StackOffset &SVEStackSize = getSVEStackSize(MF); @@ -1781,6 +1874,8 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, // TODO: LR can be paired with any register. We don't support this yet in // the MCLayer. We need to add support for the save_lrpair unwind code. + if (Reg2 == AArch64::FP) + return true; if (!NeedsWinCFI) return false; if (Reg2 == Reg1 + 1) @@ -1793,9 +1888,9 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2, /// LR and FP need to be allocated together when the frame needs to save /// the frame-record. This means any other register pairing with LR is invalid. 
static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2, - bool NeedsWinCFI, bool NeedsFrameRecord) { - if (NeedsWinCFI) - return invalidateWindowsRegisterPairing(Reg1, Reg2, true); + bool UsesWinAAPCS, bool NeedsWinCFI, bool NeedsFrameRecord) { + if (UsesWinAAPCS) + return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI); // If we need to store the frame record, don't pair any register // with LR other than FP. @@ -1812,11 +1907,27 @@ struct RegPairInfo { unsigned Reg2 = AArch64::NoRegister; int FrameIdx; int Offset; - enum RegType { GPR, FPR64, FPR128 } Type; + enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type; RegPairInfo() = default; bool isPaired() const { return Reg2 != AArch64::NoRegister; } + + unsigned getScale() const { + switch (Type) { + case PPR: + return 2; + case GPR: + case FPR64: + return 8; + case ZPR: + case FPR128: + return 16; + } + llvm_unreachable("Unsupported type"); + } + + bool isScalable() const { return Type == PPR || Type == ZPR; } }; } // end anonymous namespace @@ -1829,6 +1940,7 @@ static void computeCalleeSaveRegisterPairs( if (CSI.empty()) return; + bool IsWindows = isTargetWindows(MF); bool NeedsWinCFI = needsWinCFI(MF); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1841,7 +1953,8 @@ static void computeCalleeSaveRegisterPairs( CC == CallingConv::PreserveMost || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); - int Offset = AFI->getCalleeSavedStackSize(); + int ByteOffset = AFI->getCalleeSavedStackSize(); + int ScalableByteOffset = AFI->getSVECalleeSavedStackSize(); // On Linux, we will have either one or zero non-paired register. On Windows // with CFI, we can have multiple unpaired registers in order to utilize the // available unwind codes. This flag assures that the alignment fixup is done @@ -1857,6 +1970,10 @@ static void computeCalleeSaveRegisterPairs( RPI.Type = RegPairInfo::FPR64; else if (AArch64::FPR128RegClass.contains(RPI.Reg1)) RPI.Type = RegPairInfo::FPR128; + else if (AArch64::ZPRRegClass.contains(RPI.Reg1)) + RPI.Type = RegPairInfo::ZPR; + else if (AArch64::PPRRegClass.contains(RPI.Reg1)) + RPI.Type = RegPairInfo::PPR; else llvm_unreachable("Unsupported register class."); @@ -1866,7 +1983,7 @@ static void computeCalleeSaveRegisterPairs( switch (RPI.Type) { case RegPairInfo::GPR: if (AArch64::GPR64RegClass.contains(NextReg) && - !invalidateRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI, + !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, NeedsWinCFI, NeedsFrameRecord)) RPI.Reg2 = NextReg; break; @@ -1879,6 +1996,9 @@ static void computeCalleeSaveRegisterPairs( if (AArch64::FPR128RegClass.contains(NextReg)) RPI.Reg2 = NextReg; break; + case RegPairInfo::PPR: + case RegPairInfo::ZPR: + break; } } @@ -1905,6 +2025,11 @@ static void computeCalleeSaveRegisterPairs( RPI.Reg1 == AArch64::LR) && "FrameRecord must be allocated together with LR"); + // Windows AAPCS has FP and LR reversed. + assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP || + RPI.Reg2 == AArch64::LR) && + "FrameRecord must be allocated together with LR"); + // MachO's compact unwind format relies on all registers being stored in // adjacent register pairs. assert((!produceCompactUnwindFrame(MF) || @@ -1916,23 +2041,33 @@ static void computeCalleeSaveRegisterPairs( RPI.FrameIdx = CSI[i].getFrameIdx(); - int Scale = RPI.Type == RegPairInfo::FPR128 ? 16 : 8; - Offset -= RPI.isPaired() ? 
2 * Scale : Scale; + int Scale = RPI.getScale(); + if (RPI.isScalable()) + ScalableByteOffset -= Scale; + else + ByteOffset -= RPI.isPaired() ? 2 * Scale : Scale; + + assert(!(RPI.isScalable() && RPI.isPaired()) && + "Paired spill/fill instructions don't exist for SVE vectors"); // Round up size of non-pair to pair size if we need to pad the // callee-save area to ensure 16-byte alignment. if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone && - RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) { + !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 && + !RPI.isPaired()) { FixupDone = true; - Offset -= 8; - assert(Offset % 16 == 0); + ByteOffset -= 8; + assert(ByteOffset % 16 == 0); assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16); MFI.setObjectAlignment(RPI.FrameIdx, 16); } + int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset; assert(Offset % Scale == 0); RPI.Offset = Offset / Scale; - assert((RPI.Offset >= -64 && RPI.Offset <= 63) && + + assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) || + (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) && "Offset out of bounds for LDP/STP immediate"); RegPairs.push_back(RPI); @@ -2024,6 +2159,16 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( Size = 16; Align = 16; break; + case RegPairInfo::ZPR: + StrOpc = AArch64::STR_ZXI; + Size = 16; + Align = 16; + break; + case RegPairInfo::PPR: + StrOpc = AArch64::STR_PXI; + Size = 2; + Align = 2; + break; } LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -2064,6 +2209,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameSetup); + // Update the StackIDs of the SVE stack slots. + MachineFrameInfo &MFI = MF.getFrameInfo(); + if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR) + MFI.setStackID(RPI.FrameIdx, TargetStackID::SVEVector); + } return true; } @@ -2115,6 +2265,16 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( Size = 16; Align = 16; break; + case RegPairInfo::ZPR: + LdrOpc = AArch64::LDR_ZXI; + Size = 16; + Align = 16; + break; + case RegPairInfo::PPR: + LdrOpc = AArch64::LDR_PXI; + Size = 2; + Align = 2; + break; } LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI); if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI); @@ -2149,12 +2309,20 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( if (NeedsWinCFI) InsertSEH(MIB, TII, MachineInstr::FrameDestroy); }; - if (ReverseCSRRestoreSeq) - for (const RegPairInfo &RPI : reverse(RegPairs)) + + // SVE objects are always restored in reverse order. + for (const RegPairInfo &RPI : reverse(RegPairs)) + if (RPI.isScalable()) EmitMI(RPI); - else + + if (ReverseCSRRestoreSeq) { + for (const RegPairInfo &RPI : reverse(RegPairs)) + if (!RPI.isScalable()) + EmitMI(RPI); + } else for (const RegPairInfo &RPI : RegPairs) - EmitMI(RPI); + if (!RPI.isScalable()) + EmitMI(RPI); if (NeedShadowCallStackProlog) { // Shadow call stack epilog: ldr x30, [x18, #-8]! 
@@ -2201,7 +2369,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, SavedRegs.set(Reg); bool RegUsed = SavedRegs.test(Reg); - unsigned PairedReg = CSRegs[i ^ 1]; + unsigned PairedReg = AArch64::NoRegister; + if (AArch64::GPR64RegClass.contains(Reg) || + AArch64::FPR64RegClass.contains(Reg) || + AArch64::FPR128RegClass.contains(Reg)) + PairedReg = CSRegs[i ^ 1]; + if (!RegUsed) { if (AArch64::GPR64RegClass.contains(Reg) && !RegInfo->isReservedReg(MF, Reg)) { @@ -2225,16 +2398,23 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // Calculates the callee saved stack size. unsigned CSStackSize = 0; + unsigned SVECSStackSize = 0; const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (unsigned Reg : SavedRegs.set_bits()) - CSStackSize += TRI->getRegSizeInBits(Reg, MRI) / 8; + for (unsigned Reg : SavedRegs.set_bits()) { + auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8; + if (AArch64::PPRRegClass.contains(Reg) || + AArch64::ZPRRegClass.contains(Reg)) + SVECSStackSize += RegSize; + else + CSStackSize += RegSize; + } // Save number of saved regs, so we can easily update CSStackSize later. unsigned NumSavedRegs = SavedRegs.count(); // The frame record needs to be created by saving the appropriate registers - unsigned EstimatedStackSize = MFI.estimateStackSize(MF); + uint64_t EstimatedStackSize = MFI.estimateStackSize(MF); if (hasFP(MF) || windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) { SavedRegs.set(AArch64::FP); @@ -2248,10 +2428,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, dbgs() << "\n";); // If any callee-saved registers are used, the frame cannot be eliminated. - unsigned MaxAlign = getStackAlignment(); int64_t SVEStackSize = - alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign); - assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16); bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize; // The CSR spill slots have not been allocated yet, so estimateStackSize @@ -2299,15 +2477,20 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF, // Adding the size of additional 64bit GPR saves. CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs); - unsigned AlignedCSStackSize = alignTo(CSStackSize, 16); + uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16); LLVM_DEBUG(dbgs() << "Estimated stack frame size: " << EstimatedStackSize + AlignedCSStackSize << " bytes.\n"); + assert((!MFI.isCalleeSavedInfoValid() || + AFI->getCalleeSavedStackSize() == AlignedCSStackSize) && + "Should not invalidate callee saved info"); + // Round up to register pair alignment to avoid additional SP adjustment // instructions. AFI->setCalleeSavedStackSize(AlignedCSStackSize); AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize); + AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16)); } bool AArch64FrameLowering::enableStackSlotScavenging( @@ -2316,9 +2499,40 @@ bool AArch64FrameLowering::enableStackSlotScavenging( return AFI->hasCalleeSaveStackFreeSpace(); } -int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI, - unsigned &MaxAlign) const { - // Process all fixed stack objects. +/// returns true if there are any SVE callee saves. 
+static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI, + int &Min, int &Max) { + Min = std::numeric_limits<int>::max(); + Max = std::numeric_limits<int>::min(); + + if (!MFI.isCalleeSavedInfoValid()) + return false; + + const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); + for (auto &CS : CSI) { + if (AArch64::ZPRRegClass.contains(CS.getReg()) || + AArch64::PPRRegClass.contains(CS.getReg())) { + assert((Max == std::numeric_limits<int>::min() || + Max + 1 == CS.getFrameIdx()) && + "SVE CalleeSaves are not consecutive"); + + Min = std::min(Min, CS.getFrameIdx()); + Max = std::max(Max, CS.getFrameIdx()); + } + } + return Min != std::numeric_limits<int>::max(); +} + +// Process all the SVE stack objects and determine offsets for each +// object. If AssignOffsets is true, the offsets get assigned. +// Fills in the first and last callee-saved frame indices into +// Min/MaxCSFrameIndex, respectively. +// Returns the size of the stack. +static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI, + int &MinCSFrameIndex, + int &MaxCSFrameIndex, + bool AssignOffsets) { + // First process all fixed stack objects. int64_t Offset = 0; for (int I = MFI.getObjectIndexBegin(); I != 0; ++I) if (MFI.getStackID(I) == TargetStackID::SVEVector) { @@ -2327,12 +2541,69 @@ int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI, Offset = FixedOffset; } - // Note: We don't take allocatable stack objects into - // account yet, because allocation for those is not yet - // implemented. + auto Assign = [&MFI](int FI, int64_t Offset) { + LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n"); + MFI.setObjectOffset(FI, Offset); + }; + + // Then process all callee saved slots. + if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) { + // Make sure to align the last callee save slot. + MFI.setObjectAlignment(MaxCSFrameIndex, 16U); + + // Assign offsets to the callee save slots. + for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) { + Offset += MFI.getObjectSize(I); + Offset = alignTo(Offset, MFI.getObjectAlignment(I)); + if (AssignOffsets) + Assign(I, -Offset); + } + } + + // Create a buffer of SVE objects to allocate and sort it. + SmallVector<int, 8> ObjectsToAllocate; + for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) { + unsigned StackID = MFI.getStackID(I); + if (StackID != TargetStackID::SVEVector) + continue; + if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex) + continue; + if (MFI.isDeadObjectIndex(I)) + continue; + + ObjectsToAllocate.push_back(I); + } + + // Allocate all SVE locals and spills + for (unsigned FI : ObjectsToAllocate) { + unsigned Align = MFI.getObjectAlignment(FI); + // FIXME: Given that the length of SVE vectors is not necessarily a power of + // two, we'd need to align every object dynamically at runtime if the + // alignment is larger than 16. This is not yet supported. 
+ if (Align > 16) + report_fatal_error( + "Alignment of scalable vectors > 16 bytes is not yet supported"); + + Offset = alignTo(Offset + MFI.getObjectSize(FI), Align); + if (AssignOffsets) + Assign(FI, -Offset); + } + return Offset; } +int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets( + MachineFrameInfo &MFI) const { + int MinCSFrameIndex, MaxCSFrameIndex; + return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false); +} + +int64_t AArch64FrameLowering::assignSVEStackObjectOffsets( + MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const { + return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, + true); +} + void AArch64FrameLowering::processFunctionBeforeFrameFinalized( MachineFunction &MF, RegScavenger *RS) const { MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -2340,12 +2611,13 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown && "Upwards growing stack unsupported"); - unsigned MaxAlign = getStackAlignment(); - int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign); + int MinCSFrameIndex, MaxCSFrameIndex; + int64_t SVEStackSize = + assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex); AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>(); - AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign)); - assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes"); + AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U)); + AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex); // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index ac150e86c9eb..b5719feb6b15 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -44,7 +44,7 @@ public: unsigned &FrameReg, bool PreferFP, bool ForSimm) const; StackOffset resolveFrameOffsetReference(const MachineFunction &MF, - int ObjectOffset, bool isFixed, + int64_t ObjectOffset, bool isFixed, bool isSVE, unsigned &FrameReg, bool PreferFP, bool ForSimm) const; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, @@ -72,6 +72,7 @@ public: } bool enableStackSlotScavenging(const MachineFunction &MF) const override; + TargetStackID::Value getStackIDForScalableVectors() const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; @@ -100,8 +101,12 @@ public: private: bool shouldCombineCSRLocalStackBump(MachineFunction &MF, - unsigned StackBumpBytes) const; - int64_t determineSVEStackSize(MachineFrameInfo &MF, unsigned &MaxAlign) const; + uint64_t StackBumpBytes) const; + + int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const; + int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, + int &MinCSFrameIndex, + int &MaxCSFrameIndex) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 1f08505f37e7..a51aa85a931c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -17,6 +17,7 @@ #include "llvm/IR/Function.h" // To access function attributes. 
#include "llvm/IR/GlobalValue.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" @@ -39,20 +40,16 @@ class AArch64DAGToDAGISel : public SelectionDAGISel { /// make the right decision when generating code for different targets. const AArch64Subtarget *Subtarget; - bool ForCodeSize; - public: explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm, CodeGenOpt::Level OptLevel) - : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), - ForCodeSize(false) {} + : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr) {} StringRef getPassName() const override { return "AArch64 Instruction Selection"; } bool runOnMachineFunction(MachineFunction &MF) override { - ForCodeSize = MF.getFunction().hasOptSize(); Subtarget = &MF.getSubtarget<AArch64Subtarget>(); return SelectionDAGISel::runOnMachineFunction(MF); } @@ -140,6 +137,59 @@ public: return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift); } + bool SelectDupZeroOrUndef(SDValue N) { + switch(N->getOpcode()) { + case ISD::UNDEF: + return true; + case AArch64ISD::DUP: + case ISD::SPLAT_VECTOR: { + auto Opnd0 = N->getOperand(0); + if (auto CN = dyn_cast<ConstantSDNode>(Opnd0)) + if (CN->isNullValue()) + return true; + if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0)) + if (CN->isZero()) + return true; + break; + } + default: + break; + } + + return false; + } + + template<MVT::SimpleValueType VT> + bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) { + return SelectSVEAddSubImm(N, VT, Imm, Shift); + } + + template<MVT::SimpleValueType VT> + bool SelectSVELogicalImm(SDValue N, SDValue &Imm) { + return SelectSVELogicalImm(N, VT, Imm); + } + + // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N. + template<signed Min, signed Max, signed Scale, bool Shift> + bool SelectCntImm(SDValue N, SDValue &Imm) { + if (!isa<ConstantSDNode>(N)) + return false; + + int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue(); + if (Shift) + MulImm = 1LL << MulImm; + + if ((MulImm % std::abs(Scale)) != 0) + return false; + + MulImm /= Scale; + if ((MulImm >= Min) && (MulImm <= Max)) { + Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32); + return true; + } + + return false; + } /// Form sequences of consecutive 64/128-bit registers for use in NEON /// instructions making use of a vector-list (e.g. ldN, tbl). 
Vecs must have @@ -177,6 +227,7 @@ public: bool tryBitfieldInsertOp(SDNode *N); bool tryBitfieldInsertInZeroOp(SDNode *N); bool tryShiftAmountMod(SDNode *N); + bool tryHighFPExt(SDNode *N); bool tryReadRegister(SDNode *N); bool tryWriteRegister(SDNode *N); @@ -217,6 +268,13 @@ private: bool SelectCMP_SWAP(SDNode *N); + bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift); + + bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm); + + bool SelectSVESignedArithImm(SDValue N, SDValue &Imm); + + bool SelectSVEArithImm(SDValue N, SDValue &Imm); }; } // end anonymous namespace @@ -250,7 +308,6 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( switch(ConstraintID) { default: llvm_unreachable("Unexpected asm memory constraint"); - case InlineAsm::Constraint_i: case InlineAsm::Constraint_m: case InlineAsm::Constraint_Q: // We need to make sure that this one operand does not end up in XZR, thus @@ -378,7 +435,7 @@ static bool isWorthFoldingSHL(SDValue V) { bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const { // Trivial if we are optimizing for code size or if there is only // one use of the value. - if (ForCodeSize || V.hasOneUse()) + if (CurDAG->shouldOptForSize() || V.hasOneUse()) return true; // If a subtarget has a fastpath LSL we can fold a logical shift into // the addressing mode and save a cycle. @@ -1772,6 +1829,35 @@ bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) { return true; } +/// Try to form fcvtl2 instructions from a floating-point extend of a high-half +/// extract of a subvector. +bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) { + assert(N->getOpcode() == ISD::FP_EXTEND); + + // There are 2 forms of fcvtl2 - extend to double or extend to float. + SDValue Extract = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT NarrowVT = Extract.getValueType(); + if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) && + (VT != MVT::v4f32 || NarrowVT != MVT::v4f16)) + return false; + + // Optionally look past a bitcast. + Extract = peekThroughBitcasts(Extract); + if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR) + return false; + + // Match extract from start of high half index. + // Example: v8i16 -> v4i16 means the extract must begin at index 4. + unsigned ExtractIndex = Extract.getConstantOperandVal(1); + if (ExtractIndex != Extract.getValueType().getVectorNumElements()) + return false; + + auto Opcode = VT == MVT::v2f64 ? 
AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16; + CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0)); + return true; +} + static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc, SDValue &Opd0, unsigned &Immr, unsigned &Imms, unsigned NumberOfIgnoredLowBits = 0, @@ -2793,6 +2879,102 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) { return true; } +bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) { + if (auto CNode = dyn_cast<ConstantSDNode>(N)) { + const int64_t ImmVal = CNode->getZExtValue(); + SDLoc DL(N); + + switch (VT.SimpleTy) { + case MVT::i8: + if ((ImmVal & 0xFF) == ImmVal) { + Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); + return true; + } + break; + case MVT::i16: + case MVT::i32: + case MVT::i64: + if ((ImmVal & 0xFF) == ImmVal) { + Shift = CurDAG->getTargetConstant(0, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); + return true; + } else if ((ImmVal & 0xFF00) == ImmVal) { + Shift = CurDAG->getTargetConstant(8, DL, MVT::i32); + Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32); + return true; + } + break; + default: + break; + } + } + + return false; +} + +bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) { + if (auto CNode = dyn_cast<ConstantSDNode>(N)) { + int64_t ImmVal = CNode->getSExtValue(); + SDLoc DL(N); + if (ImmVal >= -127 && ImmVal < 127) { + Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); + return true; + } + } + return false; +} + +bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, SDValue &Imm) { + if (auto CNode = dyn_cast<ConstantSDNode>(N)) { + uint64_t ImmVal = CNode->getSExtValue(); + SDLoc DL(N); + ImmVal = ImmVal & 0xFF; + if (ImmVal < 256) { + Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32); + return true; + } + } + return false; +} + +bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) { + if (auto CNode = dyn_cast<ConstantSDNode>(N)) { + uint64_t ImmVal = CNode->getZExtValue(); + SDLoc DL(N); + + // Shift mask depending on type size. 
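// A standalone sketch of the two immediate checks in this area, assuming the
// same semantics as SelectSVEAddSubImm above and the replication switch that
// follows; the function names here are illustrative, not LLVM API.
#include <cstdint>

// ADD/SUB immediates are an unsigned 8-bit value, optionally shifted left by 8.
bool isSVEAddSubImm(uint64_t V, unsigned &Imm, unsigned &Shift) {
  if ((V & 0xFF) == V)   { Imm = unsigned(V);      Shift = 0; return true; }
  if ((V & 0xFF00) == V) { Imm = unsigned(V >> 8); Shift = 8; return true; }
  return false;
}

// Logical immediates are matched on a 64-bit pattern, so a narrower element
// value is masked and replicated across all 64 bits, as the switch below does
// for the i8/i16/i32 cases.
uint64_t replicateTo64(uint64_t V, unsigned EltBits) {
  V &= (EltBits >= 64) ? ~uint64_t(0) : ((uint64_t(1) << EltBits) - 1);
  for (unsigned B = EltBits; B < 64; B *= 2)
    V |= V << B;
  return V;
}

// e.g. replicateTo64(0xAB, 8)    == 0xABABABABABABABAB
//      replicateTo64(0x00FF, 16) == 0x00FF00FF00FF00FF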
+ switch (VT.SimpleTy) { + case MVT::i8: + ImmVal &= 0xFF; + ImmVal |= ImmVal << 8; + ImmVal |= ImmVal << 16; + ImmVal |= ImmVal << 32; + break; + case MVT::i16: + ImmVal &= 0xFFFF; + ImmVal |= ImmVal << 16; + ImmVal |= ImmVal << 32; + break; + case MVT::i32: + ImmVal &= 0xFFFFFFFF; + ImmVal |= ImmVal << 32; + break; + case MVT::i64: + break; + default: + llvm_unreachable("Unexpected type"); + } + + uint64_t encoding; + if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) { + Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64); + return true; + } + } + return false; +} + bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) { // tagp(FrameIndex, IRGstack, tag_offset): // since the offset between FrameIndex and IRGstack is a compile-time @@ -2908,6 +3090,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { return; break; + case ISD::FP_EXTEND: + if (tryHighFPExt(Node)) + return; + break; + case ISD::OR: if (tryBitfieldInsertOp(Node)) return; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 2746117e8ee5..d45a80057564 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -10,9 +10,9 @@ // //===----------------------------------------------------------------------===// -#include "AArch64ExpandImm.h" #include "AArch64ISelLowering.h" #include "AArch64CallingConvention.h" +#include "AArch64ExpandImm.h" #include "AArch64MachineFunctionInfo.h" #include "AArch64PerfectShuffle.h" #include "AArch64RegisterInfo.h" @@ -58,6 +58,7 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Module.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/PatternMatch.h" @@ -178,11 +179,25 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass); - addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass); - addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass); addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass); + + for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) { + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); + setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + } + + for (auto VT : + { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8, + MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 }) + setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal); } // Compute derived properties from the register classes @@ -422,14 +437,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSUB, MVT::v4f16, Promote); setOperationAction(ISD::FMUL, MVT::v4f16, Promote); setOperationAction(ISD::FDIV, MVT::v4f16, Promote); - setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote); - setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote); AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FSUB, MVT::v4f16, 
MVT::v4f32); AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32); AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32); - AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32); - AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32); setOperationAction(ISD::FABS, MVT::v4f16, Expand); setOperationAction(ISD::FNEG, MVT::v4f16, Expand); @@ -510,6 +521,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom); setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom); + // 128-bit loads and stores can be done without expanding + setOperationAction(ISD::LOAD, MVT::i128, Custom); + setOperationAction(ISD::STORE, MVT::i128, Custom); + // Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0. // This requires the Performance Monitors extension. if (Subtarget->hasPerfMon()) @@ -525,6 +540,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::FSINCOS, MVT::f32, Expand); } + if (Subtarget->getTargetTriple().isOSMSVCRT()) { + // MSVCRT doesn't have powi; fall back to pow + setLibcallName(RTLIB::POWI_F32, nullptr); + setLibcallName(RTLIB::POWI_F64, nullptr); + } + // Make floating-point constants legal for the large code model, so they don't // become loads from the constant pool. if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) { @@ -601,7 +622,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::BITCAST); + setTargetDAGCombine(ISD::SIGN_EXTEND_INREG); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); if (Subtarget->supportsAddressTopByteIgnored()) @@ -734,14 +755,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::MUL, MVT::v4i32, Custom); setOperationAction(ISD::MUL, MVT::v2i64, Custom); - // Vector reductions for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32, MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) { + // Vector reductions setOperationAction(ISD::VECREDUCE_ADD, VT, Custom); setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom); setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom); setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom); + + // Saturates + setOperationAction(ISD::SADDSAT, VT, Legal); + setOperationAction(ISD::UADDSAT, VT, Legal); + setOperationAction(ISD::SSUBSAT, VT, Legal); + setOperationAction(ISD::USUBSAT, VT, Legal); } for (MVT VT : { MVT::v4f16, MVT::v2f32, MVT::v8f16, MVT::v4f32, MVT::v2f64 }) { @@ -802,10 +829,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, } if (Subtarget->hasSVE()) { + // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a + // splat of 0 or undef) once vector selects supported in SVE codegen. See + // D68877 for more details. 
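// The saturating ISD nodes marked Legal above map per lane onto the NEON and
// SVE saturating add/subtract instructions. A scalar i8 model of that
// clamping semantics, included for illustration only:
#include <cstdint>
#include <algorithm>

int8_t saddsat_i8(int8_t A, int8_t B) {
  int S = int(A) + int(B);                        // widen, then clamp
  return int8_t(std::min(127, std::max(-128, S)));
}

uint8_t uaddsat_i8(uint8_t A, uint8_t B) {
  unsigned S = unsigned(A) + unsigned(B);
  return uint8_t(std::min(255u, S));              // clamp to UINT8_MAX
}

// e.g. saddsat_i8(120, 20) == 127 and uaddsat_i8(250, 10) == 255.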
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) { - if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1) + if (isTypeLegal(VT)) setOperationAction(ISD::SPLAT_VECTOR, VT, Custom); } + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom); } PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive(); @@ -1257,6 +1289,19 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::UMINV: return "AArch64ISD::UMINV"; case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV"; case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV"; + case AArch64ISD::SMAXV_PRED: return "AArch64ISD::SMAXV_PRED"; + case AArch64ISD::UMAXV_PRED: return "AArch64ISD::UMAXV_PRED"; + case AArch64ISD::SMINV_PRED: return "AArch64ISD::SMINV_PRED"; + case AArch64ISD::UMINV_PRED: return "AArch64ISD::UMINV_PRED"; + case AArch64ISD::ORV_PRED: return "AArch64ISD::ORV_PRED"; + case AArch64ISD::EORV_PRED: return "AArch64ISD::EORV_PRED"; + case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED"; + case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N"; + case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N"; + case AArch64ISD::LASTA: return "AArch64ISD::LASTA"; + case AArch64ISD::LASTB: return "AArch64ISD::LASTB"; + case AArch64ISD::REV: return "AArch64ISD::REV"; + case AArch64ISD::TBL: return "AArch64ISD::TBL"; case AArch64ISD::NOT: return "AArch64ISD::NOT"; case AArch64ISD::BIT: return "AArch64ISD::BIT"; case AArch64ISD::CBZ: return "AArch64ISD::CBZ"; @@ -1311,6 +1356,32 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO"; case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI"; case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO"; + case AArch64ISD::INSR: return "AArch64ISD::INSR"; + case AArch64ISD::PTEST: return "AArch64ISD::PTEST"; + case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE"; + case AArch64ISD::GLD1: return "AArch64ISD::GLD1"; + case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED"; + case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW"; + case AArch64ISD::GLD1_UXTW: return "AArch64ISD::GLD1_UXTW"; + case AArch64ISD::GLD1_SXTW_SCALED: return "AArch64ISD::GLD1_SXTW_SCALED"; + case AArch64ISD::GLD1_UXTW_SCALED: return "AArch64ISD::GLD1_UXTW_SCALED"; + case AArch64ISD::GLD1_IMM: return "AArch64ISD::GLD1_IMM"; + case AArch64ISD::GLD1S: return "AArch64ISD::GLD1S"; + case AArch64ISD::GLD1S_SCALED: return "AArch64ISD::GLD1S_SCALED"; + case AArch64ISD::GLD1S_SXTW: return "AArch64ISD::GLD1S_SXTW"; + case AArch64ISD::GLD1S_UXTW: return "AArch64ISD::GLD1S_UXTW"; + case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED"; + case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED"; + case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM"; + case AArch64ISD::SST1: return "AArch64ISD::SST1"; + case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED"; + case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW"; + case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW"; + case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED"; + case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED"; + case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM"; + case AArch64ISD::LDP: return "AArch64ISD::LDP"; + case AArch64ISD::STP: return "AArch64ISD::STP"; } return nullptr; } @@ -1568,7 +1639,8 @@ static void 
changeVectorFPCCToAArch64CC(ISD::CondCode CC, // All of the compare-mask comparisons are ordered, but we can switch // between the two by a double inversion. E.g. ULE == !OGT. Invert = true; - changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2); + changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32), + CondCode, CondCode2); break; } } @@ -1815,7 +1887,7 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val, ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get(); bool isInteger = LHS.getValueType().isInteger(); if (Negate) - CC = getSetCCInverse(CC, isInteger); + CC = getSetCCInverse(CC, LHS.getValueType()); SDLoc DL(Val); // Determine OutCC and handle FP special case. if (isInteger) { @@ -2287,7 +2359,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) { if (CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } // If the constants line up, perform the transform! @@ -2861,6 +2933,55 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::aarch64_sve_uunpklo: return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(), Op.getOperand(1)); + case Intrinsic::aarch64_sve_clasta_n: + return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::aarch64_sve_clastb_n: + return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), Op.getOperand(3)); + case Intrinsic::aarch64_sve_lasta: + return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_lastb: + return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_rev: + return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(), + Op.getOperand(1)); + case Intrinsic::aarch64_sve_tbl: + return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_trn1: + return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_trn2: + return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_uzp1: + return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_uzp2: + return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_zip1: + return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_zip2: + return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::aarch64_sve_ptrue: + return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(), + Op.getOperand(1)); + + case Intrinsic::aarch64_sve_insr: { + SDValue Scalar = Op.getOperand(2); + EVT ScalarTy = Scalar.getValueType(); + if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) + Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar); + + return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(), + Op.getOperand(1), Scalar); + } case Intrinsic::localaddress: { const auto &MF = DAG.getMachineFunction(); @@ -2886,6 +3007,10 @@ SDValue 
AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, } } +bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const { + return ExtVal.getValueType().isScalableVector(); +} + // Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16. static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, EVT VT, EVT MemVT, @@ -2920,7 +3045,7 @@ static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST, // Custom lowering for any store, vector or scalar and/or default or with // a truncate operations. Currently only custom lower truncate operation -// from vector v4i16 to v4i8. +// from vector v4i16 to v4i8 or volatile stores of i128. SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { SDLoc Dl(Op); @@ -2932,18 +3057,32 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op, EVT VT = Value.getValueType(); EVT MemVT = StoreNode->getMemoryVT(); - assert (VT.isVector() && "Can only custom lower vector store types"); - - unsigned AS = StoreNode->getAddressSpace(); - unsigned Align = StoreNode->getAlignment(); - if (Align < MemVT.getStoreSize() && - !allowsMisalignedMemoryAccesses( - MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) { - return scalarizeVectorStore(StoreNode, DAG); - } - - if (StoreNode->isTruncatingStore()) { - return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); + if (VT.isVector()) { + unsigned AS = StoreNode->getAddressSpace(); + unsigned Align = StoreNode->getAlignment(); + if (Align < MemVT.getStoreSize() && + !allowsMisalignedMemoryAccesses(MemVT, AS, Align, + StoreNode->getMemOperand()->getFlags(), + nullptr)) { + return scalarizeVectorStore(StoreNode, DAG); + } + + if (StoreNode->isTruncatingStore()) { + return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG); + } + } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) { + assert(StoreNode->getValue()->getValueType(0) == MVT::i128); + SDValue Lo = + DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(), + DAG.getConstant(0, Dl, MVT::i64)); + SDValue Hi = + DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(), + DAG.getConstant(1, Dl, MVT::i64)); + SDValue Result = DAG.getMemIntrinsicNode( + AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other), + {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()}, + StoreNode->getMemoryVT(), StoreNode->getMemOperand()); + return Result; } return SDValue(); @@ -3092,6 +3231,9 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, switch (CC) { default: report_fatal_error("Unsupported calling convention."); + case CallingConv::AArch64_SVE_VectorCall: + // Calling SVE functions is currently not yet supported. + report_fatal_error("Unsupported calling convention."); case CallingConv::WebKit_JS: return CC_AArch64_WebKit_JS; case CallingConv::GHC: @@ -3111,8 +3253,10 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, : CC_AArch64_DarwinPCS_VarArg; case CallingConv::Win64: return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; - case CallingConv::AArch64_VectorCall: - return CC_AArch64_AAPCS; + case CallingConv::CFGuard_Check: + return CC_AArch64_Win64_CFGuard_Check; + case CallingConv::AArch64_VectorCall: + return CC_AArch64_AAPCS; } } @@ -3848,11 +3992,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } // Walk the register/memloc assignments, inserting copies/loads. 
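// The volatile i128 store lowering above feeds AArch64ISD::STP with the low
// and high 64-bit halves of the value (EXTRACT_ELEMENT 0 and 1). A standalone
// sketch of that split, assuming the compiler provides unsigned __int128:
#include <cstdint>

struct Halves { uint64_t Lo, Hi; };

Halves splitI128(unsigned __int128 V) {
  Halves H;
  H.Lo = uint64_t(V);        // element 0: bits [63:0]
  H.Hi = uint64_t(V >> 64);  // element 1: bits [127:64]
  return H;
}

// A single "stp Lo, Hi, [addr]" then writes both halves without expanding the
// access into separate 64-bit stores.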
- for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e; - ++i, ++realArgIdx) { + for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { CCValAssign &VA = ArgLocs[i]; - SDValue Arg = OutVals[realArgIdx]; - ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags; + SDValue Arg = OutVals[i]; + ISD::ArgFlagsTy Flags = Outs[i].Flags; // Promote the value if needed. switch (VA.getLocInfo()) { @@ -3867,7 +4010,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg); break; case CCValAssign::AExt: - if (Outs[realArgIdx].ArgVT == MVT::i1) { + if (Outs[i].ArgVT == MVT::i1) { // AAPCS requires i1 to be zero-extended to 8-bits by the caller. Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg); Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg); @@ -3896,7 +4039,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } if (VA.isRegLoc()) { - if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && + if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() && Outs[0].VT == MVT::i64) { assert(VA.getLocVT() == MVT::i64 && "unexpected calling convention register assignment"); @@ -4014,14 +4157,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // node so that legalize doesn't hack it. if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) { auto GV = G->getGlobal(); - if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) == - AArch64II::MO_GOT) { - Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT); + unsigned OpFlags = + Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()); + if (OpFlags & AArch64II::MO_GOT) { + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); - } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) { - assert(Subtarget->isTargetWindows() && - "Windows is the only supported COFF target"); - Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT); } else { const GlobalValue *GV = G->getGlobal(); Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); @@ -4456,6 +4596,97 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op, return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1)); } +/// Convert a thread-local variable reference into a sequence of instructions to +/// compute the variable's address for the local exec TLS model of ELF targets. +/// The sequence depends on the maximum TLS area size. 
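// A standalone model of how the 32- and 48-bit cases below materialise the
// thread-local offset with MOVZ/MOVK before adding it to TPIDR_EL0; the
// function and parameter names are illustrative only.
#include <cstdint>

uint64_t tlsLocalExecAddr48(uint64_t ThreadBase, uint64_t TPOffset) {
  uint64_t Off = 0;
  Off  = ((TPOffset >> 32) & 0xFFFF) << 32;  // movz x0, #:tprel_g2:a
  Off |= ((TPOffset >> 16) & 0xFFFF) << 16;  // movk x0, #:tprel_g1_nc:a
  Off |=  (TPOffset        & 0xFFFF);        // movk x0, #:tprel_g0_nc:a
  return ThreadBase + Off;                   // add x0, x1, x0
}

// For the smaller TLS areas the same address is reached with one or two ADDs
// of the :tprel_hi12:/:tprel_lo12_nc: pieces instead of a MOVZ/MOVK chain.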
+SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV, + SDValue ThreadBase, + const SDLoc &DL, + SelectionDAG &DAG) const { + EVT PtrVT = getPointerTy(DAG.getDataLayout()); + SDValue TPOff, Addr; + + switch (DAG.getTarget().Options.TLSSize) { + default: + llvm_unreachable("Unexpected TLS size"); + + case 12: { + // mrs x0, TPIDR_EL0 + // add x0, x0, :tprel_lo12:a + SDValue Var = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF); + return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, + Var, + DAG.getTargetConstant(0, DL, MVT::i32)), + 0); + } + + case 24: { + // mrs x0, TPIDR_EL0 + // add x0, x0, :tprel_hi12:a + // add x0, x0, :tprel_lo12_nc:a + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); + Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, + HiVar, + DAG.getTargetConstant(0, DL, MVT::i32)), + 0); + return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr, + LoVar, + DAG.getTargetConstant(0, DL, MVT::i32)), + 0); + } + + case 32: { + // mrs x1, TPIDR_EL0 + // movz x0, #:tprel_g1:a + // movk x0, #:tprel_g0_nc:a + // add x0, x1, x0 + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(16, DL, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, + DAG.getTargetConstant(0, DL, MVT::i32)), + 0); + return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); + } + + case 48: { + // mrs x1, TPIDR_EL0 + // movz x0, #:tprel_g2:a + // movk x0, #:tprel_g1_nc:a + // movk x0, #:tprel_g0_nc:a + // add x0, x1, x0 + SDValue HiVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2); + SDValue MiVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC); + SDValue LoVar = DAG.getTargetGlobalAddress( + GV, DL, PtrVT, 0, + AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar, + DAG.getTargetConstant(32, DL, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar, + DAG.getTargetConstant(16, DL, MVT::i32)), + 0); + TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar, + DAG.getTargetConstant(0, DL, MVT::i32)), + 0); + return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff); + } + } +} + /// When accessing thread-local variables under either the general-dynamic or /// local-dynamic system, we make a "TLS-descriptor" call. The variable will /// have a descriptor, accessible via a PC-relative ADRP, and whose first entry @@ -4493,15 +4724,7 @@ SDValue AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->isTargetELF() && "This function expects an ELF target"); - if (getTargetMachine().getCodeModel() == CodeModel::Large) - report_fatal_error("ELF TLS only supported in small memory model"); - // Different choices can be made for the maximum size of the TLS area for a - // module. 
For the small address model, the default TLS size is 16MiB and the - // maximum TLS size is 4GiB. - // FIXME: add -mtls-size command line option and make it control the 16MiB - // vs. 4GiB code sequence generation. - // FIXME: add tiny codemodel support. We currently generate the same code as - // small, which may be larger than needed. + const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op); TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal()); @@ -4511,6 +4734,17 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, Model = TLSModel::GeneralDynamic; } + if (getTargetMachine().getCodeModel() == CodeModel::Large && + Model != TLSModel::LocalExec) + report_fatal_error("ELF TLS only supported in small memory model or " + "in local exec TLS model"); + // Different choices can be made for the maximum size of the TLS area for a + // module. For the small address model, the default TLS size is 16MiB and the + // maximum TLS size is 4GiB. + // FIXME: add tiny and large code model support for TLS access models other + // than local exec. We currently generate the same code as small for tiny, + // which may be larger than needed. + SDValue TPOff; EVT PtrVT = getPointerTy(DAG.getDataLayout()); SDLoc DL(Op); @@ -4519,23 +4753,7 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op, SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT); if (Model == TLSModel::LocalExec) { - SDValue HiVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12); - SDValue LoVar = DAG.getTargetGlobalAddress( - GV, DL, PtrVT, 0, - AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC); - - SDValue TPWithOff_lo = - SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase, - HiVar, - DAG.getTargetConstant(0, DL, MVT::i32)), - 0); - SDValue TPWithOff = - SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo, - LoVar, - DAG.getTargetConstant(0, DL, MVT::i32)), - 0); - return TPWithOff; + return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG); } else if (Model == TLSModel::InitialExec) { TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS); TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff); @@ -4961,8 +5179,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { if (LHS.getValueType().isInteger()) { SDValue CCVal; - SDValue Cmp = - getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl); + SDValue Cmp = getAArch64Cmp( + LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl); // Note that we inverted the condition above, so we reverse the order of // the true and false operands here. 
This will allow the setcc to be @@ -4981,7 +5199,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { AArch64CC::CondCode CC1, CC2; changeFPCCToAArch64CC(CC, CC1, CC2); if (CC2 == AArch64CC::AL) { - changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2); + changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1, + CC2); SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32); // Note that we inverted the condition above, so we reverse the order of @@ -5042,18 +5261,18 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } else if (TVal.getOpcode() == ISD::XOR) { // If TVal is a NOT we want to swap TVal and FVal so that we can match // with a CSINV rather than a CSEL. if (isAllOnesConstant(TVal.getOperand(1))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } } else if (TVal.getOpcode() == ISD::SUB) { // If TVal is a negation (SUB from 0) we want to swap TVal and FVal so @@ -5061,7 +5280,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, if (isNullConstant(TVal.getOperand(0))) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } } else if (CTVal && CFVal) { const int64_t TrueVal = CTVal->getSExtValue(); @@ -5104,7 +5323,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS, if (Swap) { std::swap(TVal, FVal); std::swap(CTVal, CFVal); - CC = ISD::getSetCCInverse(CC, true); + CC = ISD::getSetCCInverse(CC, LHS.getValueType()); } if (Opcode != AArch64ISD::CSEL) { @@ -5531,7 +5750,7 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op, // FIXME? Maybe this could be a TableGen attribute on some registers and // this table could be generated automatically from RegInfo. Register AArch64TargetLowering:: -getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const { +getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const { Register Reg = MatchRegisterName(RegName); if (AArch64::X1 <= Reg && Reg <= AArch64::X28) { const MCRegisterInfo *MRI = Subtarget->getRegisterInfo(); @@ -6946,19 +7165,55 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, // Otherwise, duplicate from the lane of the input vector. unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType()); - // SelectionDAGBuilder may have "helpfully" already extracted or conatenated - // to make a vector of the same size as this SHUFFLE. We can ignore the - // extract entirely, and canonicalise the concat using WidenVector. - if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { - Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue(); + // Try to eliminate a bitcasted extract subvector before a DUPLANE. 
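// A standalone sketch of the index arithmetic used by getScaledOffsetDup
// below: the extract index is rescaled into the bitcast element size and
// folded into the DUP lane. Names are illustrative only.
#include <optional>

std::optional<unsigned> scaledDupLane(unsigned Lane, unsigned ExtIdx,
                                      unsigned SrcEltBits,
                                      unsigned CastEltBits) {
  unsigned ExtIdxInBits = ExtIdx * SrcEltBits;
  if (ExtIdxInBits % CastEltBits != 0)   // index must align in the cast type
    return std::nullopt;
  return Lane + ExtIdxInBits / CastEltBits;
}

// dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1
//   -> scaledDupLane(1, 1, 64, 32) == 3, i.e. dup v4f32 X, 3
// dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1
//   -> scaledDupLane(1, 8, 8, 16) == 5, i.e. dup v8i16 X, 5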
+ auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) { + // Match: dup (bitcast (extract_subv X, C)), LaneC + if (BitCast.getOpcode() != ISD::BITCAST || + BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR) + return false; + + // The extract index must align in the destination type. That may not + // happen if the bitcast is from narrow to wide type. + SDValue Extract = BitCast.getOperand(0); + unsigned ExtIdx = Extract.getConstantOperandVal(1); + unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits(); + unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth; + unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits(); + if (ExtIdxInBits % CastedEltBitWidth != 0) + return false; + + // Update the lane value by offsetting with the scaled extract index. + LaneC += ExtIdxInBits / CastedEltBitWidth; + + // Determine the casted vector type of the wide vector input. + // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC' + // Examples: + // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3 + // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5 + unsigned SrcVecNumElts = + Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth; + CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(), + SrcVecNumElts); + return true; + }; + MVT CastVT; + if (getScaledOffsetDup(V1, Lane, CastVT)) { + V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0)); + } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) { + // The lane is incremented by the index of the extract. + // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3 + Lane += V1.getConstantOperandVal(1); V1 = V1.getOperand(0); } else if (V1.getOpcode() == ISD::CONCAT_VECTORS) { + // The lane is decremented if we are splatting from the 2nd operand. + // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1 unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2; Lane -= Idx * VT.getVectorNumElements() / 2; V1 = WidenVector(V1.getOperand(Idx), DAG); - } else if (VT.getSizeInBits() == 64) + } else if (VT.getSizeInBits() == 64) { + // Widen the operand to 128-bit register with undef. V1 = WidenVector(V1, DAG); - + } return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64)); } @@ -7077,26 +7332,31 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op, switch (ElemVT.getSimpleVT().SimpleTy) { case MVT::i8: case MVT::i16: + case MVT::i32: SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32); - break; + return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); case MVT::i64: SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); - break; - case MVT::i32: - // Fine as is - break; - // TODO: we can support splats of i1s and float types, but haven't added - // patterns yet. - case MVT::i1: + return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); + case MVT::i1: { + // The general case of i1. There isn't any natural way to do this, + // so we use some trickery with whilelo. + // TODO: Add special cases for splat of constant true/false. + SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64); + SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal, + DAG.getValueType(MVT::i1)); + SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl, + MVT::i64); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID, + DAG.getConstant(0, dl, MVT::i64), SplatVal); + } + // TODO: we can support float types, but haven't added patterns yet. 
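// A scalar model of the i1 splat trick above: the bit is sign-extended to an
// all-ones or all-zero 64-bit value and fed to WHILELO, whose lane i is
// active exactly when i compares unsigned-less-than the limit. The fixed
// length N stands in for the runtime predicate length; names are
// illustrative only.
#include <cstdint>
#include <vector>

std::vector<bool> whilelo(uint64_t Base, uint64_t Limit, unsigned N) {
  std::vector<bool> P(N);
  for (unsigned I = 0; I < N; ++I)
    P[I] = (Base + I) < Limit;        // unsigned compare, as in WHILELO
  return P;
}

std::vector<bool> splatI1(bool B, unsigned N) {
  uint64_t SExt = B ? ~uint64_t(0) : 0;   // sign-extend the single bit
  return whilelo(0, SExt, N);             // all lanes true iff B was true
}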
case MVT::f16: case MVT::f32: case MVT::f64: default: - llvm_unreachable("Unsupported SPLAT_VECTOR input operand type"); - break; + report_fatal_error("Unsupported SPLAT_VECTOR input operand type"); } - - return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal); } static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits, @@ -8443,6 +8703,26 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = Align(16); Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile; return true; + case Intrinsic::aarch64_sve_ldnt1: { + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(1); + Info.offset = 0; + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal; + return true; + } + case Intrinsic::aarch64_sve_stnt1: { + PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType()); + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(PtrTy->getElementType()); + Info.ptrVal = I.getArgOperand(2); + Info.offset = 0; + Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType())); + Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal; + return true; + } default: break; } @@ -8515,11 +8795,12 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const { return true; const TargetOptions &Options = getTargetMachine().Options; - const DataLayout &DL = I->getModule()->getDataLayout(); - EVT VT = getValueType(DL, User->getOperand(0)->getType()); + const Function *F = I->getFunction(); + const DataLayout &DL = F->getParent()->getDataLayout(); + Type *Ty = User->getOperand(0)->getType(); - return !(isFMAFasterThanFMulAndFAdd(VT) && - isOperationLegalOrCustom(ISD::FMA, VT) && + return !(isFMAFasterThanFMulAndFAdd(*F, Ty) && + isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) && (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath)); } @@ -9176,7 +9457,8 @@ int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL, return -1; } -bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { +bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd( + const MachineFunction &MF, EVT VT) const { VT = VT.getScalarType(); if (!VT.isSimple()) @@ -9193,6 +9475,17 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { return false; } +bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F, + Type *Ty) const { + switch (Ty->getScalarType()->getTypeID()) { + case Type::FloatTyID: + case Type::DoubleTyID: + return true; + default: + return false; + } +} + const MCPhysReg * AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const { // LR is a callee-save register, but we must treat it as clobbered by any call @@ -9363,6 +9656,19 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor, return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA); } +static bool IsSVECntIntrinsic(SDValue S) { + switch(getIntrinsicID(S.getNode())) { + default: + break; + case Intrinsic::aarch64_sve_cntb: + case Intrinsic::aarch64_sve_cnth: + case Intrinsic::aarch64_sve_cntw: + case Intrinsic::aarch64_sve_cntd: + return true; + } + return false; +} + static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ 
-9373,9 +9679,18 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, if (!isa<ConstantSDNode>(N->getOperand(1))) return SDValue(); + SDValue N0 = N->getOperand(0); ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1)); const APInt &ConstValue = C->getAPIntValue(); + // Allow the scaling to be folded into the `cnt` instruction by preventing + // the scaling to be obscured here. This makes it easier to pattern match. + if (IsSVECntIntrinsic(N0) || + (N0->getOpcode() == ISD::TRUNCATE && + (IsSVECntIntrinsic(N0->getOperand(0))))) + if (ConstValue.sge(1) && ConstValue.sle(16)) + return SDValue(); + // Multiplication of a power of two plus/minus one can be done more // cheaply as as shift+add/sub. For now, this is true unilaterally. If // future CPUs have a cheaper MADD instruction, this may need to be @@ -9386,7 +9701,6 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG, // e.g. 6=3*2=(2+1)*2. // TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45 // which equals to (1+2)*16-(1+2). - SDValue N0 = N->getOperand(0); // TrailingZeroes is used to test if the mul can be lowered to // shift+add+shift. unsigned TrailingZeroes = ConstValue.countTrailingZeros(); @@ -9821,6 +10135,67 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, return SDValue(); } +static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) { + if (!MemVT.getVectorElementType().isSimple()) + return false; + + uint64_t MaskForTy = 0ull; + switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) { + case MVT::i8: + MaskForTy = 0xffull; + break; + case MVT::i16: + MaskForTy = 0xffffull; + break; + case MVT::i32: + MaskForTy = 0xffffffffull; + break; + default: + return false; + break; + } + + if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR) + if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0))) + return Op0->getAPIntValue().getLimitedValue() == MaskForTy; + + return false; +} + +static SDValue performSVEAndCombine(SDNode *N, + TargetLowering::DAGCombinerInfo &DCI) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue Src = N->getOperand(0); + SDValue Mask = N->getOperand(1); + + if (!Src.hasOneUse()) + return SDValue(); + + // GLD1* instructions perform an implicit zero-extend, which makes them + // perfect candidates for combining. + switch (Src->getOpcode()) { + case AArch64ISD::GLD1: + case AArch64ISD::GLD1_SCALED: + case AArch64ISD::GLD1_SXTW: + case AArch64ISD::GLD1_SXTW_SCALED: + case AArch64ISD::GLD1_UXTW: + case AArch64ISD::GLD1_UXTW_SCALED: + case AArch64ISD::GLD1_IMM: + break; + default: + return SDValue(); + } + + EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); + + if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT)) + return Src; + + return SDValue(); +} + static SDValue performANDCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; @@ -9829,6 +10204,9 @@ static SDValue performANDCombine(SDNode *N, if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); + if (VT.isScalableVector()) + return performSVEAndCombine(N, DCI); + BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode()); if (!BVN) @@ -9889,74 +10267,6 @@ static SDValue performSRLCombine(SDNode *N, return SDValue(); } -static SDValue performBitcastCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. 
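// A scalar model of the performSVEAndCombine fold above: GLD1* already
// zero-extends from the memory element type, so an AND with a splat of that
// type's mask changes nothing and can be dropped. The helper name is
// illustrative only.
#include <cstdint>

bool andIsRedundant(unsigned MemEltBits, uint64_t SplatMask) {
  if (MemEltBits != 8 && MemEltBits != 16 && MemEltBits != 32)
    return false;                    // only the widening cases are interesting
  uint64_t MaskForTy = (uint64_t(1) << MemEltBits) - 1;
  return SplatMask == MaskForTy;     // e.g. 0xFF for i8, 0xFFFF for i16
}

// A gather that loaded i8 elements into 64-bit lanes already produces values
// masked to 0xFF, so a following AND with a splat of 0xFF is a no-op.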
That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Remove extraneous bitcasts around an extract_subvector. - // For example, - // (v4i16 (bitconvert - // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1))))) - // becomes - // (extract_subvector ((v8i16 ...), (i64 4))) - - // Only interested in 64-bit vectors as the ultimate result. - EVT VT = N->getValueType(0); - if (!VT.isVector()) - return SDValue(); - if (VT.getSimpleVT().getSizeInBits() != 64) - return SDValue(); - // Is the operand an extract_subvector starting at the beginning or halfway - // point of the vector? A low half may also come through as an - // EXTRACT_SUBREG, so look for that, too. - SDValue Op0 = N->getOperand(0); - if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR && - !(Op0->isMachineOpcode() && - Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG)) - return SDValue(); - uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue(); - if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) { - if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0) - return SDValue(); - } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) { - if (idx != AArch64::dsub) - return SDValue(); - // The dsub reference is equivalent to a lane zero subvector reference. - idx = 0; - } - // Look through the bitcast of the input to the extract. - if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST) - return SDValue(); - SDValue Source = Op0->getOperand(0)->getOperand(0); - // If the source type has twice the number of elements as our destination - // type, we know this is an extract of the high or low half of the vector. - EVT SVT = Source->getValueType(0); - if (!SVT.isVector() || - SVT.getVectorNumElements() != VT.getVectorNumElements() * 2) - return SDValue(); - - LLVM_DEBUG( - dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n"); - - // Create the simplified form to just extract the low or high half of the - // vector directly rather than bothering with the bitcasts. 
- SDLoc dl(N); - unsigned NumElements = VT.getVectorNumElements(); - if (idx) { - SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); - } else { - SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); - return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, - Source, SubReg), - 0); - } -} - static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -10263,10 +10573,10 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) { MVT::i32); Cmp = *InfoAndKind.Info.AArch64.Cmp; } else - Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0, - *InfoAndKind.Info.Generic.Opnd1, - ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true), - CCVal, DAG, dl); + Cmp = getAArch64Cmp( + *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1, + ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG, + dl); EVT VT = Op->getValueType(0); LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT)); @@ -10456,6 +10766,154 @@ static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N, DAG.getConstant(0, dl, MVT::i64)); } +static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc, + SelectionDAG &DAG) { + SDLoc dl(N); + LLVMContext &Ctx = *DAG.getContext(); + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + EVT VT = N->getValueType(0); + SDValue Pred = N->getOperand(1); + SDValue Data = N->getOperand(2); + EVT DataVT = Data.getValueType(); + + if (DataVT.getVectorElementType().isScalarInteger() && + (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)) { + if (!TLI.isTypeLegal(DataVT)) + return SDValue(); + + EVT OutputVT = EVT::getVectorVT(Ctx, VT, + AArch64::NeonBitsPerVector / VT.getSizeInBits()); + SDValue Reduce = DAG.getNode(Opc, dl, OutputVT, Pred, Data); + SDValue Zero = DAG.getConstant(0, dl, MVT::i64); + SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Reduce, Zero); + + return Result; + } + + return SDValue(); +} + +static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) { + SDLoc dl(N); + LLVMContext &Ctx = *DAG.getContext(); + EVT VT = N->getValueType(0); + + assert(VT.isScalableVector() && "Expected a scalable vector."); + + // Current lowering only supports the SVE-ACLE types. + if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock) + return SDValue(); + + unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8; + unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8; + EVT ByteVT = EVT::getVectorVT(Ctx, MVT::i8, { ByteSize, true }); + + // Convert everything to the domain of EXT (i.e bytes). 
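// A byte-level model of that conversion: EXT reads from the concatenation of
// its two inputs starting at a byte count, so an element index is first
// scaled by the element size in bytes. Fixed std::vector sizes here are for
// illustration only.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<uint8_t> extBytes(const std::vector<uint8_t> &Lo,
                              const std::vector<uint8_t> &Hi,
                              size_t ByteOffset) {
  std::vector<uint8_t> Out(Lo.size());
  for (size_t I = 0; I < Out.size(); ++I) {
    size_t Src = ByteOffset + I;               // read from concat(Lo, Hi)
    Out[I] = Src < Lo.size() ? Lo[Src] : Hi[Src - Lo.size()];
  }
  return Out;
}

// For element index E of T-byte elements, ByteOffset == E * sizeof(T), which
// is exactly the multiply by ElemSize in the code that follows.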
+ SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1)); + SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2)); + SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3), + DAG.getConstant(ElemSize, dl, MVT::i32)); + + SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2); + return DAG.getNode(ISD::BITCAST, dl, VT, EXT); +} + +static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID, + bool Invert, + TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalize()) + return SDValue(); + + SDValue Comparator = N->getOperand(3); + if (Comparator.getOpcode() == AArch64ISD::DUP || + Comparator.getOpcode() == ISD::SPLAT_VECTOR) { + unsigned IID = getIntrinsicID(N); + EVT VT = N->getValueType(0); + EVT CmpVT = N->getOperand(2).getValueType(); + SDValue Pred = N->getOperand(1); + SDValue Imm; + SDLoc DL(N); + + switch (IID) { + default: + llvm_unreachable("Called with wrong intrinsic!"); + break; + + // Signed comparisons + case Intrinsic::aarch64_sve_cmpeq_wide: + case Intrinsic::aarch64_sve_cmpne_wide: + case Intrinsic::aarch64_sve_cmpge_wide: + case Intrinsic::aarch64_sve_cmpgt_wide: + case Intrinsic::aarch64_sve_cmplt_wide: + case Intrinsic::aarch64_sve_cmple_wide: { + if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) { + int64_t ImmVal = CN->getSExtValue(); + if (ImmVal >= -16 && ImmVal <= 15) + Imm = DAG.getConstant(ImmVal, DL, MVT::i32); + else + return SDValue(); + } + break; + } + // Unsigned comparisons + case Intrinsic::aarch64_sve_cmphs_wide: + case Intrinsic::aarch64_sve_cmphi_wide: + case Intrinsic::aarch64_sve_cmplo_wide: + case Intrinsic::aarch64_sve_cmpls_wide: { + if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) { + uint64_t ImmVal = CN->getZExtValue(); + if (ImmVal <= 127) + Imm = DAG.getConstant(ImmVal, DL, MVT::i32); + else + return SDValue(); + } + break; + } + } + + SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm); + SDValue ID = DAG.getTargetConstant(ReplacementIID, DL, MVT::i64); + SDValue Op0, Op1; + if (Invert) { + Op0 = Splat; + Op1 = N->getOperand(2); + } else { + Op0 = N->getOperand(2); + Op1 = Splat; + } + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT, + ID, Pred, Op0, Op1); + } + + return SDValue(); +} + +static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op, + AArch64CC::CondCode Cond) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + + SDLoc DL(Op); + assert(Op.getValueType().isScalableVector() && + TLI.isTypeLegal(Op.getValueType()) && + "Expected legal scalable vector type!"); + + // Ensure target specific opcodes are using legal type. + EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT); + SDValue TVal = DAG.getConstant(1, DL, OutVT); + SDValue FVal = DAG.getConstant(0, DL, OutVT); + + // Set condition code (CC) flags. + SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op); + + // Convert CC to integer based on requested condition. + // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare. 
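// A standalone sketch of the immediate check in tryConvertSVEWideCompare
// above: a splatted comparator can use the immediate compare forms when it
// fits the signed [-16, 15] or unsigned [0, 127] encodings, and the wide
// "less-than" variants are rewritten as the opposite compare with the splat
// moved to the first operand (the Invert=true paths). Names are illustrative
// only.
#include <cstdint>

bool fitsSignedCmpImm(int64_t V)    { return V >= -16 && V <= 15; }
bool fitsUnsignedCmpImm(uint64_t V) { return V <= 127; }

// e.g. a compare "x < splat(7)" becomes the greater-than form with the splat
// as the first operand, which is why cmplt/cmple and cmplo/cmpls pass
// Invert=true to the combine above.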
+ SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32); + SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test); + return DAG.getZExtOrTrunc(Res, DL, VT); +} + static SDValue performIntrinsicCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, const AArch64Subtarget *Subtarget) { @@ -10510,6 +10968,61 @@ static SDValue performIntrinsicCombine(SDNode *N, case Intrinsic::aarch64_crc32h: case Intrinsic::aarch64_crc32ch: return tryCombineCRC32(0xffff, N, DAG); + case Intrinsic::aarch64_sve_smaxv: + return LowerSVEIntReduction(N, AArch64ISD::SMAXV_PRED, DAG); + case Intrinsic::aarch64_sve_umaxv: + return LowerSVEIntReduction(N, AArch64ISD::UMAXV_PRED, DAG); + case Intrinsic::aarch64_sve_sminv: + return LowerSVEIntReduction(N, AArch64ISD::SMINV_PRED, DAG); + case Intrinsic::aarch64_sve_uminv: + return LowerSVEIntReduction(N, AArch64ISD::UMINV_PRED, DAG); + case Intrinsic::aarch64_sve_orv: + return LowerSVEIntReduction(N, AArch64ISD::ORV_PRED, DAG); + case Intrinsic::aarch64_sve_eorv: + return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG); + case Intrinsic::aarch64_sve_andv: + return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG); + case Intrinsic::aarch64_sve_ext: + return LowerSVEIntrinsicEXT(N, DAG); + case Intrinsic::aarch64_sve_cmpeq_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpeq, + false, DCI, DAG); + case Intrinsic::aarch64_sve_cmpne_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpne, + false, DCI, DAG); + case Intrinsic::aarch64_sve_cmpge_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge, + false, DCI, DAG); + case Intrinsic::aarch64_sve_cmpgt_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt, + false, DCI, DAG); + case Intrinsic::aarch64_sve_cmplt_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt, + true, DCI, DAG); + case Intrinsic::aarch64_sve_cmple_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge, + true, DCI, DAG); + case Intrinsic::aarch64_sve_cmphs_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, + false, DCI, DAG); + case Intrinsic::aarch64_sve_cmphi_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, + false, DCI, DAG); + case Intrinsic::aarch64_sve_cmplo_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, true, + DCI, DAG); + case Intrinsic::aarch64_sve_cmpls_wide: + return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, true, + DCI, DAG); + case Intrinsic::aarch64_sve_ptest_any: + return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), + AArch64CC::ANY_ACTIVE); + case Intrinsic::aarch64_sve_ptest_first: + return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), + AArch64CC::FIRST_ACTIVE); + case Intrinsic::aarch64_sve_ptest_last: + return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2), + AArch64CC::LAST_ACTIVE); } return SDValue(); } @@ -10652,6 +11165,48 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St, return NewST1; } +static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + EVT PtrTy = N->getOperand(3).getValueType(); + + EVT LoadVT = VT; + if (VT.isFloatingPoint()) + LoadVT = VT.changeTypeToInteger(); + + auto *MINode = cast<MemIntrinsicSDNode>(N); + SDValue PassThru = DAG.getConstant(0, DL, LoadVT); + SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(), + 
MINode->getOperand(3), DAG.getUNDEF(PtrTy), + MINode->getOperand(2), PassThru, + MINode->getMemoryVT(), MINode->getMemOperand(), + ISD::UNINDEXED, ISD::NON_EXTLOAD, false); + + if (VT.isFloatingPoint()) { + SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) }; + return DAG.getMergeValues(Ops, DL); + } + + return L; +} + +static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) { + SDLoc DL(N); + + SDValue Data = N->getOperand(2); + EVT DataVT = Data.getValueType(); + EVT PtrTy = N->getOperand(4).getValueType(); + + if (DataVT.isFloatingPoint()) + Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data); + + auto *MINode = cast<MemIntrinsicSDNode>(N); + return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4), + DAG.getUNDEF(PtrTy), MINode->getOperand(3), + MINode->getMemoryVT(), MINode->getMemOperand(), + ISD::UNINDEXED, false, false); +} + /// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The /// load store optimizer pass will merge them to store pair stores. This should /// be better than a movi to create the vector zero followed by a vector store @@ -11703,6 +12258,215 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG, DAG.getConstant(MinOffset, DL, MVT::i64)); } +// Returns an SVE type that ContentTy can be trivially sign or zero extended +// into. +static MVT getSVEContainerType(EVT ContentTy) { + assert(ContentTy.isSimple() && "No SVE containers for extended types"); + + switch (ContentTy.getSimpleVT().SimpleTy) { + default: + llvm_unreachable("No known SVE container for this MVT type"); + case MVT::nxv2i8: + case MVT::nxv2i16: + case MVT::nxv2i32: + case MVT::nxv2i64: + case MVT::nxv2f32: + case MVT::nxv2f64: + return MVT::nxv2i64; + case MVT::nxv4i8: + case MVT::nxv4i16: + case MVT::nxv4i32: + case MVT::nxv4f32: + return MVT::nxv4i32; + } +} + +static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode, + bool OnlyPackedOffsets = true) { + const SDValue Src = N->getOperand(2); + const EVT SrcVT = Src->getValueType(0); + assert(SrcVT.isScalableVector() && + "Scatter stores are only possible for SVE vectors"); + + SDLoc DL(N); + MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT(); + + // Make sure that source data will fit into an SVE register + if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) + return SDValue(); + + // For FPs, ACLE only supports _packed_ single and double precision types. + if (SrcElVT.isFloatingPoint()) + if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64)) + return SDValue(); + + // Depending on the addressing mode, this is either a pointer or a vector of + // pointers (that fits into one register) + const SDValue Base = N->getOperand(4); + // Depending on the addressing mode, this is either a single offset or a + // vector of offsets (that fits into one register) + SDValue Offset = N->getOperand(5); + + auto &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(Base.getValueType())) + return SDValue(); + + // Some scatter store variants allow unpacked offsets, but only as nxv2i32 + // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to + // nxv2i64. Legalize accordingly. 
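// A per-lane model of the offset widening handled just below: the SXTW/UXTW
// addressing modes extend each 32-bit offset to 64 bits in hardware, so at
// the DAG level an ANY_EXTEND is enough and the sign/zero choice stays
// encoded in the opcode. Function names are illustrative only.
#include <cstdint>

int64_t  sxtw(uint32_t W) { return int64_t(int32_t(W)); }  // sign-extend word
uint64_t uxtw(uint32_t W) { return uint64_t(W); }          // zero-extend word

// e.g. sxtw(0xFFFFFFFF) == -1, while uxtw(0xFFFFFFFF) == 4294967295.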
+ if (!OnlyPackedOffsets && + Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) + Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0); + + if (!TLI.isTypeLegal(Offset.getValueType())) + return SDValue(); + + // Source value type that is representable in hardware + EVT HwSrcVt = getSVEContainerType(SrcVT); + + // Keep the original type of the input data to store - this is needed to + // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the + // integer equivalent, so just use HwSrcVt. + SDValue InputVT = DAG.getValueType(SrcVT); + if (SrcVT.isFloatingPoint()) + InputVT = DAG.getValueType(HwSrcVt); + + SDVTList VTs = DAG.getVTList(MVT::Other); + SDValue SrcNew; + + if (Src.getValueType().isFloatingPoint()) + SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src); + else + SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src); + + SDValue Ops[] = {N->getOperand(0), // Chain + SrcNew, + N->getOperand(3), // Pg + Base, + Offset, + InputVT}; + + return DAG.getNode(Opcode, DL, VTs, Ops); +} + +static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG, + unsigned Opcode, + bool OnlyPackedOffsets = true) { + EVT RetVT = N->getValueType(0); + assert(RetVT.isScalableVector() && + "Gather loads are only possible for SVE vectors"); + SDLoc DL(N); + + if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock) + return SDValue(); + + // Depending on the addressing mode, this is either a pointer or a vector of + // pointers (that fits into one register) + const SDValue Base = N->getOperand(3); + // Depending on the addressing mode, this is either a single offset or a + // vector of offsets (that fits into one register) + SDValue Offset = N->getOperand(4); + + auto &TLI = DAG.getTargetLoweringInfo(); + if (!TLI.isTypeLegal(Base.getValueType())) + return SDValue(); + + // Some gather load variants allow unpacked offsets, but only as nxv2i32 + // vectors. These are implicitly sign (sxtw) or zero (zxtw) extend to + // nxv2i64. Legalize accordingly. + if (!OnlyPackedOffsets && + Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32) + Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0); + + // Return value type that is representable in hardware + EVT HwRetVt = getSVEContainerType(RetVT); + + // Keep the original output value type around - this will better inform + // optimisations (e.g. instruction folding when load is followed by + // zext/sext). This will only be used for ints, so the value for FPs + // doesn't matter. + SDValue OutVT = DAG.getValueType(RetVT); + if (RetVT.isFloatingPoint()) + OutVT = DAG.getValueType(HwRetVt); + + SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other); + SDValue Ops[] = {N->getOperand(0), // Chain + N->getOperand(2), // Pg + Base, Offset, OutVT}; + + SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops); + SDValue LoadChain = SDValue(Load.getNode(), 1); + + if (RetVT.isInteger() && (RetVT != HwRetVt)) + Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0)); + + // If the original return value was FP, bitcast accordingly. Doing it here + // means that we can avoid adding TableGen patterns for FPs. 
+ if (RetVT.isFloatingPoint()) + Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0)); + + return DAG.getMergeValues({Load, LoadChain}, DL); +} + + +static SDValue +performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, + SelectionDAG &DAG) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue Src = N->getOperand(0); + unsigned Opc = Src->getOpcode(); + + // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates + // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes. + unsigned NewOpc; + switch (Opc) { + case AArch64ISD::GLD1: + NewOpc = AArch64ISD::GLD1S; + break; + case AArch64ISD::GLD1_SCALED: + NewOpc = AArch64ISD::GLD1S_SCALED; + break; + case AArch64ISD::GLD1_SXTW: + NewOpc = AArch64ISD::GLD1S_SXTW; + break; + case AArch64ISD::GLD1_SXTW_SCALED: + NewOpc = AArch64ISD::GLD1S_SXTW_SCALED; + break; + case AArch64ISD::GLD1_UXTW: + NewOpc = AArch64ISD::GLD1S_UXTW; + break; + case AArch64ISD::GLD1_UXTW_SCALED: + NewOpc = AArch64ISD::GLD1S_UXTW_SCALED; + break; + case AArch64ISD::GLD1_IMM: + NewOpc = AArch64ISD::GLD1S_IMM; + break; + default: + return SDValue(); + } + + EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT(); + EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT(); + + if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse()) + return SDValue(); + + EVT DstVT = N->getValueType(0); + SDVTList VTs = DAG.getVTList(DstVT, MVT::Other); + SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2), + Src->getOperand(3), Src->getOperand(4)}; + + SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops); + DCI.CombineTo(N, ExtLoad); + DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1)); + + // Return N so it doesn't get rechecked + return SDValue(N, 0); +} + SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -11737,8 +12501,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: return performExtendCombine(N, DCI, DAG); - case ISD::BITCAST: - return performBitcastCombine(N, DCI, DAG); + case ISD::SIGN_EXTEND_INREG: + return performSignExtendInRegCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: @@ -11789,6 +12553,46 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, case Intrinsic::aarch64_neon_st3lane: case Intrinsic::aarch64_neon_st4lane: return performNEONPostLDSTCombine(N, DCI, DAG); + case Intrinsic::aarch64_sve_ldnt1: + return performLDNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_stnt1: + return performSTNT1Combine(N, DAG); + case Intrinsic::aarch64_sve_ld1_gather: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1); + case Intrinsic::aarch64_sve_ld1_gather_index: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED); + case Intrinsic::aarch64_sve_ld1_gather_sxtw: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ld1_gather_uxtw: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ld1_gather_sxtw_index: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_ld1_gather_uxtw_index: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED, + /*OnlyPackedOffsets=*/false); + case 
Intrinsic::aarch64_sve_ld1_gather_imm: + return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM); + case Intrinsic::aarch64_sve_st1_scatter: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1); + case Intrinsic::aarch64_sve_st1_scatter_index: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED); + case Intrinsic::aarch64_sve_st1_scatter_sxtw: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_uxtw: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_sxtw_index: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_uxtw_index: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED, + /*OnlyPackedOffsets=*/false); + case Intrinsic::aarch64_sve_st1_scatter_imm: + return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM); default: break; } @@ -12084,6 +12888,69 @@ void AArch64TargetLowering::ReplaceNodeResults( case ISD::ATOMIC_CMP_SWAP: ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget); return; + case ISD::LOAD: { + assert(SDValue(N, 0).getValueType() == MVT::i128 && + "unexpected load's value type"); + LoadSDNode *LoadNode = cast<LoadSDNode>(N); + if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) { + // Non-volatile loads are optimized later in AArch64's load/store + // optimizer. + return; + } + + SDValue Result = DAG.getMemIntrinsicNode( + AArch64ISD::LDP, SDLoc(N), + DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}), + {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(), + LoadNode->getMemOperand()); + + SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, + Result.getValue(0), Result.getValue(1)); + Results.append({Pair, Result.getValue(2) /* Chain */}); + return; + } + case ISD::INTRINSIC_WO_CHAIN: { + EVT VT = N->getValueType(0); + assert((VT == MVT::i8 || VT == MVT::i16) && + "custom lowering for unexpected type"); + + ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0)); + Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue()); + switch (IntID) { + default: + return; + case Intrinsic::aarch64_sve_clasta_n: { + SDLoc DL(N); + auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); + auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32, + N->getOperand(1), Op2, N->getOperand(3)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); + return; + } + case Intrinsic::aarch64_sve_clastb_n: { + SDLoc DL(N); + auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2)); + auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32, + N->getOperand(1), Op2, N->getOperand(3)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); + return; + } + case Intrinsic::aarch64_sve_lasta: { + SDLoc DL(N); + auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32, + N->getOperand(1), N->getOperand(2)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); + return; + } + case Intrinsic::aarch64_sve_lastb: { + SDLoc DL(N); + auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32, + N->getOperand(1), N->getOperand(2)); + Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V)); + return; + } + } + } } } @@ -12351,7 +13218,7 @@ bool AArch64TargetLowering:: bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const { if 
(DAG.getMachineFunction().getFunction().hasMinSize() && - !Subtarget->isTargetWindows()) + !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin()) return false; return true; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 00fa96bc4e6d..672dfc4fcbc0 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -155,6 +155,14 @@ enum NodeType : unsigned { SMAXV, UMAXV, + SMAXV_PRED, + UMAXV_PRED, + SMINV_PRED, + UMINV_PRED, + ORV_PRED, + EORV_PRED, + ANDV_PRED, + // Vector bitwise negation NOT, @@ -196,6 +204,43 @@ enum NodeType : unsigned { UUNPKHI, UUNPKLO, + CLASTA_N, + CLASTB_N, + LASTA, + LASTB, + REV, + TBL, + + INSR, + PTEST, + PTRUE, + + // Unsigned gather loads. + GLD1, + GLD1_SCALED, + GLD1_UXTW, + GLD1_SXTW, + GLD1_UXTW_SCALED, + GLD1_SXTW_SCALED, + GLD1_IMM, + + // Signed gather loads + GLD1S, + GLD1S_SCALED, + GLD1S_UXTW, + GLD1S_SXTW, + GLD1S_UXTW_SCALED, + GLD1S_SXTW_SCALED, + GLD1S_IMM, + // Scatter store + SST1, + SST1_SCALED, + SST1_UXTW, + SST1_SXTW, + SST1_UXTW_SCALED, + SST1_SXTW_SCALED, + SST1_IMM, + // NEON Load/Store with post-increment base updates LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE, LD3post, @@ -224,8 +269,10 @@ enum NodeType : unsigned { STG, STZG, ST2G, - STZ2G + STZ2G, + LDP, + STP }; } // end namespace AArch64ISD @@ -396,7 +443,9 @@ public: /// Return true if an FMA operation is faster than a pair of fmul and fadd /// instructions. fmuladd intrinsics will be expanded to FMAs when this method /// returns true, otherwise fmuladd is expanded to fmul + fadd. - bool isFMAFasterThanFMulAndFAdd(EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, + EVT VT) const override; + bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const override; const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override; @@ -648,6 +697,8 @@ private: SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerELFTLSLocalExec(const GlobalValue *GV, SDValue ThreadBase, + const SDLoc &DL, SelectionDAG &DAG) const; SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL, SelectionDAG &DAG) const; SDValue LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const; @@ -713,7 +764,7 @@ private: unsigned combineRepeatedFPDivisors() const override; ConstraintType getConstraintType(StringRef Constraint) const override; - Register getRegisterByName(const char* RegName, EVT VT, + Register getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const override; /// Examine constraint string and operand type and determine a weight value. 
@@ -741,6 +792,7 @@ private: return TargetLowering::getInlineAsmMemConstraint(ConstraintCode); } + bool isVectorLoadExtDesirable(SDValue ExtVal) const override; bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override; bool mayBeEmittedAsTailCall(const CallInst *CI) const override; bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset, diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td index 459b53923625..27e1d8ee6b98 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td @@ -15,9 +15,9 @@ //===---------------------------------- let AddedComplexity = 15, Size = 0 in def CompilerBarrier : Pseudo<(outs), (ins i32imm:$ordering), - [(atomic_fence imm:$ordering, 0)]>, Sched<[]>; -def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>; -def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>; + [(atomic_fence timm:$ordering, 0)]>, Sched<[]>; +def : Pat<(atomic_fence (i64 4), (timm)), (DMB (i32 0x9))>; +def : Pat<(atomic_fence (timm), (timm)), (DMB (i32 0xb))>; //===---------------------------------- // Atomic loads diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index f555e4123307..c3efe03a0987 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -305,7 +305,7 @@ def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> { } def SImm8Operand : SImmOperand<8>; -def simm8 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -128 && Imm < 127; }]> { +def simm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -128 && Imm < 127; }]> { let ParserMatchClass = SImm8Operand; let DecoderMethod = "DecodeSImm<8>"; } @@ -358,6 +358,16 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>; def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>; def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>; +def UImmS2XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64); +}]>; +def UImmS4XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64); +}]>; +def UImmS8XForm : SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64); +}]>; + // uimm5sN predicate - True if the immediate is a multiple of N in the range // [0 * N, 32 * N]. 
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>; @@ -365,17 +375,41 @@ def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>; def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>; def uimm5s2 : Operand<i64>, ImmLeaf<i64, - [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }], + UImmS2XForm> { let ParserMatchClass = UImm5s2Operand; let PrintMethod = "printImmScale<2>"; } def uimm5s4 : Operand<i64>, ImmLeaf<i64, - [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }], + UImmS4XForm> { let ParserMatchClass = UImm5s4Operand; let PrintMethod = "printImmScale<4>"; } def uimm5s8 : Operand<i64>, ImmLeaf<i64, - [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> { + [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }], + UImmS8XForm> { + let ParserMatchClass = UImm5s8Operand; + let PrintMethod = "printImmScale<8>"; +} + +// tuimm5sN predicate - similiar to uimm5sN, but use TImmLeaf (TargetConstant) +// instead of ImmLeaf (Constant) +def tuimm5s2 : Operand<i64>, TImmLeaf<i64, + [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }], + UImmS2XForm> { + let ParserMatchClass = UImm5s2Operand; + let PrintMethod = "printImmScale<2>"; +} +def tuimm5s4 : Operand<i64>, TImmLeaf<i64, + [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }], + UImmS4XForm> { + let ParserMatchClass = UImm5s4Operand; + let PrintMethod = "printImmScale<4>"; +} +def tuimm5s8 : Operand<i64>, TImmLeaf<i64, + [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }], + UImmS8XForm> { let ParserMatchClass = UImm5s8Operand; let PrintMethod = "printImmScale<8>"; } @@ -590,6 +624,30 @@ def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{ let ParserMatchClass = Imm1_32Operand; } +// Same as vecshiftR#N, but use TargetConstant (TimmLeaf) instead of Constant +// (ImmLeaf) +def tvecshiftR8 : Operand<i32>, TImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9); +}]> { + let EncoderMethod = "getVecShiftR8OpValue"; + let DecoderMethod = "DecodeVecShiftR8Imm"; + let ParserMatchClass = Imm1_8Operand; +} +def tvecshiftR16 : Operand<i32>, TImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17); +}]> { + let EncoderMethod = "getVecShiftR16OpValue"; + let DecoderMethod = "DecodeVecShiftR16Imm"; + let ParserMatchClass = Imm1_16Operand; +} +def tvecshiftR32 : Operand<i32>, TImmLeaf<i32, [{ + return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33); +}]> { + let EncoderMethod = "getVecShiftR32OpValue"; + let DecoderMethod = "DecodeVecShiftR32Imm"; + let ParserMatchClass = Imm1_32Operand; +} + def Imm0_1Operand : AsmImmRange<0, 1>; def Imm0_7Operand : AsmImmRange<0, 7>; def Imm0_15Operand : AsmImmRange<0, 15>; @@ -713,6 +771,13 @@ def imm0_127 : Operand<i32>, ImmLeaf<i32, [{ let PrintMethod = "printImm"; } +def imm0_127_64b : Operand<i64>, ImmLeaf<i64, [{ + return ((uint64_t)Imm) < 128; +}]> { + let ParserMatchClass = Imm0_127Operand; + let PrintMethod = "printImm"; +} + // NOTE: These imm0_N operands have to be of type i64 because i64 is the size // for all shift-amounts. 
@@ -730,6 +795,14 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{ let ParserMatchClass = Imm0_31Operand; } +// timm0_31 predicate - same ass imm0_31, but use TargetConstant (TimmLeaf) +// instead of Contant (ImmLeaf) +def timm0_31 : Operand<i64>, TImmLeaf<i64, [{ + return ((uint64_t)Imm) < 32; +}]> { + let ParserMatchClass = Imm0_31Operand; +} + // True if the 32-bit immediate is in the range [0,31] def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{ return ((uint64_t)Imm) < 32; @@ -758,6 +831,13 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{ let ParserMatchClass = Imm0_7Operand; } +// imm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7] +def imm32_0_7 : Operand<i32>, ImmLeaf<i32, [{ + return ((uint32_t)Imm) < 8; +}]> { + let ParserMatchClass = Imm0_7Operand; +} + // imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15] def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{ return ((uint32_t)Imm) < 16; @@ -1403,6 +1483,7 @@ class RCPCLoad<bits<2> sz, string asm, RegisterClass RC> class AuthBase<bits<1> M, dag oops, dag iops, string asm, string operands, list<dag> pattern> : I<oops, iops, asm, operands, "", pattern>, Sched<[]> { + let isAuthenticated = 1; let Inst{31-25} = 0b1101011; let Inst{20-11} = 0b1111100001; let Inst{10} = M; @@ -1427,6 +1508,7 @@ class AuthOneOperand<bits<3> opc, bits<1> M, string asm> let Inst{9-5} = Rn; } +let Uses = [LR,SP] in class AuthReturn<bits<3> op, bits<1> M, string asm> : AuthBase<M, (outs), (ins), asm, "", []> { let Inst{24} = 0; @@ -1441,6 +1523,7 @@ class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm, bits<10> offset; bits<5> Rn; bits<5> Rt; + let isAuthenticated = 1; let Inst{31-24} = 0b11111000; let Inst{23} = M; let Inst{22} = offset{9}; @@ -1463,6 +1546,9 @@ multiclass AuthLoad<bit M, string asm, Operand opr> { def : InstAlias<asm # "\t$Rt, [$Rn]", (!cast<Instruction>(NAME # "indexed") GPR64:$Rt, GPR64sp:$Rn, 0)>; + + def : InstAlias<asm # "\t$Rt, [$wback]!", + (!cast<Instruction>(NAME # "writeback") GPR64sp:$wback, GPR64:$Rt, 0), 0>; } //--- @@ -3047,6 +3133,22 @@ def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>; def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>; def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>; +def gi_ro_Windexed8 : + GIComplexOperandMatcher<s64, "selectAddrModeWRO<8>">, + GIComplexPatternEquiv<ro_Windexed8>; +def gi_ro_Windexed16 : + GIComplexOperandMatcher<s64, "selectAddrModeWRO<16>">, + GIComplexPatternEquiv<ro_Windexed16>; +def gi_ro_Windexed32 : + GIComplexOperandMatcher<s64, "selectAddrModeWRO<32>">, + GIComplexPatternEquiv<ro_Windexed32>; +def gi_ro_Windexed64 : + GIComplexOperandMatcher<s64, "selectAddrModeWRO<64>">, + GIComplexPatternEquiv<ro_Windexed64>; +def gi_ro_Windexed128 : + GIComplexOperandMatcher<s64, "selectAddrModeWRO<128>">, + GIComplexPatternEquiv<ro_Windexed128>; + class MemExtendOperand<string Reg, int Width> : AsmOperandClass { let Name = "Mem" # Reg # "Extend" # Width; let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">"; @@ -5066,6 +5168,24 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm, [(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>; } +multiclass SIMDThreeSameVectorExtraPatterns<string inst, SDPatternOperator OpNode> { + def : Pat<(v8i8 (OpNode V64:$LHS, V64:$RHS)), + (!cast<Instruction>(inst#"v8i8") V64:$LHS, V64:$RHS)>; + def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)), + (!cast<Instruction>(inst#"v4i16") V64:$LHS, V64:$RHS)>; + def : 
Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)), + (!cast<Instruction>(inst#"v2i32") V64:$LHS, V64:$RHS)>; + + def : Pat<(v16i8 (OpNode V128:$LHS, V128:$RHS)), + (!cast<Instruction>(inst#"v16i8") V128:$LHS, V128:$RHS)>; + def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)), + (!cast<Instruction>(inst#"v8i16") V128:$LHS, V128:$RHS)>; + def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)), + (!cast<Instruction>(inst#"v4i32") V128:$LHS, V128:$RHS)>; + def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)), + (!cast<Instruction>(inst#"v2i64") V128:$LHS, V128:$RHS)>; +} + // As above, but D sized elements unsupported. multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm, SDPatternOperator OpNode> { @@ -10034,15 +10154,20 @@ class ComplexRotationOperand<int Angle, int Remainder, string Type> let DiagnosticType = "InvalidComplexRotation" # Type; let Name = "ComplexRotation" # Type; } -def complexrotateop : Operand<i32> { +def complexrotateop : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }], + SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant((N->getSExtValue() / 90), SDLoc(N), MVT::i32); +}]>> { let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">; let PrintMethod = "printComplexRotationOp<90, 0>"; } -def complexrotateopodd : Operand<i32> { +def complexrotateopodd : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }], + SDNodeXForm<imm, [{ + return CurDAG->getTargetConstant(((N->getSExtValue() - 90) / 180), SDLoc(N), MVT::i32); +}]>> { let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">; let PrintMethod = "printComplexRotationOp<180, 90>"; } - let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode, RegisterOperand regtype, Operand rottype, @@ -10373,9 +10498,9 @@ class CryptoRRTied<bits<1>op0, bits<2>op1, string asm, string asmops> let Inst{11-10} = op1; } class CryptoRRTied_2D<bits<1>op0, bits<2>op1, string asm> - : CryptoRRTied<op0, op1, asm, "{\t$Vd.2d, $Vn.2d}">; + : CryptoRRTied<op0, op1, asm, "{\t$Vd.2d, $Vn.2d|.2d\t$Vd, $Vn}">; class CryptoRRTied_4S<bits<1>op0, bits<2>op1, string asm> - : CryptoRRTied<op0, op1, asm, "{\t$Vd.4s, $Vn.4s}">; + : CryptoRRTied<op0, op1, asm, "{\t$Vd.4s, $Vn.4s|.4s\t$Vd, $Vn}">; class CryptoRRR<bits<1> op0, bits<2>op1, dag oops, dag iops, string asm, string asmops, string cst> @@ -10390,19 +10515,19 @@ class CryptoRRR<bits<1> op0, bits<2>op1, dag oops, dag iops, string asm, } class CryptoRRR_2D<bits<1> op0, bits<2>op1, string asm> : CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm, - "{\t$Vd.2d, $Vn.2d, $Vm.2d}", "">; + "{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "">; class CryptoRRRTied_2D<bits<1> op0, bits<2>op1, string asm> : CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm, - "{\t$Vd.2d, $Vn.2d, $Vm.2d}", "$Vd = $Vdst">; + "{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">; class CryptoRRR_4S<bits<1> op0, bits<2>op1, string asm> : CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm, - "{\t$Vd.4s, $Vn.4s, $Vm.4s}", "">; + "{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "">; class CryptoRRRTied_4S<bits<1> op0, bits<2>op1, string asm> : CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm, - "{\t$Vd.4s, $Vn.4s, $Vm.4s}", "$Vd = $Vdst">; + "{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">; class CryptoRRRTied<bits<1> op0, bits<2>op1, string asm> : CryptoRRR<op0, op1, (outs FPR128:$Vdst), (ins FPR128:$Vd, FPR128:$Vn, V128:$Vm), 
- asm, "{\t$Vd, $Vn, $Vm.2d}", "$Vd = $Vdst">; + asm, "{\t$Vd, $Vn, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">; class CryptoRRRR<bits<2>op0, string asm, string asmops> : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, V128:$Va), asm, @@ -10416,15 +10541,18 @@ class CryptoRRRR<bits<2>op0, string asm, string asmops> let Inst{14-10} = Va; } class CryptoRRRR_16B<bits<2>op0, string asm> - : CryptoRRRR<op0, asm, "{\t$Vd.16b, $Vn.16b, $Vm.16b, $Va.16b}"> { + : CryptoRRRR<op0, asm, "{\t$Vd.16b, $Vn.16b, $Vm.16b, $Va.16b" # + "|.16b\t$Vd, $Vn, $Vm, $Va}"> { } class CryptoRRRR_4S<bits<2>op0, string asm> - : CryptoRRRR<op0, asm, "{\t$Vd.4s, $Vn.4s, $Vm.4s, $Va.4s}"> { + : CryptoRRRR<op0, asm, "{\t$Vd.4s, $Vn.4s, $Vm.4s, $Va.4s" # + "|.4s\t$Vd, $Vn, $Vm, $Va}"> { } class CryptoRRRi6<string asm> : BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, uimm6:$imm), asm, - "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm}", "", []> { + "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm" # + "|.2d\t$Vd, $Vn, $Vm, $imm}", "", []> { bits<6> imm; bits<5> Vm; let Inst{24-21} = 0b0100; @@ -10437,7 +10565,8 @@ class CryptoRRRi6<string asm> class CryptoRRRi2Tied<bits<1>op0, bits<2>op1, string asm> : BaseCryptoV82<(outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm, VectorIndexS:$imm), - asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm}", "$Vd = $Vdst", []> { + asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm" # + "|.4s\t$Vd, $Vn, $Vm$imm}", "$Vd = $Vdst", []> { bits<2> imm; bits<5> Vm; let Inst{24-21} = 0b0010; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 5c35e5bcdd30..54f3f7c10132 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -30,6 +30,7 @@ #include "llvm/CodeGen/StackMaps.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/GlobalValue.h" #include "llvm/MC/MCAsmInfo.h" @@ -1981,6 +1982,9 @@ bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset, const TargetRegisterInfo *TRI) const { + if (!LdSt.mayLoadOrStore()) + return false; + unsigned Width; return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI); } @@ -2025,9 +2029,8 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth( Offset = LdSt.getOperand(3).getImm() * Scale; } - assert((BaseOp->isReg() || BaseOp->isFI()) && - "getMemOperandWithOffset only supports base " - "operands of type register or frame index."); + if (!BaseOp->isReg() && !BaseOp->isFI()) + return false; return true; } @@ -2185,12 +2188,19 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, MaxOffset = 4095; break; case AArch64::ADDG: - case AArch64::TAGPstack: Scale = 16; Width = 0; MinOffset = 0; MaxOffset = 63; break; + case AArch64::TAGPstack: + Scale = 16; + Width = 0; + // TAGP with a negative offset turns into SUBP, which has a maximum offset + // of 63 (not 64!). + MinOffset = -63; + MaxOffset = 63; + break; case AArch64::LDG: case AArch64::STGOffset: case AArch64::STZGOffset: @@ -2227,54 +2237,82 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale, return true; } -static unsigned getOffsetStride(unsigned Opc) { +// Scaling factor for unscaled load or store. 
+int AArch64InstrInfo::getMemScale(unsigned Opc) { switch (Opc) { default: - return 0; - case AArch64::LDURQi: - case AArch64::STURQi: - return 16; - case AArch64::LDURXi: - case AArch64::LDURDi: - case AArch64::STURXi: - case AArch64::STURDi: - return 8; - case AArch64::LDURWi: + llvm_unreachable("Opcode has unknown scale!"); + case AArch64::LDRBBui: + case AArch64::LDURBBi: + case AArch64::LDRSBWui: + case AArch64::LDURSBWi: + case AArch64::STRBBui: + case AArch64::STURBBi: + return 1; + case AArch64::LDRHHui: + case AArch64::LDURHHi: + case AArch64::LDRSHWui: + case AArch64::LDURSHWi: + case AArch64::STRHHui: + case AArch64::STURHHi: + return 2; + case AArch64::LDRSui: case AArch64::LDURSi: + case AArch64::LDRSWui: case AArch64::LDURSWi: - case AArch64::STURWi: + case AArch64::LDRWui: + case AArch64::LDURWi: + case AArch64::STRSui: case AArch64::STURSi: + case AArch64::STRWui: + case AArch64::STURWi: + case AArch64::LDPSi: + case AArch64::LDPSWi: + case AArch64::LDPWi: + case AArch64::STPSi: + case AArch64::STPWi: return 4; + case AArch64::LDRDui: + case AArch64::LDURDi: + case AArch64::LDRXui: + case AArch64::LDURXi: + case AArch64::STRDui: + case AArch64::STURDi: + case AArch64::STRXui: + case AArch64::STURXi: + case AArch64::LDPDi: + case AArch64::LDPXi: + case AArch64::STPDi: + case AArch64::STPXi: + return 8; + case AArch64::LDRQui: + case AArch64::LDURQi: + case AArch64::STRQui: + case AArch64::STURQi: + case AArch64::LDPQi: + case AArch64::STPQi: + case AArch64::STGOffset: + case AArch64::STZGOffset: + case AArch64::ST2GOffset: + case AArch64::STZ2GOffset: + case AArch64::STGPi: + return 16; } } // Scale the unscaled offsets. Returns false if the unscaled offset can't be // scaled. static bool scaleOffset(unsigned Opc, int64_t &Offset) { - unsigned OffsetStride = getOffsetStride(Opc); - if (OffsetStride == 0) - return false; + int Scale = AArch64InstrInfo::getMemScale(Opc); + // If the byte-offset isn't a multiple of the stride, we can't scale this // offset. - if (Offset % OffsetStride != 0) + if (Offset % Scale != 0) return false; // Convert the byte-offset used by unscaled into an "element" offset used // by the scaled pair load/store instructions. - Offset /= OffsetStride; - return true; -} - -// Unscale the scaled offsets. Returns false if the scaled offset can't be -// unscaled. -static bool unscaleOffset(unsigned Opc, int64_t &Offset) { - unsigned OffsetStride = getOffsetStride(Opc); - if (OffsetStride == 0) - return false; - - // Convert the "element" offset used by scaled pair load/store instructions - // into the byte-offset used by unscaled. - Offset *= OffsetStride; + Offset /= Scale; return true; } @@ -2305,15 +2343,17 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1, int64_t ObjectOffset1 = MFI.getObjectOffset(FI1); int64_t ObjectOffset2 = MFI.getObjectOffset(FI2); assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered."); - // Get the byte-offset from the object offset. - if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2)) + // Convert to scaled object offsets. + int Scale1 = AArch64InstrInfo::getMemScale(Opcode1); + if (ObjectOffset1 % Scale1 != 0) return false; + ObjectOffset1 /= Scale1; + int Scale2 = AArch64InstrInfo::getMemScale(Opcode2); + if (ObjectOffset2 % Scale2 != 0) + return false; + ObjectOffset2 /= Scale2; ObjectOffset1 += Offset1; ObjectOffset2 += Offset2; - // Get the "element" index in the object. 
- if (!scaleOffset(Opcode1, ObjectOffset1) || - !scaleOffset(Opcode2, ObjectOffset2)) - return false; return ObjectOffset1 + 1 == ObjectOffset2; } @@ -2373,7 +2413,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, // The caller should already have ordered First/SecondLdSt by offset. // Note: except for non-equal frame index bases if (BaseOp1.isFI()) { - assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) && + assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && "Caller should have ordered offsets."); const MachineFrameInfo &MFI = @@ -2382,8 +2422,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1, BaseOp2.getIndex(), Offset2, SecondOpc); } - assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) && - "Caller should have ordered offsets."); + assert(Offset1 <= Offset2 && "Caller should have ordered offsets."); return Offset1 + 1 == Offset2; } @@ -2409,8 +2448,8 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg, void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc, + const DebugLoc &DL, MCRegister DestReg, + MCRegister SrcReg, bool KillSrc, unsigned Opcode, ArrayRef<unsigned> Indices) const { assert(Subtarget.hasNEON() && "Unexpected register copy without NEON"); @@ -2461,8 +2500,8 @@ void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB, void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, - unsigned SrcReg, bool KillSrc) const { + const DebugLoc &DL, MCRegister DestReg, + MCRegister SrcReg, bool KillSrc) const { if (AArch64::GPR32spRegClass.contains(DestReg) && (AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) { const TargetRegisterInfo *TRI = &getRegisterInfo(); @@ -2471,10 +2510,10 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, // If either operand is WSP, expand to ADD #0. if (Subtarget.hasZeroCycleRegMove()) { // Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move. - unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, - &AArch64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, - &AArch64::GPR64spRegClass); + MCRegister DestRegX = TRI->getMatchingSuperReg( + DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); + MCRegister SrcRegX = TRI->getMatchingSuperReg( + SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); // This instruction is reading and writing X registers. This may upset // the register scavenger and machine verifier, so we need to indicate // that we are reading an undefined value from SrcRegX, but a proper @@ -2497,10 +2536,10 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB, } else { if (Subtarget.hasZeroCycleRegMove()) { // Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move. - unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32, - &AArch64::GPR64spRegClass); - unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32, - &AArch64::GPR64spRegClass); + MCRegister DestRegX = TRI->getMatchingSuperReg( + DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass); + MCRegister SrcRegX = TRI->getMatchingSuperReg( + SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass); // This instruction is reading and writing X registers. 
This may upset // the register scavenger and machine verifier, so we need to indicate // that we are reading an undefined value from SrcRegX, but a proper @@ -2897,7 +2936,18 @@ void AArch64InstrInfo::storeRegToStackSlot( } break; } + unsigned StackID = TargetStackID::Default; + if (AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_PXI; + StackID = TargetStackID::SVEVector; + } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register store without SVE"); + Opc = AArch64::STR_ZXI; + StackID = TargetStackID::SVEVector; + } assert(Opc && "Unknown register class"); + MFI.setStackID(FI, StackID); const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) .addReg(SrcReg, getKillRegState(isKill)) @@ -3028,7 +3078,19 @@ void AArch64InstrInfo::loadRegFromStackSlot( } break; } + + unsigned StackID = TargetStackID::Default; + if (AArch64::PPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_PXI; + StackID = TargetStackID::SVEVector; + } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) { + assert(Subtarget.hasSVE() && "Unexpected register load without SVE"); + Opc = AArch64::LDR_ZXI; + StackID = TargetStackID::SVEVector; + } assert(Opc && "Unknown register class"); + MFI.setStackID(FI, StackID); const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc)) .addReg(DestReg, getDefRegState(true)) @@ -3085,7 +3147,7 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB, const unsigned MaxEncodableValue = MaxEncoding << ShiftSize; do { - unsigned ThisVal = std::min<unsigned>(Offset, MaxEncodableValue); + uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue); unsigned LocalShiftSize = 0; if (ThisVal > MaxEncoding) { ThisVal = ThisVal >> ShiftSize; @@ -3548,6 +3610,18 @@ static bool isCombineInstrCandidate64(unsigned Opc) { // Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi. 
case AArch64::SUBXri: case AArch64::SUBSXri: + case AArch64::ADDv8i8: + case AArch64::ADDv16i8: + case AArch64::ADDv4i16: + case AArch64::ADDv8i16: + case AArch64::ADDv2i32: + case AArch64::ADDv4i32: + case AArch64::SUBv8i8: + case AArch64::SUBv16i8: + case AArch64::SUBv4i16: + case AArch64::SUBv8i16: + case AArch64::SUBv2i32: + case AArch64::SUBv4i32: return true; default: break; @@ -3690,6 +3764,13 @@ static bool getMaddPatterns(MachineInstr &Root, } }; + auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) { + if (canCombine(MBB, Root.getOperand(Operand), Opcode)) { + Patterns.push_back(Pattern); + Found = true; + } + }; + typedef MachineCombinerPattern MCP; switch (Opc) { @@ -3725,6 +3806,70 @@ static bool getMaddPatterns(MachineInstr &Root, case AArch64::SUBXri: setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1); break; + case AArch64::ADDv8i8: + setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1); + setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2); + break; + case AArch64::ADDv16i8: + setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1); + setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2); + break; + case AArch64::ADDv4i16: + setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1); + setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2); + setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1); + setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2); + break; + case AArch64::ADDv8i16: + setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1); + setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2); + setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1); + setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2); + break; + case AArch64::ADDv2i32: + setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1); + setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2); + setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1); + setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2); + break; + case AArch64::ADDv4i32: + setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1); + setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2); + setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1); + setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2); + break; + case AArch64::SUBv8i8: + setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1); + setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2); + break; + case AArch64::SUBv16i8: + setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1); + setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2); + break; + case AArch64::SUBv4i16: + setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1); + setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2); + setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1); + setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2); + break; + case AArch64::SUBv8i16: + setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1); + setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2); + setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1); + setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2); + break; + case AArch64::SUBv2i32: + setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1); + setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2); + setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1); + setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2); + break; + case AArch64::SUBv4i32: + 
setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1); + setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2); + setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1); + setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2); + break; } return Found; } @@ -3937,6 +4082,46 @@ bool AArch64InstrInfo::isThroughputPattern( case MachineCombinerPattern::FMLSv2f64_OP2: case MachineCombinerPattern::FMLSv4i32_indexed_OP2: case MachineCombinerPattern::FMLSv4f32_OP2: + case MachineCombinerPattern::MULADDv8i8_OP1: + case MachineCombinerPattern::MULADDv8i8_OP2: + case MachineCombinerPattern::MULADDv16i8_OP1: + case MachineCombinerPattern::MULADDv16i8_OP2: + case MachineCombinerPattern::MULADDv4i16_OP1: + case MachineCombinerPattern::MULADDv4i16_OP2: + case MachineCombinerPattern::MULADDv8i16_OP1: + case MachineCombinerPattern::MULADDv8i16_OP2: + case MachineCombinerPattern::MULADDv2i32_OP1: + case MachineCombinerPattern::MULADDv2i32_OP2: + case MachineCombinerPattern::MULADDv4i32_OP1: + case MachineCombinerPattern::MULADDv4i32_OP2: + case MachineCombinerPattern::MULSUBv8i8_OP1: + case MachineCombinerPattern::MULSUBv8i8_OP2: + case MachineCombinerPattern::MULSUBv16i8_OP1: + case MachineCombinerPattern::MULSUBv16i8_OP2: + case MachineCombinerPattern::MULSUBv4i16_OP1: + case MachineCombinerPattern::MULSUBv4i16_OP2: + case MachineCombinerPattern::MULSUBv8i16_OP1: + case MachineCombinerPattern::MULSUBv8i16_OP2: + case MachineCombinerPattern::MULSUBv2i32_OP1: + case MachineCombinerPattern::MULSUBv2i32_OP2: + case MachineCombinerPattern::MULSUBv4i32_OP1: + case MachineCombinerPattern::MULSUBv4i32_OP2: + case MachineCombinerPattern::MULADDv4i16_indexed_OP1: + case MachineCombinerPattern::MULADDv4i16_indexed_OP2: + case MachineCombinerPattern::MULADDv8i16_indexed_OP1: + case MachineCombinerPattern::MULADDv8i16_indexed_OP2: + case MachineCombinerPattern::MULADDv2i32_indexed_OP1: + case MachineCombinerPattern::MULADDv2i32_indexed_OP2: + case MachineCombinerPattern::MULADDv4i32_indexed_OP1: + case MachineCombinerPattern::MULADDv4i32_indexed_OP2: + case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: + case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: + case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: + case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: + case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: + case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: + case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: + case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: return true; } // end switch (Pattern) return false; @@ -4040,6 +4225,80 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, return MUL; } +/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate +/// instructions. 
+/// +/// \see genFusedMultiply +static MachineInstr *genFusedMultiplyAcc( + MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, + MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { + return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, + FMAInstKind::Accumulator); +} + +/// genNeg - Helper to generate an intermediate negation of the second operand +/// of Root +static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI, + const TargetInstrInfo *TII, MachineInstr &Root, + SmallVectorImpl<MachineInstr *> &InsInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, + unsigned MnegOpc, const TargetRegisterClass *RC) { + Register NewVR = MRI.createVirtualRegister(RC); + MachineInstrBuilder MIB = + BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR) + .add(Root.getOperand(2)); + InsInstrs.push_back(MIB); + + assert(InstrIdxForVirtReg.empty()); + InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0)); + + return NewVR; +} + +/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate +/// instructions with an additional negation of the accumulator +static MachineInstr *genFusedMultiplyAccNeg( + MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, + MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, + unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { + assert(IdxMulOpd == 1); + + Register NewVR = + genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); + return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, + FMAInstKind::Accumulator, &NewVR); +} + +/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate +/// instructions. 
+/// +/// \see genFusedMultiply +static MachineInstr *genFusedMultiplyIdx( + MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, + MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, + unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) { + return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, + FMAInstKind::Indexed); +} + +/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate +/// instructions with an additional negation of the accumulator +static MachineInstr *genFusedMultiplyIdxNeg( + MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, + MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, + DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd, + unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) { + assert(IdxMulOpd == 1); + + Register NewVR = + genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC); + + return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC, + FMAInstKind::Indexed, &NewVR); +} + /// genMaddR - Generate madd instruction and combine mul and add using /// an extra virtual register /// Example - an ADD intermediate needs to be stored in a register: @@ -4279,6 +4538,231 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } break; } + + case MachineCombinerPattern::MULADDv8i8_OP1: + Opc = AArch64::MLAv8i8; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i8_OP2: + Opc = AArch64::MLAv8i8; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv16i8_OP1: + Opc = AArch64::MLAv16i8; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv16i8_OP2: + Opc = AArch64::MLAv16i8; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i16_OP1: + Opc = AArch64::MLAv4i16; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i16_OP2: + Opc = AArch64::MLAv4i16; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i16_OP1: + Opc = AArch64::MLAv8i16; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i16_OP2: + Opc = AArch64::MLAv8i16; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv2i32_OP1: + Opc = AArch64::MLAv2i32; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv2i32_OP2: + Opc = AArch64::MLAv2i32; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i32_OP1: + Opc = AArch64::MLAv4i32; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i32_OP2: + Opc = AArch64::MLAv4i32; + RC = &AArch64::FPR128RegClass; + 
MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::MULSUBv8i8_OP1: + Opc = AArch64::MLAv8i8; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8, + RC); + break; + case MachineCombinerPattern::MULSUBv8i8_OP2: + Opc = AArch64::MLSv8i8; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv16i8_OP1: + Opc = AArch64::MLAv16i8; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8, + RC); + break; + case MachineCombinerPattern::MULSUBv16i8_OP2: + Opc = AArch64::MLSv16i8; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i16_OP1: + Opc = AArch64::MLAv4i16; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, + RC); + break; + case MachineCombinerPattern::MULSUBv4i16_OP2: + Opc = AArch64::MLSv4i16; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv8i16_OP1: + Opc = AArch64::MLAv8i16; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, + RC); + break; + case MachineCombinerPattern::MULSUBv8i16_OP2: + Opc = AArch64::MLSv8i16; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv2i32_OP1: + Opc = AArch64::MLAv2i32; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, + RC); + break; + case MachineCombinerPattern::MULSUBv2i32_OP2: + Opc = AArch64::MLSv2i32; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i32_OP1: + Opc = AArch64::MLAv4i32; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, + RC); + break; + case MachineCombinerPattern::MULSUBv4i32_OP2: + Opc = AArch64::MLSv4i32; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::MULADDv4i16_indexed_OP1: + Opc = AArch64::MLAv4i16_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i16_indexed_OP2: + Opc = AArch64::MLAv4i16_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i16_indexed_OP1: + Opc = AArch64::MLAv8i16_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv8i16_indexed_OP2: + Opc = AArch64::MLAv8i16_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv2i32_indexed_OP1: + Opc = AArch64::MLAv2i32_indexed; + RC = 
&AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv2i32_indexed_OP2: + Opc = AArch64::MLAv2i32_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i32_indexed_OP1: + Opc = AArch64::MLAv4i32_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + case MachineCombinerPattern::MULADDv4i32_indexed_OP2: + Opc = AArch64::MLAv4i32_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + + case MachineCombinerPattern::MULSUBv4i16_indexed_OP1: + Opc = AArch64::MLAv4i16_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16, + RC); + break; + case MachineCombinerPattern::MULSUBv4i16_indexed_OP2: + Opc = AArch64::MLSv4i16_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv8i16_indexed_OP1: + Opc = AArch64::MLAv8i16_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16, + RC); + break; + case MachineCombinerPattern::MULSUBv8i16_indexed_OP2: + Opc = AArch64::MLSv8i16_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv2i32_indexed_OP1: + Opc = AArch64::MLAv2i32_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32, + RC); + break; + case MachineCombinerPattern::MULSUBv2i32_indexed_OP2: + Opc = AArch64::MLSv2i32_indexed; + RC = &AArch64::FPR64RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + case MachineCombinerPattern::MULSUBv4i32_indexed_OP1: + Opc = AArch64::MLAv4i32_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs, + InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32, + RC); + break; + case MachineCombinerPattern::MULSUBv4i32_indexed_OP2: + Opc = AArch64::MLSv4i32_indexed; + RC = &AArch64::FPR128RegClass; + MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); + break; + // Floating Point Support case MachineCombinerPattern::FMULADDH_OP1: Opc = AArch64::FMADDHrrr; @@ -5037,8 +5521,99 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const { return 0u; } -outliner::OutlinedFunction -AArch64InstrInfo::getOutliningCandidateInfo( +static bool +outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a, + const outliner::Candidate &b) { + const Function &Fa = a.getMF()->getFunction(); + const Function &Fb = b.getMF()->getFunction(); + + // If none of the functions have the "sign-return-address" attribute their + // signing behaviour is equal + if (!Fa.hasFnAttribute("sign-return-address") && + !Fb.hasFnAttribute("sign-return-address")) { + return true; + } + + // If both functions have the "sign-return-address" attribute their signing + // behaviour is equal, if the values of the attributes are equal + if (Fa.hasFnAttribute("sign-return-address") && + Fb.hasFnAttribute("sign-return-address")) { + StringRef ScopeA = + 
Fa.getFnAttribute("sign-return-address").getValueAsString(); + StringRef ScopeB = + Fb.getFnAttribute("sign-return-address").getValueAsString(); + return ScopeA.equals(ScopeB); + } + + // If function B doesn't have the "sign-return-address" attribute but A does, + // the functions' signing behaviour is equal if A's value for + // "sign-return-address" is "none" and vice versa. + if (Fa.hasFnAttribute("sign-return-address")) { + StringRef ScopeA = + Fa.getFnAttribute("sign-return-address").getValueAsString(); + return ScopeA.equals("none"); + } + + if (Fb.hasFnAttribute("sign-return-address")) { + StringRef ScopeB = + Fb.getFnAttribute("sign-return-address").getValueAsString(); + return ScopeB.equals("none"); + } + + llvm_unreachable("Unkown combination of sign-return-address attributes"); +} + +static bool +outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a, + const outliner::Candidate &b) { + const Function &Fa = a.getMF()->getFunction(); + const Function &Fb = b.getMF()->getFunction(); + + // If none of the functions have the "sign-return-address-key" attribute + // their keys are equal + if (!Fa.hasFnAttribute("sign-return-address-key") && + !Fb.hasFnAttribute("sign-return-address-key")) { + return true; + } + + // If both functions have the "sign-return-address-key" attribute their + // keys are equal if the values of "sign-return-address-key" are equal + if (Fa.hasFnAttribute("sign-return-address-key") && + Fb.hasFnAttribute("sign-return-address-key")) { + StringRef KeyA = + Fa.getFnAttribute("sign-return-address-key").getValueAsString(); + StringRef KeyB = + Fb.getFnAttribute("sign-return-address-key").getValueAsString(); + return KeyA.equals(KeyB); + } + + // If B doesn't have the "sign-return-address-key" attribute, both keys are + // equal, if function a has the default key (a_key) + if (Fa.hasFnAttribute("sign-return-address-key")) { + StringRef KeyA = + Fa.getFnAttribute("sign-return-address-key").getValueAsString(); + return KeyA.equals_lower("a_key"); + } + + if (Fb.hasFnAttribute("sign-return-address-key")) { + StringRef KeyB = + Fb.getFnAttribute("sign-return-address-key").getValueAsString(); + return KeyB.equals_lower("a_key"); + } + + llvm_unreachable("Unkown combination of sign-return-address-key attributes"); +} + +static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a, + const outliner::Candidate &b) { + const AArch64Subtarget &SubtargetA = + a.getMF()->getSubtarget<AArch64Subtarget>(); + const AArch64Subtarget &SubtargetB = + b.getMF()->getSubtarget<AArch64Subtarget>(); + return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps(); +} + +outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo( std::vector<outliner::Candidate> &RepeatedSequenceLocs) const { outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; unsigned SequenceSize = @@ -5046,12 +5621,115 @@ AArch64InstrInfo::getOutliningCandidateInfo( [this](unsigned Sum, const MachineInstr &MI) { return Sum + getInstSizeInBytes(MI); }); + unsigned NumBytesToCreateFrame = 0; + + // We only allow outlining for functions having exactly matching return + // address signing attributes, i.e., all share the same value for the + // attribute "sign-return-address" and all share the same type of key they + // are signed with. + // Additionally we require all functions to simultaniously either support + // v8.3a features or not. 
Otherwise an outlined function could get signed + // using dedicated v8.3 instructions and a call from a function that doesn't + // support v8.3 instructions would therefore be invalid. + if (std::adjacent_find( + RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), + [](const outliner::Candidate &a, const outliner::Candidate &b) { + // Return true if a and b are non-equal w.r.t. return address + // signing or support of v8.3a features + if (outliningCandidatesSigningScopeConsensus(a, b) && + outliningCandidatesSigningKeyConsensus(a, b) && + outliningCandidatesV8_3OpsConsensus(a, b)) { + return false; + } + return true; + }) != RepeatedSequenceLocs.end()) { + return outliner::OutlinedFunction(); + } + + // Since at this point all candidates agree on their return address signing + // picking just one is fine. If the candidate functions potentially sign their + // return addresses, the outlined function should do the same. Note that in + // the case of "sign-return-address"="non-leaf" this is an assumption: It is + // not certainly true that the outlined function will have to sign its return + // address but this decision is made later, when the decision to outline + // has already been made. + // The same holds for the number of additional instructions we need: On + // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is + // necessary. However, at this point we don't know if the outlined function + // will have a RET instruction so we assume the worst. + const Function &FCF = FirstCand.getMF()->getFunction(); + const TargetRegisterInfo &TRI = getRegisterInfo(); + if (FCF.hasFnAttribute("sign-return-address")) { + // One PAC and one AUT instructions + NumBytesToCreateFrame += 8; + + // We have to check if sp modifying instructions would get outlined. + // If so we only allow outlining if sp is unchanged overall, so matching + // sub and add instructions are okay to outline, all other sp modifications + // are not + auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { + int SPValue = 0; + MachineBasicBlock::iterator MBBI = C.front(); + for (;;) { + if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { + switch (MBBI->getOpcode()) { + case AArch64::ADDXri: + case AArch64::ADDWri: + assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); + assert(MBBI->getOperand(2).isImm() && + "Expected operand to be immediate"); + assert(MBBI->getOperand(1).isReg() && + "Expected operand to be a register"); + // Check if the add just increments sp. If so, we search for + // matching sub instructions that decrement sp. If not, the + // modification is illegal + if (MBBI->getOperand(1).getReg() == AArch64::SP) + SPValue += MBBI->getOperand(2).getImm(); + else + return true; + break; + case AArch64::SUBXri: + case AArch64::SUBWri: + assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); + assert(MBBI->getOperand(2).isImm() && + "Expected operand to be immediate"); + assert(MBBI->getOperand(1).isReg() && + "Expected operand to be a register"); + // Check if the sub just decrements sp. If so, we search for + // matching add instructions that increment sp. 
If not, the + // modification is illegal + if (MBBI->getOperand(1).getReg() == AArch64::SP) + SPValue -= MBBI->getOperand(2).getImm(); + else + return true; + break; + default: + return true; + } + } + if (MBBI == C.back()) + break; + ++MBBI; + } + if (SPValue) + return true; + return false; + }; + // Remove candidates with illegal stack modifying instructions + RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(), + RepeatedSequenceLocs.end(), + hasIllegalSPModification), + RepeatedSequenceLocs.end()); + + // If the sequence doesn't have enough candidates left, then we're done. + if (RepeatedSequenceLocs.size() < 2) + return outliner::OutlinedFunction(); + } // Properties about candidate MBBs that hold for all of them. unsigned FlagsSetInAll = 0xF; // Compute liveness information for each candidate, and set FlagsSetInAll. - const TargetRegisterInfo &TRI = getRegisterInfo(); std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(), [&FlagsSetInAll](outliner::Candidate &C) { FlagsSetInAll &= C.Flags; @@ -5107,7 +5785,7 @@ AArch64InstrInfo::getOutliningCandidateInfo( }; unsigned FrameID = MachineOutlinerDefault; - unsigned NumBytesToCreateFrame = 4; + NumBytesToCreateFrame += 4; bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) { return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement"); @@ -5190,11 +5868,21 @@ AArch64InstrInfo::getOutliningCandidateInfo( unsigned NumBytesNoStackCalls = 0; std::vector<outliner::Candidate> CandidatesWithoutStackFixups; + // Check if we have to save LR. for (outliner::Candidate &C : RepeatedSequenceLocs) { C.initLRU(TRI); + // If we have a noreturn caller, then we're going to be conservative and + // say that we have to save LR. If we don't have a ret at the end of the + // block, then we can't reason about liveness accurately. + // + // FIXME: We can probably do better than always disabling this in + // noreturn functions by fixing up the liveness info. + bool IsNoReturn = + C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn); + // Is LR available? If so, we don't need a save. - if (C.LRU.available(AArch64::LR)) { + if (C.LRU.available(AArch64::LR) && !IsNoReturn) { NumBytesNoStackCalls += 4; C.setCallInfo(MachineOutlinerNoLRSave, 4); CandidatesWithoutStackFixups.push_back(C); @@ -5376,6 +6064,19 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT, MachineFunction *MF = MBB->getParent(); AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + // Don't outline anything used for return address signing. The outlined + // function will get signed later if needed + switch (MI.getOpcode()) { + case AArch64::PACIASP: + case AArch64::PACIBSP: + case AArch64::AUTIASP: + case AArch64::AUTIBSP: + case AArch64::RETAA: + case AArch64::RETAB: + case AArch64::EMITBKEY: + return outliner::InstrType::Illegal; + } + // Don't outline LOHs. 
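A note on the stack-pointer bookkeeping in hasIllegalSPModification above: it keeps a running net SP adjustment, accepts only plain ADD/SUB immediates against SP, and requires the total to cancel out before a candidate may be outlined. A minimal standalone sketch of that idea (plain C++ with hypothetical types, not the MachineInstr-based code in the patch):

#include <cstdint>
#include <optional>
#include <vector>

// Simplified model: an instruction either leaves SP alone, adjusts it by a
// known immediate (positive for ADD, negative for SUB), or modifies it in
// some other, unknown way.
struct SPEffect {
  bool ModifiesSP;
  std::optional<int64_t> Delta;
};

// Returns true when the sequence is safe in the same sense as
// hasIllegalSPModification returning false: only known add/sub immediates
// touch SP, and they cancel out over the whole candidate.
static bool spIsBalanced(const std::vector<SPEffect> &Seq) {
  int64_t SPValue = 0;
  for (const SPEffect &E : Seq) {
    if (!E.ModifiesSP)
      continue;
    if (!E.Delta)
      return false; // any other kind of SP modification is illegal
    SPValue += *E.Delta;
  }
  return SPValue == 0; // matching sub/add pairs leave SP unchanged overall
}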
if (FuncInfo->getLOHRelated().count(&MI)) return outliner::InstrType::Illegal; @@ -5528,6 +6229,59 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const { } } +static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB, + bool ShouldSignReturnAddr, + bool ShouldSignReturnAddrWithAKey) { + if (ShouldSignReturnAddr) { + MachineBasicBlock::iterator MBBPAC = MBB.begin(); + MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL; + + if (MBBAUT != MBB.end()) + DL = MBBAUT->getDebugLoc(); + + // At the very beginning of the basic block we insert the following + // depending on the key type + // + // a_key: b_key: + // PACIASP EMITBKEY + // CFI_INSTRUCTION PACIBSP + // CFI_INSTRUCTION + if (ShouldSignReturnAddrWithAKey) { + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP)) + .setMIFlag(MachineInstr::FrameSetup); + } else { + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY)) + .setMIFlag(MachineInstr::FrameSetup); + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP)) + .setMIFlag(MachineInstr::FrameSetup); + } + unsigned CFIIndex = + MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr)); + BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION)) + .addCFIIndex(CFIIndex) + .setMIFlags(MachineInstr::FrameSetup); + + // If v8.3a features are available we can replace a RET instruction by + // RETAA or RETAB and omit the AUT instructions + if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() && + MBBAUT->getOpcode() == AArch64::RET) { + BuildMI(MBB, MBBAUT, DL, + TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA + : AArch64::RETAB)) + .copyImplicitOps(*MBBAUT); + MBB.erase(MBBAUT); + } else { + BuildMI(MBB, MBBAUT, DL, + TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP + : AArch64::AUTIBSP)) + .setMIFlag(MachineInstr::FrameDestroy); + } + } +} + void AArch64InstrInfo::buildOutlinedFrame( MachineBasicBlock &MBB, MachineFunction &MF, const outliner::OutlinedFunction &OF) const { @@ -5543,16 +6297,19 @@ void AArch64InstrInfo::buildOutlinedFrame( TailOpcode = AArch64::TCRETURNriALL; } MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode)) - .add(Call->getOperand(0)) - .addImm(0); + .add(Call->getOperand(0)) + .addImm(0); MBB.insert(MBB.end(), TC); Call->eraseFromParent(); } + bool IsLeafFunction = true; + // Is there a call in the outlined range? - auto IsNonTailCall = [](MachineInstr &MI) { + auto IsNonTailCall = [](const MachineInstr &MI) { return MI.isCall() && !MI.isReturn(); }; + if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) { // Fix up the instructions in the range, since we're going to modify the // stack. @@ -5560,6 +6317,8 @@ void AArch64InstrInfo::buildOutlinedFrame( "Can only fix up stack references once"); fixupPostOutline(MBB); + IsLeafFunction = false; + // LR has to be a live in so that we can save it. MBB.addLiveIn(AArch64::LR); @@ -5606,16 +6365,47 @@ void AArch64InstrInfo::buildOutlinedFrame( Et = MBB.insert(Et, LDRXpost); } + // If a bunch of candidates reach this point they must agree on their return + // address signing. 
It is therefore enough to just consider the signing + // behaviour of one of them + const Function &CF = OF.Candidates.front().getMF()->getFunction(); + bool ShouldSignReturnAddr = false; + if (CF.hasFnAttribute("sign-return-address")) { + StringRef Scope = + CF.getFnAttribute("sign-return-address").getValueAsString(); + if (Scope.equals("all")) + ShouldSignReturnAddr = true; + else if (Scope.equals("non-leaf") && !IsLeafFunction) + ShouldSignReturnAddr = true; + } + + // a_key is the default + bool ShouldSignReturnAddrWithAKey = true; + if (CF.hasFnAttribute("sign-return-address-key")) { + const StringRef Key = + CF.getFnAttribute("sign-return-address-key").getValueAsString(); + // Key can either be a_key or b_key + assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) && + "Return address signing key must be either a_key or b_key"); + ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key"); + } + // If this is a tail call outlined function, then there's already a return. if (OF.FrameConstructionID == MachineOutlinerTailCall || - OF.FrameConstructionID == MachineOutlinerThunk) + OF.FrameConstructionID == MachineOutlinerThunk) { + signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, + ShouldSignReturnAddrWithAKey); return; + } // It's not a tail call, so we have to insert the return ourselves. MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET)) .addReg(AArch64::LR, RegState::Undef); MBB.insert(MBB.end(), ret); + signOutlinedFunction(MF, MBB, ShouldSignReturnAddr, + ShouldSignReturnAddrWithAKey); + // Did we have to modify the stack by saving the link register? if (OF.FrameConstructionID != MachineOutlinerDefault) return; @@ -5702,29 +6492,126 @@ bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault( return MF.getFunction().hasMinSize(); } -bool AArch64InstrInfo::isCopyInstrImpl( - const MachineInstr &MI, const MachineOperand *&Source, - const MachineOperand *&Destination) const { +Optional<DestSourcePair> +AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { // AArch64::ORRWrs and AArch64::ORRXrs with WZR/XZR reg // and zero immediate operands used as an alias for mov instruction. if (MI.getOpcode() == AArch64::ORRWrs && MI.getOperand(1).getReg() == AArch64::WZR && MI.getOperand(3).getImm() == 0x0) { - Destination = &MI.getOperand(0); - Source = &MI.getOperand(2); - return true; + return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; } if (MI.getOpcode() == AArch64::ORRXrs && MI.getOperand(1).getReg() == AArch64::XZR && MI.getOperand(3).getImm() == 0x0) { - Destination = &MI.getOperand(0); - Source = &MI.getOperand(2); - return true; + return DestSourcePair{MI.getOperand(0), MI.getOperand(2)}; } - return false; + return None; +} + +Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI, + Register Reg) const { + int Sign = 1; + int64_t Offset = 0; + + // TODO: Handle cases where Reg is a super- or sub-register of the + // destination register. + if (Reg != MI.getOperand(0).getReg()) + return None; + + switch (MI.getOpcode()) { + default: + return None; + case AArch64::SUBWri: + case AArch64::SUBXri: + case AArch64::SUBSWri: + case AArch64::SUBSXri: + Sign *= -1; + LLVM_FALLTHROUGH; + case AArch64::ADDSWri: + case AArch64::ADDSXri: + case AArch64::ADDWri: + case AArch64::ADDXri: { + // TODO: Third operand can be global address (usually some string). 
+ if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() || + !MI.getOperand(2).isImm()) + return None; + Offset = MI.getOperand(2).getImm() * Sign; + int Shift = MI.getOperand(3).getImm(); + assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12"); + Offset = Offset << Shift; + } + } + return RegImmPair{MI.getOperand(1).getReg(), Offset}; +} + +/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with +/// the destination register then, if possible, describe the value in terms of +/// the source register. +static Optional<ParamLoadedValue> +describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) { + auto DestSrc = TII->isCopyInstr(MI); + if (!DestSrc) + return None; + + Register DestReg = DestSrc->Destination->getReg(); + Register SrcReg = DestSrc->Source->getReg(); + + auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {}); + + // If the described register is the destination, just return the source. + if (DestReg == DescribedReg) + return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); + + // ORRWrs zero-extends to 64-bits, so we need to consider such cases. + if (MI.getOpcode() == AArch64::ORRWrs && + TRI->isSuperRegister(DestReg, DescribedReg)) + return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr); + + // We may need to describe the lower part of a ORRXrs move. + if (MI.getOpcode() == AArch64::ORRXrs && + TRI->isSubRegister(DestReg, DescribedReg)) { + Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32); + return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr); + } + + assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) && + "Unhandled ORR[XW]rs copy case"); + + return None; +} + +Optional<ParamLoadedValue> +AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI, + Register Reg) const { + const MachineFunction *MF = MI.getMF(); + const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo(); + switch (MI.getOpcode()) { + case AArch64::MOVZWi: + case AArch64::MOVZXi: { + // MOVZWi may be used for producing zero-extended 32-bit immediates in + // 64-bit parameters, so we need to consider super-registers. + if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg)) + return None; + + if (!MI.getOperand(1).isImm()) + return None; + int64_t Immediate = MI.getOperand(1).getImm(); + int Shift = MI.getOperand(2).getImm(); + return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift), + nullptr); + } + case AArch64::ORRWrs: + case AArch64::ORRXrs: + return describeORRLoadedValue(MI, Reg, this, TRI); + } + + return TargetInstrInfo::describeLoadedValue(MI, Reg); } #define GET_INSTRINFO_HELPERS diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h index 1688045e4fb8..66e517e54903 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h @@ -89,6 +89,12 @@ public: /// if there is a corresponding unscaled variant available. static Optional<unsigned> getUnscaledLdSt(unsigned Opc); + /// Scaling factor for (scaled or unscaled) load or store. + static int getMemScale(unsigned Opc); + static int getMemScale(const MachineInstr &MI) { + return getMemScale(MI.getOpcode()); + } + /// Returns the index for the immediate for a given instruction. 
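The offset arithmetic in isAddImmediate above amounts to the signed immediate scaled by the optional LSL #12 that AArch64 ADD/SUB immediates carry. A small standalone illustration (addSubImmOffset is a hypothetical helper, not part of the patch):

#include <cassert>
#include <cstdint>

// ADD contributes +Imm, SUB contributes -Imm, and the encoded shift is
// either 0 or 12, exactly as asserted in the patch.
static int64_t addSubImmOffset(int64_t Imm, int Shift, bool IsSub) {
  assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
  int64_t Magnitude = Imm << Shift; // Imm is the encoded 12-bit field
  return IsSub ? -Magnitude : Magnitude;
}

// e.g. SUBXri x0, x1, #3, lsl #12 describes x0 = x1 - 0x3000:
//      addSubImmOffset(3, 12, /*IsSub=*/true) == -0x3000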
static unsigned getLoadStoreImmIdx(unsigned Opc); @@ -131,15 +137,15 @@ public: unsigned NumLoads) const override; void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, - bool KillSrc, unsigned Opcode, + const DebugLoc &DL, MCRegister DestReg, + MCRegister SrcReg, bool KillSrc, unsigned Opcode, llvm::ArrayRef<unsigned> Indices) const; void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, DebugLoc DL, unsigned DestReg, unsigned SrcReg, bool KillSrc, unsigned Opcode, unsigned ZeroReg, llvm::ArrayRef<unsigned> Indices) const; void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, - const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, + const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, @@ -265,15 +271,21 @@ public: /// on Windows. static bool isSEHInstruction(const MachineInstr &MI); + Optional<RegImmPair> isAddImmediate(const MachineInstr &MI, + Register Reg) const override; + + Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI, + Register Reg) const override; + #define GET_INSTRINFO_HELPER_DECLS #include "AArch64GenInstrInfo.inc" protected: - /// If the specific machine instruction is a instruction that moves/copies - /// value from one register to another register return true along with - /// @Source machine operand and @Destination machine operand. - bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source, - const MachineOperand *&Destination) const override; + /// If the specific machine instruction is an instruction that moves/copies + /// value from one register to another register return destination and source + /// registers as machine operands. + Optional<DestSourcePair> + isCopyInstrImpl(const MachineInstr &MI) const override; private: /// Sets the offsets on outlined instructions in \p MBB which use SP diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 1981bd5d3bf0..d590d4d913ff 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -214,6 +214,7 @@ def SDT_AArch64FCmp : SDTypeProfile<0, 2, SDTCisSameAs<0, 1>]>; def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>; def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>; +def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>; def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>]>; @@ -242,6 +243,9 @@ def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>; def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>; +def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; +def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>; + // Generates the general dynamic sequences, i.e. // adrp x0, :tlsdesc:var // ldr x1, [x0, #:tlsdesc_lo12:var] @@ -259,6 +263,110 @@ def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4, SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>, SDTCisSameAs<1, 4>]>; +def SDT_AArch64TBL : SDTypeProfile<1, 2, [ + SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2> +]>; + +// non-extending masked load fragment. 
+def nonext_masked_load : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD && + cast<MaskedLoadSDNode>(N)->isUnindexed() && + !cast<MaskedLoadSDNode>(N)->isNonTemporal(); +}]>; +// sign extending masked load fragments. +def asext_masked_load : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (masked_ld node:$ptr, undef, node:$pred, node:$def),[{ + return (cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD || + cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD) && + cast<MaskedLoadSDNode>(N)->isUnindexed(); +}]>; +def asext_masked_load_i8 : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (asext_masked_load node:$ptr, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def asext_masked_load_i16 : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (asext_masked_load node:$ptr, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def asext_masked_load_i32 : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (asext_masked_load node:$ptr, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; +// zero extending masked load fragments. +def zext_masked_load : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD && + cast<MaskedLoadSDNode>(N)->isUnindexed(); +}]>; +def zext_masked_load_i8 : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (zext_masked_load node:$ptr, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def zext_masked_load_i16 : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (zext_masked_load node:$ptr, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def zext_masked_load_i32 : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (zext_masked_load node:$ptr, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def non_temporal_load : + PatFrag<(ops node:$ptr, node:$pred, node:$def), + (masked_ld node:$ptr, undef, node:$pred, node:$def), [{ + return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD && + cast<MaskedLoadSDNode>(N)->isUnindexed() && + cast<MaskedLoadSDNode>(N)->isNonTemporal(); +}]>; + +// non-truncating masked store fragment. +def nontrunc_masked_store : + PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() && + cast<MaskedStoreSDNode>(N)->isUnindexed() && + !cast<MaskedStoreSDNode>(N)->isNonTemporal(); +}]>; +// truncating masked store fragments. 
+def trunc_masked_store : + PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->isTruncatingStore() && + cast<MaskedStoreSDNode>(N)->isUnindexed(); +}]>; +def trunc_masked_store_i8 : + PatFrag<(ops node:$val, node:$ptr, node:$pred), + (trunc_masked_store node:$val, node:$ptr, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8; +}]>; +def trunc_masked_store_i16 : + PatFrag<(ops node:$val, node:$ptr, node:$pred), + (trunc_masked_store node:$val, node:$ptr, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16; +}]>; +def trunc_masked_store_i32 : + PatFrag<(ops node:$val, node:$ptr, node:$pred), + (trunc_masked_store node:$val, node:$ptr, node:$pred), [{ + return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; +}]>; + +def non_temporal_store : + PatFrag<(ops node:$val, node:$ptr, node:$pred), + (masked_st node:$val, node:$ptr, undef, node:$pred), [{ + return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() && + cast<MaskedStoreSDNode>(N)->isUnindexed() && + cast<MaskedStoreSDNode>(N)->isNonTemporal(); +}]>; // Node definitions. def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; @@ -319,6 +427,8 @@ def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>; def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>; def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>; +def AArch64insr : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>; + def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>; def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>; def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>; @@ -432,6 +542,11 @@ def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>; def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>; def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>; +def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; + +def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>; + //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// @@ -441,10 +556,10 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>; // the Function object through the <Target>Subtarget and objections were raised // to that (see post-commit review comments for r301750). let RecomputePerFunction = 1 in { - def ForCodeSize : Predicate<"MF->getFunction().hasOptSize()">; - def NotForCodeSize : Predicate<"!MF->getFunction().hasOptSize()">; + def ForCodeSize : Predicate<"shouldOptForSize(MF)">; + def NotForCodeSize : Predicate<"!shouldOptForSize(MF)">; // Avoid generating STRQro if it is slow, unless we're optimizing for code size. 
- def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().hasOptSize()">; + def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || shouldOptForSize(MF)">; def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>; @@ -675,34 +790,81 @@ defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd, defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla", null_frag>; +let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in { + def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot90 (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 0))>; + def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot270 (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 1))>; + def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot90 (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 0))>; + def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot270 (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>; +} +let Predicates = [HasComplxNum, HasNEON] in { + def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>; + def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot270 (v2f32 V64:$Rn), (v2f32 V64:$Rm))), + (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 1))>; + foreach Ty = [v4f32, v2f64] in { + def : Pat<(Ty (int_aarch64_neon_vcadd_rot90 (Ty V128:$Rn), (Ty V128:$Rm))), + (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 0))>; + def : Pat<(Ty (int_aarch64_neon_vcadd_rot270 (Ty V128:$Rn), (Ty V128:$Rm))), + (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 1))>; + } +} + // v8.3a Pointer Authentication // These instructions inhabit part of the hint space and so can be used for -// armv8 targets +// armv8 targets. Keeping the old HINT mnemonic when compiling without PA is +// important for compatibility with other assemblers (e.g. GAS) when building +// software compatible with both CPUs that do or don't implement PA. 
let Uses = [LR], Defs = [LR] in { - def PACIAZ : SystemNoOperands<0b000, "paciaz">; - def PACIBZ : SystemNoOperands<0b010, "pacibz">; - def AUTIAZ : SystemNoOperands<0b100, "autiaz">; - def AUTIBZ : SystemNoOperands<0b110, "autibz">; + def PACIAZ : SystemNoOperands<0b000, "hint #24">; + def PACIBZ : SystemNoOperands<0b010, "hint #26">; + let isAuthenticated = 1 in { + def AUTIAZ : SystemNoOperands<0b100, "hint #28">; + def AUTIBZ : SystemNoOperands<0b110, "hint #30">; + } } let Uses = [LR, SP], Defs = [LR] in { - def PACIASP : SystemNoOperands<0b001, "paciasp">; - def PACIBSP : SystemNoOperands<0b011, "pacibsp">; - def AUTIASP : SystemNoOperands<0b101, "autiasp">; - def AUTIBSP : SystemNoOperands<0b111, "autibsp">; + def PACIASP : SystemNoOperands<0b001, "hint #25">; + def PACIBSP : SystemNoOperands<0b011, "hint #27">; + let isAuthenticated = 1 in { + def AUTIASP : SystemNoOperands<0b101, "hint #29">; + def AUTIBSP : SystemNoOperands<0b111, "hint #31">; + } } let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in { - def PACIA1716 : SystemNoOperands<0b000, "pacia1716">; - def PACIB1716 : SystemNoOperands<0b010, "pacib1716">; - def AUTIA1716 : SystemNoOperands<0b100, "autia1716">; - def AUTIB1716 : SystemNoOperands<0b110, "autib1716">; + def PACIA1716 : SystemNoOperands<0b000, "hint #8">; + def PACIB1716 : SystemNoOperands<0b010, "hint #10">; + let isAuthenticated = 1 in { + def AUTIA1716 : SystemNoOperands<0b100, "hint #12">; + def AUTIB1716 : SystemNoOperands<0b110, "hint #14">; + } } let Uses = [LR], Defs = [LR], CRm = 0b0000 in { - def XPACLRI : SystemNoOperands<0b111, "xpaclri">; + def XPACLRI : SystemNoOperands<0b111, "hint #7">; } -// These pointer authentication isntructions require armv8.3a +// These pointer authentication instructions require armv8.3a let Predicates = [HasPA] in { + + // When compiling with PA, there is a better mnemonic for these instructions. 
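For reference, the hint immediates in the SystemNoOperands definitions above map back to the PAuth mnemonics as follows; the lookup below is a hypothetical transcription of those definitions, not code from the patch:

#include <string_view>

// Mnemonic -> HINT immediate, copied from the definitions above; -1 means
// the mnemonic is not one of these aliases.
static int pauthHintImmediate(std::string_view Mnemonic) {
  if (Mnemonic == "xpaclri")    return 7;
  if (Mnemonic == "pacia1716")  return 8;
  if (Mnemonic == "pacib1716")  return 10;
  if (Mnemonic == "autia1716")  return 12;
  if (Mnemonic == "autib1716")  return 14;
  if (Mnemonic == "paciaz")     return 24;
  if (Mnemonic == "paciasp")    return 25;
  if (Mnemonic == "pacibz")     return 26;
  if (Mnemonic == "pacibsp")    return 27;
  if (Mnemonic == "autiaz")     return 28;
  if (Mnemonic == "autiasp")    return 29;
  if (Mnemonic == "autibz")     return 30;
  if (Mnemonic == "autibsp")    return 31;
  return -1;
}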
+ def : InstAlias<"paciaz", (PACIAZ), 1>; + def : InstAlias<"pacibz", (PACIBZ), 1>; + def : InstAlias<"autiaz", (AUTIAZ), 1>; + def : InstAlias<"autibz", (AUTIBZ), 1>; + def : InstAlias<"paciasp", (PACIASP), 1>; + def : InstAlias<"pacibsp", (PACIBSP), 1>; + def : InstAlias<"autiasp", (AUTIASP), 1>; + def : InstAlias<"autibsp", (AUTIBSP), 1>; + def : InstAlias<"pacia1716", (PACIA1716), 1>; + def : InstAlias<"pacib1716", (PACIB1716), 1>; + def : InstAlias<"autia1716", (AUTIA1716), 1>; + def : InstAlias<"autib1716", (AUTIB1716), 1>; + def : InstAlias<"xpaclri", (XPACLRI), 1>; + multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> { def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>; def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>; @@ -1478,6 +1640,8 @@ def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)), def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)), (i64 1))), (CLSXr GPR64:$Rn)>; +def : Pat<(int_aarch64_cls GPR32:$Rn), (CLSWr GPR32:$Rn)>; +def : Pat<(int_aarch64_cls64 GPR64:$Rm), (EXTRACT_SUBREG (CLSXr GPR64:$Rm), sub_32)>; // Unlike the other one operand instructions, the instructions with the "rev" // mnemonic do *not* just different in the size bit, but actually use different @@ -1859,6 +2023,9 @@ defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32Op, simm7s4, "ldnp">; defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">; defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">; +def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (LDPXi GPR64sp:$Rn, simm7s8:$offset)>; + //--- // (register offset) //--- @@ -2552,6 +2719,9 @@ defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">; defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">; defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">; +def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)), + (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>; + //--- // (Register offset) @@ -3506,14 +3676,8 @@ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn) (i64 4)))), (FCVTLv8i16 V128:$Rn)>; def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>; -def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn), - (i64 2))))), - (FCVTLv4i32 V128:$Rn)>; def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>; -def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn), - (i64 4))))), - (FCVTLv8i16 V128:$Rn)>; defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>; defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>; @@ -3714,10 +3878,11 @@ defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>; defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>; defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>; defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>; -defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >; -defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >; + +// MLA and MLS are generated in MachineCombine +defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>; +defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>; + defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>; defm PMUL : 
SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>; defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba", @@ -3760,6 +3925,12 @@ defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh", int_aarch64_neon_sqsub>; +// Extra saturate patterns, other than the intrinsics matches above +defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>; +defm : SIMDThreeSameVectorExtraPatterns<"UQADD", uaddsat>; +defm : SIMDThreeSameVectorExtraPatterns<"SQSUB", ssubsat>; +defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>; + defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >; @@ -4356,6 +4527,25 @@ defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16, defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16, UMULLv4i16_v4i32, UMULLv2i32_v2i64>; +// Patterns for smull2/umull2. +multiclass Neon_mul_high_patterns<SDPatternOperator opnode, + Instruction INST8B, Instruction INST4H, Instruction INST2S> { + def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn), + (extract_high_v16i8 V128:$Rm))), + (INST8B V128:$Rn, V128:$Rm)>; + def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn), + (extract_high_v8i16 V128:$Rm))), + (INST4H V128:$Rn, V128:$Rm)>; + def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn), + (extract_high_v4i32 V128:$Rm))), + (INST2S V128:$Rn, V128:$Rm)>; +} + +defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16, + SMULLv8i16_v4i32, SMULLv4i32_v2i64>; +defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16, + UMULLv8i16_v4i32, UMULLv4i32_v2i64>; + // Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode, Instruction INST8B, Instruction INST4H, Instruction INST2S> { @@ -5422,10 +5612,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))), defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>; defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>; -defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", - TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>; -defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", - TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>; + +// Generated by MachineCombine +defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>; +defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>; + defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>; defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>; diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp index 961f38cad1e4..b9ac2657e1c5 100644 --- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -32,6 +32,7 @@ #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/IR/Type.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -205,6 +206,14 @@ private: ComplexRendererFns selectAddrModeShiftedExtendXReg(MachineOperand &Root, unsigned SizeInBytes) const; + + /// Returns a \p ComplexRendererFns which contains a base, offset, and whether + /// or not a shift + extend should be folded into an addressing mode. 
Returns + /// None when this is not profitable or possible. + ComplexRendererFns + selectExtendedSHL(MachineOperand &Root, MachineOperand &Base, + MachineOperand &Offset, unsigned SizeInBytes, + bool WantsExt) const; ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const; ComplexRendererFns selectAddrModeXRO(MachineOperand &Root, unsigned SizeInBytes) const; @@ -213,6 +222,13 @@ private: return selectAddrModeXRO(Root, Width / 8); } + ComplexRendererFns selectAddrModeWRO(MachineOperand &Root, + unsigned SizeInBytes) const; + template <int Width> + ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const { + return selectAddrModeWRO(Root, Width / 8); + } + ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const; ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const { @@ -227,6 +243,15 @@ private: return selectShiftedRegister(Root); } + /// Given an extend instruction, determine the correct shift-extend type for + /// that instruction. + /// + /// If the instruction is going to be used in a load or store, pass + /// \p IsLoadStore = true. + AArch64_AM::ShiftExtendType + getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI, + bool IsLoadStore = false) const; + /// Instructions that accept extend modifiers like UXTW expect the register /// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a /// subregister copy if necessary. Return either ExtReg, or the result of the @@ -235,9 +260,12 @@ private: MachineIRBuilder &MIB) const; ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const; - void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const; - void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I) const; - void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I) const; + void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI, + int OpIdx = -1) const; + void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I, + int OpIdx = -1) const; + void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I, + int OpIdx = -1) const; // Materialize a GlobalValue or BlockAddress using a movz+movk sequence. void materializeLargeCMVal(MachineInstr &I, const Value *V, @@ -462,7 +490,7 @@ static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, } } else if (OpSize == 64) { switch (GenericOpc) { - case TargetOpcode::G_GEP: + case TargetOpcode::G_PTR_ADD: return AArch64::ADDXrr; case TargetOpcode::G_SHL: return AArch64::LSLVXr; @@ -1006,6 +1034,66 @@ bool AArch64InstructionSelector::selectCompareBranch( return true; } +/// Returns the element immediate value of a vector shift operand if found. +/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR. +static Optional<int64_t> getVectorShiftImm(Register Reg, + MachineRegisterInfo &MRI) { + assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand"); + MachineInstr *OpMI = MRI.getVRegDef(Reg); + assert(OpMI && "Expected to find a vreg def for vector shift operand"); + if (OpMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR) + return None; + + // Check all operands are identical immediates. 
+ int64_t ImmVal = 0; + for (unsigned Idx = 1; Idx < OpMI->getNumOperands(); ++Idx) { + auto VRegAndVal = getConstantVRegValWithLookThrough(OpMI->getOperand(Idx).getReg(), MRI); + if (!VRegAndVal) + return None; + + if (Idx == 1) + ImmVal = VRegAndVal->Value; + if (ImmVal != VRegAndVal->Value) + return None; + } + + return ImmVal; +} + +/// Matches and returns the shift immediate value for a SHL instruction given +/// a shift operand. +static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) { + Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI); + if (!ShiftImm) + return None; + // Check the immediate is in range for a SHL. + int64_t Imm = *ShiftImm; + if (Imm < 0) + return None; + switch (SrcTy.getElementType().getSizeInBits()) { + default: + LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift"); + return None; + case 8: + if (Imm > 7) + return None; + break; + case 16: + if (Imm > 15) + return None; + break; + case 32: + if (Imm > 31) + return None; + break; + case 64: + if (Imm > 63) + return None; + break; + } + return Imm; +} + bool AArch64InstructionSelector::selectVectorSHL( MachineInstr &I, MachineRegisterInfo &MRI) const { assert(I.getOpcode() == TargetOpcode::G_SHL); @@ -1017,21 +1105,29 @@ bool AArch64InstructionSelector::selectVectorSHL( if (!Ty.isVector()) return false; + // Check if we have a vector of constants on RHS that we can select as the + // immediate form. + Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI); + unsigned Opc = 0; if (Ty == LLT::vector(2, 64)) { - Opc = AArch64::USHLv2i64; + Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64; } else if (Ty == LLT::vector(4, 32)) { - Opc = AArch64::USHLv4i32; + Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32; } else if (Ty == LLT::vector(2, 32)) { - Opc = AArch64::USHLv2i32; + Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32; } else { LLVM_DEBUG(dbgs() << "Unhandled G_SHL type"); return false; } MachineIRBuilder MIB(I); - auto UShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Src2Reg}); - constrainSelectedInstRegOperands(*UShl, TII, TRI, RBI); + auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg}); + if (ImmVal) + Shl.addImm(*ImmVal); + else + Shl.addUse(Src2Reg); + constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI); I.eraseFromParent(); return true; } @@ -1765,7 +1861,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { auto *PtrMI = MRI.getVRegDef(PtrReg); // Try to fold a GEP into our unsigned immediate addressing mode. - if (PtrMI->getOpcode() == TargetOpcode::G_GEP) { + if (PtrMI->getOpcode() == TargetOpcode::G_PTR_ADD) { if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) { int64_t Imm = *COff; const unsigned Size = MemSizeInBits / 8; @@ -1883,7 +1979,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } - case TargetOpcode::G_GEP: { + case TargetOpcode::G_PTR_ADD: { MachineIRBuilder MIRBuilder(I); emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2), MIRBuilder); @@ -2065,14 +2161,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I) { unsigned DstSize = DstTy.getSizeInBits(); unsigned SrcSize = SrcTy.getSizeInBits(); + if (DstTy.isVector()) + return false; // Should be handled by imported patterns. 
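The per-element-size bounds in getVectorSHLImm above encode the rule that a vector SHL immediate must satisfy 0 <= Imm < element width. A compact standalone sketch (legalVectorSHLImm is a hypothetical helper):

#include <cstdint>
#include <optional>

// Matches the switch above: reject Imm > 7 for i8, > 15 for i16, > 31 for
// i32, > 63 for i64, anything negative, and unhandled element types.
static std::optional<int64_t> legalVectorSHLImm(int64_t Imm,
                                                unsigned EltSizeInBits) {
  switch (EltSizeInBits) {
  case 8:
  case 16:
  case 32:
  case 64:
    break;
  default:
    return std::nullopt; // unhandled element type
  }
  if (Imm < 0 || Imm >= static_cast<int64_t>(EltSizeInBits))
    return std::nullopt;
  return Imm;
}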
+ assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() == AArch64::GPRRegBankID && "Unexpected ext regbank"); MachineIRBuilder MIB(I); MachineInstr *ExtI; - if (DstTy.isVector()) - return false; // Should be handled by imported patterns. // First check if we're extending the result of a load which has a dest type // smaller than 32 bits, then this zext is redundant. GPR32 is the smallest @@ -3602,22 +3699,51 @@ bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const { return false; // The shuffle's second operand doesn't matter if the mask is all zero. - const Constant *Mask = I.getOperand(3).getShuffleMask(); - if (!isa<ConstantAggregateZero>(Mask)) + ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); + if (!all_of(Mask, [](int Elem) { return Elem == 0; })) return false; // We're done, now find out what kind of splat we need. LLT VecTy = MRI.getType(I.getOperand(0).getReg()); LLT EltTy = VecTy.getElementType(); - if (VecTy.getSizeInBits() != 128 || EltTy.getSizeInBits() < 32) { - LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 128b yet"); + if (EltTy.getSizeInBits() < 32) { + LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet"); return false; } bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID; - static const unsigned OpcTable[2][2] = { - {AArch64::DUPv4i32gpr, AArch64::DUPv2i64gpr}, - {AArch64::DUPv4i32lane, AArch64::DUPv2i64lane}}; - unsigned Opc = OpcTable[IsFP][EltTy.getSizeInBits() == 64]; + unsigned Opc = 0; + if (IsFP) { + switch (EltTy.getSizeInBits()) { + case 32: + if (VecTy.getNumElements() == 2) { + Opc = AArch64::DUPv2i32lane; + } else { + Opc = AArch64::DUPv4i32lane; + assert(VecTy.getNumElements() == 4); + } + break; + case 64: + assert(VecTy.getNumElements() == 2 && "Unexpected num elts"); + Opc = AArch64::DUPv2i64lane; + break; + } + } else { + switch (EltTy.getSizeInBits()) { + case 32: + if (VecTy.getNumElements() == 2) { + Opc = AArch64::DUPv2i32gpr; + } else { + Opc = AArch64::DUPv4i32gpr; + assert(VecTy.getNumElements() == 4); + } + break; + case 64: + assert(VecTy.getNumElements() == 2 && "Unexpected num elts"); + Opc = AArch64::DUPv2i64gpr; + break; + } + } + assert(Opc && "Did not compute an opcode for a dup"); // For FP splats, we need to widen the scalar reg via undef too. if (IsFP) { @@ -3652,15 +3778,12 @@ bool AArch64InstructionSelector::selectShuffleVector( const LLT Src1Ty = MRI.getType(Src1Reg); Register Src2Reg = I.getOperand(2).getReg(); const LLT Src2Ty = MRI.getType(Src2Reg); - const Constant *ShuffleMask = I.getOperand(3).getShuffleMask(); + ArrayRef<int> Mask = I.getOperand(3).getShuffleMask(); MachineBasicBlock &MBB = *I.getParent(); MachineFunction &MF = *MBB.getParent(); LLVMContext &Ctx = MF.getFunction().getContext(); - SmallVector<int, 8> Mask; - ShuffleVectorInst::getShuffleMask(ShuffleMask, Mask); - // G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if // it's originated from a <1 x T> type. Those should have been lowered into // G_BUILD_VECTOR earlier. @@ -4164,45 +4287,15 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg( [](MachineInstr &Use) { return Use.mayLoadOrStore(); }); } -/// This is used for computing addresses like this: -/// -/// ldr x1, [x2, x3, lsl #3] -/// -/// Where x2 is the base register, and x3 is an offset register. The shift-left -/// is a constant value specific to this load instruction. That is, we'll never -/// see anything other than a 3 here (which corresponds to the size of the -/// element being loaded.) 
InstructionSelector::ComplexRendererFns -AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( - MachineOperand &Root, unsigned SizeInBytes) const { - if (!Root.isReg()) - return None; - MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); - - // Make sure that the memory op is a valid size. - int64_t LegalShiftVal = Log2_32(SizeInBytes); - if (LegalShiftVal == 0) - return None; - - // We want to find something like this: - // - // val = G_CONSTANT LegalShiftVal - // shift = G_SHL off_reg val - // ptr = G_GEP base_reg shift - // x = G_LOAD ptr - // - // And fold it into this addressing mode: - // - // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] +AArch64InstructionSelector::selectExtendedSHL( + MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset, + unsigned SizeInBytes, bool WantsExt) const { + assert(Base.isReg() && "Expected base to be a register operand"); + assert(Offset.isReg() && "Expected offset to be a register operand"); - // Check if we can find the G_GEP. - MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI); - if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI)) - return None; - - // Now, try to match an opcode which will match our specific offset. - // We want a G_SHL or a G_MUL. - MachineInstr *OffsetInst = getDefIgnoringCopies(Gep->getOperand(2).getReg(), MRI); + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg()); if (!OffsetInst) return None; @@ -4210,6 +4303,10 @@ AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL) return None; + // Make sure that the memory op is a valid size. + int64_t LegalShiftVal = Log2_32(SizeInBytes); + if (LegalShiftVal == 0) + return None; if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) return None; @@ -4254,27 +4351,82 @@ AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( if (ImmVal != LegalShiftVal) return None; + unsigned SignExtend = 0; + if (WantsExt) { + // Check if the offset is defined by an extend. + MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI); + auto Ext = getExtendTypeForInst(*ExtInst, MRI, true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + + SignExtend = Ext == AArch64_AM::SXTW; + + // Need a 32-bit wide register here. + MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg())); + OffsetReg = ExtInst->getOperand(1).getReg(); + OffsetReg = narrowExtendRegIfNeeded(OffsetReg, MIB); + } + // We can use the LHS of the GEP as the base, and the LHS of the shift as an // offset. Signify that we are shifting by setting the shift flag to 1. - return {{[=](MachineInstrBuilder &MIB) { - MIB.addUse(Gep->getOperand(1).getReg()); - }, + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); }, [=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); }, [=](MachineInstrBuilder &MIB) { // Need to add both immediates here to make sure that they are both // added to the instruction. - MIB.addImm(0); + MIB.addImm(SignExtend); MIB.addImm(1); }}}; } /// This is used for computing addresses like this: /// +/// ldr x1, [x2, x3, lsl #3] +/// +/// Where x2 is the base register, and x3 is an offset register. The shift-left +/// is a constant value specific to this load instruction. That is, we'll never +/// see anything other than a 3 here (which corresponds to the size of the +/// element being loaded.) 
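As the comment says, the only shift that can be folded into this addressing mode is log2 of the access size in bytes: an 8-byte load folds lsl #3, a 4-byte load lsl #2, and 1-byte accesses are not worth folding. A minimal sketch of that relation (foldableShiftForAccess is a hypothetical stand-in for the Log2_32-based check):

#include <cstdint>
#include <optional>

// Only power-of-two access sizes larger than one byte yield a foldable
// shift amount; rejecting size 1 mirrors the LegalShiftVal == 0 bail-out
// in the patch.
static std::optional<int64_t> foldableShiftForAccess(unsigned SizeInBytes) {
  if (SizeInBytes < 2 || (SizeInBytes & (SizeInBytes - 1)) != 0)
    return std::nullopt;
  int64_t LegalShiftVal = 0;
  for (unsigned S = SizeInBytes; S > 1; S >>= 1)
    ++LegalShiftVal;
  return LegalShiftVal; // e.g. an 8-byte access gives 3, matching lsl #3
}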
+InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeShiftedExtendXReg( + MachineOperand &Root, unsigned SizeInBytes) const { + if (!Root.isReg()) + return None; + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + // We want to find something like this: + // + // val = G_CONSTANT LegalShiftVal + // shift = G_SHL off_reg val + // ptr = G_PTR_ADD base_reg shift + // x = G_LOAD ptr + // + // And fold it into this addressing mode: + // + // ldr x, [base_reg, off_reg, lsl #LegalShiftVal] + + // Check if we can find the G_PTR_ADD. + MachineInstr *PtrAdd = + getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); + if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) + return None; + + // Now, try to match an opcode which will match our specific offset. + // We want a G_SHL or a G_MUL. + MachineInstr *OffsetInst = + getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI); + return selectExtendedSHL(Root, PtrAdd->getOperand(1), + OffsetInst->getOperand(0), SizeInBytes, + /*WantsExt=*/false); +} + +/// This is used for computing addresses like this: +/// /// ldr x1, [x2, x3] /// /// Where x2 is the base register, and x3 is an offset register. /// -/// When possible (or profitable) to fold a G_GEP into the address calculation, +/// When possible (or profitable) to fold a G_PTR_ADD into the address calculation, /// this will do so. Otherwise, it will return None. InstructionSelector::ComplexRendererFns AArch64InstructionSelector::selectAddrModeRegisterOffset( @@ -4283,7 +4435,7 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset( // We need a GEP. MachineInstr *Gep = MRI.getVRegDef(Root.getReg()); - if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP) + if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD) return None; // If this is used more than once, let's not bother folding. @@ -4329,6 +4481,74 @@ AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root, return selectAddrModeRegisterOffset(Root); } +/// This is used for computing addresses like this: +/// +/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal] +/// +/// Where we have a 64-bit base register, a 32-bit offset register, and an +/// extend (which may or may not be signed). +InstructionSelector::ComplexRendererFns +AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root, + unsigned SizeInBytes) const { + MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo(); + + MachineInstr *PtrAdd = + getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI); + if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI)) + return None; + + MachineOperand &LHS = PtrAdd->getOperand(1); + MachineOperand &RHS = PtrAdd->getOperand(2); + MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI); + + // The first case is the same as selectAddrModeXRO, except we need an extend. + // In this case, we try to find a shift and extend, and fold them into the + // addressing mode. + // + // E.g. + // + // off_reg = G_Z/S/ANYEXT ext_reg + // val = G_CONSTANT LegalShiftVal + // shift = G_SHL off_reg val + // ptr = G_PTR_ADD base_reg shift + // x = G_LOAD ptr + // + // In this case we can get a load like this: + // + // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal] + auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0), + SizeInBytes, /*WantsExt=*/true); + if (ExtendedShl) + return ExtendedShl; + + // There was no shift. We can try and fold a G_Z/S/ANYEXT in alone though. + // + // e.g. 
+ // ldr something, [base_reg, ext_reg, sxtw] + if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI)) + return None; + + // Check if this is an extend. We'll get an extend type if it is. + AArch64_AM::ShiftExtendType Ext = + getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true); + if (Ext == AArch64_AM::InvalidShiftExtend) + return None; + + // Need a 32-bit wide register. + MachineIRBuilder MIB(*PtrAdd); + Register ExtReg = + narrowExtendRegIfNeeded(OffsetInst->getOperand(1).getReg(), MIB); + unsigned SignExtend = Ext == AArch64_AM::SXTW; + + // Base is LHS, offset is ExtReg. + return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); }, + [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); }, + [=](MachineInstrBuilder &MIB) { + MIB.addImm(SignExtend); + MIB.addImm(0); + }}}; +} + /// Select a "register plus unscaled signed 9-bit immediate" address. This /// should only match when there is an offset that is not valid for a scaled /// immediate addressing mode. The "Size" argument is the size in bytes of the @@ -4491,9 +4711,8 @@ AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const { [=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}}; } -/// Get the correct ShiftExtendType for an extend instruction. -static AArch64_AM::ShiftExtendType -getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI) { +AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst( + MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const { unsigned Opc = MI.getOpcode(); // Handle explicit extend instructions first. @@ -4540,9 +4759,9 @@ getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI) { default: return AArch64_AM::InvalidShiftExtend; case 0xFF: - return AArch64_AM::UXTB; + return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend; case 0xFFFF: - return AArch64_AM::UXTH; + return !IsLoadStore ? 
AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend; case 0xFFFFFFFF: return AArch64_AM::UXTW; } @@ -4632,25 +4851,29 @@ AArch64InstructionSelector::selectArithExtendedRegister( } void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB, - const MachineInstr &MI) const { + const MachineInstr &MI, + int OpIdx) const { const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); - assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI); assert(CstVal && "Expected constant value"); MIB.addImm(CstVal.getValue()); } void AArch64InstructionSelector::renderLogicalImm32( - MachineInstrBuilder &MIB, const MachineInstr &I) const { - assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32); MIB.addImm(Enc); } void AArch64InstructionSelector::renderLogicalImm64( - MachineInstrBuilder &MIB, const MachineInstr &I) const { - assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const { + assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 && + "Expected G_CONSTANT"); uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue(); uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64); MIB.addImm(Enc); diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 7a1901bd5b1e..95719a35c6da 100644 --- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -59,7 +59,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { } getActionDefinitionsBuilder(G_IMPLICIT_DEF) - .legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64}) + .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64}) .clampScalar(0, s1, s64) .widenScalarToNextPow2(0, 8) .fewerElementsIf( @@ -104,7 +104,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { .moreElementsToNextPow2(0) .minScalarSameAs(1, 0); - getActionDefinitionsBuilder(G_GEP) + getActionDefinitionsBuilder(G_PTR_ADD) .legalFor({{p0, s64}}) .clampScalar(1, s64, s64); @@ -143,7 +143,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) { getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64}); getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO}) - .legalFor({{s32, s1}, {s64, s1}}); + .legalFor({{s32, s1}, {s64, s1}}) + .minScalar(0, s32); getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG}) .legalFor({s32, s64, v2s64, v4s32, v2s32}); @@ -743,7 +744,7 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, // Realign the list to the actual required alignment. 
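The IsLoadStore parameter threaded through getExtendTypeForInst above changes which implicit zero-extends (written as an AND with a mask) are acceptable: uxtb and uxth cannot be encoded in a register-offset load/store, only uxtw can. A standalone sketch of that mapping, with invented enum and function names:

#include <cstdint>

enum class Extend { UXTB, UXTH, UXTW, SXTW, Invalid };

// An AND with one of these masks behaves like a zero-extend of the masked
// width. For load/store addressing only the 32-bit extend is encodable, so the
// narrower forms are rejected when IsLoadStore is set; sxtw comes from an
// explicit sign-extend rather than a mask and stays valid either way.
static Extend extendForMask(uint64_t Mask, bool IsLoadStore) {
  switch (Mask) {
  case 0xFF:       return IsLoadStore ? Extend::Invalid : Extend::UXTB;
  case 0xFFFF:     return IsLoadStore ? Extend::Invalid : Extend::UXTH;
  case 0xFFFFFFFF: return Extend::UXTW;
  default:         return Extend::Invalid;
  }
}

// In the selectAddrModeWRO path the chosen extend then collapses to a single
// sign-extend flag in the addressing-mode immediate: 1 for sxtw, 0 otherwise.
static unsigned signExtendBit(Extend E) { return E == Extend::SXTW ? 1u : 0u; }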
auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1); - auto ListTmp = MIRBuilder.buildGEP(PtrTy, List, AlignMinus1.getReg(0)); + auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0)); DstPtr = MRI.createGenericVirtualRegister(PtrTy); MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align)); @@ -758,7 +759,7 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrSize)); - auto NewList = MIRBuilder.buildGEP(PtrTy, DstPtr, Size.getReg(0)); + auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0)); MIRBuilder.buildStore( NewList, ListPtr, diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp index a0c4a25bb5b9..3156bb446963 100644 --- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp +++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp @@ -26,16 +26,19 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/MC/MCRegisterInfo.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include <cassert> #include <cstdint> +#include <functional> #include <iterator> #include <limits> @@ -51,6 +54,9 @@ STATISTIC(NumUnscaledPairCreated, STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted"); STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted"); +DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming", + "Controls which pairs are considered for renaming"); + // The LdStLimit limits how far we search for load/store pairs. static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit", cl::init(20), cl::Hidden); @@ -76,6 +82,11 @@ using LdStPairFlags = struct LdStPairFlags { // to be extended, 0 means I, and 1 means the returned iterator. int SExtIdx = -1; + // If not none, RenameReg can be used to rename the result register of the + // first store in a pair. Currently this only works when merging stores + // forward. + Optional<MCPhysReg> RenameReg = None; + LdStPairFlags() = default; void setMergeForward(bool V = true) { MergeForward = V; } @@ -83,6 +94,10 @@ using LdStPairFlags = struct LdStPairFlags { void setSExtIdx(int V) { SExtIdx = V; } int getSExtIdx() const { return SExtIdx; } + + void setRenameReg(MCPhysReg R) { RenameReg = R; } + void clearRenameReg() { RenameReg = None; } + Optional<MCPhysReg> getRenameReg() const { return RenameReg; } }; struct AArch64LoadStoreOpt : public MachineFunctionPass { @@ -99,6 +114,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass { // Track which register units have been modified and used. LiveRegUnits ModifiedRegUnits, UsedRegUnits; + LiveRegUnits DefinedInBB; void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AAResultsWrapperPass>(); @@ -215,69 +231,6 @@ static bool isTagStore(const MachineInstr &MI) { } } -// Scaling factor for unscaled load or store. 
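The buildPtrAdd plus pointer-mask pair used by legalizeVaArg above is the usual align-up idiom. A standalone sketch on a plain integer address, assuming (as the generated MIR does) a power-of-two alignment:

#include <cassert>
#include <cstdint>

// Rounds Addr up to the next multiple of Align: add Align-1, then clear the
// low Log2(Align) bits, which is what the pointer-add and pointer-mask
// sequence built above expresses on the pointer value.
static uint64_t alignUp(uint64_t Addr, uint64_t Align) {
  assert(Align != 0 && (Align & (Align - 1)) == 0 && "expected power of two");
  return (Addr + Align - 1) & ~(Align - 1);
}
// e.g. alignUp(0x1001, 16) == 0x1010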
-static int getMemScale(const MachineInstr &MI) { - switch (MI.getOpcode()) { - default: - llvm_unreachable("Opcode has unknown scale!"); - case AArch64::LDRBBui: - case AArch64::LDURBBi: - case AArch64::LDRSBWui: - case AArch64::LDURSBWi: - case AArch64::STRBBui: - case AArch64::STURBBi: - return 1; - case AArch64::LDRHHui: - case AArch64::LDURHHi: - case AArch64::LDRSHWui: - case AArch64::LDURSHWi: - case AArch64::STRHHui: - case AArch64::STURHHi: - return 2; - case AArch64::LDRSui: - case AArch64::LDURSi: - case AArch64::LDRSWui: - case AArch64::LDURSWi: - case AArch64::LDRWui: - case AArch64::LDURWi: - case AArch64::STRSui: - case AArch64::STURSi: - case AArch64::STRWui: - case AArch64::STURWi: - case AArch64::LDPSi: - case AArch64::LDPSWi: - case AArch64::LDPWi: - case AArch64::STPSi: - case AArch64::STPWi: - return 4; - case AArch64::LDRDui: - case AArch64::LDURDi: - case AArch64::LDRXui: - case AArch64::LDURXi: - case AArch64::STRDui: - case AArch64::STURDi: - case AArch64::STRXui: - case AArch64::STURXi: - case AArch64::LDPDi: - case AArch64::LDPXi: - case AArch64::STPDi: - case AArch64::STPXi: - return 8; - case AArch64::LDRQui: - case AArch64::LDURQi: - case AArch64::STRQui: - case AArch64::STURQi: - case AArch64::LDPQi: - case AArch64::STPQi: - case AArch64::STGOffset: - case AArch64::STZGOffset: - case AArch64::ST2GOffset: - case AArch64::STZ2GOffset: - case AArch64::STGPi: - return 16; - } -} - static unsigned getMatchingNonSExtOpcode(unsigned Opc, bool *IsValidLdStrOpc = nullptr) { if (IsValidLdStrOpc) @@ -588,7 +541,7 @@ static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, // ST*G and all paired ldst have the same scale in pre/post-indexed variants // as in the "unsigned offset" variant. // All other pre/post indexed ldst instructions are unscaled. - Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1; + Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1; if (IsPaired) { MinOffset = -64; @@ -599,8 +552,8 @@ static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale, } } -static const MachineOperand &getLdStRegOp(const MachineInstr &MI, - unsigned PairedRegOp = 0) { +static MachineOperand &getLdStRegOp(MachineInstr &MI, + unsigned PairedRegOp = 0) { assert(PairedRegOp < 2 && "Unexpected register operand idx."); unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0; return MI.getOperand(Idx); @@ -620,8 +573,8 @@ static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst, MachineInstr &StoreInst, const AArch64InstrInfo *TII) { assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st."); - int LoadSize = getMemScale(LoadInst); - int StoreSize = getMemScale(StoreInst); + int LoadSize = TII->getMemScale(LoadInst); + int StoreSize = TII->getMemScale(StoreInst); int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst) ? getLdStOffsetOp(StoreInst).getImm() : getLdStOffsetOp(StoreInst).getImm() * StoreSize; @@ -731,7 +684,7 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, unsigned Opc = I->getOpcode(); bool IsScaled = !TII->isUnscaledLdSt(Opc); - int OffsetStride = IsScaled ? 1 : getMemScale(*I); + int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I); bool MergeForward = Flags.getMergeForward(); // Insert our new paired instruction after whichever of the paired @@ -783,6 +736,44 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I, return NextI; } +// Apply Fn to all instructions between MI and the beginning of the block, until +// a def for DefReg is reached. 
Returns true, iff Fn returns true for all +// visited instructions. Stop after visiting Limit iterations. +static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg, + const TargetRegisterInfo *TRI, unsigned Limit, + std::function<bool(MachineInstr &, bool)> &Fn) { + auto MBB = MI.getParent(); + for (MachineBasicBlock::reverse_iterator I = MI.getReverseIterator(), + E = MBB->rend(); + I != E; I++) { + if (!Limit) + return false; + --Limit; + + bool isDef = any_of(I->operands(), [DefReg, TRI](MachineOperand &MOP) { + return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() && + TRI->regsOverlap(MOP.getReg(), DefReg); + }); + if (!Fn(*I, isDef)) + return false; + if (isDef) + break; + } + return true; +} + +static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units, + const TargetRegisterInfo *TRI) { + + for (const MachineOperand &MOP : phys_regs_and_masks(MI)) + if (MOP.isReg() && MOP.isKill()) + Units.removeReg(MOP.getReg()); + + for (const MachineOperand &MOP : phys_regs_and_masks(MI)) + if (MOP.isReg() && !MOP.isKill()) + Units.addReg(MOP.getReg()); +} + MachineBasicBlock::iterator AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, MachineBasicBlock::iterator Paired, @@ -800,9 +791,76 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, unsigned Opc = SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode()); bool IsUnscaled = TII->isUnscaledLdSt(Opc); - int OffsetStride = IsUnscaled ? getMemScale(*I) : 1; + int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1; bool MergeForward = Flags.getMergeForward(); + + Optional<MCPhysReg> RenameReg = Flags.getRenameReg(); + if (MergeForward && RenameReg) { + MCRegister RegToRename = getLdStRegOp(*I).getReg(); + DefinedInBB.addReg(*RenameReg); + + // Return the sub/super register for RenameReg, matching the size of + // OriginalReg. + auto GetMatchingSubReg = [this, + RenameReg](MCPhysReg OriginalReg) -> MCPhysReg { + for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg)) + if (TRI->getMinimalPhysRegClass(OriginalReg) == + TRI->getMinimalPhysRegClass(SubOrSuper)) + return SubOrSuper; + llvm_unreachable("Should have found matching sub or super register!"); + }; + + std::function<bool(MachineInstr &, bool)> UpdateMIs = + [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) { + if (IsDef) { + bool SeenDef = false; + for (auto &MOP : MI.operands()) { + // Rename the first explicit definition and all implicit + // definitions matching RegToRename. + if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() && + (!SeenDef || (MOP.isDef() && MOP.isImplicit())) && + TRI->regsOverlap(MOP.getReg(), RegToRename)) { + assert((MOP.isImplicit() || + (MOP.isRenamable() && !MOP.isEarlyClobber())) && + "Need renamable operands"); + MOP.setReg(GetMatchingSubReg(MOP.getReg())); + SeenDef = true; + } + } + } else { + for (auto &MOP : MI.operands()) { + if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() && + TRI->regsOverlap(MOP.getReg(), RegToRename)) { + assert((MOP.isImplicit() || + (MOP.isRenamable() && !MOP.isEarlyClobber())) && + "Need renamable operands"); + MOP.setReg(GetMatchingSubReg(MOP.getReg())); + } + } + } + LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n"); + return true; + }; + forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs); + +#if !defined(NDEBUG) + // Make sure the register used for renaming is not used between the paired + // instructions. That would trash the content before the new paired + // instruction. 
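forAllMIsUntilDef above walks backwards from the store, handing each instruction to a callback together with a flag saying whether it defines the register of interest, and stops at that def or once the scan limit is exhausted. A simplified standalone model over a plain instruction list; every type and name below is an invented stand-in:

#include <cstddef>
#include <functional>
#include <vector>

struct SketchInst {
  std::vector<int> Defs; // registers written by this instruction
};

// Visit Block[StartIdx], Block[StartIdx-1], ..., calling Fn(I, IsDef) until
// the defining instruction has been visited, the limit runs out, or Fn
// rejects an instruction. Returns true iff Fn accepted everything visited.
static bool forAllUntilDef(const std::vector<SketchInst> &Block,
                           std::size_t StartIdx, int Reg, unsigned Limit,
                           const std::function<bool(const SketchInst &, bool)> &Fn) {
  for (std::size_t I = StartIdx + 1; I-- > 0;) {
    if (Limit == 0)
      return false;
    --Limit;
    bool IsDef = false;
    for (int D : Block[I].Defs)
      IsDef |= (D == Reg);
    if (!Fn(Block[I], IsDef))
      return false;
    if (IsDef)
      break; // renaming only needs to reach back to the previous def
  }
  return true;
}

The pass uses this shape twice: once to prove every operand in the range can be renamed, and once to actually rewrite the operands.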
+ for (auto &MI : + iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>( + std::next(I), std::next(Paired))) + assert(all_of(MI.operands(), + [this, &RenameReg](const MachineOperand &MOP) { + return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() || + !TRI->regsOverlap(MOP.getReg(), *RenameReg); + }) && + "Rename register used between paired instruction, trashing the " + "content"); +#endif + } + // Insert our new paired instruction after whichever of the paired // instructions MergeForward indicates. MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I; @@ -818,11 +876,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, // We're trying to pair instructions that differ in how they are scaled. If // I is scaled then scale the offset of Paired accordingly. Otherwise, do // the opposite (i.e., make Paired's offset unscaled). - int MemSize = getMemScale(*Paired); + int MemSize = TII->getMemScale(*Paired); if (PairedIsUnscaled) { // If the unscaled offset isn't a multiple of the MemSize, we can't // pair the operations together. - assert(!(PairedOffset % getMemScale(*Paired)) && + assert(!(PairedOffset % TII->getMemScale(*Paired)) && "Offset should be a multiple of the stride!"); PairedOffset /= MemSize; } else { @@ -847,9 +905,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, int OffsetImm = getLdStOffsetOp(*RtMI).getImm(); // Scale the immediate offset, if necessary. if (TII->isUnscaledLdSt(RtMI->getOpcode())) { - assert(!(OffsetImm % getMemScale(*RtMI)) && + assert(!(OffsetImm % TII->getMemScale(*RtMI)) && "Unscaled offset cannot be scaled."); - OffsetImm /= getMemScale(*RtMI); + OffsetImm /= TII->getMemScale(*RtMI); } // Construct the new instruction. @@ -931,6 +989,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I, } LLVM_DEBUG(dbgs() << "\n"); + if (MergeForward) + for (const MachineOperand &MOP : phys_regs_and_masks(*I)) + if (MOP.isReg() && MOP.isKill()) + DefinedInBB.addReg(MOP.getReg()); + // Erase the old instructions. I->eraseFromParent(); Paired->eraseFromParent(); @@ -944,8 +1007,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI, MachineBasicBlock::iterator NextI = LoadI; ++NextI; - int LoadSize = getMemScale(*LoadI); - int StoreSize = getMemScale(*StoreI); + int LoadSize = TII->getMemScale(*LoadI); + int StoreSize = TII->getMemScale(*StoreI); Register LdRt = getLdStRegOp(*LoadI).getReg(); const MachineOperand &StMO = getLdStRegOp(*StoreI); Register StRt = getLdStRegOp(*StoreI).getReg(); @@ -1207,6 +1270,148 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI, // FIXME: Can we also match a mixed sext/zext unscaled/scaled pair? } +static bool +canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween, + SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, + const TargetRegisterInfo *TRI) { + if (!FirstMI.mayStore()) + return false; + + // Check if we can find an unused register which we can use to rename + // the register used by the first load/store. + auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg()); + MachineFunction &MF = *FirstMI.getParent()->getParent(); + if (!RegClass || !MF.getRegInfo().tracksLiveness()) + return false; + + auto RegToRename = getLdStRegOp(FirstMI).getReg(); + // For now, we only rename if the store operand gets killed at the store. 
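The PairedOffset and OffsetImm adjustments above convert between the two immediate encodings: scaled LDR/STR immediates count in units of the access size, unscaled LDUR/STUR immediates count in bytes. A standalone sketch of the conversion and its legality condition, with MemScale standing in for the value returned by AArch64InstrInfo::getMemScale:

#include <cstdint>
#include <optional>

// A byte offset can only be re-expressed in the scaled form when it is an
// exact multiple of the access size.
static std::optional<int64_t> toScaledOffset(int64_t ByteOffset,
                                             int64_t MemScale) {
  if (ByteOffset % MemScale != 0)
    return std::nullopt;
  return ByteOffset / MemScale;
}

// Going the other way is always representable.
static int64_t toByteOffset(int64_t ScaledOffset, int64_t MemScale) {
  return ScaledOffset * MemScale;
}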
+ if (!getLdStRegOp(FirstMI).isKill() && + !any_of(FirstMI.operands(), + [TRI, RegToRename](const MachineOperand &MOP) { + return MOP.isReg() && !MOP.isDebug() && MOP.getReg() && + MOP.isImplicit() && MOP.isKill() && + TRI->regsOverlap(RegToRename, MOP.getReg()); + })) { + LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI << "\n"); + return false; + } + auto canRenameMOP = [](const MachineOperand &MOP) { + return MOP.isImplicit() || + (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied()); + }; + + bool FoundDef = false; + + // For each instruction between FirstMI and the previous def for RegToRename, + // we + // * check if we can rename RegToRename in this instruction + // * collect the registers used and required register classes for RegToRename. + std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI, + bool IsDef) { + LLVM_DEBUG(dbgs() << "Checking " << MI << "\n"); + // Currently we do not try to rename across frame-setup instructions. + if (MI.getFlag(MachineInstr::FrameSetup)) { + LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions currently (" + << MI << ")\n"); + return false; + } + + UsedInBetween.accumulate(MI); + + // For a definition, check that we can rename the definition and exit the + // loop. + FoundDef = IsDef; + + // For defs, check if we can rename the first def of RegToRename. + if (FoundDef) { + for (auto &MOP : MI.operands()) { + if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() || + !TRI->regsOverlap(MOP.getReg(), RegToRename)) + continue; + if (!canRenameMOP(MOP)) { + LLVM_DEBUG(dbgs() + << " Cannot rename " << MOP << " in " << MI << "\n"); + return false; + } + RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg())); + } + return true; + } else { + for (auto &MOP : MI.operands()) { + if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() || + !TRI->regsOverlap(MOP.getReg(), RegToRename)) + continue; + + if (!canRenameMOP(MOP)) { + LLVM_DEBUG(dbgs() + << " Cannot rename " << MOP << " in " << MI << "\n"); + return false; + } + RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg())); + } + } + return true; + }; + + if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs)) + return false; + + if (!FoundDef) { + LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n"); + return false; + } + return true; +} + +// Check if we can find a physical register for renaming. This register must: +// * not be defined up to FirstMI (checking DefinedInBB) +// * not used between the MI and the defining instruction of the register to +// rename (checked using UsedInBetween). +// * is available in all used register classes (checked using RequiredClasses). +static Optional<MCPhysReg> tryToFindRegisterToRename( + MachineInstr &FirstMI, MachineInstr &MI, LiveRegUnits &DefinedInBB, + LiveRegUnits &UsedInBetween, + SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses, + const TargetRegisterInfo *TRI) { + auto &MF = *FirstMI.getParent()->getParent(); + MachineRegisterInfo &RegInfo = MF.getRegInfo(); + + // Checks if any sub- or super-register of PR is callee saved. + auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) { + return any_of(TRI->sub_and_superregs_inclusive(PR), + [&MF, TRI](MCPhysReg SubOrSuper) { + return TRI->isCalleeSavedPhysReg(SubOrSuper, MF); + }); + }; + + // Check if PR or one of its sub- or super-registers can be used for all + // required register classes. 
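The canRenameMOP lambda above reduces to a small predicate on operand flags; frame-setup instructions and store operands that are not killed at the store are rejected before it is ever consulted. A standalone restatement, with the struct acting as a stand-in for MachineOperand:

struct OperandFlags {
  bool IsImplicit;
  bool IsRenamable;
  bool IsEarlyClobber;
  bool IsTied;
};

// Implicit operands simply follow a rename of the underlying register;
// explicit operands must be marked renamable and be neither early-clobber
// nor tied to another operand.
static bool canRenameOperand(const OperandFlags &Op) {
  return Op.IsImplicit ||
         (Op.IsRenamable && !Op.IsEarlyClobber && !Op.IsTied);
}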
+ auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) { + return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) { + return any_of(TRI->sub_and_superregs_inclusive(PR), + [C, TRI](MCPhysReg SubOrSuper) { + return C == TRI->getMinimalPhysRegClass(SubOrSuper); + }); + }); + }; + + auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg()); + for (const MCPhysReg &PR : *RegClass) { + if (DefinedInBB.available(PR) && UsedInBetween.available(PR) && + !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) && + CanBeUsedForAllClasses(PR)) { + DefinedInBB.addReg(PR); + LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI) + << "\n"); + return {PR}; + } + } + LLVM_DEBUG(dbgs() << "No rename register found from " + << TRI->getRegClassName(RegClass) << "\n"); + return None; +} + /// Scan the instructions looking for a load/store that can be combined with the /// current instruction into a wider equivalent or a load/store pair. MachineBasicBlock::iterator @@ -1215,6 +1420,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, bool FindNarrowMerge) { MachineBasicBlock::iterator E = I->getParent()->end(); MachineBasicBlock::iterator MBBI = I; + MachineBasicBlock::iterator MBBIWithRenameReg; MachineInstr &FirstMI = *I; ++MBBI; @@ -1223,9 +1429,16 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, Register Reg = getLdStRegOp(FirstMI).getReg(); Register BaseReg = getLdStBaseOp(FirstMI).getReg(); int Offset = getLdStOffsetOp(FirstMI).getImm(); - int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1; + int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1; bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI); + Optional<bool> MaybeCanRename = None; + SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses; + LiveRegUnits UsedInBetween; + UsedInBetween.init(*TRI); + + Flags.clearRenameReg(); + // Track which register units have been modified and used between the first // insn (inclusive) and the second insn. ModifiedRegUnits.clear(); @@ -1237,6 +1450,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) { MachineInstr &MI = *MBBI; + UsedInBetween.accumulate(MI); + // Don't count transient instructions towards the search limit since there // may be different numbers of them if e.g. debug information is present. if (!MI.isTransient()) @@ -1259,7 +1474,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // We're trying to pair instructions that differ in how they are scaled. // If FirstMI is scaled then scale the offset of MI accordingly. // Otherwise, do the opposite (i.e., make MI's offset unscaled). - int MemSize = getMemScale(MI); + int MemSize = TII->getMemScale(MI); if (MIIsUnscaled) { // If the unscaled offset isn't a multiple of the MemSize, we can't // pair the operations together: bail and keep looking. @@ -1329,7 +1544,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, !(MI.mayLoad() && !UsedRegUnits.available(getLdStRegOp(MI).getReg())) && !mayAlias(MI, MemInsns, AA)) { + Flags.setMergeForward(false); + Flags.clearRenameReg(); return MBBI; } @@ -1337,18 +1554,41 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I, // between the two instructions and none of the instructions between the // first and the second alias with the first, we can combine the first // into the second. 
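tryToFindRegisterToRename above then scans the register class for a candidate that is free on every axis. A standalone sketch with bitsets standing in for the LiveRegUnits sets and the TRI/MRI queries; all names and the register count are invented:

#include <bitset>
#include <cstddef>
#include <optional>
#include <vector>

constexpr std::size_t NumRegs = 64; // hypothetical number of registers

// A candidate is usable when it is not already defined earlier in the block,
// not used between the previous def and the store, not reserved, and not
// callee-saved (the real pass additionally checks sub/super-registers and
// that the candidate fits every register class collected during the scan).
static std::optional<unsigned>
pickRenameRegister(const std::vector<unsigned> &Candidates,
                   const std::bitset<NumRegs> &DefinedInBB,
                   const std::bitset<NumRegs> &UsedInBetween,
                   const std::bitset<NumRegs> &Reserved,
                   const std::bitset<NumRegs> &CalleeSaved) {
  for (unsigned R : Candidates)
    if (!DefinedInBB[R] && !UsedInBetween[R] && !Reserved[R] &&
        !CalleeSaved[R])
      return R;
  return std::nullopt;
}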
- if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) && - !(MayLoad && + if (!(MayLoad && !UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) && !mayAlias(FirstMI, MemInsns, AA)) { - Flags.setMergeForward(true); - return MBBI; + + if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) { + Flags.setMergeForward(true); + Flags.clearRenameReg(); + return MBBI; + } + + if (DebugCounter::shouldExecute(RegRenamingCounter)) { + if (!MaybeCanRename) + MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween, + RequiredClasses, TRI)}; + + if (*MaybeCanRename) { + Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename( + FirstMI, MI, DefinedInBB, UsedInBetween, RequiredClasses, + TRI); + if (MaybeRenameReg) { + Flags.setRenameReg(*MaybeRenameReg); + Flags.setMergeForward(true); + MBBIWithRenameReg = MBBI; + } + } + } } // Unable to combine these instructions due to interference in between. // Keep looking. } } + if (Flags.getRenameReg()) + return MBBIWithRenameReg; + // If the instruction wasn't a matching load or store. Stop searching if we // encounter a call instruction that might modify memory. if (MI.isCall()) @@ -1492,7 +1732,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward( MachineBasicBlock::iterator MBBI = I; Register BaseReg = getLdStBaseOp(MemMI).getReg(); - int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI); + int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI); // Scan forward looking for post-index opportunities. Updating instructions // can't be formed if the memory instruction doesn't have the offset we're @@ -1663,7 +1903,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { // with Offset-1) bool IsUnscaled = TII->isUnscaledLdSt(MI); int Offset = getLdStOffsetOp(MI).getImm(); - int OffsetStride = IsUnscaled ? getMemScale(MI) : 1; + int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1; // Allow one more for offset. if (Offset > 0) Offset -= OffsetStride; @@ -1680,7 +1920,13 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) { ++NumUnscaledPairCreated; // Keeping the iterator straight is a pain, so we let the merge routine tell // us what the next instruction is after it's done mucking about. + auto Prev = std::prev(MBBI); MBBI = mergePairedInsns(MBBI, Paired, Flags); + // Collect liveness info for instructions between Prev and the new position + // MBBI. + for (auto I = std::next(Prev); I != MBBI; I++) + updateDefinedRegisters(*I, DefinedInBB, TRI); + return true; } return false; @@ -1723,7 +1969,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate // The immediate in the load/store is scaled by the size of the memory // operation. The immediate in the add we're looking for, // however, is not, so adjust here. - int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); + int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI); // Look forward to try to find a pre-index instruction. 
For example, // ldr x1, [x0, #64] @@ -1742,6 +1988,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { + bool Modified = false; // Four tranformations to do here: // 1) Find loads that directly read from stores and promote them by @@ -1786,8 +2033,17 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, // ldr x1, [x2, #8] // ; becomes // ldp x0, x1, [x2] + + if (MBB.getParent()->getRegInfo().tracksLiveness()) { + DefinedInBB.clear(); + DefinedInBB.addLiveIns(MBB); + } + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;) { + // Track currently live registers up to this point, to help with + // searching for a rename register on demand. + updateDefinedRegisters(*MBBI, DefinedInBB, TRI); if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI)) Modified = true; else @@ -1825,11 +2081,14 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) { // or store. ModifiedRegUnits.init(*TRI); UsedRegUnits.init(*TRI); + DefinedInBB.init(*TRI); bool Modified = false; bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign(); - for (auto &MBB : Fn) - Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt); + for (auto &MBB : Fn) { + auto M = optimizeBlock(MBB, enableNarrowZeroStOpt); + Modified |= M; + } return Modified; } diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 0009fb7b5520..6ddb3fdb0046 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/TargetFrameLowering.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCLinkerOptimizationHint.h" #include <cassert> @@ -51,10 +52,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool HasStackFrame = false; /// Amount of stack frame size, not including callee-saved registers. - unsigned LocalStackSize; + uint64_t LocalStackSize = 0; + + /// The start and end frame indices for the SVE callee saves. + int MinSVECSFrameIndex = 0; + int MaxSVECSFrameIndex = 0; /// Amount of stack frame size used for saving callee-saved registers. - unsigned CalleeSavedStackSize; + unsigned CalleeSavedStackSize = 0; + unsigned SVECalleeSavedStackSize = 0; + bool HasCalleeSavedStackSize = false; /// Number of TLS accesses using the special (combinable) /// _TLS_MODULE_BASE_ symbol. @@ -117,7 +124,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { // Offset from SP-at-entry to the tagged base pointer. // Tagged base pointer is set up to point to the first (lowest address) tagged // stack slot. 
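The optimizeBlock changes above keep a running DefinedInBB set, seeded from the block's live-ins and updated instruction by instruction, so the rename search only proposes registers that are genuinely free at the pairing point. A simplified standalone model with 64 registers and explicit kill/def lists instead of real operands:

#include <bitset>
#include <vector>

struct BlockLiveness {
  std::bitset<64> DefinedInBB;

  void enterBlock(const std::vector<unsigned> &LiveIns) {
    DefinedInBB.reset();
    for (unsigned R : LiveIns)
      DefinedInBB.set(R); // live-in registers are unavailable from the start
  }

  // Mirrors updateDefinedRegisters: killed registers become available again,
  // then anything written by the instruction becomes unavailable.
  void visit(const std::vector<unsigned> &Killed,
             const std::vector<unsigned> &Defined) {
    for (unsigned R : Killed)
      DefinedInBB.reset(R);
    for (unsigned R : Defined)
      DefinedInBB.set(R);
  }
};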
- unsigned TaggedBasePointerOffset; + unsigned TaggedBasePointerOffset = 0; public: AArch64FunctionInfo() = default; @@ -160,15 +167,79 @@ public: void setCalleeSaveStackHasFreeSpace(bool s) { CalleeSaveStackHasFreeSpace = s; } - bool isSplitCSR() const { return IsSplitCSR; } void setIsSplitCSR(bool s) { IsSplitCSR = s; } - void setLocalStackSize(unsigned Size) { LocalStackSize = Size; } - unsigned getLocalStackSize() const { return LocalStackSize; } + void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; } + uint64_t getLocalStackSize() const { return LocalStackSize; } + + void setCalleeSavedStackSize(unsigned Size) { + CalleeSavedStackSize = Size; + HasCalleeSavedStackSize = true; + } + + // When CalleeSavedStackSize has not been set (for example when + // some MachineIR pass is run in isolation), then recalculate + // the CalleeSavedStackSize directly from the CalleeSavedInfo. + // Note: This information can only be recalculated after PEI + // has assigned offsets to the callee save objects. + unsigned getCalleeSavedStackSize(const MachineFrameInfo &MFI) const { + bool ValidateCalleeSavedStackSize = false; + +#ifndef NDEBUG + // Make sure the calculated size derived from the CalleeSavedInfo + // equals the cached size that was calculated elsewhere (e.g. in + // determineCalleeSaves). + ValidateCalleeSavedStackSize = HasCalleeSavedStackSize; +#endif + + if (!HasCalleeSavedStackSize || ValidateCalleeSavedStackSize) { + assert(MFI.isCalleeSavedInfoValid() && "CalleeSavedInfo not calculated"); + if (MFI.getCalleeSavedInfo().empty()) + return 0; + + int64_t MinOffset = std::numeric_limits<int64_t>::max(); + int64_t MaxOffset = std::numeric_limits<int64_t>::min(); + for (const auto &Info : MFI.getCalleeSavedInfo()) { + int FrameIdx = Info.getFrameIdx(); + if (MFI.getStackID(FrameIdx) != TargetStackID::Default) + continue; + int64_t Offset = MFI.getObjectOffset(FrameIdx); + int64_t ObjSize = MFI.getObjectSize(FrameIdx); + MinOffset = std::min<int64_t>(Offset, MinOffset); + MaxOffset = std::max<int64_t>(Offset + ObjSize, MaxOffset); + } + + unsigned Size = alignTo(MaxOffset - MinOffset, 16); + assert((!HasCalleeSavedStackSize || getCalleeSavedStackSize() == Size) && + "Invalid size calculated for callee saves"); + return Size; + } + + return getCalleeSavedStackSize(); + } + + unsigned getCalleeSavedStackSize() const { + assert(HasCalleeSavedStackSize && + "CalleeSavedStackSize has not been calculated"); + return CalleeSavedStackSize; + } + + // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes' + void setSVECalleeSavedStackSize(unsigned Size) { + SVECalleeSavedStackSize = Size; + } + unsigned getSVECalleeSavedStackSize() const { + return SVECalleeSavedStackSize; + } + + void setMinMaxSVECSFrameIndex(int Min, int Max) { + MinSVECSFrameIndex = Min; + MaxSVECSFrameIndex = Max; + } - void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; } - unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; } + int getMinSVECSFrameIndex() const { return MinSVECSFrameIndex; } + int getMaxSVECSFrameIndex() const { return MaxSVECSFrameIndex; } void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; } unsigned getNumLocalDynamicTLSAccesses() const { diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp index d30ea120bae4..230fd514d022 100644 --- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ 
-62,20 +62,6 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, CombinerHelper Helper(Observer, B, KB, MDT); switch (MI.getOpcode()) { - case TargetOpcode::G_CONCAT_VECTORS: - return Helper.tryCombineConcatVectors(MI); - case TargetOpcode::G_SHUFFLE_VECTOR: - return Helper.tryCombineShuffleVector(MI); - case TargetOpcode::G_LOAD: - case TargetOpcode::G_SEXTLOAD: - case TargetOpcode::G_ZEXTLOAD: { - bool Changed = false; - Changed |= Helper.tryCombineExtendingLoads(MI); - Changed |= Helper.tryCombineIndexedLoadStore(MI); - return Changed; - } - case TargetOpcode::G_STORE: - return Helper.tryCombineIndexedLoadStore(MI); case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: switch (MI.getIntrinsicID()) { case Intrinsic::memcpy: @@ -93,9 +79,16 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer, } } - if (Generated.tryCombineAll(Observer, MI, B)) + if (Generated.tryCombineAll(Observer, MI, B, Helper)) return true; + switch (MI.getOpcode()) { + case TargetOpcode::G_CONCAT_VECTORS: + return Helper.tryCombineConcatVectors(MI); + case TargetOpcode::G_SHUFFLE_VECTOR: + return Helper.tryCombineShuffleVector(MI); + } + return false; } diff --git a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp index a594ecb71fc9..9135f1b40122 100644 --- a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp +++ b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp @@ -38,6 +38,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 8ec73aa3c040..40efac261fd9 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -222,8 +222,9 @@ unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A, return RegisterBankInfo::copyCost(A, B, Size); } -const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass( - const TargetRegisterClass &RC) const { +const RegisterBank & +AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC, + LLT) const { switch (RC.getID()) { case AArch64::FPR8RegClassID: case AArch64::FPR16RegClassID: @@ -529,7 +530,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Arithmetic ops. 
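Back in AArch64MachineFunctionInfo.h above, the new getCalleeSavedStackSize(MFI) fallback derives the size from the PEI-assigned frame objects when the cached value is missing: slots with a non-default stack ID (the scalable SVE saves) are skipped, and the spanned range is padded to 16 bytes. A standalone sketch of that computation, with an invented slot type:

#include <algorithm>
#include <cstdint>
#include <limits>
#include <vector>

struct SaveSlot {
  int64_t Offset;      // frame offset assigned by PEI
  int64_t Size;        // object size in bytes
  bool DefaultStackID; // false for scalable (SVE) save slots, which are skipped
};

static uint64_t calleeSavedStackSize(const std::vector<SaveSlot> &Slots) {
  int64_t MinOffset = std::numeric_limits<int64_t>::max();
  int64_t MaxOffset = std::numeric_limits<int64_t>::min();
  bool Any = false;
  for (const SaveSlot &S : Slots) {
    if (!S.DefaultStackID)
      continue;
    Any = true;
    MinOffset = std::min(MinOffset, S.Offset);
    MaxOffset = std::max(MaxOffset, S.Offset + S.Size);
  }
  if (!Any)
    return 0;
  uint64_t Span = static_cast<uint64_t>(MaxOffset - MinOffset);
  return (Span + 15) & ~uint64_t(15); // alignTo(Span, 16)
}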
case TargetOpcode::G_ADD: case TargetOpcode::G_SUB: - case TargetOpcode::G_GEP: + case TargetOpcode::G_PTR_ADD: case TargetOpcode::G_MUL: case TargetOpcode::G_SDIV: case TargetOpcode::G_UDIV: diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h index 016fed65eb2a..e956fca1aa10 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h +++ b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h @@ -132,8 +132,8 @@ public: unsigned copyCost(const RegisterBank &A, const RegisterBank &B, unsigned Size) const override; - const RegisterBank & - getRegBankFromRegClass(const TargetRegisterClass &RC) const override; + const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC, + LLT) const override; InstructionMappings getInstrAlternativeMappings(const MachineInstr &MI) const override; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index de176088595d..14f839cd4f81 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -43,6 +43,8 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT) const MCPhysReg * AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); + if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check) + return CSR_Win_AArch64_CFGuard_Check_SaveList; if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows()) return CSR_Win_AArch64_AAPCS_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::GHC) @@ -53,6 +55,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_AArch64_AllRegs_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall) return CSR_AArch64_AAVPCS_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall) + return CSR_AArch64_SVE_AAPCS_SaveList; if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS) return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ? CSR_AArch64_CXX_TLS_Darwin_PE_SaveList : @@ -123,7 +127,10 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (CC == CallingConv::AArch64_VectorCall) return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask; if (CC == CallingConv::AArch64_SVE_VectorCall) - return CSR_AArch64_SVE_AAPCS_RegMask; + return SCS ? 
CSR_AArch64_SVE_AAPCS_SCS_RegMask + : CSR_AArch64_SVE_AAPCS_RegMask; + if (CC == CallingConv::CFGuard_Check) + return CSR_Win_AArch64_CFGuard_Check_RegMask; if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering() ->supportSwiftError() && MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError)) @@ -390,7 +397,6 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg, int64_t Offset) const { - assert(Offset <= INT_MAX && "Offset too big to fit in int."); assert(MI && "Unable to get the legal offset for nil instruction."); StackOffset SaveOffset(Offset, MVT::i8); return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td index 61fc0795c242..f52feab03953 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -481,7 +481,7 @@ def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> { // Vector operand versions of the FP registers. Alternate name printing and -// assmebler matching. +// assembler matching. def VectorReg64AsmOperand : AsmOperandClass { let Name = "VectorReg64"; let PredicateMethod = "isNeonVectorReg"; @@ -858,35 +858,19 @@ def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>; //****************************************************************************** -// SVE vector register class -def ZPR : RegisterClass<"AArch64", - [nxv16i8, nxv8i16, nxv4i32, nxv2i64, - nxv2f16, nxv4f16, nxv8f16, - nxv1f32, nxv2f32, nxv4f32, - nxv1f64, nxv2f64], - 128, (sequence "Z%u", 0, 31)> { +// SVE vector register classes +class ZPRClass<int lastreg> : RegisterClass<"AArch64", + [nxv16i8, nxv8i16, nxv4i32, nxv2i64, + nxv2f16, nxv4f16, nxv8f16, + nxv2f32, nxv4f32, + nxv2f64], + 128, (sequence "Z%u", 0, lastreg)> { let Size = 128; } -// SVE restricted 4 bit scalable vector register class -def ZPR_4b : RegisterClass<"AArch64", - [nxv16i8, nxv8i16, nxv4i32, nxv2i64, - nxv2f16, nxv4f16, nxv8f16, - nxv1f32, nxv2f32, nxv4f32, - nxv1f64, nxv2f64], - 128, (sequence "Z%u", 0, 15)> { - let Size = 128; -} - -// SVE restricted 3 bit scalable vector register class -def ZPR_3b : RegisterClass<"AArch64", - [nxv16i8, nxv8i16, nxv4i32, nxv2i64, - nxv2f16, nxv4f16, nxv8f16, - nxv1f32, nxv2f32, nxv4f32, - nxv1f64, nxv2f64], - 128, (sequence "Z%u", 0, 7)> { - let Size = 128; -} +def ZPR : ZPRClass<31>; +def ZPR_4b : ZPRClass<15>; // Restricted 4 bit SVE vector register class. +def ZPR_3b : ZPRClass<7>; // Restricted 3 bit SVE vector register class. 
class ZPRAsmOperand<string name, int Width, string RegClassSuffix = ""> : AsmOperandClass { diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index b573eac76754..c849d7af9a40 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -10,6 +10,72 @@ // //===----------------------------------------------------------------------===// +def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [ + SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>, + SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1> +]>; + +def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; +def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>; + +def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_sxtw : SDNode<"AArch64ISD::GLD1_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + +def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1s_gather_sxtw : SDNode<"AArch64ISD::GLD1S_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1s_gather_uxtw_scaled : 
SDNode<"AArch64ISD::GLD1S_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; +def AArch64ld1s_gather_imm : SDNode<"AArch64ISD::GLD1S_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>; + +def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>; + +def AArch64smaxv_pred : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>; +def AArch64umaxv_pred : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>; +def AArch64sminv_pred : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>; +def AArch64uminv_pred : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>; +def AArch64orv_pred : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>; +def AArch64eorv_pred : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>; +def AArch64andv_pred : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>; +def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>; +def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>; + +def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>; +def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>; +def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>; + +def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>; + +def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>; +def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>; + let Predicates = [HasSVE] in { def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">; @@ -18,69 +84,69 @@ let Predicates = [HasSVE] in { def SETFFR : sve_int_setffr<"setffr">; def WRFFR : sve_int_wrffr<"wrffr">; - defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add">; - defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub">; - defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd">; - defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd">; - defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub">; - defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub">; - - defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and">; - defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">; - defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">; - defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">; - - defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add">; - defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub">; - defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr">; - - defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr">; - defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor">; - defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and">; - defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic">; - - defm ADD_ZI : sve_int_arith_imm0<0b000, "add">; - defm SUB_ZI : sve_int_arith_imm0<0b001, "sub">; - defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr">; - defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd">; - defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd">; - defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub">; - defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub">; - - defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad">; - defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb">; - defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla">; - defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls">; + defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>; + defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>; + 
defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>; + defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat>; + defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat>; + defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat>; + + defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>; + defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>; + defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>; + defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>; + + defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", int_aarch64_sve_add>; + defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", int_aarch64_sve_sub>; + defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", int_aarch64_sve_subr>; + + defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>; + defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>; + defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>; + defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>; + + defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>; + defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>; + defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>; + defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>; + defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>; + defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>; + defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>; + + defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>; + defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>; + defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla>; + defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls>; // SVE predicated integer reductions. 
- defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv">; - defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv">; - defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv">; - defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv">; - defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv">; - defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv">; - defm ORV_VPZ : sve_int_reduce_2<0b000, "orv">; - defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv">; - defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv">; - - defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn">; - defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon">; - defm AND_ZI : sve_int_log_imm<0b10, "and", "bic">; - - defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", simm8>; - defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", simm8>; - defm UMAX_ZI : sve_int_arith_imm1<0b01, "umax", imm0_255>; - defm UMIN_ZI : sve_int_arith_imm1<0b11, "umin", imm0_255>; - - defm MUL_ZI : sve_int_arith_imm2<"mul">; - defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul">; - defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh">; - defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh">; - - defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv">; - defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv">; - defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">; - defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">; + defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>; + defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>; + defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_pred>; + defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_pred>; + defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_pred>; + defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_pred>; + defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_pred>; + defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_pred>; + defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_pred>; + + defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>; + defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>; + defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>; + + defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", smax>; + defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", smin>; + defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", umax>; + defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", umin>; + + defm MUL_ZI : sve_int_arith_imm2<"mul", mul>; + defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>; + defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>; + defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>; + + defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", int_aarch64_sve_sdiv>; + defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", int_aarch64_sve_udiv>; + defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", int_aarch64_sve_sdivr>; + defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", int_aarch64_sve_udivr>; defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>; defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>; @@ -88,32 +154,32 @@ let Predicates = [HasSVE] in { defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>; defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>; - defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">; - defm UXTB_ZPmZ : 
sve_int_un_pred_arit_0_h<0b001, "uxtb">; - defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth">; - defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">; - defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">; - defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">; - defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>; - defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>; - - defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", null_frag>; - defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", null_frag>; + defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", int_aarch64_sve_sxtb>; + defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", int_aarch64_sve_uxtb>; + defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", int_aarch64_sve_sxth>; + defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", int_aarch64_sve_uxth>; + defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", int_aarch64_sve_sxtw>; + defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", int_aarch64_sve_uxtw>; + defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>; + defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>; + + defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>; + defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>; defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>; - defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", null_frag>; - defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", null_frag>; - defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">; - defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">; + defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>; + defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>; + defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>; + defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>; - defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax">; - defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax">; - defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin">; - defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin">; - defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd">; - defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd">; + defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", int_aarch64_sve_smax>; + defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", int_aarch64_sve_umax>; + defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", int_aarch64_sve_smin>; + defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", int_aarch64_sve_umin>; + defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>; + defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>; - defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe">; - defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte">; + defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>; + defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>; defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>; defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>; @@ -124,57 +190,57 @@ let Predicates = [HasSVE] in { defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>; defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>; - defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd">; - defm 
FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub">; - defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul">; - defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr">; - defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm">; - defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm">; - defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax">; - defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin">; - defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd">; - defm FSCALE_ZPmZ : sve_fp_2op_p_zds<0b1001, "fscale">; - defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx">; - defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">; - defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">; - - defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; - defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", null_frag>; - defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", null_frag>; - defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>; - defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", null_frag>; - defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>; - - defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">; - - defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd">; - defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla">; - - defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla">; - defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls">; - defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla">; - defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls">; - - defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad">; - defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb">; - defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad">; - defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb">; - - defm FTMAD_ZZI : sve_fp_ftmad<"ftmad">; - - defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla">; - defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls">; - - defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla">; - defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul">; + defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", int_aarch64_sve_fadd>; + defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", int_aarch64_sve_fsub>; + defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", int_aarch64_sve_fmul>; + defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", int_aarch64_sve_fsubr>; + defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", int_aarch64_sve_fmaxnm>; + defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", int_aarch64_sve_fminnm>; + defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", int_aarch64_sve_fmax>; + defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", int_aarch64_sve_fmin>; + defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", int_aarch64_sve_fabd>; + defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>; + defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", int_aarch64_sve_fmulx>; + defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", int_aarch64_sve_fdivr>; + defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", int_aarch64_sve_fdiv>; + + defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>; + defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>; + defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul>; + defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>; + defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>; + defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>; + + defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>; + + defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>; + defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>; + + 
defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>; + defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", int_aarch64_sve_fmls>; + defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", int_aarch64_sve_fnmla>; + defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", int_aarch64_sve_fnmls>; + + defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad>; + defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb>; + defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>; + defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>; + + defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>; + + defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>; + defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>; + + defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>; + defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>; // SVE floating point reductions. - defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda">; - defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv">; - defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv">; - defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv">; - defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv">; - defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv">; + defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", int_aarch64_sve_fadda>; + defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", int_aarch64_sve_faddv>; + defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", int_aarch64_sve_fmaxnmv>; + defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", int_aarch64_sve_fminnmv>; + defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", int_aarch64_sve_fmaxv>; + defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", int_aarch64_sve_fminv>; // Splat immediate (unpredicated) defm DUP_ZI : sve_int_dup_imm<"dup">; @@ -195,21 +261,21 @@ let Predicates = [HasSVE] in { defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">; // Select elements from either vector (predicated) - defm SEL_ZPZZ : sve_int_sel_vvv<"sel">; + defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>; - defm SPLICE_ZPZ : sve_int_perm_splice<"splice">; - defm COMPACT_ZPZ : sve_int_perm_compact<"compact">; - defm INSR_ZR : sve_int_perm_insrs<"insr">; - defm INSR_ZV : sve_int_perm_insrv<"insr">; - def EXT_ZZI : sve_int_perm_extract_i<"ext">; + defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>; + defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>; + defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>; + defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>; + defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>; - defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit">; - defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb">; - defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh">; - defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw">; + defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>; + defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>; + defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>; + defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>; - defm REV_PP : sve_int_perm_reverse_p<"rev">; - defm REV_ZZ : sve_int_perm_reverse_z<"rev">; + defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>; + defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>; defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>; defm SUNPKHI_ZZ : 
sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>; @@ -222,9 +288,7 @@ let Predicates = [HasSVE] in { defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">; defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">; def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>; - def FEXPA_ZZ_H : sve_int_bin_cons_misc_0_c<0b01000000, "fexpa", ZPR16>; - def FEXPA_ZZ_S : sve_int_bin_cons_misc_0_c<0b10000000, "fexpa", ZPR32>; - def FEXPA_ZZ_D : sve_int_bin_cons_misc_0_c<0b11000000, "fexpa", ZPR64>; + defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>; def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">; def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">; @@ -243,36 +307,36 @@ let Predicates = [HasSVE] in { def PTEST_PP : sve_int_ptest<0b010000, "ptest">; def PFALSE : sve_int_pfalse<0b000000, "pfalse">; - defm PFIRST : sve_int_pfirst<0b00000, "pfirst">; - defm PNEXT : sve_int_pnext<0b00110, "pnext">; - - def AND_PPzPP : sve_int_pred_log<0b0000, "and">; - def BIC_PPzPP : sve_int_pred_log<0b0001, "bic">; - def EOR_PPzPP : sve_int_pred_log<0b0010, "eor">; - def SEL_PPPP : sve_int_pred_log<0b0011, "sel">; - def ANDS_PPzPP : sve_int_pred_log<0b0100, "ands">; - def BICS_PPzPP : sve_int_pred_log<0b0101, "bics">; - def EORS_PPzPP : sve_int_pred_log<0b0110, "eors">; - def ORR_PPzPP : sve_int_pred_log<0b1000, "orr">; - def ORN_PPzPP : sve_int_pred_log<0b1001, "orn">; - def NOR_PPzPP : sve_int_pred_log<0b1010, "nor">; - def NAND_PPzPP : sve_int_pred_log<0b1011, "nand">; - def ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs">; - def ORNS_PPzPP : sve_int_pred_log<0b1101, "orns">; - def NORS_PPzPP : sve_int_pred_log<0b1110, "nors">; - def NANDS_PPzPP : sve_int_pred_log<0b1111, "nands">; - - defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta">; - defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb">; - defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta">; - defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb">; - defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta">; - defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb">; - - defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta">; - defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb">; - defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta">; - defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb">; + defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>; + defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>; + + defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z>; + defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>; + defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z>; + defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>; + defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>; + defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>; + defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>; + defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>; + defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>; + defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>; + defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>; + defm ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs", null_frag>; + defm ORNS_PPzPP : sve_int_pred_log<0b1101, "orns", null_frag>; + defm NORS_PPzPP : sve_int_pred_log<0b1110, "nors", null_frag>; + defm NANDS_PPzPP : sve_int_pred_log<0b1111, "nands", null_frag>; + + defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta", 
AArch64clasta_n>; + defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb", AArch64clastb_n>; + defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta", AArch64clasta_n>; + defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb", AArch64clastb_n>; + defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>; + defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>; + + defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>; + defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>; + defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>; + defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>; // continuous load with reg+immediate defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>; @@ -404,115 +468,115 @@ let Predicates = [HasSVE] in { // Gathers using unscaled 32-bit offsets, e.g. // ld1h z0.s, p0/z, [x0, z0.s, uxtw] - defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>; + defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; // Gathers using scaled 32-bit offsets, e.g. 
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1] - defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - - // Gathers using scaled 32-bit pointers with offset, e.g. + defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + + // Gathers using 32-bit pointers with scaled offset, e.g. // ld1h z0.s, p0/z, [z0.s, #16] - defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31>; - defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31>; - defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31>; - defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31>; - defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2>; - defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2>; - defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2>; - defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2>; - defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4>; - defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4>; - - // Gathers using scaled 64-bit pointers with offset, e.g. + defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>; + defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv4i8>; + defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>; + defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv4i8>; + defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>; + defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv4i16>; + defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>; + defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv4i16>; + defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>; + defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv4i32>; + + // Gathers using 64-bit pointers with scaled offset, e.g. 
// ld1h z0.d, p0/z, [z0.d, #16] - defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31>; - defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31>; - defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31>; - defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31>; - defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2>; - defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2>; - defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2>; - defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2>; - defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4>; - defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4>; - defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4>; - defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4>; - defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8>; - defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8>; + defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, null_frag, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, null_frag, nxv2i64>; // Gathers using unscaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d] - defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">; - defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">; - defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">; - defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">; - defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">; - defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">; - defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">; - defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">; - defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">; - defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">; - defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">; - defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">; - defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">; - defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">; + defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>; // Gathers using scaled 64-bit offsets, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1] - defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>; - defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>; - defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>; - defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>; - defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>; - defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>; + defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>; // Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw] - defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>; + defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; // Gathers using scaled 32-bit offsets unpacked in 64-bits elements, e.g. 
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1] - defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh",ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw",ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; - defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; + defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; // Non-temporal contiguous loads (register + immediate) defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>; @@ -550,51 +614,55 @@ let Predicates = [HasSVE] in { defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>; defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>; - // Scatters using unscaled 32-bit offsets, e.g. - // st1h z0.s, p0, [x0, z0.s, uxtw] - // and unpacked: + // Scatters using unpacked, unscaled 32-bit offsets, e.g. 
// st1h z0.d, p0, [x0, z0.d, uxtw] - defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>; - defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>; - defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>; - defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>; - - // Scatters using scaled 32-bit offsets, e.g. + defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8,nxv2i32>; + defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>; + + // Scatters using packed, unscaled 32-bit offsets, e.g. + // st1h z0.s, p0, [x0, z0.s, uxtw] + defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>; + defm SST1W : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>; + + // Scatters using packed, scaled 32-bit offsets, e.g. // st1h z0.s, p0, [x0, z0.s, uxtw #1] - // and unpacked: + defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>; + defm SST1W : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>; + + // Scatters using unpacked, scaled 32-bit offsets, e.g. 
// st1h z0.d, p0, [x0, z0.d, uxtw #1] - defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>; - defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>; - defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>; - defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>; - defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>; + defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>; + defm SST1D : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>; // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.s, p0, [z0.s, #16] + defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", timm0_31, AArch64st1_scatter_imm, nxv4i8>; + defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv4i16>; + defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv4i32>; + + // Scatters using 32/64-bit pointers with offset, e.g. // st1h z0.d, p0, [z0.d, #16] - defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>; - defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>; - defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>; - defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>; - defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>; - defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>; - defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>; + defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", timm0_31, AArch64st1_scatter_imm, nxv2i8>; + defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv2i16>; + defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv2i32>; + defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", tuimm5s8, AArch64st1_scatter_imm, nxv2i64>; // Scatters using unscaled 64-bit offsets, e.g. // st1h z0.d, p0, [x0, z0.d] - defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">; - defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">; - defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">; - defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">; + defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>; + defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>; + defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>; + defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>; // Scatters using scaled 64-bit offsets, e.g. 
// st1h z0.d, p0, [x0, z0.d, lsl #1] - defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>; - defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>; - defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>; + defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>; + defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>; + defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>; // ST(2|3|4) structured stores (register + immediate) defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>; @@ -693,58 +761,58 @@ let Predicates = [HasSVE] in { defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">; defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">; - defm TBL_ZZZ : sve_int_perm_tbl<"tbl">; - - defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1">; - defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2">; - defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1">; - defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2">; - defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1">; - defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2">; - - defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1">; - defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2">; - defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1">; - defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2">; - defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1">; - defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2">; - - defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs">; - defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi">; - defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge">; - defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt">; - defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq">; - defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne">; - - defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq">; - defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne">; - defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge">; - defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt">; - defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt">; - defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple">; - defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs">; - defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi">; - defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo">; - defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls">; - - defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge">; - defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt">; - defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt">; - defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple">; - defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq">; - defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne">; - defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs">; - defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi">; - defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo">; - defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls">; - - defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge">; - defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt">; - defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq">; - defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne">; - defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo">; - defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge">; - defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt">; + defm TBL_ZZZ : 
sve_int_perm_tbl<"tbl", AArch64tbl>; + + defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>; + defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>; + defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>; + defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2", AArch64uzp2>; + defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>; + defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>; + + defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>; + defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>; + defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>; + defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2", AArch64uzp2>; + defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>; + defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>; + + defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", int_aarch64_sve_cmphs, SETUGE>; + defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", int_aarch64_sve_cmphi, SETUGT>; + defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", int_aarch64_sve_cmpge, SETGE>; + defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", int_aarch64_sve_cmpgt, SETGT>; + defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", int_aarch64_sve_cmpeq, SETEQ>; + defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", int_aarch64_sve_cmpne, SETNE>; + + defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>; + defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>; + defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge", int_aarch64_sve_cmpge_wide>; + defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt", int_aarch64_sve_cmpgt_wide>; + defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt", int_aarch64_sve_cmplt_wide>; + defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple", int_aarch64_sve_cmple_wide>; + defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs", int_aarch64_sve_cmphs_wide>; + defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi", int_aarch64_sve_cmphi_wide>; + defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>; + defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>; + + defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, int_aarch64_sve_cmpge>; + defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, int_aarch64_sve_cmpgt>; + defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, null_frag, int_aarch64_sve_cmpgt>; + defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, null_frag, int_aarch64_sve_cmpge>; + defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, int_aarch64_sve_cmpeq>; + defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, int_aarch64_sve_cmpne>; + defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, int_aarch64_sve_cmphs>; + defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, int_aarch64_sve_cmphi>; + defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, null_frag, int_aarch64_sve_cmphi>; + defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, null_frag, int_aarch64_sve_cmphs>; + + defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge", int_aarch64_sve_fcmpge>; + defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt", int_aarch64_sve_fcmpgt>; + defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq", int_aarch64_sve_fcmpeq>; + defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne", int_aarch64_sve_fcmpne>; + defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, 
"fcmuo", int_aarch64_sve_fcmpuo>; + defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>; + defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>; defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">; defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">; @@ -753,15 +821,15 @@ let Predicates = [HasSVE] in { defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">; defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">; - defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt">; - defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele">; - defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo">; - defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels">; + defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>; + defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>; + defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo", int_aarch64_sve_whilelo>; + defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels", int_aarch64_sve_whilels>; - defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt">; - defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele">; - defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo">; - defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels">; + defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt", int_aarch64_sve_whilelt>; + defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele", int_aarch64_sve_whilele>; + defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo", int_aarch64_sve_whilelo>; + defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels", int_aarch64_sve_whilels>; def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>; def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>; @@ -772,11 +840,11 @@ let Predicates = [HasSVE] in { def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">; def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">; - defm CNTB_XPiI : sve_int_count<0b000, "cntb">; - defm CNTH_XPiI : sve_int_count<0b010, "cnth">; - defm CNTW_XPiI : sve_int_count<0b100, "cntw">; - defm CNTD_XPiI : sve_int_count<0b110, "cntd">; - defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp">; + defm CNTB_XPiI : sve_int_count<0b000, "cntb", int_aarch64_sve_cntb>; + defm CNTH_XPiI : sve_int_count<0b010, "cnth", int_aarch64_sve_cnth>; + defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>; + defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>; + defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>; defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">; defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">; @@ -787,76 +855,76 @@ let Predicates = [HasSVE] in { defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">; defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">; - defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb">; - defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb">; - defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb">; - defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb">; - defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb">; - defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb">; - defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb">; - defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb">; - - defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch">; - defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch">; - defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech">; - defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, 
"uqdech">; - defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch">; - defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch">; - defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech">; - defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech">; - - defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw">; - defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw">; - defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw">; - defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw">; - defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw">; - defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw">; - defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw">; - defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw">; - - defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd">; - defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd">; - defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd">; - defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd">; - defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd">; - defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd">; - defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd">; - defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd">; - - defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16>; - defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16>; - defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16>; - defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16>; + defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>; + defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>; + defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>; + defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb", int_aarch64_sve_uqdecb_n32>; + defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb", int_aarch64_sve_sqincb_n64>; + defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb", int_aarch64_sve_uqincb_n64>; + defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb", int_aarch64_sve_sqdecb_n64>; + defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb", int_aarch64_sve_uqdecb_n64>; + + defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch", int_aarch64_sve_sqinch_n32>; + defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch", int_aarch64_sve_uqinch_n32>; + defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech", int_aarch64_sve_sqdech_n32>; + defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech", int_aarch64_sve_uqdech_n32>; + defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch", int_aarch64_sve_sqinch_n64>; + defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch", int_aarch64_sve_uqinch_n64>; + defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech", int_aarch64_sve_sqdech_n64>; + defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech", int_aarch64_sve_uqdech_n64>; + + defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw", int_aarch64_sve_sqincw_n32>; + defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw", int_aarch64_sve_uqincw_n32>; + defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw", int_aarch64_sve_sqdecw_n32>; + defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw", 
int_aarch64_sve_uqdecw_n32>; + defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw", int_aarch64_sve_sqincw_n64>; + defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw", int_aarch64_sve_uqincw_n64>; + defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw", int_aarch64_sve_sqdecw_n64>; + defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw", int_aarch64_sve_uqdecw_n64>; + + defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd", int_aarch64_sve_sqincd_n32>; + defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd", int_aarch64_sve_uqincd_n32>; + defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd", int_aarch64_sve_sqdecd_n32>; + defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd", int_aarch64_sve_uqdecd_n32>; + defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd", int_aarch64_sve_sqincd_n64>; + defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd", int_aarch64_sve_uqincd_n64>; + defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd", int_aarch64_sve_sqdecd_n64>; + defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd", int_aarch64_sve_uqdecd_n64>; + + defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16, int_aarch64_sve_sqinch, nxv8i16>; + defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16, int_aarch64_sve_uqinch, nxv8i16>; + defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16, int_aarch64_sve_sqdech, nxv8i16>; + defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16, int_aarch64_sve_uqdech, nxv8i16>; defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>; defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>; - defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32>; - defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32>; - defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32>; - defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32>; + defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32, int_aarch64_sve_sqincw, nxv4i32>; + defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32, int_aarch64_sve_uqincw, nxv4i32>; + defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32, int_aarch64_sve_sqdecw, nxv4i32>; + defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32, int_aarch64_sve_uqdecw, nxv4i32>; defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>; defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>; - defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64>; - defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64>; - defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64>; - defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64>; + defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64, int_aarch64_sve_sqincd, nxv2i64>; + defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64, int_aarch64_sve_uqincd, nxv2i64>; + defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64, int_aarch64_sve_sqdecd, nxv2i64>; + defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64, int_aarch64_sve_uqdecd, nxv2i64>; defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>; defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>; - defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp">; - defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp">; - defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp">; - defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp">; - defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp">; - defm SQDECP_XP 
: sve_int_count_r_x64<0b01010, "sqdecp">; - defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp">; - defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp">; + defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp", int_aarch64_sve_sqincp_n32>; + defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp", int_aarch64_sve_sqincp_n64>; + defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp", int_aarch64_sve_uqincp_n32>; + defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp", int_aarch64_sve_uqincp_n64>; + defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp", int_aarch64_sve_sqdecp_n32>; + defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp", int_aarch64_sve_sqdecp_n64>; + defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp", int_aarch64_sve_uqdecp_n32>; + defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp", int_aarch64_sve_uqdecp_n64>; defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">; defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">; - defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp">; - defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp">; - defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp">; - defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp">; + defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp", int_aarch64_sve_sqincp>; + defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp", int_aarch64_sve_uqincp>; + defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp", int_aarch64_sve_sqdecp>; + defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp", int_aarch64_sve_uqdecp>; defm INCP_ZP : sve_int_count_v<0b10000, "incp">; defm DECP_ZP : sve_int_count_v<0b10100, "decp">; @@ -878,63 +946,63 @@ let Predicates = [HasSVE] in { defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">; defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">; defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">; - defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd">; - - defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">; - defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">; - defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl">; - defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr">; - defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr">; - defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr">; - - defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr">; - defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr">; - defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl">; - - def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, ElementSizeS>; - def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, ElementSizeS>; - def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, ElementSizeH>; - def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, ElementSizeS>; - def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, ElementSizeS>; - def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, ElementSizeH>; - def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, ElementSizeH>; - def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, ElementSizeS>; - def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, ElementSizeH>; - def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, ElementSizeS>; - def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, ElementSizeD>; - def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, ElementSizeD>; - def FCVT_ZPmZ_DtoS : 
sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, ElementSizeD>; - def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, ElementSizeD>; - def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, ElementSizeD>; - def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, ElementSizeD>; - def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, ElementSizeS>; - def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, ElementSizeD>; - def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, ElementSizeS>; - def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, ElementSizeD>; - def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, ElementSizeD>; - def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, ElementSizeD>; - def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, ElementSizeD>; - def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, ElementSizeD>; - def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, ElementSizeD>; - def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, ElementSizeD>; - def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, ElementSizeD>; - def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, ElementSizeS>; - def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, ElementSizeD>; - def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, ElementSizeS>; - def FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, ElementSizeD>; - def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, ElementSizeD>; - def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, ElementSizeD>; - def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, ElementSizeD>; - - defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn">; - defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp">; - defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm">; - defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz">; - defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta">; - defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx">; - defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti">; - defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx">; - defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt">; + defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", int_aarch64_sve_asrd>; + + defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", int_aarch64_sve_asr>; + defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", int_aarch64_sve_lsr>; + defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", int_aarch64_sve_lsl>; + defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", null_frag>; + defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", null_frag>; + defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", null_frag>; + + defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>; + defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>; + defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>; + + defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv16i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, 
int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv16i1, nxv8f16, ElementSizeS>; + defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>; + defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>; + defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>; + defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>; + defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv16i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv16i1, nxv8f16, ElementSizeD>; + defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv16i1, nxv2f64, ElementSizeD>; + defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv16i1, nxv4f32, ElementSizeD>; + defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>; + defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>; + defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>; + defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>; + defm FCVTZS_ZPmZ_StoD : 
sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>; + defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>; + defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>; + defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>; + defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>; + defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>; + defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>; + + defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>; + defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>; + defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", int_aarch64_sve_frintm>; + defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", int_aarch64_sve_frintz>; + defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", int_aarch64_sve_frinta>; + defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", int_aarch64_sve_frintx>; + defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", int_aarch64_sve_frinti>; + defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>; + defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>; // InstAliases def : InstAlias<"mov $Zd, $Zn", @@ -1021,6 +1089,22 @@ let Predicates = [HasSVE] in { def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn", (FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>; + def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)), + (PTEST_PP PPR:$pg, PPR:$src)>; + def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)), + (PTEST_PP PPR:$pg, PPR:$src)>; + def : Pat<(AArch64ptest (nxv4i1 PPR:$pg), (nxv4i1 PPR:$src)), + (PTEST_PP PPR:$pg, PPR:$src)>; + def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)), + (PTEST_PP PPR:$pg, PPR:$src)>; + + def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (SXTH_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>; + def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>; + def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>; def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>; def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>; @@ -1070,6 +1154,83 @@ let Predicates = [HasSVE] in { def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>; def : Pat<(nxv2f64 (bitconvert 
(nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>; + // Add more complex addressing modes here as required + multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load, + Instruction RegImmInst> { + + def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))), + (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>; + } + + // 2-element contiguous loads + defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D_IMM>; + defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D_IMM>; + defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D_IMM>; + defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D_IMM>; + defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D_IMM>; + defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D_IMM>; + defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D_IMM>; + defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D_IMM>; + defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D_IMM>; + defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D_IMM>; + + // 4-element contiguous loads + defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S_IMM>; + defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S_IMM>; + defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S_IMM>; + defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S_IMM>; + defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W_IMM>; + defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S_IMM>; + defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W_IMM>; + + // 8-element contiguous loads + defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H_IMM>; + defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H_IMM>; + defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H_IMM>; + defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H_IMM>; + + // 16-element contiguous loads + defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B_IMM>; + + multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store, + Instruction RegImmInst> { + def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)), + (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>; + } + + // 2-element contiguous stores + defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D_IMM>; + defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D_IMM>; + defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D_IMM>; + defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D_IMM>; + defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D_IMM>; + defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D_IMM>; + defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D_IMM>; + + // 4-element contiguous stores + defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S_IMM>; + defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S_IMM>; + defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W_IMM>; + defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S_IMM>; + defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W_IMM>; + + // 8-element contiguous stores + defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H_IMM>; + defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H_IMM>; + defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H_IMM>; + + // 16-element contiguous stores + defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, 
ST1B_IMM>; + + defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRI>; + defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRI>; + defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRI>; + defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRI>; + + defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRI>; + defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRI>; + defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRI>; + defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRI>; } let Predicates = [HasSVE2] in { @@ -1286,46 +1447,46 @@ let Predicates = [HasSVE2] in { defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">; // SVE2 bitwise shift right narrow (bottom) - defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">; - defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">; - defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">; - defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">; - defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">; - defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">; - defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">; - defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">; + defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>; + defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb", int_aarch64_sve_sqrshrunb>; + defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb", int_aarch64_sve_shrnb>; + defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb", int_aarch64_sve_rshrnb>; + defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb", int_aarch64_sve_sqshrnb>; + defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb", int_aarch64_sve_sqrshrnb>; + defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb", int_aarch64_sve_uqshrnb>; + defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb", int_aarch64_sve_uqrshrnb>; // SVE2 bitwise shift right narrow (top) - defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">; - defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">; - defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">; - defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">; - defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">; - defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">; - defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">; - defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">; + defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt", int_aarch64_sve_sqshrunt>; + defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt", int_aarch64_sve_sqrshrunt>; + defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt", int_aarch64_sve_shrnt>; + defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt", int_aarch64_sve_rshrnt>; + defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt", int_aarch64_sve_sqshrnt>; + defm SQRSHRNT_ZZI : 
sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt", int_aarch64_sve_sqrshrnt>; + defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt", int_aarch64_sve_uqshrnt>; + defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt", int_aarch64_sve_uqrshrnt>; // SVE2 integer add/subtract narrow high part (bottom) - defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">; - defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">; - defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">; - defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">; + defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb", int_aarch64_sve_addhnb>; + defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb", int_aarch64_sve_raddhnb>; + defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb", int_aarch64_sve_subhnb>; + defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb", int_aarch64_sve_rsubhnb>; // SVE2 integer add/subtract narrow high part (top) - defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt">; - defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">; - defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt">; - defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">; + defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt", int_aarch64_sve_addhnt>; + defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt", int_aarch64_sve_raddhnt>; + defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt", int_aarch64_sve_subhnt>; + defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt", int_aarch64_sve_rsubhnt>; // SVE2 saturating extract narrow (bottom) - defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">; - defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">; - defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">; + defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb", int_aarch64_sve_sqxtnb>; + defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb", int_aarch64_sve_uqxtnb>; + defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb", int_aarch64_sve_sqxtunb>; // SVE2 saturating extract narrow (top) - defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">; - defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">; - defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">; + defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>; + defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>; + defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>; // SVE2 character match defm MATCH_PPzZZ : sve2_char_match<0b0, "match">; @@ -1353,32 +1514,32 @@ let Predicates = [HasSVE2] in { defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">; // SVE2 floating-point base 2 logarithm as integer - defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">; + defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>; // SVE2 floating-point convert precision - defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">; - defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">; - defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">; - def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>; + defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", 
"int_aarch64_sve_fcvtxnt">; + defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx">; + defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt", "int_aarch64_sve_fcvtnt">; + defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">; // SVE2 floating-point pairwise operations - defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">; - defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp">; - defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp">; - defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp">; - defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp">; + defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp", int_aarch64_sve_faddp>; + defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp", int_aarch64_sve_fmaxnmp>; + defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp", int_aarch64_sve_fminnmp>; + defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp", int_aarch64_sve_fmaxp>; + defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp", int_aarch64_sve_fminp>; // SVE2 floating-point multiply-add long (indexed) - def FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb">; - def FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt">; - def FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb">; - def FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt">; + defm FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb", int_aarch64_sve_fmlalb_lane>; + defm FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt", int_aarch64_sve_fmlalt_lane>; + defm FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb", int_aarch64_sve_fmlslb_lane>; + defm FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt", int_aarch64_sve_fmlslt_lane>; // SVE2 floating-point multiply-add long - def FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb">; - def FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt">; - def FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb">; - def FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt">; + defm FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb", int_aarch64_sve_fmlalb>; + defm FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt", int_aarch64_sve_fmlalt>; + defm FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb", int_aarch64_sve_fmlslb>; + defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>; // SVE2 bitwise ternary operations defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">; @@ -1427,15 +1588,15 @@ let Predicates = [HasSVE2] in { defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">; // SVE2 integer compare scalar count and limit - defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">; - defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">; - defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">; - defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi">; - - defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege">; - defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt">; - defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">; - defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">; + defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>; + defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt", int_aarch64_sve_whilegt>; + defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs", int_aarch64_sve_whilehs>; + defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi", int_aarch64_sve_whilehi>; + + defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege", int_aarch64_sve_whilege>; + defm 
WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt", int_aarch64_sve_whilegt>; + defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs", int_aarch64_sve_whilehs>; + defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>; // SVE2 pointer conflict compare defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">; diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td deleted file mode 100644 index f1e76e2c20d3..000000000000 --- a/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td +++ /dev/null @@ -1,850 +0,0 @@ -//=- AArch64SchedExynosM1.td - Samsung Exynos M1 Sched Defs --*- tablegen -*-=// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the machine model for the Samsung Exynos M1 to support -// instruction scheduling and other instruction cost heuristics. -// -//===----------------------------------------------------------------------===// - -//===----------------------------------------------------------------------===// -// The Exynos-M1 is a traditional superscalar microprocessor with a -// 4-wide in-order stage for decode and dispatch and a wider issue stage. -// The execution units and loads and stores are out-of-order. - -def ExynosM1Model : SchedMachineModel { - let IssueWidth = 4; // Up to 4 uops per cycle. - let MicroOpBufferSize = 96; // ROB size. - let LoopMicroOpBufferSize = 24; // Based on the instruction queue size. - let LoadLatency = 4; // Optimistic load cases. - let MispredictPenalty = 14; // Minimum branch misprediction penalty. - let CompleteModel = 1; // Use the default model otherwise. - - list<Predicate> UnsupportedFeatures = SVEUnsupported.F; -} - -//===----------------------------------------------------------------------===// -// Define each kind of processor resource and number available on the Exynos-M1, -// which has 9 pipelines, each with its own queue with out-of-order dispatch. - -let SchedModel = ExynosM1Model in { - -def M1UnitA : ProcResource<2>; // Simple integer -def M1UnitC : ProcResource<1>; // Simple and complex integer -def M1UnitD : ProcResource<1>; // Integer division (inside C, serialized) -def M1UnitB : ProcResource<2>; // Branch -def M1UnitL : ProcResource<1>; // Load -def M1UnitS : ProcResource<1>; // Store -def M1PipeF0 : ProcResource<1>; // FP #0 -let Super = M1PipeF0 in { - def M1UnitFMAC : ProcResource<1>; // FP multiplication - def M1UnitNAL0 : ProcResource<1>; // Simple vector - def M1UnitNMISC : ProcResource<1>; // Miscellanea - def M1UnitFCVT : ProcResource<1>; // FP conversion - def M1UnitNCRYPT : ProcResource<1>; // Cryptographic -} -def M1PipeF1 : ProcResource<1>; // FP #1 -let Super = M1PipeF1 in { - def M1UnitFADD : ProcResource<1>; // Simple FP - def M1UnitNAL1 : ProcResource<1>; // Simple vector - def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized) - def M1UnitFST : ProcResource<1>; // FP store -} - -def M1UnitALU : ProcResGroup<[M1UnitA, - M1UnitC]>; // All integer -def M1UnitNALU : ProcResGroup<[M1UnitNAL0, - M1UnitNAL1]>; // All simple vector - -//===----------------------------------------------------------------------===// -// Coarse scheduling model. 
- -def M1WriteA1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; } -def M1WriteA2 : SchedWriteRes<[M1UnitALU]> { let Latency = 2; } -def M1WriteAA : SchedWriteRes<[M1UnitALU]> { let Latency = 2; - let ResourceCycles = [2]; } -def M1WriteAB : SchedWriteRes<[M1UnitALU, - M1UnitC]> { let Latency = 1; - let NumMicroOps = 2; } -def M1WriteAC : SchedWriteRes<[M1UnitALU, - M1UnitALU, - M1UnitC]> { let Latency = 2; - let NumMicroOps = 3; } -def M1WriteAD : SchedWriteRes<[M1UnitALU, - M1UnitC]> { let Latency = 2; - let NumMicroOps = 2; } -def M1WriteAX : SchedWriteVariant<[SchedVar<ExynosArithPred, [M1WriteA1]>, - SchedVar<ExynosLogicPred, [M1WriteA1]>, - SchedVar<NoSchedPred, [M1WriteAA]>]>; -def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; } -def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; } - -def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; } -def M1WriteBX : SchedWriteVariant<[SchedVar<ExynosBranchLinkLRPred, [M1WriteAC]>, - SchedVar<NoSchedPred, [M1WriteAB]>]>; - -def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; } -def M1WriteL6 : SchedWriteRes<[M1UnitL]> { let Latency = 6; } -def M1WriteLA : SchedWriteRes<[M1UnitL]> { let Latency = 6; - let ResourceCycles = [2]; } -def M1WriteLB : SchedWriteRes<[M1UnitL, - M1UnitA]> { let Latency = 4; - let NumMicroOps = 2; } -def M1WriteLC : SchedWriteRes<[M1UnitL, - M1UnitA]> { let Latency = 5; - let NumMicroOps = 2; } -def M1WriteLD : SchedWriteRes<[M1UnitL, - M1UnitA]> { let Latency = 6; - let NumMicroOps = 2; - let ResourceCycles = [2, 1]; } -def M1WriteLH : SchedWriteRes<[]> { let Latency = 5; - let NumMicroOps = 0; } -def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteLC]>, - SchedVar<NoSchedPred, [M1WriteL5]>]>; - -def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; } -def M1WriteS3 : SchedWriteRes<[M1UnitS]> { let Latency = 3; } -def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; } -def M1WriteSA : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitA]> { let Latency = 3; - let NumMicroOps = 2; } -def M1WriteSB : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitA]> { let Latency = 3; - let NumMicroOps = 3; } -def M1WriteSC : SchedWriteRes<[M1UnitS, - M1UnitA]> { let Latency = 2; - let NumMicroOps = 2; } -def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteSC]>, - SchedVar<NoSchedPred, [M1WriteS1]>]>; - -def M1ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>, - SchedVar<NoSchedPred, [ReadDefault]>]>; - -// Branch instructions. -def : WriteRes<WriteBr, []> { let Latency = 0; } -def : WriteRes<WriteBrReg, [M1UnitC]> { let Latency = 1; } - -// Arithmetic and logical integer instructions. -def : WriteRes<WriteI, [M1UnitALU]> { let Latency = 1; } -def : WriteRes<WriteISReg, [M1UnitALU]> { let Latency = 1; } -def : WriteRes<WriteIEReg, [M1UnitALU]> { let Latency = 1; } -def : WriteRes<WriteIS, [M1UnitALU]> { let Latency = 1; } - -// Move instructions. -def : WriteRes<WriteImm, [M1UnitALU]> { let Latency = 1; } - -// Divide and multiply instructions. -def : WriteRes<WriteID32, [M1UnitC, - M1UnitD]> { let Latency = 13; - let ResourceCycles = [1, 13]; } -def : WriteRes<WriteID64, [M1UnitC, - M1UnitD]> { let Latency = 21; - let ResourceCycles = [1, 21]; } -// TODO: Long multiplication take 5 cycles and also the ALU. -def : WriteRes<WriteIM32, [M1UnitC]> { let Latency = 3; } -def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4; - let ResourceCycles = [2]; } - -// Miscellaneous instructions. 
-def : WriteRes<WriteExtr, [M1UnitALU, - M1UnitALU]> { let Latency = 2; - let NumMicroOps = 2; } - -// Addressing modes. -def : WriteRes<WriteAdr, []> { let Latency = 1; - let NumMicroOps = 0; } -def : SchedAlias<ReadAdrBase, M1ReadAdrBase>; - -// Load instructions. -def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; } -def : WriteRes<WriteLDHi, []> { let Latency = 4; - let NumMicroOps = 0; } -def : SchedAlias<WriteLDIdx, M1WriteLX>; - -// Store instructions. -def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; } -def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; } -def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; } -def : SchedAlias<WriteSTIdx, M1WriteSX>; - -// FP data instructions. -def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; } -def : WriteRes<WriteFCmp, [M1UnitNMISC]> { let Latency = 4; } -def : WriteRes<WriteFDiv, [M1UnitFVAR]> { let Latency = 15; - let ResourceCycles = [15]; } -def : WriteRes<WriteFMul, [M1UnitFMAC]> { let Latency = 4; } - -// FP miscellaneous instructions. -def : WriteRes<WriteFCvt, [M1UnitFCVT]> { let Latency = 3; } -def : WriteRes<WriteFImm, [M1UnitNALU]> { let Latency = 1; } -def : WriteRes<WriteFCopy, [M1UnitS]> { let Latency = 4; } - -// FP load instructions. -def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; } - -// FP store instructions. -def : WriteRes<WriteVST, [M1UnitS, - M1UnitFST]> { let Latency = 1; - let NumMicroOps = 1; } - -// ASIMD FP instructions. -def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; } - -// Other miscellaneous instructions. -def : WriteRes<WriteAtomic, []> { let Unsupported = 1; } -def : WriteRes<WriteBarrier, []> { let Latency = 1; } -def : WriteRes<WriteHint, []> { let Latency = 1; } -def : WriteRes<WriteSys, []> { let Latency = 1; } - -//===----------------------------------------------------------------------===// -// Fast forwarding. - -// TODO: Add FP register forwarding rules. -def : ReadAdvance<ReadI, 0>; -def : ReadAdvance<ReadISReg, 0>; -def : ReadAdvance<ReadIEReg, 0>; -def : ReadAdvance<ReadIM, 0>; -// TODO: The forwarding for WriteIM32 saves actually 2 cycles. -def : ReadAdvance<ReadIMA, 3, [WriteIM32, WriteIM64]>; -def : ReadAdvance<ReadID, 0>; -def : ReadAdvance<ReadExtrHi, 0>; -def : ReadAdvance<ReadAdrBase, 0>; -def : ReadAdvance<ReadVLD, 0>; - -//===----------------------------------------------------------------------===// -// Finer scheduling model. 
- -def M1WriteNEONA : SchedWriteRes<[M1UnitNALU, - M1UnitNALU, - M1UnitFADD]> { let Latency = 9; - let NumMicroOps = 3; } -def M1WriteNEONB : SchedWriteRes<[M1UnitNALU, - M1UnitFST]> { let Latency = 5; - let NumMicroOps = 2;} -def M1WriteNEONC : SchedWriteRes<[M1UnitNALU, - M1UnitFST]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteNEOND : SchedWriteRes<[M1UnitNALU, - M1UnitFST, - M1UnitL]> { let Latency = 10; - let NumMicroOps = 3; } -def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT, - M1UnitFST]> { let Latency = 8; - let NumMicroOps = 2; } -def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT, - M1UnitFST, - M1UnitL]> { let Latency = 13; - let NumMicroOps = 3; } -def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC, - M1UnitFST]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteNEONH : SchedWriteRes<[M1UnitNALU, - M1UnitFST]> { let Latency = 3; - let NumMicroOps = 2; } -def M1WriteNEONI : SchedWriteRes<[M1UnitFST, - M1UnitL]> { let Latency = 9; - let NumMicroOps = 2; } -def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC, - M1UnitFMAC]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC, - M1UnitFMAC]> { let Latency = 7; - let NumMicroOps = 2; } -def M1WriteNEONL : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; - let ResourceCycles = [2]; } -def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; } -def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; } -def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; } -def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; } -def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; } -// TODO -def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15; - let ResourceCycles = [15]; } -def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23; - let ResourceCycles = [23]; } -def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]> { let Latency = 1; } -def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; } -def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 1; } -def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 2; } -def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 3; } -def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } -def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; } -def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 1; } -def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; } -def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; } -def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; } -def M1WriteTB : SchedWriteRes<[M1UnitC, - M1UnitALU]> { let Latency = 2; - let NumMicroOps = 2; } -def M1WriteVLDA : SchedWriteRes<[M1UnitL, - M1UnitL]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteVLDB : SchedWriteRes<[M1UnitL, - M1UnitL, - M1UnitL]> { let Latency = 7; - let NumMicroOps = 3; } -def M1WriteVLDC : SchedWriteRes<[M1UnitL, - M1UnitL, - M1UnitL, - M1UnitL]> { let Latency = 8; - let NumMicroOps = 4; } -def M1WriteVLDD : SchedWriteRes<[M1UnitL, - M1UnitNALU]> { let Latency = 7; - let NumMicroOps = 2; - let ResourceCycles = [2, 1]; } -def M1WriteVLDE : SchedWriteRes<[M1UnitL, - M1UnitNALU]> { let Latency = 6; - let NumMicroOps = 2; } -def M1WriteVLDF : SchedWriteRes<[M1UnitL, - M1UnitL]> { let Latency = 10; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; } -def M1WriteVLDG : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU]> { let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = 
[2, 1, 1]; } -def M1WriteVLDH : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU]> { let Latency = 6; - let NumMicroOps = 3; } -def M1WriteVLDI : SchedWriteRes<[M1UnitL, - M1UnitL, - M1UnitL]> { let Latency = 12; - let NumMicroOps = 3; - let ResourceCycles = [2, 2, 2]; } -def M1WriteVLDJ : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU, - M1UnitNALU]> { let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1, 1]; } -def M1WriteVLDK : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU, - M1UnitNALU, - M1UnitNALU]> { let Latency = 9; - let NumMicroOps = 5; - let ResourceCycles = [2, 1, 1, 1, 1]; } -def M1WriteVLDL : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU, - M1UnitL, - M1UnitNALU]> { let Latency = 7; - let NumMicroOps = 5; - let ResourceCycles = [1, 1, 1, 1, 1]; } -def M1WriteVLDM : SchedWriteRes<[M1UnitL, - M1UnitNALU, - M1UnitNALU, - M1UnitL, - M1UnitNALU, - M1UnitNALU]> { let Latency = 7; - let NumMicroOps = 6; - let ResourceCycles = [1, 1, 1, 1, 1, 1]; } -def M1WriteVLDN : SchedWriteRes<[M1UnitL, - M1UnitL, - M1UnitL, - M1UnitL]> { let Latency = 14; - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 2, 1]; } -def M1WriteVSTA : WriteSequence<[WriteVST], 2>; -def M1WriteVSTB : WriteSequence<[WriteVST], 3>; -def M1WriteVSTC : WriteSequence<[WriteVST], 4>; -def M1WriteVSTD : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitFST]> { let Latency = 7; - let NumMicroOps = 2; - let ResourceCycles = [7, 1, 1]; } -def M1WriteVSTE : SchedWriteRes<[M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitFST]> { let Latency = 8; - let NumMicroOps = 3; - let ResourceCycles = [7, 1, 1, 1, 1]; } -def M1WriteVSTF : SchedWriteRes<[M1UnitNALU, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitFST, - M1UnitFST]> { let Latency = 15; - let NumMicroOps = 5; - let ResourceCycles = [1, 7, 1, 7, 1, 1, 1]; } -def M1WriteVSTG : SchedWriteRes<[M1UnitNALU, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitFST, - M1UnitFST]> { let Latency = 16; - let NumMicroOps = 6; - let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1]; } -def M1WriteVSTH : SchedWriteRes<[M1UnitNALU, - M1UnitS, - M1UnitFST, - M1UnitFST, - M1UnitFST]> { let Latency = 14; - let NumMicroOps = 4; - let ResourceCycles = [1, 7, 1, 7, 1]; } -def M1WriteVSTI : SchedWriteRes<[M1UnitNALU, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitS, - M1UnitFST, - M1UnitFST, - M1UnitFST]> { let Latency = 17; - let NumMicroOps = 7; - let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; } - -// Special cases. -def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; } -def M1WriteCOPY : SchedWriteVariant<[SchedVar<ExynosFPPred, [M1WriteNALU1]>, - SchedVar<NoSchedPred, [M1WriteA1]>]>; - -// Fast forwarding. -def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>; - -// Branch instructions -def : InstRW<[M1WriteB1], (instrs Bcc)>; -def : InstRW<[M1WriteA1], (instrs BL)>; -def : InstRW<[M1WriteBX], (instrs BLR)>; -def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>; -def : InstRW<[M1WriteAD], (instregex "^TBN?Z[WX]")>; - -// Arithmetic and logical integer instructions. -def : InstRW<[M1WriteAX], (instregex ".+rx(64)?$")>; -def : InstRW<[M1WriteAX], (instregex ".+rs$")>; - -// Move instructions. -def : InstRW<[M1WriteCOPY], (instrs COPY)>; - -// Divide and multiply instructions. - -// Miscellaneous instructions. - -// Load instructions. 
-def : InstRW<[M1WriteLB, - WriteLDHi, - WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>; -def : InstRW<[M1WriteLC, - ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>; -def : InstRW<[M1WriteL5, - ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>; -def : InstRW<[M1WriteLC, - ReadAdrBase], (instrs PRFMroW)>; -def : InstRW<[M1WriteL5, - ReadAdrBase], (instrs PRFMroX)>; - -// Store instructions. -def : InstRW<[M1WriteSC, - ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>; -def : InstRW<[WriteST, - ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>; - -// FP data instructions. -def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>; -def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>; -def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>; -def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>; -def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>; -def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>; -def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>; -def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>; -def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>; -def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>; -def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>; -def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>; -def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>; - -// FP miscellaneous instructions. -def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>; -def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>; -def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>; -def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>; -def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev1")>; -def : InstRW<[M1WriteNMISC1], (instregex "^FRECPXv1")>; -def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)S(16|32|64)")>; -def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>; -def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>; - -// FP load instructions. -def : InstRW<[WriteVLD], (instregex "^LDR[DSQ]l")>; -def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>; -def : InstRW<[WriteVLD, - WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>; -def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>; -def : InstRW<[M1WriteLD, - ReadAdrBase], (instregex "^LDR[BDHS]roW")>; -def : InstRW<[WriteVLD, - ReadAdrBase], (instregex "^LDR[BDHS]roX")>; -def : InstRW<[M1WriteLD, - ReadAdrBase], (instregex "^LDRQro[WX]")>; -def : InstRW<[WriteVLD, - M1WriteLH], (instregex "^LDN?P[DS]i")>; -def : InstRW<[M1WriteLA, - M1WriteLH], (instregex "^LDN?PQi")>; -def : InstRW<[M1WriteLC, - M1WriteLH, - WriteAdr], (instregex "^LDP[DS](post|pre)")>; -def : InstRW<[M1WriteLD, - M1WriteLH, - WriteAdr], (instregex "^LDPQ(post|pre)")>; - -// FP store instructions. -def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>; -def : InstRW<[WriteVST, - WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>; -def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>; -def : InstRW<[M1WriteSA, - ReadAdrBase], (instregex "^STR[BDHS]roW")>; -def : InstRW<[WriteVST, - ReadAdrBase], (instregex "^STR[BDHS]roX")>; -def : InstRW<[M1WriteSA, - ReadAdrBase], (instregex "^STRQro[WX]")>; -def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>; -def : InstRW<[WriteVST, - WriteAdr], (instregex "^STP[DS](post|pre)")>; -def : InstRW<[M1WriteSB, - WriteAdr], (instregex "^STPQ(post|pre)")>; - -// ASIMD instructions. 
-def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>; -def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>; -def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>; -def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>; -def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>; -def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>; -def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>; -def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>; -def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>; -def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>; -def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>; -def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>; -def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>; -def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>; -def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>; -def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>; -def : InstRW<[M1WriteNALU1], (instregex "^SHL[dv]")>; -def : InstRW<[M1WriteNALU1], (instregex "^[SU]SH[LR][dv]")>; -def : InstRW<[M1WriteNALU1], (instregex "^S[RS]I[dv]")>; -def : InstRW<[M1WriteNAL13], (instregex "^(([SU]Q)?R)?SHRU?N[bhsv]")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU]RSH[LR][dv]")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU]QR?SHLU?[bdhsv]")>; - -// ASIMD FP instructions. -def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>; -def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>; -def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>; -def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>; -def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>; -def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>; -def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>; -def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>; -def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>; -def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>; -def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>; -def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>; -def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>; -def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>; -def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>; -def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>; - -// ASIMD miscellaneous instructions. -def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>; -def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>; -def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>; -def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>; -def : InstRW<[M1WriteNALU1], (instregex "^EXTv8")>; -def : InstRW<[M1WriteNEONL], (instregex "^EXTv16")>; -def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>; -def : InstRW<[M1WriteNALU1], (instregex "^CPY")>; -def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>; -def : InstRW<[M1WriteNALU1], (instregex "^MOVI[Dv]")>; -def : InstRW<[M1WriteNALU1], (instregex "^FMOVv")>; -def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev[248]")>; -def : InstRW<[M1Wr |