path: root/llvm/lib/Target
author     Dimitry Andric <dim@FreeBSD.org>   2020-01-17 20:45:01 +0000
committer  Dimitry Andric <dim@FreeBSD.org>   2020-01-17 20:45:01 +0000
commit     706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree       4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Target
parent     7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
download   src-706b4fc47bbc608932d3b491ae19a3b9cde9497b.tar.gz
           src-706b4fc47bbc608932d3b491ae19a3b9cde9497b.zip
Vendor import of llvm-project master e26a78e70, the last commit before
the llvmorg-11-init tag, from which release/10.x was branched.
Notes:
    svn path=/vendor/llvm-project/master/; revision=356843
    svn path=/vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085/; revision=356844; tag=vendor/llvm-project/llvmorg-10-init-17466-ge26a78e7085
Diffstat (limited to 'llvm/lib/Target')
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td123
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp87
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallLowering.cpp10
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.td29
-rw-r--r--llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp13
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp14
-rw-r--r--llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64FastISel.cpp22
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp420
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.h11
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp201
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp1219
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h58
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrAtomics.td6
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td165
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp1037
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h28
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td255
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp381
-rw-r--r--llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp11
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp433
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h87
-rw-r--r--llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp23
-rw-r--r--llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp7
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h4
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp10
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.td38
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td1243
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM1.td850
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM3.td20
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM4.td32
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM5.td1012
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedPredExynos.td14
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedPredicates.td80
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td19
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackOffset.h12
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTagging.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTaggingPreRA.cpp32
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp16
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h9
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td2
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp35
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp26
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h35
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp16
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp3
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.h3
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp18
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp21
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h16
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp8
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp5
-rw-r--r--llvm/lib/Target/AArch64/SVEInstrFormats.td1261
-rw-r--r--llvm/lib/Target/AArch64/TargetInfo/AArch64TargetInfo.cpp2
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h19
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp25
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp25
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td22
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def144
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp45
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h29
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp38
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp101
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInline.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td24
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp573
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h37
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td252
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp425
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp19
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp32
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp675
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h15
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td141
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp61
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp208
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h31
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUUnifyMetadata.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp46
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td4
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td10
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h2
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td111
-rw-r--r--llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp18
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp13
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h14
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.h2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/MIMGInstructions.td10
-rw-r--r--llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.cpp48
-rw-r--r--llvm/lib/Target/AMDGPU/R600InstrInfo.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/R600InstrInfo.h2
-rw-r--r--llvm/lib/Target/AMDGPU/R600Instructions.td25
-rw-r--r--llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp235
-rw-r--r--llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp265
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h21
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertSkips.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp70
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp170
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h25
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td132
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td135
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp697
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp17
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp180
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h19
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineScheduler.h1
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeExecMasking.cpp21
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIPreAllocateWWMRegs.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp49
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.h5
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td16
-rw-r--r--llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp158
-rw-r--r--llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp4
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp30
-rw-r--r--llvm/lib/Target/AMDGPU/SMInstructions.td10
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td23
-rw-r--r--llvm/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp42
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h61
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td87
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td43
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td42
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td4
-rw-r--r--llvm/lib/Target/AMDGPU/VOPCInstructions.td4
-rw-r--r--llvm/lib/Target/ARC/ARCAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/ARC/ARCBranchFinalize.cpp1
-rw-r--r--llvm/lib/Target/ARC/ARCISelLowering.cpp2
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.h2
-rw-r--r--llvm/lib/Target/ARC/ARCOptAddrMode.cpp3
-rw-r--r--llvm/lib/Target/ARC/ARCRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/ARC/ARCTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/ARC/Disassembler/ARCDisassembler.cpp4
-rw-r--r--llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h6
-rw-r--r--llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp5
-rw-r--r--llvm/lib/Target/ARC/TargetInfo/ARCTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/A15SDOptimizer.cpp3
-rw-r--r--llvm/lib/Target/ARM/ARM.h4
-rw-r--r--llvm/lib/Target/ARM/ARM.td32
-rw-r--r--llvm/lib/Target/ARM/ARMAsmPrinter.cpp12
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp89
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.h128
-rw-r--r--llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp9
-rw-r--r--llvm/lib/Target/ARM/ARMCallLowering.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMCallingConv.cpp46
-rw-r--r--llvm/lib/Target/ARM/ARMCallingConv.h3
-rw-r--r--llvm/lib/Target/ARM/ARMCallingConv.td35
-rw-r--r--llvm/lib/Target/ARM/ARMCodeGenPrepare.cpp1069
-rw-r--r--llvm/lib/Target/ARM/ARMConstantIslandPass.cpp20
-rw-r--r--llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp21
-rw-r--r--llvm/lib/Target/ARM/ARMFastISel.cpp10
-rw-r--r--llvm/lib/Target/ARM/ARMFrameLowering.cpp12
-rw-r--r--llvm/lib/Target/ARM/ARMFrameLowering.h2
-rw-r--r--llvm/lib/Target/ARM/ARMHazardRecognizer.h7
-rw-r--r--llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp611
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp727
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.h26
-rw-r--r--llvm/lib/Target/ARM/ARMInstrInfo.td61
-rw-r--r--llvm/lib/Target/ARM/ARMInstrMVE.td2779
-rw-r--r--llvm/lib/Target/ARM/ARMInstrNEON.td169
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb2.td228
-rw-r--r--llvm/lib/Target/ARM/ARMInstrVFP.td31
-rw-r--r--llvm/lib/Target/ARM/ARMInstructionSelector.cpp18
-rw-r--r--llvm/lib/Target/ARM/ARMLegalizerInfo.cpp10
-rw-r--r--llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp27
-rw-r--r--llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp968
-rw-r--r--llvm/lib/Target/ARM/ARMParallelDSP.cpp15
-rw-r--r--llvm/lib/Target/ARM/ARMPredicates.td8
-rw-r--r--llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp7
-rw-r--r--llvm/lib/Target/ARM/ARMRegisterBankInfo.h4
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.cpp17
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.h20
-rw-r--r--llvm/lib/Target/ARM/ARMTargetMachine.cpp33
-rw-r--r--llvm/lib/Target/ARM/ARMTargetMachine.h2
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp333
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h36
-rw-r--r--llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp105
-rw-r--r--llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp95
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp7
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp29
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp11
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h8
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp8
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp2
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h2
-rw-r--r--llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp301
-rw-r--r--llvm/lib/Target/ARM/MVETailPredication.cpp167
-rw-r--r--llvm/lib/Target/ARM/MVEVPTBlockPass.cpp161
-rw-r--r--llvm/lib/Target/ARM/TargetInfo/ARMTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/Thumb1InstrInfo.cpp4
-rw-r--r--llvm/lib/Target/ARM/Thumb1InstrInfo.h2
-rw-r--r--llvm/lib/Target/ARM/Thumb2InstrInfo.cpp121
-rw-r--r--llvm/lib/Target/ARM/Thumb2InstrInfo.h2
-rw-r--r--llvm/lib/Target/ARM/Utils/ARMBaseInfo.h53
-rw-r--r--llvm/lib/Target/AVR/AVRAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp91
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.cpp9
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.h2
-rw-r--r--llvm/lib/Target/AVR/AVRInstrFormats.td4
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.h2
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.td14
-rw-r--r--llvm/lib/Target/AVR/AVRTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp2
-rw-r--r--llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp4
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h6
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.h2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp8
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/AVR/TargetInfo/AVRTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp86
-rw-r--r--llvm/lib/Target/BPF/BPFAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp1
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.h2
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.td19
-rw-r--r--llvm/lib/Target/BPF/BPFMIPeephole.cpp89
-rw-r--r--llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp163
-rw-r--r--llvm/lib/Target/BPF/BPFSubtarget.cpp1
-rw-r--r--llvm/lib/Target/BPF/BPFTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/BPF/BTF.h10
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.cpp157
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.h19
-rw-r--r--llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp4
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFInstPrinter.h6
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h2
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/BPF/TargetInfo/BPFTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/BitTracker.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp17
-rw-r--r--llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp9
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp11
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp16
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonExpandCondsets.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp96
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenExtract.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenInsert.cpp5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenMux.cpp10
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenPredicate.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp21
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.cpp89
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.h5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp179
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.h2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonIntrinsics.td3
-rw-r--r--llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp20
-rw-r--r--llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonOptimizeSZextends.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPatterns.td4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPseudo.td20
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRegisterInfo.td2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp42
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h19
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVExtract.cpp44
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp9
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVectorLoopCarriedReuse.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp11
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h6
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp10
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp12
-rw-r--r--llvm/lib/Target/Hexagon/RDFLiveness.cpp10
-rw-r--r--llvm/lib/Target/Hexagon/TargetInfo/HexagonTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.cpp9
-rw-r--r--llvm/lib/Target/Lanai/Disassembler/LanaiDisassembler.h3
-rw-r--r--llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/Lanai/LanaiISelLowering.cpp2
-rw-r--r--llvm/lib/Target/Lanai/LanaiISelLowering.h2
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.cpp10
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.h4
-rw-r--r--llvm/lib/Target/Lanai/LanaiTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h9
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h6
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp3
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.h3
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/Lanai/TargetInfo/LanaiTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp2
-rw-r--r--llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp16
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp7
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h6
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp3
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.h2
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp25
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelLowering.cpp20
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelLowering.h3
-rw-r--r--llvm/lib/Target/MSP430/MSP430InstrInfo.cpp4
-rw-r--r--llvm/lib/Target/MSP430/MSP430InstrInfo.h2
-rw-r--r--llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h10
-rw-r--r--llvm/lib/Target/MSP430/MSP430Subtarget.h4
-rw-r--r--llvm/lib/Target/MSP430/MSP430TargetMachine.cpp2
-rw-r--r--llvm/lib/Target/MSP430/TargetInfo/MSP430TargetInfo.cpp2
-rw-r--r--llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp443
-rw-r--r--llvm/lib/Target/Mips/Disassembler/MipsDisassembler.cpp42
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h4
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.cpp1
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp6
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsBaseInfo.h12
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h6
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp18
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.h3
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp15
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp3
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp9
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp4
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp24
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp35
-rw-r--r--llvm/lib/Target/Mips/MicroMips32r6InstrFormats.td17
-rw-r--r--llvm/lib/Target/Mips/MicroMips32r6InstrInfo.td22
-rw-r--r--llvm/lib/Target/Mips/MicroMipsInstrFPU.td3
-rw-r--r--llvm/lib/Target/Mips/MicroMipsInstrInfo.td9
-rw-r--r--llvm/lib/Target/Mips/Mips.td17
-rw-r--r--llvm/lib/Target/Mips/Mips16ISelLowering.cpp3
-rw-r--r--llvm/lib/Target/Mips/Mips16InstrInfo.cpp23
-rw-r--r--llvm/lib/Target/Mips/Mips16InstrInfo.h9
-rw-r--r--llvm/lib/Target/Mips/Mips16InstrInfo.td8
-rw-r--r--llvm/lib/Target/Mips/Mips32r6InstrInfo.td10
-rw-r--r--llvm/lib/Target/Mips/Mips64InstrInfo.td43
-rw-r--r--llvm/lib/Target/Mips/Mips64r6InstrInfo.td7
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp12
-rw-r--r--llvm/lib/Target/Mips/MipsCallLowering.cpp5
-rw-r--r--llvm/lib/Target/Mips/MipsCallingConv.td3
-rw-r--r--llvm/lib/Target/Mips/MipsCondMov.td16
-rw-r--r--llvm/lib/Target/Mips/MipsConstantIslandPass.cpp10
-rw-r--r--llvm/lib/Target/Mips/MipsDSPInstrInfo.td11
-rw-r--r--llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp72
-rw-r--r--llvm/lib/Target/Mips/MipsExpandPseudo.cpp230
-rw-r--r--llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp1
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp220
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.h2
-rw-r--r--llvm/lib/Target/Mips/MipsInstrFPU.td48
-rw-r--r--llvm/lib/Target/Mips/MipsInstrFormats.td19
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.td194
-rw-r--r--llvm/lib/Target/Mips/MipsInstructionSelector.cpp148
-rw-r--r--llvm/lib/Target/Mips/MipsLegalizerInfo.cpp113
-rw-r--r--llvm/lib/Target/Mips/MipsMCInstLower.cpp8
-rw-r--r--llvm/lib/Target/Mips/MipsMCInstLower.h4
-rw-r--r--llvm/lib/Target/Mips/MipsMSAInstrInfo.td1
-rw-r--r--llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp1
-rw-r--r--llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp27
-rw-r--r--llvm/lib/Target/Mips/MipsRegisterBankInfo.h4
-rw-r--r--llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp22
-rw-r--r--llvm/lib/Target/Mips/MipsSEISelLowering.cpp1
-rw-r--r--llvm/lib/Target/Mips/MipsSEInstrInfo.cpp27
-rw-r--r--llvm/lib/Target/Mips/MipsSEInstrInfo.h10
-rw-r--r--llvm/lib/Target/Mips/MipsScheduleGeneric.td12
-rw-r--r--llvm/lib/Target/Mips/MipsScheduleP5600.td6
-rw-r--r--llvm/lib/Target/Mips/MipsSubtarget.cpp25
-rw-r--r--llvm/lib/Target/Mips/MipsSubtarget.h4
-rw-r--r--llvm/lib/Target/Mips/MipsTargetMachine.cpp7
-rw-r--r--llvm/lib/Target/Mips/MipsTargetStreamer.h4
-rw-r--r--llvm/lib/Target/Mips/TargetInfo/MipsTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h6
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp3
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h3
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/ManagedStringPool.h2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp9
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.h5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXImageOptimizer.cpp1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.h2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXIntrinsics.td42
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXLowerAlloca.cpp3
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/NVPTX/NVVMIntrRange.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVVMReflect.cpp1
-rw-r--r--llvm/lib/Target/NVPTX/TargetInfo/NVPTXTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp90
-rw-r--r--llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp8
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h6
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp28
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h7
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp23
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h24
-rw-r--r--llvm/lib/Target/PowerPC/P9InstrResources.td229
-rw-r--r--llvm/lib/Target/PowerPC/PPC.h11
-rw-r--r--llvm/lib/Target/PowerPC/PPC.td175
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp356
-rw-r--r--llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp1
-rw-r--r--llvm/lib/Target/PowerPC/PPCCTRLoops.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.cpp7
-rw-r--r--llvm/lib/Target/PowerPC/PPCHazardRecognizers.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp401
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp1357
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h936
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstr64Bit.td78
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrAltivec.td121
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrFormats.td61
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrHTM.td16
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.cpp549
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.h14
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td383
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrVSX.td190
-rw-r--r--llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp (renamed from llvm/lib/Target/PowerPC/PPCLoopPreIncPrep.cpp)441
-rw-r--r--llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp164
-rw-r--r--llvm/lib/Target/PowerPC/PPCMIPeephole.cpp390
-rw-r--r--llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h2
-rw-r--r--llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp25
-rw-r--r--llvm/lib/Target/PowerPC/PPCReduceCRLogicals.cpp13
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp35
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.h9
-rw-r--r--llvm/lib/Target/PowerPC/PPCSubtarget.cpp7
-rw-r--r--llvm/lib/Target/PowerPC/PPCSubtarget.h45
-rw-r--r--llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp1
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetMachine.cpp20
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp96
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h17
-rw-r--r--llvm/lib/Target/PowerPC/PPCVSXFMAMutate.cpp1
-rw-r--r--llvm/lib/Target/PowerPC/TargetInfo/PowerPCTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp61
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp4
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp10
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp13
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h8
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp6
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp5
-rw-r--r--llvm/lib/Target/RISCV/RISCV.td29
-rw-r--r--llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp16
-rw-r--r--llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp132
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp3
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp91
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h13
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp259
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.h46
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td23
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoA.td2
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoD.td3
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp18
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.h2
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.cpp1
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h5
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp33
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.h10
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h6
-rw-r--r--llvm/lib/Target/RISCV/TargetInfo/RISCVTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp17
-rw-r--r--llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h36
-rw-r--r--llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp2
-rw-r--r--llvm/lib/Target/Sparc/DelaySlotFiller.cpp2
-rw-r--r--llvm/lib/Target/Sparc/Disassembler/SparcDisassembler.cpp4
-rwxr-xr-xllvm/lib/Target/Sparc/LeonPasses.h2
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h8
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp8
-rw-r--r--llvm/lib/Target/Sparc/SparcAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcISelDAGToDAG.cpp3
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.h2
-rw-r--r--llvm/lib/Target/Sparc/SparcInstr64Bit.td2
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.h2
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.td5
-rw-r--r--llvm/lib/Target/Sparc/SparcTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Sparc/TargetInfo/SparcTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp24
-rw-r--r--llvm/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp16
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h6
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp5
-rw-r--r--llvm/lib/Target/SystemZ/SystemZ.h2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp31
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCallingConv.h7
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCallingConv.td27
-rw-r--r--llvm/lib/Target/SystemZ/SystemZElimCompare.cpp169
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp264
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.h11
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp14
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp379
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.h31
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrFP.td60
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrFormats.td30
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp58
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.h9
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.td19
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrVector.td52
-rw-r--r--llvm/lib/Target/SystemZ/SystemZLongBranch.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h52
-rw-r--r--llvm/lib/Target/SystemZ/SystemZOperands.td22
-rw-r--r--llvm/lib/Target/SystemZ/SystemZOperators.td40
-rw-r--r--llvm/lib/Target/SystemZ/SystemZPatterns.td4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp96
-rw-r--r--llvm/lib/Target/SystemZ/SystemZShortenInst.cpp38
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTDC.cpp1
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp21
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h11
-rw-r--r--llvm/lib/Target/SystemZ/TargetInfo/SystemZTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/TargetMachine.cpp19
-rw-r--r--llvm/lib/Target/TargetMachineC.cpp6
-rw-r--r--llvm/lib/Target/VE/InstPrinter/VEInstPrinter.cpp118
-rw-r--r--llvm/lib/Target/VE/InstPrinter/VEInstPrinter.h49
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp40
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.h31
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp106
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h53
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp44
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.h47
-rw-r--r--llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp23
-rw-r--r--llvm/lib/Target/VE/VE.h109
-rw-r--r--llvm/lib/Target/VE/VE.td56
-rw-r--r--llvm/lib/Target/VE/VEAsmPrinter.cpp78
-rw-r--r--llvm/lib/Target/VE/VECallingConv.td19
-rw-r--r--llvm/lib/Target/VE/VEFrameLowering.cpp325
-rw-r--r--llvm/lib/Target/VE/VEFrameLowering.h81
-rw-r--r--llvm/lib/Target/VE/VEISelDAGToDAG.cpp70
-rw-r--r--llvm/lib/Target/VE/VEISelLowering.cpp137
-rw-r--r--llvm/lib/Target/VE/VEISelLowering.h62
-rw-r--r--llvm/lib/Target/VE/VEInstrFormats.td75
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.cpp133
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.h48
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.td288
-rw-r--r--llvm/lib/Target/VE/VEMCInstLower.cpp69
-rw-r--r--llvm/lib/Target/VE/VERegisterInfo.cpp133
-rw-r--r--llvm/lib/Target/VE/VERegisterInfo.h49
-rw-r--r--llvm/lib/Target/VE/VERegisterInfo.td37
-rw-r--r--llvm/lib/Target/VE/VESubtarget.cpp99
-rw-r--r--llvm/lib/Target/VE/VESubtarget.h73
-rw-r--r--llvm/lib/Target/VE/VETargetMachine.cpp108
-rw-r--r--llvm/lib/Target/VE/VETargetMachine.h57
-rw-r--r--llvm/lib/Target/VE/VETargetTransformInfo.h50
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp38
-rw-r--r--llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp9
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h6
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp3
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h3
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp7
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h7
-rw-r--r--llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssembly.h4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp8
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.h1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISD.def4
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp7
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp47
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h9
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrFloat.td7
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.h6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td59
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp5
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h3
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp224
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86Operand.h27
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp1670
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp1938
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h69
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h6
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp463
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h294
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp8
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h6
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp1057
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp14
-rw-r--r--llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp2
-rw-r--r--llvm/lib/Target/X86/X86.h12
-rw-r--r--llvm/lib/Target/X86/X86.td133
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.cpp50
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.h6
-rw-r--r--llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp9
-rw-r--r--llvm/lib/Target/X86/X86CallFrameOptimization.cpp12
-rw-r--r--llvm/lib/Target/X86/X86CallLowering.cpp2
-rw-r--r--llvm/lib/Target/X86/X86CallingConv.td35
-rw-r--r--llvm/lib/Target/X86/X86CmovConversion.cpp7
-rw-r--r--llvm/lib/Target/X86/X86CondBrFolding.cpp2
-rw-r--r--llvm/lib/Target/X86/X86DomainReassignment.cpp6
-rwxr-xr-xllvm/lib/Target/X86/X86EvexToVex.cpp2
-rw-r--r--llvm/lib/Target/X86/X86ExpandPseudo.cpp10
-rw-r--r--llvm/lib/Target/X86/X86FastISel.cpp2
-rw-r--r--llvm/lib/Target/X86/X86FixupBWInsts.cpp24
-rw-r--r--llvm/lib/Target/X86/X86FixupLEAs.cpp7
-rw-r--r--llvm/lib/Target/X86/X86FixupSetCC.cpp50
-rw-r--r--llvm/lib/Target/X86/X86FlagsCopyLowering.cpp92
-rw-r--r--llvm/lib/Target/X86/X86FloatingPoint.cpp48
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp32
-rw-r--r--llvm/lib/Target/X86/X86ISelDAGToDAG.cpp179
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp4018
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h65
-rw-r--r--llvm/lib/Target/X86/X86IndirectBranchTracking.cpp6
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td705
-rw-r--r--llvm/lib/Target/X86/X86InstrControl.td23
-rw-r--r--llvm/lib/Target/X86/X86InstrFMA.td35
-rw-r--r--llvm/lib/Target/X86/X86InstrFPStack.td141
-rw-r--r--llvm/lib/Target/X86/X86InstrFormats.td6
-rw-r--r--llvm/lib/Target/X86/X86InstrFragmentsSIMD.td96
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp175
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.h14
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.td61
-rw-r--r--llvm/lib/Target/X86/X86InstrMMX.td10
-rw-r--r--llvm/lib/Target/X86/X86InstrSSE.td560
-rw-r--r--llvm/lib/Target/X86/X86InstrTSX.td2
-rw-r--r--llvm/lib/Target/X86/X86InstructionSelector.cpp87
-rw-r--r--llvm/lib/Target/X86/X86IntrinsicsInfo.h1
-rw-r--r--llvm/lib/Target/X86/X86LegalizerInfo.cpp8
-rw-r--r--llvm/lib/Target/X86/X86MCInstLower.cpp166
-rw-r--r--llvm/lib/Target/X86/X86MacroFusion.cpp183
-rw-r--r--llvm/lib/Target/X86/X86OptimizeLEAs.cpp24
-rw-r--r--llvm/lib/Target/X86/X86PadShortFunction.cpp20
-rw-r--r--llvm/lib/Target/X86/X86PfmCounters.td16
-rw-r--r--llvm/lib/Target/X86/X86RegisterBankInfo.cpp5
-rw-r--r--llvm/lib/Target/X86/X86RegisterBankInfo.h4
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.cpp34
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.td5
-rw-r--r--llvm/lib/Target/X86/X86RetpolineThunks.cpp12
-rw-r--r--llvm/lib/Target/X86/X86ScheduleAtom.td3
-rw-r--r--llvm/lib/Target/X86/X86ScheduleSLM.td30
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver2.td1548
-rw-r--r--llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp12
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp14
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h36
-rw-r--r--llvm/lib/Target/X86/X86TargetMachine.cpp20
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp146
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h22
-rw-r--r--llvm/lib/Target/X86/X86VZeroUpper.cpp2
-rw-r--r--llvm/lib/Target/X86/X86WinAllocaExpander.cpp16
-rw-r--r--llvm/lib/Target/X86/X86WinEHState.cpp10
-rw-r--r--llvm/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp10
-rw-r--r--llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.cpp7
-rw-r--r--llvm/lib/Target/XCore/MCTargetDesc/XCoreInstPrinter.h6
-rw-r--r--llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp5
-rw-r--r--llvm/lib/Target/XCore/TargetInfo/XCoreTargetInfo.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreAsmPrinter.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp1
-rw-r--r--llvm/lib/Target/XCore/XCoreISelLowering.cpp1
-rw-r--r--llvm/lib/Target/XCore/XCoreInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/XCore/XCoreInstrInfo.h2
-rw-r--r--llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp1
-rw-r--r--llvm/lib/Target/XCore/XCoreRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreTargetMachine.cpp2
716 files changed, 37437 insertions, 17375 deletions
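The per-file counts above are git's diffstat for the import, restricted to llvm/lib/Target. As a sketch, it can be regenerated from a checkout that contains both commits named in the header (the parent and the imported commit), with rename detection enabled so PPCLoopInstrFormPrep.cpp is reported as a rename:

    git diff -M --stat 7cc9cf2bf09f..706b4fc47bbc -- llvm/lib/Target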
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 5b4c9e2149da..0106355b1a44 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -448,9 +448,9 @@ include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
include "AArch64SchedFalkor.td"
include "AArch64SchedKryo.td"
-include "AArch64SchedExynosM1.td"
include "AArch64SchedExynosM3.td"
include "AArch64SchedExynosM4.td"
+include "AArch64SchedExynosM5.td"
include "AArch64SchedThunderX.td"
include "AArch64SchedThunderX2T99.td"
@@ -565,8 +565,8 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targetting apple OSes.
-def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
- "Cyclone", [
+def ProcAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
+ "Apple A7 (the CPU formerly known as Cyclone)", [
FeatureAlternateSExtLoadCVTF32Pattern,
FeatureArithmeticBccFusion,
FeatureArithmeticCbzFusion,
@@ -582,32 +582,82 @@ def ProcCyclone : SubtargetFeature<"cyclone", "ARMProcFamily", "Cyclone",
FeatureZCZeroingFPWorkaround
]>;
-def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
- "Samsung Exynos-M1 processors",
- [FeatureSlowPaired128,
- FeatureCRC,
- FeatureCrypto,
- FeatureExynosCheapAsMoveHandling,
- FeatureForce32BitJumpTables,
- FeatureFuseAES,
- FeaturePerfMon,
- FeaturePostRAScheduler,
- FeatureSlowMisaligned128Store,
- FeatureUseRSqrt,
- FeatureZCZeroingFP]>;
+def ProcAppleA10 : SubtargetFeature<"apple-a10", "ARMProcFamily", "AppleA10",
+ "Apple A10", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
+ FeatureCrypto,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureFuseCryptoEOR,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeatureZCRegMove,
+ FeatureZCZeroing,
+ FeatureCRC,
+ FeatureRDM,
+ FeaturePAN,
+ FeatureLOR,
+ FeatureVH,
+ ]>;
-def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1",
- "Samsung Exynos-M2 processors",
- [FeatureSlowPaired128,
- FeatureCRC,
+def ProcAppleA11 : SubtargetFeature<"apple-a11", "ARMProcFamily", "AppleA11",
+ "Apple A11", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
+ FeatureCrypto,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureFuseCryptoEOR,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeatureZCRegMove,
+ FeatureZCZeroing,
+ FeatureFullFP16,
+ HasV8_2aOps
+ ]>;
+
+def ProcAppleA12 : SubtargetFeature<"apple-a12", "ARMProcFamily", "AppleA12",
+ "Apple A12", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
+ FeatureCrypto,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureFuseCryptoEOR,
+ FeatureNEON,
+ FeaturePerfMon,
+ FeatureZCRegMove,
+ FeatureZCZeroing,
+ FeatureFullFP16,
+ HasV8_3aOps
+ ]>;
+
+def ProcAppleA13 : SubtargetFeature<"apple-a13", "ARMProcFamily", "AppleA13",
+ "Apple A13", [
+ FeatureAlternateSExtLoadCVTF32Pattern,
+ FeatureArithmeticBccFusion,
+ FeatureArithmeticCbzFusion,
FeatureCrypto,
- FeatureExynosCheapAsMoveHandling,
- FeatureForce32BitJumpTables,
+ FeatureDisableLatencySchedHeuristic,
+ FeatureFPARMv8,
FeatureFuseAES,
+ FeatureFuseCryptoEOR,
+ FeatureNEON,
FeaturePerfMon,
- FeaturePostRAScheduler,
- FeatureSlowMisaligned128Store,
- FeatureZCZeroingFP]>;
+ FeatureZCRegMove,
+ FeatureZCZeroing,
+ FeatureFullFP16,
+ FeatureFP16FML,
+ FeatureSHA3,
+ HasV8_4aOps
+ ]>;
def ProcExynosM3 : SubtargetFeature<"exynosm3", "ARMProcFamily", "ExynosM3",
"Samsung Exynos-M3 processors",
@@ -815,12 +865,9 @@ def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>;
def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>;
-def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>;
-def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>;
-def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
def : ProcessorModel<"exynos-m4", ExynosM4Model, [ProcExynosM4]>;
-def : ProcessorModel<"exynos-m5", ExynosM4Model, [ProcExynosM4]>;
+def : ProcessorModel<"exynos-m5", ExynosM5Model, [ProcExynosM4]>;
def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
def : ProcessorModel<"saphira", FalkorModel, [ProcSaphira]>;
def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
@@ -834,8 +881,24 @@ def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
// FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57.
def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>;
+// Support cyclone as an alias for apple-a7 so we can still LTO old bitcode.
+def : ProcessorModel<"cyclone", CycloneModel, [ProcAppleA7]>;
+
+// iPhone and iPad CPUs
+def : ProcessorModel<"apple-a7", CycloneModel, [ProcAppleA7]>;
+def : ProcessorModel<"apple-a8", CycloneModel, [ProcAppleA7]>;
+def : ProcessorModel<"apple-a9", CycloneModel, [ProcAppleA7]>;
+def : ProcessorModel<"apple-a10", CycloneModel, [ProcAppleA10]>;
+def : ProcessorModel<"apple-a11", CycloneModel, [ProcAppleA11]>;
+def : ProcessorModel<"apple-a12", CycloneModel, [ProcAppleA12]>;
+def : ProcessorModel<"apple-a13", CycloneModel, [ProcAppleA13]>;
+
+// watch CPUs.
+def : ProcessorModel<"apple-s4", CycloneModel, [ProcAppleA12]>;
+def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>;
+
// Alias for the latest Apple processor model supported by LLVM.
-def : ProcessorModel<"apple-latest", CycloneModel, [ProcCyclone]>;
+def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>;
//===----------------------------------------------------------------------===//
// Assembly parser
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 7ea7915c2ca6..00e321f9b850 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -84,6 +84,7 @@ public:
return MCInstLowering.lowerOperand(MO, MCOp);
}
+ void EmitStartOfAsmFile(Module &M) override;
void EmitJumpTableInfo() override;
void emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB, unsigned JTI);
@@ -181,8 +182,79 @@ private:
} // end anonymous namespace
+void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) {
+ if (!TM.getTargetTriple().isOSBinFormatELF())
+ return;
+
+ // Assemble feature flags that may require creation of a note section.
+ unsigned Flags = ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI |
+ ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
+
+ if (any_of(M, [](const Function &F) {
+ return !F.isDeclaration() &&
+ !F.hasFnAttribute("branch-target-enforcement");
+ })) {
+ Flags &= ~ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI;
+ }
+
+ if ((Flags & ELF::GNU_PROPERTY_AARCH64_FEATURE_1_BTI) == 0 &&
+ any_of(M, [](const Function &F) {
+ return F.hasFnAttribute("branch-target-enforcement");
+ })) {
+ errs() << "warning: some functions compiled with BTI and some compiled "
+ "without BTI\n"
+ << "warning: not setting BTI in feature flags\n";
+ }
+
+ if (any_of(M, [](const Function &F) {
+ if (F.isDeclaration())
+ return false;
+ Attribute A = F.getFnAttribute("sign-return-address");
+ return !A.isStringAttribute() || A.getValueAsString() == "none";
+ })) {
+ Flags &= ~ELF::GNU_PROPERTY_AARCH64_FEATURE_1_PAC;
+ }
+
+ if (Flags == 0)
+ return;
+
+ // Emit a .note.gnu.property section with the flags.
+ MCSection *Cur = OutStreamer->getCurrentSectionOnly();
+ MCSection *Nt = MMI->getContext().getELFSection(
+ ".note.gnu.property", ELF::SHT_NOTE, ELF::SHF_ALLOC);
+ OutStreamer->SwitchSection(Nt);
+
+ // Emit the note header.
+ EmitAlignment(Align(8));
+ OutStreamer->EmitIntValue(4, 4); // data size for "GNU\0"
+ OutStreamer->EmitIntValue(4 * 4, 4); // Elf_Prop size
+ OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4);
+ OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name
+
+ // Emit the PAC/BTI properties.
+ OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
+ OutStreamer->EmitIntValue(4, 4); // data size
+ OutStreamer->EmitIntValue(Flags, 4); // data
+ OutStreamer->EmitIntValue(0, 4); // pad
+
+ OutStreamer->endSection(Nt);
+ OutStreamer->SwitchSection(Cur);
+}
+
void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
{
+ const Function &F = MF->getFunction();
+ if (F.hasFnAttribute("patchable-function-entry")) {
+ unsigned Num;
+ if (F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, Num))
+ return;
+ for (; Num; --Num)
+ EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+ return;
+ }
+
EmitSled(MI, SledKind::FUNCTION_ENTER);
}
@@ -458,8 +530,8 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
// linker can safely perform dead code stripping. Since LLVM never
// generates code that does this, it is always safe to set.
OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
- emitStackMaps(SM);
}
+ emitStackMaps(SM);
}
void AArch64AsmPrinter::EmitLOHs() {
@@ -794,7 +866,11 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI) {
unsigned NumNOPBytes = StackMapOpers(&MI).getNumPatchBytes();
- SM.recordStackMap(MI);
+ auto &Ctx = OutStreamer.getContext();
+ MCSymbol *MILabel = Ctx.createTempSymbol();
+ OutStreamer.EmitLabel(MILabel);
+
+ SM.recordStackMap(*MILabel, MI);
assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
// Scan ahead to trim the shadow.
@@ -820,7 +896,10 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
// [<def>], <id>, <numBytes>, <target>, <numArgs>
void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI) {
- SM.recordPatchPoint(MI);
+ auto &Ctx = OutStreamer.getContext();
+ MCSymbol *MILabel = Ctx.createTempSymbol();
+ OutStreamer.EmitLabel(MILabel);
+ SM.recordPatchPoint(*MILabel, MI);
PatchPointOpers Opers(&MI);
@@ -1219,7 +1298,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
// Force static initialization.
-extern "C" void LLVMInitializeAArch64AsmPrinter() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmPrinter() {
RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
RegisterAsmPrinter<AArch64AsmPrinter> Y(getTheAArch64beTarget());
RegisterAsmPrinter<AArch64AsmPrinter> Z(getTheARM64Target());
diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
index ed93d02aa615..76ff238234d9 100644
--- a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -160,7 +160,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
MIRBuilder.buildConstant(OffsetReg, Offset);
Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+ MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MF, Offset);
return AddrReg;
@@ -815,7 +815,7 @@ bool AArch64CallLowering::lowerTailCall(
// Tell the call which registers are clobbered.
auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(MF, F.getCallingConv());
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, CalleeCC);
if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(MF, &Mask);
MIB.addRegMask(Mask);
@@ -972,7 +972,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Tell the call which registers are clobbered.
auto TRI = MF.getSubtarget<AArch64Subtarget>().getRegisterInfo();
- const uint32_t *Mask = TRI->getCallPreservedMask(MF, F.getCallingConv());
+ const uint32_t *Mask = TRI->getCallPreservedMask(MF, Info.CallConv);
if (MF.getSubtarget<AArch64Subtarget>().hasCustomCallingConv())
TRI->UpdateCustomCallPreservedMask(MF, &Mask);
MIB.addRegMask(Mask);
@@ -1000,10 +1000,10 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
0));
// Finally we can copy the returned value back into its virtual-register. In
- // symmetry with the arugments, the physical register must be an
+ // symmetry with the arguments, the physical register must be an
// implicit-define of the call instruction.
if (!Info.OrigRet.Ty->isVoidTy()) {
- CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(F.getCallingConv());
+ CCAssignFn *RetAssignFn = TLI.CCAssignFnForReturn(Info.CallConv);
CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
if (!handleAssignments(MIRBuilder, InArgs, Handler))
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/llvm/lib/Target/AArch64/AArch64CallingConvention.h
index 5a55d090d7c8..59939e0684ed 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.h
@@ -31,6 +31,9 @@ bool CC_AArch64_DarwinPCS_ILP32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State);
bool CC_AArch64_WebKit_JS(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
ISD::ArgFlagsTy ArgFlags, CCState &State);
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index bccbbd4591ed..a0b2d7712b66 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -75,10 +75,10 @@ def CC_AArch64_AAPCS : CallingConv<[
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
- nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+ nxv2f32, nxv4f32, nxv2f64],
CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
- nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+ nxv2f32, nxv4f32, nxv2f64],
CCPassIndirect<i64>>,
CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
@@ -155,7 +155,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
- nxv1f32, nxv2f32, nxv4f32, nxv1f64, nxv2f64],
+ nxv2f32, nxv4f32, nxv2f64],
CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
@@ -170,6 +170,13 @@ def CC_AArch64_Win64_VarArg : CallingConv<[
CCDelegateTo<CC_AArch64_AAPCS>
]>;
+// Windows Control Flow Guard checks take a single argument (the target function
+// address) and have no return value.
+let Entry = 1 in
+def CC_AArch64_Win64_CFGuard_Check : CallingConv<[
+ CCIfType<[i64], CCAssignToReg<[X15]>>
+]>;
+
// Darwin uses a calling convention which differs in only two ways
// from the standard one at this level:
@@ -384,6 +391,12 @@ def CSR_Win_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
D8, D9, D10, D11,
D12, D13, D14, D15)>;
+// The Control Flow Guard check call uses a custom calling convention that also
+// preserves X0-X8 and Q0-Q7.
+def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS,
+ (sequence "X%u", 0, 8),
+ (sequence "Q%u", 0, 7))>;
+
// AArch64 PCS for vector functions (VPCS)
// must (additionally) preserve full Q8-Q23 registers
def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
@@ -392,10 +405,10 @@ def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
// Functions taking SVE arguments or returning an SVE type
// must (additionally) preserve full Z8-Z23 and predicate registers P4-P15
-def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
- X25, X26, X27, X28, LR, FP,
- (sequence "Z%u", 8, 23),
- (sequence "P%u", 4, 15))>;
+def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23),
+ (sequence "P%u", 4, 15),
+ X19, X20, X21, X22, X23, X24,
+ X25, X26, X27, X28, LR, FP)>;
// Constructors and destructors return 'this' in the iOS 64-bit C++ ABI; since
// 'this' and the pointer return value are both passed in X0 in these cases,
@@ -473,5 +486,7 @@ def CSR_AArch64_RT_MostRegs_SCS
: CalleeSavedRegs<(add CSR_AArch64_RT_MostRegs, X18)>;
def CSR_AArch64_AAVPCS_SCS
: CalleeSavedRegs<(add CSR_AArch64_AAVPCS, X18)>;
+def CSR_AArch64_SVE_AAPCS_SCS
+ : CalleeSavedRegs<(add CSR_AArch64_SVE_AAPCS, X18)>;
def CSR_AArch64_AAPCS_SCS
: CalleeSavedRegs<(add CSR_AArch64_AAPCS, X18)>;
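
Taken together, the two additions above describe the Control Flow Guard check call completely from the register allocator's point of view: the target address travels in X15, and everything a Win64 AAPCS callee preserves plus X0-X8 and Q0-Q7 survives the call. A rough standalone model of that set relationship, with register names reduced to strings (purely illustrative, not how TableGen expands CalleeSavedRegs):

#include <cassert>
#include <set>
#include <string>

int main() {
  // Win64 AAPCS callee-saved registers, as in CSR_Win_AArch64_AAPCS above.
  std::set<std::string> Win64AAPCS = {
      "X19", "X20", "X21", "X22", "X23", "X24", "X25", "X26", "X27", "X28",
      "FP",  "LR",  "D8",  "D9",  "D10", "D11", "D12", "D13", "D14", "D15"};

  // CSR_Win_AArch64_CFGuard_Check adds the argument registers on top.
  std::set<std::string> CFGuardCheck = Win64AAPCS;
  for (int I = 0; I <= 8; ++I)
    CFGuardCheck.insert("X" + std::to_string(I));
  for (int I = 0; I <= 7; ++I)
    CFGuardCheck.insert("Q" + std::to_string(I));

  // The guarded call's argument registers survive the check call...
  assert(CFGuardCheck.count("X0") && CFGuardCheck.count("Q7"));
  // ...while X15, which carries the checked target address, does not.
  assert(!CFGuardCheck.count("X15"));
  return 0;
}
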
diff --git a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
index 48dab79b32d3..259238705965 100644
--- a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -74,10 +75,16 @@ void AArch64CompressJumpTables::scanFunction() {
BlockInfo.clear();
BlockInfo.resize(MF->getNumBlockIDs());
- int Offset = 0;
+ unsigned Offset = 0;
for (MachineBasicBlock &MBB : *MF) {
- BlockInfo[MBB.getNumber()] = Offset;
- Offset += computeBlockSize(MBB);
+ const Align Alignment = MBB.getAlignment();
+ unsigned AlignedOffset;
+ if (Alignment == Align::None())
+ AlignedOffset = Offset;
+ else
+ AlignedOffset = alignTo(Offset, Alignment);
+ BlockInfo[MBB.getNumber()] = AlignedOffset;
+ Offset = AlignedOffset + computeBlockSize(MBB);
}
}
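
The change above makes the pass account for basic-block alignment: each block's starting offset is rounded up to its alignment before its size is added, so the recorded offsets match the final layout. A minimal standalone sketch of that bookkeeping, with invented block sizes and a plain power-of-two alignTo in place of llvm::alignTo/Align:

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

// Round Offset up to the next multiple of Alignment (a power of two).
static uint64_t alignTo(uint64_t Offset, uint64_t Alignment) {
  return (Offset + Alignment - 1) & ~(Alignment - 1);
}

int main() {
  // (size, alignment) in bytes for three hypothetical basic blocks.
  std::vector<std::pair<uint64_t, uint64_t>> Blocks = {{12, 1}, {20, 16}, {8, 4}};

  uint64_t Offset = 0;
  std::vector<uint64_t> BlockInfo;
  for (auto [Size, Alignment] : Blocks) {
    // Same rule as scanFunction(): align the start, then advance by the size.
    uint64_t AlignedOffset = Alignment <= 1 ? Offset : alignTo(Offset, Alignment);
    BlockInfo.push_back(AlignedOffset);
    Offset = AlignedOffset + Size;
  }

  // The 16-byte-aligned block starts at 16 rather than 12, pushing the next to 36.
  assert(BlockInfo[0] == 0 && BlockInfo[1] == 16 && BlockInfo[2] == 36);
  return 0;
}
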
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index a6efb115ed44..51b2ce029701 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -74,6 +74,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 43ae9f8ec47f..054ef8f482ca 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -351,8 +352,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
}
// Check for flag reads and clobbers.
- MIOperands::PhysRegInfo PRI =
- MIOperands(*I).analyzePhysReg(AArch64::NZCV, TRI);
+ PhysRegInfo PRI = AnalyzePhysRegInBundle(*I, AArch64::NZCV, TRI);
if (PRI.Read) {
// The ccmp doesn't produce exactly the same flags as the original
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 082e17e44d04..3b8f8a19fe49 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -110,6 +110,8 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
unsigned BitSize) {
MachineInstr &MI = *MBBI;
Register DstReg = MI.getOperand(0).getReg();
+ uint64_t RenamableState =
+ MI.getOperand(0).isRenamable() ? RegState::Renamable : 0;
uint64_t Imm = MI.getOperand(1).getImm();
if (DstReg == AArch64::XZR || DstReg == AArch64::WZR) {
@@ -144,7 +146,8 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
bool DstIsDead = MI.getOperand(0).isDead();
MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
.addReg(DstReg, RegState::Define |
- getDeadRegState(DstIsDead && LastItem))
+ getDeadRegState(DstIsDead && LastItem) |
+ RenamableState)
.addImm(I->Op1)
.addImm(I->Op2));
} break;
@@ -155,7 +158,8 @@ bool AArch64ExpandPseudo::expandMOVImm(MachineBasicBlock &MBB,
MIBS.push_back(BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(I->Opcode))
.addReg(DstReg,
RegState::Define |
- getDeadRegState(DstIsDead && LastItem))
+ getDeadRegState(DstIsDead && LastItem) |
+ RenamableState)
.addReg(DstReg)
.addImm(I->Op1)
.addImm(I->Op2));
@@ -692,10 +696,12 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
return true;
}
case AArch64::TAGPstack: {
- BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDG))
+ int64_t Offset = MI.getOperand(2).getImm();
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Offset >= 0 ? AArch64::ADDG : AArch64::SUBG))
.add(MI.getOperand(0))
.add(MI.getOperand(1))
- .add(MI.getOperand(2))
+ .addImm(std::abs(Offset))
.add(MI.getOperand(4));
MI.eraseFromParent();
return true;
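
TAGPstack's offset operand can now be negative, while ADDG and SUBG both encode an unsigned immediate, so the expansion above picks the opcode from the sign and emits the magnitude. A minimal sketch of that decision, with a plain enum standing in for the AArch64 opcodes and without the real range/encoding constraints on the immediate:

#include <cassert>
#include <cstdint>

enum Opcode { ADDG, SUBG };

struct TagInst {
  Opcode Opc;
  uint64_t UnsignedOffset; // ADDG/SUBG take an unsigned offset immediate.
};

// Mirrors the expansion above: the sign picks the opcode, the magnitude is encoded.
static TagInst expandTAGPstack(int64_t Offset) {
  return {Offset >= 0 ? ADDG : SUBG,
          static_cast<uint64_t>(Offset >= 0 ? Offset : -Offset)};
}

int main() {
  assert(expandTAGPstack(32).Opc == ADDG);
  TagInst Neg = expandTAGPstack(-48);
  assert(Neg.Opc == SUBG && Neg.UnsignedOffset == 48);
  return 0;
}
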
diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index b54fc2e51bac..c1fc183b04f6 100644
--- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -42,6 +42,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 277a3052f1e5..7e9c68f2bb30 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -348,6 +348,8 @@ CCAssignFn *AArch64FastISel::CCAssignFnForCall(CallingConv::ID CC) const {
return CC_AArch64_WebKit_JS;
if (CC == CallingConv::GHC)
return CC_AArch64_GHC;
+ if (CC == CallingConv::CFGuard_Check)
+ return CC_AArch64_Win64_CFGuard_Check;
return Subtarget->isTargetDarwin() ? CC_AArch64_DarwinPCS : CC_AArch64_AAPCS;
}
@@ -3251,6 +3253,13 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (Callee && !computeCallAddress(Callee, Addr))
return false;
+ // The weak function target may be zero; in that case we must use indirect
+ // addressing via a stub on Windows as it may be out of range for a
+ // PC-relative jump.
+ if (Subtarget->isTargetWindows() && Addr.getGlobalValue() &&
+ Addr.getGlobalValue()->hasExternalWeakLinkage())
+ return false;
+
// Handle the arguments now that we've gotten them.
unsigned NumBytes;
if (!processCallArgs(CLI, OutVTs, NumBytes))
@@ -3836,11 +3845,6 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
- // FIXME: in principle it could. Mostly just a case of zero extending outgoing
- // pointers.
- if (Subtarget->isTargetILP32())
- return false;
-
if (F.isVarArg())
return false;
@@ -3920,6 +3924,11 @@ bool AArch64FastISel::selectRet(const Instruction *I) {
return false;
}
+ // "Callee" (i.e. value producer) zero extends pointers at function
+ // boundary.
+ if (Subtarget->isTargetILP32() && RV->getType()->isPointerTy())
+ SrcReg = emitAnd_ri(MVT::i64, SrcReg, false, 0xffffffff);
+
// Make the copy.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), DestReg).addReg(SrcReg);
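
Under ILP32 the returned pointer occupies only the low 32 bits of its 64-bit register, and the emitAnd_ri call above clears the upper half before the copy into the return register. The masking itself, as a standalone illustration with an invented register value:

#include <cassert>
#include <cstdint>

// Model of zero-extending a 32-bit pointer held in a 64-bit register.
static uint64_t zeroExtendPointer(uint64_t RegValue) {
  return RegValue & 0xffffffffULL; // clear bits [63:32]
}

int main() {
  // Suppose the upper half holds stale bits from earlier arithmetic.
  uint64_t Reg = 0xdeadbeef00401000ULL;
  assert(zeroExtendPointer(Reg) == 0x00401000ULL);
  return 0;
}
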
@@ -5009,6 +5018,9 @@ std::pair<unsigned, bool> AArch64FastISel::getRegForGEPIndex(const Value *Idx) {
/// simple cases. This is because the standard fastEmit functions don't cover
/// MUL at all and ADD is lowered very inefficiently.
bool AArch64FastISel::selectGetElementPtr(const Instruction *I) {
+ if (Subtarget->isTargetILP32())
+ return false;
+
unsigned N = getRegForValue(I->getOperand(0));
if (!N)
return false;
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index 68e1e6a30224..ea3e800a1ad2 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -206,6 +206,11 @@ static unsigned estimateRSStackSizeLimit(MachineFunction &MF) {
return DefaultSafeSPDisplacement;
}
+TargetStackID::Value
+AArch64FrameLowering::getStackIDForScalableVectors() const {
+ return TargetStackID::SVEVector;
+}
+
/// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -222,7 +227,7 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- unsigned NumBytes = AFI->getLocalStackSize();
+ uint64_t NumBytes = AFI->getLocalStackSize();
return !(MFI.hasCalls() || hasFP(MF) || NumBytes > 128 ||
getSVEStackSize(MF));
@@ -239,7 +244,7 @@ bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const {
if (MF.hasEHFunclets())
return true;
// Retain behavior of always omitting the FP for leaf functions when possible.
- if (MFI.hasCalls() && MF.getTarget().Options.DisableFramePointerElim(MF))
+ if (MF.getTarget().Options.DisableFramePointerElim(MF))
return true;
if (MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
MFI.hasStackMap() || MFI.hasPatchPoint() ||
@@ -424,7 +429,7 @@ bool AArch64FrameLowering::canUseAsPrologue(
}
static bool windowsRequiresStackProbe(MachineFunction &MF,
- unsigned StackSizeInBytes) {
+ uint64_t StackSizeInBytes) {
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
if (!Subtarget.isTargetWindows())
return false;
@@ -441,15 +446,12 @@ static bool windowsRequiresStackProbe(MachineFunction &MF,
}
bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
- MachineFunction &MF, unsigned StackBumpBytes) const {
+ MachineFunction &MF, uint64_t StackBumpBytes) const {
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- if (MF.getFunction().hasOptSize())
- return false;
-
if (AFI->getLocalStackSize() == 0)
return false;
@@ -723,7 +725,7 @@ static MachineBasicBlock::iterator convertCalleeSaveRestoreToSPPrePostIncDec(
// Fixup callee-save register save/restore instructions to take into account
// combined SP bump by adding the local stack size to the stack offsets.
static void fixupCalleeSaveRestoreStackOffset(MachineInstr &MI,
- unsigned LocalStackSize,
+ uint64_t LocalStackSize,
bool NeedsWinCFI,
bool *HasWinCFI) {
if (AArch64InstrInfo::isSEHInstruction(MI))
@@ -834,6 +836,24 @@ static bool isTargetDarwin(const MachineFunction &MF) {
return MF.getSubtarget<AArch64Subtarget>().isTargetDarwin();
}
+static bool isTargetWindows(const MachineFunction &MF) {
+ return MF.getSubtarget<AArch64Subtarget>().isTargetWindows();
+}
+
+// Convenience function to determine whether I is an SVE callee save.
+static bool IsSVECalleeSave(MachineBasicBlock::iterator I) {
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case AArch64::STR_ZXI:
+ case AArch64::STR_PXI:
+ case AArch64::LDR_ZXI:
+ case AArch64::LDR_PXI:
+ return I->getFlag(MachineInstr::FrameSetup) ||
+ I->getFlag(MachineInstr::FrameDestroy);
+ }
+}
+
void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
@@ -844,8 +864,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
MachineModuleInfo &MMI = MF.getMMI();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- bool needsFrameMoves = (MMI.hasDebugInfo() || F.needsUnwindTableEntry()) &&
- !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
+ bool needsFrameMoves =
+ MF.needsFrameMoves() && !MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool HasFP = hasFP(MF);
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
@@ -897,8 +917,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// pointer from the funclet. We only save the callee saved registers in the
// funclet, which are really the callee saved registers of the parent
// function, including the funclet.
- int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF)
- : (int)MFI.getStackSize();
+ int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
+ : MFI.getStackSize();
if (!AFI->hasStackFrame() && !windowsRequiresStackProbe(MF, NumBytes)) {
assert(!HasFP && "unexpected function without stack frame but with FP");
assert(!SVEStackSize &&
@@ -916,15 +936,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP,
{-NumBytes, MVT::i8}, TII, MachineInstr::FrameSetup,
false, NeedsWinCFI, &HasWinCFI);
- if (!NeedsWinCFI) {
+ if (!NeedsWinCFI && needsFrameMoves) {
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
- // Encode the stack size of the leaf function.
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
+ // Encode the stack size of the leaf function.
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
}
}
@@ -965,7 +985,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// and pre-inc if we decided to combine the callee-save and local stack
// pointer bump above.
MachineBasicBlock::iterator End = MBB.end();
- while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup)) {
+ while (MBBI != End && MBBI->getFlag(MachineInstr::FrameSetup) &&
+ !IsSVECalleeSave(MBBI)) {
if (CombineSPBump)
fixupCalleeSaveRestoreStackOffset(*MBBI, AFI->getLocalStackSize(),
NeedsWinCFI, &HasWinCFI);
@@ -999,7 +1020,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (HasFP) {
// Only set up FP if we actually need to.
- int FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
+ int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
if (CombineSPBump)
FPOffset += AFI->getLocalStackSize();
@@ -1014,7 +1035,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
}
if (windowsRequiresStackProbe(MF, NumBytes)) {
- uint32_t NumWords = NumBytes >> 4;
+ uint64_t NumWords = NumBytes >> 4;
if (NeedsWinCFI) {
HasWinCFI = true;
// alloc_l can hold at most 256MB, so assume that NumBytes doesn't
@@ -1107,7 +1128,35 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
NumBytes = 0;
}
- emitFrameOffset(MBB, MBBI, DL, AArch64::SP, AArch64::SP, -SVEStackSize, TII,
+ StackOffset AllocateBefore = SVEStackSize, AllocateAfter = {};
+ MachineBasicBlock::iterator CalleeSavesBegin = MBBI, CalleeSavesEnd = MBBI;
+
+ // Process the SVE callee-saves to determine what space needs to be
+ // allocated.
+ if (AFI->getSVECalleeSavedStackSize()) {
+ // Find callee save instructions in frame.
+ CalleeSavesBegin = MBBI;
+ assert(IsSVECalleeSave(CalleeSavesBegin) && "Unexpected instruction");
+ while (IsSVECalleeSave(MBBI) && MBBI != MBB.getFirstTerminator())
+ ++MBBI;
+ CalleeSavesEnd = MBBI;
+
+ int64_t OffsetToFirstCalleeSaveFromSP =
+ MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
+ StackOffset OffsetToCalleeSavesFromSP =
+ StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
+ AllocateBefore -= OffsetToCalleeSavesFromSP;
+ AllocateAfter = SVEStackSize - AllocateBefore;
+ }
+
+ // Allocate space for the callee saves (if any).
+ emitFrameOffset(MBB, CalleeSavesBegin, DL, AArch64::SP, AArch64::SP,
+ -AllocateBefore, TII,
+ MachineInstr::FrameSetup);
+
+ // Finally allocate remaining SVE stack space.
+ emitFrameOffset(MBB, CalleeSavesEnd, DL, AArch64::SP, AArch64::SP,
+ -AllocateAfter, TII,
MachineInstr::FrameSetup);
// Allocate space for the rest of the frame.
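
With SVE callee saves present, the prologue now splits the SVE allocation in two: enough space for the callee-save slots is allocated before the save instructions, and the remaining SVE locals and spills afterwards. A scalar sketch of that split, with made-up sizes and StackOffset reduced to a plain count of scalable bytes:

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical numbers, all in scalable bytes.
  int64_t SVEStackSize = 64;                // whole SVE area
  int64_t OffsetOfLastCalleeSaveSlot = -32; // MFI offset of the deepest SVE CS slot

  // Mirrors the prologue logic above: distance from the fully allocated SP
  // to the start of the callee-save region, then the before/after split.
  int64_t OffsetToCalleeSavesFromSP = OffsetOfLastCalleeSaveSlot + SVEStackSize;
  int64_t AllocateBefore = SVEStackSize - OffsetToCalleeSavesFromSP;
  int64_t AllocateAfter = SVEStackSize - AllocateBefore;

  // 32 scalable bytes go in before the SVE callee-save stores run,
  // the remaining 32 (locals and spills) afterwards.
  assert(AllocateBefore == 32 && AllocateAfter == 32);
  return 0;
}
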
@@ -1343,8 +1392,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
IsFunclet = isFuncletReturnInstr(*MBBI);
}
- int NumBytes = IsFunclet ? (int)getWinEHFuncletFrameSize(MF)
- : MFI.getStackSize();
+ int64_t NumBytes = IsFunclet ? getWinEHFuncletFrameSize(MF)
+ : MFI.getStackSize();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
// All calls are tail calls in GHC calling conv, and functions have no
@@ -1444,7 +1493,8 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock::iterator Begin = MBB.begin();
while (LastPopI != Begin) {
--LastPopI;
- if (!LastPopI->getFlag(MachineInstr::FrameDestroy)) {
+ if (!LastPopI->getFlag(MachineInstr::FrameDestroy) ||
+ IsSVECalleeSave(LastPopI)) {
++LastPopI;
break;
} else if (CombineSPBump)
@@ -1476,11 +1526,53 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
NumBytes -= PrologueSaveSize;
assert(NumBytes >= 0 && "Negative stack allocation size!?");
+ // Process the SVE callee-saves to determine what space needs to be
+ // deallocated.
+ StackOffset DeallocateBefore = {}, DeallocateAfter = SVEStackSize;
+ MachineBasicBlock::iterator RestoreBegin = LastPopI, RestoreEnd = LastPopI;
+ if (AFI->getSVECalleeSavedStackSize()) {
+ RestoreBegin = std::prev(RestoreEnd);
+ while (IsSVECalleeSave(RestoreBegin) &&
+ RestoreBegin != MBB.begin())
+ --RestoreBegin;
+ ++RestoreBegin;
+
+ assert(IsSVECalleeSave(RestoreBegin) &&
+ IsSVECalleeSave(std::prev(RestoreEnd)) && "Unexpected instruction");
+
+ int64_t OffsetToFirstCalleeSaveFromSP =
+ MFI.getObjectOffset(AFI->getMaxSVECSFrameIndex());
+ StackOffset OffsetToCalleeSavesFromSP =
+ StackOffset(OffsetToFirstCalleeSaveFromSP, MVT::nxv1i8) + SVEStackSize;
+ DeallocateBefore = OffsetToCalleeSavesFromSP;
+ DeallocateAfter = SVEStackSize - DeallocateBefore;
+ }
+
// Deallocate the SVE area.
- if (SVEStackSize)
- if (!AFI->isStackRealigned())
- emitFrameOffset(MBB, LastPopI, DL, AArch64::SP, AArch64::SP, SVEStackSize,
- TII, MachineInstr::FrameDestroy);
+ if (SVEStackSize) {
+ if (AFI->isStackRealigned()) {
+ if (AFI->getSVECalleeSavedStackSize())
+ // Set SP to start of SVE area, from which the callee-save reloads
+ // can be done. The code below will deallocate the stack
+ // space by moving FP -> SP.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::FP,
+ -SVEStackSize, TII, MachineInstr::FrameDestroy);
+ } else {
+ if (AFI->getSVECalleeSavedStackSize()) {
+ // Deallocate the non-SVE locals first before we can deallocate (and
+ // restore callee saves) from the SVE area.
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+ {NumBytes, MVT::i8}, TII, MachineInstr::FrameDestroy);
+ NumBytes = 0;
+ }
+
+ emitFrameOffset(MBB, RestoreBegin, DL, AArch64::SP, AArch64::SP,
+ DeallocateBefore, TII, MachineInstr::FrameDestroy);
+
+ emitFrameOffset(MBB, RestoreEnd, DL, AArch64::SP, AArch64::SP,
+ DeallocateAfter, TII, MachineInstr::FrameDestroy);
+ }
+ }
if (!hasFP(MF)) {
bool RedZone = canUseRedZone(MF);
@@ -1490,7 +1582,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
return;
bool NoCalleeSaveRestore = PrologueSaveSize == 0;
- int StackRestoreBytes = RedZone ? 0 : NumBytes;
+ int64_t StackRestoreBytes = RedZone ? 0 : NumBytes;
if (NoCalleeSaveRestore)
StackRestoreBytes += AfterCSRPopSize;
@@ -1582,19 +1674,20 @@ int AArch64FrameLowering::getNonLocalFrameIndexReference(
return getSEHFrameIndexOffset(MF, FI);
}
-static StackOffset getFPOffset(const MachineFunction &MF, int ObjectOffset) {
+static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset) {
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
- unsigned FPAdjust = isTargetDarwin(MF) ? 16 : AFI->getCalleeSavedStackSize();
+ unsigned FPAdjust = isTargetDarwin(MF)
+ ? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo());
return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
}
-static StackOffset getStackOffset(const MachineFunction &MF, int ObjectOffset) {
+static StackOffset getStackOffset(const MachineFunction &MF, int64_t ObjectOffset) {
const auto &MFI = MF.getFrameInfo();
- return {ObjectOffset + (int)MFI.getStackSize(), MVT::i8};
+ return {ObjectOffset + (int64_t)MFI.getStackSize(), MVT::i8};
}
int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
@@ -1611,7 +1704,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference(
const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP,
bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
- int ObjectOffset = MFI.getObjectOffset(FI);
+ int64_t ObjectOffset = MFI.getObjectOffset(FI);
bool isFixed = MFI.isFixedObjectIndex(FI);
bool isSVE = MFI.getStackID(FI) == TargetStackID::SVEVector;
return resolveFrameOffsetReference(MF, ObjectOffset, isFixed, isSVE, FrameReg,
@@ -1619,7 +1712,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference(
}
StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
- const MachineFunction &MF, int ObjectOffset, bool isFixed, bool isSVE,
+ const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
unsigned &FrameReg, bool PreferFP, bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
@@ -1627,10 +1720,10 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
const auto *AFI = MF.getInfo<AArch64FunctionInfo>();
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
- int FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
- int Offset = getStackOffset(MF, ObjectOffset).getBytes();
+ int64_t FPOffset = getFPOffset(MF, ObjectOffset).getBytes();
+ int64_t Offset = getStackOffset(MF, ObjectOffset).getBytes();
bool isCSR =
- !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize());
+ !isFixed && ObjectOffset >= -((int)AFI->getCalleeSavedStackSize(MFI));
const StackOffset &SVEStackSize = getSVEStackSize(MF);
@@ -1781,6 +1874,8 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
// TODO: LR can be paired with any register. We don't support this yet in
// the MCLayer. We need to add support for the save_lrpair unwind code.
+ if (Reg2 == AArch64::FP)
+ return true;
if (!NeedsWinCFI)
return false;
if (Reg2 == Reg1 + 1)
@@ -1793,9 +1888,9 @@ static bool invalidateWindowsRegisterPairing(unsigned Reg1, unsigned Reg2,
/// LR and FP need to be allocated together when the frame needs to save
/// the frame-record. This means any other register pairing with LR is invalid.
static bool invalidateRegisterPairing(unsigned Reg1, unsigned Reg2,
- bool NeedsWinCFI, bool NeedsFrameRecord) {
- if (NeedsWinCFI)
- return invalidateWindowsRegisterPairing(Reg1, Reg2, true);
+ bool UsesWinAAPCS, bool NeedsWinCFI, bool NeedsFrameRecord) {
+ if (UsesWinAAPCS)
+ return invalidateWindowsRegisterPairing(Reg1, Reg2, NeedsWinCFI);
// If we need to store the frame record, don't pair any register
// with LR other than FP.
@@ -1812,11 +1907,27 @@ struct RegPairInfo {
unsigned Reg2 = AArch64::NoRegister;
int FrameIdx;
int Offset;
- enum RegType { GPR, FPR64, FPR128 } Type;
+ enum RegType { GPR, FPR64, FPR128, PPR, ZPR } Type;
RegPairInfo() = default;
bool isPaired() const { return Reg2 != AArch64::NoRegister; }
+
+ unsigned getScale() const {
+ switch (Type) {
+ case PPR:
+ return 2;
+ case GPR:
+ case FPR64:
+ return 8;
+ case ZPR:
+ case FPR128:
+ return 16;
+ }
+ llvm_unreachable("Unsupported type");
+ }
+
+ bool isScalable() const { return Type == PPR || Type == ZPR; }
};
} // end anonymous namespace
@@ -1829,6 +1940,7 @@ static void computeCalleeSaveRegisterPairs(
if (CSI.empty())
return;
+ bool IsWindows = isTargetWindows(MF);
bool NeedsWinCFI = needsWinCFI(MF);
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1841,7 +1953,8 @@ static void computeCalleeSaveRegisterPairs(
CC == CallingConv::PreserveMost ||
(Count & 1) == 0) &&
"Odd number of callee-saved regs to spill!");
- int Offset = AFI->getCalleeSavedStackSize();
+ int ByteOffset = AFI->getCalleeSavedStackSize();
+ int ScalableByteOffset = AFI->getSVECalleeSavedStackSize();
// On Linux, we will have either one or zero non-paired register. On Windows
// with CFI, we can have multiple unpaired registers in order to utilize the
// available unwind codes. This flag assures that the alignment fixup is done
@@ -1857,6 +1970,10 @@ static void computeCalleeSaveRegisterPairs(
RPI.Type = RegPairInfo::FPR64;
else if (AArch64::FPR128RegClass.contains(RPI.Reg1))
RPI.Type = RegPairInfo::FPR128;
+ else if (AArch64::ZPRRegClass.contains(RPI.Reg1))
+ RPI.Type = RegPairInfo::ZPR;
+ else if (AArch64::PPRRegClass.contains(RPI.Reg1))
+ RPI.Type = RegPairInfo::PPR;
else
llvm_unreachable("Unsupported register class.");
@@ -1866,7 +1983,7 @@ static void computeCalleeSaveRegisterPairs(
switch (RPI.Type) {
case RegPairInfo::GPR:
if (AArch64::GPR64RegClass.contains(NextReg) &&
- !invalidateRegisterPairing(RPI.Reg1, NextReg, NeedsWinCFI,
+ !invalidateRegisterPairing(RPI.Reg1, NextReg, IsWindows, NeedsWinCFI,
NeedsFrameRecord))
RPI.Reg2 = NextReg;
break;
@@ -1879,6 +1996,9 @@ static void computeCalleeSaveRegisterPairs(
if (AArch64::FPR128RegClass.contains(NextReg))
RPI.Reg2 = NextReg;
break;
+ case RegPairInfo::PPR:
+ case RegPairInfo::ZPR:
+ break;
}
}
@@ -1905,6 +2025,11 @@ static void computeCalleeSaveRegisterPairs(
RPI.Reg1 == AArch64::LR) &&
"FrameRecord must be allocated together with LR");
+ // Windows AAPCS has FP and LR reversed.
+ assert((!RPI.isPaired() || !NeedsFrameRecord || RPI.Reg1 != AArch64::FP ||
+ RPI.Reg2 == AArch64::LR) &&
+ "FrameRecord must be allocated together with LR");
+
// MachO's compact unwind format relies on all registers being stored in
// adjacent register pairs.
assert((!produceCompactUnwindFrame(MF) ||
@@ -1916,23 +2041,33 @@ static void computeCalleeSaveRegisterPairs(
RPI.FrameIdx = CSI[i].getFrameIdx();
- int Scale = RPI.Type == RegPairInfo::FPR128 ? 16 : 8;
- Offset -= RPI.isPaired() ? 2 * Scale : Scale;
+ int Scale = RPI.getScale();
+ if (RPI.isScalable())
+ ScalableByteOffset -= Scale;
+ else
+ ByteOffset -= RPI.isPaired() ? 2 * Scale : Scale;
+
+ assert(!(RPI.isScalable() && RPI.isPaired()) &&
+ "Paired spill/fill instructions don't exist for SVE vectors");
// Round up size of non-pair to pair size if we need to pad the
// callee-save area to ensure 16-byte alignment.
if (AFI->hasCalleeSaveStackFreeSpace() && !FixupDone &&
- RPI.Type != RegPairInfo::FPR128 && !RPI.isPaired()) {
+ !RPI.isScalable() && RPI.Type != RegPairInfo::FPR128 &&
+ !RPI.isPaired()) {
FixupDone = true;
- Offset -= 8;
- assert(Offset % 16 == 0);
+ ByteOffset -= 8;
+ assert(ByteOffset % 16 == 0);
assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
MFI.setObjectAlignment(RPI.FrameIdx, 16);
}
+ int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
assert(Offset % Scale == 0);
RPI.Offset = Offset / Scale;
- assert((RPI.Offset >= -64 && RPI.Offset <= 63) &&
+
+ assert(((!RPI.isScalable() && RPI.Offset >= -64 && RPI.Offset <= 63) ||
+ (RPI.isScalable() && RPI.Offset >= -256 && RPI.Offset <= 255)) &&
"Offset out of bounds for LDP/STP immediate");
RegPairs.push_back(RPI);
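
Byte offsets for ordinary callee saves and vector-length-scaled offsets for SVE callee saves are now tracked separately, and each spill's immediate is its running offset divided by the scale of its register class. A small worked example with invented starting sizes (only the scaling rule is taken from the code above):

#include <cassert>

int main() {
  // Made-up totals: 32 bytes of ordinary callee saves, 32 scalable bytes of
  // SVE callee saves.
  int ByteOffset = 32;
  int ScalableByteOffset = 32;

  // A paired GPR spill (scale 8): STP consumes 16 bytes and its immediate is
  // expressed in register-size units.
  int GPRScale = 8;
  ByteOffset -= 2 * GPRScale;
  int GPRPairImm = ByteOffset / GPRScale;
  assert(GPRPairImm == 2); // i.e. stp x?, x?, [sp, #16]

  // A ZPR spill (scale 16, never paired): the immediate counts multiples of
  // the vector length and lives in its own, scalable offset stream.
  int ZPRScale = 16;
  ScalableByteOffset -= ZPRScale;
  int ZPRImm = ScalableByteOffset / ZPRScale;
  assert(ZPRImm == 1); // i.e. str z?, [sp, #1, mul vl]
  return 0;
}
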
@@ -2024,6 +2159,16 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
Size = 16;
Align = 16;
break;
+ case RegPairInfo::ZPR:
+ StrOpc = AArch64::STR_ZXI;
+ Size = 16;
+ Align = 16;
+ break;
+ case RegPairInfo::PPR:
+ StrOpc = AArch64::STR_PXI;
+ Size = 2;
+ Align = 2;
+ break;
}
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -2064,6 +2209,11 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameSetup);
+ // Update the StackIDs of the SVE stack slots.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ if (RPI.Type == RegPairInfo::ZPR || RPI.Type == RegPairInfo::PPR)
+ MFI.setStackID(RPI.FrameIdx, TargetStackID::SVEVector);
+
}
return true;
}
@@ -2115,6 +2265,16 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
Size = 16;
Align = 16;
break;
+ case RegPairInfo::ZPR:
+ LdrOpc = AArch64::LDR_ZXI;
+ Size = 16;
+ Align = 16;
+ break;
+ case RegPairInfo::PPR:
+ LdrOpc = AArch64::LDR_PXI;
+ Size = 2;
+ Align = 2;
+ break;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
if (RPI.isPaired()) dbgs() << ", " << printReg(Reg2, TRI);
@@ -2149,12 +2309,20 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
};
- if (ReverseCSRRestoreSeq)
- for (const RegPairInfo &RPI : reverse(RegPairs))
+
+ // SVE objects are always restored in reverse order.
+ for (const RegPairInfo &RPI : reverse(RegPairs))
+ if (RPI.isScalable())
EmitMI(RPI);
- else
+
+ if (ReverseCSRRestoreSeq) {
+ for (const RegPairInfo &RPI : reverse(RegPairs))
+ if (!RPI.isScalable())
+ EmitMI(RPI);
+ } else
for (const RegPairInfo &RPI : RegPairs)
- EmitMI(RPI);
+ if (!RPI.isScalable())
+ EmitMI(RPI);
if (NeedShadowCallStackProlog) {
// Shadow call stack epilog: ldr x30, [x18, #-8]!
@@ -2201,7 +2369,12 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
SavedRegs.set(Reg);
bool RegUsed = SavedRegs.test(Reg);
- unsigned PairedReg = CSRegs[i ^ 1];
+ unsigned PairedReg = AArch64::NoRegister;
+ if (AArch64::GPR64RegClass.contains(Reg) ||
+ AArch64::FPR64RegClass.contains(Reg) ||
+ AArch64::FPR128RegClass.contains(Reg))
+ PairedReg = CSRegs[i ^ 1];
+
if (!RegUsed) {
if (AArch64::GPR64RegClass.contains(Reg) &&
!RegInfo->isReservedReg(MF, Reg)) {
@@ -2225,16 +2398,23 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// Calculates the callee saved stack size.
unsigned CSStackSize = 0;
+ unsigned SVECSStackSize = 0;
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineRegisterInfo &MRI = MF.getRegInfo();
- for (unsigned Reg : SavedRegs.set_bits())
- CSStackSize += TRI->getRegSizeInBits(Reg, MRI) / 8;
+ for (unsigned Reg : SavedRegs.set_bits()) {
+ auto RegSize = TRI->getRegSizeInBits(Reg, MRI) / 8;
+ if (AArch64::PPRRegClass.contains(Reg) ||
+ AArch64::ZPRRegClass.contains(Reg))
+ SVECSStackSize += RegSize;
+ else
+ CSStackSize += RegSize;
+ }
// Save number of saved regs, so we can easily update CSStackSize later.
unsigned NumSavedRegs = SavedRegs.count();
// The frame record needs to be created by saving the appropriate registers
- unsigned EstimatedStackSize = MFI.estimateStackSize(MF);
+ uint64_t EstimatedStackSize = MFI.estimateStackSize(MF);
if (hasFP(MF) ||
windowsRequiresStackProbe(MF, EstimatedStackSize + CSStackSize + 16)) {
SavedRegs.set(AArch64::FP);
@@ -2248,10 +2428,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
dbgs() << "\n";);
// If any callee-saved registers are used, the frame cannot be eliminated.
- unsigned MaxAlign = getStackAlignment();
int64_t SVEStackSize =
- alignTo(determineSVEStackSize(MFI, MaxAlign), MaxAlign);
- assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+ alignTo(SVECSStackSize + estimateSVEStackObjectOffsets(MFI), 16);
bool CanEliminateFrame = (SavedRegs.count() == 0) && !SVEStackSize;
// The CSR spill slots have not been allocated yet, so estimateStackSize
@@ -2299,15 +2477,20 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
// Adding the size of additional 64bit GPR saves.
CSStackSize += 8 * (SavedRegs.count() - NumSavedRegs);
- unsigned AlignedCSStackSize = alignTo(CSStackSize, 16);
+ uint64_t AlignedCSStackSize = alignTo(CSStackSize, 16);
LLVM_DEBUG(dbgs() << "Estimated stack frame size: "
<< EstimatedStackSize + AlignedCSStackSize
<< " bytes.\n");
+ assert((!MFI.isCalleeSavedInfoValid() ||
+ AFI->getCalleeSavedStackSize() == AlignedCSStackSize) &&
+ "Should not invalidate callee saved info");
+
// Round up to register pair alignment to avoid additional SP adjustment
// instructions.
AFI->setCalleeSavedStackSize(AlignedCSStackSize);
AFI->setCalleeSaveStackHasFreeSpace(AlignedCSStackSize != CSStackSize);
+ AFI->setSVECalleeSavedStackSize(alignTo(SVECSStackSize, 16));
}
bool AArch64FrameLowering::enableStackSlotScavenging(
@@ -2316,9 +2499,40 @@ bool AArch64FrameLowering::enableStackSlotScavenging(
return AFI->hasCalleeSaveStackFreeSpace();
}
-int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
- unsigned &MaxAlign) const {
- // Process all fixed stack objects.
+/// Returns true if there are any SVE callee saves.
+static bool getSVECalleeSaveSlotRange(const MachineFrameInfo &MFI,
+ int &Min, int &Max) {
+ Min = std::numeric_limits<int>::max();
+ Max = std::numeric_limits<int>::min();
+
+ if (!MFI.isCalleeSavedInfoValid())
+ return false;
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ for (auto &CS : CSI) {
+ if (AArch64::ZPRRegClass.contains(CS.getReg()) ||
+ AArch64::PPRRegClass.contains(CS.getReg())) {
+ assert((Max == std::numeric_limits<int>::min() ||
+ Max + 1 == CS.getFrameIdx()) &&
+ "SVE CalleeSaves are not consecutive");
+
+ Min = std::min(Min, CS.getFrameIdx());
+ Max = std::max(Max, CS.getFrameIdx());
+ }
+ }
+ return Min != std::numeric_limits<int>::max();
+}
+
+// Process all the SVE stack objects and determine offsets for each
+// object. If AssignOffsets is true, the offsets get assigned.
+// Fills in the first and last callee-saved frame indices into
+// Min/MaxCSFrameIndex, respectively.
+// Returns the size of the stack.
+static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
+ int &MinCSFrameIndex,
+ int &MaxCSFrameIndex,
+ bool AssignOffsets) {
+ // First process all fixed stack objects.
int64_t Offset = 0;
for (int I = MFI.getObjectIndexBegin(); I != 0; ++I)
if (MFI.getStackID(I) == TargetStackID::SVEVector) {
@@ -2327,12 +2541,69 @@ int64_t AArch64FrameLowering::determineSVEStackSize(MachineFrameInfo &MFI,
Offset = FixedOffset;
}
- // Note: We don't take allocatable stack objects into
- // account yet, because allocation for those is not yet
- // implemented.
+ auto Assign = [&MFI](int FI, int64_t Offset) {
+ LLVM_DEBUG(dbgs() << "alloc FI(" << FI << ") at SP[" << Offset << "]\n");
+ MFI.setObjectOffset(FI, Offset);
+ };
+
+ // Then process all callee saved slots.
+ if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
+ // Make sure to align the last callee save slot.
+ MFI.setObjectAlignment(MaxCSFrameIndex, 16U);
+
+ // Assign offsets to the callee save slots.
+ for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
+ Offset += MFI.getObjectSize(I);
+ Offset = alignTo(Offset, MFI.getObjectAlignment(I));
+ if (AssignOffsets)
+ Assign(I, -Offset);
+ }
+ }
+
+ // Create a buffer of SVE objects to allocate and sort it.
+ SmallVector<int, 8> ObjectsToAllocate;
+ for (int I = 0, E = MFI.getObjectIndexEnd(); I != E; ++I) {
+ unsigned StackID = MFI.getStackID(I);
+ if (StackID != TargetStackID::SVEVector)
+ continue;
+ if (MaxCSFrameIndex >= I && I >= MinCSFrameIndex)
+ continue;
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ ObjectsToAllocate.push_back(I);
+ }
+
+ // Allocate all SVE locals and spills
+ for (unsigned FI : ObjectsToAllocate) {
+ unsigned Align = MFI.getObjectAlignment(FI);
+ // FIXME: Given that the length of SVE vectors is not necessarily a power of
+ // two, we'd need to align every object dynamically at runtime if the
+ // alignment is larger than 16. This is not yet supported.
+ if (Align > 16)
+ report_fatal_error(
+ "Alignment of scalable vectors > 16 bytes is not yet supported");
+
+ Offset = alignTo(Offset + MFI.getObjectSize(FI), Align);
+ if (AssignOffsets)
+ Assign(FI, -Offset);
+ }
+
return Offset;
}
+int64_t AArch64FrameLowering::estimateSVEStackObjectOffsets(
+ MachineFrameInfo &MFI) const {
+ int MinCSFrameIndex, MaxCSFrameIndex;
+ return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex, false);
+}
+
+int64_t AArch64FrameLowering::assignSVEStackObjectOffsets(
+ MachineFrameInfo &MFI, int &MinCSFrameIndex, int &MaxCSFrameIndex) const {
+ return determineSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex,
+ true);
+}
+
void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
MachineFunction &MF, RegScavenger *RS) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
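
determineSVEStackObjectOffsets hands out negative, downward-growing offsets within the SVE area, keeping the running total aligned at each object. A standalone sketch of that assignment with invented object sizes (the callee-save/local distinction and the 16-byte alignment cap are left out):

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

static int64_t alignTo(int64_t Value, int64_t Align) {
  return (Value + Align - 1) / Align * Align;
}

int main() {
  // (size, alignment) of some hypothetical SVE stack objects, in scalable bytes.
  std::vector<std::pair<int64_t, int64_t>> Objects = {{16, 16}, {2, 2}, {32, 16}};

  int64_t Offset = 0;
  std::vector<int64_t> Assigned;
  for (auto [Size, Align] : Objects) {
    // Same rule as above: grow the area, keep it aligned, place the object at
    // the negated running offset (the SVE area grows downwards from its base).
    Offset = alignTo(Offset + Size, Align);
    Assigned.push_back(-Offset);
  }

  assert(Assigned[0] == -16 && Assigned[1] == -18 && Assigned[2] == -64);
  assert(alignTo(Offset, 16) == 64); // size reported for the whole SVE area
  return 0;
}
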
@@ -2340,12 +2611,13 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
assert(getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown &&
"Upwards growing stack unsupported");
- unsigned MaxAlign = getStackAlignment();
- int64_t SVEStackSize = determineSVEStackSize(MFI, MaxAlign);
+ int MinCSFrameIndex, MaxCSFrameIndex;
+ int64_t SVEStackSize =
+ assignSVEStackObjectOffsets(MFI, MinCSFrameIndex, MaxCSFrameIndex);
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
- AFI->setStackSizeSVE(alignTo(SVEStackSize, MaxAlign));
- assert(MaxAlign <= 16 && "Cannot align scalable vectors more than 16 bytes");
+ AFI->setStackSizeSVE(alignTo(SVEStackSize, 16U));
+ AFI->setMinMaxSVECSFrameIndex(MinCSFrameIndex, MaxCSFrameIndex);
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index ac150e86c9eb..b5719feb6b15 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -44,7 +44,7 @@ public:
unsigned &FrameReg, bool PreferFP,
bool ForSimm) const;
StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
- int ObjectOffset, bool isFixed,
+ int64_t ObjectOffset, bool isFixed,
bool isSVE, unsigned &FrameReg,
bool PreferFP, bool ForSimm) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
@@ -72,6 +72,7 @@ public:
}
bool enableStackSlotScavenging(const MachineFunction &MF) const override;
+ TargetStackID::Value getStackIDForScalableVectors() const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
@@ -100,8 +101,12 @@ public:
private:
bool shouldCombineCSRLocalStackBump(MachineFunction &MF,
- unsigned StackBumpBytes) const;
- int64_t determineSVEStackSize(MachineFrameInfo &MF, unsigned &MaxAlign) const;
+ uint64_t StackBumpBytes) const;
+
+ int64_t estimateSVEStackObjectOffsets(MachineFrameInfo &MF) const;
+ int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
+ int &MinCSFrameIndex,
+ int &MaxCSFrameIndex) const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 1f08505f37e7..a51aa85a931c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/Function.h" // To access function attributes.
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
@@ -39,20 +40,16 @@ class AArch64DAGToDAGISel : public SelectionDAGISel {
/// make the right decision when generating code for different targets.
const AArch64Subtarget *Subtarget;
- bool ForCodeSize;
-
public:
explicit AArch64DAGToDAGISel(AArch64TargetMachine &tm,
CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
- ForCodeSize(false) {}
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr) {}
StringRef getPassName() const override {
return "AArch64 Instruction Selection";
}
bool runOnMachineFunction(MachineFunction &MF) override {
- ForCodeSize = MF.getFunction().hasOptSize();
Subtarget = &MF.getSubtarget<AArch64Subtarget>();
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -140,6 +137,59 @@ public:
return SelectAddrModeXRO(N, Width / 8, Base, Offset, SignExtend, DoShift);
}
+ bool SelectDupZeroOrUndef(SDValue N) {
+ switch(N->getOpcode()) {
+ case ISD::UNDEF:
+ return true;
+ case AArch64ISD::DUP:
+ case ISD::SPLAT_VECTOR: {
+ auto Opnd0 = N->getOperand(0);
+ if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
+ if (CN->isNullValue())
+ return true;
+ if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
+ if (CN->isZero())
+ return true;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return false;
+ }
+
+ template<MVT::SimpleValueType VT>
+ bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
+ return SelectSVEAddSubImm(N, VT, Imm, Shift);
+ }
+
+ template<MVT::SimpleValueType VT>
+ bool SelectSVELogicalImm(SDValue N, SDValue &Imm) {
+ return SelectSVELogicalImm(N, VT, Imm);
+ }
+
+ // Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
+ template<signed Min, signed Max, signed Scale, bool Shift>
+ bool SelectCntImm(SDValue N, SDValue &Imm) {
+ if (!isa<ConstantSDNode>(N))
+ return false;
+
+ int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
+ if (Shift)
+ MulImm = 1LL << MulImm;
+
+ if ((MulImm % std::abs(Scale)) != 0)
+ return false;
+
+ MulImm /= Scale;
+ if ((MulImm >= Min) && (MulImm <= Max)) {
+ Imm = CurDAG->getTargetConstant(MulImm, SDLoc(N), MVT::i32);
+ return true;
+ }
+
+ return false;
+ }
/// Form sequences of consecutive 64/128-bit registers for use in NEON
/// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
@@ -177,6 +227,7 @@ public:
bool tryBitfieldInsertOp(SDNode *N);
bool tryBitfieldInsertInZeroOp(SDNode *N);
bool tryShiftAmountMod(SDNode *N);
+ bool tryHighFPExt(SDNode *N);
bool tryReadRegister(SDNode *N);
bool tryWriteRegister(SDNode *N);
@@ -217,6 +268,13 @@ private:
bool SelectCMP_SWAP(SDNode *N);
+ bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
+
+ bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
+
+ bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
+
+ bool SelectSVEArithImm(SDValue N, SDValue &Imm);
};
} // end anonymous namespace
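
The SelectCntImm complex pattern above accepts a constant only when it is an exact multiple of Scale whose quotient lies in [Min, Max], optionally treating the operand as a log2 value first. Restated as an ordinary function for illustration (std::optional stands in for producing the matched target constant):

#include <cassert>
#include <cstdint>
#include <optional>

// Returns the multiplier encoded by MulImm if it is usable for an instruction
// computing VSCALE * N with the given Scale, otherwise nullopt.
static std::optional<int64_t> selectCntImm(int64_t MulImm, int64_t Min,
                                           int64_t Max, int64_t Scale,
                                           bool Shift) {
  if (Shift)
    MulImm = 1LL << MulImm;
  int64_t AbsScale = Scale < 0 ? -Scale : Scale;
  if (MulImm % AbsScale != 0)
    return std::nullopt;
  MulImm /= Scale;
  if (MulImm >= Min && MulImm <= Max)
    return MulImm;
  return std::nullopt;
}

int main() {
  // With Scale 16, an operand of 32 becomes a multiplier of 2.
  assert(selectCntImm(32, 1, 16, 16, false) == 2);
  // 24 is not a multiple of 16, so there is no usable multiplier.
  assert(!selectCntImm(24, 1, 16, 16, false));
  // With Shift set the operand is a log2 value: 5 -> 32 -> multiplier 2.
  assert(selectCntImm(5, 1, 16, 16, true) == 2);
  return 0;
}
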
@@ -250,7 +308,6 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand(
switch(ConstraintID) {
default:
llvm_unreachable("Unexpected asm memory constraint");
- case InlineAsm::Constraint_i:
case InlineAsm::Constraint_m:
case InlineAsm::Constraint_Q:
// We need to make sure that this one operand does not end up in XZR, thus
@@ -378,7 +435,7 @@ static bool isWorthFoldingSHL(SDValue V) {
bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
// Trivial if we are optimizing for code size or if there is only
// one use of the value.
- if (ForCodeSize || V.hasOneUse())
+ if (CurDAG->shouldOptForSize() || V.hasOneUse())
return true;
// If a subtarget has a fastpath LSL we can fold a logical shift into
// the addressing mode and save a cycle.
@@ -1772,6 +1829,35 @@ bool AArch64DAGToDAGISel::tryBitfieldExtractOpFromSExt(SDNode *N) {
return true;
}
+/// Try to form fcvtl2 instructions from a floating-point extend of a high-half
+/// extract of a subvector.
+bool AArch64DAGToDAGISel::tryHighFPExt(SDNode *N) {
+ assert(N->getOpcode() == ISD::FP_EXTEND);
+
+ // There are 2 forms of fcvtl2 - extend to double or extend to float.
+ SDValue Extract = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT NarrowVT = Extract.getValueType();
+ if ((VT != MVT::v2f64 || NarrowVT != MVT::v2f32) &&
+ (VT != MVT::v4f32 || NarrowVT != MVT::v4f16))
+ return false;
+
+ // Optionally look past a bitcast.
+ Extract = peekThroughBitcasts(Extract);
+ if (Extract.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+
+ // Match extract from start of high half index.
+ // Example: v8i16 -> v4i16 means the extract must begin at index 4.
+ unsigned ExtractIndex = Extract.getConstantOperandVal(1);
+ if (ExtractIndex != Extract.getValueType().getVectorNumElements())
+ return false;
+
+ auto Opcode = VT == MVT::v2f64 ? AArch64::FCVTLv4i32 : AArch64::FCVTLv8i16;
+ CurDAG->SelectNodeTo(N, Opcode, VT, Extract.getOperand(0));
+ return true;
+}
+
static bool isBitfieldExtractOp(SelectionDAG *CurDAG, SDNode *N, unsigned &Opc,
SDValue &Opd0, unsigned &Immr, unsigned &Imms,
unsigned NumberOfIgnoredLowBits = 0,
@@ -2793,6 +2879,102 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
return true;
}
+bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
+ if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
+ const int64_t ImmVal = CNode->getZExtValue();
+ SDLoc DL(N);
+
+ switch (VT.SimpleTy) {
+ case MVT::i8:
+ if ((ImmVal & 0xFF) == ImmVal) {
+ Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+ return true;
+ }
+ break;
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ if ((ImmVal & 0xFF) == ImmVal) {
+ Shift = CurDAG->getTargetConstant(0, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+ return true;
+ } else if ((ImmVal & 0xFF00) == ImmVal) {
+ Shift = CurDAG->getTargetConstant(8, DL, MVT::i32);
+ Imm = CurDAG->getTargetConstant(ImmVal >> 8, DL, MVT::i32);
+ return true;
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ return false;
+}
+
+bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
+ if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
+ int64_t ImmVal = CNode->getSExtValue();
+ SDLoc DL(N);
+ if (ImmVal >= -127 && ImmVal < 127) {
+ Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool AArch64DAGToDAGISel::SelectSVEArithImm(SDValue N, SDValue &Imm) {
+ if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
+ uint64_t ImmVal = CNode->getSExtValue();
+ SDLoc DL(N);
+ ImmVal = ImmVal & 0xFF;
+ if (ImmVal < 256) {
+ Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
+ if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
+ uint64_t ImmVal = CNode->getZExtValue();
+ SDLoc DL(N);
+
+ // Shift mask depending on type size.
+ switch (VT.SimpleTy) {
+ case MVT::i8:
+ ImmVal &= 0xFF;
+ ImmVal |= ImmVal << 8;
+ ImmVal |= ImmVal << 16;
+ ImmVal |= ImmVal << 32;
+ break;
+ case MVT::i16:
+ ImmVal &= 0xFFFF;
+ ImmVal |= ImmVal << 16;
+ ImmVal |= ImmVal << 32;
+ break;
+ case MVT::i32:
+ ImmVal &= 0xFFFFFFFF;
+ ImmVal |= ImmVal << 32;
+ break;
+ case MVT::i64:
+ break;
+ default:
+ llvm_unreachable("Unexpected type");
+ }
+
+ uint64_t encoding;
+ if (AArch64_AM::processLogicalImmediate(ImmVal, 64, encoding)) {
+ Imm = CurDAG->getTargetConstant(encoding, DL, MVT::i64);
+ return true;
+ }
+ }
+ return false;
+}
+
bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
// tagp(FrameIndex, IRGstack, tag_offset):
// since the offset between FrameIndex and IRGstack is a compile-time
@@ -2908,6 +3090,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
break;
+ case ISD::FP_EXTEND:
+ if (tryHighFPExt(Node))
+ return;
+ break;
+
case ISD::OR:
if (tryBitfieldInsertOp(Node))
return;
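
Two of the SVE immediate checks added in this file, restated as standalone helpers: the ADD/SUB immediate must fit in 8 bits, optionally shifted left by 8 (for 16/32/64-bit element types), and logical immediates are validated after the element-sized pattern has been replicated across 64 bits, shown here for 16-bit elements. The DAG plumbing and the final processLogicalImmediate step are omitted:

#include <cassert>
#include <cstdint>
#include <optional>
#include <utility>

// SVE ADD/SUB (immediate): an 8-bit value, optionally shifted left by 8.
// Returns {imm, shift} when Imm is representable, nullopt otherwise.
static std::optional<std::pair<uint64_t, unsigned>> splitAddSubImm(uint64_t Imm) {
  if ((Imm & 0xFF) == Imm)
    return std::make_pair(Imm, 0u);
  if ((Imm & 0xFF00) == Imm)
    return std::make_pair(Imm >> 8, 8u);
  return std::nullopt;
}

// Replicate a 16-bit pattern across a 64-bit value, as SelectSVELogicalImm
// does before handing the result to the logical-immediate encoder.
static uint64_t replicateI16(uint64_t Imm) {
  Imm &= 0xFFFF;
  Imm |= Imm << 16;
  Imm |= Imm << 32;
  return Imm;
}

int main() {
  assert(splitAddSubImm(0x2a) == std::make_pair(uint64_t(0x2a), 0u));
  assert(splitAddSubImm(0x1200) == std::make_pair(uint64_t(0x12), 8u));
  assert(!splitAddSubImm(0x1234)); // needs both halves, not encodable
  assert(replicateI16(0x00F0) == 0x00F000F000F000F0ULL);
  return 0;
}
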
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2746117e8ee5..d45a80057564 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -10,9 +10,9 @@
//
//===----------------------------------------------------------------------===//
-#include "AArch64ExpandImm.h"
#include "AArch64ISelLowering.h"
#include "AArch64CallingConvention.h"
+#include "AArch64ExpandImm.h"
#include "AArch64MachineFunctionInfo.h"
#include "AArch64PerfectShuffle.h"
#include "AArch64RegisterInfo.h"
@@ -58,6 +58,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/IR/PatternMatch.h"
@@ -178,11 +179,25 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::nxv2f16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4f16, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv8f16, &AArch64::ZPRRegClass);
- addRegisterClass(MVT::nxv1f32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
- addRegisterClass(MVT::nxv1f64, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
+
+ for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
+ setOperationAction(ISD::SADDSAT, VT, Legal);
+ setOperationAction(ISD::UADDSAT, VT, Legal);
+ setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::USUBSAT, VT, Legal);
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ }
+
+ for (auto VT :
+ { MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
+ MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
+ setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
}
// Compute derived properties from the register classes
@@ -422,14 +437,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::v4f16, Promote);
setOperationAction(ISD::FMUL, MVT::v4f16, Promote);
setOperationAction(ISD::FDIV, MVT::v4f16, Promote);
- setOperationAction(ISD::FP_EXTEND, MVT::v4f16, Promote);
- setOperationAction(ISD::FP_ROUND, MVT::v4f16, Promote);
AddPromotedToType(ISD::FADD, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FSUB, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FMUL, MVT::v4f16, MVT::v4f32);
AddPromotedToType(ISD::FDIV, MVT::v4f16, MVT::v4f32);
- AddPromotedToType(ISD::FP_EXTEND, MVT::v4f16, MVT::v4f32);
- AddPromotedToType(ISD::FP_ROUND, MVT::v4f16, MVT::v4f32);
setOperationAction(ISD::FABS, MVT::v4f16, Expand);
setOperationAction(ISD::FNEG, MVT::v4f16, Expand);
@@ -510,6 +521,10 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i32, Custom);
setOperationAction(ISD::ATOMIC_LOAD_AND, MVT::i64, Custom);
+ // 128-bit loads and stores can be done without expanding
+ setOperationAction(ISD::LOAD, MVT::i128, Custom);
+ setOperationAction(ISD::STORE, MVT::i128, Custom);
+
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
@@ -525,6 +540,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSINCOS, MVT::f32, Expand);
}
+ if (Subtarget->getTargetTriple().isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
+ }
+
// Make floating-point constants legal for the large code model, so they don't
// become loads from the constant pool.
if (Subtarget->isTargetMachO() && TM.getCodeModel() == CodeModel::Large) {
@@ -601,7 +622,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ANY_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::SIGN_EXTEND);
- setTargetDAGCombine(ISD::BITCAST);
+ setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::STORE);
if (Subtarget->supportsAddressTopByteIgnored())
@@ -734,14 +755,20 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
setOperationAction(ISD::MUL, MVT::v2i64, Custom);
- // Vector reductions
for (MVT VT : { MVT::v8i8, MVT::v4i16, MVT::v2i32,
MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
+ // Vector reductions
setOperationAction(ISD::VECREDUCE_ADD, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMAX, VT, Custom);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Custom);
+
+ // Saturates
+ setOperationAction(ISD::SADDSAT, VT, Legal);
+ setOperationAction(ISD::UADDSAT, VT, Legal);
+ setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::USUBSAT, VT, Legal);
}
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
@@ -802,10 +829,15 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
if (Subtarget->hasSVE()) {
+ // FIXME: Add custom lowering of MLOAD to handle different passthrus (not a
+ // splat of 0 or undef) once vector selects are supported in SVE codegen. See
+ // D68877 for more details.
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
- if (isTypeLegal(VT) && VT.getVectorElementType() != MVT::i1)
+ if (isTypeLegal(VT))
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
}
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -1257,6 +1289,19 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
+ case AArch64ISD::SMAXV_PRED: return "AArch64ISD::SMAXV_PRED";
+ case AArch64ISD::UMAXV_PRED: return "AArch64ISD::UMAXV_PRED";
+ case AArch64ISD::SMINV_PRED: return "AArch64ISD::SMINV_PRED";
+ case AArch64ISD::UMINV_PRED: return "AArch64ISD::UMINV_PRED";
+ case AArch64ISD::ORV_PRED: return "AArch64ISD::ORV_PRED";
+ case AArch64ISD::EORV_PRED: return "AArch64ISD::EORV_PRED";
+ case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED";
+ case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N";
+ case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N";
+ case AArch64ISD::LASTA: return "AArch64ISD::LASTA";
+ case AArch64ISD::LASTB: return "AArch64ISD::LASTB";
+ case AArch64ISD::REV: return "AArch64ISD::REV";
+ case AArch64ISD::TBL: return "AArch64ISD::TBL";
case AArch64ISD::NOT: return "AArch64ISD::NOT";
case AArch64ISD::BIT: return "AArch64ISD::BIT";
case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
@@ -1311,6 +1356,32 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO";
case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI";
case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO";
+ case AArch64ISD::INSR: return "AArch64ISD::INSR";
+ case AArch64ISD::PTEST: return "AArch64ISD::PTEST";
+ case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE";
+ case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
+ case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
+ case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
+ case AArch64ISD::GLD1_UXTW: return "AArch64ISD::GLD1_UXTW";
+ case AArch64ISD::GLD1_SXTW_SCALED: return "AArch64ISD::GLD1_SXTW_SCALED";
+ case AArch64ISD::GLD1_UXTW_SCALED: return "AArch64ISD::GLD1_UXTW_SCALED";
+ case AArch64ISD::GLD1_IMM: return "AArch64ISD::GLD1_IMM";
+ case AArch64ISD::GLD1S: return "AArch64ISD::GLD1S";
+ case AArch64ISD::GLD1S_SCALED: return "AArch64ISD::GLD1S_SCALED";
+ case AArch64ISD::GLD1S_SXTW: return "AArch64ISD::GLD1S_SXTW";
+ case AArch64ISD::GLD1S_UXTW: return "AArch64ISD::GLD1S_UXTW";
+ case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED";
+ case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED";
+ case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM";
+ case AArch64ISD::SST1: return "AArch64ISD::SST1";
+ case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
+ case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
+ case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW";
+ case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
+ case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
+ case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
+ case AArch64ISD::LDP: return "AArch64ISD::LDP";
+ case AArch64ISD::STP: return "AArch64ISD::STP";
}
return nullptr;
}
@@ -1568,7 +1639,8 @@ static void changeVectorFPCCToAArch64CC(ISD::CondCode CC,
// All of the compare-mask comparisons are ordered, but we can switch
// between the two by a double inversion. E.g. ULE == !OGT.
Invert = true;
- changeFPCCToAArch64CC(getSetCCInverse(CC, false), CondCode, CondCode2);
+ changeFPCCToAArch64CC(getSetCCInverse(CC, /* FP inverse */ MVT::f32),
+ CondCode, CondCode2);
break;
}
}
@@ -1815,7 +1887,7 @@ static SDValue emitConjunctionRec(SelectionDAG &DAG, SDValue Val,
ISD::CondCode CC = cast<CondCodeSDNode>(Val->getOperand(2))->get();
bool isInteger = LHS.getValueType().isInteger();
if (Negate)
- CC = getSetCCInverse(CC, isInteger);
+ CC = getSetCCInverse(CC, LHS.getValueType());
SDLoc DL(Val);
// Determine OutCC and handle FP special case.
if (isInteger) {
@@ -2287,7 +2359,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
if (CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
// If the constants line up, perform the transform!
@@ -2861,6 +2933,55 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_uunpklo:
return DAG.getNode(AArch64ISD::UUNPKLO, dl, Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::aarch64_sve_clasta_n:
+ return DAG.getNode(AArch64ISD::CLASTA_N, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::aarch64_sve_clastb_n:
+ return DAG.getNode(AArch64ISD::CLASTB_N, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::aarch64_sve_lasta:
+ return DAG.getNode(AArch64ISD::LASTA, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_lastb:
+ return DAG.getNode(AArch64ISD::LASTB, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_rev:
+ return DAG.getNode(AArch64ISD::REV, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_tbl:
+ return DAG.getNode(AArch64ISD::TBL, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_trn1:
+ return DAG.getNode(AArch64ISD::TRN1, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_trn2:
+ return DAG.getNode(AArch64ISD::TRN2, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_uzp1:
+ return DAG.getNode(AArch64ISD::UZP1, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_uzp2:
+ return DAG.getNode(AArch64ISD::UZP2, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_zip1:
+ return DAG.getNode(AArch64ISD::ZIP1, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_zip2:
+ return DAG.getNode(AArch64ISD::ZIP2, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2));
+ case Intrinsic::aarch64_sve_ptrue:
+ return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
+ Op.getOperand(1));
+
+ case Intrinsic::aarch64_sve_insr: {
+ SDValue Scalar = Op.getOperand(2);
+ EVT ScalarTy = Scalar.getValueType();
+ if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
+ Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
+
+ return DAG.getNode(AArch64ISD::INSR, dl, Op.getValueType(),
+ Op.getOperand(1), Scalar);
+ }
case Intrinsic::localaddress: {
const auto &MF = DAG.getMachineFunction();
@@ -2886,6 +3007,10 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
}
+bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
+ return ExtVal.getValueType().isScalableVector();
+}
+
// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
EVT VT, EVT MemVT,
@@ -2920,7 +3045,7 @@ static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
// Custom lowering for any store, vector or scalar and/or default or with
// a truncate operation. Currently we only custom lower the truncate operation
-// from vector v4i16 to v4i8.
+// from vector v4i16 to v4i8 or volatile stores of i128.
SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
SelectionDAG &DAG) const {
SDLoc Dl(Op);
@@ -2932,18 +3057,32 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
EVT VT = Value.getValueType();
EVT MemVT = StoreNode->getMemoryVT();
- assert (VT.isVector() && "Can only custom lower vector store types");
-
- unsigned AS = StoreNode->getAddressSpace();
- unsigned Align = StoreNode->getAlignment();
- if (Align < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(
- MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
- return scalarizeVectorStore(StoreNode, DAG);
- }
-
- if (StoreNode->isTruncatingStore()) {
- return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+ if (VT.isVector()) {
+ unsigned AS = StoreNode->getAddressSpace();
+ unsigned Align = StoreNode->getAlignment();
+ if (Align < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Align,
+ StoreNode->getMemOperand()->getFlags(),
+ nullptr)) {
+ return scalarizeVectorStore(StoreNode, DAG);
+ }
+
+ if (StoreNode->isTruncatingStore()) {
+ return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+ }
+ } else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
+ assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
+ SDValue Lo =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
+ DAG.getConstant(0, Dl, MVT::i64));
+ SDValue Hi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, Dl, MVT::i64, StoreNode->getValue(),
+ DAG.getConstant(1, Dl, MVT::i64));
+ SDValue Result = DAG.getMemIntrinsicNode(
+ AArch64ISD::STP, Dl, DAG.getVTList(MVT::Other),
+ {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+ return Result;
}
return SDValue();
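The new non-vector path covers volatile i128 stores: the value is split into its two 64-bit halves with EXTRACT_ELEMENT (index 0 is the low half, index 1 the high half) and emitted as a single STP memory intrinsic rather than being expanded into separate stores. A rough scalar model of that split, assuming a host compiler with __int128 (names are illustrative only):

#include <cstdint>

struct PairHalves { uint64_t Lo, Hi; };

// Mirror of the EXTRACT_ELEMENT decomposition feeding AArch64ISD::STP.
static PairHalves splitForSTP(unsigned __int128 V) {
  PairHalves P;
  P.Lo = static_cast<uint64_t>(V);         // element 0: low 64 bits
  P.Hi = static_cast<uint64_t>(V >> 64);   // element 1: high 64 bits
  return P;
}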
@@ -3092,6 +3231,9 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
switch (CC) {
default:
report_fatal_error("Unsupported calling convention.");
+ case CallingConv::AArch64_SVE_VectorCall:
+ // Calling SVE functions is not yet supported.
+ report_fatal_error("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
case CallingConv::GHC:
@@ -3111,8 +3253,10 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
: CC_AArch64_DarwinPCS_VarArg;
case CallingConv::Win64:
return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS;
- case CallingConv::AArch64_VectorCall:
- return CC_AArch64_AAPCS;
+ case CallingConv::CFGuard_Check:
+ return CC_AArch64_Win64_CFGuard_Check;
+ case CallingConv::AArch64_VectorCall:
+ return CC_AArch64_AAPCS;
}
}
@@ -3848,11 +3992,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
// Walk the register/memloc assignments, inserting copies/loads.
- for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size(); i != e;
- ++i, ++realArgIdx) {
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
- SDValue Arg = OutVals[realArgIdx];
- ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
+ SDValue Arg = OutVals[i];
+ ISD::ArgFlagsTy Flags = Outs[i].Flags;
// Promote the value if needed.
switch (VA.getLocInfo()) {
@@ -3867,7 +4010,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
break;
case CCValAssign::AExt:
- if (Outs[realArgIdx].ArgVT == MVT::i1) {
+ if (Outs[i].ArgVT == MVT::i1) {
// AAPCS requires i1 to be zero-extended to 8-bits by the caller.
Arg = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, Arg);
Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i8, Arg);
@@ -3896,7 +4039,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
}
if (VA.isRegLoc()) {
- if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
+ if (i == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i64) {
assert(VA.getLocVT() == MVT::i64 &&
"unexpected calling convention register assignment");
@@ -4014,14 +4157,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// node so that legalize doesn't hack it.
if (auto *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
auto GV = G->getGlobal();
- if (Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine()) ==
- AArch64II::MO_GOT) {
- Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_GOT);
+ unsigned OpFlags =
+ Subtarget->classifyGlobalFunctionReference(GV, getTargetMachine());
+ if (OpFlags & AArch64II::MO_GOT) {
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags);
Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee);
- } else if (Subtarget->isTargetCOFF() && GV->hasDLLImportStorageClass()) {
- assert(Subtarget->isTargetWindows() &&
- "Windows is the only supported COFF target");
- Callee = getGOT(G, DAG, AArch64II::MO_DLLIMPORT);
} else {
const GlobalValue *GV = G->getGlobal();
Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
@@ -4456,6 +4596,97 @@ AArch64TargetLowering::LowerDarwinGlobalTLSAddress(SDValue Op,
return DAG.getCopyFromReg(Chain, DL, AArch64::X0, PtrVT, Chain.getValue(1));
}
+/// Convert a thread-local variable reference into a sequence of instructions to
+/// compute the variable's address for the local exec TLS model of ELF targets.
+/// The sequence depends on the maximum TLS area size.
+SDValue AArch64TargetLowering::LowerELFTLSLocalExec(const GlobalValue *GV,
+ SDValue ThreadBase,
+ const SDLoc &DL,
+ SelectionDAG &DAG) const {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ SDValue TPOff, Addr;
+
+ switch (DAG.getTarget().Options.TLSSize) {
+ default:
+ llvm_unreachable("Unexpected TLS size");
+
+ case 12: {
+ // mrs x0, TPIDR_EL0
+ // add x0, x0, :tprel_lo12:a
+ SDValue Var = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_PAGEOFF);
+ return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
+ Var,
+ DAG.getTargetConstant(0, DL, MVT::i32)),
+ 0);
+ }
+
+ case 24: {
+ // mrs x0, TPIDR_EL0
+ // add x0, x0, :tprel_hi12:a
+ // add x0, x0, :tprel_lo12_nc:a
+ SDValue HiVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
+ SDValue LoVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0,
+ AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ Addr = SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
+ HiVar,
+ DAG.getTargetConstant(0, DL, MVT::i32)),
+ 0);
+ return SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, Addr,
+ LoVar,
+ DAG.getTargetConstant(0, DL, MVT::i32)),
+ 0);
+ }
+
+ case 32: {
+ // mrs x1, TPIDR_EL0
+ // movz x0, #:tprel_g1:a
+ // movk x0, #:tprel_g0_nc:a
+ // add x0, x1, x0
+ SDValue HiVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G1);
+ SDValue LoVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0,
+ AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
+ DAG.getTargetConstant(16, DL, MVT::i32)),
+ 0);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
+ DAG.getTargetConstant(0, DL, MVT::i32)),
+ 0);
+ return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
+ }
+
+ case 48: {
+ // mrs x1, TPIDR_EL0
+ // movz x0, #:tprel_g2:a
+ // movk x0, #:tprel_g1_nc:a
+ // movk x0, #:tprel_g0_nc:a
+ // add x0, x1, x0
+ SDValue HiVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_G2);
+ SDValue MiVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0,
+ AArch64II::MO_TLS | AArch64II::MO_G1 | AArch64II::MO_NC);
+ SDValue LoVar = DAG.getTargetGlobalAddress(
+ GV, DL, PtrVT, 0,
+ AArch64II::MO_TLS | AArch64II::MO_G0 | AArch64II::MO_NC);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::MOVZXi, DL, PtrVT, HiVar,
+ DAG.getTargetConstant(32, DL, MVT::i32)),
+ 0);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, MiVar,
+ DAG.getTargetConstant(16, DL, MVT::i32)),
+ 0);
+ TPOff = SDValue(DAG.getMachineNode(AArch64::MOVKXi, DL, PtrVT, TPOff, LoVar,
+ DAG.getTargetConstant(0, DL, MVT::i32)),
+ 0);
+ return DAG.getNode(ISD::ADD, DL, PtrVT, ThreadBase, TPOff);
+ }
+ }
+}
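For the larger -mtls-size values the TP-relative offset is materialised piecewise and only added to TPIDR_EL0 at the end. A hedged arithmetic sketch of the 32-bit sequence, assuming :tprel_g1: and :tprel_g0_nc: select bits [31:16] and [15:0] of the offset (function and parameter names are made up):

#include <cstdint>

// movz builds bits [31:16], movk fills bits [15:0], and the add applies the
// result to the thread pointer read from TPIDR_EL0.
static uint64_t tlsLocalExec32(uint64_t ThreadBase, uint32_t TprelOffset) {
  uint64_t G1 = (TprelOffset >> 16) & 0xffff;  // :tprel_g1:
  uint64_t G0 = TprelOffset & 0xffff;          // :tprel_g0_nc:
  uint64_t TPOff = G1 << 16;                   // movz x0, #G1, lsl #16
  TPOff |= G0;                                 // movk x0, #G0
  return ThreadBase + TPOff;                   // add x0, x1, x0
}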
+
/// When accessing thread-local variables under either the general-dynamic or
/// local-dynamic system, we make a "TLS-descriptor" call. The variable will
/// have a descriptor, accessible via a PC-relative ADRP, and whose first entry
@@ -4493,15 +4724,7 @@ SDValue
AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
SelectionDAG &DAG) const {
assert(Subtarget->isTargetELF() && "This function expects an ELF target");
- if (getTargetMachine().getCodeModel() == CodeModel::Large)
- report_fatal_error("ELF TLS only supported in small memory model");
- // Different choices can be made for the maximum size of the TLS area for a
- // module. For the small address model, the default TLS size is 16MiB and the
- // maximum TLS size is 4GiB.
- // FIXME: add -mtls-size command line option and make it control the 16MiB
- // vs. 4GiB code sequence generation.
- // FIXME: add tiny codemodel support. We currently generate the same code as
- // small, which may be larger than needed.
+
const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(Op);
TLSModel::Model Model = getTargetMachine().getTLSModel(GA->getGlobal());
@@ -4511,6 +4734,17 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
Model = TLSModel::GeneralDynamic;
}
+ if (getTargetMachine().getCodeModel() == CodeModel::Large &&
+ Model != TLSModel::LocalExec)
+ report_fatal_error("ELF TLS only supported in small memory model or "
+ "in local exec TLS model");
+ // Different choices can be made for the maximum size of the TLS area for a
+ // module. For the small address model, the default TLS size is 16MiB and the
+ // maximum TLS size is 4GiB.
+ // FIXME: add tiny and large code model support for TLS access models other
+ // than local exec. We currently generate the same code as small for tiny,
+ // which may be larger than needed.
+
SDValue TPOff;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDLoc DL(Op);
@@ -4519,23 +4753,7 @@ AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
SDValue ThreadBase = DAG.getNode(AArch64ISD::THREAD_POINTER, DL, PtrVT);
if (Model == TLSModel::LocalExec) {
- SDValue HiVar = DAG.getTargetGlobalAddress(
- GV, DL, PtrVT, 0, AArch64II::MO_TLS | AArch64II::MO_HI12);
- SDValue LoVar = DAG.getTargetGlobalAddress(
- GV, DL, PtrVT, 0,
- AArch64II::MO_TLS | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
-
- SDValue TPWithOff_lo =
- SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, ThreadBase,
- HiVar,
- DAG.getTargetConstant(0, DL, MVT::i32)),
- 0);
- SDValue TPWithOff =
- SDValue(DAG.getMachineNode(AArch64::ADDXri, DL, PtrVT, TPWithOff_lo,
- LoVar,
- DAG.getTargetConstant(0, DL, MVT::i32)),
- 0);
- return TPWithOff;
+ return LowerELFTLSLocalExec(GV, ThreadBase, DL, DAG);
} else if (Model == TLSModel::InitialExec) {
TPOff = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, AArch64II::MO_TLS);
TPOff = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, TPOff);
@@ -4961,8 +5179,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (LHS.getValueType().isInteger()) {
SDValue CCVal;
- SDValue Cmp =
- getAArch64Cmp(LHS, RHS, ISD::getSetCCInverse(CC, true), CCVal, DAG, dl);
+ SDValue Cmp = getAArch64Cmp(
+ LHS, RHS, ISD::getSetCCInverse(CC, LHS.getValueType()), CCVal, DAG, dl);
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
@@ -4981,7 +5199,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
if (CC2 == AArch64CC::AL) {
- changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, false), CC1, CC2);
+ changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
+ CC2);
SDValue CC1Val = DAG.getConstant(CC1, dl, MVT::i32);
// Note that we inverted the condition above, so we reverse the order of
@@ -5042,18 +5261,18 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
if (CTVal && CFVal && CTVal->isAllOnesValue() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
} else if (CTVal && CFVal && CTVal->isOne() && CFVal->isNullValue()) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
} else if (TVal.getOpcode() == ISD::XOR) {
// If TVal is a NOT we want to swap TVal and FVal so that we can match
// with a CSINV rather than a CSEL.
if (isAllOnesConstant(TVal.getOperand(1))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
} else if (TVal.getOpcode() == ISD::SUB) {
// If TVal is a negation (SUB from 0) we want to swap TVal and FVal so
@@ -5061,7 +5280,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
if (isNullConstant(TVal.getOperand(0))) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
} else if (CTVal && CFVal) {
const int64_t TrueVal = CTVal->getSExtValue();
@@ -5104,7 +5323,7 @@ SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
if (Swap) {
std::swap(TVal, FVal);
std::swap(CTVal, CFVal);
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, LHS.getValueType());
}
if (Opcode != AArch64ISD::CSEL) {
@@ -5531,7 +5750,7 @@ SDValue AArch64TargetLowering::LowerSPONENTRY(SDValue Op,
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
Register AArch64TargetLowering::
-getRegisterByName(const char* RegName, EVT VT, const MachineFunction &MF) const {
+getRegisterByName(const char* RegName, LLT VT, const MachineFunction &MF) const {
Register Reg = MatchRegisterName(RegName);
if (AArch64::X1 <= Reg && Reg <= AArch64::X28) {
const MCRegisterInfo *MRI = Subtarget->getRegisterInfo();
@@ -6946,19 +7165,55 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
// Otherwise, duplicate from the lane of the input vector.
unsigned Opcode = getDUPLANEOp(V1.getValueType().getVectorElementType());
- // SelectionDAGBuilder may have "helpfully" already extracted or conatenated
- // to make a vector of the same size as this SHUFFLE. We can ignore the
- // extract entirely, and canonicalise the concat using WidenVector.
- if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
- Lane += cast<ConstantSDNode>(V1.getOperand(1))->getZExtValue();
+ // Try to eliminate a bitcasted extract subvector before a DUPLANE.
+ auto getScaledOffsetDup = [](SDValue BitCast, int &LaneC, MVT &CastVT) {
+ // Match: dup (bitcast (extract_subv X, C)), LaneC
+ if (BitCast.getOpcode() != ISD::BITCAST ||
+ BitCast.getOperand(0).getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return false;
+
+ // The extract index must align in the destination type. That may not
+ // happen if the bitcast is from narrow to wide type.
+ SDValue Extract = BitCast.getOperand(0);
+ unsigned ExtIdx = Extract.getConstantOperandVal(1);
+ unsigned SrcEltBitWidth = Extract.getScalarValueSizeInBits();
+ unsigned ExtIdxInBits = ExtIdx * SrcEltBitWidth;
+ unsigned CastedEltBitWidth = BitCast.getScalarValueSizeInBits();
+ if (ExtIdxInBits % CastedEltBitWidth != 0)
+ return false;
+
+ // Update the lane value by offsetting with the scaled extract index.
+ LaneC += ExtIdxInBits / CastedEltBitWidth;
+
+ // Determine the casted vector type of the wide vector input.
+ // dup (bitcast (extract_subv X, C)), LaneC --> dup (bitcast X), LaneC'
+ // Examples:
+ // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
+ // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
+ unsigned SrcVecNumElts =
+ Extract.getOperand(0).getValueSizeInBits() / CastedEltBitWidth;
+ CastVT = MVT::getVectorVT(BitCast.getSimpleValueType().getScalarType(),
+ SrcVecNumElts);
+ return true;
+ };
+ MVT CastVT;
+ if (getScaledOffsetDup(V1, Lane, CastVT)) {
+ V1 = DAG.getBitcast(CastVT, V1.getOperand(0).getOperand(0));
+ } else if (V1.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+ // The lane is incremented by the index of the extract.
+ // Example: dup v2f32 (extract v4f32 X, 2), 1 --> dup v4f32 X, 3
+ Lane += V1.getConstantOperandVal(1);
V1 = V1.getOperand(0);
} else if (V1.getOpcode() == ISD::CONCAT_VECTORS) {
+ // The lane is decremented if we are splatting from the 2nd operand.
+ // Example: dup v4i32 (concat v2i32 X, v2i32 Y), 3 --> dup v4i32 Y, 1
unsigned Idx = Lane >= (int)VT.getVectorNumElements() / 2;
Lane -= Idx * VT.getVectorNumElements() / 2;
V1 = WidenVector(V1.getOperand(Idx), DAG);
- } else if (VT.getSizeInBits() == 64)
+ } else if (VT.getSizeInBits() == 64) {
+ // Widen the operand to a 128-bit register with undef.
V1 = WidenVector(V1, DAG);
-
+ }
return DAG.getNode(Opcode, dl, VT, V1, DAG.getConstant(Lane, dl, MVT::i64));
}
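The lane arithmetic in getScaledOffsetDup is easiest to check with concrete numbers; the standalone snippet below reproduces the index computation for the two examples quoted in the comments (this helper is for verification only and shares nothing with the lowering code):

#include <cassert>

// New lane = old lane + (extract index * source element bits) / casted element bits.
static int scaledDupLane(int Lane, unsigned ExtIdx, unsigned SrcEltBits,
                         unsigned CastedEltBits) {
  unsigned ExtIdxInBits = ExtIdx * SrcEltBits;
  assert(ExtIdxInBits % CastedEltBits == 0 && "extract index must stay aligned");
  return Lane + ExtIdxInBits / CastedEltBits;
}

int main() {
  // dup (bitcast (extract_subv v2f64 X, 1) to v2f32), 1 --> dup v4f32 X, 3
  assert(scaledDupLane(1, 1, 64, 32) == 3);
  // dup (bitcast (extract_subv v16i8 X, 8) to v4i16), 1 --> dup v8i16 X, 5
  assert(scaledDupLane(1, 8, 8, 16) == 5);
  return 0;
}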
@@ -7077,26 +7332,31 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
switch (ElemVT.getSimpleVT().SimpleTy) {
case MVT::i8:
case MVT::i16:
+ case MVT::i32:
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
- break;
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
case MVT::i64:
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
- break;
- case MVT::i32:
- // Fine as is
- break;
- // TODO: we can support splats of i1s and float types, but haven't added
- // patterns yet.
- case MVT::i1:
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
+ case MVT::i1: {
+ // The general case of i1. There isn't any natural way to do this,
+ // so we use some trickery with whilelo.
+ // TODO: Add special cases for splat of constant true/false.
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
+ SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
+ DAG.getValueType(MVT::i1));
+ SDValue ID = DAG.getTargetConstant(Intrinsic::aarch64_sve_whilelo, dl,
+ MVT::i64);
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
+ DAG.getConstant(0, dl, MVT::i64), SplatVal);
+ }
+ // TODO: we can support float types, but haven't added patterns yet.
case MVT::f16:
case MVT::f32:
case MVT::f64:
default:
- llvm_unreachable("Unsupported SPLAT_VECTOR input operand type");
- break;
+ report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
}
-
- return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
}
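The i1 splat leans on WHILELO: after SIGN_EXTEND_INREG the boolean is either 0 or all-ones in an i64, so whilelo(0, x) produces an all-false or an all-true predicate respectively. A scalar model of that behaviour for a fixed element count (the helpers are hypothetical and only meant to show why the trick works):

#include <cstdint>
#include <vector>

// whilelo: lane I is active while (Start + I) compares unsigned-less-than Limit.
static std::vector<bool> whilelo(uint64_t Start, uint64_t Limit, unsigned NumElts) {
  std::vector<bool> Pred(NumElts);
  for (unsigned I = 0; I < NumElts; ++I)
    Pred[I] = (Start + I) < Limit;
  return Pred;
}

// Splat of an i1: sign-extend to 0 or ~0, then whilelo from zero.
static std::vector<bool> splatI1(bool B, unsigned NumElts) {
  uint64_t SExt = B ? ~0ull : 0ull;       // result of SIGN_EXTEND_INREG from i1
  return whilelo(0, SExt, NumElts);       // all-true iff B, otherwise all-false
}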
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
@@ -8443,6 +8703,26 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.align = Align(16);
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
+ case Intrinsic::aarch64_sve_ldnt1: {
+ PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.ptrVal = I.getArgOperand(1);
+ Info.offset = 0;
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
+ return true;
+ }
+ case Intrinsic::aarch64_sve_stnt1: {
+ PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.ptrVal = I.getArgOperand(2);
+ Info.offset = 0;
+ Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
+ return true;
+ }
default:
break;
}
@@ -8515,11 +8795,12 @@ bool AArch64TargetLowering::isProfitableToHoist(Instruction *I) const {
return true;
const TargetOptions &Options = getTargetMachine().Options;
- const DataLayout &DL = I->getModule()->getDataLayout();
- EVT VT = getValueType(DL, User->getOperand(0)->getType());
+ const Function *F = I->getFunction();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ Type *Ty = User->getOperand(0)->getType();
- return !(isFMAFasterThanFMulAndFAdd(VT) &&
- isOperationLegalOrCustom(ISD::FMA, VT) &&
+ return !(isFMAFasterThanFMulAndFAdd(*F, Ty) &&
+ isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
(Options.AllowFPOpFusion == FPOpFusion::Fast ||
Options.UnsafeFPMath));
}
@@ -9176,7 +9457,8 @@ int AArch64TargetLowering::getScalingFactorCost(const DataLayout &DL,
return -1;
}
-bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(
+ const MachineFunction &MF, EVT VT) const {
VT = VT.getScalarType();
if (!VT.isSimple())
@@ -9193,6 +9475,17 @@ bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
return false;
}
+bool AArch64TargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
+ Type *Ty) const {
+ switch (Ty->getScalarType()->getTypeID()) {
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
+ return true;
+ default:
+ return false;
+ }
+}
+
const MCPhysReg *
AArch64TargetLowering::getScratchRegisters(CallingConv::ID) const {
// LR is a callee-save register, but we must treat it as clobbered by any call
@@ -9363,6 +9656,19 @@ AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
}
+static bool IsSVECntIntrinsic(SDValue S) {
+ switch(getIntrinsicID(S.getNode())) {
+ default:
+ break;
+ case Intrinsic::aarch64_sve_cntb:
+ case Intrinsic::aarch64_sve_cnth:
+ case Intrinsic::aarch64_sve_cntw:
+ case Intrinsic::aarch64_sve_cntd:
+ return true;
+ }
+ return false;
+}
+
static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -9373,9 +9679,18 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
if (!isa<ConstantSDNode>(N->getOperand(1)))
return SDValue();
+ SDValue N0 = N->getOperand(0);
ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
const APInt &ConstValue = C->getAPIntValue();
+ // Allow the scaling to be folded into the `cnt` instruction by preventing
+ // the scaling from being obscured here. This makes it easier to pattern match.
+ if (IsSVECntIntrinsic(N0) ||
+ (N0->getOpcode() == ISD::TRUNCATE &&
+ (IsSVECntIntrinsic(N0->getOperand(0)))))
+ if (ConstValue.sge(1) && ConstValue.sle(16))
+ return SDValue();
+
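The CNTB/CNTH/CNTW/CNTD instructions take a multiplier immediate of 1..16, which is why a multiply by a constant in that range is deliberately left alone: the scaling can become the instruction's own mul operand. A rough model of the folded form, assuming a concrete vector length purely for the arithmetic (the helper is invented):

// For cntd, each element is 64 bits wide; the mul immediate scales the count
// in the same instruction, e.g. VL = 256 bits with mul #4 gives 4 * 4 = 16.
static unsigned cntdMulModel(unsigned VLBits, unsigned MulImm) {
  return (VLBits / 64) * MulImm;
}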
// Multiplication of a power of two plus/minus one can be done more
// cheaply as a shift+add/sub. For now, this is true unilaterally. If
// future CPUs have a cheaper MADD instruction, this may need to be
@@ -9386,7 +9701,6 @@ static SDValue performMulCombine(SDNode *N, SelectionDAG &DAG,
// e.g. 6=3*2=(2+1)*2.
// TODO: consider lowering more cases, e.g. C = 14, -6, -14 or even 45
// which equals to (1+2)*16-(1+2).
- SDValue N0 = N->getOperand(0);
// TrailingZeroes is used to test if the mul can be lowered to
// shift+add+shift.
unsigned TrailingZeroes = ConstValue.countTrailingZeros();
@@ -9821,6 +10135,67 @@ static SDValue performORCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue();
}
+static bool isConstantSplatVectorMaskForType(SDNode *N, EVT MemVT) {
+ if (!MemVT.getVectorElementType().isSimple())
+ return false;
+
+ uint64_t MaskForTy = 0ull;
+ switch (MemVT.getVectorElementType().getSimpleVT().SimpleTy) {
+ case MVT::i8:
+ MaskForTy = 0xffull;
+ break;
+ case MVT::i16:
+ MaskForTy = 0xffffull;
+ break;
+ case MVT::i32:
+ MaskForTy = 0xffffffffull;
+ break;
+ default:
+ return false;
+ break;
+ }
+
+ if (N->getOpcode() == AArch64ISD::DUP || N->getOpcode() == ISD::SPLAT_VECTOR)
+ if (auto *Op0 = dyn_cast<ConstantSDNode>(N->getOperand(0)))
+ return Op0->getAPIntValue().getLimitedValue() == MaskForTy;
+
+ return false;
+}
+
+static SDValue performSVEAndCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ SDValue Mask = N->getOperand(1);
+
+ if (!Src.hasOneUse())
+ return SDValue();
+
+ // GLD1* instructions perform an implicit zero-extend, which makes them
+ // perfect candidates for combining.
+ switch (Src->getOpcode()) {
+ case AArch64ISD::GLD1:
+ case AArch64ISD::GLD1_SCALED:
+ case AArch64ISD::GLD1_SXTW:
+ case AArch64ISD::GLD1_SXTW_SCALED:
+ case AArch64ISD::GLD1_UXTW:
+ case AArch64ISD::GLD1_UXTW_SCALED:
+ case AArch64ISD::GLD1_IMM:
+ break;
+ default:
+ return SDValue();
+ }
+
+ EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
+
+ if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
+ return Src;
+
+ return SDValue();
+}
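The AND combine relies on GLD1* zero-extending every loaded element up to its container lane, so a following AND with the all-ones mask of the memory element type cannot change the value. A small scalar illustration under that assumption (not the DAG code itself):

#include <cstdint>

// A gather of i16 elements into 32-bit lanes already zero-extends each value,
// so masking with 0xffff afterwards is a no-op and the AND can be dropped.
static uint32_t gld1hLaneThenMask(uint16_t MemVal) {
  uint32_t Lane = MemVal;             // implicit zero-extend performed by the load
  return Lane & 0xffffu;              // always equal to Lane
}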
+
static SDValue performANDCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
@@ -9829,6 +10204,9 @@ static SDValue performANDCombine(SDNode *N,
if (!VT.isVector() || !DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
+ if (VT.isScalableVector())
+ return performSVEAndCombine(N, DCI);
+
BuildVectorSDNode *BVN =
dyn_cast<BuildVectorSDNode>(N->getOperand(1).getNode());
if (!BVN)
@@ -9889,74 +10267,6 @@ static SDValue performSRLCombine(SDNode *N,
return SDValue();
}
-static SDValue performBitcastCombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI,
- SelectionDAG &DAG) {
- // Wait 'til after everything is legalized to try this. That way we have
- // legal vector types and such.
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- // Remove extraneous bitcasts around an extract_subvector.
- // For example,
- // (v4i16 (bitconvert
- // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
- // becomes
- // (extract_subvector ((v8i16 ...), (i64 4)))
-
- // Only interested in 64-bit vectors as the ultimate result.
- EVT VT = N->getValueType(0);
- if (!VT.isVector())
- return SDValue();
- if (VT.getSimpleVT().getSizeInBits() != 64)
- return SDValue();
- // Is the operand an extract_subvector starting at the beginning or halfway
- // point of the vector? A low half may also come through as an
- // EXTRACT_SUBREG, so look for that, too.
- SDValue Op0 = N->getOperand(0);
- if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
- !(Op0->isMachineOpcode() &&
- Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
- return SDValue();
- uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
- if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
- if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
- return SDValue();
- } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
- if (idx != AArch64::dsub)
- return SDValue();
- // The dsub reference is equivalent to a lane zero subvector reference.
- idx = 0;
- }
- // Look through the bitcast of the input to the extract.
- if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
- return SDValue();
- SDValue Source = Op0->getOperand(0)->getOperand(0);
- // If the source type has twice the number of elements as our destination
- // type, we know this is an extract of the high or low half of the vector.
- EVT SVT = Source->getValueType(0);
- if (!SVT.isVector() ||
- SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
- return SDValue();
-
- LLVM_DEBUG(
- dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
-
- // Create the simplified form to just extract the low or high half of the
- // vector directly rather than bothering with the bitcasts.
- SDLoc dl(N);
- unsigned NumElements = VT.getVectorNumElements();
- if (idx) {
- SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx);
- } else {
- SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32);
- return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT,
- Source, SubReg),
- 0);
- }
-}
-
static SDValue performConcatVectorsCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
@@ -10263,10 +10573,10 @@ static SDValue performSetccAddFolding(SDNode *Op, SelectionDAG &DAG) {
MVT::i32);
Cmp = *InfoAndKind.Info.AArch64.Cmp;
} else
- Cmp = getAArch64Cmp(*InfoAndKind.Info.Generic.Opnd0,
- *InfoAndKind.Info.Generic.Opnd1,
- ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, true),
- CCVal, DAG, dl);
+ Cmp = getAArch64Cmp(
+ *InfoAndKind.Info.Generic.Opnd0, *InfoAndKind.Info.Generic.Opnd1,
+ ISD::getSetCCInverse(InfoAndKind.Info.Generic.CC, CmpVT), CCVal, DAG,
+ dl);
EVT VT = Op->getValueType(0);
LHS = DAG.getNode(ISD::ADD, dl, VT, RHS, DAG.getConstant(1, dl, VT));
@@ -10456,6 +10766,154 @@ static SDValue combineAcrossLanesIntrinsic(unsigned Opc, SDNode *N,
DAG.getConstant(0, dl, MVT::i64));
}
+static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc,
+ SelectionDAG &DAG) {
+ SDLoc dl(N);
+ LLVMContext &Ctx = *DAG.getContext();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ EVT VT = N->getValueType(0);
+ SDValue Pred = N->getOperand(1);
+ SDValue Data = N->getOperand(2);
+ EVT DataVT = Data.getValueType();
+
+ if (DataVT.getVectorElementType().isScalarInteger() &&
+ (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64)) {
+ if (!TLI.isTypeLegal(DataVT))
+ return SDValue();
+
+ EVT OutputVT = EVT::getVectorVT(Ctx, VT,
+ AArch64::NeonBitsPerVector / VT.getSizeInBits());
+ SDValue Reduce = DAG.getNode(Opc, dl, OutputVT, Pred, Data);
+ SDValue Zero = DAG.getConstant(0, dl, MVT::i64);
+ SDValue Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Reduce, Zero);
+
+ return Result;
+ }
+
+ return SDValue();
+}
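The predicated reductions leave their scalar result in lane 0 of a NEON-sized vector, so the intermediate type is 128 bits divided by the element width before EXTRACT_VECTOR_ELT pulls out lane 0. A quick standalone check of that lane-count computation, assuming the usual 128-bit NEON width (the helper mirrors NeonBitsPerVector / VT.getSizeInBits() but is otherwise made up):

// Lanes in the 128-bit vector used to carry the reduction result.
static unsigned reductionOutputLanes(unsigned EltBits) {
  const unsigned NeonBitsPerVector = 128;
  return NeonBitsPerVector / EltBits;   // i8 -> 16, i32 -> 4, i64 -> 2
}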
+
+static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
+ SDLoc dl(N);
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT VT = N->getValueType(0);
+
+ assert(VT.isScalableVector() && "Expected a scalable vector.");
+
+ // Current lowering only supports the SVE-ACLE types.
+ if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ unsigned ElemSize = VT.getVectorElementType().getSizeInBits() / 8;
+ unsigned ByteSize = VT.getSizeInBits().getKnownMinSize() / 8;
+ EVT ByteVT = EVT::getVectorVT(Ctx, MVT::i8, { ByteSize, true });
+
+ // Convert everything to the domain of EXT (i.e. bytes).
+ SDValue Op0 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(1));
+ SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, ByteVT, N->getOperand(2));
+ SDValue Op2 = DAG.getNode(ISD::MUL, dl, MVT::i32, N->getOperand(3),
+ DAG.getConstant(ElemSize, dl, MVT::i32));
+
+ SDValue EXT = DAG.getNode(AArch64ISD::EXT, dl, ByteVT, Op0, Op1, Op2);
+ return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
+}
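EXT always works on bytes, so the element index handed to the intrinsic is rescaled before both vector operands are bitcast into the byte domain. A one-line model of that rescaling (the helper name is invented):

// Byte-domain index for EXT: element index times the element size in bytes.
// e.g. an nxv4i32 input with element index 3 becomes byte offset 12.
static unsigned extByteIndex(unsigned ElemIndex, unsigned EltBits) {
  return ElemIndex * (EltBits / 8);
}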
+
+static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID,
+ bool Invert,
+ TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (DCI.isBeforeLegalize())
+ return SDValue();
+
+ SDValue Comparator = N->getOperand(3);
+ if (Comparator.getOpcode() == AArch64ISD::DUP ||
+ Comparator.getOpcode() == ISD::SPLAT_VECTOR) {
+ unsigned IID = getIntrinsicID(N);
+ EVT VT = N->getValueType(0);
+ EVT CmpVT = N->getOperand(2).getValueType();
+ SDValue Pred = N->getOperand(1);
+ SDValue Imm;
+ SDLoc DL(N);
+
+ switch (IID) {
+ default:
+ llvm_unreachable("Called with wrong intrinsic!");
+ break;
+
+ // Signed comparisons
+ case Intrinsic::aarch64_sve_cmpeq_wide:
+ case Intrinsic::aarch64_sve_cmpne_wide:
+ case Intrinsic::aarch64_sve_cmpge_wide:
+ case Intrinsic::aarch64_sve_cmpgt_wide:
+ case Intrinsic::aarch64_sve_cmplt_wide:
+ case Intrinsic::aarch64_sve_cmple_wide: {
+ if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
+ int64_t ImmVal = CN->getSExtValue();
+ if (ImmVal >= -16 && ImmVal <= 15)
+ Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
+ else
+ return SDValue();
+ }
+ break;
+ }
+ // Unsigned comparisons
+ case Intrinsic::aarch64_sve_cmphs_wide:
+ case Intrinsic::aarch64_sve_cmphi_wide:
+ case Intrinsic::aarch64_sve_cmplo_wide:
+ case Intrinsic::aarch64_sve_cmpls_wide: {
+ if (auto *CN = dyn_cast<ConstantSDNode>(Comparator.getOperand(0))) {
+ uint64_t ImmVal = CN->getZExtValue();
+ if (ImmVal <= 127)
+ Imm = DAG.getConstant(ImmVal, DL, MVT::i32);
+ else
+ return SDValue();
+ }
+ break;
+ }
+ }
+
+ SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
+ SDValue ID = DAG.getTargetConstant(ReplacementIID, DL, MVT::i64);
+ SDValue Op0, Op1;
+ if (Invert) {
+ Op0 = Splat;
+ Op1 = N->getOperand(2);
+ } else {
+ Op0 = N->getOperand(2);
+ Op1 = Splat;
+ }
+ return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+ ID, Pred, Op0, Op1);
+ }
+
+ return SDValue();
+}
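cmplt/cmple and the unsigned lo/ls variants have no direct narrow counterpart, so the combine swaps the operands and uses the opposite predicate: a < b holds exactly when b > a. A scalar restatement of that identity (the helper is illustrative only):

#include <cassert>
#include <cstdint>

// cmplt(x, imm) is rewritten as cmpgt(splat(imm), x); cmplo maps to cmphi the same way.
static bool cmpltViaCmpgt(int64_t X, int64_t Imm) {
  return Imm > X;                 // operands swapped, predicate flipped
}

int main() {
  assert(cmpltViaCmpgt(-3, 15) == (-3 < 15));
  assert(cmpltViaCmpgt(7, -16) == (7 < -16));
  return 0;
}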
+
+static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
+ AArch64CC::CondCode Cond) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ SDLoc DL(Op);
+ assert(Op.getValueType().isScalableVector() &&
+ TLI.isTypeLegal(Op.getValueType()) &&
+ "Expected legal scalable vector type!");
+
+ // Ensure target-specific opcodes are using a legal type.
+ EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue TVal = DAG.getConstant(1, DL, OutVT);
+ SDValue FVal = DAG.getConstant(0, DL, OutVT);
+
+ // Set condition code (CC) flags.
+ SDValue Test = DAG.getNode(AArch64ISD::PTEST, DL, MVT::Other, Pg, Op);
+
+ // Convert CC to integer based on requested condition.
+ // NOTE: Cond is inverted to promote CSEL's removal when it feeds a compare.
+ SDValue CC = DAG.getConstant(getInvertedCondCode(Cond), DL, MVT::i32);
+ SDValue Res = DAG.getNode(AArch64ISD::CSEL, DL, OutVT, FVal, TVal, CC, Test);
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+}
+
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -10510,6 +10968,61 @@ static SDValue performIntrinsicCombine(SDNode *N,
case Intrinsic::aarch64_crc32h:
case Intrinsic::aarch64_crc32ch:
return tryCombineCRC32(0xffff, N, DAG);
+ case Intrinsic::aarch64_sve_smaxv:
+ return LowerSVEIntReduction(N, AArch64ISD::SMAXV_PRED, DAG);
+ case Intrinsic::aarch64_sve_umaxv:
+ return LowerSVEIntReduction(N, AArch64ISD::UMAXV_PRED, DAG);
+ case Intrinsic::aarch64_sve_sminv:
+ return LowerSVEIntReduction(N, AArch64ISD::SMINV_PRED, DAG);
+ case Intrinsic::aarch64_sve_uminv:
+ return LowerSVEIntReduction(N, AArch64ISD::UMINV_PRED, DAG);
+ case Intrinsic::aarch64_sve_orv:
+ return LowerSVEIntReduction(N, AArch64ISD::ORV_PRED, DAG);
+ case Intrinsic::aarch64_sve_eorv:
+ return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
+ case Intrinsic::aarch64_sve_andv:
+ return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);
+ case Intrinsic::aarch64_sve_ext:
+ return LowerSVEIntrinsicEXT(N, DAG);
+ case Intrinsic::aarch64_sve_cmpeq_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpeq,
+ false, DCI, DAG);
+ case Intrinsic::aarch64_sve_cmpne_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpne,
+ false, DCI, DAG);
+ case Intrinsic::aarch64_sve_cmpge_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge,
+ false, DCI, DAG);
+ case Intrinsic::aarch64_sve_cmpgt_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt,
+ false, DCI, DAG);
+ case Intrinsic::aarch64_sve_cmplt_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt,
+ true, DCI, DAG);
+ case Intrinsic::aarch64_sve_cmple_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge,
+ true, DCI, DAG);
+ case Intrinsic::aarch64_sve_cmphs_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs,
+ false, DCI, DAG);
+ case Intrinsic::aarch64_sve_cmphi_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi,
+ false, DCI, DAG);
+ case Intrinsic::aarch64_sve_cmplo_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, true,
+ DCI, DAG);
+ case Intrinsic::aarch64_sve_cmpls_wide:
+ return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, true,
+ DCI, DAG);
+ case Intrinsic::aarch64_sve_ptest_any:
+ return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ AArch64CC::ANY_ACTIVE);
+ case Intrinsic::aarch64_sve_ptest_first:
+ return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ AArch64CC::FIRST_ACTIVE);
+ case Intrinsic::aarch64_sve_ptest_last:
+ return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ AArch64CC::LAST_ACTIVE);
}
return SDValue();
}
@@ -10652,6 +11165,48 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
return NewST1;
}
+static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT PtrTy = N->getOperand(3).getValueType();
+
+ EVT LoadVT = VT;
+ if (VT.isFloatingPoint())
+ LoadVT = VT.changeTypeToInteger();
+
+ auto *MINode = cast<MemIntrinsicSDNode>(N);
+ SDValue PassThru = DAG.getConstant(0, DL, LoadVT);
+ SDValue L = DAG.getMaskedLoad(LoadVT, DL, MINode->getChain(),
+ MINode->getOperand(3), DAG.getUNDEF(PtrTy),
+ MINode->getOperand(2), PassThru,
+ MINode->getMemoryVT(), MINode->getMemOperand(),
+ ISD::UNINDEXED, ISD::NON_EXTLOAD, false);
+
+ if (VT.isFloatingPoint()) {
+ SDValue Ops[] = { DAG.getNode(ISD::BITCAST, DL, VT, L), L.getValue(1) };
+ return DAG.getMergeValues(Ops, DL);
+ }
+
+ return L;
+}
+
+static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ SDValue Data = N->getOperand(2);
+ EVT DataVT = Data.getValueType();
+ EVT PtrTy = N->getOperand(4).getValueType();
+
+ if (DataVT.isFloatingPoint())
+ Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
+
+ auto *MINode = cast<MemIntrinsicSDNode>(N);
+ return DAG.getMaskedStore(MINode->getChain(), DL, Data, MINode->getOperand(4),
+ DAG.getUNDEF(PtrTy), MINode->getOperand(3),
+ MINode->getMemoryVT(), MINode->getMemOperand(),
+ ISD::UNINDEXED, false, false);
+}
+
/// Replace a splat of zeros to a vector store by scalar stores of WZR/XZR. The
/// load store optimizer pass will merge them to store pair stores. This should
/// be better than a movi to create the vector zero followed by a vector store
@@ -11703,6 +12258,215 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(MinOffset, DL, MVT::i64));
}
+// Returns an SVE type that ContentTy can be trivially sign or zero extended
+// into.
+static MVT getSVEContainerType(EVT ContentTy) {
+ assert(ContentTy.isSimple() && "No SVE containers for extended types");
+
+ switch (ContentTy.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("No known SVE container for this MVT type");
+ case MVT::nxv2i8:
+ case MVT::nxv2i16:
+ case MVT::nxv2i32:
+ case MVT::nxv2i64:
+ case MVT::nxv2f32:
+ case MVT::nxv2f64:
+ return MVT::nxv2i64;
+ case MVT::nxv4i8:
+ case MVT::nxv4i16:
+ case MVT::nxv4i32:
+ case MVT::nxv4f32:
+ return MVT::nxv4i32;
+ }
+}
+
+static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
+ unsigned Opcode,
+ bool OnlyPackedOffsets = true) {
+ const SDValue Src = N->getOperand(2);
+ const EVT SrcVT = Src->getValueType(0);
+ assert(SrcVT.isScalableVector() &&
+ "Scatter stores are only possible for SVE vectors");
+
+ SDLoc DL(N);
+ MVT SrcElVT = SrcVT.getVectorElementType().getSimpleVT();
+
+ // Make sure that source data will fit into an SVE register
+ if (SrcVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ // For FPs, ACLE only supports _packed_ single and double precision types.
+ if (SrcElVT.isFloatingPoint())
+ if ((SrcVT != MVT::nxv4f32) && (SrcVT != MVT::nxv2f64))
+ return SDValue();
+
+ // Depending on the addressing mode, this is either a pointer or a vector of
+ // pointers (that fits into one register)
+ const SDValue Base = N->getOperand(4);
+ // Depending on the addressing mode, this is either a single offset or a
+ // vector of offsets (that fits into one register)
+ SDValue Offset = N->getOperand(5);
+
+ auto &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(Base.getValueType()))
+ return SDValue();
+
+ // Some scatter store variants allow unpacked offsets, but only as nxv2i32
+ // vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
+ // nxv2i64. Legalize accordingly.
+ if (!OnlyPackedOffsets &&
+ Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
+ Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
+
+ if (!TLI.isTypeLegal(Offset.getValueType()))
+ return SDValue();
+
+ // Source value type that is representable in hardware
+ EVT HwSrcVt = getSVEContainerType(SrcVT);
+
+ // Keep the original type of the input data to store - this is needed to
+ // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the
+ // integer equivalent, so just use HwSrcVt.
+ SDValue InputVT = DAG.getValueType(SrcVT);
+ if (SrcVT.isFloatingPoint())
+ InputVT = DAG.getValueType(HwSrcVt);
+
+ SDVTList VTs = DAG.getVTList(MVT::Other);
+ SDValue SrcNew;
+
+ if (Src.getValueType().isFloatingPoint())
+ SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Src);
+ else
+ SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Src);
+
+ SDValue Ops[] = {N->getOperand(0), // Chain
+ SrcNew,
+ N->getOperand(3), // Pg
+ Base,
+ Offset,
+ InputVT};
+
+ return DAG.getNode(Opcode, DL, VTs, Ops);
+}
+
+static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
+ unsigned Opcode,
+ bool OnlyPackedOffsets = true) {
+ EVT RetVT = N->getValueType(0);
+ assert(RetVT.isScalableVector() &&
+ "Gather loads are only possible for SVE vectors");
+ SDLoc DL(N);
+
+ if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ // Depending on the addressing mode, this is either a pointer or a vector of
+ // pointers (that fits into one register)
+ const SDValue Base = N->getOperand(3);
+ // Depending on the addressing mode, this is either a single offset or a
+ // vector of offsets (that fits into one register)
+ SDValue Offset = N->getOperand(4);
+
+ auto &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isTypeLegal(Base.getValueType()))
+ return SDValue();
+
+ // Some gather load variants allow unpacked offsets, but only as nxv2i32
+ // vectors. These are implicitly sign (sxtw) or zero (zxtw) extended to
+ // nxv2i64. Legalize accordingly.
+ if (!OnlyPackedOffsets &&
+ Offset.getValueType().getSimpleVT().SimpleTy == MVT::nxv2i32)
+ Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset).getValue(0);
+
+ // Return value type that is representable in hardware
+ EVT HwRetVt = getSVEContainerType(RetVT);
+
+ // Keep the original output value type around - this will better inform
+ // optimisations (e.g. instruction folding when load is followed by
+ // zext/sext). This will only be used for ints, so the value for FPs
+ // doesn't matter.
+ SDValue OutVT = DAG.getValueType(RetVT);
+ if (RetVT.isFloatingPoint())
+ OutVT = DAG.getValueType(HwRetVt);
+
+ SDVTList VTs = DAG.getVTList(HwRetVt, MVT::Other);
+ SDValue Ops[] = {N->getOperand(0), // Chain
+ N->getOperand(2), // Pg
+ Base, Offset, OutVT};
+
+ SDValue Load = DAG.getNode(Opcode, DL, VTs, Ops);
+ SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+ if (RetVT.isInteger() && (RetVT != HwRetVt))
+ Load = DAG.getNode(ISD::TRUNCATE, DL, RetVT, Load.getValue(0));
+
+ // If the original return value was FP, bitcast accordingly. Doing it here
+ // means that we can avoid adding TableGen patterns for FPs.
+ if (RetVT.isFloatingPoint())
+ Load = DAG.getNode(ISD::BITCAST, DL, RetVT, Load.getValue(0));
+
+ return DAG.getMergeValues({Load, LoadChain}, DL);
+}
+
+
+static SDValue
+performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ SelectionDAG &DAG) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ SDValue Src = N->getOperand(0);
+ unsigned Opc = Src->getOpcode();
+
+ // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
+ // for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
+ unsigned NewOpc;
+ switch (Opc) {
+ case AArch64ISD::GLD1:
+ NewOpc = AArch64ISD::GLD1S;
+ break;
+ case AArch64ISD::GLD1_SCALED:
+ NewOpc = AArch64ISD::GLD1S_SCALED;
+ break;
+ case AArch64ISD::GLD1_SXTW:
+ NewOpc = AArch64ISD::GLD1S_SXTW;
+ break;
+ case AArch64ISD::GLD1_SXTW_SCALED:
+ NewOpc = AArch64ISD::GLD1S_SXTW_SCALED;
+ break;
+ case AArch64ISD::GLD1_UXTW:
+ NewOpc = AArch64ISD::GLD1S_UXTW;
+ break;
+ case AArch64ISD::GLD1_UXTW_SCALED:
+ NewOpc = AArch64ISD::GLD1S_UXTW_SCALED;
+ break;
+ case AArch64ISD::GLD1_IMM:
+ NewOpc = AArch64ISD::GLD1S_IMM;
+ break;
+ default:
+ return SDValue();
+ }
+
+ EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
+
+ if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse())
+ return SDValue();
+
+ EVT DstVT = N->getValueType(0);
+ SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
+ SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2),
+ Src->getOperand(3), Src->getOperand(4)};
+
+ SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
+ DCI.CombineTo(N, ExtLoad);
+ DCI.CombineTo(Src.getNode(), ExtLoad, ExtLoad.getValue(1));
+
+ // Return N so it doesn't get rechecked
+ return SDValue(N, 0);
+}
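The GLD1 -> GLD1S rewrite only fires when the width being sign-extended from matches exactly what the gather read from memory; only then is sign_extend_inreg of the zero-extended load the same value as a signed load. A compact scalar check of that condition, assuming 16-bit elements (all helper names are made up):

#include <cassert>
#include <cstdint>

static int64_t gld1hLane(uint16_t Mem) { return Mem; }                        // zero-extending load
static int64_t gld1shLane(uint16_t Mem) { return static_cast<int16_t>(Mem); } // sign-extending load

// sign_extend_inreg: reinterpret the low FromBits bits as a signed value.
static int64_t sextInReg(int64_t V, unsigned FromBits) {
  int64_t M = int64_t(1) << (FromBits - 1);
  int64_t Low = V & ((int64_t(1) << FromBits) - 1);
  return (Low ^ M) - M;
}

int main() {
  uint16_t Mem = 0x8001;                                        // negative as i16
  assert(sextInReg(gld1hLane(Mem), 16) == gld1shLane(Mem));     // widths match: fold is sound
  assert(sextInReg(gld1hLane(Mem), 8) != gld1shLane(Mem));      // widths differ: no fold
  return 0;
}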
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -11737,8 +12501,8 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND:
return performExtendCombine(N, DCI, DAG);
- case ISD::BITCAST:
- return performBitcastCombine(N, DCI, DAG);
+ case ISD::SIGN_EXTEND_INREG:
+ return performSignExtendInRegCombine(N, DCI, DAG);
case ISD::CONCAT_VECTORS:
return performConcatVectorsCombine(N, DCI, DAG);
case ISD::SELECT:
@@ -11789,6 +12553,46 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::aarch64_neon_st3lane:
case Intrinsic::aarch64_neon_st4lane:
return performNEONPostLDSTCombine(N, DCI, DAG);
+ case Intrinsic::aarch64_sve_ldnt1:
+ return performLDNT1Combine(N, DAG);
+ case Intrinsic::aarch64_sve_stnt1:
+ return performSTNT1Combine(N, DAG);
+ case Intrinsic::aarch64_sve_ld1_gather:
+ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1);
+ case Intrinsic::aarch64_sve_ld1_gather_index:
+ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
+ case Intrinsic::aarch64_sve_ld1_gather_sxtw:
+ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ld1_gather_uxtw:
+ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
+ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
+ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ld1_gather_imm:
+ return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
+ case Intrinsic::aarch64_sve_st1_scatter:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1);
+ case Intrinsic::aarch64_sve_st1_scatter_index:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED);
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_imm:
+ return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM);
default:
break;
}
@@ -12084,6 +12888,69 @@ void AArch64TargetLowering::ReplaceNodeResults(
case ISD::ATOMIC_CMP_SWAP:
ReplaceCMP_SWAP_128Results(N, Results, DAG, Subtarget);
return;
+ case ISD::LOAD: {
+ assert(SDValue(N, 0).getValueType() == MVT::i128 &&
+ "unexpected load's value type");
+ LoadSDNode *LoadNode = cast<LoadSDNode>(N);
+ if (!LoadNode->isVolatile() || LoadNode->getMemoryVT() != MVT::i128) {
+ // Non-volatile loads are optimized later in AArch64's load/store
+ // optimizer.
+ return;
+ }
+
+ SDValue Result = DAG.getMemIntrinsicNode(
+ AArch64ISD::LDP, SDLoc(N),
+ DAG.getVTList({MVT::i64, MVT::i64, MVT::Other}),
+ {LoadNode->getChain(), LoadNode->getBasePtr()}, LoadNode->getMemoryVT(),
+ LoadNode->getMemOperand());
+
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
+ Result.getValue(0), Result.getValue(1));
+ Results.append({Pair, Result.getValue(2) /* Chain */});
+ return;
+ }
+ case ISD::INTRINSIC_WO_CHAIN: {
+ EVT VT = N->getValueType(0);
+ assert((VT == MVT::i8 || VT == MVT::i16) &&
+ "custom lowering for unexpected type");
+
+ ConstantSDNode *CN = cast<ConstantSDNode>(N->getOperand(0));
+ Intrinsic::ID IntID = static_cast<Intrinsic::ID>(CN->getZExtValue());
+ switch (IntID) {
+ default:
+ return;
+ case Intrinsic::aarch64_sve_clasta_n: {
+ SDLoc DL(N);
+ auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
+ auto V = DAG.getNode(AArch64ISD::CLASTA_N, DL, MVT::i32,
+ N->getOperand(1), Op2, N->getOperand(3));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
+ return;
+ }
+ case Intrinsic::aarch64_sve_clastb_n: {
+ SDLoc DL(N);
+ auto Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, N->getOperand(2));
+ auto V = DAG.getNode(AArch64ISD::CLASTB_N, DL, MVT::i32,
+ N->getOperand(1), Op2, N->getOperand(3));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
+ return;
+ }
+ case Intrinsic::aarch64_sve_lasta: {
+ SDLoc DL(N);
+ auto V = DAG.getNode(AArch64ISD::LASTA, DL, MVT::i32,
+ N->getOperand(1), N->getOperand(2));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
+ return;
+ }
+ case Intrinsic::aarch64_sve_lastb: {
+ SDLoc DL(N);
+ auto V = DAG.getNode(AArch64ISD::LASTB, DL, MVT::i32,
+ N->getOperand(1), N->getOperand(2));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, V));
+ return;
+ }
+ }
+ }
}
}
@@ -12351,7 +13218,7 @@ bool AArch64TargetLowering::
bool AArch64TargetLowering::shouldExpandShift(SelectionDAG &DAG,
SDNode *N) const {
if (DAG.getMachineFunction().getFunction().hasMinSize() &&
- !Subtarget->isTargetWindows())
+ !Subtarget->isTargetWindows() && !Subtarget->isTargetDarwin())
return false;
return true;
}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 00fa96bc4e6d..672dfc4fcbc0 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -155,6 +155,14 @@ enum NodeType : unsigned {
SMAXV,
UMAXV,
+ SMAXV_PRED,
+ UMAXV_PRED,
+ SMINV_PRED,
+ UMINV_PRED,
+ ORV_PRED,
+ EORV_PRED,
+ ANDV_PRED,
+
// Vector bitwise negation
NOT,
@@ -196,6 +204,43 @@ enum NodeType : unsigned {
UUNPKHI,
UUNPKLO,
+ CLASTA_N,
+ CLASTB_N,
+ LASTA,
+ LASTB,
+ REV,
+ TBL,
+
+ INSR,
+ PTEST,
+ PTRUE,
+
+ // Unsigned gather loads.
+ GLD1,
+ GLD1_SCALED,
+ GLD1_UXTW,
+ GLD1_SXTW,
+ GLD1_UXTW_SCALED,
+ GLD1_SXTW_SCALED,
+ GLD1_IMM,
+
+ // Signed gather loads
+ GLD1S,
+ GLD1S_SCALED,
+ GLD1S_UXTW,
+ GLD1S_SXTW,
+ GLD1S_UXTW_SCALED,
+ GLD1S_SXTW_SCALED,
+ GLD1S_IMM,
+ // Scatter store
+ SST1,
+ SST1_SCALED,
+ SST1_UXTW,
+ SST1_SXTW,
+ SST1_UXTW_SCALED,
+ SST1_SXTW_SCALED,
+ SST1_IMM,
+
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
LD3post,
@@ -224,8 +269,10 @@ enum NodeType : unsigned {
STG,
STZG,
ST2G,
- STZ2G
+ STZ2G,
+ LDP,
+ STP
};
} // end namespace AArch64ISD
@@ -396,7 +443,9 @@ public:
/// Return true if an FMA operation is faster than a pair of fmul and fadd
/// instructions. fmuladd intrinsics will be expanded to FMAs when this method
/// returns true, otherwise fmuladd is expanded to fmul + fadd.
- bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
+ bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const override;
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
@@ -648,6 +697,8 @@ private:
SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerDarwinGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerELFGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerELFTLSLocalExec(const GlobalValue *GV, SDValue ThreadBase,
+ const SDLoc &DL, SelectionDAG &DAG) const;
SDValue LowerELFTLSDescCallSeq(SDValue SymAddr, const SDLoc &DL,
SelectionDAG &DAG) const;
SDValue LowerWindowsGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
@@ -713,7 +764,7 @@ private:
unsigned combineRepeatedFPDivisors() const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
- Register getRegisterByName(const char* RegName, EVT VT,
+ Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
/// Examine constraint string and operand type and determine a weight value.
@@ -741,6 +792,7 @@ private:
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
+ bool isVectorLoadExtDesirable(SDValue ExtVal) const override;
bool isUsedByReturnOnly(SDNode *N, SDValue &Chain) const override;
bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
bool getIndexedAddressParts(SDNode *Op, SDValue &Base, SDValue &Offset,
diff --git a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
index 459b53923625..27e1d8ee6b98 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrAtomics.td
@@ -15,9 +15,9 @@
//===----------------------------------
let AddedComplexity = 15, Size = 0 in
def CompilerBarrier : Pseudo<(outs), (ins i32imm:$ordering),
- [(atomic_fence imm:$ordering, 0)]>, Sched<[]>;
-def : Pat<(atomic_fence (i64 4), (imm)), (DMB (i32 0x9))>;
-def : Pat<(atomic_fence (imm), (imm)), (DMB (i32 0xb))>;
+ [(atomic_fence timm:$ordering, 0)]>, Sched<[]>;
+def : Pat<(atomic_fence (i64 4), (timm)), (DMB (i32 0x9))>;
+def : Pat<(atomic_fence (timm), (timm)), (DMB (i32 0xb))>;
//===----------------------------------
// Atomic loads
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index f555e4123307..c3efe03a0987 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -305,7 +305,7 @@ def simm9 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -256 && Imm < 256; }]> {
}
def SImm8Operand : SImmOperand<8>;
-def simm8 : Operand<i64>, ImmLeaf<i64, [{ return Imm >= -128 && Imm < 127; }]> {
+def simm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -128 && Imm < 127; }]> {
let ParserMatchClass = SImm8Operand;
let DecoderMethod = "DecodeSImm<8>";
}
@@ -358,6 +358,16 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
+def UImmS2XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
+}]>;
+def UImmS4XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() / 4, SDLoc(N), MVT::i64);
+}]>;
+def UImmS8XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() / 8, SDLoc(N), MVT::i64);
+}]>;
+
// uimm5sN predicate - True if the immediate is a multiple of N in the range
// [0 * N, 32 * N].
def UImm5s2Operand : UImmScaledMemoryIndexed<5, 2>;
@@ -365,17 +375,41 @@ def UImm5s4Operand : UImmScaledMemoryIndexed<5, 4>;
def UImm5s8Operand : UImmScaledMemoryIndexed<5, 8>;
def uimm5s2 : Operand<i64>, ImmLeaf<i64,
- [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }]> {
+ [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
+ UImmS2XForm> {
let ParserMatchClass = UImm5s2Operand;
let PrintMethod = "printImmScale<2>";
}
def uimm5s4 : Operand<i64>, ImmLeaf<i64,
- [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }]> {
+ [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
+ UImmS4XForm> {
let ParserMatchClass = UImm5s4Operand;
let PrintMethod = "printImmScale<4>";
}
def uimm5s8 : Operand<i64>, ImmLeaf<i64,
- [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }]> {
+ [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
+ UImmS8XForm> {
+ let ParserMatchClass = UImm5s8Operand;
+ let PrintMethod = "printImmScale<8>";
+}
+
+// tuimm5sN predicate - similar to uimm5sN, but use TImmLeaf (TargetConstant)
+// instead of ImmLeaf (Constant)
+def tuimm5s2 : Operand<i64>, TImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*2) && ((Imm % 2) == 0); }],
+ UImmS2XForm> {
+ let ParserMatchClass = UImm5s2Operand;
+ let PrintMethod = "printImmScale<2>";
+}
+def tuimm5s4 : Operand<i64>, TImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*4) && ((Imm % 4) == 0); }],
+ UImmS4XForm> {
+ let ParserMatchClass = UImm5s4Operand;
+ let PrintMethod = "printImmScale<4>";
+}
+def tuimm5s8 : Operand<i64>, TImmLeaf<i64,
+ [{ return Imm >= 0 && Imm < (32*8) && ((Imm % 8) == 0); }],
+ UImmS8XForm> {
let ParserMatchClass = UImm5s8Operand;
let PrintMethod = "printImmScale<8>";
}
@@ -590,6 +624,30 @@ def vecshiftR64Narrow : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm1_32Operand;
}
+// Same as vecshiftR#N, but use TargetConstant (TImmLeaf) instead of Constant
+// (ImmLeaf)
+def tvecshiftR8 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 9);
+}]> {
+ let EncoderMethod = "getVecShiftR8OpValue";
+ let DecoderMethod = "DecodeVecShiftR8Imm";
+ let ParserMatchClass = Imm1_8Operand;
+}
+def tvecshiftR16 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 17);
+}]> {
+ let EncoderMethod = "getVecShiftR16OpValue";
+ let DecoderMethod = "DecodeVecShiftR16Imm";
+ let ParserMatchClass = Imm1_16Operand;
+}
+def tvecshiftR32 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 33);
+}]> {
+ let EncoderMethod = "getVecShiftR32OpValue";
+ let DecoderMethod = "DecodeVecShiftR32Imm";
+ let ParserMatchClass = Imm1_32Operand;
+}
+
def Imm0_1Operand : AsmImmRange<0, 1>;
def Imm0_7Operand : AsmImmRange<0, 7>;
def Imm0_15Operand : AsmImmRange<0, 15>;
@@ -713,6 +771,13 @@ def imm0_127 : Operand<i32>, ImmLeaf<i32, [{
let PrintMethod = "printImm";
}
+def imm0_127_64b : Operand<i64>, ImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 128;
+}]> {
+ let ParserMatchClass = Imm0_127Operand;
+ let PrintMethod = "printImm";
+}
+
// NOTE: These imm0_N operands have to be of type i64 because i64 is the size
// for all shift-amounts.
@@ -730,6 +795,14 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
let ParserMatchClass = Imm0_31Operand;
}
+// timm0_31 predicate - same as imm0_31, but use TargetConstant (TImmLeaf)
+// instead of Constant (ImmLeaf)
+def timm0_31 : Operand<i64>, TImmLeaf<i64, [{
+ return ((uint64_t)Imm) < 32;
+}]> {
+ let ParserMatchClass = Imm0_31Operand;
+}
+
// True if the 32-bit immediate is in the range [0,31]
def imm32_0_31 : Operand<i32>, ImmLeaf<i32, [{
return ((uint64_t)Imm) < 32;
@@ -758,6 +831,13 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
let ParserMatchClass = Imm0_7Operand;
}
+// imm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7]
+def imm32_0_7 : Operand<i32>, ImmLeaf<i32, [{
+ return ((uint32_t)Imm) < 8;
+}]> {
+ let ParserMatchClass = Imm0_7Operand;
+}
+
// imm32_0_15 predicate - True if the 32-bit immediate is in the range [0,15]
def imm32_0_15 : Operand<i32>, ImmLeaf<i32, [{
return ((uint32_t)Imm) < 16;
@@ -1403,6 +1483,7 @@ class RCPCLoad<bits<2> sz, string asm, RegisterClass RC>
class AuthBase<bits<1> M, dag oops, dag iops, string asm, string operands,
list<dag> pattern>
: I<oops, iops, asm, operands, "", pattern>, Sched<[]> {
+ let isAuthenticated = 1;
let Inst{31-25} = 0b1101011;
let Inst{20-11} = 0b1111100001;
let Inst{10} = M;
@@ -1427,6 +1508,7 @@ class AuthOneOperand<bits<3> opc, bits<1> M, string asm>
let Inst{9-5} = Rn;
}
+let Uses = [LR,SP] in
class AuthReturn<bits<3> op, bits<1> M, string asm>
: AuthBase<M, (outs), (ins), asm, "", []> {
let Inst{24} = 0;
@@ -1441,6 +1523,7 @@ class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm,
bits<10> offset;
bits<5> Rn;
bits<5> Rt;
+ let isAuthenticated = 1;
let Inst{31-24} = 0b11111000;
let Inst{23} = M;
let Inst{22} = offset{9};
@@ -1463,6 +1546,9 @@ multiclass AuthLoad<bit M, string asm, Operand opr> {
def : InstAlias<asm # "\t$Rt, [$Rn]",
(!cast<Instruction>(NAME # "indexed") GPR64:$Rt, GPR64sp:$Rn, 0)>;
+
+ def : InstAlias<asm # "\t$Rt, [$wback]!",
+ (!cast<Instruction>(NAME # "writeback") GPR64sp:$wback, GPR64:$Rt, 0), 0>;
}
//---
@@ -3047,6 +3133,22 @@ def ro_Windexed32 : ComplexPattern<i64, 4, "SelectAddrModeWRO<32>", []>;
def ro_Windexed64 : ComplexPattern<i64, 4, "SelectAddrModeWRO<64>", []>;
def ro_Windexed128 : ComplexPattern<i64, 4, "SelectAddrModeWRO<128>", []>;
+def gi_ro_Windexed8 :
+ GIComplexOperandMatcher<s64, "selectAddrModeWRO<8>">,
+ GIComplexPatternEquiv<ro_Windexed8>;
+def gi_ro_Windexed16 :
+ GIComplexOperandMatcher<s64, "selectAddrModeWRO<16>">,
+ GIComplexPatternEquiv<ro_Windexed16>;
+def gi_ro_Windexed32 :
+ GIComplexOperandMatcher<s64, "selectAddrModeWRO<32>">,
+ GIComplexPatternEquiv<ro_Windexed32>;
+def gi_ro_Windexed64 :
+ GIComplexOperandMatcher<s64, "selectAddrModeWRO<64>">,
+ GIComplexPatternEquiv<ro_Windexed64>;
+def gi_ro_Windexed128 :
+ GIComplexOperandMatcher<s64, "selectAddrModeWRO<128>">,
+ GIComplexPatternEquiv<ro_Windexed128>;
+
class MemExtendOperand<string Reg, int Width> : AsmOperandClass {
let Name = "Mem" # Reg # "Extend" # Width;
let PredicateMethod = "isMem" # Reg # "Extend<" # Width # ">";
@@ -5066,6 +5168,24 @@ multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
[(set (v2i64 V128:$Rd), (OpNode (v2i64 V128:$Rn), (v2i64 V128:$Rm)))]>;
}
+multiclass SIMDThreeSameVectorExtraPatterns<string inst, SDPatternOperator OpNode> {
+ def : Pat<(v8i8 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(inst#"v8i8") V64:$LHS, V64:$RHS)>;
+ def : Pat<(v4i16 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(inst#"v4i16") V64:$LHS, V64:$RHS)>;
+ def : Pat<(v2i32 (OpNode V64:$LHS, V64:$RHS)),
+ (!cast<Instruction>(inst#"v2i32") V64:$LHS, V64:$RHS)>;
+
+ def : Pat<(v16i8 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(inst#"v16i8") V128:$LHS, V128:$RHS)>;
+ def : Pat<(v8i16 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(inst#"v8i16") V128:$LHS, V128:$RHS)>;
+ def : Pat<(v4i32 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(inst#"v4i32") V128:$LHS, V128:$RHS)>;
+ def : Pat<(v2i64 (OpNode V128:$LHS, V128:$RHS)),
+ (!cast<Instruction>(inst#"v2i64") V128:$LHS, V128:$RHS)>;
+}
+
// As above, but D sized elements unsupported.
multiclass SIMDThreeSameVectorBHS<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
@@ -10034,15 +10154,20 @@ class ComplexRotationOperand<int Angle, int Remainder, string Type>
let DiagnosticType = "InvalidComplexRotation" # Type;
let Name = "ComplexRotation" # Type;
}
-def complexrotateop : Operand<i32> {
+def complexrotateop : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
+ SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant((N->getSExtValue() / 90), SDLoc(N), MVT::i32);
+}]>> {
let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">;
let PrintMethod = "printComplexRotationOp<90, 0>";
}
-def complexrotateopodd : Operand<i32> {
+def complexrotateopodd : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
+ SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(((N->getSExtValue() - 90) / 180), SDLoc(N), MVT::i32);
+}]>> {
let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">;
let PrintMethod = "printComplexRotationOp<180, 90>";
}
-
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
class BaseSIMDThreeSameVectorComplex<bit Q, bit U, bits<2> size, bits<3> opcode,
RegisterOperand regtype, Operand rottype,
@@ -10373,9 +10498,9 @@ class CryptoRRTied<bits<1>op0, bits<2>op1, string asm, string asmops>
let Inst{11-10} = op1;
}
class CryptoRRTied_2D<bits<1>op0, bits<2>op1, string asm>
- : CryptoRRTied<op0, op1, asm, "{\t$Vd.2d, $Vn.2d}">;
+ : CryptoRRTied<op0, op1, asm, "{\t$Vd.2d, $Vn.2d|.2d\t$Vd, $Vn}">;
class CryptoRRTied_4S<bits<1>op0, bits<2>op1, string asm>
- : CryptoRRTied<op0, op1, asm, "{\t$Vd.4s, $Vn.4s}">;
+ : CryptoRRTied<op0, op1, asm, "{\t$Vd.4s, $Vn.4s|.4s\t$Vd, $Vn}">;
class CryptoRRR<bits<1> op0, bits<2>op1, dag oops, dag iops, string asm,
string asmops, string cst>
@@ -10390,19 +10515,19 @@ class CryptoRRR<bits<1> op0, bits<2>op1, dag oops, dag iops, string asm,
}
class CryptoRRR_2D<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
- "{\t$Vd.2d, $Vn.2d, $Vm.2d}", "">;
+ "{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "">;
class CryptoRRRTied_2D<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
- "{\t$Vd.2d, $Vn.2d, $Vm.2d}", "$Vd = $Vdst">;
+ "{\t$Vd.2d, $Vn.2d, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">;
class CryptoRRR_4S<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vd), (ins V128:$Vn, V128:$Vm), asm,
- "{\t$Vd.4s, $Vn.4s, $Vm.4s}", "">;
+ "{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "">;
class CryptoRRRTied_4S<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs V128:$Vdst), (ins V128:$Vd, V128:$Vn, V128:$Vm), asm,
- "{\t$Vd.4s, $Vn.4s, $Vm.4s}", "$Vd = $Vdst">;
+ "{\t$Vd.4s, $Vn.4s, $Vm.4s|.4s\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">;
class CryptoRRRTied<bits<1> op0, bits<2>op1, string asm>
: CryptoRRR<op0, op1, (outs FPR128:$Vdst), (ins FPR128:$Vd, FPR128:$Vn, V128:$Vm),
- asm, "{\t$Vd, $Vn, $Vm.2d}", "$Vd = $Vdst">;
+ asm, "{\t$Vd, $Vn, $Vm.2d|.2d\t$Vd, $Vn, $Vm}", "$Vd = $Vdst">;
class CryptoRRRR<bits<2>op0, string asm, string asmops>
: BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, V128:$Va), asm,
@@ -10416,15 +10541,18 @@ class CryptoRRRR<bits<2>op0, string asm, string asmops>
let Inst{14-10} = Va;
}
class CryptoRRRR_16B<bits<2>op0, string asm>
- : CryptoRRRR<op0, asm, "{\t$Vd.16b, $Vn.16b, $Vm.16b, $Va.16b}"> {
+ : CryptoRRRR<op0, asm, "{\t$Vd.16b, $Vn.16b, $Vm.16b, $Va.16b" #
+ "|.16b\t$Vd, $Vn, $Vm, $Va}"> {
}
class CryptoRRRR_4S<bits<2>op0, string asm>
- : CryptoRRRR<op0, asm, "{\t$Vd.4s, $Vn.4s, $Vm.4s, $Va.4s}"> {
+ : CryptoRRRR<op0, asm, "{\t$Vd.4s, $Vn.4s, $Vm.4s, $Va.4s" #
+ "|.4s\t$Vd, $Vn, $Vm, $Va}"> {
}
class CryptoRRRi6<string asm>
: BaseCryptoV82<(outs V128:$Vd), (ins V128:$Vn, V128:$Vm, uimm6:$imm), asm,
- "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm}", "", []> {
+ "{\t$Vd.2d, $Vn.2d, $Vm.2d, $imm" #
+ "|.2d\t$Vd, $Vn, $Vm, $imm}", "", []> {
bits<6> imm;
bits<5> Vm;
let Inst{24-21} = 0b0100;
@@ -10437,7 +10565,8 @@ class CryptoRRRi6<string asm>
class CryptoRRRi2Tied<bits<1>op0, bits<2>op1, string asm>
: BaseCryptoV82<(outs V128:$Vdst),
(ins V128:$Vd, V128:$Vn, V128:$Vm, VectorIndexS:$imm),
- asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm}", "$Vd = $Vdst", []> {
+ asm, "{\t$Vd.4s, $Vn.4s, $Vm.s$imm" #
+ "|.4s\t$Vd, $Vn, $Vm$imm}", "$Vd = $Vdst", []> {
bits<2> imm;
bits<5> Vm;
let Inst{24-21} = 0b0010;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 5c35e5bcdd30..54f3f7c10132 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -30,6 +30,7 @@
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -1981,6 +1982,9 @@ bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
const MachineOperand *&BaseOp,
int64_t &Offset,
const TargetRegisterInfo *TRI) const {
+ if (!LdSt.mayLoadOrStore())
+ return false;
+
unsigned Width;
return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
}
@@ -2025,9 +2029,8 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
Offset = LdSt.getOperand(3).getImm() * Scale;
}
- assert((BaseOp->isReg() || BaseOp->isFI()) &&
- "getMemOperandWithOffset only supports base "
- "operands of type register or frame index.");
+ if (!BaseOp->isReg() && !BaseOp->isFI())
+ return false;
return true;
}
@@ -2185,12 +2188,19 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
MaxOffset = 4095;
break;
case AArch64::ADDG:
- case AArch64::TAGPstack:
Scale = 16;
Width = 0;
MinOffset = 0;
MaxOffset = 63;
break;
+ case AArch64::TAGPstack:
+ Scale = 16;
+ Width = 0;
+ // TAGP with a negative offset turns into SUBP, which has a maximum offset
+ // of 63 (not 64!).
+ MinOffset = -63;
+ MaxOffset = 63;
+ break;
case AArch64::LDG:
case AArch64::STGOffset:
case AArch64::STZGOffset:
@@ -2227,54 +2237,82 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
return true;
}
-static unsigned getOffsetStride(unsigned Opc) {
+// Scaling factor for unscaled load or store.
+int AArch64InstrInfo::getMemScale(unsigned Opc) {
switch (Opc) {
default:
- return 0;
- case AArch64::LDURQi:
- case AArch64::STURQi:
- return 16;
- case AArch64::LDURXi:
- case AArch64::LDURDi:
- case AArch64::STURXi:
- case AArch64::STURDi:
- return 8;
- case AArch64::LDURWi:
+ llvm_unreachable("Opcode has unknown scale!");
+ case AArch64::LDRBBui:
+ case AArch64::LDURBBi:
+ case AArch64::LDRSBWui:
+ case AArch64::LDURSBWi:
+ case AArch64::STRBBui:
+ case AArch64::STURBBi:
+ return 1;
+ case AArch64::LDRHHui:
+ case AArch64::LDURHHi:
+ case AArch64::LDRSHWui:
+ case AArch64::LDURSHWi:
+ case AArch64::STRHHui:
+ case AArch64::STURHHi:
+ return 2;
+ case AArch64::LDRSui:
case AArch64::LDURSi:
+ case AArch64::LDRSWui:
case AArch64::LDURSWi:
- case AArch64::STURWi:
+ case AArch64::LDRWui:
+ case AArch64::LDURWi:
+ case AArch64::STRSui:
case AArch64::STURSi:
+ case AArch64::STRWui:
+ case AArch64::STURWi:
+ case AArch64::LDPSi:
+ case AArch64::LDPSWi:
+ case AArch64::LDPWi:
+ case AArch64::STPSi:
+ case AArch64::STPWi:
return 4;
+ case AArch64::LDRDui:
+ case AArch64::LDURDi:
+ case AArch64::LDRXui:
+ case AArch64::LDURXi:
+ case AArch64::STRDui:
+ case AArch64::STURDi:
+ case AArch64::STRXui:
+ case AArch64::STURXi:
+ case AArch64::LDPDi:
+ case AArch64::LDPXi:
+ case AArch64::STPDi:
+ case AArch64::STPXi:
+ return 8;
+ case AArch64::LDRQui:
+ case AArch64::LDURQi:
+ case AArch64::STRQui:
+ case AArch64::STURQi:
+ case AArch64::LDPQi:
+ case AArch64::STPQi:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ case AArch64::STGPi:
+ return 16;
}
}
// Scale the unscaled offsets. Returns false if the unscaled offset can't be
// scaled.
static bool scaleOffset(unsigned Opc, int64_t &Offset) {
- unsigned OffsetStride = getOffsetStride(Opc);
- if (OffsetStride == 0)
- return false;
+ int Scale = AArch64InstrInfo::getMemScale(Opc);
+
// If the byte-offset isn't a multiple of the stride, we can't scale this
// offset.
- if (Offset % OffsetStride != 0)
+ if (Offset % Scale != 0)
return false;
// Convert the byte-offset used by unscaled into an "element" offset used
// by the scaled pair load/store instructions.
- Offset /= OffsetStride;
- return true;
-}
-
-// Unscale the scaled offsets. Returns false if the scaled offset can't be
-// unscaled.
-static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
- unsigned OffsetStride = getOffsetStride(Opc);
- if (OffsetStride == 0)
- return false;
-
- // Convert the "element" offset used by scaled pair load/store instructions
- // into the byte-offset used by unscaled.
- Offset *= OffsetStride;
+ Offset /= Scale;
return true;
}
@@ -2305,15 +2343,17 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
- // Get the byte-offset from the object offset.
- if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
+ // Convert to scaled object offsets.
+ int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
+ if (ObjectOffset1 % Scale1 != 0)
return false;
+ ObjectOffset1 /= Scale1;
+ int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
+ if (ObjectOffset2 % Scale2 != 0)
+ return false;
+ ObjectOffset2 /= Scale2;
ObjectOffset1 += Offset1;
ObjectOffset2 += Offset2;
- // Get the "element" index in the object.
- if (!scaleOffset(Opcode1, ObjectOffset1) ||
- !scaleOffset(Opcode2, ObjectOffset2))
- return false;
return ObjectOffset1 + 1 == ObjectOffset2;
}
@@ -2373,7 +2413,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
// The caller should already have ordered First/SecondLdSt by offset.
// Note: except for non-equal frame index bases
if (BaseOp1.isFI()) {
- assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
+ assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
"Caller should have ordered offsets.");
const MachineFrameInfo &MFI =
@@ -2382,8 +2422,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
BaseOp2.getIndex(), Offset2, SecondOpc);
}
- assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
- "Caller should have ordered offsets.");
+ assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
return Offset1 + 1 == Offset2;
}
@@ -2409,8 +2448,8 @@ static bool forwardCopyWillClobberTuple(unsigned DestReg, unsigned SrcReg,
void AArch64InstrInfo::copyPhysRegTuple(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg,
- unsigned SrcReg, bool KillSrc,
+ const DebugLoc &DL, MCRegister DestReg,
+ MCRegister SrcReg, bool KillSrc,
unsigned Opcode,
ArrayRef<unsigned> Indices) const {
assert(Subtarget.hasNEON() && "Unexpected register copy without NEON");
@@ -2461,8 +2500,8 @@ void AArch64InstrInfo::copyGPRRegTuple(MachineBasicBlock &MBB,
void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg,
- unsigned SrcReg, bool KillSrc) const {
+ const DebugLoc &DL, MCRegister DestReg,
+ MCRegister SrcReg, bool KillSrc) const {
if (AArch64::GPR32spRegClass.contains(DestReg) &&
(AArch64::GPR32spRegClass.contains(SrcReg) || SrcReg == AArch64::WZR)) {
const TargetRegisterInfo *TRI = &getRegisterInfo();
@@ -2471,10 +2510,10 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// If either operand is WSP, expand to ADD #0.
if (Subtarget.hasZeroCycleRegMove()) {
// Cyclone recognizes "ADD Xd, Xn, #0" as a zero-cycle register move.
- unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
- &AArch64::GPR64spRegClass);
- unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
- &AArch64::GPR64spRegClass);
+ MCRegister DestRegX = TRI->getMatchingSuperReg(
+ DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
+ MCRegister SrcRegX = TRI->getMatchingSuperReg(
+ SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegX, but a proper
@@ -2497,10 +2536,10 @@ void AArch64InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
} else {
if (Subtarget.hasZeroCycleRegMove()) {
// Cyclone recognizes "ORR Xd, XZR, Xm" as a zero-cycle register move.
- unsigned DestRegX = TRI->getMatchingSuperReg(DestReg, AArch64::sub_32,
- &AArch64::GPR64spRegClass);
- unsigned SrcRegX = TRI->getMatchingSuperReg(SrcReg, AArch64::sub_32,
- &AArch64::GPR64spRegClass);
+ MCRegister DestRegX = TRI->getMatchingSuperReg(
+ DestReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
+ MCRegister SrcRegX = TRI->getMatchingSuperReg(
+ SrcReg, AArch64::sub_32, &AArch64::GPR64spRegClass);
// This instruction is reading and writing X registers. This may upset
// the register scavenger and machine verifier, so we need to indicate
// that we are reading an undefined value from SrcRegX, but a proper
@@ -2897,7 +2936,18 @@ void AArch64InstrInfo::storeRegToStackSlot(
}
break;
}
+ unsigned StackID = TargetStackID::Default;
+ if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_PXI;
+ StackID = TargetStackID::SVEVector;
+ } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_ZXI;
+ StackID = TargetStackID::SVEVector;
+ }
assert(Opc && "Unknown register class");
+ MFI.setStackID(FI, StackID);
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
.addReg(SrcReg, getKillRegState(isKill))
@@ -3028,7 +3078,19 @@ void AArch64InstrInfo::loadRegFromStackSlot(
}
break;
}
+
+ unsigned StackID = TargetStackID::Default;
+ if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_PXI;
+ StackID = TargetStackID::SVEVector;
+ } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_ZXI;
+ StackID = TargetStackID::SVEVector;
+ }
assert(Opc && "Unknown register class");
+ MFI.setStackID(FI, StackID);
const MachineInstrBuilder MI = BuildMI(MBB, MBBI, DebugLoc(), get(Opc))
.addReg(DestReg, getDefRegState(true))
@@ -3085,7 +3147,7 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
do {
- unsigned ThisVal = std::min<unsigned>(Offset, MaxEncodableValue);
+ uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
unsigned LocalShiftSize = 0;
if (ThisVal > MaxEncoding) {
ThisVal = ThisVal >> ShiftSize;
@@ -3548,6 +3610,18 @@ static bool isCombineInstrCandidate64(unsigned Opc) {
// Note: MSUB Wd,Wn,Wm,Wi -> Wd = Wi - WnxWm, not Wd=WnxWm - Wi.
case AArch64::SUBXri:
case AArch64::SUBSXri:
+ case AArch64::ADDv8i8:
+ case AArch64::ADDv16i8:
+ case AArch64::ADDv4i16:
+ case AArch64::ADDv8i16:
+ case AArch64::ADDv2i32:
+ case AArch64::ADDv4i32:
+ case AArch64::SUBv8i8:
+ case AArch64::SUBv16i8:
+ case AArch64::SUBv4i16:
+ case AArch64::SUBv8i16:
+ case AArch64::SUBv2i32:
+ case AArch64::SUBv4i32:
return true;
default:
break;
@@ -3690,6 +3764,13 @@ static bool getMaddPatterns(MachineInstr &Root,
}
};
+ auto setVFound = [&](int Opcode, int Operand, MachineCombinerPattern Pattern) {
+ if (canCombine(MBB, Root.getOperand(Operand), Opcode)) {
+ Patterns.push_back(Pattern);
+ Found = true;
+ }
+ };
+
typedef MachineCombinerPattern MCP;
switch (Opc) {
@@ -3725,6 +3806,70 @@ static bool getMaddPatterns(MachineInstr &Root,
case AArch64::SUBXri:
setFound(AArch64::MADDXrrr, 1, AArch64::XZR, MCP::MULSUBXI_OP1);
break;
+ case AArch64::ADDv8i8:
+ setVFound(AArch64::MULv8i8, 1, MCP::MULADDv8i8_OP1);
+ setVFound(AArch64::MULv8i8, 2, MCP::MULADDv8i8_OP2);
+ break;
+ case AArch64::ADDv16i8:
+ setVFound(AArch64::MULv16i8, 1, MCP::MULADDv16i8_OP1);
+ setVFound(AArch64::MULv16i8, 2, MCP::MULADDv16i8_OP2);
+ break;
+ case AArch64::ADDv4i16:
+ setVFound(AArch64::MULv4i16, 1, MCP::MULADDv4i16_OP1);
+ setVFound(AArch64::MULv4i16, 2, MCP::MULADDv4i16_OP2);
+ setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULADDv4i16_indexed_OP1);
+ setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULADDv4i16_indexed_OP2);
+ break;
+ case AArch64::ADDv8i16:
+ setVFound(AArch64::MULv8i16, 1, MCP::MULADDv8i16_OP1);
+ setVFound(AArch64::MULv8i16, 2, MCP::MULADDv8i16_OP2);
+ setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULADDv8i16_indexed_OP1);
+ setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULADDv8i16_indexed_OP2);
+ break;
+ case AArch64::ADDv2i32:
+ setVFound(AArch64::MULv2i32, 1, MCP::MULADDv2i32_OP1);
+ setVFound(AArch64::MULv2i32, 2, MCP::MULADDv2i32_OP2);
+ setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULADDv2i32_indexed_OP1);
+ setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULADDv2i32_indexed_OP2);
+ break;
+ case AArch64::ADDv4i32:
+ setVFound(AArch64::MULv4i32, 1, MCP::MULADDv4i32_OP1);
+ setVFound(AArch64::MULv4i32, 2, MCP::MULADDv4i32_OP2);
+ setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULADDv4i32_indexed_OP1);
+ setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULADDv4i32_indexed_OP2);
+ break;
+ case AArch64::SUBv8i8:
+ setVFound(AArch64::MULv8i8, 1, MCP::MULSUBv8i8_OP1);
+ setVFound(AArch64::MULv8i8, 2, MCP::MULSUBv8i8_OP2);
+ break;
+ case AArch64::SUBv16i8:
+ setVFound(AArch64::MULv16i8, 1, MCP::MULSUBv16i8_OP1);
+ setVFound(AArch64::MULv16i8, 2, MCP::MULSUBv16i8_OP2);
+ break;
+ case AArch64::SUBv4i16:
+ setVFound(AArch64::MULv4i16, 1, MCP::MULSUBv4i16_OP1);
+ setVFound(AArch64::MULv4i16, 2, MCP::MULSUBv4i16_OP2);
+ setVFound(AArch64::MULv4i16_indexed, 1, MCP::MULSUBv4i16_indexed_OP1);
+ setVFound(AArch64::MULv4i16_indexed, 2, MCP::MULSUBv4i16_indexed_OP2);
+ break;
+ case AArch64::SUBv8i16:
+ setVFound(AArch64::MULv8i16, 1, MCP::MULSUBv8i16_OP1);
+ setVFound(AArch64::MULv8i16, 2, MCP::MULSUBv8i16_OP2);
+ setVFound(AArch64::MULv8i16_indexed, 1, MCP::MULSUBv8i16_indexed_OP1);
+ setVFound(AArch64::MULv8i16_indexed, 2, MCP::MULSUBv8i16_indexed_OP2);
+ break;
+ case AArch64::SUBv2i32:
+ setVFound(AArch64::MULv2i32, 1, MCP::MULSUBv2i32_OP1);
+ setVFound(AArch64::MULv2i32, 2, MCP::MULSUBv2i32_OP2);
+ setVFound(AArch64::MULv2i32_indexed, 1, MCP::MULSUBv2i32_indexed_OP1);
+ setVFound(AArch64::MULv2i32_indexed, 2, MCP::MULSUBv2i32_indexed_OP2);
+ break;
+ case AArch64::SUBv4i32:
+ setVFound(AArch64::MULv4i32, 1, MCP::MULSUBv4i32_OP1);
+ setVFound(AArch64::MULv4i32, 2, MCP::MULSUBv4i32_OP2);
+ setVFound(AArch64::MULv4i32_indexed, 1, MCP::MULSUBv4i32_indexed_OP1);
+ setVFound(AArch64::MULv4i32_indexed, 2, MCP::MULSUBv4i32_indexed_OP2);
+ break;
}
return Found;
}
@@ -3937,6 +4082,46 @@ bool AArch64InstrInfo::isThroughputPattern(
case MachineCombinerPattern::FMLSv2f64_OP2:
case MachineCombinerPattern::FMLSv4i32_indexed_OP2:
case MachineCombinerPattern::FMLSv4f32_OP2:
+ case MachineCombinerPattern::MULADDv8i8_OP1:
+ case MachineCombinerPattern::MULADDv8i8_OP2:
+ case MachineCombinerPattern::MULADDv16i8_OP1:
+ case MachineCombinerPattern::MULADDv16i8_OP2:
+ case MachineCombinerPattern::MULADDv4i16_OP1:
+ case MachineCombinerPattern::MULADDv4i16_OP2:
+ case MachineCombinerPattern::MULADDv8i16_OP1:
+ case MachineCombinerPattern::MULADDv8i16_OP2:
+ case MachineCombinerPattern::MULADDv2i32_OP1:
+ case MachineCombinerPattern::MULADDv2i32_OP2:
+ case MachineCombinerPattern::MULADDv4i32_OP1:
+ case MachineCombinerPattern::MULADDv4i32_OP2:
+ case MachineCombinerPattern::MULSUBv8i8_OP1:
+ case MachineCombinerPattern::MULSUBv8i8_OP2:
+ case MachineCombinerPattern::MULSUBv16i8_OP1:
+ case MachineCombinerPattern::MULSUBv16i8_OP2:
+ case MachineCombinerPattern::MULSUBv4i16_OP1:
+ case MachineCombinerPattern::MULSUBv4i16_OP2:
+ case MachineCombinerPattern::MULSUBv8i16_OP1:
+ case MachineCombinerPattern::MULSUBv8i16_OP2:
+ case MachineCombinerPattern::MULSUBv2i32_OP1:
+ case MachineCombinerPattern::MULSUBv2i32_OP2:
+ case MachineCombinerPattern::MULSUBv4i32_OP1:
+ case MachineCombinerPattern::MULSUBv4i32_OP2:
+ case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
+ case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
+ case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
+ case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
+ case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
+ case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
+ case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
+ case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
+ case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
+ case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
+ case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
+ case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
+ case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
+ case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
+ case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
+ case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
return true;
} // end switch (Pattern)
return false;
@@ -4040,6 +4225,80 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI,
return MUL;
}
+/// genFusedMultiplyAcc - Helper to generate fused multiply accumulate
+/// instructions.
+///
+/// \see genFusedMultiply
+static MachineInstr *genFusedMultiplyAcc(
+ MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
+ MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
+ unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
+ return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
+ FMAInstKind::Accumulator);
+}
+
+/// genNeg - Helper to generate an intermediate negation of the second operand
+/// of Root
+static Register genNeg(MachineFunction &MF, MachineRegisterInfo &MRI,
+ const TargetInstrInfo *TII, MachineInstr &Root,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ unsigned MnegOpc, const TargetRegisterClass *RC) {
+ Register NewVR = MRI.createVirtualRegister(RC);
+ MachineInstrBuilder MIB =
+ BuildMI(MF, Root.getDebugLoc(), TII->get(MnegOpc), NewVR)
+ .add(Root.getOperand(2));
+ InsInstrs.push_back(MIB);
+
+ assert(InstrIdxForVirtReg.empty());
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+
+ return NewVR;
+}
+
+/// genFusedMultiplyAccNeg - Helper to generate fused multiply accumulate
+/// instructions with an additional negation of the accumulator
+static MachineInstr *genFusedMultiplyAccNeg(
+ MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
+ MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
+ unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
+ assert(IdxMulOpd == 1);
+
+ Register NewVR =
+ genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
+ return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
+ FMAInstKind::Accumulator, &NewVR);
+}
+
+/// genFusedMultiplyIdx - Helper to generate fused multiply accumulate
+/// instructions.
+///
+/// \see genFusedMultiply
+static MachineInstr *genFusedMultiplyIdx(
+ MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
+ MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
+ unsigned IdxMulOpd, unsigned MaddOpc, const TargetRegisterClass *RC) {
+ return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
+ FMAInstKind::Indexed);
+}
+
+/// genFusedMultiplyIdxNeg - Helper to generate fused multiply accumulate
+/// instructions with an additional negation of the accumulator
+static MachineInstr *genFusedMultiplyIdxNeg(
+ MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII,
+ MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg, unsigned IdxMulOpd,
+ unsigned MaddOpc, unsigned MnegOpc, const TargetRegisterClass *RC) {
+ assert(IdxMulOpd == 1);
+
+ Register NewVR =
+ genNeg(MF, MRI, TII, Root, InsInstrs, InstrIdxForVirtReg, MnegOpc, RC);
+
+ return genFusedMultiply(MF, MRI, TII, Root, InsInstrs, IdxMulOpd, MaddOpc, RC,
+ FMAInstKind::Indexed, &NewVR);
+}
+
/// genMaddR - Generate madd instruction and combine mul and add using
/// an extra virtual register
/// Example - an ADD intermediate needs to be stored in a register:
@@ -4279,6 +4538,231 @@ void AArch64InstrInfo::genAlternativeCodeSequence(
}
break;
}
+
+ case MachineCombinerPattern::MULADDv8i8_OP1:
+ Opc = AArch64::MLAv8i8;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv8i8_OP2:
+ Opc = AArch64::MLAv8i8;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv16i8_OP1:
+ Opc = AArch64::MLAv16i8;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv16i8_OP2:
+ Opc = AArch64::MLAv16i8;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv4i16_OP1:
+ Opc = AArch64::MLAv4i16;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv4i16_OP2:
+ Opc = AArch64::MLAv4i16;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv8i16_OP1:
+ Opc = AArch64::MLAv8i16;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv8i16_OP2:
+ Opc = AArch64::MLAv8i16;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv2i32_OP1:
+ Opc = AArch64::MLAv2i32;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv2i32_OP2:
+ Opc = AArch64::MLAv2i32;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv4i32_OP1:
+ Opc = AArch64::MLAv4i32;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv4i32_OP2:
+ Opc = AArch64::MLAv4i32;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::MULSUBv8i8_OP1:
+ Opc = AArch64::MLAv8i8;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i8,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv8i8_OP2:
+ Opc = AArch64::MLSv8i8;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULSUBv16i8_OP1:
+ Opc = AArch64::MLAv16i8;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv16i8,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv16i8_OP2:
+ Opc = AArch64::MLSv16i8;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULSUBv4i16_OP1:
+ Opc = AArch64::MLAv4i16;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv4i16_OP2:
+ Opc = AArch64::MLSv4i16;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULSUBv8i16_OP1:
+ Opc = AArch64::MLAv8i16;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv8i16_OP2:
+ Opc = AArch64::MLSv8i16;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULSUBv2i32_OP1:
+ Opc = AArch64::MLAv2i32;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv2i32_OP2:
+ Opc = AArch64::MLSv2i32;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULSUBv4i32_OP1:
+ Opc = AArch64::MLAv4i32;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAccNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv4i32_OP2:
+ Opc = AArch64::MLSv4i32;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyAcc(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::MULADDv4i16_indexed_OP1:
+ Opc = AArch64::MLAv4i16_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv4i16_indexed_OP2:
+ Opc = AArch64::MLAv4i16_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv8i16_indexed_OP1:
+ Opc = AArch64::MLAv8i16_indexed;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv8i16_indexed_OP2:
+ Opc = AArch64::MLAv8i16_indexed;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv2i32_indexed_OP1:
+ Opc = AArch64::MLAv2i32_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv2i32_indexed_OP2:
+ Opc = AArch64::MLAv2i32_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv4i32_indexed_OP1:
+ Opc = AArch64::MLAv4i32_indexed;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULADDv4i32_indexed_OP2:
+ Opc = AArch64::MLAv4i32_indexed;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
+ case MachineCombinerPattern::MULSUBv4i16_indexed_OP1:
+ Opc = AArch64::MLAv4i16_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i16,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv4i16_indexed_OP2:
+ Opc = AArch64::MLSv4i16_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULSUBv8i16_indexed_OP1:
+ Opc = AArch64::MLAv8i16_indexed;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv8i16,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv8i16_indexed_OP2:
+ Opc = AArch64::MLSv8i16_indexed;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULSUBv2i32_indexed_OP1:
+ Opc = AArch64::MLAv2i32_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv2i32,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv2i32_indexed_OP2:
+ Opc = AArch64::MLSv2i32_indexed;
+ RC = &AArch64::FPR64RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+ case MachineCombinerPattern::MULSUBv4i32_indexed_OP1:
+ Opc = AArch64::MLAv4i32_indexed;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyIdxNeg(MF, MRI, TII, Root, InsInstrs,
+ InstrIdxForVirtReg, 1, Opc, AArch64::NEGv4i32,
+ RC);
+ break;
+ case MachineCombinerPattern::MULSUBv4i32_indexed_OP2:
+ Opc = AArch64::MLSv4i32_indexed;
+ RC = &AArch64::FPR128RegClass;
+ MUL = genFusedMultiplyIdx(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC);
+ break;
+
// Floating Point Support
case MachineCombinerPattern::FMULADDH_OP1:
Opc = AArch64::FMADDHrrr;
@@ -5037,8 +5521,99 @@ AArch64InstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
return 0u;
}
-outliner::OutlinedFunction
-AArch64InstrInfo::getOutliningCandidateInfo(
+static bool
+outliningCandidatesSigningScopeConsensus(const outliner::Candidate &a,
+ const outliner::Candidate &b) {
+ const Function &Fa = a.getMF()->getFunction();
+ const Function &Fb = b.getMF()->getFunction();
+
+  // If neither of the functions has the "sign-return-address" attribute,
+  // their signing behaviour is equal.
+ if (!Fa.hasFnAttribute("sign-return-address") &&
+ !Fb.hasFnAttribute("sign-return-address")) {
+ return true;
+ }
+
+  // If both functions have the "sign-return-address" attribute, their
+  // signing behaviour is equal if the values of the attributes are equal.
+ if (Fa.hasFnAttribute("sign-return-address") &&
+ Fb.hasFnAttribute("sign-return-address")) {
+ StringRef ScopeA =
+ Fa.getFnAttribute("sign-return-address").getValueAsString();
+ StringRef ScopeB =
+ Fb.getFnAttribute("sign-return-address").getValueAsString();
+ return ScopeA.equals(ScopeB);
+ }
+
+ // If function B doesn't have the "sign-return-address" attribute but A does,
+ // the functions' signing behaviour is equal if A's value for
+ // "sign-return-address" is "none" and vice versa.
+ if (Fa.hasFnAttribute("sign-return-address")) {
+ StringRef ScopeA =
+ Fa.getFnAttribute("sign-return-address").getValueAsString();
+ return ScopeA.equals("none");
+ }
+
+ if (Fb.hasFnAttribute("sign-return-address")) {
+ StringRef ScopeB =
+ Fb.getFnAttribute("sign-return-address").getValueAsString();
+ return ScopeB.equals("none");
+ }
+
+ llvm_unreachable("Unkown combination of sign-return-address attributes");
+}
+
+static bool
+outliningCandidatesSigningKeyConsensus(const outliner::Candidate &a,
+ const outliner::Candidate &b) {
+ const Function &Fa = a.getMF()->getFunction();
+ const Function &Fb = b.getMF()->getFunction();
+
+  // If neither of the functions has the "sign-return-address-key" attribute,
+  // their keys are equal.
+ if (!Fa.hasFnAttribute("sign-return-address-key") &&
+ !Fb.hasFnAttribute("sign-return-address-key")) {
+ return true;
+ }
+
+  // If both functions have the "sign-return-address-key" attribute, their
+  // keys are equal if the values of "sign-return-address-key" are equal.
+ if (Fa.hasFnAttribute("sign-return-address-key") &&
+ Fb.hasFnAttribute("sign-return-address-key")) {
+ StringRef KeyA =
+ Fa.getFnAttribute("sign-return-address-key").getValueAsString();
+ StringRef KeyB =
+ Fb.getFnAttribute("sign-return-address-key").getValueAsString();
+ return KeyA.equals(KeyB);
+ }
+
+  // If B doesn't have the "sign-return-address-key" attribute, the keys are
+  // equal if function A has the default key (a_key).
+ if (Fa.hasFnAttribute("sign-return-address-key")) {
+ StringRef KeyA =
+ Fa.getFnAttribute("sign-return-address-key").getValueAsString();
+ return KeyA.equals_lower("a_key");
+ }
+
+ if (Fb.hasFnAttribute("sign-return-address-key")) {
+ StringRef KeyB =
+ Fb.getFnAttribute("sign-return-address-key").getValueAsString();
+ return KeyB.equals_lower("a_key");
+ }
+
+ llvm_unreachable("Unkown combination of sign-return-address-key attributes");
+}
+
+static bool outliningCandidatesV8_3OpsConsensus(const outliner::Candidate &a,
+ const outliner::Candidate &b) {
+ const AArch64Subtarget &SubtargetA =
+ a.getMF()->getSubtarget<AArch64Subtarget>();
+ const AArch64Subtarget &SubtargetB =
+ b.getMF()->getSubtarget<AArch64Subtarget>();
+ return SubtargetA.hasV8_3aOps() == SubtargetB.hasV8_3aOps();
+}
+
+outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
unsigned SequenceSize =
@@ -5046,12 +5621,115 @@ AArch64InstrInfo::getOutliningCandidateInfo(
[this](unsigned Sum, const MachineInstr &MI) {
return Sum + getInstSizeInBytes(MI);
});
+ unsigned NumBytesToCreateFrame = 0;
+
+ // We only allow outlining for functions having exactly matching return
+ // address signing attributes, i.e., all share the same value for the
+ // attribute "sign-return-address" and all share the same type of key they
+ // are signed with.
+  // Additionally we require all functions to simultaneously either support
+ // v8.3a features or not. Otherwise an outlined function could get signed
+ // using dedicated v8.3 instructions and a call from a function that doesn't
+ // support v8.3 instructions would therefore be invalid.
+ if (std::adjacent_find(
+ RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
+ [](const outliner::Candidate &a, const outliner::Candidate &b) {
+ // Return true if a and b are non-equal w.r.t. return address
+ // signing or support of v8.3a features
+ if (outliningCandidatesSigningScopeConsensus(a, b) &&
+ outliningCandidatesSigningKeyConsensus(a, b) &&
+ outliningCandidatesV8_3OpsConsensus(a, b)) {
+ return false;
+ }
+ return true;
+ }) != RepeatedSequenceLocs.end()) {
+ return outliner::OutlinedFunction();
+ }
+
+  // Since at this point all candidates agree on their return address signing,
+ // picking just one is fine. If the candidate functions potentially sign their
+ // return addresses, the outlined function should do the same. Note that in
+ // the case of "sign-return-address"="non-leaf" this is an assumption: It is
+ // not certainly true that the outlined function will have to sign its return
+ // address but this decision is made later, when the decision to outline
+ // has already been made.
+ // The same holds for the number of additional instructions we need: On
+ // v8.3a RET can be replaced by RETAA/RETAB and no AUT instruction is
+ // necessary. However, at this point we don't know if the outlined function
+ // will have a RET instruction so we assume the worst.
+ const Function &FCF = FirstCand.getMF()->getFunction();
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ if (FCF.hasFnAttribute("sign-return-address")) {
+ // One PAC and one AUT instructions
+ NumBytesToCreateFrame += 8;
+
+ // We have to check if sp modifying instructions would get outlined.
+ // If so we only allow outlining if sp is unchanged overall, so matching
+ // sub and add instructions are okay to outline, all other sp modifications
+ // are not
+ auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) {
+ int SPValue = 0;
+ MachineBasicBlock::iterator MBBI = C.front();
+ for (;;) {
+ if (MBBI->modifiesRegister(AArch64::SP, &TRI)) {
+ switch (MBBI->getOpcode()) {
+ case AArch64::ADDXri:
+ case AArch64::ADDWri:
+ assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
+ assert(MBBI->getOperand(2).isImm() &&
+ "Expected operand to be immediate");
+ assert(MBBI->getOperand(1).isReg() &&
+ "Expected operand to be a register");
+ // Check if the add just increments sp. If so, we search for
+ // matching sub instructions that decrement sp. If not, the
+ // modification is illegal
+ if (MBBI->getOperand(1).getReg() == AArch64::SP)
+ SPValue += MBBI->getOperand(2).getImm();
+ else
+ return true;
+ break;
+ case AArch64::SUBXri:
+ case AArch64::SUBWri:
+ assert(MBBI->getNumOperands() == 4 && "Wrong number of operands");
+ assert(MBBI->getOperand(2).isImm() &&
+ "Expected operand to be immediate");
+ assert(MBBI->getOperand(1).isReg() &&
+ "Expected operand to be a register");
+ // Check if the sub just decrements sp. If so, we search for
+ // matching add instructions that increment sp. If not, the
+ // modification is illegal
+ if (MBBI->getOperand(1).getReg() == AArch64::SP)
+ SPValue -= MBBI->getOperand(2).getImm();
+ else
+ return true;
+ break;
+ default:
+ return true;
+ }
+ }
+ if (MBBI == C.back())
+ break;
+ ++MBBI;
+ }
+ if (SPValue)
+ return true;
+ return false;
+ };
+ // Remove candidates with illegal stack modifying instructions
+ RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
+ RepeatedSequenceLocs.end(),
+ hasIllegalSPModification),
+ RepeatedSequenceLocs.end());
+
+ // If the sequence doesn't have enough candidates left, then we're done.
+ if (RepeatedSequenceLocs.size() < 2)
+ return outliner::OutlinedFunction();
+ }
// Properties about candidate MBBs that hold for all of them.
unsigned FlagsSetInAll = 0xF;
// Compute liveness information for each candidate, and set FlagsSetInAll.
- const TargetRegisterInfo &TRI = getRegisterInfo();
std::for_each(RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
[&FlagsSetInAll](outliner::Candidate &C) {
FlagsSetInAll &= C.Flags;
@@ -5107,7 +5785,7 @@ AArch64InstrInfo::getOutliningCandidateInfo(
};
unsigned FrameID = MachineOutlinerDefault;
- unsigned NumBytesToCreateFrame = 4;
+ NumBytesToCreateFrame += 4;
bool HasBTI = any_of(RepeatedSequenceLocs, [](outliner::Candidate &C) {
return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
@@ -5190,11 +5868,21 @@ AArch64InstrInfo::getOutliningCandidateInfo(
unsigned NumBytesNoStackCalls = 0;
std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
+ // Check if we have to save LR.
for (outliner::Candidate &C : RepeatedSequenceLocs) {
C.initLRU(TRI);
+ // If we have a noreturn caller, then we're going to be conservative and
+ // say that we have to save LR. If we don't have a ret at the end of the
+ // block, then we can't reason about liveness accurately.
+ //
+ // FIXME: We can probably do better than always disabling this in
+ // noreturn functions by fixing up the liveness info.
+ bool IsNoReturn =
+ C.getMF()->getFunction().hasFnAttribute(Attribute::NoReturn);
+
// Is LR available? If so, we don't need a save.
- if (C.LRU.available(AArch64::LR)) {
+ if (C.LRU.available(AArch64::LR) && !IsNoReturn) {
NumBytesNoStackCalls += 4;
C.setCallInfo(MachineOutlinerNoLRSave, 4);
CandidatesWithoutStackFixups.push_back(C);
@@ -5376,6 +6064,19 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
MachineFunction *MF = MBB->getParent();
AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+ // Don't outline anything used for return address signing. The outlined
+ // function will get signed later if needed
+ switch (MI.getOpcode()) {
+ case AArch64::PACIASP:
+ case AArch64::PACIBSP:
+ case AArch64::AUTIASP:
+ case AArch64::AUTIBSP:
+ case AArch64::RETAA:
+ case AArch64::RETAB:
+ case AArch64::EMITBKEY:
+ return outliner::InstrType::Illegal;
+ }
+
// Don't outline LOHs.
if (FuncInfo->getLOHRelated().count(&MI))
return outliner::InstrType::Illegal;
@@ -5528,6 +6229,59 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
}
}
+static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
+ bool ShouldSignReturnAddr,
+ bool ShouldSignReturnAddrWithAKey) {
+ if (ShouldSignReturnAddr) {
+ MachineBasicBlock::iterator MBBPAC = MBB.begin();
+ MachineBasicBlock::iterator MBBAUT = MBB.getFirstTerminator();
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL;
+
+ if (MBBAUT != MBB.end())
+ DL = MBBAUT->getDebugLoc();
+
+ // At the very beginning of the basic block we insert the following
+ // depending on the key type
+ //
+ // a_key: b_key:
+ // PACIASP EMITBKEY
+ // CFI_INSTRUCTION PACIBSP
+ // CFI_INSTRUCTION
+ if (ShouldSignReturnAddrWithAKey) {
+ BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIASP))
+ .setMIFlag(MachineInstr::FrameSetup);
+ } else {
+ BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::EMITBKEY))
+ .setMIFlag(MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::PACIBSP))
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createNegateRAState(nullptr));
+ BuildMI(MBB, MBBPAC, DebugLoc(), TII->get(AArch64::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
+
+ // If v8.3a features are available we can replace a RET instruction by
+ // RETAA or RETAB and omit the AUT instructions
+ if (Subtarget.hasV8_3aOps() && MBBAUT != MBB.end() &&
+ MBBAUT->getOpcode() == AArch64::RET) {
+ BuildMI(MBB, MBBAUT, DL,
+ TII->get(ShouldSignReturnAddrWithAKey ? AArch64::RETAA
+ : AArch64::RETAB))
+ .copyImplicitOps(*MBBAUT);
+ MBB.erase(MBBAUT);
+ } else {
+ BuildMI(MBB, MBBAUT, DL,
+ TII->get(ShouldSignReturnAddrWithAKey ? AArch64::AUTIASP
+ : AArch64::AUTIBSP))
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+ }
+}
+
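With A-key signing, the outlined function body therefore ends up shaped roughly
as follows (a sketch for orientation only, not compiler output; the B-key
variant emits EMITBKEY before PACIBSP and authenticates with AUTIBSP or RETAB):

  // PACIASP                          (sign LR on entry)
  // CFI_INSTRUCTION negate_ra_state
  //   ... outlined instructions ...
  // AUTIASP followed by RET          (or a single RETAA on v8.3a targets)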
void AArch64InstrInfo::buildOutlinedFrame(
MachineBasicBlock &MBB, MachineFunction &MF,
const outliner::OutlinedFunction &OF) const {
@@ -5543,16 +6297,19 @@ void AArch64InstrInfo::buildOutlinedFrame(
TailOpcode = AArch64::TCRETURNriALL;
}
MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
- .add(Call->getOperand(0))
- .addImm(0);
+ .add(Call->getOperand(0))
+ .addImm(0);
MBB.insert(MBB.end(), TC);
Call->eraseFromParent();
}
+ bool IsLeafFunction = true;
+
// Is there a call in the outlined range?
- auto IsNonTailCall = [](MachineInstr &MI) {
+ auto IsNonTailCall = [](const MachineInstr &MI) {
return MI.isCall() && !MI.isReturn();
};
+
if (std::any_of(MBB.instr_begin(), MBB.instr_end(), IsNonTailCall)) {
// Fix up the instructions in the range, since we're going to modify the
// stack.
@@ -5560,6 +6317,8 @@ void AArch64InstrInfo::buildOutlinedFrame(
"Can only fix up stack references once");
fixupPostOutline(MBB);
+ IsLeafFunction = false;
+
// LR has to be a live in so that we can save it.
MBB.addLiveIn(AArch64::LR);
@@ -5606,16 +6365,47 @@ void AArch64InstrInfo::buildOutlinedFrame(
Et = MBB.insert(Et, LDRXpost);
}
+ // All candidates that reach this point must agree on their return address
+ // signing, so it is enough to consider the signing behaviour of just one of
+ // them.
+ const Function &CF = OF.Candidates.front().getMF()->getFunction();
+ bool ShouldSignReturnAddr = false;
+ if (CF.hasFnAttribute("sign-return-address")) {
+ StringRef Scope =
+ CF.getFnAttribute("sign-return-address").getValueAsString();
+ if (Scope.equals("all"))
+ ShouldSignReturnAddr = true;
+ else if (Scope.equals("non-leaf") && !IsLeafFunction)
+ ShouldSignReturnAddr = true;
+ }
+
+ // a_key is the default
+ bool ShouldSignReturnAddrWithAKey = true;
+ if (CF.hasFnAttribute("sign-return-address-key")) {
+ const StringRef Key =
+ CF.getFnAttribute("sign-return-address-key").getValueAsString();
+ // The key can be either a_key or b_key.
+ assert((Key.equals_lower("a_key") || Key.equals_lower("b_key")) &&
+ "Return address signing key must be either a_key or b_key");
+ ShouldSignReturnAddrWithAKey = Key.equals_lower("a_key");
+ }
+
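Put differently, the attribute values map onto the signing decision roughly as
follows (an illustrative summary of the checks above):

  //   "sign-return-address"="all"                          -> always sign
  //   "sign-return-address"="non-leaf", call in the body   -> sign
  //   "sign-return-address" absent                         -> never sign
  //   "sign-return-address-key"="b_key"                    -> PACIBSP/AUTIBSP
  //   key attribute absent or "a_key"                      -> PACIASP/AUTIASP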
// If this is a tail call outlined function, then there's already a return.
if (OF.FrameConstructionID == MachineOutlinerTailCall ||
- OF.FrameConstructionID == MachineOutlinerThunk)
+ OF.FrameConstructionID == MachineOutlinerThunk) {
+ signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
+ ShouldSignReturnAddrWithAKey);
return;
+ }
// It's not a tail call, so we have to insert the return ourselves.
MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
.addReg(AArch64::LR, RegState::Undef);
MBB.insert(MBB.end(), ret);
+ signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
+ ShouldSignReturnAddrWithAKey);
+
// Did we have to modify the stack by saving the link register?
if (OF.FrameConstructionID != MachineOutlinerDefault)
return;
@@ -5702,29 +6492,126 @@ bool AArch64InstrInfo::shouldOutlineFromFunctionByDefault(
return MF.getFunction().hasMinSize();
}
-bool AArch64InstrInfo::isCopyInstrImpl(
- const MachineInstr &MI, const MachineOperand *&Source,
- const MachineOperand *&Destination) const {
+Optional<DestSourcePair>
+AArch64InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
// AArch64::ORRWrs and AArch64::ORRXrs with the WZR/XZR register and a zero
// immediate operand are used as aliases for the mov instruction.
if (MI.getOpcode() == AArch64::ORRWrs &&
MI.getOperand(1).getReg() == AArch64::WZR &&
MI.getOperand(3).getImm() == 0x0) {
- Destination = &MI.getOperand(0);
- Source = &MI.getOperand(2);
- return true;
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
}
if (MI.getOpcode() == AArch64::ORRXrs &&
MI.getOperand(1).getReg() == AArch64::XZR &&
MI.getOperand(3).getImm() == 0x0) {
- Destination = &MI.getOperand(0);
- Source = &MI.getOperand(2);
- return true;
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(2)};
}
- return false;
+ return None;
+}
+
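For example, the register move "mov w0, w1" is encoded as an ORR with the zero
register, so the hook resolves it as a copy (a worked example, registers chosen
arbitrarily):

  //   $w0 = ORRWrs $wzr, $w1, 0   ->  DestSourcePair{Destination=$w0, Source=$w1}
  //   $x2 = ORRXrs $xzr, $x5, 0   ->  DestSourcePair{Destination=$x2, Source=$x5}
  //   any other instruction       ->  None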
+Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
+ Register Reg) const {
+ int Sign = 1;
+ int64_t Offset = 0;
+
+ // TODO: Handle cases where Reg is a super- or sub-register of the
+ // destination register.
+ if (Reg != MI.getOperand(0).getReg())
+ return None;
+
+ switch (MI.getOpcode()) {
+ default:
+ return None;
+ case AArch64::SUBWri:
+ case AArch64::SUBXri:
+ case AArch64::SUBSWri:
+ case AArch64::SUBSXri:
+ Sign *= -1;
+ LLVM_FALLTHROUGH;
+ case AArch64::ADDSWri:
+ case AArch64::ADDSXri:
+ case AArch64::ADDWri:
+ case AArch64::ADDXri: {
+ // TODO: Third operand can be global address (usually some string).
+ if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
+ !MI.getOperand(2).isImm())
+ return None;
+ Offset = MI.getOperand(2).getImm() * Sign;
+ int Shift = MI.getOperand(3).getImm();
+ assert((Shift == 0 || Shift == 12) && "Shift can be either 0 or 12");
+ Offset = Offset << Shift;
+ }
+ }
+ return RegImmPair{MI.getOperand(1).getReg(), Offset};
+}
+
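The sign and shift handling above can be pictured with a small standalone
sketch; addSubImmOffset below is a hypothetical helper written purely for
illustration and is not part of LLVM:

#include <cstdint>

// SUB* variants negate the immediate, and the optional shift operand on
// ADD/SUB (immediate) is either 0 or 12.
static int64_t addSubImmOffset(bool IsSub, uint64_t Imm, unsigned Shift) {
  int64_t Off = static_cast<int64_t>(Imm << Shift);
  return IsSub ? -Off : Off;
}

// e.g. "sub x0, x1, #1, lsl #12" makes isAddImmediate(MI, x0) report {x1, -4096},
// which addSubImmOffset(/*IsSub=*/true, 1, 12) reproduces.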
+/// If the given ORR instruction is a copy, and \p DescribedReg overlaps with
+/// the destination register then, if possible, describe the value in terms of
+/// the source register.
+static Optional<ParamLoadedValue>
+describeORRLoadedValue(const MachineInstr &MI, Register DescribedReg,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ auto DestSrc = TII->isCopyInstr(MI);
+ if (!DestSrc)
+ return None;
+
+ Register DestReg = DestSrc->Destination->getReg();
+ Register SrcReg = DestSrc->Source->getReg();
+
+ auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
+
+ // If the described register is the destination, just return the source.
+ if (DestReg == DescribedReg)
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
+
+ // ORRWrs zero-extends to 64 bits, so we need to consider such cases.
+ if (MI.getOpcode() == AArch64::ORRWrs &&
+ TRI->isSuperRegister(DestReg, DescribedReg))
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
+
+ // We may need to describe the lower part of an ORRXrs move.
+ if (MI.getOpcode() == AArch64::ORRXrs &&
+ TRI->isSubRegister(DestReg, DescribedReg)) {
+ Register SrcSubReg = TRI->getSubReg(SrcReg, AArch64::sub_32);
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
+ }
+
+ assert(!TRI->isSuperOrSubRegisterEq(DestReg, DescribedReg) &&
+ "Unhandled ORR[XW]rs copy case");
+
+ return None;
+}
+
+Optional<ParamLoadedValue>
+AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const {
+ const MachineFunction *MF = MI.getMF();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ switch (MI.getOpcode()) {
+ case AArch64::MOVZWi:
+ case AArch64::MOVZXi: {
+ // MOVZWi may be used for producing zero-extended 32-bit immediates in
+ // 64-bit parameters, so we need to consider super-registers.
+ if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
+ return None;
+
+ if (!MI.getOperand(1).isImm())
+ return None;
+ int64_t Immediate = MI.getOperand(1).getImm();
+ int Shift = MI.getOperand(2).getImm();
+ return ParamLoadedValue(MachineOperand::CreateImm(Immediate << Shift),
+ nullptr);
+ }
+ case AArch64::ORRWrs:
+ case AArch64::ORRXrs:
+ return describeORRLoadedValue(MI, Reg, this, TRI);
+ }
+
+ return TargetInstrInfo::describeLoadedValue(MI, Reg);
}
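A few worked examples of what the hook reports (registers chosen arbitrarily;
only MOVZ immediates and ORR copies are handled specially, everything else
defers to the generic TargetInstrInfo implementation):

  //   $w0 = MOVZWi 42, 0         : describeLoadedValue(MI, $x0) -> immediate 42
  //   $w0 = ORRWrs $wzr, $w1, 0  : describeLoadedValue(MI, $x0) -> register $w1
  //   $x0 = ORRXrs $xzr, $x1, 0  : describeLoadedValue(MI, $w0) -> register $w1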
#define GET_INSTRINFO_HELPERS
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 1688045e4fb8..66e517e54903 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -89,6 +89,12 @@ public:
/// if there is a corresponding unscaled variant available.
static Optional<unsigned> getUnscaledLdSt(unsigned Opc);
+ /// Scaling factor for (scaled or unscaled) load or store.
+ static int getMemScale(unsigned Opc);
+ static int getMemScale(const MachineInstr &MI) {
+ return getMemScale(MI.getOpcode());
+ }
+
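The returned value is the access size in bytes, for example (values taken from
the opcode table this replaces in AArch64LoadStoreOptimizer.cpp):

  //   getMemScale(AArch64::LDRBBui) == 1
  //   getMemScale(AArch64::STRWui)  == 4
  //   getMemScale(AArch64::LDPQi)   == 16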
/// Returns the index for the immediate for a given instruction.
static unsigned getLoadStoreImmIdx(unsigned Opc);
@@ -131,15 +137,15 @@ public:
unsigned NumLoads) const override;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
- bool KillSrc, unsigned Opcode,
+ const DebugLoc &DL, MCRegister DestReg,
+ MCRegister SrcReg, bool KillSrc, unsigned Opcode,
llvm::ArrayRef<unsigned> Indices) const;
void copyGPRRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
DebugLoc DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc, unsigned Opcode, unsigned ZeroReg,
llvm::ArrayRef<unsigned> Indices) const;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
@@ -265,15 +271,21 @@ public:
/// on Windows.
static bool isSEHInstruction(const MachineInstr &MI);
+ Optional<RegImmPair> isAddImmediate(const MachineInstr &MI,
+ Register Reg) const override;
+
+ Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const override;
+
#define GET_INSTRINFO_HELPER_DECLS
#include "AArch64GenInstrInfo.inc"
protected:
- /// If the specific machine instruction is a instruction that moves/copies
- /// value from one register to another register return true along with
- /// @Source machine operand and @Destination machine operand.
- bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source,
- const MachineOperand *&Destination) const override;
+ /// If the specific machine instruction is an instruction that moves/copies
+ /// a value from one register to another, return the destination and source
+ /// registers as machine operands.
+ Optional<DestSourcePair>
+ isCopyInstrImpl(const MachineInstr &MI) const override;
private:
/// Sets the offsets on outlined instructions in \p MBB which use SP
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 1981bd5d3bf0..d590d4d913ff 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -214,6 +214,7 @@ def SDT_AArch64FCmp : SDTypeProfile<0, 2,
SDTCisSameAs<0, 1>]>;
def SDT_AArch64Dup : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
def SDT_AArch64DupLane : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<2>]>;
+def SDT_AArch64Insr : SDTypeProfile<1, 2, [SDTCisVec<0>]>;
def SDT_AArch64Zip : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0, 1>,
SDTCisSameAs<0, 2>]>;
@@ -242,6 +243,9 @@ def SDT_AArch64ITOF : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>;
def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
SDTCisPtrTy<1>]>;
+def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+
// Generates the general dynamic sequences, i.e.
// adrp x0, :tlsdesc:var
// ldr x1, [x0, #:tlsdesc_lo12:var]
@@ -259,6 +263,110 @@ def SDT_AArch64WrapperLarge : SDTypeProfile<1, 4,
SDTCisSameAs<1, 2>, SDTCisSameAs<1, 3>,
SDTCisSameAs<1, 4>]>;
+def SDT_AArch64TBL : SDTypeProfile<1, 2, [
+ SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisInt<2>
+]>;
+
+// non-extending masked load fragment.
+def nonext_masked_load :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+ !cast<MaskedLoadSDNode>(N)->isNonTemporal();
+}]>;
+// sign extending masked load fragments.
+def asext_masked_load :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (masked_ld node:$ptr, undef, node:$pred, node:$def),[{
+ return (cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::EXTLOAD ||
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::SEXTLOAD) &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
+}]>;
+def asext_masked_load_i8 :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (asext_masked_load node:$ptr, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def asext_masked_load_i16 :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (asext_masked_load node:$ptr, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def asext_masked_load_i32 :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (asext_masked_load node:$ptr, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+// zero extending masked load fragments.
+def zext_masked_load :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::ZEXTLOAD &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
+}]>;
+def zext_masked_load_i8 :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (zext_masked_load node:$ptr, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def zext_masked_load_i16 :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (zext_masked_load node:$ptr, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def zext_masked_load_i32 :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (zext_masked_load node:$ptr, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def non_temporal_load :
+ PatFrag<(ops node:$ptr, node:$pred, node:$def),
+ (masked_ld node:$ptr, undef, node:$pred, node:$def), [{
+ return cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed() &&
+ cast<MaskedLoadSDNode>(N)->isNonTemporal();
+}]>;
+
+// non-truncating masked store fragment.
+def nontrunc_masked_store :
+ PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+ return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+ !cast<MaskedStoreSDNode>(N)->isNonTemporal();
+}]>;
+// truncating masked store fragments.
+def trunc_masked_store :
+ PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
+}]>;
+def trunc_masked_store_i8 :
+ PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+}]>;
+def trunc_masked_store_i16 :
+ PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+}]>;
+def trunc_masked_store_i32 :
+ PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (trunc_masked_store node:$val, node:$ptr, node:$pred), [{
+ return cast<MaskedStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+}]>;
+
+def non_temporal_store :
+ PatFrag<(ops node:$val, node:$ptr, node:$pred),
+ (masked_st node:$val, node:$ptr, undef, node:$pred), [{
+ return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed() &&
+ cast<MaskedStoreSDNode>(N)->isNonTemporal();
+}]>;
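Informally, the semantics these fragments select between can be pictured as a
per-lane scalar loop; the sketch below is a hypothetical C++ helper written
only to illustrate what a zero-extending masked load of i8 data into 32-bit
lanes does, and is unrelated to the TableGen definitions themselves:

#include <cstdint>

// Active lanes load and zero-extend the i8 element; inactive lanes keep the
// pass-through value supplied to the masked load.
static void maskedLoadZExt8To32(uint32_t *Dst, const uint8_t *Src,
                                const bool *Pred, const uint32_t *PassThru,
                                int NumLanes) {
  for (int I = 0; I < NumLanes; ++I)
    Dst[I] = Pred[I] ? static_cast<uint32_t>(Src[I]) : PassThru[I];
}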
// Node definitions.
def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>;
@@ -319,6 +427,8 @@ def AArch64duplane16 : SDNode<"AArch64ISD::DUPLANE16", SDT_AArch64DupLane>;
def AArch64duplane32 : SDNode<"AArch64ISD::DUPLANE32", SDT_AArch64DupLane>;
def AArch64duplane64 : SDNode<"AArch64ISD::DUPLANE64", SDT_AArch64DupLane>;
+def AArch64insr : SDNode<"AArch64ISD::INSR", SDT_AArch64Insr>;
+
def AArch64zip1 : SDNode<"AArch64ISD::ZIP1", SDT_AArch64Zip>;
def AArch64zip2 : SDNode<"AArch64ISD::ZIP2", SDT_AArch64Zip>;
def AArch64uzp1 : SDNode<"AArch64ISD::UZP1", SDT_AArch64Zip>;
@@ -432,6 +542,11 @@ def AArch64sunpklo : SDNode<"AArch64ISD::SUNPKLO", SDT_AArch64unpk>;
def AArch64uunpkhi : SDNode<"AArch64ISD::UUNPKHI", SDT_AArch64unpk>;
def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
+def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
+def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
+
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
@@ -441,10 +556,10 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
// the Function object through the <Target>Subtarget and objections were raised
// to that (see post-commit review comments for r301750).
let RecomputePerFunction = 1 in {
- def ForCodeSize : Predicate<"MF->getFunction().hasOptSize()">;
- def NotForCodeSize : Predicate<"!MF->getFunction().hasOptSize()">;
+ def ForCodeSize : Predicate<"shouldOptForSize(MF)">;
+ def NotForCodeSize : Predicate<"!shouldOptForSize(MF)">;
// Avoid generating STRQro if it is slow, unless we're optimizing for code size.
- def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || MF->getFunction().hasOptSize()">;
+ def UseSTRQro : Predicate<"!Subtarget->isSTRQroSlow() || shouldOptForSize(MF)">;
def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
@@ -675,34 +790,81 @@ defm FCADD : SIMDThreeSameVectorComplexHSD<1, 0b111, complexrotateopodd,
defm FCMLA : SIMDIndexedTiedComplexHSD<1, 0, 1, complexrotateop, "fcmla",
null_frag>;
+let Predicates = [HasComplxNum, HasNEON, HasFullFP16] in {
+ def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot90 (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+ (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 0))>;
+ def : Pat<(v4f16 (int_aarch64_neon_vcadd_rot270 (v4f16 V64:$Rn), (v4f16 V64:$Rm))),
+ (FCADDv4f16 (v4f16 V64:$Rn), (v4f16 V64:$Rm), (i32 1))>;
+ def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot90 (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+ (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 0))>;
+ def : Pat<(v8f16 (int_aarch64_neon_vcadd_rot270 (v8f16 V128:$Rn), (v8f16 V128:$Rm))),
+ (FCADDv8f16 (v8f16 V128:$Rn), (v8f16 V128:$Rm), (i32 1))>;
+}
+let Predicates = [HasComplxNum, HasNEON] in {
+ def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot90 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 0))>;
+ def : Pat<(v2f32 (int_aarch64_neon_vcadd_rot270 (v2f32 V64:$Rn), (v2f32 V64:$Rm))),
+ (FCADDv2f32 (v2f32 V64:$Rn), (v2f32 V64:$Rm), (i32 1))>;
+ foreach Ty = [v4f32, v2f64] in {
+ def : Pat<(Ty (int_aarch64_neon_vcadd_rot90 (Ty V128:$Rn), (Ty V128:$Rm))),
+ (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 0))>;
+ def : Pat<(Ty (int_aarch64_neon_vcadd_rot270 (Ty V128:$Rn), (Ty V128:$Rm))),
+ (!cast<Instruction>("FCADD"#Ty) (Ty V128:$Rn), (Ty V128:$Rm), (i32 1))>;
+ }
+}
+
// v8.3a Pointer Authentication
// These instructions inhabit part of the hint space and so can be used for
-// armv8 targets
+// armv8 targets. Keeping the old HINT mnemonic when compiling without PA is
+// important for compatibility with other assemblers (e.g. GAS) when building
+// software compatible with both CPUs that do or don't implement PA.
let Uses = [LR], Defs = [LR] in {
- def PACIAZ : SystemNoOperands<0b000, "paciaz">;
- def PACIBZ : SystemNoOperands<0b010, "pacibz">;
- def AUTIAZ : SystemNoOperands<0b100, "autiaz">;
- def AUTIBZ : SystemNoOperands<0b110, "autibz">;
+ def PACIAZ : SystemNoOperands<0b000, "hint #24">;
+ def PACIBZ : SystemNoOperands<0b010, "hint #26">;
+ let isAuthenticated = 1 in {
+ def AUTIAZ : SystemNoOperands<0b100, "hint #28">;
+ def AUTIBZ : SystemNoOperands<0b110, "hint #30">;
+ }
}
let Uses = [LR, SP], Defs = [LR] in {
- def PACIASP : SystemNoOperands<0b001, "paciasp">;
- def PACIBSP : SystemNoOperands<0b011, "pacibsp">;
- def AUTIASP : SystemNoOperands<0b101, "autiasp">;
- def AUTIBSP : SystemNoOperands<0b111, "autibsp">;
+ def PACIASP : SystemNoOperands<0b001, "hint #25">;
+ def PACIBSP : SystemNoOperands<0b011, "hint #27">;
+ let isAuthenticated = 1 in {
+ def AUTIASP : SystemNoOperands<0b101, "hint #29">;
+ def AUTIBSP : SystemNoOperands<0b111, "hint #31">;
+ }
}
let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
- def PACIA1716 : SystemNoOperands<0b000, "pacia1716">;
- def PACIB1716 : SystemNoOperands<0b010, "pacib1716">;
- def AUTIA1716 : SystemNoOperands<0b100, "autia1716">;
- def AUTIB1716 : SystemNoOperands<0b110, "autib1716">;
+ def PACIA1716 : SystemNoOperands<0b000, "hint #8">;
+ def PACIB1716 : SystemNoOperands<0b010, "hint #10">;
+ let isAuthenticated = 1 in {
+ def AUTIA1716 : SystemNoOperands<0b100, "hint #12">;
+ def AUTIB1716 : SystemNoOperands<0b110, "hint #14">;
+ }
}
let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
- def XPACLRI : SystemNoOperands<0b111, "xpaclri">;
+ def XPACLRI : SystemNoOperands<0b111, "hint #7">;
}
-// These pointer authentication isntructions require armv8.3a
+// These pointer authentication instructions require armv8.3a
let Predicates = [HasPA] in {
+
+ // When compiling with PA, there is a better mnemonic for these instructions.
+ def : InstAlias<"paciaz", (PACIAZ), 1>;
+ def : InstAlias<"pacibz", (PACIBZ), 1>;
+ def : InstAlias<"autiaz", (AUTIAZ), 1>;
+ def : InstAlias<"autibz", (AUTIBZ), 1>;
+ def : InstAlias<"paciasp", (PACIASP), 1>;
+ def : InstAlias<"pacibsp", (PACIBSP), 1>;
+ def : InstAlias<"autiasp", (AUTIASP), 1>;
+ def : InstAlias<"autibsp", (AUTIBSP), 1>;
+ def : InstAlias<"pacia1716", (PACIA1716), 1>;
+ def : InstAlias<"pacib1716", (PACIB1716), 1>;
+ def : InstAlias<"autia1716", (AUTIA1716), 1>;
+ def : InstAlias<"autib1716", (AUTIB1716), 1>;
+ def : InstAlias<"xpaclri", (XPACLRI), 1>;
+
multiclass SignAuth<bits<3> prefix, bits<3> prefix_z, string asm> {
def IA : SignAuthOneData<prefix, 0b00, !strconcat(asm, "ia")>;
def IB : SignAuthOneData<prefix, 0b01, !strconcat(asm, "ib")>;
@@ -1478,6 +1640,8 @@ def : Pat<(ctlz (or (shl (xor (sra GPR32:$Rn, (i64 31)), GPR32:$Rn), (i64 1)),
def : Pat<(ctlz (or (shl (xor (sra GPR64:$Rn, (i64 63)), GPR64:$Rn), (i64 1)),
(i64 1))),
(CLSXr GPR64:$Rn)>;
+def : Pat<(int_aarch64_cls GPR32:$Rn), (CLSWr GPR32:$Rn)>;
+def : Pat<(int_aarch64_cls64 GPR64:$Rm), (EXTRACT_SUBREG (CLSXr GPR64:$Rm), sub_32)>;
// Unlike the other one operand instructions, the instructions with the "rev"
// mnemonic do *not* just differ in the size bit, but actually use different
@@ -1859,6 +2023,9 @@ defm LDNPS : LoadPairNoAlloc<0b00, 1, FPR32Op, simm7s4, "ldnp">;
defm LDNPD : LoadPairNoAlloc<0b01, 1, FPR64Op, simm7s8, "ldnp">;
defm LDNPQ : LoadPairNoAlloc<0b10, 1, FPR128Op, simm7s16, "ldnp">;
+def : Pat<(AArch64ldp (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+ (LDPXi GPR64sp:$Rn, simm7s8:$offset)>;
+
//---
// (register offset)
//---
@@ -2552,6 +2719,9 @@ defm STNPS : StorePairNoAlloc<0b00, 1, FPR32Op, simm7s4, "stnp">;
defm STNPD : StorePairNoAlloc<0b01, 1, FPR64Op, simm7s8, "stnp">;
defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
+def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
+ (STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
+
//---
// (Register offset)
@@ -3506,14 +3676,8 @@ def : Pat<(v4f32 (int_aarch64_neon_vcvthf2fp (extract_subvector (v8i16 V128:$Rn)
(i64 4)))),
(FCVTLv8i16 V128:$Rn)>;
def : Pat<(v2f64 (fpextend (v2f32 V64:$Rn))), (FCVTLv2i32 V64:$Rn)>;
-def : Pat<(v2f64 (fpextend (v2f32 (extract_subvector (v4f32 V128:$Rn),
- (i64 2))))),
- (FCVTLv4i32 V128:$Rn)>;
def : Pat<(v4f32 (fpextend (v4f16 V64:$Rn))), (FCVTLv4i16 V64:$Rn)>;
-def : Pat<(v4f32 (fpextend (v4f16 (extract_subvector (v8f16 V128:$Rn),
- (i64 4))))),
- (FCVTLv8i16 V128:$Rn)>;
defm FCVTMS : SIMDTwoVectorFPToInt<0,0,0b11011, "fcvtms",int_aarch64_neon_fcvtms>;
defm FCVTMU : SIMDTwoVectorFPToInt<1,0,0b11011, "fcvtmu",int_aarch64_neon_fcvtmu>;
@@ -3714,10 +3878,11 @@ defm FMUL : SIMDThreeSameVectorFP<1,0,0b011,"fmul", fmul>;
defm FRECPS : SIMDThreeSameVectorFP<0,0,0b111,"frecps", int_aarch64_neon_frecps>;
defm FRSQRTS : SIMDThreeSameVectorFP<0,1,0b111,"frsqrts", int_aarch64_neon_frsqrts>;
defm FSUB : SIMDThreeSameVectorFP<0,1,0b010,"fsub", fsub>;
-defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla",
- TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))> >;
-defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls",
- TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))> >;
+
+// MLA and MLS are generated by the MachineCombiner pass.
+defm MLA : SIMDThreeSameVectorBHSTied<0, 0b10010, "mla", null_frag>;
+defm MLS : SIMDThreeSameVectorBHSTied<1, 0b10010, "mls", null_frag>;
+
defm MUL : SIMDThreeSameVectorBHS<0, 0b10011, "mul", mul>;
defm PMUL : SIMDThreeSameVectorB<1, 0b10011, "pmul", int_aarch64_neon_pmul>;
defm SABA : SIMDThreeSameVectorBHSTied<0, 0b01111, "saba",
@@ -3760,6 +3925,12 @@ defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
int_aarch64_neon_sqsub>;
+// Extra saturation patterns, beyond the intrinsic matches above.
+defm : SIMDThreeSameVectorExtraPatterns<"SQADD", saddsat>;
+defm : SIMDThreeSameVectorExtraPatterns<"UQADD", uaddsat>;
+defm : SIMDThreeSameVectorExtraPatterns<"SQSUB", ssubsat>;
+defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>;
+
defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
@@ -4356,6 +4527,25 @@ defm : Neon_mul_widen_patterns<AArch64smull, SMULLv8i8_v8i16,
defm : Neon_mul_widen_patterns<AArch64umull, UMULLv8i8_v8i16,
UMULLv4i16_v4i32, UMULLv2i32_v2i64>;
+// Patterns for smull2/umull2.
+multiclass Neon_mul_high_patterns<SDPatternOperator opnode,
+ Instruction INST8B, Instruction INST4H, Instruction INST2S> {
+ def : Pat<(v8i16 (opnode (extract_high_v16i8 V128:$Rn),
+ (extract_high_v16i8 V128:$Rm))),
+ (INST8B V128:$Rn, V128:$Rm)>;
+ def : Pat<(v4i32 (opnode (extract_high_v8i16 V128:$Rn),
+ (extract_high_v8i16 V128:$Rm))),
+ (INST4H V128:$Rn, V128:$Rm)>;
+ def : Pat<(v2i64 (opnode (extract_high_v4i32 V128:$Rn),
+ (extract_high_v4i32 V128:$Rm))),
+ (INST2S V128:$Rn, V128:$Rm)>;
+}
+
+defm : Neon_mul_high_patterns<AArch64smull, SMULLv16i8_v8i16,
+ SMULLv8i16_v4i32, SMULLv4i32_v2i64>;
+defm : Neon_mul_high_patterns<AArch64umull, UMULLv16i8_v8i16,
+ UMULLv8i16_v4i32, UMULLv4i32_v2i64>;
+
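For example, a multiply of the high halves of two 128-bit vectors now maps onto
the *2 forms roughly like this (illustrative, not verified compiler output):

  //   (v8i16 (AArch64smull (extract_high_v16i8 $Rn), (extract_high_v16i8 $Rm)))
  //     ->  smull2 v0.8h, v1.16b, v2.16b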
// Additional patterns for SMLAL/SMLSL and UMLAL/UMLSL
multiclass Neon_mulacc_widen_patterns<SDPatternOperator opnode,
Instruction INST8B, Instruction INST4H, Instruction INST2S> {
@@ -5422,10 +5612,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
-defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla",
- TriOpFrag<(add node:$LHS, (mul node:$MHS, node:$RHS))>>;
-defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls",
- TriOpFrag<(sub node:$LHS, (mul node:$MHS, node:$RHS))>>;
+
+// Generated by the MachineCombiner pass.
+defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
+defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
+
defm MUL : SIMDVectorIndexedHS<0, 0b1000, "mul", mul>;
defm SMLAL : SIMDVectorIndexedLongSDTied<0, 0b0010, "smlal",
TriOpFrag<(add node:$LHS, (int_aarch64_neon_smull node:$MHS, node:$RHS))>>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 961f38cad1e4..b9ac2657e1c5 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/IR/Type.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -205,6 +206,14 @@ private:
ComplexRendererFns
selectAddrModeShiftedExtendXReg(MachineOperand &Root,
unsigned SizeInBytes) const;
+
+ /// Returns a \p ComplexRendererFns which contains a base, offset, and whether
+ /// or not a shift + extend should be folded into an addressing mode. Returns
+ /// None when this is not profitable or possible.
+ ComplexRendererFns
+ selectExtendedSHL(MachineOperand &Root, MachineOperand &Base,
+ MachineOperand &Offset, unsigned SizeInBytes,
+ bool WantsExt) const;
ComplexRendererFns selectAddrModeRegisterOffset(MachineOperand &Root) const;
ComplexRendererFns selectAddrModeXRO(MachineOperand &Root,
unsigned SizeInBytes) const;
@@ -213,6 +222,13 @@ private:
return selectAddrModeXRO(Root, Width / 8);
}
+ ComplexRendererFns selectAddrModeWRO(MachineOperand &Root,
+ unsigned SizeInBytes) const;
+ template <int Width>
+ ComplexRendererFns selectAddrModeWRO(MachineOperand &Root) const {
+ return selectAddrModeWRO(Root, Width / 8);
+ }
+
ComplexRendererFns selectShiftedRegister(MachineOperand &Root) const;
ComplexRendererFns selectArithShiftedRegister(MachineOperand &Root) const {
@@ -227,6 +243,15 @@ private:
return selectShiftedRegister(Root);
}
+ /// Given an extend instruction, determine the correct shift-extend type for
+ /// that instruction.
+ ///
+ /// If the instruction is going to be used in a load or store, pass
+ /// \p IsLoadStore = true.
+ AArch64_AM::ShiftExtendType
+ getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI,
+ bool IsLoadStore = false) const;
+
/// Instructions that accept extend modifiers like UXTW expect the register
/// being extended to be a GPR32. Narrow ExtReg to a 32-bit register using a
/// subregister copy if necessary. Return either ExtReg, or the result of the
@@ -235,9 +260,12 @@ private:
MachineIRBuilder &MIB) const;
ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
- void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const;
- void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I) const;
- void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I) const;
+ void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx = -1) const;
+ void renderLogicalImm32(MachineInstrBuilder &MIB, const MachineInstr &I,
+ int OpIdx = -1) const;
+ void renderLogicalImm64(MachineInstrBuilder &MIB, const MachineInstr &I,
+ int OpIdx = -1) const;
// Materialize a GlobalValue or BlockAddress using a movz+movk sequence.
void materializeLargeCMVal(MachineInstr &I, const Value *V,
@@ -462,7 +490,7 @@ static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID,
}
} else if (OpSize == 64) {
switch (GenericOpc) {
- case TargetOpcode::G_GEP:
+ case TargetOpcode::G_PTR_ADD:
return AArch64::ADDXrr;
case TargetOpcode::G_SHL:
return AArch64::LSLVXr;
@@ -1006,6 +1034,66 @@ bool AArch64InstructionSelector::selectCompareBranch(
return true;
}
+/// Returns the element immediate value of a vector shift operand if found.
+/// This needs to detect a splat-like operation, e.g. a G_BUILD_VECTOR.
+static Optional<int64_t> getVectorShiftImm(Register Reg,
+ MachineRegisterInfo &MRI) {
+ assert(MRI.getType(Reg).isVector() && "Expected a *vector* shift operand");
+ MachineInstr *OpMI = MRI.getVRegDef(Reg);
+ assert(OpMI && "Expected to find a vreg def for vector shift operand");
+ if (OpMI->getOpcode() != TargetOpcode::G_BUILD_VECTOR)
+ return None;
+
+ // Check all operands are identical immediates.
+ int64_t ImmVal = 0;
+ for (unsigned Idx = 1; Idx < OpMI->getNumOperands(); ++Idx) {
+ auto VRegAndVal = getConstantVRegValWithLookThrough(OpMI->getOperand(Idx).getReg(), MRI);
+ if (!VRegAndVal)
+ return None;
+
+ if (Idx == 1)
+ ImmVal = VRegAndVal->Value;
+ if (ImmVal != VRegAndVal->Value)
+ return None;
+ }
+
+ return ImmVal;
+}
+
+/// Matches and returns the shift immediate value for a SHL instruction given
+/// a shift operand.
+static Optional<int64_t> getVectorSHLImm(LLT SrcTy, Register Reg, MachineRegisterInfo &MRI) {
+ Optional<int64_t> ShiftImm = getVectorShiftImm(Reg, MRI);
+ if (!ShiftImm)
+ return None;
+ // Check the immediate is in range for a SHL.
+ int64_t Imm = *ShiftImm;
+ if (Imm < 0)
+ return None;
+ switch (SrcTy.getElementType().getSizeInBits()) {
+ default:
+ LLVM_DEBUG(dbgs() << "Unhandled element type for vector shift");
+ return None;
+ case 8:
+ if (Imm > 7)
+ return None;
+ break;
+ case 16:
+ if (Imm > 15)
+ return None;
+ break;
+ case 32:
+ if (Imm > 31)
+ return None;
+ break;
+ case 64:
+ if (Imm > 63)
+ return None;
+ break;
+ }
+ return Imm;
+}
+
bool AArch64InstructionSelector::selectVectorSHL(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_SHL);
@@ -1017,21 +1105,29 @@ bool AArch64InstructionSelector::selectVectorSHL(
if (!Ty.isVector())
return false;
+ // Check if we have a vector of constants on RHS that we can select as the
+ // immediate form.
+ Optional<int64_t> ImmVal = getVectorSHLImm(Ty, Src2Reg, MRI);
+
unsigned Opc = 0;
if (Ty == LLT::vector(2, 64)) {
- Opc = AArch64::USHLv2i64;
+ Opc = ImmVal ? AArch64::SHLv2i64_shift : AArch64::USHLv2i64;
} else if (Ty == LLT::vector(4, 32)) {
- Opc = AArch64::USHLv4i32;
+ Opc = ImmVal ? AArch64::SHLv4i32_shift : AArch64::USHLv4i32;
} else if (Ty == LLT::vector(2, 32)) {
- Opc = AArch64::USHLv2i32;
+ Opc = ImmVal ? AArch64::SHLv2i32_shift : AArch64::USHLv2i32;
} else {
LLVM_DEBUG(dbgs() << "Unhandled G_SHL type");
return false;
}
MachineIRBuilder MIB(I);
- auto UShl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg, Src2Reg});
- constrainSelectedInstRegOperands(*UShl, TII, TRI, RBI);
+ auto Shl = MIB.buildInstr(Opc, {DstReg}, {Src1Reg});
+ if (ImmVal)
+ Shl.addImm(*ImmVal);
+ else
+ Shl.addUse(Src2Reg);
+ constrainSelectedInstRegOperands(*Shl, TII, TRI, RBI);
I.eraseFromParent();
return true;
}
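Roughly, the new path distinguishes the two cases like this (a generic MIR
sketch with arbitrary virtual register names):

  //   %c:_(s32) = G_CONSTANT i32 3
  //   %amt:_(<4 x s32>) = G_BUILD_VECTOR %c, %c, %c, %c
  //   %d:_(<4 x s32>) = G_SHL %v, %amt      ->  SHLv4i32_shift %v, 3
  //
  //   %d:_(<4 x s32>) = G_SHL %v, %amt2     ->  USHLv4i32 %v, %amt2
  //       (shift amount is not a uniform in-range constant)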
@@ -1765,7 +1861,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
auto *PtrMI = MRI.getVRegDef(PtrReg);
// Try to fold a GEP into our unsigned immediate addressing mode.
- if (PtrMI->getOpcode() == TargetOpcode::G_GEP) {
+ if (PtrMI->getOpcode() == TargetOpcode::G_PTR_ADD) {
if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
int64_t Imm = *COff;
const unsigned Size = MemSizeInBits / 8;
@@ -1883,7 +1979,7 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
- case TargetOpcode::G_GEP: {
+ case TargetOpcode::G_PTR_ADD: {
MachineIRBuilder MIRBuilder(I);
emitADD(I.getOperand(0).getReg(), I.getOperand(1), I.getOperand(2),
MIRBuilder);
@@ -2065,14 +2161,15 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
+ if (DstTy.isVector())
+ return false; // Should be handled by imported patterns.
+
assert((*RBI.getRegBank(DefReg, MRI, TRI)).getID() ==
AArch64::GPRRegBankID &&
"Unexpected ext regbank");
MachineIRBuilder MIB(I);
MachineInstr *ExtI;
- if (DstTy.isVector())
- return false; // Should be handled by imported patterns.
// First check if we're extending the result of a load which has a dest type
// smaller than 32 bits, then this zext is redundant. GPR32 is the smallest
@@ -3602,22 +3699,51 @@ bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
return false;
// The shuffle's second operand doesn't matter if the mask is all zero.
- const Constant *Mask = I.getOperand(3).getShuffleMask();
- if (!isa<ConstantAggregateZero>(Mask))
+ ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
+ if (!all_of(Mask, [](int Elem) { return Elem == 0; }))
return false;
// We're done, now find out what kind of splat we need.
LLT VecTy = MRI.getType(I.getOperand(0).getReg());
LLT EltTy = VecTy.getElementType();
- if (VecTy.getSizeInBits() != 128 || EltTy.getSizeInBits() < 32) {
- LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 128b yet");
+ if (EltTy.getSizeInBits() < 32) {
+ LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
return false;
}
bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
- static const unsigned OpcTable[2][2] = {
- {AArch64::DUPv4i32gpr, AArch64::DUPv2i64gpr},
- {AArch64::DUPv4i32lane, AArch64::DUPv2i64lane}};
- unsigned Opc = OpcTable[IsFP][EltTy.getSizeInBits() == 64];
+ unsigned Opc = 0;
+ if (IsFP) {
+ switch (EltTy.getSizeInBits()) {
+ case 32:
+ if (VecTy.getNumElements() == 2) {
+ Opc = AArch64::DUPv2i32lane;
+ } else {
+ Opc = AArch64::DUPv4i32lane;
+ assert(VecTy.getNumElements() == 4);
+ }
+ break;
+ case 64:
+ assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
+ Opc = AArch64::DUPv2i64lane;
+ break;
+ }
+ } else {
+ switch (EltTy.getSizeInBits()) {
+ case 32:
+ if (VecTy.getNumElements() == 2) {
+ Opc = AArch64::DUPv2i32gpr;
+ } else {
+ Opc = AArch64::DUPv4i32gpr;
+ assert(VecTy.getNumElements() == 4);
+ }
+ break;
+ case 64:
+ assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
+ Opc = AArch64::DUPv2i64gpr;
+ break;
+ }
+ }
+ assert(Opc && "Did not compute an opcode for a dup");
// For FP splats, we need to widen the scalar reg via undef too.
if (IsFP) {
@@ -3652,15 +3778,12 @@ bool AArch64InstructionSelector::selectShuffleVector(
const LLT Src1Ty = MRI.getType(Src1Reg);
Register Src2Reg = I.getOperand(2).getReg();
const LLT Src2Ty = MRI.getType(Src2Reg);
- const Constant *ShuffleMask = I.getOperand(3).getShuffleMask();
+ ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
LLVMContext &Ctx = MF.getFunction().getContext();
- SmallVector<int, 8> Mask;
- ShuffleVectorInst::getShuffleMask(ShuffleMask, Mask);
-
// G_SHUFFLE_VECTOR is weird in that the source operands can be scalars, if
// it originated from a <1 x T> type. Those should have been lowered into
// G_BUILD_VECTOR earlier.
@@ -4164,45 +4287,15 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
[](MachineInstr &Use) { return Use.mayLoadOrStore(); });
}
-/// This is used for computing addresses like this:
-///
-/// ldr x1, [x2, x3, lsl #3]
-///
-/// Where x2 is the base register, and x3 is an offset register. The shift-left
-/// is a constant value specific to this load instruction. That is, we'll never
-/// see anything other than a 3 here (which corresponds to the size of the
-/// element being loaded.)
InstructionSelector::ComplexRendererFns
-AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
- MachineOperand &Root, unsigned SizeInBytes) const {
- if (!Root.isReg())
- return None;
- MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
-
- // Make sure that the memory op is a valid size.
- int64_t LegalShiftVal = Log2_32(SizeInBytes);
- if (LegalShiftVal == 0)
- return None;
-
- // We want to find something like this:
- //
- // val = G_CONSTANT LegalShiftVal
- // shift = G_SHL off_reg val
- // ptr = G_GEP base_reg shift
- // x = G_LOAD ptr
- //
- // And fold it into this addressing mode:
- //
- // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
+AArch64InstructionSelector::selectExtendedSHL(
+ MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
+ unsigned SizeInBytes, bool WantsExt) const {
+ assert(Base.isReg() && "Expected base to be a register operand");
+ assert(Offset.isReg() && "Expected offset to be a register operand");
- // Check if we can find the G_GEP.
- MachineInstr *Gep = getOpcodeDef(TargetOpcode::G_GEP, Root.getReg(), MRI);
- if (!Gep || !isWorthFoldingIntoExtendedReg(*Gep, MRI))
- return None;
-
- // Now, try to match an opcode which will match our specific offset.
- // We want a G_SHL or a G_MUL.
- MachineInstr *OffsetInst = getDefIgnoringCopies(Gep->getOperand(2).getReg(), MRI);
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+ MachineInstr *OffsetInst = MRI.getVRegDef(Offset.getReg());
if (!OffsetInst)
return None;
@@ -4210,6 +4303,10 @@ AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
if (OffsetOpc != TargetOpcode::G_SHL && OffsetOpc != TargetOpcode::G_MUL)
return None;
+ // Make sure that the memory op is a valid size.
+ int64_t LegalShiftVal = Log2_32(SizeInBytes);
+ if (LegalShiftVal == 0)
+ return None;
if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
return None;
@@ -4254,27 +4351,82 @@ AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
if (ImmVal != LegalShiftVal)
return None;
+ unsigned SignExtend = 0;
+ if (WantsExt) {
+ // Check if the offset is defined by an extend.
+ MachineInstr *ExtInst = getDefIgnoringCopies(OffsetReg, MRI);
+ auto Ext = getExtendTypeForInst(*ExtInst, MRI, true);
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return None;
+
+ SignExtend = Ext == AArch64_AM::SXTW;
+
+ // Need a 32-bit wide register here.
+ MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
+ OffsetReg = ExtInst->getOperand(1).getReg();
+ OffsetReg = narrowExtendRegIfNeeded(OffsetReg, MIB);
+ }
+
// We can use the LHS of the GEP as the base, and the LHS of the shift as an
// offset. Signify that we are shifting by setting the shift flag to 1.
- return {{[=](MachineInstrBuilder &MIB) {
- MIB.addUse(Gep->getOperand(1).getReg());
- },
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(Base.getReg()); },
[=](MachineInstrBuilder &MIB) { MIB.addUse(OffsetReg); },
[=](MachineInstrBuilder &MIB) {
// Need to add both immediates here to make sure that they are both
// added to the instruction.
- MIB.addImm(0);
+ MIB.addImm(SignExtend);
MIB.addImm(1);
}}};
}
/// This is used for computing addresses like this:
///
+/// ldr x1, [x2, x3, lsl #3]
+///
+/// Where x2 is the base register, and x3 is an offset register. The shift-left
+/// is a constant value specific to this load instruction. That is, we'll never
+/// see anything other than a 3 here (which corresponds to the size of the
+/// element being loaded.)
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeShiftedExtendXReg(
+ MachineOperand &Root, unsigned SizeInBytes) const {
+ if (!Root.isReg())
+ return None;
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ // We want to find something like this:
+ //
+ // val = G_CONSTANT LegalShiftVal
+ // shift = G_SHL off_reg val
+ // ptr = G_PTR_ADD base_reg shift
+ // x = G_LOAD ptr
+ //
+ // And fold it into this addressing mode:
+ //
+ // ldr x, [base_reg, off_reg, lsl #LegalShiftVal]
+
+ // Check if we can find the G_PTR_ADD.
+ MachineInstr *PtrAdd =
+ getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
+ if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
+ return None;
+
+ // Now, try to match an opcode which will match our specific offset.
+ // We want a G_SHL or a G_MUL.
+ MachineInstr *OffsetInst =
+ getDefIgnoringCopies(PtrAdd->getOperand(2).getReg(), MRI);
+ return selectExtendedSHL(Root, PtrAdd->getOperand(1),
+ OffsetInst->getOperand(0), SizeInBytes,
+ /*WantsExt=*/false);
+}
+
+/// This is used for computing addresses like this:
+///
/// ldr x1, [x2, x3]
///
/// Where x2 is the base register, and x3 is an offset register.
///
-/// When possible (or profitable) to fold a G_GEP into the address calculation,
+/// When possible (or profitable) to fold a G_PTR_ADD into the address calculation,
/// this will do so. Otherwise, it will return None.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeRegisterOffset(
@@ -4283,7 +4435,7 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset(
// We need a GEP.
MachineInstr *Gep = MRI.getVRegDef(Root.getReg());
- if (!Gep || Gep->getOpcode() != TargetOpcode::G_GEP)
+ if (!Gep || Gep->getOpcode() != TargetOpcode::G_PTR_ADD)
return None;
// If this is used more than once, let's not bother folding.
@@ -4329,6 +4481,74 @@ AArch64InstructionSelector::selectAddrModeXRO(MachineOperand &Root,
return selectAddrModeRegisterOffset(Root);
}
+/// This is used for computing addresses like this:
+///
+/// ldr x0, [xBase, wOffset, sxtw #LegalShiftVal]
+///
+/// Where we have a 64-bit base register, a 32-bit offset register, and an
+/// extend (which may or may not be signed).
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::selectAddrModeWRO(MachineOperand &Root,
+ unsigned SizeInBytes) const {
+ MachineRegisterInfo &MRI = Root.getParent()->getMF()->getRegInfo();
+
+ MachineInstr *PtrAdd =
+ getOpcodeDef(TargetOpcode::G_PTR_ADD, Root.getReg(), MRI);
+ if (!PtrAdd || !isWorthFoldingIntoExtendedReg(*PtrAdd, MRI))
+ return None;
+
+ MachineOperand &LHS = PtrAdd->getOperand(1);
+ MachineOperand &RHS = PtrAdd->getOperand(2);
+ MachineInstr *OffsetInst = getDefIgnoringCopies(RHS.getReg(), MRI);
+
+ // The first case is the same as selectAddrModeXRO, except we need an extend.
+ // In this case, we try to find a shift and extend, and fold them into the
+ // addressing mode.
+ //
+ // E.g.
+ //
+ // off_reg = G_Z/S/ANYEXT ext_reg
+ // val = G_CONSTANT LegalShiftVal
+ // shift = G_SHL off_reg val
+ // ptr = G_PTR_ADD base_reg shift
+ // x = G_LOAD ptr
+ //
+ // In this case we can get a load like this:
+ //
+ // ldr x0, [base_reg, ext_reg, sxtw #LegalShiftVal]
+ auto ExtendedShl = selectExtendedSHL(Root, LHS, OffsetInst->getOperand(0),
+ SizeInBytes, /*WantsExt=*/true);
+ if (ExtendedShl)
+ return ExtendedShl;
+
+ // There was no shift, but we can still try to fold in a G_Z/S/ANYEXT on its own.
+ //
+ // e.g.
+ // ldr something, [base_reg, ext_reg, sxtw]
+ if (!isWorthFoldingIntoExtendedReg(*OffsetInst, MRI))
+ return None;
+
+ // Check if this is an extend. We'll get an extend type if it is.
+ AArch64_AM::ShiftExtendType Ext =
+ getExtendTypeForInst(*OffsetInst, MRI, /*IsLoadStore=*/true);
+ if (Ext == AArch64_AM::InvalidShiftExtend)
+ return None;
+
+ // Need a 32-bit wide register.
+ MachineIRBuilder MIB(*PtrAdd);
+ Register ExtReg =
+ narrowExtendRegIfNeeded(OffsetInst->getOperand(1).getReg(), MIB);
+ unsigned SignExtend = Ext == AArch64_AM::SXTW;
+
+ // Base is LHS, offset is ExtReg.
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(LHS.getReg()); },
+ [=](MachineInstrBuilder &MIB) { MIB.addUse(ExtReg); },
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addImm(SignExtend);
+ MIB.addImm(0);
+ }}};
+}
+
/// Select a "register plus unscaled signed 9-bit immediate" address. This
/// should only match when there is an offset that is not valid for a scaled
/// immediate addressing mode. The "Size" argument is the size in bytes of the
@@ -4491,9 +4711,8 @@ AArch64InstructionSelector::selectShiftedRegister(MachineOperand &Root) const {
[=](MachineInstrBuilder &MIB) { MIB.addImm(ShiftVal); }}};
}
-/// Get the correct ShiftExtendType for an extend instruction.
-static AArch64_AM::ShiftExtendType
-getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI) {
+AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
+ MachineInstr &MI, MachineRegisterInfo &MRI, bool IsLoadStore) const {
unsigned Opc = MI.getOpcode();
// Handle explicit extend instructions first.
@@ -4540,9 +4759,9 @@ getExtendTypeForInst(MachineInstr &MI, MachineRegisterInfo &MRI) {
default:
return AArch64_AM::InvalidShiftExtend;
case 0xFF:
- return AArch64_AM::UXTB;
+ return !IsLoadStore ? AArch64_AM::UXTB : AArch64_AM::InvalidShiftExtend;
case 0xFFFF:
- return AArch64_AM::UXTH;
+ return !IsLoadStore ? AArch64_AM::UXTH : AArch64_AM::InvalidShiftExtend;
case 0xFFFFFFFF:
return AArch64_AM::UXTW;
}
@@ -4632,25 +4851,29 @@ AArch64InstructionSelector::selectArithExtendedRegister(
}
void AArch64InstructionSelector::renderTruncImm(MachineInstrBuilder &MIB,
- const MachineInstr &MI) const {
+ const MachineInstr &MI,
+ int OpIdx) const {
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
- assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
+ "Expected G_CONSTANT");
Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), MRI);
assert(CstVal && "Expected constant value");
MIB.addImm(CstVal.getValue());
}
void AArch64InstructionSelector::renderLogicalImm32(
- MachineInstrBuilder &MIB, const MachineInstr &I) const {
- assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
+ assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
+ "Expected G_CONSTANT");
uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 32);
MIB.addImm(Enc);
}
void AArch64InstructionSelector::renderLogicalImm64(
- MachineInstrBuilder &MIB, const MachineInstr &I) const {
- assert(I.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT");
+ MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const {
+ assert(I.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
+ "Expected G_CONSTANT");
uint64_t CstVal = I.getOperand(1).getCImm()->getZExtValue();
uint64_t Enc = AArch64_AM::encodeLogicalImmediate(CstVal, 64);
MIB.addImm(Enc);
diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 7a1901bd5b1e..95719a35c6da 100644
--- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -59,7 +59,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
}
getActionDefinitionsBuilder(G_IMPLICIT_DEF)
- .legalFor({p0, s1, s8, s16, s32, s64, v4s32, v2s64})
+ .legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64})
.clampScalar(0, s1, s64)
.widenScalarToNextPow2(0, 8)
.fewerElementsIf(
@@ -104,7 +104,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.moreElementsToNextPow2(0)
.minScalarSameAs(1, 0);
- getActionDefinitionsBuilder(G_GEP)
+ getActionDefinitionsBuilder(G_PTR_ADD)
.legalFor({{p0, s64}})
.clampScalar(1, s64, s64);
@@ -143,7 +143,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder({G_SMULH, G_UMULH}).legalFor({s32, s64});
getActionDefinitionsBuilder({G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_UADDO})
- .legalFor({{s32, s1}, {s64, s1}});
+ .legalFor({{s32, s1}, {s64, s1}})
+ .minScalar(0, s32);
getActionDefinitionsBuilder({G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FNEG})
.legalFor({s32, s64, v2s64, v4s32, v2s32});
@@ -743,7 +744,7 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
// Realign the list to the actual required alignment.
auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1);
- auto ListTmp = MIRBuilder.buildGEP(PtrTy, List, AlignMinus1.getReg(0));
+ auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
DstPtr = MRI.createGenericVirtualRegister(PtrTy);
MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align));
@@ -758,7 +759,7 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrSize));
- auto NewList = MIRBuilder.buildGEP(PtrTy, DstPtr, Size.getReg(0));
+ auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
MIRBuilder.buildStore(
NewList, ListPtr,
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index a0c4a25bb5b9..3156bb446963 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -26,16 +26,19 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
+#include <functional>
#include <iterator>
#include <limits>
@@ -51,6 +54,9 @@ STATISTIC(NumUnscaledPairCreated,
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
+DEBUG_COUNTER(RegRenamingCounter, DEBUG_TYPE "-reg-renaming",
+ "Controls which pairs are considered for renaming");
+
// The LdStLimit limits how far we search for load/store pairs.
static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
cl::init(20), cl::Hidden);
@@ -76,6 +82,11 @@ using LdStPairFlags = struct LdStPairFlags {
// to be extended, 0 means I, and 1 means the returned iterator.
int SExtIdx = -1;
+ // If not none, RenameReg can be used to rename the result register of the
+ // first store in a pair. Currently this only works when merging stores
+ // forward.
+ Optional<MCPhysReg> RenameReg = None;
+
LdStPairFlags() = default;
void setMergeForward(bool V = true) { MergeForward = V; }
@@ -83,6 +94,10 @@ using LdStPairFlags = struct LdStPairFlags {
void setSExtIdx(int V) { SExtIdx = V; }
int getSExtIdx() const { return SExtIdx; }
+
+ void setRenameReg(MCPhysReg R) { RenameReg = R; }
+ void clearRenameReg() { RenameReg = None; }
+ Optional<MCPhysReg> getRenameReg() const { return RenameReg; }
};
struct AArch64LoadStoreOpt : public MachineFunctionPass {
@@ -99,6 +114,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Track which register units have been modified and used.
LiveRegUnits ModifiedRegUnits, UsedRegUnits;
+ LiveRegUnits DefinedInBB;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
@@ -215,69 +231,6 @@ static bool isTagStore(const MachineInstr &MI) {
}
}
-// Scaling factor for unscaled load or store.
-static int getMemScale(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- llvm_unreachable("Opcode has unknown scale!");
- case AArch64::LDRBBui:
- case AArch64::LDURBBi:
- case AArch64::LDRSBWui:
- case AArch64::LDURSBWi:
- case AArch64::STRBBui:
- case AArch64::STURBBi:
- return 1;
- case AArch64::LDRHHui:
- case AArch64::LDURHHi:
- case AArch64::LDRSHWui:
- case AArch64::LDURSHWi:
- case AArch64::STRHHui:
- case AArch64::STURHHi:
- return 2;
- case AArch64::LDRSui:
- case AArch64::LDURSi:
- case AArch64::LDRSWui:
- case AArch64::LDURSWi:
- case AArch64::LDRWui:
- case AArch64::LDURWi:
- case AArch64::STRSui:
- case AArch64::STURSi:
- case AArch64::STRWui:
- case AArch64::STURWi:
- case AArch64::LDPSi:
- case AArch64::LDPSWi:
- case AArch64::LDPWi:
- case AArch64::STPSi:
- case AArch64::STPWi:
- return 4;
- case AArch64::LDRDui:
- case AArch64::LDURDi:
- case AArch64::LDRXui:
- case AArch64::LDURXi:
- case AArch64::STRDui:
- case AArch64::STURDi:
- case AArch64::STRXui:
- case AArch64::STURXi:
- case AArch64::LDPDi:
- case AArch64::LDPXi:
- case AArch64::STPDi:
- case AArch64::STPXi:
- return 8;
- case AArch64::LDRQui:
- case AArch64::LDURQi:
- case AArch64::STRQui:
- case AArch64::STURQi:
- case AArch64::LDPQi:
- case AArch64::STPQi:
- case AArch64::STGOffset:
- case AArch64::STZGOffset:
- case AArch64::ST2GOffset:
- case AArch64::STZ2GOffset:
- case AArch64::STGPi:
- return 16;
- }
-}
-
static unsigned getMatchingNonSExtOpcode(unsigned Opc,
bool *IsValidLdStrOpc = nullptr) {
if (IsValidLdStrOpc)
@@ -588,7 +541,7 @@ static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
// ST*G and all paired ldst have the same scale in pre/post-indexed variants
// as in the "unsigned offset" variant.
// All other pre/post indexed ldst instructions are unscaled.
- Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1;
+ Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
if (IsPaired) {
MinOffset = -64;
@@ -599,8 +552,8 @@ static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
}
}
-static const MachineOperand &getLdStRegOp(const MachineInstr &MI,
- unsigned PairedRegOp = 0) {
+static MachineOperand &getLdStRegOp(MachineInstr &MI,
+ unsigned PairedRegOp = 0) {
assert(PairedRegOp < 2 && "Unexpected register operand idx.");
unsigned Idx = isPairedLdSt(MI) ? PairedRegOp : 0;
return MI.getOperand(Idx);
@@ -620,8 +573,8 @@ static bool isLdOffsetInRangeOfSt(MachineInstr &LoadInst,
MachineInstr &StoreInst,
const AArch64InstrInfo *TII) {
assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
- int LoadSize = getMemScale(LoadInst);
- int StoreSize = getMemScale(StoreInst);
+ int LoadSize = TII->getMemScale(LoadInst);
+ int StoreSize = TII->getMemScale(StoreInst);
int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
? getLdStOffsetOp(StoreInst).getImm()
: getLdStOffsetOp(StoreInst).getImm() * StoreSize;
@@ -731,7 +684,7 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
unsigned Opc = I->getOpcode();
bool IsScaled = !TII->isUnscaledLdSt(Opc);
- int OffsetStride = IsScaled ? 1 : getMemScale(*I);
+ int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
bool MergeForward = Flags.getMergeForward();
// Insert our new paired instruction after whichever of the paired
@@ -783,6 +736,44 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
return NextI;
}
+// Apply Fn to all instructions between MI and the beginning of the block, until
+// a def for DefReg is reached. Returns true iff Fn returns true for all
+// visited instructions. Stop after visiting Limit iterations.
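+// Debug instructions count towards Limit; if Limit is exhausted before a def
+// of DefReg is reached, false is returned.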
+static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
+ const TargetRegisterInfo *TRI, unsigned Limit,
+ std::function<bool(MachineInstr &, bool)> &Fn) {
+ auto MBB = MI.getParent();
+ for (MachineBasicBlock::reverse_iterator I = MI.getReverseIterator(),
+ E = MBB->rend();
+ I != E; I++) {
+ if (!Limit)
+ return false;
+ --Limit;
+
+ bool isDef = any_of(I->operands(), [DefReg, TRI](MachineOperand &MOP) {
+ return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
+ TRI->regsOverlap(MOP.getReg(), DefReg);
+ });
+ if (!Fn(*I, isDef))
+ return false;
+ if (isDef)
+ break;
+ }
+ return true;
+}
+
+static void updateDefinedRegisters(MachineInstr &MI, LiveRegUnits &Units,
+ const TargetRegisterInfo *TRI) {
+
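+ // First drop the units of registers killed by MI, then add the units of the
+ // remaining register operands, so a register that is both killed and
+ // redefined by MI remains in Units.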
+ for (const MachineOperand &MOP : phys_regs_and_masks(MI))
+ if (MOP.isReg() && MOP.isKill())
+ Units.removeReg(MOP.getReg());
+
+ for (const MachineOperand &MOP : phys_regs_and_masks(MI))
+ if (MOP.isReg() && !MOP.isKill())
+ Units.addReg(MOP.getReg());
+}
+
MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
@@ -800,9 +791,76 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
unsigned Opc =
SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
bool IsUnscaled = TII->isUnscaledLdSt(Opc);
- int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
+ int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
bool MergeForward = Flags.getMergeForward();
+
+ Optional<MCPhysReg> RenameReg = Flags.getRenameReg();
+ if (MergeForward && RenameReg) {
+ MCRegister RegToRename = getLdStRegOp(*I).getReg();
+ DefinedInBB.addReg(*RenameReg);
+
+ // Return the sub/super register for RenameReg, matching the size of
+ // OriginalReg.
+ auto GetMatchingSubReg = [this,
+ RenameReg](MCPhysReg OriginalReg) -> MCPhysReg {
+ for (MCPhysReg SubOrSuper : TRI->sub_and_superregs_inclusive(*RenameReg))
+ if (TRI->getMinimalPhysRegClass(OriginalReg) ==
+ TRI->getMinimalPhysRegClass(SubOrSuper))
+ return SubOrSuper;
+ llvm_unreachable("Should have found matching sub or super register!");
+ };
+
+ std::function<bool(MachineInstr &, bool)> UpdateMIs =
+ [this, RegToRename, GetMatchingSubReg](MachineInstr &MI, bool IsDef) {
+ if (IsDef) {
+ bool SeenDef = false;
+ for (auto &MOP : MI.operands()) {
+ // Rename the first explicit definition and all implicit
+ // definitions matching RegToRename.
+ if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
+ (!SeenDef || (MOP.isDef() && MOP.isImplicit())) &&
+ TRI->regsOverlap(MOP.getReg(), RegToRename)) {
+ assert((MOP.isImplicit() ||
+ (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
+ "Need renamable operands");
+ MOP.setReg(GetMatchingSubReg(MOP.getReg()));
+ SeenDef = true;
+ }
+ }
+ } else {
+ for (auto &MOP : MI.operands()) {
+ if (MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
+ TRI->regsOverlap(MOP.getReg(), RegToRename)) {
+ assert((MOP.isImplicit() ||
+ (MOP.isRenamable() && !MOP.isEarlyClobber())) &&
+ "Need renamable operands");
+ MOP.setReg(GetMatchingSubReg(MOP.getReg()));
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Renamed " << MI << "\n");
+ return true;
+ };
+ forAllMIsUntilDef(*I, RegToRename, TRI, LdStLimit, UpdateMIs);
+
+#if !defined(NDEBUG)
+ // Make sure the register used for renaming is not used between the paired
+ // instructions. That would trash the content before the new paired
+ // instruction.
+ for (auto &MI :
+ iterator_range<MachineInstrBundleIterator<llvm::MachineInstr>>(
+ std::next(I), std::next(Paired)))
+ assert(all_of(MI.operands(),
+ [this, &RenameReg](const MachineOperand &MOP) {
+ return !MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
+ !TRI->regsOverlap(MOP.getReg(), *RenameReg);
+ }) &&
+ "Rename register used between paired instructions, trashing the "
+ "content");
+#endif
+ }
+
// Insert our new paired instruction after whichever of the paired
// instructions MergeForward indicates.
MachineBasicBlock::iterator InsertionPoint = MergeForward ? Paired : I;
@@ -818,11 +876,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
// We're trying to pair instructions that differ in how they are scaled. If
// I is scaled then scale the offset of Paired accordingly. Otherwise, do
// the opposite (i.e., make Paired's offset unscaled).
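// E.g. for 8-byte accesses, an unscaled offset of #16 corresponds to a scaled
// offset of #2 (and vice versa).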
- int MemSize = getMemScale(*Paired);
+ int MemSize = TII->getMemScale(*Paired);
if (PairedIsUnscaled) {
// If the unscaled offset isn't a multiple of the MemSize, we can't
// pair the operations together.
- assert(!(PairedOffset % getMemScale(*Paired)) &&
+ assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
"Offset should be a multiple of the stride!");
PairedOffset /= MemSize;
} else {
@@ -847,9 +905,9 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
// Scale the immediate offset, if necessary.
if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
- assert(!(OffsetImm % getMemScale(*RtMI)) &&
+ assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
"Unscaled offset cannot be scaled.");
- OffsetImm /= getMemScale(*RtMI);
+ OffsetImm /= TII->getMemScale(*RtMI);
}
// Construct the new instruction.
@@ -931,6 +989,11 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
}
LLVM_DEBUG(dbgs() << "\n");
+ if (MergeForward)
+ for (const MachineOperand &MOP : phys_regs_and_masks(*I))
+ if (MOP.isReg() && MOP.isKill())
+ DefinedInBB.addReg(MOP.getReg());
+
// Erase the old instructions.
I->eraseFromParent();
Paired->eraseFromParent();
@@ -944,8 +1007,8 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
MachineBasicBlock::iterator NextI = LoadI;
++NextI;
- int LoadSize = getMemScale(*LoadI);
- int StoreSize = getMemScale(*StoreI);
+ int LoadSize = TII->getMemScale(*LoadI);
+ int StoreSize = TII->getMemScale(*StoreI);
Register LdRt = getLdStRegOp(*LoadI).getReg();
const MachineOperand &StMO = getLdStRegOp(*StoreI);
Register StRt = getLdStRegOp(*StoreI).getReg();
@@ -1207,6 +1270,148 @@ static bool areCandidatesToMergeOrPair(MachineInstr &FirstMI, MachineInstr &MI,
// FIXME: Can we also match a mixed sext/zext unscaled/scaled pair?
}
+static bool
+canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
+ SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
+ const TargetRegisterInfo *TRI) {
+ if (!FirstMI.mayStore())
+ return false;
+
+ // Check if we can find an unused register which we can use to rename
+ // the register used by the first load/store.
+ auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
+ MachineFunction &MF = *FirstMI.getParent()->getParent();
+ if (!RegClass || !MF.getRegInfo().tracksLiveness())
+ return false;
+
+ auto RegToRename = getLdStRegOp(FirstMI).getReg();
+ // For now, we only rename if the store operand gets killed at the store.
+ if (!getLdStRegOp(FirstMI).isKill() &&
+ !any_of(FirstMI.operands(),
+ [TRI, RegToRename](const MachineOperand &MOP) {
+ return MOP.isReg() && !MOP.isDebug() && MOP.getReg() &&
+ MOP.isImplicit() && MOP.isKill() &&
+ TRI->regsOverlap(RegToRename, MOP.getReg());
+ })) {
+ LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI << "\n");
+ return false;
+ }
+ auto canRenameMOP = [](const MachineOperand &MOP) {
+ return MOP.isImplicit() ||
+ (MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
+ };
+
+ bool FoundDef = false;
+
+ // For each instruction between FirstMI and the previous def for RegToRename,
+ // we:
+ //  * check if we can rename RegToRename in this instruction, and
+ //  * collect the registers used and required register classes for
+ //    RegToRename.
+ std::function<bool(MachineInstr &, bool)> CheckMIs = [&](MachineInstr &MI,
+ bool IsDef) {
+ LLVM_DEBUG(dbgs() << "Checking " << MI << "\n");
+ // Currently we do not try to rename across frame-setup instructions.
+ if (MI.getFlag(MachineInstr::FrameSetup)) {
+ LLVM_DEBUG(dbgs() << " Cannot rename framesetup instructions currently ("
+ << MI << ")\n");
+ return false;
+ }
+
+ UsedInBetween.accumulate(MI);
+
+ // For a definition, check that we can rename the definition and exit the
+ // loop.
+ FoundDef = IsDef;
+
+ // For defs, check if we can rename the first def of RegToRename.
+ if (FoundDef) {
+ for (auto &MOP : MI.operands()) {
+ if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
+ !TRI->regsOverlap(MOP.getReg(), RegToRename))
+ continue;
+ if (!canRenameMOP(MOP)) {
+ LLVM_DEBUG(dbgs()
+ << " Cannot rename " << MOP << " in " << MI << "\n");
+ return false;
+ }
+ RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
+ }
+ return true;
+ } else {
+ for (auto &MOP : MI.operands()) {
+ if (!MOP.isReg() || MOP.isDebug() || !MOP.getReg() ||
+ !TRI->regsOverlap(MOP.getReg(), RegToRename))
+ continue;
+
+ if (!canRenameMOP(MOP)) {
+ LLVM_DEBUG(dbgs()
+ << " Cannot rename " << MOP << " in " << MI << "\n");
+ return false;
+ }
+ RequiredClasses.insert(TRI->getMinimalPhysRegClass(MOP.getReg()));
+ }
+ }
+ return true;
+ };
+
+ if (!forAllMIsUntilDef(FirstMI, RegToRename, TRI, LdStLimit, CheckMIs))
+ return false;
+
+ if (!FoundDef) {
+ LLVM_DEBUG(dbgs() << " Did not find definition for register in BB\n");
+ return false;
+ }
+ return true;
+}
+
+// Check if we can find a physical register for renaming. This register must:
+// * not be defined up to FirstMI (checked using DefinedInBB),
+// * not be used between MI and the defining instruction of the register to
+//   rename (checked using UsedInBetween), and
+// * be available in all used register classes (checked using RequiredClasses).
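+// If no register in the class satisfies all of the above, None is returned and
+// this candidate is not recorded for a rename-based merge.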
+static Optional<MCPhysReg> tryToFindRegisterToRename(
+ MachineInstr &FirstMI, MachineInstr &MI, LiveRegUnits &DefinedInBB,
+ LiveRegUnits &UsedInBetween,
+ SmallPtrSetImpl<const TargetRegisterClass *> &RequiredClasses,
+ const TargetRegisterInfo *TRI) {
+ auto &MF = *FirstMI.getParent()->getParent();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ // Checks if any sub- or super-register of PR is callee saved.
+ auto AnySubOrSuperRegCalleePreserved = [&MF, TRI](MCPhysReg PR) {
+ return any_of(TRI->sub_and_superregs_inclusive(PR),
+ [&MF, TRI](MCPhysReg SubOrSuper) {
+ return TRI->isCalleeSavedPhysReg(SubOrSuper, MF);
+ });
+ };
+
+ // Check if PR or one of its sub- or super-registers can be used for all
+ // required register classes.
+ auto CanBeUsedForAllClasses = [&RequiredClasses, TRI](MCPhysReg PR) {
+ return all_of(RequiredClasses, [PR, TRI](const TargetRegisterClass *C) {
+ return any_of(TRI->sub_and_superregs_inclusive(PR),
+ [C, TRI](MCPhysReg SubOrSuper) {
+ return C == TRI->getMinimalPhysRegClass(SubOrSuper);
+ });
+ });
+ };
+
+ auto *RegClass = TRI->getMinimalPhysRegClass(getLdStRegOp(FirstMI).getReg());
+ for (const MCPhysReg &PR : *RegClass) {
+ if (DefinedInBB.available(PR) && UsedInBetween.available(PR) &&
+ !RegInfo.isReserved(PR) && !AnySubOrSuperRegCalleePreserved(PR) &&
+ CanBeUsedForAllClasses(PR)) {
+ DefinedInBB.addReg(PR);
+ LLVM_DEBUG(dbgs() << "Found rename register " << printReg(PR, TRI)
+ << "\n");
+ return {PR};
+ }
+ }
+ LLVM_DEBUG(dbgs() << "No rename register found from "
+ << TRI->getRegClassName(RegClass) << "\n");
+ return None;
+}
+
/// Scan the instructions looking for a load/store that can be combined with the
/// current instruction into a wider equivalent or a load/store pair.
MachineBasicBlock::iterator
@@ -1215,6 +1420,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
bool FindNarrowMerge) {
MachineBasicBlock::iterator E = I->getParent()->end();
MachineBasicBlock::iterator MBBI = I;
+ MachineBasicBlock::iterator MBBIWithRenameReg;
MachineInstr &FirstMI = *I;
++MBBI;
@@ -1223,9 +1429,16 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
Register Reg = getLdStRegOp(FirstMI).getReg();
Register BaseReg = getLdStBaseOp(FirstMI).getReg();
int Offset = getLdStOffsetOp(FirstMI).getImm();
- int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
+ int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
+ Optional<bool> MaybeCanRename = None;
+ SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
+ LiveRegUnits UsedInBetween;
+ UsedInBetween.init(*TRI);
+
+ Flags.clearRenameReg();
+
// Track which register units have been modified and used between the first
// insn (inclusive) and the second insn.
ModifiedRegUnits.clear();
@@ -1237,6 +1450,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
MachineInstr &MI = *MBBI;
+ UsedInBetween.accumulate(MI);
+
// Don't count transient instructions towards the search limit since there
// may be different numbers of them if e.g. debug information is present.
if (!MI.isTransient())
@@ -1259,7 +1474,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// We're trying to pair instructions that differ in how they are scaled.
// If FirstMI is scaled then scale the offset of MI accordingly.
// Otherwise, do the opposite (i.e., make MI's offset unscaled).
- int MemSize = getMemScale(MI);
+ int MemSize = TII->getMemScale(MI);
if (MIIsUnscaled) {
// If the unscaled offset isn't a multiple of the MemSize, we can't
// pair the operations together: bail and keep looking.
@@ -1329,7 +1544,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
!(MI.mayLoad() &&
!UsedRegUnits.available(getLdStRegOp(MI).getReg())) &&
!mayAlias(MI, MemInsns, AA)) {
+
Flags.setMergeForward(false);
+ Flags.clearRenameReg();
return MBBI;
}
@@ -1337,18 +1554,41 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// between the two instructions and none of the instructions between the
// first and the second alias with the first, we can combine the first
// into the second.
- if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg()) &&
- !(MayLoad &&
+ if (!(MayLoad &&
!UsedRegUnits.available(getLdStRegOp(FirstMI).getReg())) &&
!mayAlias(FirstMI, MemInsns, AA)) {
- Flags.setMergeForward(true);
- return MBBI;
+
+ if (ModifiedRegUnits.available(getLdStRegOp(FirstMI).getReg())) {
+ Flags.setMergeForward(true);
+ Flags.clearRenameReg();
+ return MBBI;
+ }
+
+ if (DebugCounter::shouldExecute(RegRenamingCounter)) {
+ if (!MaybeCanRename)
+ MaybeCanRename = {canRenameUpToDef(FirstMI, UsedInBetween,
+ RequiredClasses, TRI)};
+
+ if (*MaybeCanRename) {
+ Optional<MCPhysReg> MaybeRenameReg = tryToFindRegisterToRename(
+ FirstMI, MI, DefinedInBB, UsedInBetween, RequiredClasses,
+ TRI);
+ if (MaybeRenameReg) {
+ Flags.setRenameReg(*MaybeRenameReg);
+ Flags.setMergeForward(true);
+ MBBIWithRenameReg = MBBI;
+ }
+ }
+ }
}
// Unable to combine these instructions due to interference in between.
// Keep looking.
}
}
+ if (Flags.getRenameReg())
+ return MBBIWithRenameReg;
+
// If the instruction wasn't a matching load or store. Stop searching if we
// encounter a call instruction that might modify memory.
if (MI.isCall())
@@ -1492,7 +1732,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
MachineBasicBlock::iterator MBBI = I;
Register BaseReg = getLdStBaseOp(MemMI).getReg();
- int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
+ int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);
// Scan forward looking for post-index opportunities. Updating instructions
// can't be formed if the memory instruction doesn't have the offset we're
@@ -1663,7 +1903,7 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
// with Offset-1)
bool IsUnscaled = TII->isUnscaledLdSt(MI);
int Offset = getLdStOffsetOp(MI).getImm();
- int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
+ int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
// Allow one more for offset.
if (Offset > 0)
Offset -= OffsetStride;
@@ -1680,7 +1920,13 @@ bool AArch64LoadStoreOpt::tryToPairLdStInst(MachineBasicBlock::iterator &MBBI) {
++NumUnscaledPairCreated;
// Keeping the iterator straight is a pain, so we let the merge routine tell
// us what the next instruction is after it's done mucking about.
+ auto Prev = std::prev(MBBI);
MBBI = mergePairedInsns(MBBI, Paired, Flags);
+ // Collect liveness info for instructions between Prev and the new position
+ // MBBI.
+ for (auto I = std::next(Prev); I != MBBI; I++)
+ updateDefinedRegisters(*I, DefinedInBB, TRI);
+
return true;
}
return false;
@@ -1723,7 +1969,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
// The immediate in the load/store is scaled by the size of the memory
// operation. The immediate in the add we're looking for,
// however, is not, so adjust here.
- int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
+ int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
// Look forward to try to find a pre-index instruction. For example,
// ldr x1, [x0, #64]
@@ -1742,6 +1988,7 @@ bool AArch64LoadStoreOpt::tryToMergeLdStUpdate
bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool EnableNarrowZeroStOpt) {
+
bool Modified = false;
// Four transformations to do here:
// 1) Find loads that directly read from stores and promote them by
@@ -1786,8 +2033,17 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// ldr x1, [x2, #8]
// ; becomes
// ldp x0, x1, [x2]
+
+ if (MBB.getParent()->getRegInfo().tracksLiveness()) {
+ DefinedInBB.clear();
+ DefinedInBB.addLiveIns(MBB);
+ }
+
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
MBBI != E;) {
+ // Track currently live registers up to this point, to help with
+ // searching for a rename register on demand.
+ updateDefinedRegisters(*MBBI, DefinedInBB, TRI);
if (TII->isPairableLdStInst(*MBBI) && tryToPairLdStInst(MBBI))
Modified = true;
else
@@ -1825,11 +2081,14 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
// or store.
ModifiedRegUnits.init(*TRI);
UsedRegUnits.init(*TRI);
+ DefinedInBB.init(*TRI);
bool Modified = false;
bool enableNarrowZeroStOpt = !Subtarget->requiresStrictAlign();
- for (auto &MBB : Fn)
- Modified |= optimizeBlock(MBB, enableNarrowZeroStOpt);
+ for (auto &MBB : Fn) {
+ auto M = optimizeBlock(MBB, enableNarrowZeroStOpt);
+ Modified |= M;
+ }
return Modified;
}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 0009fb7b5520..6ddb3fdb0046 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCLinkerOptimizationHint.h"
#include <cassert>
@@ -51,10 +52,16 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
bool HasStackFrame = false;
/// Amount of stack frame size, not including callee-saved registers.
- unsigned LocalStackSize;
+ uint64_t LocalStackSize = 0;
+
+ /// The start and end frame indices for the SVE callee saves.
+ int MinSVECSFrameIndex = 0;
+ int MaxSVECSFrameIndex = 0;
/// Amount of stack frame size used for saving callee-saved registers.
- unsigned CalleeSavedStackSize;
+ unsigned CalleeSavedStackSize = 0;
+ unsigned SVECalleeSavedStackSize = 0;
+ bool HasCalleeSavedStackSize = false;
/// Number of TLS accesses using the special (combinable)
/// _TLS_MODULE_BASE_ symbol.
@@ -117,7 +124,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// Offset from SP-at-entry to the tagged base pointer.
// Tagged base pointer is set up to point to the first (lowest address) tagged
// stack slot.
- unsigned TaggedBasePointerOffset;
+ unsigned TaggedBasePointerOffset = 0;
public:
AArch64FunctionInfo() = default;
@@ -160,15 +167,79 @@ public:
void setCalleeSaveStackHasFreeSpace(bool s) {
CalleeSaveStackHasFreeSpace = s;
}
-
bool isSplitCSR() const { return IsSplitCSR; }
void setIsSplitCSR(bool s) { IsSplitCSR = s; }
- void setLocalStackSize(unsigned Size) { LocalStackSize = Size; }
- unsigned getLocalStackSize() const { return LocalStackSize; }
+ void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; }
+ uint64_t getLocalStackSize() const { return LocalStackSize; }
+
+ void setCalleeSavedStackSize(unsigned Size) {
+ CalleeSavedStackSize = Size;
+ HasCalleeSavedStackSize = true;
+ }
+
+ // When CalleeSavedStackSize has not been set (for example when
+ // some MachineIR pass is run in isolation), recalculate it
+ // directly from the CalleeSavedInfo.
+ // Note: This information can only be recalculated after PEI
+ // has assigned offsets to the callee save objects.
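+ // The recalculated size is the 16-byte-aligned span covered by the
+ // callee-save objects on the default stack; objects with a non-default
+ // stack ID (e.g. SVE callee saves) are skipped.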
+ unsigned getCalleeSavedStackSize(const MachineFrameInfo &MFI) const {
+ bool ValidateCalleeSavedStackSize = false;
+
+#ifndef NDEBUG
+ // Make sure the calculated size derived from the CalleeSavedInfo
+ // equals the cached size that was calculated elsewhere (e.g. in
+ // determineCalleeSaves).
+ ValidateCalleeSavedStackSize = HasCalleeSavedStackSize;
+#endif
+
+ if (!HasCalleeSavedStackSize || ValidateCalleeSavedStackSize) {
+ assert(MFI.isCalleeSavedInfoValid() && "CalleeSavedInfo not calculated");
+ if (MFI.getCalleeSavedInfo().empty())
+ return 0;
+
+ int64_t MinOffset = std::numeric_limits<int64_t>::max();
+ int64_t MaxOffset = std::numeric_limits<int64_t>::min();
+ for (const auto &Info : MFI.getCalleeSavedInfo()) {
+ int FrameIdx = Info.getFrameIdx();
+ if (MFI.getStackID(FrameIdx) != TargetStackID::Default)
+ continue;
+ int64_t Offset = MFI.getObjectOffset(FrameIdx);
+ int64_t ObjSize = MFI.getObjectSize(FrameIdx);
+ MinOffset = std::min<int64_t>(Offset, MinOffset);
+ MaxOffset = std::max<int64_t>(Offset + ObjSize, MaxOffset);
+ }
+
+ unsigned Size = alignTo(MaxOffset - MinOffset, 16);
+ assert((!HasCalleeSavedStackSize || getCalleeSavedStackSize() == Size) &&
+ "Invalid size calculated for callee saves");
+ return Size;
+ }
+
+ return getCalleeSavedStackSize();
+ }
+
+ unsigned getCalleeSavedStackSize() const {
+ assert(HasCalleeSavedStackSize &&
+ "CalleeSavedStackSize has not been calculated");
+ return CalleeSavedStackSize;
+ }
+
+ // Saves the CalleeSavedStackSize for SVE vectors in 'scalable bytes'
+ void setSVECalleeSavedStackSize(unsigned Size) {
+ SVECalleeSavedStackSize = Size;
+ }
+ unsigned getSVECalleeSavedStackSize() const {
+ return SVECalleeSavedStackSize;
+ }
+
+ void setMinMaxSVECSFrameIndex(int Min, int Max) {
+ MinSVECSFrameIndex = Min;
+ MaxSVECSFrameIndex = Max;
+ }
- void setCalleeSavedStackSize(unsigned Size) { CalleeSavedStackSize = Size; }
- unsigned getCalleeSavedStackSize() const { return CalleeSavedStackSize; }
+ int getMinSVECSFrameIndex() const { return MinSVECSFrameIndex; }
+ int getMaxSVECSFrameIndex() const { return MaxSVECSFrameIndex; }
void incNumLocalDynamicTLSAccesses() { ++NumLocalDynamicTLSAccesses; }
unsigned getNumLocalDynamicTLSAccesses() const {
diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
index d30ea120bae4..230fd514d022 100644
--- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
@@ -62,20 +62,6 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
CombinerHelper Helper(Observer, B, KB, MDT);
switch (MI.getOpcode()) {
- case TargetOpcode::G_CONCAT_VECTORS:
- return Helper.tryCombineConcatVectors(MI);
- case TargetOpcode::G_SHUFFLE_VECTOR:
- return Helper.tryCombineShuffleVector(MI);
- case TargetOpcode::G_LOAD:
- case TargetOpcode::G_SEXTLOAD:
- case TargetOpcode::G_ZEXTLOAD: {
- bool Changed = false;
- Changed |= Helper.tryCombineExtendingLoads(MI);
- Changed |= Helper.tryCombineIndexedLoadStore(MI);
- return Changed;
- }
- case TargetOpcode::G_STORE:
- return Helper.tryCombineIndexedLoadStore(MI);
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
switch (MI.getIntrinsicID()) {
case Intrinsic::memcpy:
@@ -93,9 +79,16 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
}
}
- if (Generated.tryCombineAll(Observer, MI, B))
+ if (Generated.tryCombineAll(Observer, MI, B, Helper))
return true;
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return Helper.tryCombineConcatVectors(MI);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return Helper.tryCombineShuffleVector(MI);
+ }
+
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
index a594ecb71fc9..9135f1b40122 100644
--- a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -38,6 +38,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 8ec73aa3c040..40efac261fd9 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -222,8 +222,9 @@ unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
return RegisterBankInfo::copyCost(A, B, Size);
}
-const RegisterBank &AArch64RegisterBankInfo::getRegBankFromRegClass(
- const TargetRegisterClass &RC) const {
+const RegisterBank &
+AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const {
switch (RC.getID()) {
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
@@ -529,7 +530,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Arithmetic ops.
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
- case TargetOpcode::G_GEP:
+ case TargetOpcode::G_PTR_ADD:
case TargetOpcode::G_MUL:
case TargetOpcode::G_SDIV:
case TargetOpcode::G_UDIV:
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
index 016fed65eb2a..e956fca1aa10 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -132,8 +132,8 @@ public:
unsigned copyCost(const RegisterBank &A, const RegisterBank &B,
unsigned Size) const override;
- const RegisterBank &
- getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+ const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const override;
InstructionMappings
getInstrAlternativeMappings(const MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index de176088595d..14f839cd4f81 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -43,6 +43,8 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
const MCPhysReg *
AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
+ if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
+ return CSR_Win_AArch64_CFGuard_Check_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows())
return CSR_Win_AArch64_AAPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
@@ -53,6 +55,8 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_AArch64_AllRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
return CSR_AArch64_AAVPCS_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
+ return CSR_AArch64_SVE_AAPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
@@ -123,7 +127,10 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
if (CC == CallingConv::AArch64_VectorCall)
return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
if (CC == CallingConv::AArch64_SVE_VectorCall)
- return CSR_AArch64_SVE_AAPCS_RegMask;
+ return SCS ? CSR_AArch64_SVE_AAPCS_SCS_RegMask
+ : CSR_AArch64_SVE_AAPCS_RegMask;
+ if (CC == CallingConv::CFGuard_Check)
+ return CSR_Win_AArch64_CFGuard_Check_RegMask;
if (MF.getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
@@ -390,7 +397,6 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
unsigned BaseReg,
int64_t Offset) const {
- assert(Offset <= INT_MAX && "Offset too big to fit in int.");
assert(MI && "Unable to get the legal offset for nil instruction.");
StackOffset SaveOffset(Offset, MVT::i8);
return isAArch64FrameOffsetLegal(*MI, SaveOffset) & AArch64FrameOffsetIsLegal;
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index 61fc0795c242..f52feab03953 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -481,7 +481,7 @@ def QQQQ : RegisterClass<"AArch64", [untyped], 128, (add QSeqQuads)> {
// Vector operand versions of the FP registers. Alternate name printing and
-// assmebler matching.
+// assembler matching.
def VectorReg64AsmOperand : AsmOperandClass {
let Name = "VectorReg64";
let PredicateMethod = "isNeonVectorReg";
@@ -858,35 +858,19 @@ def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>;
//******************************************************************************
-// SVE vector register class
-def ZPR : RegisterClass<"AArch64",
- [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
- nxv2f16, nxv4f16, nxv8f16,
- nxv1f32, nxv2f32, nxv4f32,
- nxv1f64, nxv2f64],
- 128, (sequence "Z%u", 0, 31)> {
+// SVE vector register classes
+class ZPRClass<int lastreg> : RegisterClass<"AArch64",
+ [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
+ nxv2f16, nxv4f16, nxv8f16,
+ nxv2f32, nxv4f32,
+ nxv2f64],
+ 128, (sequence "Z%u", 0, lastreg)> {
let Size = 128;
}
-// SVE restricted 4 bit scalable vector register class
-def ZPR_4b : RegisterClass<"AArch64",
- [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
- nxv2f16, nxv4f16, nxv8f16,
- nxv1f32, nxv2f32, nxv4f32,
- nxv1f64, nxv2f64],
- 128, (sequence "Z%u", 0, 15)> {
- let Size = 128;
-}
-
-// SVE restricted 3 bit scalable vector register class
-def ZPR_3b : RegisterClass<"AArch64",
- [nxv16i8, nxv8i16, nxv4i32, nxv2i64,
- nxv2f16, nxv4f16, nxv8f16,
- nxv1f32, nxv2f32, nxv4f32,
- nxv1f64, nxv2f64],
- 128, (sequence "Z%u", 0, 7)> {
- let Size = 128;
-}
+def ZPR : ZPRClass<31>;
+def ZPR_4b : ZPRClass<15>; // Restricted 4 bit SVE vector register class.
+def ZPR_3b : ZPRClass<7>; // Restricted 3 bit SVE vector register class.
class ZPRAsmOperand<string name, int Width, string RegClassSuffix = "">
: AsmOperandClass {
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index b573eac76754..c849d7af9a40 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -10,6 +10,72 @@
//
//===----------------------------------------------------------------------===//
+def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
+
+def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1_gather_sxtw : SDNode<"AArch64ISD::GLD1_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+
+def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1s_gather_sxtw : SDNode<"AArch64ISD::GLD1S_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1s_gather_imm : SDNode<"AArch64ISD::GLD1S_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+
+def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>;
+
+def AArch64smaxv_pred : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>;
+def AArch64umaxv_pred : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>;
+def AArch64sminv_pred : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>;
+def AArch64uminv_pred : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>;
+def AArch64orv_pred : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>;
+def AArch64eorv_pred : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>;
+def AArch64andv_pred : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>;
+def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
+def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;
+
+def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
+def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
+def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
+
+def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
+
+def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
+def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
+
let Predicates = [HasSVE] in {
def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">;
@@ -18,69 +84,69 @@ let Predicates = [HasSVE] in {
def SETFFR : sve_int_setffr<"setffr">;
def WRFFR : sve_int_wrffr<"wrffr">;
- defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add">;
- defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub">;
- defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd">;
- defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd">;
- defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub">;
- defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub">;
-
- defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and">;
- defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr">;
- defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor">;
- defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic">;
-
- defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add">;
- defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub">;
- defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr">;
-
- defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr">;
- defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor">;
- defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and">;
- defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic">;
-
- defm ADD_ZI : sve_int_arith_imm0<0b000, "add">;
- defm SUB_ZI : sve_int_arith_imm0<0b001, "sub">;
- defm SUBR_ZI : sve_int_arith_imm0<0b011, "subr">;
- defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd">;
- defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd">;
- defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub">;
- defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub">;
-
- defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad">;
- defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb">;
- defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla">;
- defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls">;
+ defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>;
+ defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>;
+ defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>;
+ defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat>;
+ defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat>;
+ defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat>;
+
+ defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>;
+ defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>;
+ defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>;
+ defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>;
+
+ defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", int_aarch64_sve_add>;
+ defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", int_aarch64_sve_sub>;
+ defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", int_aarch64_sve_subr>;
+
+ defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;
+ defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;
+ defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
+ defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;
+
+ defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>;
+ defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>;
+ defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>;
+ defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>;
+ defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>;
+ defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>;
+ defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>;
+
+ defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
+ defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
+ defm MLA_ZPmZZ : sve_int_mlas_vvv_pred<0b0, "mla", int_aarch64_sve_mla>;
+ defm MLS_ZPmZZ : sve_int_mlas_vvv_pred<0b1, "mls", int_aarch64_sve_mls>;
// SVE predicated integer reductions.
- defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv">;
- defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv">;
- defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv">;
- defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv">;
- defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv">;
- defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv">;
- defm ORV_VPZ : sve_int_reduce_2<0b000, "orv">;
- defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv">;
- defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv">;
-
- defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn">;
- defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon">;
- defm AND_ZI : sve_int_log_imm<0b10, "and", "bic">;
-
- defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", simm8>;
- defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", simm8>;
- defm UMAX_ZI : sve_int_arith_imm1<0b01, "umax", imm0_255>;
- defm UMIN_ZI : sve_int_arith_imm1<0b11, "umin", imm0_255>;
-
- defm MUL_ZI : sve_int_arith_imm2<"mul">;
- defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul">;
- defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh">;
- defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh">;
-
- defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv">;
- defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv">;
- defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr">;
- defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr">;
+ defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>;
+ defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>;
+ defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_pred>;
+ defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_pred>;
+ defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_pred>;
+ defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_pred>;
+ defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_pred>;
+ defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_pred>;
+ defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_pred>;
+
+ defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>;
+ defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
+ defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;
+
+ defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", smax>;
+ defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", smin>;
+ defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", umax>;
+ defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", umin>;
+
+ defm MUL_ZI : sve_int_arith_imm2<"mul", mul>;
+ defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>;
+ defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>;
+ defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>;
+
+ defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", int_aarch64_sve_sdiv>;
+ defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", int_aarch64_sve_udiv>;
+ defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", int_aarch64_sve_sdivr>;
+ defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", int_aarch64_sve_udivr>;
defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;
@@ -88,32 +154,32 @@ let Predicates = [HasSVE] in {
defm SDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b0, "sdot", int_aarch64_sve_sdot_lane>;
defm UDOT_ZZZI : sve_intx_dot_by_indexed_elem<0b1, "udot", int_aarch64_sve_udot_lane>;
- defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb">;
- defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb">;
- defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth">;
- defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth">;
- defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw">;
- defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw">;
- defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>;
- defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>;
-
- defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", null_frag>;
- defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", null_frag>;
+ defm SXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b000, "sxtb", int_aarch64_sve_sxtb>;
+ defm UXTB_ZPmZ : sve_int_un_pred_arit_0_h<0b001, "uxtb", int_aarch64_sve_uxtb>;
+ defm SXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b010, "sxth", int_aarch64_sve_sxth>;
+ defm UXTH_ZPmZ : sve_int_un_pred_arit_0_w<0b011, "uxth", int_aarch64_sve_uxth>;
+ defm SXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b100, "sxtw", int_aarch64_sve_sxtw>;
+ defm UXTW_ZPmZ : sve_int_un_pred_arit_0_d<0b101, "uxtw", int_aarch64_sve_uxtw>;
+ defm ABS_ZPmZ : sve_int_un_pred_arit_0< 0b110, "abs", int_aarch64_sve_abs>;
+ defm NEG_ZPmZ : sve_int_un_pred_arit_0< 0b111, "neg", int_aarch64_sve_neg>;
+
+ defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>;
+ defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>;
defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>;
- defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", null_frag>;
- defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", null_frag>;
- defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs">;
- defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg">;
+ defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>;
+ defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>;
+ defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>;
+ defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>;
- defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax">;
- defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax">;
- defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin">;
- defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin">;
- defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd">;
- defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd">;
+ defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", int_aarch64_sve_smax>;
+ defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", int_aarch64_sve_umax>;
+ defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", int_aarch64_sve_smin>;
+ defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", int_aarch64_sve_umin>;
+ defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>;
+ defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>;
- defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe">;
- defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte">;
+ defm FRECPE_ZZ : sve_fp_2op_u_zd<0b110, "frecpe", int_aarch64_sve_frecpe_x>;
+ defm FRSQRTE_ZZ : sve_fp_2op_u_zd<0b111, "frsqrte", int_aarch64_sve_frsqrte_x>;
defm FADD_ZPmI : sve_fp_2op_i_p_zds<0b000, "fadd", sve_fpimm_half_one>;
defm FSUB_ZPmI : sve_fp_2op_i_p_zds<0b001, "fsub", sve_fpimm_half_one>;
@@ -124,57 +190,57 @@ let Predicates = [HasSVE] in {
defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
- defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd">;
- defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub">;
- defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul">;
- defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr">;
- defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm">;
- defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm">;
- defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax">;
- defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin">;
- defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd">;
- defm FSCALE_ZPmZ : sve_fp_2op_p_zds<0b1001, "fscale">;
- defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx">;
- defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr">;
- defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv">;
-
- defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
- defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", null_frag>;
- defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", null_frag>;
- defm FTSMUL_ZZZ : sve_fp_3op_u_zd<0b011, "ftsmul", null_frag>;
- defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", null_frag>;
- defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", null_frag>;
-
- defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel">;
-
- defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd">;
- defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla">;
-
- defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla">;
- defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls">;
- defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla">;
- defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls">;
-
- defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad">;
- defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb">;
- defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad">;
- defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb">;
-
- defm FTMAD_ZZI : sve_fp_ftmad<"ftmad">;
-
- defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla">;
- defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls">;
-
- defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla">;
- defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul">;
+ defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", int_aarch64_sve_fadd>;
+ defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", int_aarch64_sve_fsub>;
+ defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", int_aarch64_sve_fmul>;
+ defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", int_aarch64_sve_fsubr>;
+ defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", int_aarch64_sve_fmaxnm>;
+ defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", int_aarch64_sve_fminnm>;
+ defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", int_aarch64_sve_fmax>;
+ defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", int_aarch64_sve_fmin>;
+ defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", int_aarch64_sve_fabd>;
+ defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;
+ defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", int_aarch64_sve_fmulx>;
+ defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", int_aarch64_sve_fdivr>;
+ defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", int_aarch64_sve_fdiv>;
+
+ defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
+ defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>;
+ defm FMUL_ZZZ : sve_fp_3op_u_zd<0b010, "fmul", fmul>;
+ defm FTSMUL_ZZZ : sve_fp_3op_u_zd_ftsmul<0b011, "ftsmul", int_aarch64_sve_ftsmul_x>;
+ defm FRECPS_ZZZ : sve_fp_3op_u_zd<0b110, "frecps", int_aarch64_sve_frecps_x>;
+ defm FRSQRTS_ZZZ : sve_fp_3op_u_zd<0b111, "frsqrts", int_aarch64_sve_frsqrts_x>;
+
+ defm FTSSEL_ZZZ : sve_int_bin_cons_misc_0_b<"ftssel", int_aarch64_sve_ftssel_x>;
+
+ defm FCADD_ZPmZ : sve_fp_fcadd<"fcadd", int_aarch64_sve_fcadd>;
+ defm FCMLA_ZPmZZ : sve_fp_fcmla<"fcmla", int_aarch64_sve_fcmla>;
+
+ defm FMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b00, "fmla", int_aarch64_sve_fmla>;
+ defm FMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b01, "fmls", int_aarch64_sve_fmls>;
+ defm FNMLA_ZPmZZ : sve_fp_3op_p_zds_a<0b10, "fnmla", int_aarch64_sve_fnmla>;
+ defm FNMLS_ZPmZZ : sve_fp_3op_p_zds_a<0b11, "fnmls", int_aarch64_sve_fnmls>;
+
+ defm FMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b00, "fmad", int_aarch64_sve_fmad>;
+ defm FMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b01, "fmsb", int_aarch64_sve_fmsb>;
+ defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>;
+ defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>;
+
+ defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
+
+ defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>;
+ defm FMLS_ZZZI : sve_fp_fma_by_indexed_elem<0b1, "fmls", int_aarch64_sve_fmls_lane>;
+
+ defm FCMLA_ZZZI : sve_fp_fcmla_by_indexed_elem<"fcmla", int_aarch64_sve_fcmla_lane>;
+ defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;
// SVE floating point reductions.
- defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda">;
- defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv">;
- defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv">;
- defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv">;
- defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv">;
- defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv">;
+ defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", int_aarch64_sve_fadda>;
+ defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", int_aarch64_sve_faddv>;
+ defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", int_aarch64_sve_fmaxnmv>;
+ defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", int_aarch64_sve_fminnmv>;
+ defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", int_aarch64_sve_fmaxv>;
+ defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", int_aarch64_sve_fminv>;
// Splat immediate (unpredicated)
defm DUP_ZI : sve_int_dup_imm<"dup">;
@@ -195,21 +261,21 @@ let Predicates = [HasSVE] in {
defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">;
// Select elements from either vector (predicated)
- defm SEL_ZPZZ : sve_int_sel_vvv<"sel">;
+ defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
- defm SPLICE_ZPZ : sve_int_perm_splice<"splice">;
- defm COMPACT_ZPZ : sve_int_perm_compact<"compact">;
- defm INSR_ZR : sve_int_perm_insrs<"insr">;
- defm INSR_ZV : sve_int_perm_insrv<"insr">;
- def EXT_ZZI : sve_int_perm_extract_i<"ext">;
+ defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>;
+ defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
+ defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
+ defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
+ defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
- defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit">;
- defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb">;
- defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh">;
- defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw">;
+ defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
+ defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
+ defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
+ defm REVW_ZPmZ : sve_int_perm_rev_revw<"revw", int_aarch64_sve_revw>;
- defm REV_PP : sve_int_perm_reverse_p<"rev">;
- defm REV_ZZ : sve_int_perm_reverse_z<"rev">;
+ defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>;
+ defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>;
defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
@@ -222,9 +288,7 @@ let Predicates = [HasSVE] in {
defm MOVPRFX_ZPzZ : sve_int_movprfx_pred_zero<0b000, "movprfx">;
defm MOVPRFX_ZPmZ : sve_int_movprfx_pred_merge<0b001, "movprfx">;
def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
- def FEXPA_ZZ_H : sve_int_bin_cons_misc_0_c<0b01000000, "fexpa", ZPR16>;
- def FEXPA_ZZ_S : sve_int_bin_cons_misc_0_c<0b10000000, "fexpa", ZPR32>;
- def FEXPA_ZZ_D : sve_int_bin_cons_misc_0_c<0b11000000, "fexpa", ZPR64>;
+ defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>;
def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">;
def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">;
@@ -243,36 +307,36 @@ let Predicates = [HasSVE] in {
def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
def PFALSE : sve_int_pfalse<0b000000, "pfalse">;
- defm PFIRST : sve_int_pfirst<0b00000, "pfirst">;
- defm PNEXT : sve_int_pnext<0b00110, "pnext">;
-
- def AND_PPzPP : sve_int_pred_log<0b0000, "and">;
- def BIC_PPzPP : sve_int_pred_log<0b0001, "bic">;
- def EOR_PPzPP : sve_int_pred_log<0b0010, "eor">;
- def SEL_PPPP : sve_int_pred_log<0b0011, "sel">;
- def ANDS_PPzPP : sve_int_pred_log<0b0100, "ands">;
- def BICS_PPzPP : sve_int_pred_log<0b0101, "bics">;
- def EORS_PPzPP : sve_int_pred_log<0b0110, "eors">;
- def ORR_PPzPP : sve_int_pred_log<0b1000, "orr">;
- def ORN_PPzPP : sve_int_pred_log<0b1001, "orn">;
- def NOR_PPzPP : sve_int_pred_log<0b1010, "nor">;
- def NAND_PPzPP : sve_int_pred_log<0b1011, "nand">;
- def ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs">;
- def ORNS_PPzPP : sve_int_pred_log<0b1101, "orns">;
- def NORS_PPzPP : sve_int_pred_log<0b1110, "nors">;
- def NANDS_PPzPP : sve_int_pred_log<0b1111, "nands">;
-
- defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta">;
- defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb">;
- defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta">;
- defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb">;
- defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta">;
- defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb">;
-
- defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta">;
- defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb">;
- defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta">;
- defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb">;
+ defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
+ defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
+
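+ // Logical operations on predicates. The trailing operand is the node used
+ // for instruction selection; the flag-setting forms (ands, bics, ...) take
+ // null_frag and therefore have no selection pattern attached here.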
+ defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z>;
+ defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>;
+ defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z>;
+ defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>;
+ defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>;
+ defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>;
+ defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>;
+ defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>;
+ defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>;
+ defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>;
+ defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>;
+ defm ORRS_PPzPP : sve_int_pred_log<0b1100, "orrs", null_frag>;
+ defm ORNS_PPzPP : sve_int_pred_log<0b1101, "orns", null_frag>;
+ defm NORS_PPzPP : sve_int_pred_log<0b1110, "nors", null_frag>;
+ defm NANDS_PPzPP : sve_int_pred_log<0b1111, "nands", null_frag>;
+
+ defm CLASTA_RPZ : sve_int_perm_clast_rz<0, "clasta", AArch64clasta_n>;
+ defm CLASTB_RPZ : sve_int_perm_clast_rz<1, "clastb", AArch64clastb_n>;
+ defm CLASTA_VPZ : sve_int_perm_clast_vz<0, "clasta", AArch64clasta_n>;
+ defm CLASTB_VPZ : sve_int_perm_clast_vz<1, "clastb", AArch64clastb_n>;
+ defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
+ defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;
+
+ defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
+ defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
+ defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
+ defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;
// continuous load with reg+immediate
defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
@@ -404,115 +468,115 @@ let Predicates = [HasSVE] in {
// Gathers using unscaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
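+ // Each gather defm is parameterised with the ISel nodes for the sign- and
+ // zero-extended (sxtw/uxtw) offset forms, the extending register operand
+ // classes, and the memory element type; null_frag is passed where no
+ // selection pattern is attached (the first-faulting ldff1* forms).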
- defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
- defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
- defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
- defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
- defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
- defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
- defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
- defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
- defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
- defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
+ defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
// Gathers using scaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
- defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
- defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
- defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
- defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
- defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
-
- // Gathers using scaled 32-bit pointers with offset, e.g.
+ defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+
+ // Gathers using 32-bit pointers with scaled offset, e.g.
// ld1h z0.s, p0/z, [z0.s, #16]
- defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31>;
- defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31>;
- defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31>;
- defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31>;
- defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2>;
- defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2>;
- defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2>;
- defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2>;
- defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4>;
- defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4>;
-
- // Gathers using scaled 64-bit pointers with offset, e.g.
+ defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv4i8>;
+ defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv4i8>;
+ defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv4i32>;
+
+ // Gathers using 64-bit pointers with scaled offset, e.g.
// ld1h z0.d, p0/z, [z0.d, #16]
- defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31>;
- defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31>;
- defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31>;
- defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2>;
- defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2>;
- defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2>;
- defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4>;
- defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4>;
- defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4>;
- defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8>;
- defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8>;
+ defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, null_frag, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, null_frag, nxv2i64>;
// Gathers using unscaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d]
- defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb">;
- defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb">;
- defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b">;
- defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b">;
- defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh">;
- defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh">;
- defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h">;
- defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h">;
- defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw">;
- defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw">;
- defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w">;
- defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w">;
- defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d">;
- defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d">;
+ defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>;
// Gathers using scaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
- defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", ZPR64ExtLSL16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", ZPR64ExtLSL16>;
- defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", ZPR64ExtLSL16>;
- defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", ZPR64ExtLSL16>;
- defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", ZPR64ExtLSL32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", ZPR64ExtLSL32>;
- defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", ZPR64ExtLSL32>;
- defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", ZPR64ExtLSL32>;
- defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", ZPR64ExtLSL64>;
- defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", ZPR64ExtLSL64>;
+ defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>;
// Gathers using unscaled 32-bit offsets unpacked in 64-bits elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
- defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
- defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
- defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
- defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
+ defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
// Gathers using scaled 32-bit offsets unpacked in 64-bits elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
- defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh",ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
- defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
- defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
- defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw",ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
- defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
- defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
- defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
- defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+ defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Non-temporal contiguous loads (register + immediate)
defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
@@ -550,51 +614,55 @@ let Predicates = [HasSVE] in {
defm ST1W_D : sve_mem_cst_ss<0b1011, "st1w", Z_d, ZPR64, GPR64NoXZRshifted32>;
defm ST1D : sve_mem_cst_ss<0b1111, "st1d", Z_d, ZPR64, GPR64NoXZRshifted64>;
- // Scatters using unscaled 32-bit offsets, e.g.
- // st1h z0.s, p0, [x0, z0.s, uxtw]
- // and unpacked:
+ // Scatters using unpacked, unscaled 32-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, uxtw]
- defm SST1B_D : sve_mem_sst_sv_32_unscaled<0b000, "st1b", Z_d, ZPR64, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
- defm SST1B_S : sve_mem_sst_sv_32_unscaled<0b001, "st1b", Z_s, ZPR32, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
- defm SST1H_D : sve_mem_sst_sv_32_unscaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm SST1H_S : sve_mem_sst_sv_32_unscaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
- defm SST1W_D : sve_mem_sst_sv_32_unscaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
- defm SST1W : sve_mem_sst_sv_32_unscaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW8, ZPR32ExtUXTW8>;
- defm SST1D : sve_mem_sst_sv_32_unscaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW8, ZPR64ExtUXTW8>;
-
- // Scatters using scaled 32-bit offsets, e.g.
+ defm SST1B_D : sve_mem_64b_sst_sv_32_unscaled<0b000, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm SST1H_D : sve_mem_64b_sst_sv_32_unscaled<0b010, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm SST1W_D : sve_mem_64b_sst_sv_32_unscaled<0b100, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm SST1D : sve_mem_64b_sst_sv_32_unscaled<0b110, "st1d", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
+
+ // Scatters using packed, unscaled 32-bit offsets, e.g.
+ // st1h z0.s, p0, [x0, z0.s, uxtw]
+ defm SST1B_S : sve_mem_32b_sst_sv_32_unscaled<0b001, "st1b", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm SST1H_S : sve_mem_32b_sst_sv_32_unscaled<0b011, "st1h", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm SST1W : sve_mem_32b_sst_sv_32_unscaled<0b101, "st1w", AArch64st1_scatter_sxtw, AArch64st1_scatter_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
+
+ // Scatters using packed, scaled 32-bit offsets, e.g.
// st1h z0.s, p0, [x0, z0.s, uxtw #1]
- // and unpacked:
+ defm SST1H_S : sve_mem_32b_sst_sv_32_scaled<0b011, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm SST1W : sve_mem_32b_sst_sv_32_scaled<0b101, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+
+ // Scatters using unpacked, scaled 32-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, uxtw #1]
- defm SST1H_D : sve_mem_sst_sv_32_scaled<0b010, "st1h", Z_d, ZPR64, ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
- defm SST1H_S : sve_mem_sst_sv_32_scaled<0b011, "st1h", Z_s, ZPR32, ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
- defm SST1W_D : sve_mem_sst_sv_32_scaled<0b100, "st1w", Z_d, ZPR64, ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
- defm SST1W : sve_mem_sst_sv_32_scaled<0b101, "st1w", Z_s, ZPR32, ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
- defm SST1D : sve_mem_sst_sv_32_scaled<0b110, "st1d", Z_d, ZPR64, ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+ defm SST1H_D : sve_mem_64b_sst_sv_32_scaled<0b010, "st1h", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm SST1W_D : sve_mem_64b_sst_sv_32_scaled<0b100, "st1w", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm SST1D : sve_mem_64b_sst_sv_32_scaled<0b110, "st1d", AArch64st1_scatter_sxtw_scaled, AArch64st1_scatter_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Scatters using 32/64-bit pointers with offset, e.g.
// st1h z0.s, p0, [z0.s, #16]
+ defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", timm0_31, AArch64st1_scatter_imm, nxv4i8>;
+ defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv4i16>;
+ defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv4i32>;
+
+ // Scatters using 32/64-bit pointers with offset, e.g.
// st1h z0.d, p0, [z0.d, #16]
- defm SST1B_D : sve_mem_sst_vi_ptrs<0b000, "st1b", Z_d, ZPR64, imm0_31>;
- defm SST1B_S : sve_mem_sst_vi_ptrs<0b001, "st1b", Z_s, ZPR32, imm0_31>;
- defm SST1H_D : sve_mem_sst_vi_ptrs<0b010, "st1h", Z_d, ZPR64, uimm5s2>;
- defm SST1H_S : sve_mem_sst_vi_ptrs<0b011, "st1h", Z_s, ZPR32, uimm5s2>;
- defm SST1W_D : sve_mem_sst_vi_ptrs<0b100, "st1w", Z_d, ZPR64, uimm5s4>;
- defm SST1W : sve_mem_sst_vi_ptrs<0b101, "st1w", Z_s, ZPR32, uimm5s4>;
- defm SST1D : sve_mem_sst_vi_ptrs<0b110, "st1d", Z_d, ZPR64, uimm5s8>;
+ defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", timm0_31, AArch64st1_scatter_imm, nxv2i8>;
+ defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv2i16>;
+ defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv2i32>;
+ defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", tuimm5s8, AArch64st1_scatter_imm, nxv2i64>;
// Scatters using unscaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d]
- defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b">;
- defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h">;
- defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w">;
- defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d">;
+ defm SST1B_D : sve_mem_sst_sv_64_unscaled<0b00, "st1b", AArch64st1_scatter, nxv2i8>;
+ defm SST1H_D : sve_mem_sst_sv_64_unscaled<0b01, "st1h", AArch64st1_scatter, nxv2i16>;
+ defm SST1W_D : sve_mem_sst_sv_64_unscaled<0b10, "st1w", AArch64st1_scatter, nxv2i32>;
+ defm SST1D : sve_mem_sst_sv_64_unscaled<0b11, "st1d", AArch64st1_scatter, nxv2i64>;
// Scatters using scaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d, lsl #1]
- defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", ZPR64ExtLSL16>;
- defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", ZPR64ExtLSL32>;
- defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", ZPR64ExtLSL64>;
+ defm SST1H_D_SCALED : sve_mem_sst_sv_64_scaled<0b01, "st1h", AArch64st1_scatter_scaled, ZPR64ExtLSL16, nxv2i16>;
+ defm SST1W_D_SCALED : sve_mem_sst_sv_64_scaled<0b10, "st1w", AArch64st1_scatter_scaled, ZPR64ExtLSL32, nxv2i32>;
+ defm SST1D_SCALED : sve_mem_sst_sv_64_scaled<0b11, "st1d", AArch64st1_scatter_scaled, ZPR64ExtLSL64, nxv2i64>;
// ST(2|3|4) structured stores (register + immediate)
defm ST2B_IMM : sve_mem_est_si<0b00, 0b01, ZZ_b, "st2b", simm4s2>;
@@ -693,58 +761,58 @@ let Predicates = [HasSVE] in {
defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;
- defm TBL_ZZZ : sve_int_perm_tbl<"tbl">;
-
- defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1">;
- defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2">;
- defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1">;
- defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2">;
- defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1">;
- defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2">;
-
- defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1">;
- defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2">;
- defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1">;
- defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2">;
- defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1">;
- defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2">;
-
- defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs">;
- defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi">;
- defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge">;
- defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt">;
- defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq">;
- defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne">;
-
- defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq">;
- defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne">;
- defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge">;
- defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt">;
- defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt">;
- defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple">;
- defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs">;
- defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi">;
- defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo">;
- defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls">;
-
- defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge">;
- defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt">;
- defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt">;
- defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple">;
- defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq">;
- defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne">;
- defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs">;
- defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi">;
- defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo">;
- defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls">;
-
- defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge">;
- defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt">;
- defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq">;
- defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne">;
- defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo">;
- defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge">;
- defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt">;
+ defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
+
+ defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
+ defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>;
+ defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>;
+ defm UZP2_ZZZ : sve_int_perm_bin_perm_zz<0b011, "uzp2", AArch64uzp2>;
+ defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>;
+ defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>;
+
+ defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>;
+ defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>;
+ defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>;
+ defm UZP2_PPP : sve_int_perm_bin_perm_pp<0b011, "uzp2", AArch64uzp2>;
+ defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>;
+ defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>;
+
+ defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", int_aarch64_sve_cmphs, SETUGE>;
+ defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", int_aarch64_sve_cmphi, SETUGT>;
+ defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", int_aarch64_sve_cmpge, SETGE>;
+ defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", int_aarch64_sve_cmpgt, SETGT>;
+ defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", int_aarch64_sve_cmpeq, SETEQ>;
+ defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", int_aarch64_sve_cmpne, SETNE>;
+
+ defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>;
+ defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>;
+ defm CMPGE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b000, "cmpge", int_aarch64_sve_cmpge_wide>;
+ defm CMPGT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b001, "cmpgt", int_aarch64_sve_cmpgt_wide>;
+ defm CMPLT_WIDE_PPzZZ : sve_int_cmp_1_wide<0b010, "cmplt", int_aarch64_sve_cmplt_wide>;
+ defm CMPLE_WIDE_PPzZZ : sve_int_cmp_1_wide<0b011, "cmple", int_aarch64_sve_cmple_wide>;
+ defm CMPHS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b100, "cmphs", int_aarch64_sve_cmphs_wide>;
+ defm CMPHI_WIDE_PPzZZ : sve_int_cmp_1_wide<0b101, "cmphi", int_aarch64_sve_cmphi_wide>;
+ defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>;
+ defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>;
+
+ defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, int_aarch64_sve_cmpge>;
+ defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, int_aarch64_sve_cmpgt>;
+ defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, null_frag, int_aarch64_sve_cmpgt>;
+ defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, null_frag, int_aarch64_sve_cmpge>;
+ defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, int_aarch64_sve_cmpeq>;
+ defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, int_aarch64_sve_cmpne>;
+ defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, int_aarch64_sve_cmphs>;
+ defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, int_aarch64_sve_cmphi>;
+ defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, null_frag, int_aarch64_sve_cmphi>;
+ defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, null_frag, int_aarch64_sve_cmphs>;
+
+ defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge", int_aarch64_sve_fcmpge>;
+ defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt", int_aarch64_sve_fcmpgt>;
+ defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq", int_aarch64_sve_fcmpeq>;
+ defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne", int_aarch64_sve_fcmpne>;
+ defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo", int_aarch64_sve_fcmpuo>;
+ defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
+ defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;
defm FCMGE_PPzZ0 : sve_fp_2op_p_pd<0b000, "fcmge">;
defm FCMGT_PPzZ0 : sve_fp_2op_p_pd<0b001, "fcmgt">;
@@ -753,15 +821,15 @@ let Predicates = [HasSVE] in {
defm FCMEQ_PPzZ0 : sve_fp_2op_p_pd<0b100, "fcmeq">;
defm FCMNE_PPzZ0 : sve_fp_2op_p_pd<0b110, "fcmne">;
- defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt">;
- defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele">;
- defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo">;
- defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels">;
+ defm WHILELT_PWW : sve_int_while4_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
+ defm WHILELE_PWW : sve_int_while4_rr<0b011, "whilele", int_aarch64_sve_whilele>;
+ defm WHILELO_PWW : sve_int_while4_rr<0b110, "whilelo", int_aarch64_sve_whilelo>;
+ defm WHILELS_PWW : sve_int_while4_rr<0b111, "whilels", int_aarch64_sve_whilels>;
- defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt">;
- defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele">;
- defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo">;
- defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels">;
+ defm WHILELT_PXX : sve_int_while8_rr<0b010, "whilelt", int_aarch64_sve_whilelt>;
+ defm WHILELE_PXX : sve_int_while8_rr<0b011, "whilele", int_aarch64_sve_whilele>;
+ defm WHILELO_PXX : sve_int_while8_rr<0b110, "whilelo", int_aarch64_sve_whilelo>;
+ defm WHILELS_PXX : sve_int_while8_rr<0b111, "whilels", int_aarch64_sve_whilels>;
def CTERMEQ_WW : sve_int_cterm<0b0, 0b0, "ctermeq", GPR32>;
def CTERMNE_WW : sve_int_cterm<0b0, 0b1, "ctermne", GPR32>;
@@ -772,11 +840,11 @@ let Predicates = [HasSVE] in {
def ADDVL_XXI : sve_int_arith_vl<0b0, "addvl">;
def ADDPL_XXI : sve_int_arith_vl<0b1, "addpl">;
- defm CNTB_XPiI : sve_int_count<0b000, "cntb">;
- defm CNTH_XPiI : sve_int_count<0b010, "cnth">;
- defm CNTW_XPiI : sve_int_count<0b100, "cntw">;
- defm CNTD_XPiI : sve_int_count<0b110, "cntd">;
- defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp">;
+ defm CNTB_XPiI : sve_int_count<0b000, "cntb", int_aarch64_sve_cntb>;
+ defm CNTH_XPiI : sve_int_count<0b010, "cnth", int_aarch64_sve_cnth>;
+ defm CNTW_XPiI : sve_int_count<0b100, "cntw", int_aarch64_sve_cntw>;
+ defm CNTD_XPiI : sve_int_count<0b110, "cntd", int_aarch64_sve_cntd>;
+ defm CNTP_XPP : sve_int_pcount_pred<0b0000, "cntp", int_aarch64_sve_cntp>;
defm INCB_XPiI : sve_int_pred_pattern_a<0b000, "incb">;
defm DECB_XPiI : sve_int_pred_pattern_a<0b001, "decb">;
@@ -787,76 +855,76 @@ let Predicates = [HasSVE] in {
defm INCD_XPiI : sve_int_pred_pattern_a<0b110, "incd">;
defm DECD_XPiI : sve_int_pred_pattern_a<0b111, "decd">;
- defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb">;
- defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb">;
- defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb">;
- defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb">;
- defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb">;
- defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb">;
- defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb">;
- defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb">;
-
- defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch">;
- defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch">;
- defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech">;
- defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech">;
- defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch">;
- defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch">;
- defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech">;
- defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech">;
-
- defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw">;
- defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw">;
- defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw">;
- defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw">;
- defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw">;
- defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw">;
- defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw">;
- defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw">;
-
- defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd">;
- defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd">;
- defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd">;
- defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd">;
- defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd">;
- defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd">;
- defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd">;
- defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd">;
-
- defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16>;
- defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16>;
- defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16>;
- defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16>;
+ defm SQINCB_XPiWdI : sve_int_pred_pattern_b_s32<0b00000, "sqincb", int_aarch64_sve_sqincb_n32>;
+ defm UQINCB_WPiI : sve_int_pred_pattern_b_u32<0b00001, "uqincb", int_aarch64_sve_uqincb_n32>;
+ defm SQDECB_XPiWdI : sve_int_pred_pattern_b_s32<0b00010, "sqdecb", int_aarch64_sve_sqdecb_n32>;
+ defm UQDECB_WPiI : sve_int_pred_pattern_b_u32<0b00011, "uqdecb", int_aarch64_sve_uqdecb_n32>;
+ defm SQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00100, "sqincb", int_aarch64_sve_sqincb_n64>;
+ defm UQINCB_XPiI : sve_int_pred_pattern_b_x64<0b00101, "uqincb", int_aarch64_sve_uqincb_n64>;
+ defm SQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00110, "sqdecb", int_aarch64_sve_sqdecb_n64>;
+ defm UQDECB_XPiI : sve_int_pred_pattern_b_x64<0b00111, "uqdecb", int_aarch64_sve_uqdecb_n64>;
+
+ defm SQINCH_XPiWdI : sve_int_pred_pattern_b_s32<0b01000, "sqinch", int_aarch64_sve_sqinch_n32>;
+ defm UQINCH_WPiI : sve_int_pred_pattern_b_u32<0b01001, "uqinch", int_aarch64_sve_uqinch_n32>;
+ defm SQDECH_XPiWdI : sve_int_pred_pattern_b_s32<0b01010, "sqdech", int_aarch64_sve_sqdech_n32>;
+ defm UQDECH_WPiI : sve_int_pred_pattern_b_u32<0b01011, "uqdech", int_aarch64_sve_uqdech_n32>;
+ defm SQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01100, "sqinch", int_aarch64_sve_sqinch_n64>;
+ defm UQINCH_XPiI : sve_int_pred_pattern_b_x64<0b01101, "uqinch", int_aarch64_sve_uqinch_n64>;
+ defm SQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01110, "sqdech", int_aarch64_sve_sqdech_n64>;
+ defm UQDECH_XPiI : sve_int_pred_pattern_b_x64<0b01111, "uqdech", int_aarch64_sve_uqdech_n64>;
+
+ defm SQINCW_XPiWdI : sve_int_pred_pattern_b_s32<0b10000, "sqincw", int_aarch64_sve_sqincw_n32>;
+ defm UQINCW_WPiI : sve_int_pred_pattern_b_u32<0b10001, "uqincw", int_aarch64_sve_uqincw_n32>;
+ defm SQDECW_XPiWdI : sve_int_pred_pattern_b_s32<0b10010, "sqdecw", int_aarch64_sve_sqdecw_n32>;
+ defm UQDECW_WPiI : sve_int_pred_pattern_b_u32<0b10011, "uqdecw", int_aarch64_sve_uqdecw_n32>;
+ defm SQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10100, "sqincw", int_aarch64_sve_sqincw_n64>;
+ defm UQINCW_XPiI : sve_int_pred_pattern_b_x64<0b10101, "uqincw", int_aarch64_sve_uqincw_n64>;
+ defm SQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10110, "sqdecw", int_aarch64_sve_sqdecw_n64>;
+ defm UQDECW_XPiI : sve_int_pred_pattern_b_x64<0b10111, "uqdecw", int_aarch64_sve_uqdecw_n64>;
+
+ defm SQINCD_XPiWdI : sve_int_pred_pattern_b_s32<0b11000, "sqincd", int_aarch64_sve_sqincd_n32>;
+ defm UQINCD_WPiI : sve_int_pred_pattern_b_u32<0b11001, "uqincd", int_aarch64_sve_uqincd_n32>;
+ defm SQDECD_XPiWdI : sve_int_pred_pattern_b_s32<0b11010, "sqdecd", int_aarch64_sve_sqdecd_n32>;
+ defm UQDECD_WPiI : sve_int_pred_pattern_b_u32<0b11011, "uqdecd", int_aarch64_sve_uqdecd_n32>;
+ defm SQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11100, "sqincd", int_aarch64_sve_sqincd_n64>;
+ defm UQINCD_XPiI : sve_int_pred_pattern_b_x64<0b11101, "uqincd", int_aarch64_sve_uqincd_n64>;
+ defm SQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11110, "sqdecd", int_aarch64_sve_sqdecd_n64>;
+ defm UQDECD_XPiI : sve_int_pred_pattern_b_x64<0b11111, "uqdecd", int_aarch64_sve_uqdecd_n64>;
+
+ defm SQINCH_ZPiI : sve_int_countvlv<0b01000, "sqinch", ZPR16, int_aarch64_sve_sqinch, nxv8i16>;
+ defm UQINCH_ZPiI : sve_int_countvlv<0b01001, "uqinch", ZPR16, int_aarch64_sve_uqinch, nxv8i16>;
+ defm SQDECH_ZPiI : sve_int_countvlv<0b01010, "sqdech", ZPR16, int_aarch64_sve_sqdech, nxv8i16>;
+ defm UQDECH_ZPiI : sve_int_countvlv<0b01011, "uqdech", ZPR16, int_aarch64_sve_uqdech, nxv8i16>;
defm INCH_ZPiI : sve_int_countvlv<0b01100, "inch", ZPR16>;
defm DECH_ZPiI : sve_int_countvlv<0b01101, "dech", ZPR16>;
- defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32>;
- defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32>;
- defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32>;
- defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32>;
+ defm SQINCW_ZPiI : sve_int_countvlv<0b10000, "sqincw", ZPR32, int_aarch64_sve_sqincw, nxv4i32>;
+ defm UQINCW_ZPiI : sve_int_countvlv<0b10001, "uqincw", ZPR32, int_aarch64_sve_uqincw, nxv4i32>;
+ defm SQDECW_ZPiI : sve_int_countvlv<0b10010, "sqdecw", ZPR32, int_aarch64_sve_sqdecw, nxv4i32>;
+ defm UQDECW_ZPiI : sve_int_countvlv<0b10011, "uqdecw", ZPR32, int_aarch64_sve_uqdecw, nxv4i32>;
defm INCW_ZPiI : sve_int_countvlv<0b10100, "incw", ZPR32>;
defm DECW_ZPiI : sve_int_countvlv<0b10101, "decw", ZPR32>;
- defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64>;
- defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64>;
- defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64>;
- defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64>;
+ defm SQINCD_ZPiI : sve_int_countvlv<0b11000, "sqincd", ZPR64, int_aarch64_sve_sqincd, nxv2i64>;
+ defm UQINCD_ZPiI : sve_int_countvlv<0b11001, "uqincd", ZPR64, int_aarch64_sve_uqincd, nxv2i64>;
+ defm SQDECD_ZPiI : sve_int_countvlv<0b11010, "sqdecd", ZPR64, int_aarch64_sve_sqdecd, nxv2i64>;
+ defm UQDECD_ZPiI : sve_int_countvlv<0b11011, "uqdecd", ZPR64, int_aarch64_sve_uqdecd, nxv2i64>;
defm INCD_ZPiI : sve_int_countvlv<0b11100, "incd", ZPR64>;
defm DECD_ZPiI : sve_int_countvlv<0b11101, "decd", ZPR64>;
- defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp">;
- defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp">;
- defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp">;
- defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp">;
- defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp">;
- defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp">;
- defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp">;
- defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp">;
+ defm SQINCP_XPWd : sve_int_count_r_s32<0b00000, "sqincp", int_aarch64_sve_sqincp_n32>;
+ defm SQINCP_XP : sve_int_count_r_x64<0b00010, "sqincp", int_aarch64_sve_sqincp_n64>;
+ defm UQINCP_WP : sve_int_count_r_u32<0b00100, "uqincp", int_aarch64_sve_uqincp_n32>;
+ defm UQINCP_XP : sve_int_count_r_x64<0b00110, "uqincp", int_aarch64_sve_uqincp_n64>;
+ defm SQDECP_XPWd : sve_int_count_r_s32<0b01000, "sqdecp", int_aarch64_sve_sqdecp_n32>;
+ defm SQDECP_XP : sve_int_count_r_x64<0b01010, "sqdecp", int_aarch64_sve_sqdecp_n64>;
+ defm UQDECP_WP : sve_int_count_r_u32<0b01100, "uqdecp", int_aarch64_sve_uqdecp_n32>;
+ defm UQDECP_XP : sve_int_count_r_x64<0b01110, "uqdecp", int_aarch64_sve_uqdecp_n64>;
defm INCP_XP : sve_int_count_r_x64<0b10000, "incp">;
defm DECP_XP : sve_int_count_r_x64<0b10100, "decp">;
- defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp">;
- defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp">;
- defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp">;
- defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp">;
+ defm SQINCP_ZP : sve_int_count_v<0b00000, "sqincp", int_aarch64_sve_sqincp>;
+ defm UQINCP_ZP : sve_int_count_v<0b00100, "uqincp", int_aarch64_sve_uqincp>;
+ defm SQDECP_ZP : sve_int_count_v<0b01000, "sqdecp", int_aarch64_sve_sqdecp>;
+ defm UQDECP_ZP : sve_int_count_v<0b01100, "uqdecp", int_aarch64_sve_uqdecp>;
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
@@ -878,63 +946,63 @@ let Predicates = [HasSVE] in {
defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">;
defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">;
defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
- defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd">;
-
- defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr">;
- defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr">;
- defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl">;
- defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr">;
- defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr">;
- defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr">;
-
- defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr">;
- defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr">;
- defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl">;
-
- def FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, ElementSizeS>;
- def FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, ElementSizeS>;
- def SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, ElementSizeH>;
- def SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, ElementSizeS>;
- def UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, ElementSizeS>;
- def UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, ElementSizeH>;
- def FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, ElementSizeH>;
- def FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, ElementSizeS>;
- def FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, ElementSizeH>;
- def FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, ElementSizeS>;
- def FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, ElementSizeD>;
- def FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, ElementSizeD>;
- def FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, ElementSizeD>;
- def FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, ElementSizeD>;
- def SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, ElementSizeD>;
- def UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, ElementSizeD>;
- def UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, ElementSizeS>;
- def SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, ElementSizeD>;
- def SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, ElementSizeS>;
- def SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, ElementSizeD>;
- def UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, ElementSizeD>;
- def UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, ElementSizeD>;
- def SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, ElementSizeD>;
- def UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, ElementSizeD>;
- def FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, ElementSizeD>;
- def FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, ElementSizeD>;
- def FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, ElementSizeD>;
- def FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, ElementSizeS>;
- def FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, ElementSizeD>;
- def FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, ElementSizeS>;
- def FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, ElementSizeD>;
- def FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, ElementSizeD>;
- def FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, ElementSizeD>;
- def FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, ElementSizeD>;
-
- defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn">;
- defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp">;
- defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm">;
- defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz">;
- defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta">;
- defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx">;
- defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti">;
- defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx">;
- defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt">;
+ defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", int_aarch64_sve_asrd>;
+
+ defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", int_aarch64_sve_asr>;
+ defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", int_aarch64_sve_lsr>;
+ defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", int_aarch64_sve_lsl>;
+ defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", null_frag>;
+ defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", null_frag>;
+ defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", null_frag>;
+
+ defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>;
+ defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>;
+ defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>;
+
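+ // Predicated conversions. Each defm supplies the source and destination
+ // register classes, the intrinsic used for selection, and the destination,
+ // predicate and source vector types of that intrinsic.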
+ defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv16i1, nxv4f32, ElementSizeS>;
+ defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv16i1, nxv8f16, ElementSizeS>;
+ defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
+ defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
+ defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
+ defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
+ defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
+ defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
+ defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv16i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv16i1, nxv8f16, ElementSizeD>;
+ defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv16i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv16i1, nxv4f32, ElementSizeD>;
+ defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>;
+ defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>;
+ defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>;
+ defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>;
+ defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>;
+ defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>;
+ defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
+ defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>;
+ defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>;
+ defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>;
+ defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>;
+ defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>;
+ defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>;
+ defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>;
+ defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>;
+ defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
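+  // The _XtoY suffix encodes the source (X) and destination (Y) element sizes;
+  // for example, FCVT_ZPmZ_DtoS converts double-word (f64) elements to
+  // single-word (f32) elements.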
+
+ defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>;
+ defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>;
+ defm FRINTM_ZPmZ : sve_fp_2op_p_zd_HSD<0b00010, "frintm", int_aarch64_sve_frintm>;
+ defm FRINTZ_ZPmZ : sve_fp_2op_p_zd_HSD<0b00011, "frintz", int_aarch64_sve_frintz>;
+ defm FRINTA_ZPmZ : sve_fp_2op_p_zd_HSD<0b00100, "frinta", int_aarch64_sve_frinta>;
+ defm FRINTX_ZPmZ : sve_fp_2op_p_zd_HSD<0b00110, "frintx", int_aarch64_sve_frintx>;
+ defm FRINTI_ZPmZ : sve_fp_2op_p_zd_HSD<0b00111, "frinti", int_aarch64_sve_frinti>;
+ defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>;
+ defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>;
// InstAliases
def : InstAlias<"mov $Zd, $Zn",
@@ -1021,6 +1089,22 @@ let Predicates = [HasSVE] in {
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+ def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)),
+ (PTEST_PP PPR:$pg, PPR:$src)>;
+ def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)),
+ (PTEST_PP PPR:$pg, PPR:$src)>;
+ def : Pat<(AArch64ptest (nxv4i1 PPR:$pg), (nxv4i1 PPR:$src)),
+ (PTEST_PP PPR:$pg, PPR:$src)>;
+ def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
+ (PTEST_PP PPR:$pg, PPR:$src)>;
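+  // PTEST has no per-element-size variants, so every predicate element width
+  // maps to the same PTEST_PP instruction.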
+
+ def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i16), (SXTH_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
+ def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
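+  // sext_inreg is selected to the merging SXT[BHW] instructions with an
+  // all-active governing predicate (PTRUE pattern 31 is ALL), so the
+  // IMPLICIT_DEF merge operand is never observed.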
+
def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
@@ -1070,6 +1154,83 @@ let Predicates = [HasSVE] in {
def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ // Add more complex addressing modes here as required
+ multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
+ Instruction RegImmInst> {
+
+ def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
+ (RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
+ }
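+  // For illustration only, a rough sketch of the expansion (exact record names
+  // are left to TableGen): an instantiation such as
+  //   defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D_IMM>;
+  // yields approximately
+  //   def : Pat<(nxv2i64 (nonext_masked_load GPR64:$base, (nxv2i1 PPR:$gp),
+  //                                          (SVEDup0Undef))),
+  //             (LD1D_IMM PPR:$gp, GPR64:$base, (i64 0))>;
+  // i.e. a masked load whose passthru is zero or undef (per SVEDup0Undef) is
+  // selected to the contiguous load with an immediate offset of 0.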
+
+ // 2-element contiguous loads
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D_IMM>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D_IMM>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D_IMM>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D_IMM>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D_IMM>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D_IMM>;
+ defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D_IMM>;
+ defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D_IMM>;
+ defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D_IMM>;
+ defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D_IMM>;
+
+ // 4-element contiguous loads
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S_IMM>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S_IMM>;
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S_IMM>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S_IMM>;
+ defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W_IMM>;
+ defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S_IMM>;
+ defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W_IMM>;
+
+ // 8-element contiguous loads
+ defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H_IMM>;
+ defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H_IMM>;
+ defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H_IMM>;
+ defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H_IMM>;
+
+ // 16-element contiguous loads
+ defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B_IMM>;
+
+ multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
+ Instruction RegImmInst> {
+ def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
+ (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
+ }
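+  // As with pred_load, a rough sketch of the expansion: an instantiation such as
+  //   defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
+  // yields approximately
+  //   def : Pat<(nontrunc_masked_store (nxv2i64 ZPR:$vec), GPR64:$base,
+  //                                    (nxv2i1 PPR:$gp)),
+  //             (ST1D_IMM ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;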
+
+ // 2-element contiguous stores
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D_IMM>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D_IMM>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D_IMM>;
+ defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
+ defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D_IMM>;
+ defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D_IMM>;
+ defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
+
+ // 4-element contiguous stores
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S_IMM>;
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S_IMM>;
+ defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
+ defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S_IMM>;
+ defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
+
+ // 8-element contiguous stores
+ defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H_IMM>;
+ defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
+ defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
+
+ // 16-element contiguous stores
+ defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B_IMM>;
+
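+  // Non-temporal contiguous loads and stores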
+ defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRI>;
+ defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRI>;
+ defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRI>;
+ defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRI>;
+
+ defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRI>;
+ defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRI>;
+ defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRI>;
+ defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRI>;
}
let Predicates = [HasSVE2] in {
@@ -1286,46 +1447,46 @@ let Predicates = [HasSVE2] in {
defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;
// SVE2 bitwise shift right narrow (bottom)
- defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb">;
- defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb">;
- defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb">;
- defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb">;
- defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb">;
- defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb">;
- defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb">;
- defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb">;
+ defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>;
+ defm SQRSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b001, "sqrshrunb", int_aarch64_sve_sqrshrunb>;
+ defm SHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b010, "shrnb", int_aarch64_sve_shrnb>;
+ defm RSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b011, "rshrnb", int_aarch64_sve_rshrnb>;
+ defm SQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b100, "sqshrnb", int_aarch64_sve_sqshrnb>;
+ defm SQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b101, "sqrshrnb", int_aarch64_sve_sqrshrnb>;
+ defm UQSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b110, "uqshrnb", int_aarch64_sve_uqshrnb>;
+ defm UQRSHRNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b111, "uqrshrnb", int_aarch64_sve_uqrshrnb>;
// SVE2 bitwise shift right narrow (top)
- defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt">;
- defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt">;
- defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt">;
- defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt">;
- defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt">;
- defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt">;
- defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt">;
- defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt">;
+ defm SQSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b000, "sqshrunt", int_aarch64_sve_sqshrunt>;
+ defm SQRSHRUNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b001, "sqrshrunt", int_aarch64_sve_sqrshrunt>;
+ defm SHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b010, "shrnt", int_aarch64_sve_shrnt>;
+ defm RSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b011, "rshrnt", int_aarch64_sve_rshrnt>;
+ defm SQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b100, "sqshrnt", int_aarch64_sve_sqshrnt>;
+ defm SQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b101, "sqrshrnt", int_aarch64_sve_sqrshrnt>;
+ defm UQSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b110, "uqshrnt", int_aarch64_sve_uqshrnt>;
+ defm UQRSHRNT_ZZI : sve2_int_bin_shift_imm_right_narrow_top<0b111, "uqrshrnt", int_aarch64_sve_uqrshrnt>;
// SVE2 integer add/subtract narrow high part (bottom)
- defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb">;
- defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb">;
- defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb">;
- defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb">;
+ defm ADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b00, "addhnb", int_aarch64_sve_addhnb>;
+ defm RADDHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b01, "raddhnb", int_aarch64_sve_raddhnb>;
+ defm SUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b10, "subhnb", int_aarch64_sve_subhnb>;
+ defm RSUBHNB_ZZZ : sve2_int_addsub_narrow_high_bottom<0b11, "rsubhnb", int_aarch64_sve_rsubhnb>;
// SVE2 integer add/subtract narrow high part (top)
- defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt">;
- defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt">;
- defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt">;
- defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt">;
+ defm ADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b00, "addhnt", int_aarch64_sve_addhnt>;
+ defm RADDHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b01, "raddhnt", int_aarch64_sve_raddhnt>;
+ defm SUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b10, "subhnt", int_aarch64_sve_subhnt>;
+ defm RSUBHNT_ZZZ : sve2_int_addsub_narrow_high_top<0b11, "rsubhnt", int_aarch64_sve_rsubhnt>;
// SVE2 saturating extract narrow (bottom)
- defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb">;
- defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb">;
- defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb">;
+ defm SQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b00, "sqxtnb", int_aarch64_sve_sqxtnb>;
+ defm UQXTNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b01, "uqxtnb", int_aarch64_sve_uqxtnb>;
+ defm SQXTUNB_ZZ : sve2_int_sat_extract_narrow_bottom<0b10, "sqxtunb", int_aarch64_sve_sqxtunb>;
// SVE2 saturating extract narrow (top)
- defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt">;
- defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt">;
- defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt">;
+ defm SQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b00, "sqxtnt", int_aarch64_sve_sqxtnt>;
+ defm UQXTNT_ZZ : sve2_int_sat_extract_narrow_top<0b01, "uqxtnt", int_aarch64_sve_uqxtnt>;
+ defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>;
// SVE2 character match
defm MATCH_PPzZZ : sve2_char_match<0b0, "match">;
@@ -1353,32 +1514,32 @@ let Predicates = [HasSVE2] in {
defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;
// SVE2 floating-point base 2 logarithm as integer
- defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb">;
+ defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>;
// SVE2 floating-point convert precision
- defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtxnt">;
- defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt">;
- defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt">;
- def FCVTX_ZPmZ_DtoS : sve_fp_2op_p_zd<0b0001010, "fcvtx", ZPR64, ZPR32, ElementSizeD>;
+ defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">;
+ defm FCVTX_ZPmZ : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx">;
+ defm FCVTNT_ZPmZ : sve2_fp_convert_down_narrow<"fcvtnt", "int_aarch64_sve_fcvtnt">;
+ defm FCVTLT_ZPmZ : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">;
// SVE2 floating-point pairwise operations
- defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp">;
- defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp">;
- defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp">;
- defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp">;
- defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp">;
+ defm FADDP_ZPmZZ : sve2_fp_pairwise_pred<0b000, "faddp", int_aarch64_sve_faddp>;
+ defm FMAXNMP_ZPmZZ : sve2_fp_pairwise_pred<0b100, "fmaxnmp", int_aarch64_sve_fmaxnmp>;
+ defm FMINNMP_ZPmZZ : sve2_fp_pairwise_pred<0b101, "fminnmp", int_aarch64_sve_fminnmp>;
+ defm FMAXP_ZPmZZ : sve2_fp_pairwise_pred<0b110, "fmaxp", int_aarch64_sve_fmaxp>;
+ defm FMINP_ZPmZZ : sve2_fp_pairwise_pred<0b111, "fminp", int_aarch64_sve_fminp>;
// SVE2 floating-point multiply-add long (indexed)
- def FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb">;
- def FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt">;
- def FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb">;
- def FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt">;
+ defm FMLALB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b00, "fmlalb", int_aarch64_sve_fmlalb_lane>;
+ defm FMLALT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b01, "fmlalt", int_aarch64_sve_fmlalt_lane>;
+ defm FMLSLB_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b10, "fmlslb", int_aarch64_sve_fmlslb_lane>;
+ defm FMLSLT_ZZZI_SHH : sve2_fp_mla_long_by_indexed_elem<0b11, "fmlslt", int_aarch64_sve_fmlslt_lane>;
// SVE2 floating-point multiply-add long
- def FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb">;
- def FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt">;
- def FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb">;
- def FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt">;
+ defm FMLALB_ZZZ_SHH : sve2_fp_mla_long<0b00, "fmlalb", int_aarch64_sve_fmlalb>;
+ defm FMLALT_ZZZ_SHH : sve2_fp_mla_long<0b01, "fmlalt", int_aarch64_sve_fmlalt>;
+ defm FMLSLB_ZZZ_SHH : sve2_fp_mla_long<0b10, "fmlslb", int_aarch64_sve_fmlslb>;
+ defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>;
// SVE2 bitwise ternary operations
defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">;
@@ -1427,15 +1588,15 @@ let Predicates = [HasSVE2] in {
defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">;
// SVE2 integer compare scalar count and limit
- defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege">;
- defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt">;
- defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs">;
- defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi">;
-
- defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege">;
- defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt">;
- defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs">;
- defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi">;
+ defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>;
+ defm WHILEGT_PWW : sve_int_while4_rr<0b001, "whilegt", int_aarch64_sve_whilegt>;
+ defm WHILEHS_PWW : sve_int_while4_rr<0b100, "whilehs", int_aarch64_sve_whilehs>;
+ defm WHILEHI_PWW : sve_int_while4_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;
+
+ defm WHILEGE_PXX : sve_int_while8_rr<0b000, "whilege", int_aarch64_sve_whilege>;
+ defm WHILEGT_PXX : sve_int_while8_rr<0b001, "whilegt", int_aarch64_sve_whilegt>;
+ defm WHILEHS_PXX : sve_int_while8_rr<0b100, "whilehs", int_aarch64_sve_whilehs>;
+ defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;
// SVE2 pointer conflict compare
defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td
deleted file mode 100644
index f1e76e2c20d3..000000000000
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM1.td
+++ /dev/null
@@ -1,850 +0,0 @@
-//=- AArch64SchedExynosM1.td - Samsung Exynos M1 Sched Defs --*- tablegen -*-=//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines the machine model for the Samsung Exynos M1 to support
-// instruction scheduling and other instruction cost heuristics.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// The Exynos-M1 is a traditional superscalar microprocessor with a
-// 4-wide in-order stage for decode and dispatch and a wider issue stage.
-// The execution units, as well as loads and stores, are out-of-order.
-
-def ExynosM1Model : SchedMachineModel {
- let IssueWidth = 4; // Up to 4 uops per cycle.
- let MicroOpBufferSize = 96; // ROB size.
- let LoopMicroOpBufferSize = 24; // Based on the instruction queue size.
- let LoadLatency = 4; // Optimistic load cases.
- let MispredictPenalty = 14; // Minimum branch misprediction penalty.
- let CompleteModel = 1; // Use the default model otherwise.
-
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
-}
-
-//===----------------------------------------------------------------------===//
-// Define each kind of processor resource and number available on the Exynos-M1,
-// which has 9 pipelines, each with its own out-of-order dispatch queue.
-
-let SchedModel = ExynosM1Model in {
-
-def M1UnitA : ProcResource<2>; // Simple integer
-def M1UnitC : ProcResource<1>; // Simple and complex integer
-def M1UnitD : ProcResource<1>; // Integer division (inside C, serialized)
-def M1UnitB : ProcResource<2>; // Branch
-def M1UnitL : ProcResource<1>; // Load
-def M1UnitS : ProcResource<1>; // Store
-def M1PipeF0 : ProcResource<1>; // FP #0
-let Super = M1PipeF0 in {
- def M1UnitFMAC : ProcResource<1>; // FP multiplication
- def M1UnitNAL0 : ProcResource<1>; // Simple vector
- def M1UnitNMISC : ProcResource<1>; // Miscellanea
- def M1UnitFCVT : ProcResource<1>; // FP conversion
- def M1UnitNCRYPT : ProcResource<1>; // Cryptographic
-}
-def M1PipeF1 : ProcResource<1>; // FP #1
-let Super = M1PipeF1 in {
- def M1UnitFADD : ProcResource<1>; // Simple FP
- def M1UnitNAL1 : ProcResource<1>; // Simple vector
- def M1UnitFVAR : ProcResource<1>; // FP division & square root (serialized)
- def M1UnitFST : ProcResource<1>; // FP store
-}
-
-def M1UnitALU : ProcResGroup<[M1UnitA,
- M1UnitC]>; // All integer
-def M1UnitNALU : ProcResGroup<[M1UnitNAL0,
- M1UnitNAL1]>; // All simple vector
-
-//===----------------------------------------------------------------------===//
-// Coarse scheduling model.
-
-def M1WriteA1 : SchedWriteRes<[M1UnitALU]> { let Latency = 1; }
-def M1WriteA2 : SchedWriteRes<[M1UnitALU]> { let Latency = 2; }
-def M1WriteAA : SchedWriteRes<[M1UnitALU]> { let Latency = 2;
- let ResourceCycles = [2]; }
-def M1WriteAB : SchedWriteRes<[M1UnitALU,
- M1UnitC]> { let Latency = 1;
- let NumMicroOps = 2; }
-def M1WriteAC : SchedWriteRes<[M1UnitALU,
- M1UnitALU,
- M1UnitC]> { let Latency = 2;
- let NumMicroOps = 3; }
-def M1WriteAD : SchedWriteRes<[M1UnitALU,
- M1UnitC]> { let Latency = 2;
- let NumMicroOps = 2; }
-def M1WriteAX : SchedWriteVariant<[SchedVar<ExynosArithPred, [M1WriteA1]>,
- SchedVar<ExynosLogicPred, [M1WriteA1]>,
- SchedVar<NoSchedPred, [M1WriteAA]>]>;
-def M1WriteC1 : SchedWriteRes<[M1UnitC]> { let Latency = 1; }
-def M1WriteC2 : SchedWriteRes<[M1UnitC]> { let Latency = 2; }
-
-def M1WriteB1 : SchedWriteRes<[M1UnitB]> { let Latency = 1; }
-def M1WriteBX : SchedWriteVariant<[SchedVar<ExynosBranchLinkLRPred, [M1WriteAC]>,
- SchedVar<NoSchedPred, [M1WriteAB]>]>;
-
-def M1WriteL5 : SchedWriteRes<[M1UnitL]> { let Latency = 5; }
-def M1WriteL6 : SchedWriteRes<[M1UnitL]> { let Latency = 6; }
-def M1WriteLA : SchedWriteRes<[M1UnitL]> { let Latency = 6;
- let ResourceCycles = [2]; }
-def M1WriteLB : SchedWriteRes<[M1UnitL,
- M1UnitA]> { let Latency = 4;
- let NumMicroOps = 2; }
-def M1WriteLC : SchedWriteRes<[M1UnitL,
- M1UnitA]> { let Latency = 5;
- let NumMicroOps = 2; }
-def M1WriteLD : SchedWriteRes<[M1UnitL,
- M1UnitA]> { let Latency = 6;
- let NumMicroOps = 2;
- let ResourceCycles = [2, 1]; }
-def M1WriteLH : SchedWriteRes<[]> { let Latency = 5;
- let NumMicroOps = 0; }
-def M1WriteLX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteLC]>,
- SchedVar<NoSchedPred, [M1WriteL5]>]>;
-
-def M1WriteS1 : SchedWriteRes<[M1UnitS]> { let Latency = 1; }
-def M1WriteS3 : SchedWriteRes<[M1UnitS]> { let Latency = 3; }
-def M1WriteS4 : SchedWriteRes<[M1UnitS]> { let Latency = 4; }
-def M1WriteSA : SchedWriteRes<[M1UnitS,
- M1UnitFST,
- M1UnitA]> { let Latency = 3;
- let NumMicroOps = 2; }
-def M1WriteSB : SchedWriteRes<[M1UnitS,
- M1UnitFST,
- M1UnitS,
- M1UnitFST,
- M1UnitA]> { let Latency = 3;
- let NumMicroOps = 3; }
-def M1WriteSC : SchedWriteRes<[M1UnitS,
- M1UnitA]> { let Latency = 2;
- let NumMicroOps = 2; }
-def M1WriteSX : SchedWriteVariant<[SchedVar<ScaledIdxPred, [M1WriteSC]>,
- SchedVar<NoSchedPred, [M1WriteS1]>]>;
-
-def M1ReadAdrBase : SchedReadVariant<[SchedVar<ScaledIdxPred, [ReadDefault]>,
- SchedVar<NoSchedPred, [ReadDefault]>]>;
-
-// Branch instructions.
-def : WriteRes<WriteBr, []> { let Latency = 0; }
-def : WriteRes<WriteBrReg, [M1UnitC]> { let Latency = 1; }
-
-// Arithmetic and logical integer instructions.
-def : WriteRes<WriteI, [M1UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteISReg, [M1UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteIEReg, [M1UnitALU]> { let Latency = 1; }
-def : WriteRes<WriteIS, [M1UnitALU]> { let Latency = 1; }
-
-// Move instructions.
-def : WriteRes<WriteImm, [M1UnitALU]> { let Latency = 1; }
-
-// Divide and multiply instructions.
-def : WriteRes<WriteID32, [M1UnitC,
- M1UnitD]> { let Latency = 13;
- let ResourceCycles = [1, 13]; }
-def : WriteRes<WriteID64, [M1UnitC,
- M1UnitD]> { let Latency = 21;
- let ResourceCycles = [1, 21]; }
-// TODO: Long multiplication takes 5 cycles and also uses the ALU.
-def : WriteRes<WriteIM32, [M1UnitC]> { let Latency = 3; }
-def : WriteRes<WriteIM64, [M1UnitC]> { let Latency = 4;
- let ResourceCycles = [2]; }
-
-// Miscellaneous instructions.
-def : WriteRes<WriteExtr, [M1UnitALU,
- M1UnitALU]> { let Latency = 2;
- let NumMicroOps = 2; }
-
-// Addressing modes.
-def : WriteRes<WriteAdr, []> { let Latency = 1;
- let NumMicroOps = 0; }
-def : SchedAlias<ReadAdrBase, M1ReadAdrBase>;
-
-// Load instructions.
-def : WriteRes<WriteLD, [M1UnitL]> { let Latency = 4; }
-def : WriteRes<WriteLDHi, []> { let Latency = 4;
- let NumMicroOps = 0; }
-def : SchedAlias<WriteLDIdx, M1WriteLX>;
-
-// Store instructions.
-def : WriteRes<WriteST, [M1UnitS]> { let Latency = 1; }
-def : WriteRes<WriteSTP, [M1UnitS]> { let Latency = 1; }
-def : WriteRes<WriteSTX, [M1UnitS]> { let Latency = 1; }
-def : SchedAlias<WriteSTIdx, M1WriteSX>;
-
-// FP data instructions.
-def : WriteRes<WriteF, [M1UnitFADD]> { let Latency = 3; }
-def : WriteRes<WriteFCmp, [M1UnitNMISC]> { let Latency = 4; }
-def : WriteRes<WriteFDiv, [M1UnitFVAR]> { let Latency = 15;
- let ResourceCycles = [15]; }
-def : WriteRes<WriteFMul, [M1UnitFMAC]> { let Latency = 4; }
-
-// FP miscellaneous instructions.
-def : WriteRes<WriteFCvt, [M1UnitFCVT]> { let Latency = 3; }
-def : WriteRes<WriteFImm, [M1UnitNALU]> { let Latency = 1; }
-def : WriteRes<WriteFCopy, [M1UnitS]> { let Latency = 4; }
-
-// FP load instructions.
-def : WriteRes<WriteVLD, [M1UnitL]> { let Latency = 5; }
-
-// FP store instructions.
-def : WriteRes<WriteVST, [M1UnitS,
- M1UnitFST]> { let Latency = 1;
- let NumMicroOps = 1; }
-
-// ASIMD FP instructions.
-def : WriteRes<WriteV, [M1UnitFADD]> { let Latency = 3; }
-
-// Other miscellaneous instructions.
-def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
-def : WriteRes<WriteBarrier, []> { let Latency = 1; }
-def : WriteRes<WriteHint, []> { let Latency = 1; }
-def : WriteRes<WriteSys, []> { let Latency = 1; }
-
-//===----------------------------------------------------------------------===//
-// Fast forwarding.
-
-// TODO: Add FP register forwarding rules.
-def : ReadAdvance<ReadI, 0>;
-def : ReadAdvance<ReadISReg, 0>;
-def : ReadAdvance<ReadIEReg, 0>;
-def : ReadAdvance<ReadIM, 0>;
-// TODO: The forwarding for WriteIM32 actually saves 2 cycles.
-def : ReadAdvance<ReadIMA, 3, [WriteIM32, WriteIM64]>;
-def : ReadAdvance<ReadID, 0>;
-def : ReadAdvance<ReadExtrHi, 0>;
-def : ReadAdvance<ReadAdrBase, 0>;
-def : ReadAdvance<ReadVLD, 0>;
-
-//===----------------------------------------------------------------------===//
-// Finer scheduling model.
-
-def M1WriteNEONA : SchedWriteRes<[M1UnitNALU,
- M1UnitNALU,
- M1UnitFADD]> { let Latency = 9;
- let NumMicroOps = 3; }
-def M1WriteNEONB : SchedWriteRes<[M1UnitNALU,
- M1UnitFST]> { let Latency = 5;
- let NumMicroOps = 2;}
-def M1WriteNEONC : SchedWriteRes<[M1UnitNALU,
- M1UnitFST]> { let Latency = 6;
- let NumMicroOps = 2; }
-def M1WriteNEOND : SchedWriteRes<[M1UnitNALU,
- M1UnitFST,
- M1UnitL]> { let Latency = 10;
- let NumMicroOps = 3; }
-def M1WriteNEONE : SchedWriteRes<[M1UnitFCVT,
- M1UnitFST]> { let Latency = 8;
- let NumMicroOps = 2; }
-def M1WriteNEONF : SchedWriteRes<[M1UnitFCVT,
- M1UnitFST,
- M1UnitL]> { let Latency = 13;
- let NumMicroOps = 3; }
-def M1WriteNEONG : SchedWriteRes<[M1UnitNMISC,
- M1UnitFST]> { let Latency = 6;
- let NumMicroOps = 2; }
-def M1WriteNEONH : SchedWriteRes<[M1UnitNALU,
- M1UnitFST]> { let Latency = 3;
- let NumMicroOps = 2; }
-def M1WriteNEONI : SchedWriteRes<[M1UnitFST,
- M1UnitL]> { let Latency = 9;
- let NumMicroOps = 2; }
-def M1WriteNEONJ : SchedWriteRes<[M1UnitNMISC,
- M1UnitFMAC]> { let Latency = 6;
- let NumMicroOps = 2; }
-def M1WriteNEONK : SchedWriteRes<[M1UnitNMISC,
- M1UnitFMAC]> { let Latency = 7;
- let NumMicroOps = 2; }
-def M1WriteNEONL : SchedWriteRes<[M1UnitNALU]> { let Latency = 2;
- let ResourceCycles = [2]; }
-def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]> { let Latency = 3; }
-def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 3; }
-def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]> { let Latency = 4; }
-def M1WriteFMAC4 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; }
-def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]> { let Latency = 5; }
-// TODO
-def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 15;
- let ResourceCycles = [15]; }
-def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]> { let Latency = 23;
- let ResourceCycles = [23]; }
-def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]> { let Latency = 1; }
-def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]> { let Latency = 2; }
-def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 1; }
-def M1WriteNAL12 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 2; }
-def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]> { let Latency = 3; }
-def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
-def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 5; }
-def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 1; }
-def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 2; }
-def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 3; }
-def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]> { let Latency = 4; }
-def M1WriteTB : SchedWriteRes<[M1UnitC,
- M1UnitALU]> { let Latency = 2;
- let NumMicroOps = 2; }
-def M1WriteVLDA : SchedWriteRes<[M1UnitL,
- M1UnitL]> { let Latency = 6;
- let NumMicroOps = 2; }
-def M1WriteVLDB : SchedWriteRes<[M1UnitL,
- M1UnitL,
- M1UnitL]> { let Latency = 7;
- let NumMicroOps = 3; }
-def M1WriteVLDC : SchedWriteRes<[M1UnitL,
- M1UnitL,
- M1UnitL,
- M1UnitL]> { let Latency = 8;
- let NumMicroOps = 4; }
-def M1WriteVLDD : SchedWriteRes<[M1UnitL,
- M1UnitNALU]> { let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [2, 1]; }
-def M1WriteVLDE : SchedWriteRes<[M1UnitL,
- M1UnitNALU]> { let Latency = 6;
- let NumMicroOps = 2; }
-def M1WriteVLDF : SchedWriteRes<[M1UnitL,
- M1UnitL]> { let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1]; }
-def M1WriteVLDG : SchedWriteRes<[M1UnitL,
- M1UnitNALU,
- M1UnitNALU]> { let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1, 1]; }
-def M1WriteVLDH : SchedWriteRes<[M1UnitL,
- M1UnitNALU,
- M1UnitNALU]> { let Latency = 6;
- let NumMicroOps = 3; }
-def M1WriteVLDI : SchedWriteRes<[M1UnitL,
- M1UnitL,
- M1UnitL]> { let Latency = 12;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 2, 2]; }
-def M1WriteVLDJ : SchedWriteRes<[M1UnitL,
- M1UnitNALU,
- M1UnitNALU,
- M1UnitNALU]> { let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1, 1]; }
-def M1WriteVLDK : SchedWriteRes<[M1UnitL,
- M1UnitNALU,
- M1UnitNALU,
- M1UnitNALU,
- M1UnitNALU]> { let Latency = 9;
- let NumMicroOps = 5;
- let ResourceCycles = [2, 1, 1, 1, 1]; }
-def M1WriteVLDL : SchedWriteRes<[M1UnitL,
- M1UnitNALU,
- M1UnitNALU,
- M1UnitL,
- M1UnitNALU]> { let Latency = 7;
- let NumMicroOps = 5;
- let ResourceCycles = [1, 1, 1, 1, 1]; }
-def M1WriteVLDM : SchedWriteRes<[M1UnitL,
- M1UnitNALU,
- M1UnitNALU,
- M1UnitL,
- M1UnitNALU,
- M1UnitNALU]> { let Latency = 7;
- let NumMicroOps = 6;
- let ResourceCycles = [1, 1, 1, 1, 1, 1]; }
-def M1WriteVLDN : SchedWriteRes<[M1UnitL,
- M1UnitL,
- M1UnitL,
- M1UnitL]> { let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 2, 1]; }
-def M1WriteVSTA : WriteSequence<[WriteVST], 2>;
-def M1WriteVSTB : WriteSequence<[WriteVST], 3>;
-def M1WriteVSTC : WriteSequence<[WriteVST], 4>;
-def M1WriteVSTD : SchedWriteRes<[M1UnitS,
- M1UnitFST,
- M1UnitFST]> { let Latency = 7;
- let NumMicroOps = 2;
- let ResourceCycles = [7, 1, 1]; }
-def M1WriteVSTE : SchedWriteRes<[M1UnitS,
- M1UnitFST,
- M1UnitS,
- M1UnitFST,
- M1UnitFST]> { let Latency = 8;
- let NumMicroOps = 3;
- let ResourceCycles = [7, 1, 1, 1, 1]; }
-def M1WriteVSTF : SchedWriteRes<[M1UnitNALU,
- M1UnitS,
- M1UnitFST,
- M1UnitS,
- M1UnitFST,
- M1UnitFST,
- M1UnitFST]> { let Latency = 15;
- let NumMicroOps = 5;
- let ResourceCycles = [1, 7, 1, 7, 1, 1, 1]; }
-def M1WriteVSTG : SchedWriteRes<[M1UnitNALU,
- M1UnitS,
- M1UnitFST,
- M1UnitS,
- M1UnitFST,
- M1UnitS,
- M1UnitFST,
- M1UnitFST,
- M1UnitFST]> { let Latency = 16;
- let NumMicroOps = 6;
- let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1]; }
-def M1WriteVSTH : SchedWriteRes<[M1UnitNALU,
- M1UnitS,
- M1UnitFST,
- M1UnitFST,
- M1UnitFST]> { let Latency = 14;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 7, 1, 7, 1]; }
-def M1WriteVSTI : SchedWriteRes<[M1UnitNALU,
- M1UnitS,
- M1UnitFST,
- M1UnitS,
- M1UnitFST,
- M1UnitS,
- M1UnitFST,
- M1UnitS,
- M1UnitFST,
- M1UnitFST,
- M1UnitFST]> { let Latency = 17;
- let NumMicroOps = 7;
- let ResourceCycles = [1, 7, 1, 7, 1, 1, 1, 1, 1, 1, 1]; }
-
-// Special cases.
-def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
-def M1WriteCOPY : SchedWriteVariant<[SchedVar<ExynosFPPred, [M1WriteNALU1]>,
- SchedVar<NoSchedPred, [M1WriteA1]>]>;
-
-// Fast forwarding.
-def M1ReadAES : SchedReadAdvance<1, [M1WriteAES]>;
-
-// Branch instructions
-def : InstRW<[M1WriteB1], (instrs Bcc)>;
-def : InstRW<[M1WriteA1], (instrs BL)>;
-def : InstRW<[M1WriteBX], (instrs BLR)>;
-def : InstRW<[M1WriteC1], (instregex "^CBN?Z[WX]")>;
-def : InstRW<[M1WriteAD], (instregex "^TBN?Z[WX]")>;
-
-// Arithmetic and logical integer instructions.
-def : InstRW<[M1WriteAX], (instregex ".+rx(64)?$")>;
-def : InstRW<[M1WriteAX], (instregex ".+rs$")>;
-
-// Move instructions.
-def : InstRW<[M1WriteCOPY], (instrs COPY)>;
-
-// Divide and multiply instructions.
-
-// Miscellaneous instructions.
-
-// Load instructions.
-def : InstRW<[M1WriteLB,
- WriteLDHi,
- WriteAdr], (instregex "^LDP(SW|W|X)(post|pre)")>;
-def : InstRW<[M1WriteLC,
- ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roW")>;
-def : InstRW<[M1WriteL5,
- ReadAdrBase], (instregex "^LDR(BB|SBW|SBX|HH|SHW|SHX|SW|W|X)roX")>;
-def : InstRW<[M1WriteLC,
- ReadAdrBase], (instrs PRFMroW)>;
-def : InstRW<[M1WriteL5,
- ReadAdrBase], (instrs PRFMroX)>;
-
-// Store instructions.
-def : InstRW<[M1WriteSC,
- ReadAdrBase], (instregex "^STR(BB|HH|W|X)roW")>;
-def : InstRW<[WriteST,
- ReadAdrBase], (instregex "^STR(BB|HH|W|X)roX")>;
-
-// FP data instructions.
-def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)[DS]r")>;
-def : InstRW<[M1WriteFADD3], (instregex "^F(ADD|SUB)[DS]rr")>;
-def : InstRW<[M1WriteNEONG], (instregex "^FCCMPE?[DS]rr")>;
-def : InstRW<[M1WriteNMISC4], (instregex "^FCMPE?[DS]r")>;
-def : InstRW<[M1WriteFVAR15], (instrs FDIVSrr)>;
-def : InstRW<[M1WriteFVAR23], (instrs FDIVDrr)>;
-def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN).+rr")>;
-def : InstRW<[M1WriteFMAC4], (instregex "^FN?MUL[DS]rr")>;
-def : InstRW<[M1WriteFMAC5], (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
-def : InstRW<[M1WriteFCVT3], (instregex "^FRINT.+r")>;
-def : InstRW<[M1WriteNEONH], (instregex "^FCSEL[DS]rrr")>;
-def : InstRW<[M1WriteFVAR15], (instrs FSQRTSr)>;
-def : InstRW<[M1WriteFVAR23], (instrs FSQRTDr)>;
-
-// FP miscellaneous instructions.
-def : InstRW<[M1WriteFCVT3], (instregex "^FCVT[DS][DS]r")>;
-def : InstRW<[M1WriteNEONF], (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
-def : InstRW<[M1WriteNEONE], (instregex "^[SU]CVTF[SU]")>;
-def : InstRW<[M1WriteNALU1], (instregex "^FMOV[DS][ir]")>;
-def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev1")>;
-def : InstRW<[M1WriteNMISC1], (instregex "^FRECPXv1")>;
-def : InstRW<[M1WriteFMAC5], (instregex "^F(RECP|RSQRT)S(16|32|64)")>;
-def : InstRW<[M1WriteS4], (instregex "^FMOV[WX][DS](High)?r")>;
-def : InstRW<[M1WriteNEONI], (instregex "^FMOV[DS][WX](High)?r")>;
-
-// FP load instructions.
-def : InstRW<[WriteVLD], (instregex "^LDR[DSQ]l")>;
-def : InstRW<[WriteVLD], (instregex "^LDUR[BDHSQ]i")>;
-def : InstRW<[WriteVLD,
- WriteAdr], (instregex "^LDR[BDHSQ](post|pre)")>;
-def : InstRW<[WriteVLD], (instregex "^LDR[BDHSQ]ui")>;
-def : InstRW<[M1WriteLD,
- ReadAdrBase], (instregex "^LDR[BDHS]roW")>;
-def : InstRW<[WriteVLD,
- ReadAdrBase], (instregex "^LDR[BDHS]roX")>;
-def : InstRW<[M1WriteLD,
- ReadAdrBase], (instregex "^LDRQro[WX]")>;
-def : InstRW<[WriteVLD,
- M1WriteLH], (instregex "^LDN?P[DS]i")>;
-def : InstRW<[M1WriteLA,
- M1WriteLH], (instregex "^LDN?PQi")>;
-def : InstRW<[M1WriteLC,
- M1WriteLH,
- WriteAdr], (instregex "^LDP[DS](post|pre)")>;
-def : InstRW<[M1WriteLD,
- M1WriteLH,
- WriteAdr], (instregex "^LDPQ(post|pre)")>;
-
-// FP store instructions.
-def : InstRW<[WriteVST], (instregex "^STUR[BDHSQ]i")>;
-def : InstRW<[WriteVST,
- WriteAdr], (instregex "^STR[BDHSQ](post|pre)")>;
-def : InstRW<[WriteVST], (instregex "^STR[BDHSQ]ui")>;
-def : InstRW<[M1WriteSA,
- ReadAdrBase], (instregex "^STR[BDHS]roW")>;
-def : InstRW<[WriteVST,
- ReadAdrBase], (instregex "^STR[BDHS]roX")>;
-def : InstRW<[M1WriteSA,
- ReadAdrBase], (instregex "^STRQro[WX]")>;
-def : InstRW<[WriteVST], (instregex "^STN?P[DSQ]i")>;
-def : InstRW<[WriteVST,
- WriteAdr], (instregex "^STP[DS](post|pre)")>;
-def : InstRW<[M1WriteSB,
- WriteAdr], (instregex "^STPQ(post|pre)")>;
-
-// ASIMD instructions.
-def : InstRW<[M1WriteNMISC3], (instregex "^[SU]ABAL?v")>;
-def : InstRW<[M1WriteNMISC1], (instregex "^[SU]ABDL?v")>;
-def : InstRW<[M1WriteNMISC1], (instregex "^(SQ)?ABSv")>;
-def : InstRW<[M1WriteNMISC1], (instregex "^SQNEGv")>;
-def : InstRW<[M1WriteNALU1], (instregex "^(ADD|NEG|SUB)v")>;
-def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?H(ADD|SUB)v")>;
-def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?AD[AD](L|LP|P|W)V?2?v")>;
-def : InstRW<[M1WriteNMISC3], (instregex "^[SU]?SUB[LW]2?v")>;
-def : InstRW<[M1WriteNMISC3], (instregex "^R?(ADD|SUB)HN?2?v")>;
-def : InstRW<[M1WriteNMISC3], (instregex "^[SU]+Q(ADD|SUB)v")>;
-def : InstRW<[M1WriteNMISC3], (instregex "^[SU]RHADDv")>;
-def : InstRW<[M1WriteNMISC1], (instregex "^CM(EQ|GE|GT|HI|HS|LE|LT)v")>;
-def : InstRW<[M1WriteNALU1], (instregex "^CMTSTv")>;
-def : InstRW<[M1WriteNALU1], (instregex "^(AND|BIC|EOR|MVNI|NOT|ORN|ORR)v")>;
-def : InstRW<[M1WriteNMISC1], (instregex "^[SU](MIN|MAX)v")>;
-def : InstRW<[M1WriteNMISC2], (instregex "^[SU](MIN|MAX)Pv")>;
-def : InstRW<[M1WriteNMISC3], (instregex "^[SU](MIN|MAX)Vv")>;
-def : InstRW<[M1WriteNMISC4], (instregex "^(MUL|SQR?DMULH)v")>;
-def : InstRW<[M1WriteNMISC4], (instregex "^ML[AS]v")>;
-def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD|SQRD)ML[AS][HL]v")>;
-def : InstRW<[M1WriteNMISC4], (instregex "^(S|U|SQD)MULLv")>;
-def : InstRW<[M1WriteNAL13], (instregex "^(S|SR|U|UR)SRAv")>;
-def : InstRW<[M1WriteNALU1], (instregex "^SHL[dv]")>;
-def : InstRW<[M1WriteNALU1], (instregex "^[SU]SH[LR][dv]")>;
-def : InstRW<[M1WriteNALU1], (instregex "^S[RS]I[dv]")>;
-def : InstRW<[M1WriteNAL13], (instregex "^(([SU]Q)?R)?SHRU?N[bhsv]")>;
-def : InstRW<[M1WriteNAL13], (instregex "^[SU]RSH[LR][dv]")>;
-def : InstRW<[M1WriteNAL13], (instregex "^[SU]QR?SHLU?[bdhsv]")>;
-
-// ASIMD FP instructions.
-def : InstRW<[M1WriteNALU1], (instregex "^F(ABS|NEG)v")>;
-def : InstRW<[M1WriteNMISC3], (instregex "^F(ABD|ADD|SUB)v")>;
-def : InstRW<[M1WriteNEONA], (instregex "^FADDP")>;
-def : InstRW<[M1WriteNMISC1], (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v[^1]")>;
-def : InstRW<[M1WriteFCVT3], (instregex "^[FVSU]CVTX?[AFLMNPZ][SU]?(_Int)?v")>;
-def : InstRW<[M1WriteFVAR15], (instregex "FDIVv.f32")>;
-def : InstRW<[M1WriteFVAR23], (instregex "FDIVv2f64")>;
-def : InstRW<[M1WriteFVAR15], (instregex "FSQRTv.f32")>;
-def : InstRW<[M1WriteFVAR23], (instregex "FSQRTv2f64")>;
-def : InstRW<[M1WriteNMISC1], (instregex "^F(MAX|MIN)(NM)?V?v")>;
-def : InstRW<[M1WriteNMISC2], (instregex "^F(MAX|MIN)(NM)?Pv")>;
-def : InstRW<[M1WriteNEONJ], (instregex "^FMULX?v.i")>;
-def : InstRW<[M1WriteFMAC4], (instregex "^FMULX?v.f")>;
-def : InstRW<[M1WriteNEONK], (instregex "^FML[AS]v.i")>;
-def : InstRW<[M1WriteFMAC5], (instregex "^FML[AS]v.f")>;
-def : InstRW<[M1WriteFCVT3], (instregex "^FRINT[AIMNPXZ]v")>;
-
-// ASIMD miscellaneous instructions.
-def : InstRW<[M1WriteNALU1], (instregex "^RBITv")>;
-def : InstRW<[M1WriteNAL11], (instregex "^(BIF|BIT|BSL)v")>;
-def : InstRW<[M1WriteNEONB], (instregex "^DUPv.+gpr")>;
-def : InstRW<[M1WriteNALU1], (instregex "^DUPv.+lane")>;
-def : InstRW<[M1WriteNALU1], (instregex "^EXTv8")>;
-def : InstRW<[M1WriteNEONL], (instregex "^EXTv16")>;
-def : InstRW<[M1WriteNAL13], (instregex "^[SU]?Q?XTU?Nv")>;
-def : InstRW<[M1WriteNALU1], (instregex "^CPY")>;
-def : InstRW<[M1WriteNALU1], (instregex "^INSv.+lane")>;
-def : InstRW<[M1WriteNALU1], (instregex "^MOVI[Dv]")>;
-def : InstRW<[M1WriteNALU1], (instregex "^FMOVv")>;
-def : InstRW<[M1WriteFCVT4], (instregex "^[FU](RECP|RSQRT)Ev[248]")>;
-def : InstRW<[M1Wr